/***
* immintrin.h - Meta Header file for Intel(R) Architecture intrinsic functions.
*
* Copyright (C) 1985-2011 Intel Corporation. All rights reserved.
*
* The information and source code contained herein is the exclusive
* property of Intel Corporation and may not be disclosed, examined
* or reproduced in whole or in part without explicit written authorization
* from the company.
*
*
*******************************************************************************/

#pragma once
#ifndef __midl
#ifndef _INCLUDED_IMM
#define _INCLUDED_IMM

#if defined (_M_CEE_PURE)
 #error ERROR: Intel Architecture intrinsic functions not supported in the pure mode!
#else /* defined (_M_CEE_PURE) */

#include <wmmintrin.h>

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/*
 * Intel(R) AVX compiler intrinsic functions.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(32) __m256 {
    float m256_f32[8];
} __m256;

typedef struct __declspec(intrin_type) _CRT_ALIGN(32) __m256d {
    double m256d_f64[4];
} __m256d;

typedef union __declspec(intrin_type) _CRT_ALIGN(32) __m256i {
    __int8           m256i_i8[32];
    __int16          m256i_i16[16];
    __int32          m256i_i32[8];
    __int64          m256i_i64[4];
    unsigned __int8  m256i_u8[32];
    unsigned __int16 m256i_u16[16];
    unsigned __int32 m256i_u32[8];
    unsigned __int64 m256i_u64[4];
} __m256i;

/*
 * Compare predicates for scalar and packed compare intrinsic functions
 */
#define _CMP_EQ_OQ     0x00 /* Equal (ordered, nonsignaling) */
#define _CMP_LT_OS     0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS     0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q   0x03 /* Unordered (nonsignaling) */
#define _CMP_NEQ_UQ    0x04 /* Not-equal (unordered, nonsignaling) */
#define _CMP_NLT_US    0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US    0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q     0x07 /* Ordered (nonsignaling) */
#define _CMP_EQ_UQ     0x08 /* Equal (unordered, nonsignaling) */
#define _CMP_NGE_US    0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US    0x0A /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ  0x0B /* False (ordered, nonsignaling) */
#define _CMP_NEQ_OQ    0x0C /* Not-equal (ordered, nonsignaling) */
#define _CMP_GE_OS     0x0D /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS     0x0E /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ   0x0F /* True (unordered, nonsignaling) */
#define _CMP_EQ_OS     0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ     0x11 /* Less-than (ordered, nonsignaling) */
#define _CMP_LE_OQ     0x12 /* Less-than-or-equal (ordered, nonsignaling) */
#define _CMP_UNORD_S   0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US    0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ    0x15 /* Not-less-than (unordered, nonsignaling) */
#define _CMP_NLE_UQ    0x16 /* Not-less-than-or-equal (unordered, nonsignaling) */
#define _CMP_ORD_S     0x17 /* Ordered (signaling) */
#define _CMP_EQ_US     0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ    0x19 /* Not-greater-than-or-equal (unordered, nonsignaling) */
#define _CMP_NGT_UQ    0x1A /* Not-greater-than (unordered, nonsignaling) */
#define _CMP_FALSE_OS  0x1B /* False (ordered, signaling) */
#define _CMP_NEQ_OS    0x1C /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ     0x1D /* Greater-than-or-equal (ordered, nonsignaling) */
#define _CMP_GT_OQ     0x1E /* Greater-than (ordered, nonsignaling) */
#define _CMP_TRUE_US   0x1F /* True (unordered, signaling) */

/*
 * Add Packed Double Precision Floating-Point Values
 * **** VADDPD ymm1, ymm2, ymm3/m256
 * Performs an SIMD add of the four packed double-precision floating-point
 * values from the first source operand to the second source operand, and
 * stores the packed double-precision floating-point results in the
 * destination
 */
extern __m256d __cdecl _mm256_add_pd(__m256d, __m256d);

/*
 * Add Packed Single Precision Floating-Point Values
 * **** VADDPS ymm1, ymm2, ymm3/m256
 * Performs an SIMD add of the eight packed single-precision floating-point
 * values from the first source operand to the second source operand, and
 * stores the packed single-precision floating-point results in the
 * destination
 */
extern __m256 __cdecl _mm256_add_ps(__m256, __m256);

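/*
 * Illustrative usage (not part of the original header): a minimal sketch of
 * summing two arrays four doubles at a time with _mm256_add_pd, assuming an
 * AVX-capable target and a length that is a multiple of 4. The function name
 * add_arrays is hypothetical.
 */
#if 0
#include <immintrin.h>

static void add_arrays(const double *a, const double *b, double *out, int n)
{
    for (int i = 0; i < n; i += 4) {
        __m256d va = _mm256_loadu_pd(a + i);   /* 4 doubles from a */
        __m256d vb = _mm256_loadu_pd(b + i);   /* 4 doubles from b */
        _mm256_storeu_pd(out + i, _mm256_add_pd(va, vb));
    }
}
#endif
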
/*
 * Add/Subtract Double Precision Floating-Point Values
 * **** VADDSUBPD ymm1, ymm2, ymm3/m256
 * Adds odd-numbered double-precision floating-point values of the first
 * source operand with the corresponding double-precision floating-point
 * values from the second source operand; stores the result in the odd-numbered
 * values of the destination. Subtracts the even-numbered double-precision
 * floating-point values from the second source operand from the corresponding
 * double-precision floating-point values in the first source operand; stores
 * the result into the even-numbered values of the destination
 */
extern __m256d __cdecl _mm256_addsub_pd(__m256d, __m256d);

/*
 * Add/Subtract Packed Single Precision Floating-Point Values
 * **** VADDSUBPS ymm1, ymm2, ymm3/m256
 * Adds odd-numbered single-precision floating-point values of the first source
 * operand with the corresponding single-precision floating-point values from
 * the second source operand; stores the result in the odd-numbered values of
 * the destination. Subtracts the even-numbered single-precision floating-point
 * values from the second source operand from the corresponding
 * single-precision floating-point values in the first source operand; stores
 * the result into the even-numbered values of the destination
 */
extern __m256 __cdecl _mm256_addsub_ps(__m256, __m256);

/*
 * Bitwise Logical AND of Packed Double Precision Floating-Point Values
 * **** VANDPD ymm1, ymm2, ymm3/m256
 * Performs a bitwise logical AND of the four packed double-precision
 * floating-point values from the first source operand and the second
 * source operand, and stores the result in the destination
 */
extern __m256d __cdecl _mm256_and_pd(__m256d, __m256d);

/*
 * Bitwise Logical AND of Packed Single Precision Floating-Point Values
 * **** VANDPS ymm1, ymm2, ymm3/m256
 * Performs a bitwise logical AND of the eight packed single-precision
 * floating-point values from the first source operand and the second
 * source operand, and stores the result in the destination
 */
extern __m256 __cdecl _mm256_and_ps(__m256, __m256);

/*
 * Bitwise Logical AND NOT of Packed Double Precision Floating-Point Values
 * **** VANDNPD ymm1, ymm2, ymm3/m256
 * Performs a bitwise logical AND NOT of the four packed double-precision
 * floating-point values from the first source operand and the second source
 * operand, and stores the result in the destination
 */
extern __m256d __cdecl _mm256_andnot_pd(__m256d, __m256d);

/*
 * Bitwise Logical AND NOT of Packed Single Precision Floating-Point Values
 * **** VANDNPS ymm1, ymm2, ymm3/m256
 * Performs a bitwise logical AND NOT of the eight packed single-precision
 * floating-point values from the first source operand and the second source
 * operand, and stores the result in the destination
 */
extern __m256 __cdecl _mm256_andnot_ps(__m256, __m256);

/*
 * Blend Packed Double Precision Floating-Point Values
 * **** VBLENDPD ymm1, ymm2, ymm3/m256, imm8
 * Double-precision floating-point values from the second source operand are
 * conditionally merged with values from the first source operand and written
 * to the destination. The immediate bits [3:0] determine whether the
 * corresponding double-precision floating-point value in the destination is
 * copied from the second source or first source. If a bit in the mask,
 * corresponding to a qword element, is "1", then the double-precision
 * floating-point value in the second source operand is copied, else the value
 * in the first source operand is copied
 */
extern __m256d __cdecl _mm256_blend_pd(__m256d, __m256d, const int);

/*
 * Blend Packed Single Precision Floating-Point Values
 * **** VBLENDPS ymm1, ymm2, ymm3/m256, imm8
 * Single-precision floating-point values from the second source operand are
 * conditionally merged with values from the first source operand and written
 * to the destination. The immediate bits [7:0] determine whether the
 * corresponding single-precision floating-point value in the destination is
 * copied from the second source or first source. If a bit in the mask,
 * corresponding to a dword element, is "1", then the single-precision
 * floating-point value in the second source operand is copied, else the value
 * in the first source operand is copied
 */
extern __m256 __cdecl _mm256_blend_ps(__m256, __m256, const int);

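/*
 * Illustrative usage: with _mm256_blend_pd, bit i of the immediate selects
 * element i from the second source (bit set) or the first source (bit clear).
 * A minimal sketch, assuming AVX; 0x5 = 0b0101 takes elements 0 and 2 from b.
 */
#if 0
#include <immintrin.h>

__m256d blend_example(__m256d a, __m256d b)
{
    /* result = { b[0], a[1], b[2], a[3] } */
    return _mm256_blend_pd(a, b, 0x5);
}
#endif
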
/*
 * Blend Packed Double Precision Floating-Point Values
 * **** VBLENDVPD ymm1, ymm2, ymm3/m256, ymm4
 * Conditionally copy each quadword data element of double-precision
 * floating-point value from the second source operand (third operand) and the
 * first source operand (second operand) depending on mask bits defined in the
 * mask register operand (fourth operand).
 */
extern __m256d __cdecl _mm256_blendv_pd(__m256d, __m256d, __m256d);

/*
 * Blend Packed Single Precision Floating-Point Values
 * **** VBLENDVPS ymm1, ymm2, ymm3/m256, ymm4
 * Conditionally copy each dword data element of single-precision
 * floating-point value from the second source operand (third operand) and the
 * first source operand (second operand) depending on mask bits defined in the
 * mask register operand (fourth operand).
 */
extern __m256 __cdecl _mm256_blendv_ps(__m256, __m256, __m256);

/*
 * Divide Packed Double-Precision Floating-Point Values
 * **** VDIVPD ymm1, ymm2, ymm3/m256
 * Performs an SIMD divide of the four packed double-precision floating-point
 * values in the first source operand by the four packed double-precision
 * floating-point values in the second source operand
 */
extern __m256d __cdecl _mm256_div_pd(__m256d, __m256d);

/*
 * Divide Packed Single-Precision Floating-Point Values
 * **** VDIVPS ymm1, ymm2, ymm3/m256
 * Performs an SIMD divide of the eight packed single-precision
 * floating-point values in the first source operand by the eight packed
 * single-precision floating-point values in the second source operand
 */
extern __m256 __cdecl _mm256_div_ps(__m256, __m256);

/*
 * Dot Product of Packed Single-Precision Floating-Point Values
 * **** VDPPS ymm1, ymm2, ymm3/m256, imm8
 * Multiplies the packed single-precision floating-point values in the
 * first source operand with the packed single-precision floats in the
 * second source. Each of the four resulting single-precision values is
 * conditionally summed depending on a mask extracted from the high 4 bits
 * of the immediate operand. This sum is broadcast to each of 4 positions
 * in the destination if the corresponding bit of the mask selected from
 * the low 4 bits of the immediate operand is "1". If the corresponding
 * low bit 0-3 of the mask is zero, the destination is set to zero.
 * The process is replicated for the high elements of the destination.
 */
extern __m256 __cdecl _mm256_dp_ps(__m256, __m256, const int);

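/*
 * Illustrative usage: a 4-element dot product per 128-bit lane with
 * _mm256_dp_ps. The high immediate nibble 0xF multiplies and sums all four
 * elements of each lane; the low nibble 0x1 writes the sum to element 0 of
 * each lane. A minimal sketch, assuming AVX.
 */
#if 0
#include <immintrin.h>

__m256 dot_per_lane(__m256 a, __m256 b)
{
    /* element 0 of each 128-bit lane receives that lane's dot product */
    return _mm256_dp_ps(a, b, 0xF1);
}
#endif
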
/*
 * Add Horizontal Double Precision Floating-Point Values
 * **** VHADDPD ymm1, ymm2, ymm3/m256
 * Adds pairs of adjacent double-precision floating-point values in the
 * first source operand and second source operand and stores results in
 * the destination
 */
extern __m256d __cdecl _mm256_hadd_pd(__m256d, __m256d);

/*
 * Add Horizontal Single Precision Floating-Point Values
 * **** VHADDPS ymm1, ymm2, ymm3/m256
 * Adds pairs of adjacent single-precision floating-point values in the
 * first source operand and second source operand and stores results in
 * the destination
 */
extern __m256 __cdecl _mm256_hadd_ps(__m256, __m256);

/*
 * Subtract Horizontal Double Precision Floating-Point Values
 * **** VHSUBPD ymm1, ymm2, ymm3/m256
 * Subtracts pairs of adjacent double-precision floating-point values in
 * the first source operand and second source operand and stores results
 * in the destination
 */
extern __m256d __cdecl _mm256_hsub_pd(__m256d, __m256d);

/*
 * Subtract Horizontal Single Precision Floating-Point Values
 * **** VHSUBPS ymm1, ymm2, ymm3/m256
 * Subtracts pairs of adjacent single-precision floating-point values in
 * the first source operand and second source operand and stores results
 * in the destination.
 */
extern __m256 __cdecl _mm256_hsub_ps(__m256, __m256);

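/*
 * Illustrative note on element order: the horizontal operations work within
 * 128-bit lanes. A minimal sketch, assuming AVX; for a = {a0,a1,a2,a3} and
 * b = {b0,b1,b2,b3}, _mm256_hadd_pd returns {a0+a1, b0+b1, a2+a3, b2+b3}.
 */
#if 0
#include <immintrin.h>

__m256d hadd_example(void)
{
    __m256d a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);  /* {1,2,3,4} low..high */
    __m256d b = _mm256_set_pd(8.0, 7.0, 6.0, 5.0);  /* {5,6,7,8} low..high */
    return _mm256_hadd_pd(a, b);  /* {1+2, 5+6, 3+4, 7+8} = {3,11,7,15} */
}
#endif
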
/*
 * Maximum of Packed Double Precision Floating-Point Values
 * **** VMAXPD ymm1, ymm2, ymm3/m256
 * Performs an SIMD compare of the packed double-precision floating-point
 * values in the first source operand and the second source operand and
 * returns the maximum value for each pair of values to the destination
 */
extern __m256d __cdecl _mm256_max_pd(__m256d, __m256d);

/*
 * Maximum of Packed Single Precision Floating-Point Values
 * **** VMAXPS ymm1, ymm2, ymm3/m256
 * Performs an SIMD compare of the packed single-precision floating-point
 * values in the first source operand and the second source operand and
 * returns the maximum value for each pair of values to the destination
 */
extern __m256 __cdecl _mm256_max_ps(__m256, __m256);

/*
 * Minimum of Packed Double Precision Floating-Point Values
 * **** VMINPD ymm1, ymm2, ymm3/m256
 * Performs an SIMD compare of the packed double-precision floating-point
 * values in the first source operand and the second source operand and
 * returns the minimum value for each pair of values to the destination
 */
extern __m256d __cdecl _mm256_min_pd(__m256d, __m256d);

/*
 * Minimum of Packed Single Precision Floating-Point Values
 * **** VMINPS ymm1, ymm2, ymm3/m256
 * Performs an SIMD compare of the packed single-precision floating-point
 * values in the first source operand and the second source operand and
 * returns the minimum value for each pair of values to the destination
 */
extern __m256 __cdecl _mm256_min_ps(__m256, __m256);

/*
 * Multiply Packed Double Precision Floating-Point Values
 * **** VMULPD ymm1, ymm2, ymm3/m256
 * Performs a SIMD multiply of the four packed double-precision floating-point
 * values from the first source operand to the second source operand, and
 * stores the packed double-precision floating-point results in the
 * destination
 */
extern __m256d __cdecl _mm256_mul_pd(__m256d, __m256d);

/*
 * Multiply Packed Single Precision Floating-Point Values
 * **** VMULPS ymm1, ymm2, ymm3/m256
 * Performs an SIMD multiply of the eight packed single-precision
 * floating-point values from the first source operand to the second source
 * operand, and stores the packed single-precision floating-point results in
 * the destination
 */
extern __m256 __cdecl _mm256_mul_ps(__m256, __m256);

/*
 * Bitwise Logical OR of Packed Double Precision Floating-Point Values
 * **** VORPD ymm1, ymm2, ymm3/m256
 * Performs a bitwise logical OR of the four packed double-precision
 * floating-point values from the first source operand and the second
 * source operand, and stores the result in the destination
 */
extern __m256d __cdecl _mm256_or_pd(__m256d, __m256d);

/*
 * Bitwise Logical OR of Packed Single Precision Floating-Point Values
 * **** VORPS ymm1, ymm2, ymm3/m256
 * Performs a bitwise logical OR of the eight packed single-precision
 * floating-point values from the first source operand and the second
 * source operand, and stores the result in the destination
 */
extern __m256 __cdecl _mm256_or_ps(__m256, __m256);

/*
 * Shuffle Packed Double Precision Floating-Point Values
 * **** VSHUFPD ymm1, ymm2, ymm3/m256, imm8
 * Moves either of the two packed double-precision floating-point values from
 * each double quadword in the first source operand into the low quadword
 * of each double quadword of the destination; moves either of the two packed
 * double-precision floating-point values from the second source operand into
 * the high quadword of each double quadword of the destination operand.
 * The selector operand determines which values are moved to the destination
 */
extern __m256d __cdecl _mm256_shuffle_pd(__m256d, __m256d, const int);

/*
 * Shuffle Packed Single Precision Floating-Point Values
 * **** VSHUFPS ymm1, ymm2, ymm3/m256, imm8
 * Moves two of the four packed single-precision floating-point values
 * from each double qword of the first source operand into the low
 * quadword of each double qword of the destination; moves two of the four
 * packed single-precision floating-point values from each double qword of
 * the second source operand into the high quadword of each double qword
 * of the destination. The selector operand determines which values are moved
 * to the destination.
 */
extern __m256 __cdecl _mm256_shuffle_ps(__m256, __m256, const int);

/*
 * Subtract Packed Double Precision Floating-Point Values
 * **** VSUBPD ymm1, ymm2, ymm3/m256
 * Performs an SIMD subtract of the four packed double-precision floating-point
 * values of the second source operand from the first source operand, and
 * stores the packed double-precision floating-point results in the destination
 */
extern __m256d __cdecl _mm256_sub_pd(__m256d, __m256d);

/*
 * Subtract Packed Single Precision Floating-Point Values
 * **** VSUBPS ymm1, ymm2, ymm3/m256
 * Performs an SIMD subtract of the eight packed single-precision
 * floating-point values in the second source operand from the first source
 * operand, and stores the packed single-precision floating-point results in
 * the destination
 */
extern __m256 __cdecl _mm256_sub_ps(__m256, __m256);

/*
 * Bitwise Logical XOR of Packed Double Precision Floating-Point Values
 * **** VXORPD ymm1, ymm2, ymm3/m256
 * Performs a bitwise logical XOR of the four packed double-precision
 * floating-point values from the first source operand and the second
 * source operand, and stores the result in the destination
 */
extern __m256d __cdecl _mm256_xor_pd(__m256d, __m256d);

/*
 * Bitwise Logical XOR of Packed Single Precision Floating-Point Values
 * **** VXORPS ymm1, ymm2, ymm3/m256
 * Performs a bitwise logical XOR of the eight packed single-precision
 * floating-point values from the first source operand and the second
 * source operand, and stores the result in the destination
 */
extern __m256 __cdecl _mm256_xor_ps(__m256, __m256);

/*
 * Compare Packed Double-Precision Floating-Point Values
 * **** VCMPPD xmm1, xmm2, xmm3/m128, imm8
 * **** VCMPPD ymm1, ymm2, ymm3/m256, imm8
 * Performs an SIMD compare of the four packed double-precision floating-point
 * values in the second source operand (third operand) and the first source
 * operand (second operand) and returns the results of the comparison to the
 * destination operand (first operand). The comparison predicate operand
 * (immediate) specifies the type of comparison performed on each of the pairs
 * of packed values.
 * For the 128-bit intrinsic function with compare predicate values in the
 * range 0-7, the compiler may generate SSE2 instructions if it is warranted
 * for performance reasons.
 */
extern __m128d __cdecl _mm_cmp_pd(__m128d, __m128d, const int);
extern __m256d __cdecl _mm256_cmp_pd(__m256d, __m256d, const int);

/*
 * Compare Packed Single-Precision Floating-Point Values
 * **** VCMPPS xmm1, xmm2, xmm3/m128, imm8
 * **** VCMPPS ymm1, ymm2, ymm3/m256, imm8
 * Performs a SIMD compare of the packed single-precision floating-point values
 * in the second source operand (third operand) and the first source operand
 * (second operand) and returns the results of the comparison to the
 * destination operand (first operand). The comparison predicate operand
 * (immediate) specifies the type of comparison performed on each of the pairs
 * of packed values.
 * For the 128-bit intrinsic function with compare predicate values in the
 * range 0-7, the compiler may generate SSE2 instructions if it is warranted
 * for performance reasons.
 */
extern __m128 __cdecl _mm_cmp_ps(__m128, __m128, const int);
extern __m256 __cdecl _mm256_cmp_ps(__m256, __m256, const int);

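/*
 * Illustrative usage: combining a compare predicate with a sign-bit mask to
 * test whether any element of x is less than the corresponding element of y.
 * A minimal sketch, assuming AVX; _mm256_movemask_pd is declared later in
 * this file.
 */
#if 0
#include <immintrin.h>

int any_less_than(__m256d x, __m256d y)
{
    __m256d lt = _mm256_cmp_pd(x, y, _CMP_LT_OQ); /* all-ones where x < y */
    return _mm256_movemask_pd(lt) != 0;           /* any lane set? */
}
#endif
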
/*
 * Compare Scalar Double-Precision Floating-Point Values
 * **** VCMPSD xmm1, xmm2, xmm3/m64, imm8
 * Compares the low double-precision floating-point values in the second source
 * operand (third operand) and the first source operand (second operand) and
 * returns the results of the comparison to the destination operand (first
 * operand). The comparison predicate operand (immediate operand) specifies the
 * type of comparison performed.
 * For compare predicate values in the range 0-7, the compiler may generate
 * SSE2 instructions if it is warranted for performance reasons.
 */
extern __m128d __cdecl _mm_cmp_sd(__m128d, __m128d, const int);

/*
 * Compare Scalar Single-Precision Floating-Point Values
 * **** VCMPSS xmm1, xmm2, xmm3/m32, imm8
 * Compares the low single-precision floating-point values in the second source
 * operand (third operand) and the first source operand (second operand) and
 * returns the results of the comparison to the destination operand (first
 * operand). The comparison predicate operand (immediate operand) specifies
 * the type of comparison performed.
 * For compare predicate values in the range 0-7, the compiler may generate
 * SSE2 instructions if it is warranted for performance reasons.
 */
extern __m128 __cdecl _mm_cmp_ss(__m128, __m128, const int);

/*
 * Convert Packed Doubleword Integers to
 * Packed Double-Precision Floating-Point Values
 * **** VCVTDQ2PD ymm1, xmm2/m128
 * Converts four packed signed doubleword integers in the source operand to
 * four packed double-precision floating-point values in the destination
 */
extern __m256d __cdecl _mm256_cvtepi32_pd(__m128i);

/*
 * Convert Packed Doubleword Integers to
 * Packed Single-Precision Floating-Point Values
 * **** VCVTDQ2PS ymm1, ymm2/m256
 * Converts eight packed signed doubleword integers in the source operand to
 * eight packed single-precision floating-point values in the destination
 */
extern __m256 __cdecl _mm256_cvtepi32_ps(__m256i);

/*
 * Convert Packed Double-Precision Floating-Point Values to
 * Packed Single-Precision Floating-Point Values
 * **** VCVTPD2PS xmm1, ymm2/m256
 * Converts four packed double-precision floating-point values in the source
 * operand to four packed single-precision floating-point values in the
 * destination
 */
extern __m128 __cdecl _mm256_cvtpd_ps(__m256d);

/*
 * Convert Packed Single Precision Floating-Point Values to
 * Packed Signed Doubleword Integer Values
 * **** VCVTPS2DQ ymm1, ymm2/m256
 * Converts eight packed single-precision floating-point values in the source
 * operand to eight signed doubleword integers in the destination
 */
extern __m256i __cdecl _mm256_cvtps_epi32(__m256);

/*
 * Convert Packed Single Precision Floating-Point Values to
 * Packed Double Precision Floating-Point Values
 * **** VCVTPS2PD ymm1, xmm2/m128
 * Converts four packed single-precision floating-point values in the source
 * operand to four packed double-precision floating-point values in the
 * destination
 */
extern __m256d __cdecl _mm256_cvtps_pd(__m128);

/*
 * Convert with Truncation Packed Double-Precision Floating-Point Values to
 * Packed Doubleword Integers
 * **** VCVTTPD2DQ xmm1, ymm2/m256
 * Converts four packed double-precision floating-point values in the source
 * operand to four packed signed doubleword integers in the destination.
 * When a conversion is inexact, a truncated (round toward zero) value is
 * returned. If a converted result is larger than the maximum signed doubleword
 * integer, the floating-point invalid exception is raised, and if this
 * exception is masked, the indefinite integer value (80000000H) is returned
 */
extern __m128i __cdecl _mm256_cvttpd_epi32(__m256d);

/*
 * Convert Packed Double-Precision Floating-Point Values to
 * Packed Doubleword Integers
 * **** VCVTPD2DQ xmm1, ymm2/m256
 * Converts four packed double-precision floating-point values in the source
 * operand to four packed signed doubleword integers in the destination
 */
extern __m128i __cdecl _mm256_cvtpd_epi32(__m256d);

/*
 * Convert with Truncation Packed Single Precision Floating-Point Values to
 * Packed Signed Doubleword Integer Values
 * **** VCVTTPS2DQ ymm1, ymm2/m256
 * Converts eight packed single-precision floating-point values in the source
 * operand to eight signed doubleword integers in the destination.
 * When a conversion is inexact, a truncated (round toward zero) value is
 * returned. If a converted result is larger than the maximum signed doubleword
 * integer, the floating-point invalid exception is raised, and if this
 * exception is masked, the indefinite integer value (80000000H) is returned
 */
extern __m256i __cdecl _mm256_cvttps_epi32(__m256);

/*
 * Extract packed floating-point values
 * **** VEXTRACTF128 xmm1/m128, ymm2, imm8
 * Extracts 128 bits of packed floating-point values from the source operand
 * at a 128-bit offset selected by imm8[0] into the destination
 */
extern __m128 __cdecl _mm256_extractf128_ps(__m256, const int);
extern __m128d __cdecl _mm256_extractf128_pd(__m256d, const int);
extern __m128i __cdecl _mm256_extractf128_si256(__m256i, const int);

/*
 * Zero All YMM registers
 * **** VZEROALL
 * Zeros contents of all YMM registers
 */
extern void __cdecl _mm256_zeroall(void);

/*
 * Zero Upper bits of YMM registers
 * **** VZEROUPPER
 * Zeros the upper 128 bits of all YMM registers. The lower 128 bits of the
 * registers (the corresponding XMM registers) are unmodified
 */
extern void __cdecl _mm256_zeroupper(void);

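/*
 * Illustrative usage: calling _mm256_zeroupper() before leaving AVX code
 * avoids the SSE/AVX transition penalty when subsequent code uses legacy
 * SSE instructions. A sketch, assuming AVX; legacy_sse_work is a hypothetical
 * function, shown commented out.
 */
#if 0
#include <immintrin.h>

void mixed_isa_boundary(void)
{
    /* ... 256-bit AVX work ... */
    _mm256_zeroupper();   /* zero the upper halves of all YMM registers */
    /* legacy_sse_work(); -- SSE code can now run without a penalty */
}
#endif
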
/*
 * Permute Single-Precision Floating-Point Values
 * **** VPERMILPS ymm1, ymm2, ymm3/m256
 * **** VPERMILPS xmm1, xmm2, xmm3/m128
 * Permute Single-Precision Floating-Point values in the first source operand
 * using 8-bit control fields in the low bytes of the corresponding elements
 * of the shuffle control (second source operand) and store results in the
 * destination
 */
extern __m256 __cdecl _mm256_permutevar_ps(__m256, __m256i);
extern __m128 __cdecl _mm_permutevar_ps(__m128, __m128i);

/*
 * Permute Single-Precision Floating-Point Values
 * **** VPERMILPS ymm1, ymm2/m256, imm8
 * **** VPERMILPS xmm1, xmm2/m128, imm8
 * Permute Single-Precision Floating-Point values in the first source operand
 * using four 2-bit control fields in the 8-bit immediate and store results
 * in the destination
 */
extern __m256 __cdecl _mm256_permute_ps(__m256, int);
extern __m128 __cdecl _mm_permute_ps(__m128, int);

/*
 * Permute Double-Precision Floating-Point Values
 * **** VPERMILPD ymm1, ymm2, ymm3/m256
 * **** VPERMILPD xmm1, xmm2, xmm3/m128
 * Permute Double-Precision Floating-Point values in the first source operand
 * using 8-bit control fields in the low bytes of the second source operand
 * and store results in the destination
 */
extern __m256d __cdecl _mm256_permutevar_pd(__m256d, __m256i);
extern __m128d __cdecl _mm_permutevar_pd(__m128d, __m128i);

/*
 * Permute Double-Precision Floating-Point Values
 * **** VPERMILPD ymm1, ymm2/m256, imm8
 * **** VPERMILPD xmm1, xmm2/m128, imm8
 * Permute Double-Precision Floating-Point values in the first source operand
 * using two 1-bit control fields in the low 2 bits of the 8-bit immediate
 * and store results in the destination
 */
extern __m256d __cdecl _mm256_permute_pd(__m256d, int);
extern __m128d __cdecl _mm_permute_pd(__m128d, int);

/*
 * Permute Floating-Point Values
 * **** VPERM2F128 ymm1, ymm2, ymm3/m256, imm8
 * Permute 128-bit floating-point-containing fields from the first source
 * operand and second source operand using bits in the 8-bit immediate and
 * store results in the destination
 */
extern __m256 __cdecl _mm256_permute2f128_ps(__m256, __m256, int);
extern __m256d __cdecl _mm256_permute2f128_pd(__m256d, __m256d, int);
extern __m256i __cdecl _mm256_permute2f128_si256(__m256i, __m256i, int);

/*
 * Load with Broadcast
 * **** VBROADCASTSS ymm1, m32
 * **** VBROADCASTSS xmm1, m32
 * Load floating point values from the source operand and broadcast to all
 * elements of the destination
 */
extern __m256 __cdecl _mm256_broadcast_ss(float const *);
extern __m128 __cdecl _mm_broadcast_ss(float const *);

/*
 * Load with Broadcast
 * **** VBROADCASTSD ymm1, m64
 * Load floating point values from the source operand and broadcast to all
 * elements of the destination
 */
extern __m256d __cdecl _mm256_broadcast_sd(double const *);

/*
 * Load with Broadcast
 * **** VBROADCASTF128 ymm1, m128
 * Load floating point values from the source operand and broadcast to all
 * elements of the destination
 */
extern __m256 __cdecl _mm256_broadcast_ps(__m128 const *);
extern __m256d __cdecl _mm256_broadcast_pd(__m128d const *);

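/*
 * Illustrative usage: broadcasting one scalar from memory to all elements,
 * e.g. to scale a block of four doubles by a constant. A minimal sketch,
 * assuming AVX; scale4 is a hypothetical helper.
 */
#if 0
#include <immintrin.h>

static __m256d scale4(const double *p, double factor)
{
    __m256d v = _mm256_loadu_pd(p);            /* 4 doubles */
    __m256d f = _mm256_broadcast_sd(&factor);  /* {factor,factor,factor,factor} */
    return _mm256_mul_pd(v, f);
}
#endif
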
/*
 * Insert packed floating-point values
 * **** VINSERTF128 ymm1, ymm2, xmm3/m128, imm8
 * Performs an insertion of 128 bits of packed floating-point values from the
 * second source operand into the destination at a 128-bit offset selected by
 * imm8[0]. The remaining portions of the destination are written by the
 * corresponding fields of the first source operand
 */
extern __m256 __cdecl _mm256_insertf128_ps(__m256, __m128, int);
extern __m256d __cdecl _mm256_insertf128_pd(__m256d, __m128d, int);
extern __m256i __cdecl _mm256_insertf128_si256(__m256i, __m128i, int);

/*
 * Move Aligned Packed Double-Precision Floating-Point Values
 * **** VMOVAPD ymm1, m256
 * **** VMOVAPD m256, ymm1
 * Moves 4 double-precision floating-point values from the source operand to
 * the destination
 */
extern __m256d __cdecl _mm256_load_pd(double const *);
extern void __cdecl _mm256_store_pd(double *, __m256d);

/*
 * Move Aligned Packed Single-Precision Floating-Point Values
 * **** VMOVAPS ymm1, m256
 * **** VMOVAPS m256, ymm1
 * Moves 8 single-precision floating-point values from the source operand to
 * the destination
 */
extern __m256 __cdecl _mm256_load_ps(float const *);
extern void __cdecl _mm256_store_ps(float *, __m256);

/*
 * Move Unaligned Packed Double-Precision Floating-Point Values
 * **** VMOVUPD ymm1, m256
 * **** VMOVUPD m256, ymm1
 * Moves 256 bits of packed double-precision floating-point values from the
 * source operand to the destination
 */
extern __m256d __cdecl _mm256_loadu_pd(double const *);
extern void __cdecl _mm256_storeu_pd(double *, __m256d);

/*
 * Move Unaligned Packed Single-Precision Floating-Point Values
 * **** VMOVUPS ymm1, m256
 * **** VMOVUPS m256, ymm1
 * Moves 256 bits of packed single-precision floating-point values from the
 * source operand to the destination
 */
extern __m256 __cdecl _mm256_loadu_ps(float const *);
extern void __cdecl _mm256_storeu_ps(float *, __m256);

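/*
 * Illustrative note: the aligned forms (_mm256_load_pd/_mm256_store_pd and
 * friends) require 32-byte-aligned addresses, since VMOVAPD/VMOVAPS fault on
 * misaligned operands; the *u* variants accept any address. A minimal sketch,
 * assuming AVX and MSVC's __declspec(align) for the aligned buffer.
 */
#if 0
#include <immintrin.h>

void aligned_vs_unaligned(const double *unaligned_src)
{
    __declspec(align(32)) double buf[4];
    __m256d v = _mm256_loadu_pd(unaligned_src); /* any alignment is fine */
    _mm256_store_pd(buf, v);                    /* buf is 32-byte aligned */
}
#endif
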
/*
 * Move Aligned Packed Integer Values
 * **** VMOVDQA ymm1, m256
 * **** VMOVDQA m256, ymm1
 * Moves 256 bits of packed integer values from the source operand to the
 * destination
 */
extern __m256i __cdecl _mm256_load_si256(__m256i const *);
extern void __cdecl _mm256_store_si256(__m256i *, __m256i);

/*
 * Move Unaligned Packed Integer Values
 * **** VMOVDQU ymm1, m256
 * **** VMOVDQU m256, ymm1
 * Moves 256 bits of packed integer values from the source operand to the
 * destination
 */
extern __m256i __cdecl _mm256_loadu_si256(__m256i const *);
extern void __cdecl _mm256_storeu_si256(__m256i *, __m256i);

/*
 * Load Two Unaligned Packed 128-bit Values
 * Loads two potentially unaligned 128-bit values
 * and combines them into one 256-bit value.
 *
 * The data types here (float const*, double const* and __m128i const*)
 * were chosen for consistency with the underlying _mm_loadu_{ps,pd,si128}
 * intrinsics.
 */

#define _mm256_loadu2_m128(/* float const* */ hiaddr, \
                           /* float const* */ loaddr) \
    _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr))

#define _mm256_loadu2_m128d(/* double const* */ hiaddr, \
                            /* double const* */ loaddr) \
    _mm256_set_m128d(_mm_loadu_pd(hiaddr), _mm_loadu_pd(loaddr))

#define _mm256_loadu2_m128i(/* __m128i const* */ hiaddr, \
                            /* __m128i const* */ loaddr) \
    _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))

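/*
 * Illustrative usage: gathering two separate 128-bit rows into one 256-bit
 * value with the macro above. A minimal sketch, assuming AVX; row0 and row1
 * are hypothetical 4-float buffers.
 */
#if 0
#include <immintrin.h>

__m256 gather_two_rows(const float *row0, const float *row1)
{
    /* row1 becomes the high 128 bits, row0 the low 128 bits */
    return _mm256_loadu2_m128(row1, row0);
}
#endif
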
/*
 * Store 256-bit Value To Two Unaligned 128-bit Locations
 * Stores the high and low 128-bit halves of a 256-bit value
 * to two different potentially unaligned addresses.
 */

#define _mm256_storeu2_m128(/* float* */ hiaddr, /* float* */ loaddr, \
                            /* __m256 */ a) \
    do { \
        __m256 _a = (a); /* reference a only once in macro body */ \
        _mm_storeu_ps((loaddr), _mm256_castps256_ps128(_a)); \
        _mm_storeu_ps((hiaddr), _mm256_extractf128_ps(_a, 0x1)); \
    } while (0)

#define _mm256_storeu2_m128d(/* double* */ hiaddr, /* double* */ loaddr, \
                             /* __m256d */ a) \
    do { \
        __m256d _a = (a); /* reference a only once in macro body */ \
        _mm_storeu_pd((loaddr), _mm256_castpd256_pd128(_a)); \
        _mm_storeu_pd((hiaddr), _mm256_extractf128_pd(_a, 0x1)); \
    } while (0)

#define _mm256_storeu2_m128i(/* __m128i* */ hiaddr, /* __m128i* */ loaddr, \
                             /* __m256i */ a) \
    do { \
        __m256i _a = (a); /* reference a only once in macro body */ \
        _mm_storeu_si128((loaddr), _mm256_castsi256_si128(_a)); \
        _mm_storeu_si128((hiaddr), _mm256_extractf128_si256(_a, 0x1)); \
    } while (0)

/*
 * Conditional SIMD Packed Loads and Stores
 * **** VMASKMOVPD xmm1, xmm2, m128
 * **** VMASKMOVPD ymm1, ymm2, m256
 * **** VMASKMOVPD m128, xmm1, xmm2
 * **** VMASKMOVPD m256, ymm1, ymm2
 *
 * Load forms:
 * Load packed values from the 128-bit (XMM forms) or 256-bit (YMM forms)
 * memory location (third operand) into the destination XMM or YMM register
 * (first operand) using a mask in the first source operand (second operand).
 *
 * Store forms:
 * Stores packed values from the XMM or YMM register in the second source
 * operand (third operand) into the 128-bit (XMM forms) or 256-bit (YMM forms)
 * memory location using a mask in the first source operand (second operand).
 * Stores are atomic.
 */
extern __m256d __cdecl _mm256_maskload_pd(double const *, __m256i);
extern void __cdecl _mm256_maskstore_pd(double *, __m256i, __m256d);
extern __m128d __cdecl _mm_maskload_pd(double const *, __m128i);
extern void __cdecl _mm_maskstore_pd(double *, __m128i, __m128d);

/*
 * Conditional SIMD Packed Loads and Stores
 * **** VMASKMOVPS xmm1, xmm2, m128
 * **** VMASKMOVPS ymm1, ymm2, m256
 * **** VMASKMOVPS m128, xmm1, xmm2
 * **** VMASKMOVPS m256, ymm1, ymm2
 *
 * Load forms:
 * Load packed values from the 128-bit (XMM forms) or 256-bit (YMM forms)
 * memory location (third operand) into the destination XMM or YMM register
 * (first operand) using a mask in the first source operand (second operand).
 *
 * Store forms:
 * Stores packed values from the XMM or YMM register in the second source
 * operand (third operand) into the 128-bit (XMM forms) or 256-bit (YMM forms)
 * memory location using a mask in the first source operand (second operand).
 * Stores are atomic.
 */
extern __m256 __cdecl _mm256_maskload_ps(float const *, __m256i);
extern void __cdecl _mm256_maskstore_ps(float *, __m256i, __m256);
extern __m128 __cdecl _mm_maskload_ps(float const *, __m128i);
extern void __cdecl _mm_maskstore_ps(float *, __m128i, __m128);

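/*
 * Illustrative usage: handling a loop remainder of n < 4 doubles with a
 * masked load/store instead of a scalar tail. Elements whose mask sign bit
 * is clear are not read or written. A minimal sketch, assuming AVX;
 * scale_tail is a hypothetical helper.
 */
#if 0
#include <immintrin.h>

static void scale_tail(double *p, int n, __m256d factor) /* n in [0,4) */
{
    /* an all-ones element (sign bit set) enables lane i when i < n */
    __m256i mask = _mm256_setr_epi64x(n > 0 ? -1 : 0, n > 1 ? -1 : 0,
                                      n > 2 ? -1 : 0, 0);
    __m256d v = _mm256_maskload_pd(p, mask);
    _mm256_maskstore_pd(p, mask, _mm256_mul_pd(v, factor));
}
#endif
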
/*
 * Replicate Single-Precision Floating-Point Values
 * **** VMOVSHDUP ymm1, ymm2/m256
 * Duplicates odd-indexed single-precision floating-point values from the
 * source operand
 */
extern __m256 __cdecl _mm256_movehdup_ps(__m256);

/*
 * Replicate Single-Precision Floating-Point Values
 * **** VMOVSLDUP ymm1, ymm2/m256
 * Duplicates even-indexed single-precision floating-point values from the
 * source operand
 */
extern __m256 __cdecl _mm256_moveldup_ps(__m256);

/*
 * Replicate Double-Precision Floating-Point Values
 * **** VMOVDDUP ymm1, ymm2/m256
 * Duplicates even-indexed double-precision floating-point values from the
 * source operand
 */
extern __m256d __cdecl _mm256_movedup_pd(__m256d);

/*
 * Move Unaligned Integer
 * **** VLDDQU ymm1, m256
 * The instruction is functionally similar to VMOVDQU YMM, m256 for loading
 * from memory. That is: 32 bytes of data starting at an address specified by
 * the source memory operand are fetched from memory and placed in the
 * destination
 */
extern __m256i __cdecl _mm256_lddqu_si256(__m256i const *);

/*
 * Store Packed Integers Using Non-Temporal Hint
 * **** VMOVNTDQ m256, ymm1
 * Moves the packed integers in the source operand to the destination using a
 * non-temporal hint to prevent caching of the data during the write to memory
 */
extern void __cdecl _mm256_stream_si256(__m256i *, __m256i);

/*
 * Store Packed Double-Precision Floating-Point Values Using Non-Temporal Hint
 * **** VMOVNTPD m256, ymm1
 * Moves the packed double-precision floating-point values in the source
 * operand to the destination operand using a non-temporal hint to prevent
 * caching of the data during the write to memory
 */
extern void __cdecl _mm256_stream_pd(double *, __m256d);

/*
 * Store Packed Single-Precision Floating-Point Values Using Non-Temporal Hint
 * **** VMOVNTPS m256, ymm1
 * Moves the packed single-precision floating-point values in the source
 * operand to the destination operand using a non-temporal hint to prevent
 * caching of the data during the write to memory
 */
extern void __cdecl _mm256_stream_ps(float *, __m256);

/*
 * Compute Approximate Reciprocals of Packed Single-Precision Floating-Point
 * Values
 * **** VRCPPS ymm1, ymm2/m256
 * Performs an SIMD computation of the approximate reciprocals of the eight
 * packed single-precision floating-point values in the source operand and
 * stores the packed single-precision floating-point results in the destination
 */
extern __m256 __cdecl _mm256_rcp_ps(__m256);

/*
 * Compute Approximate Reciprocals of Square Roots of
 * Packed Single-Precision Floating-Point Values
 * **** VRSQRTPS ymm1, ymm2/m256
 * Performs an SIMD computation of the approximate reciprocals of the square
 * roots of the eight packed single-precision floating-point values in the
 * source operand and stores the packed single-precision floating-point results
 * in the destination
 */
extern __m256 __cdecl _mm256_rsqrt_ps(__m256);

/*
 * Square Root of Double-Precision Floating-Point Values
 * **** VSQRTPD ymm1, ymm2/m256
 * Performs an SIMD computation of the square roots of the four packed
 * double-precision floating-point values in the source operand and stores
 * the packed double-precision floating-point results in the destination
 */
extern __m256d __cdecl _mm256_sqrt_pd(__m256d);

/*
 * Square Root of Single-Precision Floating-Point Values
 * **** VSQRTPS ymm1, ymm2/m256
 * Performs an SIMD computation of the square roots of the eight packed
 * single-precision floating-point values in the source operand and stores the
 * packed single-precision floating-point results in the destination
 */
extern __m256 __cdecl _mm256_sqrt_ps(__m256);

/*
 * Round Packed Double-Precision Floating-Point Values
 * **** VROUNDPD ymm1,ymm2/m256,imm8
 * Rounds the four double-precision floating-point values in the source
 * operand by the rounding mode specified in the immediate operand and places
 * the result in the destination. The rounding process rounds the input to an
 * integral value and returns the result as a double-precision floating-point
 * value. The Precision Floating-Point Exception is signaled according to the
 * immediate operand. If any source operand is an SNaN then it will be
 * converted to a QNaN.
 */
extern __m256d __cdecl _mm256_round_pd(__m256d, int);
#define _mm256_ceil_pd(val) _mm256_round_pd((val), _MM_FROUND_CEIL)
#define _mm256_floor_pd(val) _mm256_round_pd((val), _MM_FROUND_FLOOR)

/*
 * Round Packed Single-Precision Floating-Point Values
 * **** VROUNDPS ymm1,ymm2/m256,imm8
 * Rounds the eight single-precision floating-point values in the source
 * operand by the rounding mode specified in the immediate operand and places
 * the result in the destination. The rounding process rounds the input to an
 * integral value and returns the result as a single-precision floating-point
 * value. The Precision Floating-Point Exception is signaled according to the
 * immediate operand. If any source operand is an SNaN then it will be
 * converted to a QNaN.
 */
extern __m256 __cdecl _mm256_round_ps(__m256, int);
#define _mm256_ceil_ps(val) _mm256_round_ps((val), _MM_FROUND_CEIL)
#define _mm256_floor_ps(val) _mm256_round_ps((val), _MM_FROUND_FLOOR)

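/*
 * Illustrative usage: the ceil/floor macros above are simply _mm256_round_pd
 * with fixed rounding controls. A minimal sketch, assuming AVX.
 */
#if 0
#include <immintrin.h>

__m256d round_examples(__m256d v)
{
    __m256d up   = _mm256_ceil_pd(v);   /* round toward +infinity */
    __m256d down = _mm256_floor_pd(v);  /* round toward -infinity */
    return _mm256_add_pd(up, down);
}
#endif
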
/*
 * Unpack and Interleave High Packed Double-Precision Floating-Point Values
 * **** VUNPCKHPD ymm1,ymm2,ymm3/m256
 * Performs an interleaved unpack of the high double-precision floating-point
 * values from the first source operand and the second source operand.
 */
extern __m256d __cdecl _mm256_unpackhi_pd(__m256d, __m256d);

/*
 * Unpack and Interleave High Packed Single-Precision Floating-Point Values
 * **** VUNPCKHPS ymm1,ymm2,ymm3
 * Performs an interleaved unpack of the high single-precision floating-point
 * values from the first source operand and the second source operand
 */
extern __m256 __cdecl _mm256_unpackhi_ps(__m256, __m256);

/*
 * Unpack and Interleave Low Packed Double-Precision Floating-Point Values
 * **** VUNPCKLPD ymm1,ymm2,ymm3/m256
 * Performs an interleaved unpack of the low double-precision floating-point
 * values from the first source operand and the second source operand
 */
extern __m256d __cdecl _mm256_unpacklo_pd(__m256d, __m256d);

/*
 * Unpack and Interleave Low Packed Single-Precision Floating-Point Values
 * **** VUNPCKLPS ymm1,ymm2,ymm3
 * Performs an interleaved unpack of the low single-precision floating-point
 * values from the first source operand and the second source operand
 */
extern __m256 __cdecl _mm256_unpacklo_ps(__m256, __m256);

/*
 * Packed Bit Test
 * **** VPTEST ymm1, ymm2/m256
 * VPTEST sets the ZF flag if all bits in the result of the bitwise AND of the
 * first source operand and the second source operand are 0. VPTEST sets the
 * CF flag if all bits in the result of the bitwise AND of the second source
 * operand and the logical NOT of the first source operand are 0.
 */
extern int __cdecl _mm256_testz_si256(__m256i, __m256i);
#define _mm256_test_all_zeros(mask, val) \
    _mm256_testz_si256((mask), (val))

extern int __cdecl _mm256_testc_si256(__m256i, __m256i);
#define _mm256_test_all_ones(val) \
    _mm256_testc_si256((val), _mm256_cmpeq_epi32((val),(val)))

extern int __cdecl _mm256_testnzc_si256(__m256i, __m256i);
#define _mm256_test_mix_ones_zeros(mask, val) \
    _mm256_testnzc_si256((mask), (val))

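/*
 * Illustrative usage: branching on a whole-vector property with the test
 * intrinsics above. A sketch, assuming AVX only; since _mm256_xor_si256 is
 * an AVX2 intrinsic, the XOR is done through the floating-point domain casts
 * declared later in this file.
 */
#if 0
#include <immintrin.h>

int vectors_equal(__m256i a, __m256i b)
{
    __m256i diff = _mm256_castpd_si256(_mm256_xor_pd(
        _mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
    return _mm256_testz_si256(diff, diff); /* 1 if a == b bitwise */
}
#endif
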
/*
 * Packed Bit Test
 * **** VTESTPD ymm1, ymm2/m256
 * **** VTESTPD xmm1, xmm2/m128
 * VTESTPD performs a bitwise comparison of all the sign bits of the
 * double-precision elements in the first source operand and the corresponding
 * sign bits in the second source operand. If the AND of the two sets of bits
 * produces all zeros, the ZF is set, else the ZF is clear. If the AND NOT of
 * the source sign bits with the dest sign bits produces all zeros, the CF is
 * set, else the CF is clear
 */
extern int __cdecl _mm256_testz_pd(__m256d, __m256d);
extern int __cdecl _mm256_testc_pd(__m256d, __m256d);
extern int __cdecl _mm256_testnzc_pd(__m256d, __m256d);
extern int __cdecl _mm_testz_pd(__m128d, __m128d);
extern int __cdecl _mm_testc_pd(__m128d, __m128d);
extern int __cdecl _mm_testnzc_pd(__m128d, __m128d);

/*
 * Packed Bit Test
 * **** VTESTPS ymm1, ymm2/m256
 * **** VTESTPS xmm1, xmm2/m128
 * VTESTPS performs a bitwise comparison of all the sign bits of the packed
 * single-precision elements in the first source operand and the corresponding
 * sign bits in the second source operand. If the AND of the two sets of bits
 * produces all zeros, the ZF is set, else the ZF is clear. If the AND NOT of
 * the source sign bits with the dest sign bits produces all zeros, the CF is
 * set, else the CF is clear
 */
extern int __cdecl _mm256_testz_ps(__m256, __m256);
extern int __cdecl _mm256_testc_ps(__m256, __m256);
extern int __cdecl _mm256_testnzc_ps(__m256, __m256);
extern int __cdecl _mm_testz_ps(__m128, __m128);
extern int __cdecl _mm_testc_ps(__m128, __m128);
extern int __cdecl _mm_testnzc_ps(__m128, __m128);

/*
 * Extract Double-Precision Floating-Point Sign Mask
 * **** VMOVMSKPD r32, ymm2
 * Extracts the sign bits from the packed double-precision floating-point
 * values in the source operand, formats them into a 4-bit mask, and stores
 * the mask in the destination
 */
extern int __cdecl _mm256_movemask_pd(__m256d);

/*
 * Extract Single-Precision Floating-Point Sign Mask
 * **** VMOVMSKPS r32, ymm2
 * Extracts the sign bits from the packed single-precision floating-point
 * values in the source operand, formats them into an 8-bit mask, and stores
 * the mask in the destination
 */
extern int __cdecl _mm256_movemask_ps(__m256);

/*
 * Return 256-bit vector with all elements set to 0
 */
extern __m256d __cdecl _mm256_setzero_pd(void);
extern __m256 __cdecl _mm256_setzero_ps(void);
extern __m256i __cdecl _mm256_setzero_si256(void);

/*
 * Return 256-bit vector initialized to specified arguments
 */
extern __m256d __cdecl _mm256_set_pd(double, double, double, double);
extern __m256 __cdecl _mm256_set_ps(float, float, float, float,
                                    float, float, float, float);
extern __m256i __cdecl _mm256_set_epi8(char, char, char, char,
                                       char, char, char, char,
                                       char, char, char, char,
                                       char, char, char, char,
                                       char, char, char, char,
                                       char, char, char, char,
                                       char, char, char, char,
                                       char, char, char, char);
extern __m256i __cdecl _mm256_set_epi16(short, short, short, short,
                                        short, short, short, short,
                                        short, short, short, short,
                                        short, short, short, short);
extern __m256i __cdecl _mm256_set_epi32(int, int, int, int,
                                        int, int, int, int);
extern __m256i __cdecl _mm256_set_epi64x(__int64, __int64,
                                         __int64, __int64);

#define _mm256_set_m128(/* __m128 */ hi, /* __m128 */ lo) \
    _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1)

#define _mm256_set_m128d(/* __m128d */ hi, /* __m128d */ lo) \
    _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), (hi), 0x1)

#define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \
    _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)

extern __m256d __cdecl _mm256_setr_pd(double, double, double, double);
extern __m256 __cdecl _mm256_setr_ps(float, float, float, float,
                                     float, float, float, float);
extern __m256i __cdecl _mm256_setr_epi8(char, char, char, char,
                                        char, char, char, char,
                                        char, char, char, char,
                                        char, char, char, char,
                                        char, char, char, char,
                                        char, char, char, char,
                                        char, char, char, char,
                                        char, char, char, char);
extern __m256i __cdecl _mm256_setr_epi16(short, short, short, short,
                                         short, short, short, short,
                                         short, short, short, short,
                                         short, short, short, short);
extern __m256i __cdecl _mm256_setr_epi32(int, int, int, int,
                                         int, int, int, int);
extern __m256i __cdecl _mm256_setr_epi64x(__int64, __int64,
                                          __int64, __int64);
#define _mm256_setr_m128(lo, hi) _mm256_set_m128((hi), (lo))
#define _mm256_setr_m128d(lo, hi) _mm256_set_m128d((hi), (lo))
#define _mm256_setr_m128i(lo, hi) _mm256_set_m128i((hi), (lo))

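/*
 * Illustrative note on argument order: _mm256_set_* takes elements from the
 * highest position down, while _mm256_setr_* ("reversed") takes them from
 * element 0 up. A minimal sketch, assuming AVX; the two vectors below are
 * identical.
 */
#if 0
#include <immintrin.h>

void set_vs_setr(void)
{
    __m256d a = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);  /* element 0 is 0.0 */
    __m256d b = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0); /* element 0 is 0.0 */
    (void)a; (void)b;
}
#endif
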
/*
 * Return 256-bit vector with all elements initialized to specified scalar
 */
extern __m256d __cdecl _mm256_set1_pd(double);
extern __m256 __cdecl _mm256_set1_ps(float);
extern __m256i __cdecl _mm256_set1_epi8(char);
extern __m256i __cdecl _mm256_set1_epi16(short);
extern __m256i __cdecl _mm256_set1_epi32(int);
extern __m256i __cdecl _mm256_set1_epi64x(long long);

/*
 * Support intrinsic functions to do vector type casts. These functions do
 * not introduce extra moves to generated code. When a cast is done from a
 * 128-bit to a 256-bit type, the low 128 bits of the 256-bit result contain
 * the source parameter value; the upper 128 bits of the result are undefined.
 */
extern __m256 __cdecl _mm256_castpd_ps(__m256d);
extern __m256d __cdecl _mm256_castps_pd(__m256);
extern __m256i __cdecl _mm256_castps_si256(__m256);
extern __m256i __cdecl _mm256_castpd_si256(__m256d);
extern __m256 __cdecl _mm256_castsi256_ps(__m256i);
extern __m256d __cdecl _mm256_castsi256_pd(__m256i);
extern __m128 __cdecl _mm256_castps256_ps128(__m256);
extern __m128d __cdecl _mm256_castpd256_pd128(__m256d);
extern __m128i __cdecl _mm256_castsi256_si128(__m256i);
extern __m256 __cdecl _mm256_castps128_ps256(__m128);
extern __m256d __cdecl _mm256_castpd128_pd256(__m128d);
extern __m256i __cdecl _mm256_castsi128_si256(__m128i);


/*
 * Support for half-float conversions to/from normal float.
 * The immediate argument is used for special MXCSR overrides.
 */
extern __m128 __cdecl _mm_cvtph_ps(__m128i);
extern __m256 __cdecl _mm256_cvtph_ps(__m128i);
extern __m128i __cdecl _mm_cvtps_ph(__m128 m1, const int imm);
extern __m128i __cdecl _mm256_cvtps_ph(__m256, int);

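/*
 * Illustrative usage: a float -> half -> float round trip with the F16C
 * conversions above; the immediate selects the rounding mode for the
 * narrowing conversion. A minimal sketch, assuming F16C support.
 */
#if 0
#include <immintrin.h>

__m256 half_round_trip(__m256 v)
{
    __m128i half = _mm256_cvtps_ph(v, _MM_FROUND_TO_NEAREST_INT); /* 8 x 16-bit halves */
    return _mm256_cvtph_ps(half);                                 /* back to 8 floats */
}
#endif
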
/*
 * Return a vector with all elements set to zero. It is recommended to use the
 * result of this intrinsic as an input argument to another intrinsic when the
 * initial value is irrelevant.
 */
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_pd _mm_setzero_pd
#define _mm_undefined_si128 _mm_setzero_si128
#define _mm256_undefined_ps _mm256_setzero_ps
#define _mm256_undefined_pd _mm256_setzero_pd
#define _mm256_undefined_si256 _mm256_setzero_si256

/*
 * The list of extended control registers.
 * Currently, the list includes only one register.
 */
#define _XCR_XFEATURE_ENABLED_MASK 0

/* Returns the content of the specified extended control register */
extern unsigned __int64 __cdecl _xgetbv(unsigned int);

/* Writes the value to the specified extended control register */
extern void __cdecl _xsetbv(unsigned int, unsigned __int64);


/*
 * Performs a full or partial save of the enabled processor state components
 * using the specified memory address location and a mask.
 */
extern void __cdecl _xsave(void *, unsigned __int64);
#if defined (_M_X64)
extern void __cdecl _xsave64(void *, unsigned __int64);
#endif /* defined (_M_X64) */

/*
 * Performs a full or partial save of the enabled processor state components
 * using the specified memory address location and a mask.
 * Optimizes the state save operation if possible.
 */
extern void __cdecl _xsaveopt(void *, unsigned __int64);
#if defined (_M_X64)
extern void __cdecl _xsaveopt64(void *, unsigned __int64);
#endif /* defined (_M_X64) */

/*
 * Performs a full or partial restore of the enabled processor states
 * using the state information stored in the specified memory address location
 * and a mask.
 */
extern void __cdecl _xrstor(void const *, unsigned __int64);
#if defined (_M_X64)
extern void __cdecl _xrstor64(void const *, unsigned __int64);
#endif /* defined (_M_X64) */

/*
 * Saves the current state of the x87 FPU, MMX technology, XMM,
 * and MXCSR registers to the specified 512-byte memory location.
 */
extern void __cdecl _fxsave(void *);
#if defined (_M_X64)
extern void __cdecl _fxsave64(void *);
#endif /* defined (_M_X64) */

/*
 * Restores the current state of the x87 FPU, MMX technology, XMM,
 * and MXCSR registers from the specified 512-byte memory location.
 */
extern void __cdecl _fxrstor(void const *);
#if defined (_M_X64)
extern void __cdecl _fxrstor64(void const *);
#endif /* defined (_M_X64) */

/*
 * Performs one attempt to generate a hardware random value.
 * The generated value is written to the given memory location and the success
 * status is returned: 1 if the hardware could generate a valid random number
 * and 0 otherwise.
 */
extern int __cdecl _rdrand16_step(unsigned short *);
extern int __cdecl _rdrand32_step(unsigned int *);
#if defined (_M_X64)
extern int __cdecl _rdrand64_step(unsigned __int64 *);
#endif /* defined (_M_X64) */

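/*
 * Illustrative usage: _rdrand32_step can transiently fail (returning 0), so
 * callers typically retry a bounded number of times. A minimal sketch,
 * assuming RDRAND support.
 */
#if 0
#include <immintrin.h>

int random_u32(unsigned int *out)
{
    for (int tries = 0; tries < 10; ++tries) {
        if (_rdrand32_step(out)) {
            return 1;  /* success: *out holds a hardware random value */
        }
    }
    return 0;          /* give up after repeated failures */
}
#endif
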
1267 #if defined (_M_X64)
1268 /*
1269  * Return the value of the FS/GS segment base register.
1270  */
1271 extern unsigned int __cdecl _readfsbase_u32();
1272 extern unsigned int __cdecl _readgsbase_u32();
1273 extern unsigned __int64 __cdecl _readfsbase_u64();
1274 extern unsigned __int64 __cdecl _readgsbase_u64();
1275 
1276 /*
1277  * Write the value to the FS/GS segment base register.
1278  */
1279 extern void __cdecl _writefsbase_u32(unsigned int);
1280 extern void __cdecl _writegsbase_u32(unsigned int);
1281 extern void __cdecl _writefsbase_u64(unsigned __int64);
1282 extern void __cdecl _writegsbase_u64(unsigned __int64);
1283 #endif /* defined (_M_X64) */
1284 
1285 /*
1286  * Perform FMA (Fused Multiply-and-Add) operations.
1287  */
1288 extern __m128 __cdecl _mm_fmadd_ps(__m128, __m128, __m128);
1289 extern __m128d __cdecl _mm_fmadd_pd(__m128d, __m128d, __m128d);
1290 extern __m128 __cdecl _mm_fmadd_ss(__m128, __m128, __m128);
1291 extern __m128d __cdecl _mm_fmadd_sd(__m128d, __m128d, __m128d);
1292 extern __m128 __cdecl _mm_fmsub_ps(__m128, __m128, __m128);
1293 extern __m128d __cdecl _mm_fmsub_pd(__m128d, __m128d, __m128d);
1294 extern __m128 __cdecl _mm_fmsub_ss(__m128, __m128, __m128);
1295 extern __m128d __cdecl _mm_fmsub_sd(__m128d, __m128d, __m128d);
1296 extern __m128 __cdecl _mm_fnmadd_ps(__m128, __m128, __m128);
1297 extern __m128d __cdecl _mm_fnmadd_pd(__m128d, __m128d, __m128d);
1298 extern __m128 __cdecl _mm_fnmadd_ss(__m128, __m128, __m128);
1299 extern __m128d __cdecl _mm_fnmadd_sd(__m128d, __m128d, __m128d);
1300 extern __m128 __cdecl _mm_fnmsub_ps(__m128, __m128, __m128);
1301 extern __m128d __cdecl _mm_fnmsub_pd(__m128d, __m128d, __m128d);
1302 extern __m128 __cdecl _mm_fnmsub_ss(__m128, __m128, __m128);
1303 extern __m128d __cdecl _mm_fnmsub_sd(__m128d, __m128d, __m128d);
1304 
1305 extern __m256 __cdecl _mm256_fmadd_ps(__m256, __m256, __m256);
1306 extern __m256d __cdecl _mm256_fmadd_pd(__m256d, __m256d, __m256d);
1307 extern __m256 __cdecl _mm256_fmsub_ps(__m256, __m256, __m256);
1308 extern __m256d __cdecl _mm256_fmsub_pd(__m256d, __m256d, __m256d);
1309 extern __m256 __cdecl _mm256_fnmadd_ps(__m256, __m256, __m256);
1310 extern __m256d __cdecl _mm256_fnmadd_pd(__m256d, __m256d, __m256d);
1311 extern __m256 __cdecl _mm256_fnmsub_ps(__m256, __m256, __m256);
1312 extern __m256d __cdecl _mm256_fnmsub_pd(__m256d, __m256d, __m256d);
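/*
 * Illustrative sketch (not part of the original header; the helper name
 * is hypothetical): per element, fmadd computes a*b+c, fmsub a*b-c,
 * fnmadd -(a*b)+c and fnmsub -(a*b)-c, each with a single rounding step.
 */
#if 0
static __m256 axpy8(__m256 a, __m256 x, __m256 y)
{
    return _mm256_fmadd_ps(a, x, y);   /* a*x + y in one fused step */
}
#endif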
1313 
1314 
1315 /*
1316  * Fused Multiply-and-Add/Subtract and Multiply-and-Subtract/Add operations.
1317  */
1318 extern __m128 __cdecl _mm_fmaddsub_ps(__m128, __m128, __m128);
1319 extern __m128d __cdecl _mm_fmaddsub_pd(__m128d, __m128d, __m128d);
1320 extern __m128 __cdecl _mm_fmsubadd_ps(__m128, __m128, __m128);
1321 extern __m128d __cdecl _mm_fmsubadd_pd(__m128d, __m128d, __m128d);
1322 
1323 extern __m256 __cdecl _mm256_fmaddsub_ps(__m256, __m256, __m256);
1324 extern __m256d __cdecl _mm256_fmaddsub_pd(__m256d, __m256d, __m256d);
1325 extern __m256 __cdecl _mm256_fmsubadd_ps(__m256, __m256, __m256);
1326 extern __m256d __cdecl _mm256_fmsubadd_pd(__m256d, __m256d, __m256d);
1327 
1328 
1329 /*
1330  * Integer 256-bit vector comparison operations.
1331  */
1332 extern __m256i __cdecl _mm256_cmpeq_epi8(__m256i, __m256i);
1333 extern __m256i __cdecl _mm256_cmpeq_epi16(__m256i, __m256i);
1334 extern __m256i __cdecl _mm256_cmpeq_epi32(__m256i, __m256i);
1335 extern __m256i __cdecl _mm256_cmpeq_epi64(__m256i, __m256i);
1336 
1337 extern __m256i __cdecl _mm256_cmpgt_epi8(__m256i, __m256i);
1338 extern __m256i __cdecl _mm256_cmpgt_epi16(__m256i, __m256i);
1339 extern __m256i __cdecl _mm256_cmpgt_epi32(__m256i, __m256i);
1340 extern __m256i __cdecl _mm256_cmpgt_epi64(__m256i, __m256i);
1341 
1342 
1343 /*
1344  * Integer 256-bit vector MIN/MAX operations.
1345  */
1346 extern __m256i __cdecl _mm256_max_epi8(__m256i, __m256i);
1347 extern __m256i __cdecl _mm256_max_epi16(__m256i, __m256i);
1348 extern __m256i __cdecl _mm256_max_epi32(__m256i, __m256i);
1349 extern __m256i __cdecl _mm256_max_epu8(__m256i, __m256i);
1350 extern __m256i __cdecl _mm256_max_epu16(__m256i, __m256i);
1351 extern __m256i __cdecl _mm256_max_epu32(__m256i, __m256i);
1352 
1353 extern __m256i __cdecl _mm256_min_epi8(__m256i, __m256i);
1354 extern __m256i __cdecl _mm256_min_epi16(__m256i, __m256i);
1355 extern __m256i __cdecl _mm256_min_epi32(__m256i, __m256i);
1356 extern __m256i __cdecl _mm256_min_epu8(__m256i, __m256i);
1357 extern __m256i __cdecl _mm256_min_epu16(__m256i, __m256i);
1358 extern __m256i __cdecl _mm256_min_epu32(__m256i, __m256i);
1359 
1360 
1361 /*
1362  * Integer 256-bit vector logical operations.
1363  */
1364 extern __m256i __cdecl _mm256_and_si256(__m256i, __m256i);
1365 extern __m256i __cdecl _mm256_andnot_si256(__m256i, __m256i);
1366 extern __m256i __cdecl _mm256_or_si256(__m256i, __m256i);
1367 extern __m256i __cdecl _mm256_xor_si256(__m256i, __m256i);
1368 
1369 
1370 /*
1371  * Integer 256-bit vector arithmetic operations.
1372  */
1373 extern __m256i __cdecl _mm256_abs_epi8(__m256i);
1374 extern __m256i __cdecl _mm256_abs_epi16(__m256i);
1375 extern __m256i __cdecl _mm256_abs_epi32(__m256i);
1376 
1377 extern __m256i __cdecl _mm256_add_epi8(__m256i, __m256i);
1378 extern __m256i __cdecl _mm256_add_epi16(__m256i, __m256i);
1379 extern __m256i __cdecl _mm256_add_epi32(__m256i, __m256i);
1380 extern __m256i __cdecl _mm256_add_epi64(__m256i, __m256i);
1381 
1382 extern __m256i __cdecl _mm256_adds_epi8(__m256i, __m256i);
1383 extern __m256i __cdecl _mm256_adds_epi16(__m256i, __m256i);
1384 extern __m256i __cdecl _mm256_adds_epu8(__m256i, __m256i);
1385 extern __m256i __cdecl _mm256_adds_epu16(__m256i, __m256i);
1386 
1387 extern __m256i __cdecl _mm256_sub_epi8(__m256i, __m256i);
1388 extern __m256i __cdecl _mm256_sub_epi16(__m256i, __m256i);
1389 extern __m256i __cdecl _mm256_sub_epi32(__m256i, __m256i);
1390 extern __m256i __cdecl _mm256_sub_epi64(__m256i, __m256i);
1391 
1392 extern __m256i __cdecl _mm256_subs_epi8(__m256i, __m256i);
1393 extern __m256i __cdecl _mm256_subs_epi16(__m256i, __m256i);
1394 extern __m256i __cdecl _mm256_subs_epu8(__m256i, __m256i);
1395 extern __m256i __cdecl _mm256_subs_epu16(__m256i, __m256i);
1396 
1397 extern __m256i __cdecl _mm256_avg_epu8(__m256i, __m256i);
1398 extern __m256i __cdecl _mm256_avg_epu16(__m256i, __m256i);
1399 
1400 extern __m256i __cdecl _mm256_hadd_epi16(__m256i, __m256i);
1401 extern __m256i __cdecl _mm256_hadd_epi32(__m256i, __m256i);
1402 extern __m256i __cdecl _mm256_hadds_epi16(__m256i, __m256i);
1403 
1404 extern __m256i __cdecl _mm256_hsub_epi16(__m256i, __m256i);
1405 extern __m256i __cdecl _mm256_hsub_epi32(__m256i, __m256i);
1406 extern __m256i __cdecl _mm256_hsubs_epi16(__m256i, __m256i);
1407 
1408 extern __m256i __cdecl _mm256_madd_epi16(__m256i, __m256i);
1409 extern __m256i __cdecl _mm256_maddubs_epi16(__m256i, __m256i);
1410 
1411 extern __m256i __cdecl _mm256_mulhi_epi16(__m256i, __m256i);
1412 extern __m256i __cdecl _mm256_mulhi_epu16(__m256i, __m256i);
1413 
1414 extern __m256i __cdecl _mm256_mullo_epi16(__m256i, __m256i);
1415 extern __m256i __cdecl _mm256_mullo_epi32(__m256i, __m256i);
1416 
1417 extern __m256i __cdecl _mm256_mul_epu32(__m256i, __m256i);
1418 extern __m256i __cdecl _mm256_mul_epi32(__m256i, __m256i);
1419 
1420 extern __m256i __cdecl _mm256_sign_epi8(__m256i, __m256i);
1421 extern __m256i __cdecl _mm256_sign_epi16(__m256i, __m256i);
1422 extern __m256i __cdecl _mm256_sign_epi32(__m256i, __m256i);
1423 
1424 extern __m256i __cdecl _mm256_mulhrs_epi16(__m256i, __m256i);
1425 
1426 extern __m256i __cdecl _mm256_sad_epu8(__m256i, __m256i);
1427 extern __m256i __cdecl _mm256_mpsadbw_epu8(__m256i, __m256i, const int);
1428 
1429 
1430 /*
1431  * Integer 256-bit vector arithmetic/logical shift operations.
1432  */
1433 extern __m256i __cdecl _mm256_slli_si256(__m256i, const int);
1434 extern __m256i __cdecl _mm256_srli_si256(__m256i, const int);
1435 
1436 extern __m256i __cdecl _mm256_sll_epi16(__m256i, __m128i);
1437 extern __m256i __cdecl _mm256_sll_epi32(__m256i, __m128i);
1438 extern __m256i __cdecl _mm256_sll_epi64(__m256i, __m128i);
1439 
1440 extern __m256i __cdecl _mm256_slli_epi16(__m256i, int);
1441 extern __m256i __cdecl _mm256_slli_epi32(__m256i, int);
1442 extern __m256i __cdecl _mm256_slli_epi64(__m256i, int);
1443 
1444 extern __m256i __cdecl _mm256_sllv_epi32(__m256i, __m256i);
1445 extern __m256i __cdecl _mm256_sllv_epi64(__m256i, __m256i);
1446 
1447 extern __m128i __cdecl _mm_sllv_epi32(__m128i, __m128i);
1448 extern __m128i __cdecl _mm_sllv_epi64(__m128i, __m128i);
1449 
1450 extern __m256i __cdecl _mm256_sra_epi16(__m256i, __m128i);
1451 extern __m256i __cdecl _mm256_sra_epi32(__m256i, __m128i);
1452 
1453 extern __m256i __cdecl _mm256_srai_epi16(__m256i, int);
1454 extern __m256i __cdecl _mm256_srai_epi32(__m256i, int);
1455 
1456 extern __m256i __cdecl _mm256_srav_epi32(__m256i, __m256i);
1457 
1458 extern __m128i __cdecl _mm_srav_epi32(__m128i, __m128i);
1459 
1460 extern __m256i __cdecl _mm256_srl_epi16(__m256i, __m128i);
1461 extern __m256i __cdecl _mm256_srl_epi32(__m256i, __m128i);
1462 extern __m256i __cdecl _mm256_srl_epi64(__m256i, __m128i);
1463 
1464 extern __m256i __cdecl _mm256_srli_epi16(__m256i, int);
1465 extern __m256i __cdecl _mm256_srli_epi32(__m256i, int);
1466 extern __m256i __cdecl _mm256_srli_epi64(__m256i, int);
1467 
1468 extern __m256i __cdecl _mm256_srlv_epi32(__m256i, __m256i);
1469 extern __m256i __cdecl _mm256_srlv_epi64(__m256i, __m256i);
1470 
1471 extern __m128i __cdecl _mm_srlv_epi32(__m128i, __m128i);
1472 extern __m128i __cdecl _mm_srlv_epi64(__m128i, __m128i);
1473 
1474 
1475 /*
1476  * Integer 128/256-bit vector pack/blend/shuffle/insert/extract operations.
1477  */
1478 extern __m128i __cdecl _mm_blend_epi32(__m128i, __m128i, const int);
1479 
1480 extern __m256i __cdecl _mm256_blend_epi32(__m256i,__m256i, const int);
1481 
1482 extern __m256i __cdecl _mm256_alignr_epi8(__m256i, __m256i, const int);
1483 
1484 extern __m256i __cdecl _mm256_blendv_epi8(__m256i, __m256i, __m256i);
1485 extern __m256i __cdecl _mm256_blend_epi16(__m256i, __m256i, const int);
1486 
1487 extern __m256i __cdecl _mm256_packs_epi16(__m256i, __m256i);
1488 extern __m256i __cdecl _mm256_packs_epi32(__m256i, __m256i);
1489 extern __m256i __cdecl _mm256_packus_epi16(__m256i, __m256i);
1490 extern __m256i __cdecl _mm256_packus_epi32(__m256i, __m256i);
1491 
1492 extern __m256i __cdecl _mm256_unpackhi_epi8(__m256i, __m256i);
1493 extern __m256i __cdecl _mm256_unpackhi_epi16(__m256i, __m256i);
1494 extern __m256i __cdecl _mm256_unpackhi_epi32(__m256i, __m256i);
1495 extern __m256i __cdecl _mm256_unpackhi_epi64(__m256i, __m256i);
1496 
1497 extern __m256i __cdecl _mm256_unpacklo_epi8(__m256i, __m256i);
1498 extern __m256i __cdecl _mm256_unpacklo_epi16(__m256i, __m256i);
1499 extern __m256i __cdecl _mm256_unpacklo_epi32(__m256i, __m256i);
1500 extern __m256i __cdecl _mm256_unpacklo_epi64(__m256i, __m256i);
1501 
1502 extern __m256i __cdecl _mm256_shuffle_epi8(__m256i, __m256i);
1503 extern __m256i __cdecl _mm256_shuffle_epi32(__m256i, const int);
1504 
1505 extern __m256i __cdecl _mm256_shufflehi_epi16(__m256i, const int);
1506 extern __m256i __cdecl _mm256_shufflelo_epi16(__m256i, const int);
1507 
1508 extern __m128i __cdecl _mm256_extracti128_si256(__m256i, const int);
1509 extern __m256i __cdecl _mm256_inserti128_si256(__m256i, __m128i, const int);
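/*
 * Illustrative sketch (not part of the original header; the helper name
 * is hypothetical): for the blend intrinsics, bit i of the immediate
 * selects element i from the second operand (1) or the first (0).
 */
#if 0
static __m256i merge_even_a_odd_b(__m256i a, __m256i b)
{
    /* 0xAA = 10101010b: odd dwords from b, even dwords from a. */
    return _mm256_blend_epi32(a, b, 0xAA);
}
#endif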
1510 
1511 
1512 /*
1513  * Scalar to 128/256-bit vector broadcast operations.
1514  */
1515 extern __m128 __cdecl _mm_broadcastss_ps(__m128);
1516 extern __m128d __cdecl _mm_broadcastsd_pd(__m128d);
1517 
1518 extern __m128i __cdecl _mm_broadcastb_epi8(__m128i);
1519 extern __m128i __cdecl _mm_broadcastw_epi16(__m128i);
1520 extern __m128i __cdecl _mm_broadcastd_epi32(__m128i);
1521 extern __m128i __cdecl _mm_broadcastq_epi64(__m128i);
1522 
1523 extern __m256 __cdecl _mm256_broadcastss_ps(__m128);
1524 extern __m256d __cdecl _mm256_broadcastsd_pd(__m128d);
1525 
1526 extern __m256i __cdecl _mm256_broadcastb_epi8(__m128i);
1527 extern __m256i __cdecl _mm256_broadcastw_epi16(__m128i);
1528 extern __m256i __cdecl _mm256_broadcastd_epi32(__m128i);
1529 extern __m256i __cdecl _mm256_broadcastq_epi64(__m128i);
1530 
1531 extern __m256i __cdecl _mm256_broadcastsi128_si256(__m128i);
1532 
1533 
1534 
1535 /*
1536  * Integer 256-bit vector signed/unsigned extension operations.
1537  */
1538 extern __m256i __cdecl _mm256_cvtepi8_epi16(__m128i);
1539 extern __m256i __cdecl _mm256_cvtepi8_epi32(__m128i);
1540 extern __m256i __cdecl _mm256_cvtepi8_epi64(__m128i);
1541 extern __m256i __cdecl _mm256_cvtepi16_epi32(__m128i);
1542 extern __m256i __cdecl _mm256_cvtepi16_epi64(__m128i);
1543 extern __m256i __cdecl _mm256_cvtepi32_epi64(__m128i);
1544 
1545 extern __m256i __cdecl _mm256_cvtepu8_epi16(__m128i);
1546 extern __m256i __cdecl _mm256_cvtepu8_epi32(__m128i);
1547 extern __m256i __cdecl _mm256_cvtepu8_epi64(__m128i);
1548 extern __m256i __cdecl _mm256_cvtepu16_epi32(__m128i);
1549 extern __m256i __cdecl _mm256_cvtepu16_epi64(__m128i);
1550 extern __m256i __cdecl _mm256_cvtepu32_epi64(__m128i);
1551 
1552 
1553 /*
1554  * Returns a 32-bit mask made up of the most significant bit of each byte
1555  * of the 256-bit vector source operand.
1556  */
1557 extern int __cdecl _mm256_movemask_epi8(__m256i);
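/*
 * Illustrative sketch (not part of the original header; the helper name
 * is hypothetical, and __popcnt comes from <intrin.h>): combining a byte
 * compare with the movemask turns a vector equality test into a 32-bit
 * scalar bitmask, one bit per byte.
 */
#if 0
static int count_equal_bytes(__m256i a, __m256i b)
{
    __m256i eq = _mm256_cmpeq_epi8(a, b);            /* 0xFF where equal */
    unsigned int bits = (unsigned int)_mm256_movemask_epi8(eq);
    return (int)__popcnt(bits);                      /* matching bytes */
}
#endif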
1558 
1559 
1560 /*
1561  * Masked load/store operations.
1562  */
1563 extern __m128i __cdecl _mm_maskload_epi32(int const * /* ptr */,
1564  __m128i /* vmask */);
1565 extern __m128i __cdecl _mm_maskload_epi64(__int64 const * /* ptr */,
1566  __m128i /* vmask */);
1567 
1568 extern void __cdecl _mm_maskstore_epi32(int * /* ptr */,
1569  __m128i /* vmask */,
1570  __m128i /* val */);
1571 extern void __cdecl _mm_maskstore_epi64(__int64 * /* ptr */,
1572  __m128i /* vmask */,
1573  __m128i /* val */);
1574 
1575 extern __m256i __cdecl _mm256_maskload_epi32(int const * /* ptr */,
1576  __m256i /* vmask */);
1577 extern __m256i __cdecl _mm256_maskload_epi64(__int64 const * /* ptr */,
1578  __m256i /* vmask */);
1579 
1580 extern void __cdecl _mm256_maskstore_epi32(int * /* ptr */,
1581  __m256i /* vmask */,
1582  __m256i /* val */);
1583 extern void __cdecl _mm256_maskstore_epi64(__int64 * /* ptr */,
1584  __m256i /* vmask */,
1585  __m256i /* val */);
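/*
 * Illustrative sketch (not part of the original header; the helper name
 * is hypothetical): element i takes part in the load/store only when the
 * most significant bit of mask element i is set; masked-off loads read
 * as zero and masked-off stores leave memory untouched.
 */
#if 0
static void copy_first_n_ints(int *dst, int const *src, int n /* 0..8 */)
{
    __m256i lane = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    /* Sign bit set exactly in lanes 0..n-1. */
    __m256i mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(n), lane);
    _mm256_maskstore_epi32(dst, mask, _mm256_maskload_epi32(src, mask));
}
#endif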
1586 
1587 
1588 /*
1589  * Permute elements in vector operations.
1590  */
1591 extern __m256i __cdecl _mm256_permutevar8x32_epi32(__m256i, __m256i);
1592 extern __m256 __cdecl _mm256_permutevar8x32_ps(__m256, __m256i);
1593 
1594 extern __m256i __cdecl _mm256_permute4x64_epi64(__m256i, const int);
1595 extern __m256d __cdecl _mm256_permute4x64_pd(__m256d, const int);
1596 
1597 extern __m256i __cdecl _mm256_permute2x128_si256(__m256i, __m256i, const int);
1598 
1599 
1600 /*
1601  * Load 32 bytes from memory using a non-temporal aligned hint.
1602  */
1603 extern __m256i __cdecl _mm256_stream_load_si256(__m256i const *);
1604 
1605 
1606 
1607 /*
1608  * Masked GATHER from memory to vector register operations.
1609  */
1610 extern __m256d __cdecl _mm256_mask_i32gather_pd(__m256d /* old_dst */,
1611  double const * /* ptr */,
1612  __m128i /* vindex */,
1613  __m256d /* vmask */,
1614  const int /* scale */);
1615 extern __m256 __cdecl _mm256_mask_i32gather_ps(__m256 /* old_dst */,
1616  float const * /* ptr */,
1617  __m256i /* vindex */,
1618  __m256 /* vmask */,
1619  const int /* scale */);
1620 extern __m256d __cdecl _mm256_mask_i64gather_pd(__m256d /* old_dst */,
1621  double const * /* ptr */,
1622  __m256i /* vindex */,
1623  __m256d /* vmask */,
1624  const int /* scale */);
1625 extern __m128 __cdecl _mm256_mask_i64gather_ps(__m128 /* old_dst */,
1626  float const * /* ptr */,
1627  __m256i /* vindex */,
1628  __m128 /* vmask */,
1629  const int /* scale */);
1630 
1631 extern __m128d __cdecl _mm_mask_i32gather_pd(__m128d /* old_dst */,
1632  double const * /* ptr */,
1633  __m128i /* vindex */,
1634  __m128d /* vmask */,
1635  const int /* scale */);
1636 extern __m128 __cdecl _mm_mask_i32gather_ps(__m128 /* old_dst */,
1637  float const * /* ptr */,
1638  __m128i /* vindex */,
1639  __m128 /* vmask */,
1640  const int /* scale */);
1641 extern __m128d __cdecl _mm_mask_i64gather_pd(__m128d /* old_dst */,
1642  double const * /* ptr */,
1643  __m128i /* vindex */,
1644  __m128d /* vmask */,
1645  const int /* scale */);
1646 extern __m128 __cdecl _mm_mask_i64gather_ps(__m128 /* old_dst */,
1647  float const * /* ptr */,
1648  __m128i /* vindex */,
1649  __m128 /* vmask */,
1650  const int /* scale */);
1651 
1652 
1653 extern __m256i __cdecl _mm256_mask_i32gather_epi32(__m256i /* old_dst */,
1654  int const * /* ptr */,
1655  __m256i /* vindex */,
1656  __m256i /* vmask */,
1657  const int /* scale */);
1658 extern __m256i __cdecl _mm256_mask_i32gather_epi64(__m256i /* old_dst */,
1659  __int64 const * /* ptr */,
1660  __m128i /* vindex */,
1661  __m256i /* vmask */,
1662  const int /* scale */);
1663 extern __m128i __cdecl _mm256_mask_i64gather_epi32(__m128i /* old_dst */,
1664  int const * /* ptr */,
1665  __m256i /* vindex */,
1666  __m128i /* vmask */,
1667  const int /* scale */);
1668 extern __m256i __cdecl _mm256_mask_i64gather_epi64(__m256i /* old_dst */,
1669  __int64 const * /* ptr */,
1670  __m256i /* vindex */,
1671  __m256i /* vmask */,
1672  const int /* scale */);
1673 
1674 extern __m128i __cdecl _mm_mask_i32gather_epi32(__m128i /* old_dst */,
1675  int const * /* ptr */,
1676  __m128i /* vindex */,
1677  __m128i /* vmask */,
1678  const int /* scale */);
1679 extern __m128i __cdecl _mm_mask_i32gather_epi64(__m128i /* old_dst */,
1680  __int64 const * /* ptr */,
1681  __m128i /* vindex */,
1682  __m128i /* vmask */,
1683  const int /* scale */);
1684 extern __m128i __cdecl _mm_mask_i64gather_epi32(__m128i /* old_dst */,
1685  int const * /* ptr */,
1686  __m128i /* vindex */,
1687  __m128i /* vmask */,
1688  const int /* scale */);
1689 extern __m128i __cdecl _mm_mask_i64gather_epi64(__m128i /* old_dst */,
1690  __int64 const * /* ptr */,
1691  __m128i /* vindex */,
1692  __m128i /* vmask */,
1693  const int /* scale */);
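/*
 * Illustrative sketch (not part of the original header; the helper name
 * is hypothetical): lanes whose mask sign bit is set are fetched from
 * memory at ptr + vindex[i]*scale bytes; the remaining lanes are passed
 * through unchanged from old_dst.
 */
#if 0
static __m256 gather_or_default(float const *table, __m256i idx,
                                __m256 vmask, __m256 defaults)
{
    return _mm256_mask_i32gather_ps(defaults, table, idx, vmask, 4);
}
#endif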
1694 
1695 
1696 /*
1697  * GATHER from memory to vector register operations.
1698  */
1699 extern __m256d __cdecl _mm256_i32gather_pd(double const * /* ptr */,
1700  __m128i /* vindex */,
1701  const int /* index_scale */);
1702 extern __m256 __cdecl _mm256_i32gather_ps(float const * /* ptr */,
1703  __m256i /* vindex */,
1704  const int /* index_scale */);
1705 extern __m256d __cdecl _mm256_i64gather_pd(double const * /* ptr */,
1706  __m256i /* vindex */,
1707  const int /* index_scale */);
1708 extern __m128 __cdecl _mm256_i64gather_ps(float const * /* ptr */,
1709  __m256i /* vindex */,
1710  const int /* index_scale */);
1711 
1712 extern __m128d __cdecl _mm_i32gather_pd(double const * /* ptr */,
1713  __m128i /* vindex */,
1714  const int /* index_scale */);
1715 extern __m128 __cdecl _mm_i32gather_ps(float const * /* ptr */,
1716  __m128i /* vindex */,
1717  const int /* index_scale */);
1718 extern __m128d __cdecl _mm_i64gather_pd(double const * /* ptr */,
1719  __m128i /* vindex */,
1720  const int /* index_scale */);
1721 extern __m128 __cdecl _mm_i64gather_ps(float const * /* ptr */,
1722  __m128i /* vindex */,
1723  const int /* index_scale */);
1724 
1725 extern __m256i __cdecl _mm256_i32gather_epi32(int const * /* ptr */,
1726  __m256i /* vindex */,
1727  const int /* scale */);
1728 extern __m256i __cdecl _mm256_i32gather_epi64(__int64 const * /* ptr */,
1729  __m128i /* vindex */,
1730  const int /* scale */);
1731 extern __m128i __cdecl _mm256_i64gather_epi32(int const * /* ptr */,
1732  __m256i /* vindex */,
1733  const int /* scale */);
1734 extern __m256i __cdecl _mm256_i64gather_epi64(__int64 const * /* ptr */,
1735  __m256i /* vindex */,
1736  const int /* scale */);
1737 
1738 extern __m128i __cdecl _mm_i32gather_epi32(int const * /* ptr */,
1739  __m128i /* vindex */,
1740  const int /* index_scale */);
1741 extern __m128i __cdecl _mm_i32gather_epi64(__int64 const * /* ptr */,
1742  __m128i /* vindex */,
1743  const int /* index_scale */);
1744 extern __m128i __cdecl _mm_i64gather_epi32(int const * /* ptr */,
1745  __m128i /* vindex */,
1746  const int /* index_scale */);
1747 extern __m128i __cdecl _mm_i64gather_epi64(__int64 const * /* ptr */,
1748  __m128i /* vindex */,
1749  const int /* index_scale */);
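/*
 * Illustrative sketch (not part of the original header; the helper name
 * is hypothetical): each gathered element i is read from the address
 * (char const *)ptr + vindex[i] * scale, so scale is a byte multiplier
 * and must be 1, 2, 4 or 8.
 */
#if 0
static __m256i gather8_ints(int const *table, __m256i idx)
{
    return _mm256_i32gather_epi32(table, idx, 4);   /* idx in elements */
}
#endif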
1750 
1751 
1752 /*
1753  * A collection of operations to manipulate integer data at bit-granularity.
1754  */
1755 extern unsigned int _bextr_u32(unsigned int /* src */,
1756  unsigned int /* start_bit */,
1757  unsigned int /* len_in_bits */);
1758 extern unsigned int _blsi_u32(unsigned int);
1759 extern unsigned int _blsmsk_u32(unsigned int);
1760 extern unsigned int _blsr_u32(unsigned int);
1761 extern unsigned int _bzhi_u32(unsigned int /* src */,
1762  unsigned int /* index */);
1763 extern unsigned int _mulx_u32(unsigned int /* src1 */,
1764  unsigned int /* src2 */,
1765  unsigned int * /* high_bits */);
1766 extern unsigned int _pdep_u32(unsigned int /* src */,
1767  unsigned int /* mask */);
1768 extern unsigned int _pext_u32(unsigned int /* src */,
1769  unsigned int /* mask */);
1770 extern unsigned int _rorx_u32(unsigned int /* src */,
1771  const unsigned int /* shift_count */);
1772 extern int _sarx_i32(int /* src */,
1773  unsigned int /* shift_count */);
1774 extern unsigned int _shlx_u32(unsigned int /* src */,
1775  unsigned int /* shift_count */);
1776 extern unsigned int _shrx_u32(unsigned int /* src */,
1777  unsigned int /* shift_count */);
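/*
 * Illustrative sketch (not part of the original header; the helper name
 * is hypothetical): _bextr_u32(src, start, len) behaves like
 * (src >> start) & ((1u << len) - 1) for len < 32.
 */
#if 0
static unsigned int middle_byte(unsigned int x)
{
    return _bextr_u32(x, 8, 8);    /* bits 8..15 of x */
}
#endif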
1778 
1779 #if defined (_M_X64)
1780 extern unsigned __int64 _bextr_u64(unsigned __int64 /* src */,
1781  unsigned int /* start_bit */,
1782  unsigned int /* len_in_bits */);
1783 extern unsigned __int64 _blsi_u64(unsigned __int64);
1784 extern unsigned __int64 _blsmsk_u64(unsigned __int64);
1785 extern unsigned __int64 _blsr_u64(unsigned __int64);
1786 extern unsigned __int64 _bzhi_u64(unsigned __int64 /* src */,
1787  unsigned int /* index */);
1788 extern unsigned __int64 _mulx_u64(unsigned __int64 /* src1 */,
1789  unsigned __int64 /* src2 */,
1790  unsigned __int64 * /* high_bits */);
1791 extern unsigned __int64 _pdep_u64(unsigned __int64 /* src */,
1792  unsigned __int64 /* mask */);
1793 extern unsigned __int64 _pext_u64(unsigned __int64 /* src */,
1794  unsigned __int64 /* mask */);
1795 extern unsigned __int64 _rorx_u64(unsigned __int64 /* src */,
1796  const unsigned int /* shift_count */);
1797 extern __int64 _sarx_i64(__int64 /* src */,
1798  unsigned int /* shift_count */);
1799 extern unsigned __int64 _shlx_u64(unsigned __int64 /* src */,
1800  unsigned int /* shift_count */);
1801 extern unsigned __int64 _shrx_u64(unsigned __int64 /* src */,
1802  unsigned int /* shift_count */);
1803 #endif /* defined (_M_X64) */
1804 
1805 
1806 /*
1807  * Leading zero bit count.
1808  *
1809  * Counts the number of leading zero bits in a source operand.
1810  * Returns the operand size in bits when the source operand is zero.
1811  */
1812 extern unsigned int _lzcnt_u32(unsigned int);
1813 #if defined (_M_X64)
1814 extern unsigned __int64 _lzcnt_u64(unsigned __int64);
1815 #endif /* defined (_M_X64) */
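/*
 * Illustrative sketch (not part of the original header; the helper name
 * is hypothetical): _lzcnt_u32(1) is 31, _lzcnt_u32(0x80000000) is 0,
 * and _lzcnt_u32(0) is 32, so a floor-log2 falls out directly.
 */
#if 0
static unsigned int floor_log2(unsigned int x)    /* requires x != 0 */
{
    return 31 - _lzcnt_u32(x);
}
#endif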
1816 
1817 /*
1818  * Trailing zero bit count.
1819  *
1820  * Searches the source operand for the least significant set bit.
1821  * If one is found, its bit index is returned; otherwise the result is
1822  * the operand size in bits.
1823  */
1824 extern unsigned int _tzcnt_u32(unsigned int);
1825 #if defined (_M_X64)
1826 extern unsigned __int64 _tzcnt_u64(unsigned __int64);
1827 #endif /* defined (_M_X64) */
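/*
 * Illustrative sketch (not part of the original header; the helper name
 * is hypothetical): _tzcnt_u32 pairs naturally with _blsr_u32 (declared
 * above) to visit the index of every set bit, lowest first.
 */
#if 0
static void for_each_set_bit(unsigned int x)
{
    while (x != 0) {
        unsigned int i = _tzcnt_u32(x);   /* index of lowest set bit */
        /* ... use bit index i ... */
        x = _blsr_u32(x);                 /* clear that bit */
    }
}
#endif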
1828 
1829 
1830 
1831 /*
1832  * Operation targeted to system software that manages processor context IDs.
1833  * Operation targeted at system software that manages processor context IDs.
1834 extern void __cdecl _invpcid(unsigned int /* type */, void * /* descriptor */);
1835 
1836 // Hardware Lock Elision
1837 extern void _Store_HLERelease(long volatile *,long);
1838 extern void _StorePointer_HLERelease(void * volatile *,void *);
1839 
1840 extern long _InterlockedExchange_HLEAcquire(long volatile *,long);
1841 extern long _InterlockedExchange_HLERelease(long volatile *,long);
1842 extern void * _InterlockedExchangePointer_HLEAcquire(void *volatile *,void *);
1843 extern void * _InterlockedExchangePointer_HLERelease(void *volatile *,void *);
1844 
1845 extern long _InterlockedCompareExchange_HLEAcquire(long volatile *,long,long);
1846 extern long _InterlockedCompareExchange_HLERelease(long volatile *,long,long);
1847 extern __int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *,__int64,__int64);
1848 extern __int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *,__int64,__int64);
1849 extern void * _InterlockedCompareExchangePointer_HLEAcquire(void *volatile *,void *,void *);
1850 extern void * _InterlockedCompareExchangePointer_HLERelease(void *volatile *,void *,void *);
1851 
1852 extern long _InterlockedExchangeAdd_HLEAcquire(long volatile *,long);
1853 extern long _InterlockedExchangeAdd_HLERelease(long volatile *,long);
1854 
1855 extern long _InterlockedAnd_HLEAcquire(long volatile *,long);
1856 extern long _InterlockedAnd_HLERelease(long volatile *,long);
1857 extern long _InterlockedOr_HLEAcquire(long volatile *,long);
1858 extern long _InterlockedOr_HLERelease(long volatile *,long);
1859 extern long _InterlockedXor_HLEAcquire(long volatile *,long);
1860 extern long _InterlockedXor_HLERelease(long volatile *,long);
1861 
1862 extern unsigned char _interlockedbittestandset_HLEAcquire(long *a,long b);
1863 extern unsigned char _interlockedbittestandset_HLERelease(long *a,long b);
1864 extern unsigned char _interlockedbittestandreset_HLEAcquire(long *a,long b);
1865 extern unsigned char _interlockedbittestandreset_HLERelease(long *a,long b);
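/*
 * Illustrative sketch (not part of the original header; the lock word
 * and helper names are hypothetical): an elided spinlock acquires with
 * the XACQUIRE-hinted compare-exchange and releases with the
 * XRELEASE-hinted store, so the hardware can elide the lock when no
 * data conflict occurs.
 */
#if 0
static volatile long g_lock = 0;    /* 0 = free, 1 = held */

static void hle_lock(void)
{
    while (_InterlockedCompareExchange_HLEAcquire(&g_lock, 1, 0) != 0)
    { /* spin */ }
}

static void hle_unlock(void)
{
    _Store_HLERelease(&g_lock, 0);
}
#endif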
1866 
1867 #if defined(_M_X64)
1868 extern void _Store64_HLERelease(__int64 volatile *,__int64);
1869 extern __int64 _InterlockedExchange64_HLEAcquire(__int64 volatile *,__int64);
1870 extern __int64 _InterlockedExchange64_HLERelease(__int64 volatile *,__int64);
1871 
1872 extern __int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *,__int64);
1873 extern __int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *,__int64);
1874 
1875 extern __int64 _InterlockedAnd64_HLEAcquire(__int64 volatile *,__int64);
1876 extern __int64 _InterlockedAnd64_HLERelease(__int64 volatile *,__int64);
1877 extern __int64 _InterlockedOr64_HLEAcquire(__int64 volatile *,__int64);
1878 extern __int64 _InterlockedOr64_HLERelease(__int64 volatile *,__int64);
1879 extern __int64 _InterlockedXor64_HLEAcquire(__int64 volatile *,__int64);
1880 extern __int64 _InterlockedXor64_HLERelease(__int64 volatile *,__int64);
1881 
1882 extern unsigned char _interlockedbittestandset64_HLEAcquire(__int64 *a,__int64 b);
1883 extern unsigned char _interlockedbittestandset64_HLERelease(__int64 *a,__int64 b);
1884 extern unsigned char _interlockedbittestandreset64_HLEAcquire(__int64 *a,__int64 b);
1885 extern unsigned char _interlockedbittestandreset64_HLERelease(__int64 *a,__int64 b);
1886 #endif /* defined (_M_X64) */
1887 
1888 // Restricted Transactional Memory
1889 #define _XBEGIN_STARTED (~0u)
1890 #define _XABORT_EXPLICIT (1 << 0)
1891 #define _XABORT_RETRY (1 << 1)
1892 #define _XABORT_CONFLICT (1 << 2)
1893 #define _XABORT_CAPACITY (1 << 3)
1894 #define _XABORT_DEBUG (1 << 4)
1895 #define _XABORT_NESTED (1 << 5)
1896 #define _XABORT_CODE(x) ((unsigned char)(((x) >> 24) & 0xFF))
1897 
1898 extern unsigned int __cdecl _xbegin(void);
1899 extern void __cdecl _xend(void);
1900 extern void __cdecl _xabort(const unsigned int);
1901 extern unsigned char __cdecl _xtest(void);
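/*
 * Illustrative sketch (not part of the original header; the helper name
 * is hypothetical): the canonical RTM pattern tests the _xbegin result
 * and falls back to a conventional path when the transaction aborts.
 */
#if 0
static void update_shared_state(void)
{
    unsigned int status = _xbegin();
    if (status == _XBEGIN_STARTED) {
        /* ... transactional reads and writes ... */
        _xend();                            /* commit */
    } else {
        /* Aborted: status carries the _XABORT_* flags; take a
           conventional (e.g. lock-based) fallback path here. */
    }
}
#endif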
1902 
1903 /*
1904  * Performs one attempt to generate a hardware-generated random value
1905  * in accordance with the NIST SP 800-90B/C standards.
1906  * The generated value is written to the given memory location, and the
1907  * success status is returned: 1 if the hardware could generate a valid
1908  * random number and 0 otherwise.
1909  */
1910 extern int __cdecl _rdseed16_step(unsigned short *);
1911 extern int __cdecl _rdseed32_step(unsigned int *);
1912 extern int __cdecl _rdseed64_step(unsigned __int64 *);
1913 
1914 /*
1915  * The _addcarryx... functions generate ADCX and ADOX instructions which
1916  * use CF and OF (in the flags register) respectively to propagate carry.
1917  * Because this allows two add-with-carry sequences to be interleaved
1918  * without having to save and restore the carry flag, this is useful,
1919  * for example, in multiprecision multiplication. These functions return
1920  * the carry-out, which is convenient for chaining multiple operations.
1921  * The sum is written using the given reference.
1922  * The sum is written through the given pointer.
1923 extern unsigned char __cdecl _addcarryx_u32(unsigned char /*c_in*/,
1924  unsigned int /*src1*/,
1925  unsigned int /*src2*/,
1926  unsigned int * /*out*/);
1927 #if defined(_M_X64)
1928 extern unsigned char __cdecl _addcarryx_u64(unsigned char /*c_in*/,
1929  unsigned __int64 /*src1*/,
1930  unsigned __int64 /*src2*/,
1931  unsigned __int64 * /*out*/);
1932 #endif /* defined (_M_X64) */
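/*
 * Illustrative sketch (not part of the original header; the helper name
 * is hypothetical): chaining the carry-out through successive calls
 * implements a wide addition, here 96 bits in three 32-bit limbs.
 */
#if 0
static void add96(unsigned int const a[3], unsigned int const b[3],
                  unsigned int sum[3])
{
    unsigned char c;
    c = _addcarryx_u32(0, a[0], b[0], &sum[0]);
    c = _addcarryx_u32(c, a[1], b[1], &sum[1]);
    (void)_addcarryx_u32(c, a[2], b[2], &sum[2]);
}
#endif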
1933 
1934 #if defined __cplusplus
1935 }; /* End "C" */
1936 #endif /* defined __cplusplus */
1937 
1938 #endif /* defined (_M_CEE_PURE) */
1939 
1940 #endif /* _INCLUDED_IMM */
1941 #endif /* __midl */