immintrin.h

/***
* immintrin.h - Meta Header file for Intel(R) Architecture intrinsic functions.
*
* Copyright (C) 1985-2015 Intel Corporation. All rights reserved.
*
* The information and source code contained herein is the exclusive
* property of Intel Corporation and may not be disclosed, examined
* or reproduced in whole or in part without explicit written authorization
* from the company.
*
*
*******************************************************************************/

#pragma once

#if !defined(_M_IX86) && !defined(_M_X64)
#error This header is specific to X86 and X64 targets
#endif

#ifndef _INCLUDED_IMM
#define _INCLUDED_IMM
#ifndef __midl

#if defined (_M_CEE_PURE)
 #error ERROR: Intel Architecture intrinsic functions not supported in the pure mode!
#else /* defined (_M_CEE_PURE) */

#include <wmmintrin.h>

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/*
 * Intel(R) AVX compiler intrinsic functions.
 */
typedef union __declspec(intrin_type) __declspec(align(32)) __m256 {
    float m256_f32[8];
} __m256;

typedef struct __declspec(intrin_type) __declspec(align(32)) __m256d {
    double m256d_f64[4];
} __m256d;

typedef union __declspec(intrin_type) __declspec(align(32)) __m256i {
    __int8           m256i_i8[32];
    __int16          m256i_i16[16];
    __int32          m256i_i32[8];
    __int64          m256i_i64[4];
    unsigned __int8  m256i_u8[32];
    unsigned __int16 m256i_u16[16];
    unsigned __int32 m256i_u32[8];
    unsigned __int64 m256i_u64[4];
} __m256i;

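/*
 * Illustrative usage sketch (not part of the original header): the union
 * members above give per-element access to a vector value. The function
 * name is hypothetical; assumes AVX is available at run time.
 */
static __inline float example_first_lane(__m256 v)
{
    return v.m256_f32[0];   /* read element 0 of the eight packed floats */
}
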
/*
 * Compare predicates for scalar and packed compare intrinsic functions
 */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, nonsignaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (nonsignaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, nonsignaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling) */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, nonsignaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0A /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0B /* False (ordered, nonsignaling) */
#define _CMP_NEQ_OQ   0x0C /* Not-equal (ordered, nonsignaling) */
#define _CMP_GE_OS    0x0D /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0E /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0F /* True (unordered, nonsignaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, nonsignaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, nonsignaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, nonsignaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, nonsignaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, nonsignaling) */
#define _CMP_NGT_UQ   0x1A /* Not-greater-than (unordered, nonsignaling) */
#define _CMP_FALSE_OS 0x1B /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1C /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1D /* Greater-than-or-equal (ordered, nonsignaling) */
#define _CMP_GT_OQ    0x1E /* Greater-than (ordered, nonsignaling) */
#define _CMP_TRUE_US  0x1F /* True (unordered, signaling) */

/*
 * Add Packed Double Precision Floating-Point Values
 * **** VADDPD ymm1, ymm2, ymm3/m256
 * Performs an SIMD add of the four packed double-precision floating-point
 * values from the first source operand to the second source operand, and
 * stores the packed double-precision floating-point results in the
 * destination
 */
extern __m256d __cdecl _mm256_add_pd(__m256d, __m256d);
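
/*
 * Illustrative usage sketch (not part of the original header): element-wise
 * sum of two vectors of 4 doubles. The function name is hypothetical.
 */
static __inline __m256d example_add4(__m256d a, __m256d b)
{
    return _mm256_add_pd(a, b);     /* result[i] = a[i] + b[i], i = 0..3 */
}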

/*
 * Add Packed Single Precision Floating-Point Values
 * **** VADDPS ymm1, ymm2, ymm3/m256
 * Performs an SIMD add of the eight packed single-precision floating-point
 * values from the first source operand to the second source operand, and
 * stores the packed single-precision floating-point results in the
 * destination
 */
extern __m256 __cdecl _mm256_add_ps(__m256, __m256);

/*
 * Add/Subtract Double Precision Floating-Point Values
 * **** VADDSUBPD ymm1, ymm2, ymm3/m256
 * Adds odd-numbered double-precision floating-point values of the first
 * source operand with the corresponding double-precision floating-point
 * values from the second source operand; stores the result in the odd-numbered
 * values of the destination. Subtracts the even-numbered double-precision
 * floating-point values from the second source operand from the corresponding
 * double-precision floating-point values in the first source operand; stores
 * the result into the even-numbered values of the destination
 */
extern __m256d __cdecl _mm256_addsub_pd(__m256d, __m256d);
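
/*
 * Illustrative sketch (not part of the original header) of the interleaved
 * add/subtract pattern: even lanes are subtracted, odd lanes are added.
 */
static __inline __m256d example_addsub4(__m256d a, __m256d b)
{
    /* result[0] = a[0] - b[0], result[1] = a[1] + b[1],
       result[2] = a[2] - b[2], result[3] = a[3] + b[3] */
    return _mm256_addsub_pd(a, b);
}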

/*
 * Add/Subtract Packed Single Precision Floating-Point Values
 * **** VADDSUBPS ymm1, ymm2, ymm3/m256
 * Adds odd-numbered single-precision floating-point values of the first source
 * operand with the corresponding single-precision floating-point values from
 * the second source operand; stores the result in the odd-numbered values of
 * the destination. Subtracts the even-numbered single-precision floating-point
 * values from the second source operand from the corresponding
 * single-precision floating-point values in the first source operand; stores
 * the result into the even-numbered values of the destination
 */
extern __m256 __cdecl _mm256_addsub_ps(__m256, __m256);

/*
 * Bitwise Logical AND of Packed Double Precision Floating-Point Values
 * **** VANDPD ymm1, ymm2, ymm3/m256
 * Performs a bitwise logical AND of the four packed double-precision
 * floating-point values from the first source operand and the second
 * source operand, and stores the result in the destination
 */
extern __m256d __cdecl _mm256_and_pd(__m256d, __m256d);

/*
 * Bitwise Logical AND of Packed Single Precision Floating-Point Values
 * **** VANDPS ymm1, ymm2, ymm3/m256
 * Performs a bitwise logical AND of the eight packed single-precision
 * floating-point values from the first source operand and the second
 * source operand, and stores the result in the destination
 */
extern __m256 __cdecl _mm256_and_ps(__m256, __m256);

/*
 * Bitwise Logical AND NOT of Packed Double Precision Floating-Point Values
 * **** VANDNPD ymm1, ymm2, ymm3/m256
 * Performs a bitwise logical AND NOT of the four packed double-precision
 * floating-point values from the first source operand and the second source
 * operand, and stores the result in the destination
 */
extern __m256d __cdecl _mm256_andnot_pd(__m256d, __m256d);

/*
 * Bitwise Logical AND NOT of Packed Single Precision Floating-Point Values
 * **** VANDNPS ymm1, ymm2, ymm3/m256
 * Performs a bitwise logical AND NOT of the eight packed single-precision
 * floating-point values from the first source operand and the second source
 * operand, and stores the result in the destination
 */
extern __m256 __cdecl _mm256_andnot_ps(__m256, __m256);

/*
 * Blend Packed Double Precision Floating-Point Values
 * **** VBLENDPD ymm1, ymm2, ymm3/m256, imm8
 * Double-precision floating-point values from the second source operand are
 * conditionally merged with values from the first source operand and written
 * to the destination. The immediate bits [3:0] determine whether the
 * corresponding double-precision floating-point value in the destination is
 * copied from the second source or first source. If a bit in the mask
 * corresponding to an element is "1", the double-precision floating-point
 * value in the second source operand is copied, else the value in the first
 * source operand is copied
 */
extern __m256d __cdecl _mm256_blend_pd(__m256d, __m256d, const int);
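
/*
 * Illustrative sketch (not part of the original header): imm8 bit i selects
 * lane i from the second source. With mask 0x5 (binary 0101), lanes 0 and 2
 * come from b, lanes 1 and 3 from a. The function name is hypothetical.
 */
static __inline __m256d example_blend4(__m256d a, __m256d b)
{
    return _mm256_blend_pd(a, b, 0x5);  /* {b0, a1, b2, a3} */
}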

/*
 * Blend Packed Single Precision Floating-Point Values
 * **** VBLENDPS ymm1, ymm2, ymm3/m256, imm8
 * Single-precision floating-point values from the second source operand are
 * conditionally merged with values from the first source operand and written
 * to the destination. The immediate bits [7:0] determine whether the
 * corresponding single-precision floating-point value in the destination is
 * copied from the second source or first source. If a bit in the mask
 * corresponding to an element is "1", the single-precision floating-point
 * value in the second source operand is copied, else the value in the first
 * source operand is copied
 */
extern __m256 __cdecl _mm256_blend_ps(__m256, __m256, const int);

/*
 * Blend Packed Double Precision Floating-Point Values
 * **** VBLENDVPD ymm1, ymm2, ymm3/m256, ymm4
 * Conditionally copy each quadword data element of double-precision
 * floating-point value from the second source operand (third operand) and the
 * first source operand (second operand) depending on mask bits defined in the
 * mask register operand (fourth operand).
 */
extern __m256d __cdecl _mm256_blendv_pd(__m256d, __m256d, __m256d);

/*
 * Blend Packed Single Precision Floating-Point Values
 * **** VBLENDVPS ymm1, ymm2, ymm3/m256, ymm4
 * Conditionally copy each dword data element of single-precision
 * floating-point value from the second source operand (third operand) and the
 * first source operand (second operand) depending on mask bits defined in the
 * mask register operand (fourth operand).
 */
extern __m256 __cdecl _mm256_blendv_ps(__m256, __m256, __m256);

/*
 * Divide Packed Double-Precision Floating-Point Values
 * **** VDIVPD ymm1, ymm2, ymm3/m256
 * Performs an SIMD divide of the four packed double-precision floating-point
 * values in the first source operand by the four packed double-precision
 * floating-point values in the second source operand
 */
extern __m256d __cdecl _mm256_div_pd(__m256d, __m256d);

/*
 * Divide Packed Single-Precision Floating-Point Values
 * **** VDIVPS ymm1, ymm2, ymm3/m256
 * Performs an SIMD divide of the eight packed single-precision
 * floating-point values in the first source operand by the eight packed
 * single-precision floating-point values in the second source operand
 */
extern __m256 __cdecl _mm256_div_ps(__m256, __m256);

/*
 * Dot Product of Packed Single-Precision Floating-Point Values
 * **** VDPPS ymm1, ymm2, ymm3/m256, imm8
 * Multiplies the packed single-precision floating-point values in the
 * first source operand with the packed single-precision floats in the
 * second source. Each of the four resulting single-precision values is
 * conditionally summed depending on a mask extracted from the high 4 bits
 * of the immediate operand. This sum is broadcast to each of 4 positions
 * in the destination if the corresponding bit of the mask selected from
 * the low 4 bits of the immediate operand is "1". If the corresponding
 * low bit 0-3 of the mask is zero, the destination is set to zero.
 * The process is replicated for the high elements of the destination.
 */
extern __m256 __cdecl _mm256_dp_ps(__m256, __m256, const int);
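
/*
 * Illustrative sketch (not part of the original header): 4-element dot
 * product within each 128-bit half. imm8 0xF1 multiplies and sums all four
 * products (high nibble 0xF) and writes the sum to lane 0 of each half
 * (low nibble 0x1). The function name is hypothetical.
 */
static __inline __m256 example_dot4x2(__m256 a, __m256 b)
{
    return _mm256_dp_ps(a, b, 0xF1);
}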

/*
 * Add Horizontal Double Precision Floating-Point Values
 * **** VHADDPD ymm1, ymm2, ymm3/m256
 * Adds pairs of adjacent double-precision floating-point values in the
 * first source operand and second source operand and stores results in
 * the destination
 */
extern __m256d __cdecl _mm256_hadd_pd(__m256d, __m256d);

/*
 * Add Horizontal Single Precision Floating-Point Values
 * **** VHADDPS ymm1, ymm2, ymm3/m256
 * Adds pairs of adjacent single-precision floating-point values in the
 * first source operand and second source operand and stores results in
 * the destination
 */
extern __m256 __cdecl _mm256_hadd_ps(__m256, __m256);

/*
 * Subtract Horizontal Double Precision Floating-Point Values
 * **** VHSUBPD ymm1, ymm2, ymm3/m256
 * Subtracts pairs of adjacent double-precision floating-point values in
 * the first source operand and second source operand and stores results
 * in the destination
 */
extern __m256d __cdecl _mm256_hsub_pd(__m256d, __m256d);

/*
 * Subtract Horizontal Single Precision Floating-Point Values
 * **** VHSUBPS ymm1, ymm2, ymm3/m256
 * Subtracts pairs of adjacent single-precision floating-point values in
 * the first source operand and second source operand and stores results
 * in the destination.
 */
extern __m256 __cdecl _mm256_hsub_ps(__m256, __m256);

/*
 * Maximum of Packed Double Precision Floating-Point Values
 * **** VMAXPD ymm1, ymm2, ymm3/m256
 * Performs an SIMD compare of the packed double-precision floating-point
 * values in the first source operand and the second source operand and
 * returns the maximum value for each pair of values to the destination
 */
extern __m256d __cdecl _mm256_max_pd(__m256d, __m256d);

/*
 * Maximum of Packed Single Precision Floating-Point Values
 * **** VMAXPS ymm1, ymm2, ymm3/m256
 * Performs an SIMD compare of the packed single-precision floating-point
 * values in the first source operand and the second source operand and
 * returns the maximum value for each pair of values to the destination
 */
extern __m256 __cdecl _mm256_max_ps(__m256, __m256);

/*
 * Minimum of Packed Double Precision Floating-Point Values
 * **** VMINPD ymm1, ymm2, ymm3/m256
 * Performs an SIMD compare of the packed double-precision floating-point
 * values in the first source operand and the second source operand and
 * returns the minimum value for each pair of values to the destination
 */
extern __m256d __cdecl _mm256_min_pd(__m256d, __m256d);

/*
 * Minimum of Packed Single Precision Floating-Point Values
 * **** VMINPS ymm1, ymm2, ymm3/m256
 * Performs an SIMD compare of the packed single-precision floating-point
 * values in the first source operand and the second source operand and
 * returns the minimum value for each pair of values to the destination
 */
extern __m256 __cdecl _mm256_min_ps(__m256, __m256);

/*
 * Multiply Packed Double Precision Floating-Point Values
 * **** VMULPD ymm1, ymm2, ymm3/m256
 * Performs an SIMD multiply of the four packed double-precision
 * floating-point values from the first source operand to the second source
 * operand, and stores the packed double-precision floating-point results in
 * the destination
 */
extern __m256d __cdecl _mm256_mul_pd(__m256d, __m256d);

/*
 * Multiply Packed Single Precision Floating-Point Values
 * **** VMULPS ymm1, ymm2, ymm3/m256
 * Performs an SIMD multiply of the eight packed single-precision
 * floating-point values from the first source operand to the second source
 * operand, and stores the packed single-precision floating-point results in
 * the destination
 */
extern __m256 __cdecl _mm256_mul_ps(__m256, __m256);

/*
 * Bitwise Logical OR of Packed Double Precision Floating-Point Values
 * **** VORPD ymm1, ymm2, ymm3/m256
 * Performs a bitwise logical OR of the four packed double-precision
 * floating-point values from the first source operand and the second
 * source operand, and stores the result in the destination
 */
extern __m256d __cdecl _mm256_or_pd(__m256d, __m256d);

/*
 * Bitwise Logical OR of Packed Single Precision Floating-Point Values
 * **** VORPS ymm1, ymm2, ymm3/m256
 * Performs a bitwise logical OR of the eight packed single-precision
 * floating-point values from the first source operand and the second
 * source operand, and stores the result in the destination
 */
extern __m256 __cdecl _mm256_or_ps(__m256, __m256);

/*
 * Shuffle Packed Double Precision Floating-Point Values
 * **** VSHUFPD ymm1, ymm2, ymm3/m256, imm8
 * Moves either of the two packed double-precision floating-point values from
 * each double quadword in the first source operand into the low quadword
 * of each double quadword of the destination; moves either of the two packed
 * double-precision floating-point values from the second source operand into
 * the high quadword of each double quadword of the destination operand.
 * The selector operand determines which values are moved to the destination
 */
extern __m256d __cdecl _mm256_shuffle_pd(__m256d, __m256d, const int);

/*
 * Shuffle Packed Single Precision Floating-Point Values
 * **** VSHUFPS ymm1, ymm2, ymm3/m256, imm8
 * Moves two of the four packed single-precision floating-point values
 * from each double qword of the first source operand into the low
 * quadword of each double qword of the destination; moves two of the four
 * packed single-precision floating-point values from each double qword of
 * the second source operand into the high quadword of each double qword
 * of the destination. The selector operand determines which values are moved
 * to the destination.
 */
extern __m256 __cdecl _mm256_shuffle_ps(__m256, __m256, const int);

/*
 * Subtract Packed Double Precision Floating-Point Values
 * **** VSUBPD ymm1, ymm2, ymm3/m256
 * Performs an SIMD subtract of the four packed double-precision floating-point
 * values of the second source operand from the first source operand, and
 * stores the packed double-precision floating-point results in the destination
 */
extern __m256d __cdecl _mm256_sub_pd(__m256d, __m256d);

/*
 * Subtract Packed Single Precision Floating-Point Values
 * **** VSUBPS ymm1, ymm2, ymm3/m256
 * Performs an SIMD subtract of the eight packed single-precision
 * floating-point values in the second source operand from the first source
 * operand, and stores the packed single-precision floating-point results in
 * the destination
 */
extern __m256 __cdecl _mm256_sub_ps(__m256, __m256);

/*
 * Bitwise Logical XOR of Packed Double Precision Floating-Point Values
 * **** VXORPD ymm1, ymm2, ymm3/m256
 * Performs a bitwise logical XOR of the four packed double-precision
 * floating-point values from the first source operand and the second
 * source operand, and stores the result in the destination
 */
extern __m256d __cdecl _mm256_xor_pd(__m256d, __m256d);

/*
 * Bitwise Logical XOR of Packed Single Precision Floating-Point Values
 * **** VXORPS ymm1, ymm2, ymm3/m256
 * Performs a bitwise logical XOR of the eight packed single-precision
 * floating-point values from the first source operand and the second
 * source operand, and stores the result in the destination
 */
extern __m256 __cdecl _mm256_xor_ps(__m256, __m256);

/*
 * Compare Packed Double-Precision Floating-Point Values
 * **** VCMPPD xmm1, xmm2, xmm3/m128, imm8
 * **** VCMPPD ymm1, ymm2, ymm3/m256, imm8
 * Performs an SIMD compare of the four packed double-precision floating-point
 * values in the second source operand (third operand) and the first source
 * operand (second operand) and returns the results of the comparison to the
 * destination operand (first operand). The comparison predicate operand
 * (immediate) specifies the type of comparison performed on each of the pairs
 * of packed values.
 * For the 128-bit intrinsic, with compare predicate values in the range 0-7,
 * the compiler may generate SSE2 instructions if it is warranted for
 * performance reasons.
 */
extern __m128d __cdecl _mm_cmp_pd(__m128d, __m128d, const int);
extern __m256d __cdecl _mm256_cmp_pd(__m256d, __m256d, const int);
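
/*
 * Illustrative sketch (not part of the original header): per-lane "less
 * than" test using a predicate from the table above. Each result lane is
 * all-ones if the comparison holds, else all-zeros. Name is hypothetical.
 */
static __inline __m256d example_less_than(__m256d a, __m256d b)
{
    return _mm256_cmp_pd(a, b, _CMP_LT_OQ);    /* ordered, nonsignaling */
}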

/*
 * Compare Packed Single-Precision Floating-Point Values
 * **** VCMPPS xmm1, xmm2, xmm3/m128, imm8
 * **** VCMPPS ymm1, ymm2, ymm3/m256, imm8
 * Performs an SIMD compare of the packed single-precision floating-point
 * values in the second source operand (third operand) and the first source
 * operand (second operand) and returns the results of the comparison to the
 * destination operand (first operand). The comparison predicate operand
 * (immediate) specifies the type of comparison performed on each of the pairs
 * of packed values.
 * For the 128-bit intrinsic, with compare predicate values in the range 0-7,
 * the compiler may generate SSE2 instructions if it is warranted for
 * performance reasons.
 */
extern __m128 __cdecl _mm_cmp_ps(__m128, __m128, const int);
extern __m256 __cdecl _mm256_cmp_ps(__m256, __m256, const int);

/*
 * Compare Scalar Double-Precision Floating-Point Values
 * **** VCMPSD xmm1, xmm2, xmm3/m64, imm8
 * Compares the low double-precision floating-point values in the second source
 * operand (third operand) and the first source operand (second operand) and
 * returns the results of the comparison to the destination operand (first
 * operand). The comparison predicate operand (immediate operand) specifies the
 * type of comparison performed.
 * For compare predicate values in the range 0-7, the compiler may generate
 * SSE2 instructions if it is warranted for performance reasons.
 */
extern __m128d __cdecl _mm_cmp_sd(__m128d, __m128d, const int);

/* Compare Scalar Double-Precision Floating-Point Values with Integer Result
 * This is similar to _mm_cmp_sd, except it returns the result as an integer
 * and it supports all predicate values even when AVX support is not available.
 */
extern int __cdecl _mm_comi_sd(__m128d, __m128d, const int);

/*
 * Compare Scalar Single-Precision Floating-Point Values
 * **** VCMPSS xmm1, xmm2, xmm3/m32, imm8
 * Compares the low single-precision floating-point values in the second source
 * operand (third operand) and the first source operand (second operand) and
 * returns the results of the comparison to the destination operand (first
 * operand). The comparison predicate operand (immediate operand) specifies
 * the type of comparison performed.
 * For compare predicate values in the range 0-7, the compiler may generate
 * SSE2 instructions if it is warranted for performance reasons.
 */
extern __m128 __cdecl _mm_cmp_ss(__m128, __m128, const int);

/* Compare Scalar Single-Precision Floating-Point Values with Integer Result
 * This is similar to _mm_cmp_ss, except it returns the result as an integer
 * and it supports all predicate values even when AVX support is not available.
 */
extern int __cdecl _mm_comi_ss(__m128, __m128, const int);

/*
 * Convert Packed Doubleword Integers to
 * Packed Double-Precision Floating-Point Values
 * **** VCVTDQ2PD ymm1, xmm2/m128
 * Converts four packed signed doubleword integers in the source operand to
 * four packed double-precision floating-point values in the destination
 */
extern __m256d __cdecl _mm256_cvtepi32_pd(__m128i);

/*
 * Convert Packed Doubleword Integers to
 * Packed Single-Precision Floating-Point Values
 * **** VCVTDQ2PS ymm1, ymm2/m256
 * Converts eight packed signed doubleword integers in the source operand to
 * eight packed single-precision floating-point values in the destination
 */
extern __m256 __cdecl _mm256_cvtepi32_ps(__m256i);

/*
 * Convert Packed Double-Precision Floating-Point Values to
 * Packed Single-Precision Floating-Point Values
 * **** VCVTPD2PS xmm1, ymm2/m256
 * Converts four packed double-precision floating-point values in the source
 * operand to four packed single-precision floating-point values in the
 * destination
 */
extern __m128 __cdecl _mm256_cvtpd_ps(__m256d);

/*
 * Convert Packed Single Precision Floating-Point Values to
 * Packed Signed Doubleword Integer Values
 * **** VCVTPS2DQ ymm1, ymm2/m256
 * Converts eight packed single-precision floating-point values in the source
 * operand to eight signed doubleword integers in the destination
 */
extern __m256i __cdecl _mm256_cvtps_epi32(__m256);

/*
 * Convert Packed Single Precision Floating-Point Values to
 * Packed Double Precision Floating-Point Values
 * **** VCVTPS2PD ymm1, xmm2/m128
 * Converts four packed single-precision floating-point values in the source
 * operand to four packed double-precision floating-point values in the
 * destination
 */
extern __m256d __cdecl _mm256_cvtps_pd(__m128);

/*
 * Convert with Truncation Packed Double-Precision Floating-Point Values to
 * Packed Doubleword Integers
 * **** VCVTTPD2DQ xmm1, ymm2/m256
 * Converts four packed double-precision floating-point values in the source
 * operand to four packed signed doubleword integers in the destination.
 * When a conversion is inexact, a truncated (round toward zero) value is
 * returned. If a converted result is larger than the maximum signed doubleword
 * integer, the floating-point invalid exception is raised, and if this
 * exception is masked, the indefinite integer value (80000000H) is returned
 */
extern __m128i __cdecl _mm256_cvttpd_epi32(__m256d);

/*
 * Convert Packed Double-Precision Floating-Point Values to
 * Packed Doubleword Integers
 * **** VCVTPD2DQ xmm1, ymm2/m256
 * Converts four packed double-precision floating-point values in the source
 * operand to four packed signed doubleword integers in the destination
 */
extern __m128i __cdecl _mm256_cvtpd_epi32(__m256d);

/*
 * Convert with Truncation Packed Single Precision Floating-Point Values to
 * Packed Signed Doubleword Integer Values
 * **** VCVTTPS2DQ ymm1, ymm2/m256
 * Converts eight packed single-precision floating-point values in the source
 * operand to eight signed doubleword integers in the destination.
 * When a conversion is inexact, a truncated (round toward zero) value is
 * returned. If a converted result is larger than the maximum signed doubleword
 * integer, the floating-point invalid exception is raised, and if this
 * exception is masked, the indefinite integer value (80000000H) is returned
 */
extern __m256i __cdecl _mm256_cvttps_epi32(__m256);

/*
 * Extract packed floating-point values
 * **** VEXTRACTF128 xmm1/m128, ymm2, imm8
 * Extracts 128 bits of packed floating-point values from the source operand
 * at a 128-bit offset determined by imm8[0] into the destination
 */
extern __m128 __cdecl _mm256_extractf128_ps(__m256, const int);
extern __m128d __cdecl _mm256_extractf128_pd(__m256d, const int);
extern __m128i __cdecl _mm256_extractf128_si256(__m256i, const int);
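
/*
 * Illustrative sketch (not part of the original header): split a 256-bit
 * vector into its 128-bit halves. The low half needs no extract; the cast
 * below is a no-op at run time. Names are hypothetical.
 */
static __inline void example_split(__m256 v, __m128 *lo, __m128 *hi)
{
    *lo = _mm256_castps256_ps128(v);        /* bits [127:0]   */
    *hi = _mm256_extractf128_ps(v, 0x1);    /* bits [255:128] */
}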

/*
 * Zero All YMM Registers
 * **** VZEROALL
 * Zeros the contents of all YMM registers
 */
extern void __cdecl _mm256_zeroall(void);

/*
 * Zero Upper Bits of YMM Registers
 * **** VZEROUPPER
 * Zeros the upper 128 bits of all YMM registers. The lower 128 bits of the
 * registers (the corresponding XMM registers) are unmodified
 */
extern void __cdecl _mm256_zeroupper(void);

/*
 * Permute Single-Precision Floating-Point Values
 * **** VPERMILPS ymm1, ymm2, ymm3/m256
 * **** VPERMILPS xmm1, xmm2, xmm3/m128
 * Permutes single-precision floating-point values in the first source operand
 * using 8-bit control fields in the low bytes of the corresponding elements of
 * the shuffle control (second source operand) and stores the results in the
 * destination
 */
extern __m256 __cdecl _mm256_permutevar_ps(__m256, __m256i);
extern __m128 __cdecl _mm_permutevar_ps(__m128, __m128i);

/*
 * Permute Single-Precision Floating-Point Values
 * **** VPERMILPS ymm1, ymm2/m256, imm8
 * **** VPERMILPS xmm1, xmm2/m128, imm8
 * Permutes single-precision floating-point values in the first source operand
 * using four 2-bit control fields in the 8-bit immediate and stores the
 * results in the destination
 */
extern __m256 __cdecl _mm256_permute_ps(__m256, int);
extern __m128 __cdecl _mm_permute_ps(__m128, int);

/*
 * Permute Double-Precision Floating-Point Values
 * **** VPERMILPD ymm1, ymm2, ymm3/m256
 * **** VPERMILPD xmm1, xmm2, xmm3/m128
 * Permutes double-precision floating-point values in the first source operand
 * using 8-bit control fields in the low bytes of the second source operand
 * and stores the results in the destination
 */
extern __m256d __cdecl _mm256_permutevar_pd(__m256d, __m256i);
extern __m128d __cdecl _mm_permutevar_pd(__m128d, __m128i);

/*
 * Permute Double-Precision Floating-Point Values
 * **** VPERMILPD ymm1, ymm2/m256, imm8
 * **** VPERMILPD xmm1, xmm2/m128, imm8
 * Permutes double-precision floating-point values in the first source operand
 * using two 1-bit control fields in the low 2 bits of the 8-bit immediate
 * and stores the results in the destination
 */
extern __m256d __cdecl _mm256_permute_pd(__m256d, int);
extern __m128d __cdecl _mm_permute_pd(__m128d, int);

/*
 * Permute Floating-Point Values
 * **** VPERM2F128 ymm1, ymm2, ymm3/m256, imm8
 * Permutes 128-bit floating-point-containing fields from the first source
 * operand and second source operand using bits in the 8-bit immediate and
 * stores the results in the destination
 */
extern __m256 __cdecl _mm256_permute2f128_ps(__m256, __m256, int);
extern __m256d __cdecl _mm256_permute2f128_pd(__m256d, __m256d, int);
extern __m256i __cdecl _mm256_permute2f128_si256(__m256i, __m256i, int);

/*
 * Load with Broadcast
 * **** VBROADCASTSS ymm1, m32
 * **** VBROADCASTSS xmm1, m32
 * Loads floating-point values from the source operand and broadcasts them to
 * all elements of the destination
 */
extern __m256 __cdecl _mm256_broadcast_ss(float const *);
extern __m128 __cdecl _mm_broadcast_ss(float const *);

/*
 * Load with Broadcast
 * **** VBROADCASTSD ymm1, m64
 * Loads floating-point values from the source operand and broadcasts them to
 * all elements of the destination
 */
extern __m256d __cdecl _mm256_broadcast_sd(double const *);

/*
 * Load with Broadcast
 * **** VBROADCASTF128 ymm1, m128
 * Loads floating-point values from the source operand and broadcasts them to
 * all elements of the destination
 */
extern __m256 __cdecl _mm256_broadcast_ps(__m128 const *);
extern __m256d __cdecl _mm256_broadcast_pd(__m128d const *);
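
/*
 * Illustrative sketch (not part of the original header): replicate one
 * scalar from memory into all four double lanes. Name is hypothetical.
 */
static __inline __m256d example_splat(double const *p)
{
    return _mm256_broadcast_sd(p);  /* {*p, *p, *p, *p} */
}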

/*
 * Insert packed floating-point values
 * **** VINSERTF128 ymm1, ymm2, xmm3/m128, imm8
 * Performs an insertion of 128 bits of packed floating-point values from the
 * second source operand into the destination at a 128-bit offset determined
 * by imm8[0]. The remaining portions of the destination are written by the
 * corresponding fields of the first source operand
 */
extern __m256 __cdecl _mm256_insertf128_ps(__m256, __m128, int);
extern __m256d __cdecl _mm256_insertf128_pd(__m256d, __m128d, int);
extern __m256i __cdecl _mm256_insertf128_si256(__m256i, __m128i, int);

/*
 * Move Aligned Packed Double-Precision Floating-Point Values
 * **** VMOVAPD ymm1, m256
 * **** VMOVAPD m256, ymm1
 * Moves 4 double-precision floating-point values from the source operand to
 * the destination
 */
extern __m256d __cdecl _mm256_load_pd(double const *);
extern void __cdecl _mm256_store_pd(double *, __m256d);

/*
 * Move Aligned Packed Single-Precision Floating-Point Values
 * **** VMOVAPS ymm1, m256
 * **** VMOVAPS m256, ymm1
 * Moves 8 single-precision floating-point values from the source operand to
 * the destination
 */
extern __m256 __cdecl _mm256_load_ps(float const *);
extern void __cdecl _mm256_store_ps(float *, __m256);

/*
 * Move Unaligned Packed Double-Precision Floating-Point Values
 * **** VMOVUPD ymm1, m256
 * **** VMOVUPD m256, ymm1
 * Moves 256 bits of packed double-precision floating-point values from the
 * source operand to the destination
 */
extern __m256d __cdecl _mm256_loadu_pd(double const *);
extern void __cdecl _mm256_storeu_pd(double *, __m256d);

/*
 * Move Unaligned Packed Single-Precision Floating-Point Values
 * **** VMOVUPS ymm1, m256
 * **** VMOVUPS m256, ymm1
 * Moves 256 bits of packed single-precision floating-point values from the
 * source operand to the destination
 */
extern __m256 __cdecl _mm256_loadu_ps(float const *);
extern void __cdecl _mm256_storeu_ps(float *, __m256);

/*
 * Move Aligned Packed Integer Values
 * **** VMOVDQA ymm1, m256
 * **** VMOVDQA m256, ymm1
 * Moves 256 bits of packed integer values from the source operand to the
 * destination
 */
extern __m256i __cdecl _mm256_load_si256(__m256i const *);
extern void __cdecl _mm256_store_si256(__m256i *, __m256i);

/*
 * Move Unaligned Packed Integer Values
 * **** VMOVDQU ymm1, m256
 * **** VMOVDQU m256, ymm1
 * Moves 256 bits of packed integer values from the source operand to the
 * destination
 */
extern __m256i __cdecl _mm256_loadu_si256(__m256i const *);
extern void __cdecl _mm256_storeu_si256(__m256i *, __m256i);

/*
 * Load Two Unaligned Packed 128-bit Values
 * Loads two potentially unaligned 128-bit values
 * and combines them into one 256-bit value.
 *
 * The data types here (float const*, double const* and __m128i const*)
 * were chosen for consistency with the underlying _mm_loadu_{ps,pd,si128}
 * intrinsics.
 */

#define _mm256_loadu2_m128(/* float const* */ hiaddr, \
                           /* float const* */ loaddr) \
    _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr))

#define _mm256_loadu2_m128d(/* double const* */ hiaddr, \
                            /* double const* */ loaddr) \
    _mm256_set_m128d(_mm_loadu_pd(hiaddr), _mm_loadu_pd(loaddr))

#define _mm256_loadu2_m128i(/* __m128i const* */ hiaddr, \
                            /* __m128i const* */ loaddr) \
    _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
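
/*
 * Illustrative sketch (not part of the original header): gather two
 * unaligned rows of 4 floats each into one 256-bit vector. Names are
 * hypothetical.
 */
static __inline __m256 example_load_rows(float const *row0, float const *row1)
{
    /* row1 lands in the high 128 bits, row0 in the low 128 bits */
    return _mm256_loadu2_m128(row1, row0);
}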

/*
 * Store 256-bit Value To Two Unaligned 128-bit Locations
 * Stores the high and low 128-bit halves of a 256-bit value
 * to two different potentially unaligned addresses.
 */

#define _mm256_storeu2_m128(/* float* */ hiaddr, /* float* */ loaddr, \
                            /* __m256 */ a) \
    do { \
        __m256 _a = (a); /* reference a only once in macro body */ \
        _mm_storeu_ps((loaddr), _mm256_castps256_ps128(_a)); \
        _mm_storeu_ps((hiaddr), _mm256_extractf128_ps(_a, 0x1)); \
    } while (0)

#define _mm256_storeu2_m128d(/* double* */ hiaddr, /* double* */ loaddr, \
                             /* __m256d */ a) \
    do { \
        __m256d _a = (a); /* reference a only once in macro body */ \
        _mm_storeu_pd((loaddr), _mm256_castpd256_pd128(_a)); \
        _mm_storeu_pd((hiaddr), _mm256_extractf128_pd(_a, 0x1)); \
    } while (0)

#define _mm256_storeu2_m128i(/* __m128i* */ hiaddr, /* __m128i* */ loaddr, \
                             /* __m256i */ a) \
    do { \
        __m256i _a = (a); /* reference a only once in macro body */ \
        _mm_storeu_si128((loaddr), _mm256_castsi256_si128(_a)); \
        _mm_storeu_si128((hiaddr), _mm256_extractf128_si256(_a, 0x1)); \
    } while (0)

/*
 * Conditional SIMD Packed Loads and Stores
 * **** VMASKMOVPD xmm1, xmm2, m128
 * **** VMASKMOVPD ymm1, ymm2, m256
 * **** VMASKMOVPD m128, xmm1, xmm2
 * **** VMASKMOVPD m256, ymm1, ymm2
 *
 * Load forms:
 * Loads packed values from the 128-bit (XMM forms) or 256-bit (YMM forms)
 * memory location (third operand) into the destination XMM or YMM register
 * (first operand) using a mask in the first source operand (second operand).
 *
 * Store forms:
 * Stores packed values from the XMM or YMM register in the second source
 * operand (third operand) into the 128-bit (XMM forms) or 256-bit (YMM forms)
 * memory location using a mask in the first source operand (second operand).
 * Stores are atomic.
 */
extern __m256d __cdecl _mm256_maskload_pd(double const *, __m256i);
extern void __cdecl _mm256_maskstore_pd(double *, __m256i, __m256d);
extern __m128d __cdecl _mm_maskload_pd(double const *, __m128i);
extern void __cdecl _mm_maskstore_pd(double *, __m128i, __m128d);
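
/*
 * Illustrative sketch (not part of the original header): store only the
 * lanes whose mask sign bit is set, e.g. to handle the tail of an array
 * without writing past its end. Names and mask values are hypothetical.
 */
static __inline void example_store_first3(double *dst, __m256d v)
{
    /* sign bit set in elements 0..2 selects them; element 3 is untouched */
    __m256i mask = _mm256_set_epi64x(0, -1, -1, -1);
    _mm256_maskstore_pd(dst, mask, v);
}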

/*
 * Conditional SIMD Packed Loads and Stores
 * **** VMASKMOVPS xmm1, xmm2, m128
 * **** VMASKMOVPS ymm1, ymm2, m256
 * **** VMASKMOVPS m128, xmm1, xmm2
 * **** VMASKMOVPS m256, ymm1, ymm2
 *
 * Load forms:
 * Loads packed values from the 128-bit (XMM forms) or 256-bit (YMM forms)
 * memory location (third operand) into the destination XMM or YMM register
 * (first operand) using a mask in the first source operand (second operand).
 *
 * Store forms:
 * Stores packed values from the XMM or YMM register in the second source
 * operand (third operand) into the 128-bit (XMM forms) or 256-bit (YMM forms)
 * memory location using a mask in the first source operand (second operand).
 * Stores are atomic.
 */
extern __m256 __cdecl _mm256_maskload_ps(float const *, __m256i);
extern void __cdecl _mm256_maskstore_ps(float *, __m256i, __m256);
extern __m128 __cdecl _mm_maskload_ps(float const *, __m128i);
extern void __cdecl _mm_maskstore_ps(float *, __m128i, __m128);

/*
 * Replicate Single-Precision Floating-Point Values
 * **** VMOVSHDUP ymm1, ymm2/m256
 * Duplicates odd-indexed single-precision floating-point values from the
 * source operand
 */
extern __m256 __cdecl _mm256_movehdup_ps(__m256);

/*
 * Replicate Single-Precision Floating-Point Values
 * **** VMOVSLDUP ymm1, ymm2/m256
 * Duplicates even-indexed single-precision floating-point values from the
 * source operand
 */
extern __m256 __cdecl _mm256_moveldup_ps(__m256);

/*
 * Replicate Double-Precision Floating-Point Values
 * **** VMOVDDUP ymm1, ymm2/m256
 * Duplicates even-indexed double-precision floating-point values from the
 * source operand
 */
extern __m256d __cdecl _mm256_movedup_pd(__m256d);

/*
 * Move Unaligned Integer
 * **** VLDDQU ymm1, m256
 * The instruction is functionally similar to VMOVDQU YMM, m256 for loading
 * from memory. That is: 32 bytes of data starting at an address specified by
 * the source memory operand are fetched from memory and placed in the
 * destination
 */
extern __m256i __cdecl _mm256_lddqu_si256(__m256i const *);

/*
 * Store Packed Integers Using Non-Temporal Hint
 * **** VMOVNTDQ m256, ymm1
 * Moves the packed integers in the source operand to the destination using a
 * non-temporal hint to prevent caching of the data during the write to memory
 */
extern void __cdecl _mm256_stream_si256(__m256i *, __m256i);

/*
 * Store Packed Double-Precision Floating-Point Values Using Non-Temporal Hint
 * **** VMOVNTPD m256, ymm1
 * Moves the packed double-precision floating-point values in the source
 * operand to the destination operand using a non-temporal hint to prevent
 * caching of the data during the write to memory
 */
extern void __cdecl _mm256_stream_pd(double *, __m256d);

/*
 * Store Packed Single-Precision Floating-Point Values Using Non-Temporal Hint
 * **** VMOVNTPS m256, ymm1
 * Moves the packed single-precision floating-point values in the source
 * operand to the destination operand using a non-temporal hint to prevent
 * caching of the data during the write to memory
 */
extern void __cdecl _mm256_stream_ps(float *, __m256);

/*
 * Compute Approximate Reciprocals of Packed Single-Precision Floating-Point
 * Values
 * **** VRCPPS ymm1, ymm2/m256
 * Performs an SIMD computation of the approximate reciprocals of the eight
 * packed single-precision floating-point values in the source operand and
 * stores the packed single-precision floating-point results in the destination
 */
extern __m256 __cdecl _mm256_rcp_ps(__m256);

/*
 * Compute Approximate Reciprocals of Square Roots of
 * Packed Single-Precision Floating-Point Values
 * **** VRSQRTPS ymm1, ymm2/m256
 * Performs an SIMD computation of the approximate reciprocals of the square
 * roots of the eight packed single-precision floating-point values in the
 * source operand and stores the packed single-precision floating-point results
 * in the destination
 */
extern __m256 __cdecl _mm256_rsqrt_ps(__m256);

/*
 * Square Root of Double-Precision Floating-Point Values
 * **** VSQRTPD ymm1, ymm2/m256
 * Performs an SIMD computation of the square roots of the four packed
 * double-precision floating-point values in the source operand and stores
 * the packed double-precision floating-point results in the destination
 */
extern __m256d __cdecl _mm256_sqrt_pd(__m256d);

/*
 * Square Root of Single-Precision Floating-Point Values
 * **** VSQRTPS ymm1, ymm2/m256
 * Performs an SIMD computation of the square roots of the eight packed
 * single-precision floating-point values in the source operand and stores the
 * packed single-precision floating-point results in the destination
 */
extern __m256 __cdecl _mm256_sqrt_ps(__m256);

/*
 * Round Packed Double-Precision Floating-Point Values
 * **** VROUNDPD ymm1, ymm2/m256, imm8
 * Rounds the four double-precision floating-point values in the source
 * operand by the rounding mode specified in the immediate operand and places
 * the result in the destination. The rounding process rounds the input to an
 * integral value and returns the result as a double-precision floating-point
 * value. The Precision Floating-Point Exception is signaled according to the
 * immediate operand. If any source operand is an SNaN then it will be
 * converted to a QNaN.
 */
extern __m256d __cdecl _mm256_round_pd(__m256d, int);
#define _mm256_ceil_pd(val) _mm256_round_pd((val), _MM_FROUND_CEIL)
#define _mm256_floor_pd(val) _mm256_round_pd((val), _MM_FROUND_FLOOR)
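
/*
 * Illustrative sketch (not part of the original header): round four doubles
 * toward negative infinity via the convenience macro above. Name is
 * hypothetical.
 */
static __inline __m256d example_floor4(__m256d v)
{
    return _mm256_floor_pd(v);  /* e.g. {1.7, -0.2, ...} -> {1.0, -1.0, ...} */
}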

/*
 * Round Packed Single-Precision Floating-Point Values
 * **** VROUNDPS ymm1, ymm2/m256, imm8
 * Rounds the eight single-precision floating-point values in the source
 * operand by the rounding mode specified in the immediate operand and places
 * the result in the destination. The rounding process rounds the input to an
 * integral value and returns the result as a single-precision floating-point
 * value. The Precision Floating-Point Exception is signaled according to the
 * immediate operand. If any source operand is an SNaN then it will be
 * converted to a QNaN.
 */
extern __m256 __cdecl _mm256_round_ps(__m256, int);
#define _mm256_ceil_ps(val) _mm256_round_ps((val), _MM_FROUND_CEIL)
#define _mm256_floor_ps(val) _mm256_round_ps((val), _MM_FROUND_FLOOR)

/*
 * Unpack and Interleave High Packed Double-Precision Floating-Point Values
 * **** VUNPCKHPD ymm1, ymm2, ymm3/m256
 * Performs an interleaved unpack of the high double-precision floating-point
 * values from the first source operand and the second source operand.
 */
extern __m256d __cdecl _mm256_unpackhi_pd(__m256d, __m256d);

/*
 * Unpack and Interleave High Packed Single-Precision Floating-Point Values
 * **** VUNPCKHPS ymm1, ymm2, ymm3
 * Performs an interleaved unpack of the high single-precision floating-point
 * values from the first source operand and the second source operand
 */
extern __m256 __cdecl _mm256_unpackhi_ps(__m256, __m256);

/*
 * Unpack and Interleave Low Packed Double-Precision Floating-Point Values
 * **** VUNPCKLPD ymm1, ymm2, ymm3/m256
 * Performs an interleaved unpack of the low double-precision floating-point
 * values from the first source operand and the second source operand
 */
extern __m256d __cdecl _mm256_unpacklo_pd(__m256d, __m256d);

/*
 * Unpack and Interleave Low Packed Single-Precision Floating-Point Values
 * **** VUNPCKLPS ymm1, ymm2, ymm3
 * Performs an interleaved unpack of the low single-precision floating-point
 * values from the first source operand and the second source operand
 */
extern __m256 __cdecl _mm256_unpacklo_ps(__m256, __m256);

/*
 * Packed Bit Test
 * **** VPTEST ymm1, ymm2/m256
 * VPTEST sets the ZF flag if all bits of the bitwise AND of the first source
 * operand and the second source operand are 0. VPTEST sets the CF flag if all
 * bits of the bitwise AND of the second source operand and the logical NOT of
 * the first source operand are 0.
 */
extern int __cdecl _mm256_testz_si256(__m256i, __m256i);
#define _mm256_test_all_zeros(mask, val) \
    _mm256_testz_si256((mask), (val))

extern int __cdecl _mm256_testc_si256(__m256i, __m256i);
#define _mm256_test_all_ones(val) \
    _mm256_testc_si256((val), _mm256_cmpeq_epi32((val), (val)))

extern int __cdecl _mm256_testnzc_si256(__m256i, __m256i);
#define _mm256_test_mix_ones_zeros(mask, val) \
    _mm256_testnzc_si256((mask), (val))
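
/*
 * Illustrative sketch (not part of the original header): returns nonzero if
 * the selected bits of v (those set in mask) are all zero, using the ZF
 * result of VPTEST. Names are hypothetical.
 */
static __inline int example_bits_clear(__m256i v, __m256i mask)
{
    return _mm256_test_all_zeros(mask, v);  /* 1 if (mask & v) == 0 */
}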

/*
 * Packed Bit Test
 * **** VTESTPD ymm1, ymm2/m256
 * **** VTESTPD xmm1, xmm2/m128
 * VTESTPD performs a bitwise comparison of all the sign bits of the
 * double-precision elements in the first source operand and the corresponding
 * sign bits in the second source operand. If the AND of the two sets of bits
 * produces all zeros, the ZF is set; else the ZF is clear. If the AND NOT of
 * the source sign bits with the dest sign bits produces all zeros, the CF is
 * set; else the CF is clear
 */
extern int __cdecl _mm256_testz_pd(__m256d, __m256d);
extern int __cdecl _mm256_testc_pd(__m256d, __m256d);
extern int __cdecl _mm256_testnzc_pd(__m256d, __m256d);
extern int __cdecl _mm_testz_pd(__m128d, __m128d);
extern int __cdecl _mm_testc_pd(__m128d, __m128d);
extern int __cdecl _mm_testnzc_pd(__m128d, __m128d);

/*
 * Packed Bit Test
 * **** VTESTPS ymm1, ymm2/m256
 * **** VTESTPS xmm1, xmm2/m128
 * VTESTPS performs a bitwise comparison of all the sign bits of the packed
 * single-precision elements in the first source operand and the corresponding
 * sign bits in the second source operand. If the AND of the two sets of bits
 * produces all zeros, the ZF is set; else the ZF is clear. If the AND NOT of
 * the source sign bits with the dest sign bits produces all zeros, the CF is
 * set; else the CF is clear
 */
extern int __cdecl _mm256_testz_ps(__m256, __m256);
extern int __cdecl _mm256_testc_ps(__m256, __m256);
extern int __cdecl _mm256_testnzc_ps(__m256, __m256);
extern int __cdecl _mm_testz_ps(__m128, __m128);
extern int __cdecl _mm_testc_ps(__m128, __m128);
extern int __cdecl _mm_testnzc_ps(__m128, __m128);

/*
 * Extract Double-Precision Floating-Point Sign Mask
 * **** VMOVMSKPD r32, ymm2
 * Extracts the sign bits from the packed double-precision floating-point
 * values in the source operand, formats them into a 4-bit mask, and stores
 * the mask in the destination
 */
extern int __cdecl _mm256_movemask_pd(__m256d);

/*
 * Extract Single-Precision Floating-Point Sign Mask
 * **** VMOVMSKPS r32, ymm2
 * Extracts the sign bits from the packed single-precision floating-point
 * values in the source operand, formats them into an 8-bit mask, and stores
 * the mask in the destination
 */
extern int __cdecl _mm256_movemask_ps(__m256);

/*
 * Return 256-bit vector with all elements set to 0
 */
extern __m256d __cdecl _mm256_setzero_pd(void);
extern __m256 __cdecl _mm256_setzero_ps(void);
extern __m256i __cdecl _mm256_setzero_si256(void);

/*
 * Return 256-bit vector initialized to specified arguments
 */
extern __m256d __cdecl _mm256_set_pd(double, double, double, double);
extern __m256 __cdecl _mm256_set_ps(float, float, float, float,
                                    float, float, float, float);
extern __m256i __cdecl _mm256_set_epi8(char, char, char, char,
                                       char, char, char, char,
                                       char, char, char, char,
                                       char, char, char, char,
                                       char, char, char, char,
                                       char, char, char, char,
                                       char, char, char, char,
                                       char, char, char, char);
extern __m256i __cdecl _mm256_set_epi16(short, short, short, short,
                                        short, short, short, short,
                                        short, short, short, short,
                                        short, short, short, short);
extern __m256i __cdecl _mm256_set_epi32(int, int, int, int,
                                        int, int, int, int);
extern __m256i __cdecl _mm256_set_epi64x(__int64, __int64,
                                         __int64, __int64);

#define _mm256_set_m128(/* __m128 */ hi, /* __m128 */ lo) \
    _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1)

#define _mm256_set_m128d(/* __m128d */ hi, /* __m128d */ lo) \
    _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), (hi), 0x1)

#define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \
    _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)

extern __m256d __cdecl _mm256_setr_pd(double, double, double, double);
extern __m256 __cdecl _mm256_setr_ps(float, float, float, float,
                                     float, float, float, float);
extern __m256i __cdecl _mm256_setr_epi8(char, char, char, char,
                                        char, char, char, char,
                                        char, char, char, char,
                                        char, char, char, char,
                                        char, char, char, char,
                                        char, char, char, char,
                                        char, char, char, char,
                                        char, char, char, char);
extern __m256i __cdecl _mm256_setr_epi16(short, short, short, short,
                                         short, short, short, short,
                                         short, short, short, short,
                                         short, short, short, short);
extern __m256i __cdecl _mm256_setr_epi32(int, int, int, int,
                                         int, int, int, int);
extern __m256i __cdecl _mm256_setr_epi64x(__int64, __int64,
                                          __int64, __int64);
#define _mm256_setr_m128(lo, hi) _mm256_set_m128((hi), (lo))
#define _mm256_setr_m128d(lo, hi) _mm256_set_m128d((hi), (lo))
#define _mm256_setr_m128i(lo, hi) _mm256_set_m128i((hi), (lo))
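
/*
 * Illustrative sketch (not part of the original header): _mm256_set_* takes
 * arguments from the highest element down, while _mm256_setr_* takes them in
 * memory order. Both locals below hold the same vector {1, 2, 3, 4}.
 */
static __inline __m256d example_set_vs_setr(void)
{
    __m256d a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);   /* element 0 is 1.0 */
    __m256d b = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);  /* element 0 is 1.0 */
    return _mm256_add_pd(a, b);
}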

/*
 * Return 256-bit vector with all elements initialized to specified scalar
 */
extern __m256d __cdecl _mm256_set1_pd(double);
extern __m256 __cdecl _mm256_set1_ps(float);
extern __m256i __cdecl _mm256_set1_epi8(char);
extern __m256i __cdecl _mm256_set1_epi16(short);
extern __m256i __cdecl _mm256_set1_epi32(int);
extern __m256i __cdecl _mm256_set1_epi64x(long long);

/*
 * Support intrinsic functions to do vector type casts. These functions do
 * not introduce extra moves to generated code. When a cast is done from a
 * 128-bit to a 256-bit type, the low 128 bits of the 256-bit result contain
 * the source parameter value; the upper 128 bits of the result are undefined.
 */
extern __m256 __cdecl _mm256_castpd_ps(__m256d);
extern __m256d __cdecl _mm256_castps_pd(__m256);
extern __m256i __cdecl _mm256_castps_si256(__m256);
extern __m256i __cdecl _mm256_castpd_si256(__m256d);
extern __m256 __cdecl _mm256_castsi256_ps(__m256i);
extern __m256d __cdecl _mm256_castsi256_pd(__m256i);
extern __m128 __cdecl _mm256_castps256_ps128(__m256);
extern __m128d __cdecl _mm256_castpd256_pd128(__m256d);
extern __m128i __cdecl _mm256_castsi256_si128(__m256i);
extern __m256 __cdecl _mm256_castps128_ps256(__m128);
extern __m256d __cdecl _mm256_castpd128_pd256(__m128d);
extern __m256i __cdecl _mm256_castsi128_si256(__m128i);

/*
 * Support for half-float conversions to/from normal float.
 * Immediate argument is used for special MXCSR overrides.
 */
extern __m128 __cdecl _mm_cvtph_ps(__m128i);
extern __m256 __cdecl _mm256_cvtph_ps(__m128i);
extern __m128i __cdecl _mm_cvtps_ph(__m128 /* m1 */, const int /* imm */);
extern __m128i __cdecl _mm256_cvtps_ph(__m256, int);

/*
 * Return a vector with all elements set to zero. It is recommended to use the
 * result of this intrinsic as an input argument to another intrinsic when the
 * initial value is irrelevant.
 */
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_pd _mm_setzero_pd
#define _mm_undefined_si128 _mm_setzero_si128
#define _mm256_undefined_ps _mm256_setzero_ps
#define _mm256_undefined_pd _mm256_setzero_pd
#define _mm256_undefined_si256 _mm256_setzero_si256

/*
 * The list of extended control registers.
 * Currently, the list includes only one register.
 */
#define _XCR_XFEATURE_ENABLED_MASK 0

/* Returns the content of the specified extended control register */
extern unsigned __int64 __cdecl _xgetbv(unsigned int);
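
/*
 * Illustrative sketch (not part of the original header): the OS must enable
 * YMM state before AVX intrinsics are safe to use. This checks CPUID for AVX
 * and OSXSAVE, then _xgetbv for XMM|YMM state support (bits 1 and 2).
 * Assumes MSVC's __cpuid from <intrin.h>.
 */
static __inline int example_avx_available(void)
{
    int info[4];
    __cpuid(info, 1);
    if ((info[2] & (1 << 28)) == 0 || (info[2] & (1 << 27)) == 0)
        return 0;   /* no AVX or no OSXSAVE */
    return (_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6;
}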
1220 
1221 /* Writes the value to the specified extended control register */
1222 extern void __cdecl _xsetbv(unsigned int, unsigned __int64);
1223 
1224 
1225 /*
1226  * Performs a full or partial save of the enabled processor state components
1227  * using the specified memory address location and a mask.
1228  */
1229 extern void __cdecl _xsave(void *, unsigned __int64);
1230 #if defined (_M_X64)
1231 extern void __cdecl _xsave64(void *, unsigned __int64);
1232 #endif /* defined (_M_X64) */
1233 
1234 /*
1235  * Performs a full or partial save of the enabled processor state components
1236  * using the specified memory address location and a mask.
1237  * Optimize the state save operation if possible.
1238  */
1239 extern void __cdecl _xsaveopt(void *, unsigned __int64);
1240 #if defined (_M_X64)
1241 extern void __cdecl _xsaveopt64(void *, unsigned __int64);
1242 #endif /* defined (_M_X64) */
1243 
1244 /*
1245  * Performs a full or compressed partial save of the enabled processor state
1246  * components using the specified memory address location and a mask.
1247  */
1248 extern void __cdecl _xsavec(void *, unsigned __int64);
1249 #if defined (_M_X64)
1250 extern void __cdecl _xsavec64(void *, unsigned __int64);
1251 #endif /* defined (_M_X64) */
1252 
1253 /*
1254  * Performs a full or partial restore of the enabled processor states
1255  * using the state information stored in the specified memory address location
1256  * and a mask.
1257  */
1258 extern void __cdecl _xrstor(void const *, unsigned __int64);
1259 #if defined (_M_X64)
1260 extern void __cdecl _xrstor64(void const *, unsigned __int64);
1261 #endif /* defined (_M_X64) */
1262 
1263 /*
1264  * Performs a full or partial save of the enabled processor extended
1265  * and supervisor state components in compacted form using the
1266  * specified memory address location and masks in XCR0 and IA32_XSS MSR.
1267  */
1268 extern void __cdecl _xsaves(void *, unsigned __int64);
1269 #if defined (_M_X64)
1270 extern void __cdecl _xsaves64(void *, unsigned __int64);
1271 #endif /* defined (_M_X64) */
1272 
1273 /*
1274  * Performs a full or partial restore of the enabled processor extended
1275  * and supervisor states using the state information stored in the
1276  * specified memory address location and masks in XCR0 and IA32_XSS MSR.
1277  */
1278 extern void __cdecl _xrstors(void const *, unsigned __int64);
1279 #if defined (_M_X64)
1280 extern void __cdecl _xrstors64(void const *, unsigned __int64);
1281 #endif /* defined (_M_X64) */
1282 
1283 /*
1284  * Saves the current state of the x87 FPU, MMX technology, XMM,
1285  * and MXCSR registers to the specified 512-byte memory location.
1286  */
1287 extern void __cdecl _fxsave(void *);
1288 #if defined (_M_X64)
1289 extern void __cdecl _fxsave64(void *);
1290 #endif /* defined (_M_X64) */
1291 
1292 /*
1293  * Restores the state of the x87 FPU, MMX technology, XMM,
1294  * and MXCSR registers from the specified 512-byte memory location.
1295  */
1296 extern void __cdecl _fxrstor(void const *);
1297 #if defined (_M_X64)
1298 extern void __cdecl _fxrstor64(void const *);
1299 #endif /* defined (_M_X64) */
1300 
1301 /*
1302  * Performs one attempt to generate a hardware-generated random value.
1303  * The generated value is written to the given memory location and the success
1304  * status is returned: 1 if the hardware could generate a valid random number
1305  * and 0 otherwise.
1306  */
1307 extern int __cdecl _rdrand16_step(unsigned short *);
1308 extern int __cdecl _rdrand32_step(unsigned int *);
1309 #if defined (_M_X64)
1310 extern int __cdecl _rdrand64_step(unsigned __int64 *);
1311 #endif /* defined (_M_X64) */
1312 
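A sketch of the retry loop Intel's documentation suggests for RDRAND: the step functions can transiently fail, so bound the number of attempts (10 below is a conventional, arbitrary choice).

#include <immintrin.h>

/* Returns 1 and writes a random value on success, 0 if the hardware
   failed to deliver entropy after every attempt. */
int random_u32(unsigned int *out)
{
    int attempts;
    for (attempts = 0; attempts < 10; ++attempts) {
        if (_rdrand32_step(out))
            return 1;
    }
    return 0;
}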
1313 #if defined (_M_X64)
1314 /*
1315  * Return the value of the FS/GS segment base register.
1316  */
1317 extern unsigned int __cdecl _readfsbase_u32();
1318 extern unsigned int __cdecl _readgsbase_u32();
1319 extern unsigned __int64 __cdecl _readfsbase_u64();
1320 extern unsigned __int64 __cdecl _readgsbase_u64();
1321 
1322 /*
1323  * Write the value to the FS/GS segment base register.
1324  */
1325 extern void __cdecl _writefsbase_u32(unsigned int);
1326 extern void __cdecl _writegsbase_u32(unsigned int);
1327 extern void __cdecl _writefsbase_u64(unsigned __int64);
1328 extern void __cdecl _writegsbase_u64(unsigned __int64);
1329 #endif /* defined (_M_X64) */
1330 
1331 /*
1332  * Perform FMA (Fused Multiply-and-Add) operations.
1333  */
1334 extern __m128 __cdecl _mm_fmadd_ps(__m128, __m128, __m128);
1335 extern __m128d __cdecl _mm_fmadd_pd(__m128d, __m128d, __m128d);
1336 extern __m128 __cdecl _mm_fmadd_ss(__m128, __m128, __m128);
1337 extern __m128d __cdecl _mm_fmadd_sd(__m128d, __m128d, __m128d);
1338 extern __m128 __cdecl _mm_fmsub_ps(__m128, __m128, __m128);
1339 extern __m128d __cdecl _mm_fmsub_pd(__m128d, __m128d, __m128d);
1340 extern __m128 __cdecl _mm_fmsub_ss(__m128, __m128, __m128);
1341 extern __m128d __cdecl _mm_fmsub_sd(__m128d, __m128d, __m128d);
1342 extern __m128 __cdecl _mm_fnmadd_ps(__m128, __m128, __m128);
1343 extern __m128d __cdecl _mm_fnmadd_pd(__m128d, __m128d, __m128d);
1344 extern __m128 __cdecl _mm_fnmadd_ss(__m128, __m128, __m128);
1345 extern __m128d __cdecl _mm_fnmadd_sd(__m128d, __m128d, __m128d);
1346 extern __m128 __cdecl _mm_fnmsub_ps(__m128, __m128, __m128);
1347 extern __m128d __cdecl _mm_fnmsub_pd(__m128d, __m128d, __m128d);
1348 extern __m128 __cdecl _mm_fnmsub_ss(__m128, __m128, __m128);
1349 extern __m128d __cdecl _mm_fnmsub_sd(__m128d, __m128d, __m128d);
1350 
1351 extern __m256 __cdecl _mm256_fmadd_ps(__m256, __m256, __m256);
1352 extern __m256d __cdecl _mm256_fmadd_pd(__m256d, __m256d, __m256d);
1353 extern __m256 __cdecl _mm256_fmsub_ps(__m256, __m256, __m256);
1354 extern __m256d __cdecl _mm256_fmsub_pd(__m256d, __m256d, __m256d);
1355 extern __m256 __cdecl _mm256_fnmadd_ps(__m256, __m256, __m256);
1356 extern __m256d __cdecl _mm256_fnmadd_pd(__m256d, __m256d, __m256d);
1357 extern __m256 __cdecl _mm256_fnmsub_ps(__m256, __m256, __m256);
1358 extern __m256d __cdecl _mm256_fnmsub_pd(__m256d, __m256d, __m256d);
1359 
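For example, a minimal FMA sketch (assumes FMA3 hardware): an 8-wide saxpy step, y = a*x + y, where the multiply and add round once rather than twice.

#include <immintrin.h>

void saxpy8(float a, const float *x, float *y)
{
    __m256 va = _mm256_set1_ps(a);
    __m256 vx = _mm256_loadu_ps(x);
    __m256 vy = _mm256_loadu_ps(y);
    _mm256_storeu_ps(y, _mm256_fmadd_ps(va, vx, vy));  /* fused a*x + y */
}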
1360 
1361 /*
1362  * Fused Multiply-and-Add/Subtract and Multiply-and-Subtract/Add operations.
1363  */
1364 extern __m128 __cdecl _mm_fmaddsub_ps(__m128, __m128, __m128);
1365 extern __m128d __cdecl _mm_fmaddsub_pd(__m128d, __m128d, __m128d);
1366 extern __m128 __cdecl _mm_fmsubadd_ps(__m128, __m128, __m128);
1367 extern __m128d __cdecl _mm_fmsubadd_pd(__m128d, __m128d, __m128d);
1368 
1369 extern __m256 __cdecl _mm256_fmaddsub_ps(__m256, __m256, __m256);
1370 extern __m256d __cdecl _mm256_fmaddsub_pd(__m256d, __m256d, __m256d);
1371 extern __m256 __cdecl _mm256_fmsubadd_ps(__m256, __m256, __m256);
1372 extern __m256d __cdecl _mm256_fmsubadd_pd(__m256d, __m256d, __m256d);
1373 
1374 
1375 /*
1376  * Integer 256-bit vector comparison operations.
1377  */
1378 extern __m256i __cdecl _mm256_cmpeq_epi8(__m256i, __m256i);
1379 extern __m256i __cdecl _mm256_cmpeq_epi16(__m256i, __m256i);
1380 extern __m256i __cdecl _mm256_cmpeq_epi32(__m256i, __m256i);
1381 extern __m256i __cdecl _mm256_cmpeq_epi64(__m256i, __m256i);
1382 
1383 extern __m256i __cdecl _mm256_cmpgt_epi8(__m256i, __m256i);
1384 extern __m256i __cdecl _mm256_cmpgt_epi16(__m256i, __m256i);
1385 extern __m256i __cdecl _mm256_cmpgt_epi32(__m256i, __m256i);
1386 extern __m256i __cdecl _mm256_cmpgt_epi64(__m256i, __m256i);
1387 
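A sketch combining these comparisons with _mm256_movemask_epi8 (declared below) and _tzcnt_u32: locate the first occurrence of a byte in a 32-byte block. Requires AVX2 and BMI1.

#include <immintrin.h>

/* Returns the index of the first byte equal to key, or 32 if absent
   (_tzcnt_u32 of a zero mask is 32 by definition). */
int find_byte32(const void *p, unsigned char key)
{
    __m256i block = _mm256_loadu_si256((__m256i const *)p);
    __m256i keys  = _mm256_set1_epi8((char)key);
    int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(block, keys));
    return (int)_tzcnt_u32((unsigned int)mask);
}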
1388 
1389 /*
1390  * Integer 256-bit vector MIN/MAX operations.
1391  */
1392 extern __m256i __cdecl _mm256_max_epi8(__m256i, __m256i);
1393 extern __m256i __cdecl _mm256_max_epi16(__m256i, __m256i);
1394 extern __m256i __cdecl _mm256_max_epi32(__m256i, __m256i);
1395 extern __m256i __cdecl _mm256_max_epu8(__m256i, __m256i);
1396 extern __m256i __cdecl _mm256_max_epu16(__m256i, __m256i);
1397 extern __m256i __cdecl _mm256_max_epu32(__m256i, __m256i);
1398 
1399 extern __m256i __cdecl _mm256_min_epi8(__m256i, __m256i);
1400 extern __m256i __cdecl _mm256_min_epi16(__m256i, __m256i);
1401 extern __m256i __cdecl _mm256_min_epi32(__m256i, __m256i);
1402 extern __m256i __cdecl _mm256_min_epu8(__m256i, __m256i);
1403 extern __m256i __cdecl _mm256_min_epu16(__m256i, __m256i);
1404 extern __m256i __cdecl _mm256_min_epu32(__m256i, __m256i);
1405 
1406 
1407 /*
1408  * Integer 256-bit vector logical operations.
1409  */
1410 extern __m256i __cdecl _mm256_and_si256(__m256i, __m256i);
1411 extern __m256i __cdecl _mm256_andnot_si256(__m256i, __m256i);
1412 extern __m256i __cdecl _mm256_or_si256(__m256i, __m256i);
1413 extern __m256i __cdecl _mm256_xor_si256(__m256i, __m256i);
1414 
1415 
1416 /*
1417  * Integer 256-bit vector arithmetic operations.
1418  */
1419 extern __m256i __cdecl _mm256_abs_epi8(__m256i);
1420 extern __m256i __cdecl _mm256_abs_epi16(__m256i);
1421 extern __m256i __cdecl _mm256_abs_epi32(__m256i);
1422 
1423 extern __m256i __cdecl _mm256_add_epi8(__m256i, __m256i);
1424 extern __m256i __cdecl _mm256_add_epi16(__m256i, __m256i);
1425 extern __m256i __cdecl _mm256_add_epi32(__m256i, __m256i);
1426 extern __m256i __cdecl _mm256_add_epi64(__m256i, __m256i);
1427 
1428 extern __m256i __cdecl _mm256_adds_epi8(__m256i, __m256i);
1429 extern __m256i __cdecl _mm256_adds_epi16(__m256i, __m256i);
1430 extern __m256i __cdecl _mm256_adds_epu8(__m256i, __m256i);
1431 extern __m256i __cdecl _mm256_adds_epu16(__m256i, __m256i);
1432 
1433 extern __m256i __cdecl _mm256_sub_epi8(__m256i, __m256i);
1434 extern __m256i __cdecl _mm256_sub_epi16(__m256i, __m256i);
1435 extern __m256i __cdecl _mm256_sub_epi32(__m256i, __m256i);
1436 extern __m256i __cdecl _mm256_sub_epi64(__m256i, __m256i);
1437 
1438 extern __m256i __cdecl _mm256_subs_epi8(__m256i, __m256i);
1439 extern __m256i __cdecl _mm256_subs_epi16(__m256i, __m256i);
1440 extern __m256i __cdecl _mm256_subs_epu8(__m256i, __m256i);
1441 extern __m256i __cdecl _mm256_subs_epu16(__m256i, __m256i);
1442 
1443 extern __m256i __cdecl _mm256_avg_epu8(__m256i, __m256i);
1444 extern __m256i __cdecl _mm256_avg_epu16(__m256i, __m256i);
1445 
1446 extern __m256i __cdecl _mm256_hadd_epi16(__m256i, __m256i);
1447 extern __m256i __cdecl _mm256_hadd_epi32(__m256i, __m256i);
1448 extern __m256i __cdecl _mm256_hadds_epi16(__m256i, __m256i);
1449 
1450 extern __m256i __cdecl _mm256_hsub_epi16(__m256i, __m256i);
1451 extern __m256i __cdecl _mm256_hsub_epi32(__m256i, __m256i);
1452 extern __m256i __cdecl _mm256_hsubs_epi16(__m256i, __m256i);
1453 
1454 extern __m256i __cdecl _mm256_madd_epi16(__m256i, __m256i);
1455 extern __m256i __cdecl _mm256_maddubs_epi16(__m256i, __m256i);
1456 
1457 extern __m256i __cdecl _mm256_mulhi_epi16(__m256i, __m256i);
1458 extern __m256i __cdecl _mm256_mulhi_epu16(__m256i, __m256i);
1459 
1460 extern __m256i __cdecl _mm256_mullo_epi16(__m256i, __m256i);
1461 extern __m256i __cdecl _mm256_mullo_epi32(__m256i, __m256i);
1462 
1463 extern __m256i __cdecl _mm256_mul_epu32(__m256i, __m256i);
1464 extern __m256i __cdecl _mm256_mul_epi32(__m256i, __m256i);
1465 
1466 extern __m256i __cdecl _mm256_sign_epi8(__m256i, __m256i);
1467 extern __m256i __cdecl _mm256_sign_epi16(__m256i, __m256i);
1468 extern __m256i __cdecl _mm256_sign_epi32(__m256i, __m256i);
1469 
1470 extern __m256i __cdecl _mm256_mulhrs_epi16(__m256i, __m256i);
1471 
1472 extern __m256i __cdecl _mm256_sad_epu8(__m256i, __m256i);
1473 extern __m256i __cdecl _mm256_mpsadbw_epu8(__m256i, __m256i, const int);
1474 
1475 
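As a worked sketch of _mm256_madd_epi16 (assumes AVX2): a 16-element dot product of 16-bit integers. madd multiplies adjacent pairs and sums them into 32-bit lanes; the remaining lines are a standard horizontal reduction.

#include <immintrin.h>

int dot16(const short a[16], const short b[16])
{
    __m256i prod = _mm256_madd_epi16(_mm256_loadu_si256((__m256i const *)a),
                                     _mm256_loadu_si256((__m256i const *)b));
    __m128i s = _mm_add_epi32(_mm256_castsi256_si128(prod),
                              _mm256_extracti128_si256(prod, 1)); /* 4 x int32 */
    s = _mm_add_epi32(s, _mm_shuffle_epi32(s, 0x4E));  /* add upper/lower halves */
    s = _mm_add_epi32(s, _mm_shuffle_epi32(s, 0xB1));  /* add adjacent pairs */
    return _mm_cvtsi128_si32(s);
}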
1476 /*
1477  * Integer 256-bit vector arithmetic/logical shift operations.
1478  */
1479 extern __m256i __cdecl _mm256_slli_si256(__m256i, const int);
1480 #define _mm256_bslli_epi128 _mm256_slli_si256
1481 extern __m256i __cdecl _mm256_srli_si256(__m256i, const int);
1482 #define _mm256_bsrli_epi128 _mm256_srli_si256
1483 
1484 extern __m256i __cdecl _mm256_sll_epi16(__m256i, __m128i);
1485 extern __m256i __cdecl _mm256_sll_epi32(__m256i, __m128i);
1486 extern __m256i __cdecl _mm256_sll_epi64(__m256i, __m128i);
1487 
1488 extern __m256i __cdecl _mm256_slli_epi16(__m256i, int);
1489 extern __m256i __cdecl _mm256_slli_epi32(__m256i, int);
1490 extern __m256i __cdecl _mm256_slli_epi64(__m256i, int);
1491 
1492 extern __m256i __cdecl _mm256_sllv_epi32(__m256i, __m256i);
1493 extern __m256i __cdecl _mm256_sllv_epi64(__m256i, __m256i);
1494 
1495 extern __m128i __cdecl _mm_sllv_epi32(__m128i, __m128i);
1496 extern __m128i __cdecl _mm_sllv_epi64(__m128i, __m128i);
1497 
1498 extern __m256i __cdecl _mm256_sra_epi16(__m256i, __m128i);
1499 extern __m256i __cdecl _mm256_sra_epi32(__m256i, __m128i);
1500 
1501 extern __m256i __cdecl _mm256_srai_epi16(__m256i, int);
1502 extern __m256i __cdecl _mm256_srai_epi32(__m256i, int);
1503 
1504 extern __m256i __cdecl _mm256_srav_epi32(__m256i, __m256i);
1505 
1506 extern __m128i __cdecl _mm_srav_epi32(__m128i, __m128i);
1507 
1508 extern __m256i __cdecl _mm256_srl_epi16(__m256i, __m128i);
1509 extern __m256i __cdecl _mm256_srl_epi32(__m256i, __m128i);
1510 extern __m256i __cdecl _mm256_srl_epi64(__m256i, __m128i);
1511 
1512 extern __m256i __cdecl _mm256_srli_epi16(__m256i, int);
1513 extern __m256i __cdecl _mm256_srli_epi32(__m256i, int);
1514 extern __m256i __cdecl _mm256_srli_epi64(__m256i, int);
1515 
1516 extern __m256i __cdecl _mm256_srlv_epi32(__m256i, __m256i);
1517 extern __m256i __cdecl _mm256_srlv_epi64(__m256i, __m256i);
1518 
1519 extern __m128i __cdecl _mm_srlv_epi32(__m128i, __m128i);
1520 extern __m128i __cdecl _mm_srlv_epi64(__m128i, __m128i);
1521 
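A small sketch contrasting the immediate and per-element shift forms (AVX2): slli applies one count to every lane, while sllv takes a separate count per 32-bit lane from its second operand.

#include <immintrin.h>

__m256i shift_demo(__m256i v)
{
    __m256i uniform = _mm256_slli_epi32(v, 2);          /* every lane << 2 */
    __m256i counts  = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i varying = _mm256_sllv_epi32(v, counts);     /* lane i << i */
    return _mm256_xor_si256(uniform, varying);          /* combine for demo */
}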
1522 
1523 /*
1524  * Integer 128/256-bit vector pack/blend/shuffle/insert/extract operations.
1525  */
1526 extern __m128i __cdecl _mm_blend_epi32(__m128i, __m128i, const int);
1527 
1528 extern __m256i __cdecl _mm256_blend_epi32(__m256i,__m256i, const int);
1529 
1530 extern __m256i __cdecl _mm256_alignr_epi8(__m256i, __m256i, const int);
1531 
1533 extern __m256i __cdecl _mm256_blend_epi16(__m256i, __m256i, const int);
1534 
1535 extern __m256i __cdecl _mm256_packs_epi16(__m256i, __m256i);
1536 extern __m256i __cdecl _mm256_packs_epi32(__m256i, __m256i);
1537 extern __m256i __cdecl _mm256_packus_epi16(__m256i, __m256i);
1538 extern __m256i __cdecl _mm256_packus_epi32(__m256i, __m256i);
1539 
1540 extern __m256i __cdecl _mm256_unpackhi_epi8(__m256i, __m256i);
1541 extern __m256i __cdecl _mm256_unpackhi_epi16(__m256i, __m256i);
1542 extern __m256i __cdecl _mm256_unpackhi_epi32(__m256i, __m256i);
1543 extern __m256i __cdecl _mm256_unpackhi_epi64(__m256i, __m256i);
1544 
1545 extern __m256i __cdecl _mm256_unpacklo_epi8(__m256i, __m256i);
1546 extern __m256i __cdecl _mm256_unpacklo_epi16(__m256i, __m256i);
1547 extern __m256i __cdecl _mm256_unpacklo_epi32(__m256i, __m256i);
1548 extern __m256i __cdecl _mm256_unpacklo_epi64(__m256i, __m256i);
1549 
1550 extern __m256i __cdecl _mm256_shuffle_epi8(__m256i, __m256i);
1551 extern __m256i __cdecl _mm256_shuffle_epi32(__m256i, const int);
1552 
1553 extern __m256i __cdecl _mm256_shufflehi_epi16(__m256i, const int);
1554 extern __m256i __cdecl _mm256_shufflelo_epi16(__m256i, const int);
1555 
1556 extern __m128i __cdecl _mm256_extracti128_si256(__m256i, const int);
1557 extern __m256i __cdecl _mm256_inserti128_si256(__m256i, __m128i, const int);
1558 
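For example, a sketch built on the 128-bit extract/insert pair (AVX2): swap the two halves of a 256-bit integer vector. The cast intrinsics cost nothing; only extract and insert generate instructions.

#include <immintrin.h>

__m256i swap_halves(__m256i v)
{
    __m128i lo = _mm256_castsi256_si128(v);          /* low 128 bits */
    __m128i hi = _mm256_extracti128_si256(v, 1);     /* high 128 bits */
    return _mm256_inserti128_si256(_mm256_castsi128_si256(hi), lo, 1);
}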
1559 
1560 /*
1561  * Scalar to 128/256-bit vector broadcast operations.
1562  */
1563 extern __m128 __cdecl _mm_broadcastss_ps(__m128);
1564 extern __m128d __cdecl _mm_broadcastsd_pd(__m128d);
1565 
1566 extern __m128i __cdecl _mm_broadcastb_epi8(__m128i);
1567 extern __m128i __cdecl _mm_broadcastw_epi16(__m128i);
1568 extern __m128i __cdecl _mm_broadcastd_epi32(__m128i);
1569 extern __m128i __cdecl _mm_broadcastq_epi64(__m128i);
1570 
1571 extern __m256 __cdecl _mm256_broadcastss_ps(__m128);
1572 extern __m256d __cdecl _mm256_broadcastsd_pd(__m128d);
1573 
1574 extern __m256i __cdecl _mm256_broadcastb_epi8(__m128i);
1575 extern __m256i __cdecl _mm256_broadcastw_epi16(__m128i);
1576 extern __m256i __cdecl _mm256_broadcastd_epi32(__m128i);
1577 extern __m256i __cdecl _mm256_broadcastq_epi64(__m128i);
1578 
1579 extern __m256i __cdecl _mm256_broadcastsi128_si256(__m128i);
1580 
1581 
1582 
1583 /*
1584  * Integer 256-bit vector signed/unsigned extension operations.
1585  */
1586 extern __m256i __cdecl _mm256_cvtepi8_epi16(__m128i);
1587 extern __m256i __cdecl _mm256_cvtepi8_epi32(__m128i);
1588 extern __m256i __cdecl _mm256_cvtepi8_epi64(__m128i);
1589 extern __m256i __cdecl _mm256_cvtepi16_epi32(__m128i);
1590 extern __m256i __cdecl _mm256_cvtepi16_epi64(__m128i);
1591 extern __m256i __cdecl _mm256_cvtepi32_epi64(__m128i);
1592 
1593 extern __m256i __cdecl _mm256_cvtepu8_epi16(__m128i);
1594 extern __m256i __cdecl _mm256_cvtepu8_epi32(__m128i);
1595 extern __m256i __cdecl _mm256_cvtepu8_epi64(__m128i);
1596 extern __m256i __cdecl _mm256_cvtepu16_epi32(__m128i);
1597 extern __m256i __cdecl _mm256_cvtepu16_epi64(__m128i);
1598 extern __m256i __cdecl _mm256_cvtepu32_epi64(__m128i);
1599 
1600 
1601 /*
1602  * Returns a 32-bit mask made up of the most significant bit of each byte
1603  * of the 256-bit vector source operand.
1604  */
1605 extern int __cdecl _mm256_movemask_epi8(__m256i);
1606 
1607 
1608 /*
1609  * Masked load/store operations.
1610  */
1611 extern __m128i __cdecl _mm_maskload_epi32(int const * /* ptr */,
1612  __m128i /* vmask */);
1613 extern __m128i __cdecl _mm_maskload_epi64(__int64 const * /* ptr */,
1614  __m128i /* vmask */);
1615 
1616 extern void __cdecl _mm_maskstore_epi32(int * /* ptr */,
1617  __m128i /* vmask */,
1618  __m128i /* val */);
1619 extern void __cdecl _mm_maskstore_epi64(__int64 * /* ptr */,
1620  __m128i /* vmask */,
1621  __m128i /* val */);
1622 
1623 extern __m256i __cdecl _mm256_maskload_epi32(int const * /* ptr */,
1624  __m256i /* vmask */);
1625 extern __m256i __cdecl _mm256_maskload_epi64(__int64 const * /* ptr */,
1626  __m256i /* vmask */);
1627 
1628 extern void __cdecl _mm256_maskstore_epi32(int * /* ptr */,
1629  __m256i /* vmask */,
1630  __m256i /* val */);
1631 extern void __cdecl _mm256_maskstore_epi64(__int64 * /* ptr */,
1632  __m256i /* vmask */,
1633  __m256i /* val */);
1634 
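A sketch of the classic use for the masked forms: reading an array tail without touching memory past the end. A lane is loaded only when the most significant bit of its mask element is set; masked-off lanes read as zero.

#include <immintrin.h>

/* Load the first n ints (0 <= n <= 8); the rest of the vector is 0. */
__m256i load_first_n(const int *p, int n)
{
    __m256i idx  = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(n), idx); /* on iff i < n */
    return _mm256_maskload_epi32(p, mask);
}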
1635 
1636 /*
1637  * Permute elements in vector operations.
1638  */
1639 extern __m256i __cdecl _mm256_permutevar8x32_epi32(__m256i, __m256i);
1640 extern __m256 __cdecl _mm256_permutevar8x32_ps(__m256, __m256i);
1641 
1642 extern __m256i __cdecl _mm256_permute4x64_epi64(__m256i, const int);
1643 extern __m256d __cdecl _mm256_permute4x64_pd(__m256d, const int);
1644 
1645 extern __m256i __cdecl _mm256_permute2x128_si256(__m256i, __m256i, const int);
1646 
1647 
1648 /*
1649  * Load 32 bytes from a 32-byte-aligned memory location using a non-temporal hint.
1650  */
1651 extern __m256i __cdecl _mm256_stream_load_si256(__m256i const *);
1652 
1653 
1654 
1655 /*
1656  * Masked GATHER from memory to vector register operations.
1657  */
1658 extern __m256d __cdecl _mm256_mask_i32gather_pd(__m256d /* old_dst */,
1659  double const * /* ptr */,
1660  __m128i /* vindex */,
1661  __m256d /* vmask */,
1662  const int /* scale */);
1663 extern __m256 __cdecl _mm256_mask_i32gather_ps(__m256 /* old_dst */,
1664  float const * /* ptr */,
1665  __m256i /* vindex */,
1666  __m256 /* vmask */,
1667  const int /* scale */);
1668 extern __m256d __cdecl _mm256_mask_i64gather_pd(__m256d /* old_dst */,
1669  double const * /* ptr */,
1670  __m256i /* vindex */,
1671  __m256d /* vmask */,
1672  const int /* scale */);
1673 extern __m128 __cdecl _mm256_mask_i64gather_ps(__m128 /* old_dst */,
1674  float const * /* ptr */,
1675  __m256i /* vindex */,
1676  __m128 /* vmask */,
1677  const int /* scale */);
1678 
1679 extern __m128d __cdecl _mm_mask_i32gather_pd(__m128d /* old_dst */,
1680  double const * /* ptr */,
1681  __m128i /* vindex */,
1682  __m128d /* vmask */,
1683  const int /* scale */);
1684 extern __m128 __cdecl _mm_mask_i32gather_ps(__m128 /* old_dst */,
1685  float const * /* ptr */,
1686  __m128i /* vindex */,
1687  __m128 /* vmask */,
1688  const int /* scale */);
1689 extern __m128d __cdecl _mm_mask_i64gather_pd(__m128d /* old_dst */,
1690  double const * /* ptr */,
1691  __m128i /* vindex */,
1692  __m128d /* vmask */,
1693  const int /* scale */);
1694 extern __m128 __cdecl _mm_mask_i64gather_ps(__m128 /* old_dst */,
1695  float const * /* ptr */,
1696  __m128i /* vindex */,
1697  __m128 /* vmask */,
1698  const int /* scale */);
1699 
1700 
1701 extern __m256i __cdecl _mm256_mask_i32gather_epi32(__m256i /* old_dst */,
1702  int const * /* ptr */,
1703  __m256i /* vindex */,
1704  __m256i /* vmask */,
1705  const int /* scale */);
1706 extern __m256i __cdecl _mm256_mask_i32gather_epi64(__m256i /* old_dst */,
1707  __int64 const * /* ptr */,
1708  __m128i /* vindex */,
1709  __m256i /* vmask */,
1710  const int /* scale */);
1711 extern __m128i __cdecl _mm256_mask_i64gather_epi32(__m128i /* old_dst */,
1712  int const * /* ptr */,
1713  __m256i /* vindex */,
1714  __m128i /* vmask */,
1715  const int /* scale */);
1716 extern __m256i __cdecl _mm256_mask_i64gather_epi64(__m256i /* old_dst */,
1717  __int64 const * /* ptr */,
1718  __m256i /* vindex */,
1719  __m256i /* vmask */,
1720  const int /* scale */);
1721 
1722 extern __m128i __cdecl _mm_mask_i32gather_epi32(__m128i /* old_dst */,
1723  int const * /* ptr */,
1724  __m128i /* vindex */,
1725  __m128i /* vmask */,
1726  const int /* scale */);
1727 extern __m128i __cdecl _mm_mask_i32gather_epi64(__m128i /* old_dst */,
1728  __int64 const * /* ptr */,
1729  __m128i /* vindex */,
1730  __m128i /* vmask */,
1731  const int /* scale */);
1732 extern __m128i __cdecl _mm_mask_i64gather_epi32(__m128i /* old_dst */,
1733  int const * /* ptr */,
1734  __m128i /* vindex */,
1735  __m128i /* vmask */,
1736  const int /* scale */);
1737 extern __m128i __cdecl _mm_mask_i64gather_epi64(__m128i /* old_dst */,
1738  __int64 const * /* ptr */,
1739  __m128i /* vindex */,
1740  __m128i /* vmask */,
1741  const int /* scale */);
1742 
1743 
1744 /*
1745  * GATHER from memory to vector register operations.
1746  */
1747 extern __m256d __cdecl _mm256_i32gather_pd(double const * /* ptr */,
1748  __m128i /* vindex */,
1749  const int /* index_scale */);
1750 extern __m256 __cdecl _mm256_i32gather_ps(float const * /* ptr */,
1751  __m256i /* vindex */,
1752  const int /* index_scale */);
1753 extern __m256d __cdecl _mm256_i64gather_pd(double const * /* ptr */,
1754  __m256i /* vindex */,
1755  const int /* index_scale */);
1756 extern __m128 __cdecl _mm256_i64gather_ps(float const * /* ptr */,
1757  __m256i /* vindex */,
1758  const int /* index_scale */);
1759 
1760 extern __m128d __cdecl _mm_i32gather_pd(double const * /* ptr */,
1761  __m128i /* vindex */,
1762  const int /* index_scale */);
1763 extern __m128 __cdecl _mm_i32gather_ps(float const * /* ptr */,
1764  __m128i /* vindex */,
1765  const int /* index_scale */);
1766 extern __m128d __cdecl _mm_i64gather_pd(double const * /* ptr */,
1767  __m128i /* vindex */,
1768  const int /* index_scale */);
1769 extern __m128 __cdecl _mm_i64gather_ps(float const * /* ptr */,
1770  __m128i /* vindex */,
1771  const int /* index_scale */);
1772 
1773 extern __m256i __cdecl _mm256_i32gather_epi32(int const * /* ptr */,
1774  __m256i /* vindex */,
1775  const int /* scale */);
1776 extern __m256i __cdecl _mm256_i32gather_epi64(__int64 const * /* ptr */,
1777  __m128i /* vindex */,
1778  const int /* scale */);
1779 extern __m128i __cdecl _mm256_i64gather_epi32(int const * /* ptr */,
1780  __m256i /* vindex */,
1781  const int /* scale */);
1782 extern __m256i __cdecl _mm256_i64gather_epi64(__int64 const * /* ptr */,
1783  __m256i /* vindex */,
1784  const int /* scale */);
1785 
1786 extern __m128i __cdecl _mm_i32gather_epi32(int const * /* ptr */,
1787  __m128i /* vindex */,
1788  const int /* index_scale */);
1789 extern __m128i __cdecl _mm_i32gather_epi64(__int64 const * /* ptr */,
1790  __m128i /* vindex */,
1791  const int /* index_scale */);
1792 extern __m128i __cdecl _mm_i64gather_epi32(int const * /* ptr */,
1793  __m128i /* vindex */,
1794  const int /* index_scale */);
1795 extern __m128i __cdecl _mm_i64gather_epi64(__int64 const * /* ptr */,
1796  __m128i /* vindex */,
1797  const int /* index_scale */);
1798 
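As a usage sketch (AVX2): a vectorized table lookup that fetches eight floats in one gather. The scale argument must be a compile-time constant of 1, 2, 4, or 8; it is 4 here because the indices count float-sized elements.

#include <immintrin.h>

__m256 gather8(const float *table, const int idx[8])
{
    __m256i vindex = _mm256_loadu_si256((__m256i const *)idx);
    return _mm256_i32gather_ps(table, vindex, 4);
}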
1799 
1800 /*
1801  * A collection of operations to manipulate integer data at bit-granularity.
1802  * The names of these functions are formed from the instruction mnemonic and
1803  * the operand data type used to implement them.
1804  */
1805 extern unsigned int _bextr_u32(unsigned int /* src */,
1806  unsigned int /* start_bit */,
1807  unsigned int /* len_in_bits */);
1808 extern unsigned int _blsi_u32(unsigned int);
1809 extern unsigned int _blsmsk_u32(unsigned int);
1810 extern unsigned int _blsr_u32(unsigned int);
1811 extern unsigned int _bzhi_u32(unsigned int /* src */,
1812  unsigned int /* index */);
1813 extern unsigned int _mulx_u32(unsigned int /* src1 */,
1814  unsigned int /* src2 */,
1815  unsigned int * /* high_bits */);
1816 extern unsigned int _pdep_u32(unsigned int /* src */,
1817  unsigned int /* mask */);
1818 extern unsigned int _pext_u32(unsigned int /* src */,
1819  unsigned int /* mask */);
1820 extern unsigned int _rorx_u32(unsigned int /* src */,
1821  const unsigned int /* shift_count */);
1822 extern int _sarx_i32(int /* src */,
1823  unsigned int /* shift_count */);
1824 extern unsigned int _shlx_u32(unsigned int /* src */,
1825  unsigned int /* shift_count */);
1826 extern unsigned int _shrx_u32(unsigned int /* src */,
1827  unsigned int /* shift_count */);
1828 
1829 #if defined (_M_X64)
1830 extern unsigned __int64 _bextr_u64(unsigned __int64 /* src */,
1831  unsigned int /* start_bit */,
1832  unsigned int /* len_in_bits */);
1833 extern unsigned __int64 _blsi_u64(unsigned __int64);
1834 extern unsigned __int64 _blsmsk_u64(unsigned __int64);
1835 extern unsigned __int64 _blsr_u64(unsigned __int64);
1836 extern unsigned __int64 _bzhi_u64(unsigned __int64 /* src */,
1837  unsigned int /* index */);
1838 extern unsigned __int64 _mulx_u64(unsigned __int64 /* src1 */,
1839  unsigned __int64 /* src2 */,
1840  unsigned __int64 * /* high_bits */);
1841 extern unsigned __int64 _pdep_u64(unsigned __int64 /* src */,
1842  unsigned __int64 /* mask */);
1843 extern unsigned __int64 _pext_u64(unsigned __int64 /* src */,
1844  unsigned __int64 /* mask */);
1845 extern unsigned __int64 _rorx_u64(unsigned __int64 /* src */,
1846  const unsigned int /* shift_count */);
1847 extern __int64 _sarx_i64(__int64 /* src */,
1848  unsigned int /* shift_count */);
1849 extern unsigned __int64 _shlx_u64(unsigned __int64 /* src */,
1850  unsigned int /* shift_count */);
1851 extern unsigned __int64 _shrx_u64(unsigned __int64 /* src */,
1852  unsigned int /* shift_count */);
1853 #endif /* defined (_M_X64) */
1854 
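A sketch of three of these primitives (BMI1/BMI2): _bextr_u32 pulls out a contiguous bit field, while _pext_u32/_pdep_u32 gather and scatter the bits selected by an arbitrary mask, so pdep(pext(x, m), m) reproduces exactly x's bits inside m.

#include <immintrin.h>

unsigned int bits_demo(unsigned int x)
{
    unsigned int field  = _bextr_u32(x, 4, 8);            /* bits [4..11] */
    unsigned int packed = _pext_u32(x, 0x00F000F0u);      /* 8 selected bits, compacted */
    unsigned int spread = _pdep_u32(packed, 0x00F000F0u); /* scattered back in place */
    return field ^ spread;
}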
1855 
1856 /*
1857  * Leading zero bit count.
1858  *
1859  * Counts the number of leading zero bits in a source operand.
1860  * Returns the operand size in bits when the source operand is zero.
1861  */
1862 extern unsigned int _lzcnt_u32(unsigned int);
1863 #if defined (_M_X64)
1864 extern unsigned __int64 _lzcnt_u64(unsigned __int64);
1865 #endif /* defined (_M_X64) */
1866 
1867 /*
1868  * Trailing zero bit count.
1869  *
1870  * Searches the source operand for the least significant set bit.
1871  * If a set bit is found, its bit index is returned; otherwise the
1872  * result is the operand size in bits.
1873  */
1874 extern unsigned int _tzcnt_u32(unsigned int);
1875 #if defined (_M_X64)
1876 extern unsigned __int64 _tzcnt_u64(unsigned __int64);
1877 #endif /* defined (_M_X64) */
1878 
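For instance, the zero-input guarantee makes an unconditional floor(log2) sketch possible, with only the x == 0 case needing a convention:

#include <immintrin.h>

/* floor(log2(x)) for x > 0; returns 0 for x == 0 by choice, since
   _lzcnt_u32(0) is 32 and 31 - 32 would wrap. */
unsigned int floor_log2(unsigned int x)
{
    return x ? 31 - _lzcnt_u32(x) : 0;
}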
1879 
1880 
1881 /*
1882  * An operation targeted at system software that manages processor context IDs.
1883  */
1884 extern void __cdecl _invpcid(unsigned int /* type */, void * /* descriptor */);
1885 
1886 // Hardware Lock Elision
1887 extern void _Store_HLERelease(long volatile *,long);
1888 extern void _StorePointer_HLERelease(void * volatile *,void *);
1889 
1890 extern long _InterlockedExchange_HLEAcquire(long volatile *,long);
1891 extern long _InterlockedExchange_HLERelease(long volatile *,long);
1892 extern void * _InterlockedExchangePointer_HLEAcquire(void *volatile *,void *);
1893 extern void * _InterlockedExchangePointer_HLERelease(void *volatile *,void *);
1894 
1895 extern long _InterlockedCompareExchange_HLEAcquire(long volatile *,long,long);
1896 extern long _InterlockedCompareExchange_HLERelease(long volatile *,long,long);
1897 extern __int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *,__int64,__int64);
1898 extern __int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *,__int64,__int64);
1899 extern void * _InterlockedCompareExchangePointer_HLEAcquire(void *volatile *,void *,void *);
1900 extern void * _InterlockedCompareExchangePointer_HLERelease(void *volatile *,void *,void *);
1901 
1902 extern long _InterlockedExchangeAdd_HLEAcquire(long volatile *,long);
1903 extern long _InterlockedExchangeAdd_HLERelease(long volatile *,long);
1904 
1905 extern long _InterlockedAnd_HLEAcquire(long volatile *,long);
1906 extern long _InterlockedAnd_HLERelease(long volatile *,long);
1907 extern long _InterlockedOr_HLEAcquire(long volatile *,long);
1908 extern long _InterlockedOr_HLERelease(long volatile *,long);
1909 extern long _InterlockedXor_HLEAcquire(long volatile *,long);
1910 extern long _InterlockedXor_HLERelease(long volatile *,long);
1911 
1912 extern unsigned char _interlockedbittestandset_HLEAcquire(long *,long);
1913 extern unsigned char _interlockedbittestandset_HLERelease(long *,long);
1914 extern unsigned char _interlockedbittestandreset_HLEAcquire(long *,long);
1915 extern unsigned char _interlockedbittestandreset_HLERelease(long *,long);
1916 
1917 #if defined(_M_X64)
1918 extern void _Store64_HLERelease(__int64 volatile *,__int64);
1919 extern __int64 _InterlockedExchange64_HLEAcquire(__int64 volatile *,__int64);
1920 extern __int64 _InterlockedExchange64_HLERelease(__int64 volatile *,__int64);
1921 
1922 extern __int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *,__int64);
1923 extern __int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *,__int64);
1924 
1925 extern __int64 _InterlockedAnd64_HLEAcquire(__int64 volatile *,__int64);
1926 extern __int64 _InterlockedAnd64_HLERelease(__int64 volatile *,__int64);
1927 extern __int64 _InterlockedOr64_HLEAcquire(__int64 volatile *,__int64);
1928 extern __int64 _InterlockedOr64_HLERelease(__int64 volatile *,__int64);
1929 extern __int64 _InterlockedXor64_HLEAcquire(__int64 volatile *,__int64);
1930 extern __int64 _InterlockedXor64_HLERelease(__int64 volatile *,__int64);
1931 
1932 extern unsigned char _interlockedbittestandset64_HLEAcquire(__int64 *,__int64);
1933 extern unsigned char _interlockedbittestandset64_HLERelease(__int64 *,__int64);
1934 extern unsigned char _interlockedbittestandreset64_HLEAcquire(__int64 *,__int64);
1935 extern unsigned char _interlockedbittestandreset64_HLERelease(__int64 *,__int64);
1936 #endif /* defined (_M_X64) */
1937 
1938 // Restricted Transactional Memory
1939 #define _XBEGIN_STARTED (~0u)
1940 #define _XABORT_EXPLICIT (1 << 0)
1941 #define _XABORT_RETRY (1 << 1)
1942 #define _XABORT_CONFLICT (1 << 2)
1943 #define _XABORT_CAPACITY (1 << 3)
1944 #define _XABORT_DEBUG (1 << 4)
1945 #define _XABORT_NESTED (1 << 5)
1946 #define _XABORT_CODE(x) ((unsigned char)(((x) >> 24) & 0xFF))
1947 
1948 extern unsigned int __cdecl _xbegin(void);
1949 extern void __cdecl _xend(void);
1950 extern void __cdecl _xabort(const unsigned int);
1951 extern unsigned char __cdecl _xtest(void);
1952 
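A sketch of the canonical RTM pattern: attempt the critical section as a transaction and fall back to an ordinary lock when it aborts. take_lock/drop_lock are hypothetical placeholders for a real lock implementation.

#include <immintrin.h>

extern void take_lock(void);   /* hypothetical fallback lock */
extern void drop_lock(void);

void increment(long volatile *counter)
{
    unsigned int status = _xbegin();
    if (status == _XBEGIN_STARTED) {
        *counter += 1;         /* executes transactionally */
        _xend();               /* commit */
    } else {
        /* aborted: status carries the _XABORT_* reason bits */
        take_lock();
        *counter += 1;
        drop_lock();
    }
}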
1953 /*
1954  * Perform one attempt to generate a hardware-generated random value
1955  * according to the NIST SP 800-90B/C standards.
1956  * The generated value is written to the given memory location and the success
1957  * status is returned: 1 if the hardware could generate a valid random number
1958  * and 0 otherwise.
1959  */
1960 extern int __cdecl _rdseed16_step(unsigned short *);
1961 extern int __cdecl _rdseed32_step(unsigned int *);
1962 #if defined(_M_X64)
1963 extern int __cdecl _rdseed64_step(unsigned __int64 *);
1964 #endif /* defined (_M_X64) */
1965 
1966 /*
1967  * The _addcarryx... functions generate ADCX and ADOX instructions which
1968  * use CF and OF (in the flags register) respectively to propagate carry.
1969  * Because this allows two add-with-carry sequences to be interleaved
1970  * without having to save and restore the carry flag this is useful in
1971  * multiprecision multiply for example. These functions return
1972  * the carry-out, which is convenient for chaining multiple operations.
1973  * The sum is written using the given reference.
1974  */
1975 extern unsigned char __cdecl _addcarryx_u32(unsigned char /*c_in*/,
1976  unsigned int /*src1*/,
1977  unsigned int /*src2*/,
1978  unsigned int * /*out*/);
1979 
1980 
1981 #if defined(_M_X64)
1982 extern unsigned char __cdecl _addcarryx_u64(unsigned char /*c_in*/,
1983  unsigned __int64 /*src1*/,
1984  unsigned __int64 /*src2*/,
1985  unsigned __int64 * /*out*/);
1986 #endif /* defined (_M_X64) */
1987 
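For example, a 128-bit addition sketch built from the 32-bit carry chain; each call consumes the previous carry-out and produces the next.

#include <immintrin.h>

/* sum = a + b over four 32-bit limbs, least significant first;
   returns the final carry-out. */
unsigned char add_u128(const unsigned int a[4], const unsigned int b[4],
                       unsigned int sum[4])
{
    unsigned char c = 0;
    c = _addcarryx_u32(c, a[0], b[0], &sum[0]);
    c = _addcarryx_u32(c, a[1], b[1], &sum[1]);
    c = _addcarryx_u32(c, a[2], b[2], &sum[2]);
    c = _addcarryx_u32(c, a[3], b[3], &sum[3]);
    return c;
}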
1988 
1989 /*
1990  * Load a big-endian value from memory.
1991  */
1992 extern unsigned short __cdecl _load_be_u16(void const*);
1993 extern unsigned int __cdecl _load_be_u32(void const*);
1994 extern unsigned __int64 __cdecl _load_be_u64(void const*);
1995 #define _loadbe_i16(be_ptr) ((short) _load_be_u16(be_ptr))
1996 #define _loadbe_i32(be_ptr) ((int) _load_be_u32(be_ptr))
1997 #define _loadbe_i64(be_ptr) ((__int64)_load_be_u64(be_ptr))
1998 
1999 /*
2000  * Store a value to memory in big-endian format.
2001  */
2002 extern void __cdecl _store_be_u16(void *, unsigned short);
2003 extern void __cdecl _store_be_u32(void *, unsigned int);
2004 extern void __cdecl _store_be_u64(void *, unsigned __int64);
2005 #define _storebe_i16(be_ptr, val) _store_be_u16(be_ptr, (unsigned short)(val))
2006 #define _storebe_i32(be_ptr, val) _store_be_u32(be_ptr, (unsigned int)(val))
2007 #define _storebe_i64(be_ptr, val) _store_be_u64(be_ptr, (unsigned __int64)(__int64)(val))
2008 
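A short sketch of the intended use: parsing a network-order (big-endian) field without a separate byte swap. The offset 4 is a hypothetical packet layout, not anything this header defines.

#include <immintrin.h>

unsigned int read_payload_length(const unsigned char *packet)
{
    return _load_be_u32(packet + 4);  /* load + byte swap in one step */
}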
2009 /*
2010  * The Secure Hash Algorithm (SHA) New Instructions.
2011  */
2012 extern __m128i __cdecl _mm_sha1msg1_epu32(__m128i, __m128i);
2013 extern __m128i __cdecl _mm_sha1msg2_epu32(__m128i, __m128i);
2014 extern __m128i __cdecl _mm_sha1nexte_epu32(__m128i, __m128i);
2015 extern __m128i __cdecl _mm_sha1rnds4_epu32(__m128i, __m128i, const int);
2016 
2017 extern __m128i __cdecl _mm_sha256msg1_epu32(__m128i, __m128i);
2018 extern __m128i __cdecl _mm_sha256msg2_epu32(__m128i, __m128i);
2019 extern __m128i __cdecl _mm_sha256rnds2_epu32(__m128i, __m128i, __m128i);
2020 
2021 /*
2022  * Intel(R) Memory Protection Extensions (Intel(R) MPX) intrinsic functions
2023  */
2024 extern void * __cdecl _bnd_set_ptr_bounds(const void *, size_t);
2025 extern void * __cdecl _bnd_narrow_ptr_bounds(const void *, const void *, size_t);
2026 extern void * __cdecl _bnd_copy_ptr_bounds(const void *, const void *);
2027 extern void * __cdecl _bnd_init_ptr_bounds(const void *);
2028 extern void __cdecl _bnd_store_ptr_bounds(const void **, const void *);
2029 extern void __cdecl _bnd_chk_ptr_lbounds(const void *);
2030 extern void __cdecl _bnd_chk_ptr_ubounds(const void *);
2031 extern void __cdecl _bnd_chk_ptr_bounds(const void *, size_t);
2032 extern void * __cdecl _bnd_load_ptr_bounds(const void **, const void *);
2033 extern const void * __cdecl _bnd_get_ptr_lbound(const void *);
2034 extern const void * __cdecl _bnd_get_ptr_ubound(const void *);
2035 
2036 #if defined __cplusplus
2037 }; /* End "C" */
2038 #endif /* defined __cplusplus */
2039 
2040 #endif /* defined (_M_CEE_PURE) */
2041 #endif /* __midl */
2042 #endif /* _INCLUDED_IMM */