STLdoc
STLdocumentation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
avx2intrin.h
Go to the documentation of this file.
1 /* Copyright (C) 2011-2013 Free Software Foundation, Inc.
2 
3  This file is part of GCC.
4 
5  GCC is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation; either version 3, or (at your option)
8  any later version.
9 
10  GCC is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  Under Section 7 of GPL version 3, you are granted additional
16  permissions described in the GCC Runtime Library Exception, version
17  3.1, as published by the Free Software Foundation.
18 
19  You should have received a copy of the GNU General Public License and
20  a copy of the GCC Runtime Library Exception along with this program;
21  see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22  <http://www.gnu.org/licenses/>. */
23 
24 #ifndef _IMMINTRIN_H_INCLUDED
25 # error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
26 #endif
27 
28 /* Sum absolute 8-bit integer difference of adjacent groups of 4
29  byte integers in the first 2 operands. Starting offsets within
30  operands are determined by the 3rd mask operand. */
#ifdef __OPTIMIZE__
/* VMPSADBW: sum of absolute differences of adjacent groups of four
   unsigned bytes; the immediate __M selects the starting offsets in
   each operand.  Requires a compile-time constant __M.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
					      (__v32qi)__Y, __M);
}
#else
/* Without optimization the inline form would not keep __M an
   immediate operand, so expand as a macro instead.  */
#define _mm256_mpsadbw_epu8(X, Y, M)					\
  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),		\
					(__v32qi)(__m256i)(Y), (int)(M)))
#endif
44 
45 extern __inline __m256i
46 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
47 _mm256_abs_epi8 (__m256i __A)
48 {
49  return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
50 }
51 
/* VPABSW: absolute value of each of the 16 packed signed 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi16 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
}

/* VPABSD: absolute value of each of the 8 packed signed 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi32 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
}
65 
/* VPACKSSDW: narrow 32-bit elements of __A and __B to 16 bits with
   signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
}

/* VPACKSSWB: narrow 16-bit elements to 8 bits with signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
}

/* VPACKUSDW: narrow 32-bit elements to 16 bits with unsigned saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
}

/* VPACKUSWB: narrow 16-bit elements to 8 bits with unsigned saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
}
93 
/* VPADDB: wrapping addition of 32 packed 8-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
}

/* VPADDW: wrapping addition of 16 packed 16-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
}
107 
108 extern __inline __m256i
109 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
110 _mm256_add_epi32 (__m256i __A, __m256i __B)
111 {
112  return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
113 }
114 
/* VPADDQ: wrapping addition of 4 packed 64-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
}

/* VPADDSB: addition of packed signed bytes with signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
}

/* VPADDSW: addition of packed signed 16-bit elements with signed
   saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
}

/* VPADDUSB: addition of packed unsigned bytes with unsigned saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
}

/* VPADDUSW: addition of packed unsigned 16-bit elements with unsigned
   saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
}
149 
#ifdef __OPTIMIZE__
/* VPALIGNR: per-128-bit-lane byte-wise right shift of the
   concatenation of __A and __B by __N bytes.  The builtin takes a
   bit count, hence the * 8.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
{
  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
					      (__v4di)__B,
					      __N * 8);
}
#else
/* In that case (__N*8) will be in vreg, and insn will not be matched. */
/* Use define instead */
#define _mm256_alignr_epi8(A, B, N)					\
  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),		\
					(__v4di)(__m256i)(B),		\
					(int)(N) * 8))
#endif
167 
168 extern __inline __m256i
169 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
170 _mm256_and_si256 (__m256i __A, __m256i __B)
171 {
172  return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B);
173 }
174 
/* VPANDN: bitwise (~__A) & __B over the full 256 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
}

/* VPAVGB: rounding average of packed unsigned bytes.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
}

/* VPAVGW: rounding average of packed unsigned 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
}
195 
/* VPBLENDVB: per-byte select — take the byte from __Y where the
   corresponding mask byte in __M has its high bit set, else from __X.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
{
  return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
					       (__v32qi)__Y,
					       (__v32qi)__M);
}
204 
#ifdef __OPTIMIZE__
/* VPBLENDW: select 16-bit elements from __X or __Y according to the
   immediate mask __M (must be a compile-time constant).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
					      (__v16hi)__Y,
					      __M);
}
#else
/* Macro form keeps __M an immediate when not optimizing.  */
#define _mm256_blend_epi16(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X),		\
					(__v16hi)(__m256i)(Y), (int)(M)))
#endif
219 
/* The compare intrinsics produce all-ones in each element where the
   predicate holds and all-zeros elsewhere.  */

/* VPCMPEQB: per-byte equality compare.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B);
}

/* VPCMPEQW: per-16-bit-element equality compare.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B);
}

/* VPCMPEQD: per-32-bit-element equality compare.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B);
}

/* VPCMPEQQ: per-64-bit-element equality compare.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B);
}

/* VPCMPGTB: signed greater-than compare of packed bytes.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A,
					     (__v32qi)__B);
}

/* VPCMPGTW: signed greater-than compare of packed 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A,
					     (__v16hi)__B);
}

/* VPCMPGTD: signed greater-than compare of packed 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A,
					     (__v8si)__B);
}

/* VPCMPGTQ: signed greater-than compare of packed 64-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B);
}
278 
/* VPHADDW: horizontal add of adjacent pairs of 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
					     (__v16hi)__Y);
}

/* VPHADDD: horizontal add of adjacent pairs of 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
}

/* VPHADDSW: horizontal add of adjacent 16-bit pairs with signed
   saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
					      (__v16hi)__Y);
}

/* VPHSUBW: horizontal subtract of adjacent pairs of 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
					     (__v16hi)__Y);
}

/* VPHSUBD: horizontal subtract of adjacent pairs of 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
}

/* VPHSUBSW: horizontal subtract of adjacent 16-bit pairs with signed
   saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
					      (__v16hi)__Y);
}

/* VPMADDUBSW: multiply unsigned bytes of __X by signed bytes of __Y,
   then add adjacent 16-bit products with signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
						(__v32qi)__Y);
}

/* VPMADDWD: multiply packed signed 16-bit elements and add adjacent
   32-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_madd_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
					     (__v16hi)__B);
}
340 
/* VPMAXSB: per-element maximum of packed signed bytes.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
}

/* VPMAXSW: per-element maximum of packed signed 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
}

/* VPMAXSD: per-element maximum of packed signed 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
}

/* VPMAXUB: per-element maximum of packed unsigned bytes.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
}

/* VPMAXUW: per-element maximum of packed unsigned 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
}

/* VPMAXUD: per-element maximum of packed unsigned 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
}

/* VPMINSB: per-element minimum of packed signed bytes.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
}

/* VPMINSW: per-element minimum of packed signed 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
}

/* VPMINSD: per-element minimum of packed signed 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
}

/* VPMINUB: per-element minimum of packed unsigned bytes.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
}

/* VPMINUW: per-element minimum of packed unsigned 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
}

/* VPMINUD: per-element minimum of packed unsigned 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
}
424 
425 extern __inline int
426 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
427 _mm256_movemask_epi8 (__m256i __A)
428 {
429  return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
430 }
431 
/* VPMOVSXBW: sign-extend 16 bytes of __X to 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
}

/* VPMOVSXBD: sign-extend the low 8 bytes of __X to 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
}

/* VPMOVSXBQ: sign-extend the low 4 bytes of __X to 64-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
}

/* VPMOVSXWD: sign-extend 8 16-bit elements to 32 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
}

/* VPMOVSXWQ: sign-extend the low 4 16-bit elements to 64 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
}

/* VPMOVSXDQ: sign-extend 4 32-bit elements to 64 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
}

/* VPMOVZXBW: zero-extend 16 bytes of __X to 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
}

/* VPMOVZXBD: zero-extend the low 8 bytes of __X to 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
}

/* VPMOVZXBQ: zero-extend the low 4 bytes of __X to 64-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
}

/* VPMOVZXWD: zero-extend 8 16-bit elements to 32 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
}

/* VPMOVZXWQ: zero-extend the low 4 16-bit elements to 64 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
}

/* VPMOVZXDQ: zero-extend 4 32-bit elements to 64 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
}
515 
/* VPMULDQ: multiply the even-indexed signed 32-bit elements,
   producing four 64-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
}

/* VPMULHRSW: fixed-point multiply of signed 16-bit elements with
   round-to-nearest scaling.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
					       (__v16hi)__Y);
}

/* VPMULHUW: high 16 bits of the unsigned 16-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
}

/* VPMULHW: high 16 bits of the signed 16-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
}

/* VPMULLW: low 16 bits of the 16-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
}

/* VPMULLD: low 32 bits of the 32-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
}

/* VPMULUDQ: multiply the even-indexed unsigned 32-bit elements,
   producing four 64-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
}
565 
/* VPOR: bitwise OR of the full 256-bit values.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_si256 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B);
}

/* VPSADBW: sums of absolute differences of unsigned bytes, one 64-bit
   sum per group of eight bytes.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sad_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
}

/* VPSHUFB: per-128-bit-lane byte shuffle of __X controlled by the
   byte indices in __Y.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
					     (__v32qi)__Y);
}
587 
#ifdef __OPTIMIZE__
/* VPSHUFD: shuffle 32-bit elements within each 128-bit lane by the
   immediate __mask (must be a compile-time constant).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi32 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
}

/* VPSHUFHW: shuffle the upper four 16-bit elements of each lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
}

/* VPSHUFLW: shuffle the lower four 16-bit elements of each lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
}
#else
/* Macro forms keep the shuffle mask an immediate when not optimizing.  */
#define _mm256_shuffle_epi32(A, N)					\
  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
#define _mm256_shufflehi_epi16(A, N)					\
  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
#define _mm256_shufflelo_epi16(A, N)					\
  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
#endif
617 
/* VPSIGNB: negate, zero, or keep each byte of __X according to the
   sign of the corresponding byte of __Y.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
}

/* VPSIGNW: as above for 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
}

/* VPSIGND: as above for 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
}
638 
#ifdef __OPTIMIZE__
/* VPSLLDQ: shift each 128-bit lane left by __N bytes.  The builtin
   takes a bit count, hence the * 8.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bslli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}

/* Alternative name for the same byte-wise left shift.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}
#else
/* Macro forms keep the shift count an immediate when not optimizing.  */
#define _mm256_bslli_epi128(A, N)					\
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_slli_si256(A, N)						\
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#endif
659 
/* VPSLLW (immediate count): logical left shift of 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
}

/* VPSLLW (count in low 64 bits of __B).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
}

/* VPSLLD (immediate count): logical left shift of 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
}

/* VPSLLD (count in low 64 bits of __B).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
}

/* VPSLLQ (immediate count): logical left shift of 64-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
}

/* VPSLLQ (count in low 64 bits of __B).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
}

/* VPSRAW (immediate count): arithmetic right shift of 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
}

/* VPSRAW (count in low 64 bits of __B).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
}

/* VPSRAD (immediate count): arithmetic right shift of 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
}

/* VPSRAD (count in low 64 bits of __B).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
}
729 
#ifdef __OPTIMIZE__
/* VPSRLDQ: shift each 128-bit lane right by __N bytes.  The builtin
   takes a bit count, hence the * 8.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bsrli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}

/* Alternative name for the same byte-wise right shift.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}
#else
/* Macro forms keep the shift count an immediate when not optimizing.  */
#define _mm256_bsrli_epi128(A, N)					\
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_srli_si256(A, N)						\
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#endif
750 
/* VPSRLW (immediate count): logical right shift of 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
}

/* VPSRLW (count in low 64 bits of __B).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
}

/* VPSRLD (immediate count): logical right shift of 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
}

/* VPSRLD (count in low 64 bits of __B).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
}

/* VPSRLQ (immediate count): logical right shift of 64-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
}

/* VPSRLQ (count in low 64 bits of __B).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
}
792 
/* VPSUBB: wrapping subtraction of 32 packed 8-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
}

/* VPSUBW: wrapping subtraction of 16 packed 16-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
}

/* VPSUBD: wrapping subtraction of 8 packed 32-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
}

/* VPSUBQ: wrapping subtraction of 4 packed 64-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
}

/* VPSUBSB: subtraction of packed signed bytes with signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
}

/* VPSUBSW: subtraction of packed signed 16-bit elements with signed
   saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
}

/* VPSUBUSB: subtraction of packed unsigned bytes with unsigned
   saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
}

/* VPSUBUSW: subtraction of packed unsigned 16-bit elements with
   unsigned saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
}
848 
/* VPUNPCKHBW: interleave the high bytes of each 128-bit lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
}

/* VPUNPCKHWD: interleave the high 16-bit elements of each lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
}

/* VPUNPCKHDQ: interleave the high 32-bit elements of each lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
}

/* VPUNPCKHQDQ: interleave the high 64-bit elements of each lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
}

/* VPUNPCKLBW: interleave the low bytes of each 128-bit lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
}

/* VPUNPCKLWD: interleave the low 16-bit elements of each lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
}

/* VPUNPCKLDQ: interleave the low 32-bit elements of each lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
}

/* VPUNPCKLQDQ: interleave the low 64-bit elements of each lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
}
904 
905 extern __inline __m256i
906 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
907 _mm256_xor_si256 (__m256i __A, __m256i __B)
908 {
909  return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
910 }
911 
/* VMOVNTDQA: non-temporal aligned 256-bit load from *__X.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_load_si256 (__m256i const *__X)
{
  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
}

/* VBROADCASTSS (128-bit): replicate the low float of __X to all four
   lanes of the result.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastss_ps (__m128 __X)
{
  return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
}

/* VBROADCASTSS (256-bit): replicate the low float of __X to all eight
   lanes of the result.  */
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastss_ps (__m128 __X)
{
  return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
}

/* VBROADCASTSD: replicate the low double of __X to all four lanes.  */
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsd_pd (__m128d __X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
}

/* VBROADCASTI128: replicate the 128-bit value __X into both halves.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsi128_si256 (__m128i __X)
{
  return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
}
946 
/* VPBLENDD: per-element 32-bit blend.  Bit i of the immediate __M
   selects element i from __Y; a clear bit selects it from __X.  The
   immediate must be a compile-time constant, hence the macro fallback
   when not optimizing (the inline form relies on the optimizer to
   propagate the constant into the builtin).  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
					      (__v4si)__Y,
					      __M);
}
#else
#define _mm_blend_epi32(X, Y, M)					\
  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),		\
					(__v4si)(__m128i)(Y), (int)(M)))
#endif

/* 256-bit VPBLENDD: same selection rule, eight 32-bit elements.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
					      (__v8si)__Y,
					      __M);
}
#else
#define _mm256_blend_epi32(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),		\
					(__v8si)(__m256i)(Y), (int)(M)))
#endif
976 
/* VPBROADCASTB/W/D/Q: replicate the lowest element of __X across all
   elements of the result.  The 256-bit forms take a 128-bit source;
   the 128-bit forms broadcast within a 128-bit result.  */

/* Broadcast the low byte of __X to all 32 bytes.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastb_epi8 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
}

/* Broadcast the low 16-bit word of __X to all 16 words.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastw_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
}

/* Broadcast the low 32-bit dword of __X to all 8 dwords.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastd_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
}

/* Broadcast the low 64-bit qword of __X to all 4 qwords.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastq_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
}

/* Broadcast the low byte of __X to all 16 bytes.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastb_epi8 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
}

/* Broadcast the low word of __X to all 8 words.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastw_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
}

/* Broadcast the low dword of __X to all 4 dwords.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastd_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
}

/* Broadcast the low qword of __X to both qwords.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastq_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
}
1032 
/* VPERMD: permute the eight 32-bit elements of __X; element i of the
   result is __X[__Y[i] & 7].  Crosses the 128-bit lane boundary.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
}

/* VPERMPD: permute the four doubles of __X according to the 2-bit
   fields of the immediate __M.  __M must be a compile-time constant,
   hence the macro fallback when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_pd (__m256d __X, const int __M)
{
  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
}
#else
#define _mm256_permute4x64_pd(X, M)					\
  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
#endif

/* VPERMPS: permute the eight floats of __X by the indices in __Y,
   like _mm256_permutevar8x32_epi32 but for single precision.  */
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
{
  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
}
1058 
/* VPERMQ: permute the four 64-bit elements of __X according to the
   2-bit fields of the immediate __M (compile-time constant; macro
   fallback when not optimizing).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_epi64 (__m256i __X, const int __M)
{
  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
}
#else
#define _mm256_permute4x64_epi64(X, M)					\
  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
#endif


/* VPERM2I128: select each 128-bit half of the result from the halves
   of __X and __Y (or zero it) according to the immediate __M.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
}
#else
#define _mm256_permute2x128_si256(X, Y, M)				\
  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
#endif
1083 
/* VEXTRACTI128: extract the low (__M == 0) or high (__M == 1) 128-bit
   half of __X.  __M must be a compile-time constant.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
}
#else
#define _mm256_extracti128_si256(X, M)					\
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

/* VINSERTI128: return __X with its low (__M == 0) or high (__M == 1)
   128-bit half replaced by __Y.  __M must be a compile-time constant.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M)				\
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X),	\
					   (__v2di)(__m128i)(Y),	\
					   (int)(M)))
#endif
1109 
/* VPMASKMOVD/VPMASKMOVQ loads: for each element whose mask element in
   __M has its most-significant bit set, load the corresponding element
   from __X; other result elements are zeroed.  Masked-off elements are
   not read from memory, so they may touch unmapped addresses safely.  */

/* Masked load of eight 32-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi32 (int const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
						(__v8si)__M);
}

/* Masked load of four 64-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi64 (long long const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
						(__v4di)__M);
}

/* Masked load of four 32-bit integers.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi32 (int const *__X, __m128i __M )
{
  return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
					     (__v4si)__M);
}

/* Masked load of two 64-bit integers.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi64 (long long const *__X, __m128i __M )
{
  return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
					     (__v2di)__M);
}
1141 
/* VPMASKMOVD/VPMASKMOVQ stores: for each element whose mask element in
   __M has its most-significant bit set, store the corresponding element
   of __Y to __X; masked-off elements leave memory untouched.  */

/* Masked store of eight 32-bit integers.  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
{
  __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
}

/* Masked store of four 64-bit integers.  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
{
  __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
}

/* Masked store of four 32-bit integers.  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
{
  __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
}

/* Masked store of two 64-bit integers.  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
{
  __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
}
1169 
/* Per-element variable shifts (VPSLLV*, VPSRAV*, VPSRLV*): each
   element of __X is shifted by the count in the corresponding element
   of __Y, unlike the scalar-count psll/psrl forms which use one count
   for all elements.  */

/* VPSLLVD (256-bit): left-shift each 32-bit element of __X by __Y.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
}

/* VPSLLVD (128-bit).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
}

/* VPSLLVQ (256-bit): left-shift each 64-bit element of __X by __Y.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
}

/* VPSLLVQ (128-bit).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
}

/* VPSRAVD (256-bit): arithmetic right-shift of 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srav_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
}

/* VPSRAVD (128-bit).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srav_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
}

/* VPSRLVD (256-bit): logical right-shift of 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
}

/* VPSRLVD (128-bit).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
}

/* VPSRLVQ (256-bit): logical right-shift of 64-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
}

/* VPSRLVQ (128-bit).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
}
1239 
/* Gather intrinsics (inline forms; the builtins need a compile-time
   constant scale, so macro fallbacks follow in the !__OPTIMIZE__
   branch).  The no-mask variants synthesize an "all elements" mask by
   comparing a zero vector with itself: 0.0 == 0.0 is true, producing
   all-1-bits per element; the gather instructions test only the sign
   bit of each mask element.  */

/* VGATHERDPD: gather two doubles from base + index[i] * scale, 32-bit
   indices in the low half of index.  */
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_pd (double const *base, __m128i index, const int scale)
{
  __v2df src = _mm_setzero_pd ();
  /* src == src is always true, giving an all-ones (gather-all) mask.  */
  __v2df mask = _mm_cmpeq_pd (src, src);

  return (__m128d) __builtin_ia32_gathersiv2df (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

/* Masked VGATHERDPD: elements whose mask sign bit is clear are taken
   from src instead of memory.  */
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
		       __m128d mask, const int scale)
{
  return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
						base,
						(__v4si)index,
						(__v2df)mask,
						scale);
}

/* 256-bit VGATHERDPD: gather four doubles using four 32-bit indices.  */
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_pd (double const *base, __m128i index, const int scale)
{
  __v4df src = _mm256_setzero_pd ();
  /* All-ones mask: every element is gathered.  */
  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gathersiv4df (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

/* Masked 256-bit VGATHERDPD.  */
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_pd (__m256d src, double const *base,
			  __m128i index, __m256d mask, const int scale)
{
  return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
						base,
						(__v4si)index,
						(__v4df)mask,
						scale);
}

/* VGATHERQPD: gather two doubles using two 64-bit indices.  */
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_pd (double const *base, __m128i index, const int scale)
{
  __v2df src = _mm_setzero_pd ();
  __v2df mask = _mm_cmpeq_pd (src, src);

  return (__m128d) __builtin_ia32_gatherdiv2df (src,
						base,
						(__v2di)index,
						mask,
						scale);
}

/* Masked VGATHERQPD.  */
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
		       __m128d mask, const int scale)
{
  return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
						base,
						(__v2di)index,
						(__v2df)mask,
						scale);
}

/* 256-bit VGATHERQPD: gather four doubles using four 64-bit indices.  */
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_pd (double const *base, __m256i index, const int scale)
{
  __v4df src = _mm256_setzero_pd ();
  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gatherdiv4df (src,
						base,
						(__v4di)index,
						mask,
						scale);
}

/* Masked 256-bit VGATHERQPD.  */
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_pd (__m256d src, double const *base,
			  __m256i index, __m256d mask, const int scale)
{
  return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
						base,
						(__v4di)index,
						(__v4df)mask,
						scale);
}
1344 
/* Single-precision gathers.  As with the pd forms, the no-mask
   variants build an all-ones mask from src == src; only the sign bit
   of each mask element is consulted by the instruction.  */

/* VGATHERDPS: gather four floats using four 32-bit indices.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_ps (float const *base, __m128i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gathersiv4sf (src,
					       base,
					       (__v4si)index,
					       mask,
					       scale);
}

/* Masked VGATHERDPS: mask-clear elements come from src.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
		       __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
					       base,
					       (__v4si)index,
					       (__v4sf)mask,
					       scale);
}

/* 256-bit VGATHERDPS: gather eight floats using eight 32-bit indices.  */
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_ps (float const *base, __m256i index, const int scale)
{
  __v8sf src = _mm256_setzero_ps ();
  __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);

  return (__m256) __builtin_ia32_gathersiv8sf (src,
					       base,
					       (__v8si)index,
					       mask,
					       scale);
}

/* Masked 256-bit VGATHERDPS.  */
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_ps (__m256 src, float const *base,
			  __m256i index, __m256 mask, const int scale)
{
  return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
					       base,
					       (__v8si)index,
					       (__v8sf)mask,
					       scale);
}

/* VGATHERQPS: gather two floats using two 64-bit indices (upper two
   result elements are zero).  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_ps (float const *base, __m128i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gatherdiv4sf (src,
					       base,
					       (__v2di)index,
					       mask,
					       scale);
}

/* Masked VGATHERQPS.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
		       __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
					       base,
					       (__v2di)index,
					       (__v4sf)mask,
					       scale);
}

/* 256-bit-index VGATHERQPS: four 64-bit indices gather four floats,
   so the result is a 128-bit vector despite the 256-bit index.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_ps (float const *base, __m256i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
						  base,
						  (__v4di)index,
						  mask,
						  scale);
}

/* Masked 256-bit-index VGATHERQPS.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_ps (__m128 src, float const *base,
			  __m256i index, __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
						  base,
						  (__v4di)index,
						  (__v4sf)mask,
						  scale);
}
1448 
/* 64-bit integer gathers.  The no-mask variants build the zero src
   and the all-ones (gather-all) mask directly as vector constants.  */

/* VPGATHERDQ: gather two qwords using two 32-bit indices.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi64 (long long int const *base,
		     __m128i index, const int scale)
{
  __v2di src = __extension__ (__v2di){ 0, 0 };
  __v2di mask = __extension__ (__v2di){ ~0, ~0 };

  return (__m128i) __builtin_ia32_gathersiv2di (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

/* Masked VPGATHERDQ: mask-clear elements come from src.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
			  __m128i index, __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
						base,
						(__v4si)index,
						(__v2di)mask,
						scale);
}

/* 256-bit VPGATHERDQ: gather four qwords using four 32-bit indices.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi64 (long long int const *base,
			__m128i index, const int scale)
{
  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gathersiv4di (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

/* Masked 256-bit VPGATHERDQ.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
			     __m128i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
						base,
						(__v4si)index,
						(__v4di)mask,
						scale);
}

/* VPGATHERQQ: gather two qwords using two 64-bit indices.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi64 (long long int const *base,
		     __m128i index, const int scale)
{
  __v2di src = __extension__ (__v2di){ 0, 0 };
  __v2di mask = __extension__ (__v2di){ ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv2di (src,
						base,
						(__v2di)index,
						mask,
						scale);
}

/* Masked VPGATHERQQ.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
						base,
						(__v2di)index,
						(__v2di)mask,
						scale);
}

/* 256-bit VPGATHERQQ: gather four qwords using four 64-bit indices.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi64 (long long int const *base,
			__m256i index, const int scale)
{
  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gatherdiv4di (src,
						base,
						(__v4di)index,
						mask,
						scale);
}

/* Masked 256-bit VPGATHERQQ.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
			     __m256i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
						base,
						(__v4di)index,
						(__v4di)mask,
						scale);
}
1556 
/* 32-bit integer gathers.  */

/* VPGATHERDD: gather four dwords using four 32-bit indices.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gathersiv4si (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

/* Masked VPGATHERDD: mask-clear elements come from src.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
						base,
						(__v4si)index,
						(__v4si)mask,
						scale);
}

/* 256-bit VPGATHERDD: gather eight dwords using eight 32-bit indices.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi32 (int const *base, __m256i index, const int scale)
{
  __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
  __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gathersiv8si (src,
						base,
						(__v8si)index,
						mask,
						scale);
}

/* Masked 256-bit VPGATHERDD.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi32 (__m256i src, int const *base,
			     __m256i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src,
						base,
						(__v8si)index,
						(__v8si)mask,
						scale);
}

/* VPGATHERQD: gather two dwords using two 64-bit indices.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi32 (int const *base, __m128i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv4si (src,
						base,
						(__v2di)index,
						mask,
						scale);
}

/* Masked VPGATHERQD.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src,
						base,
						(__v2di)index,
						(__v4si)mask,
						scale);
}

/* 256-bit-index VPGATHERQD: four 64-bit indices gather four dwords,
   so the result is a 128-bit vector.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi32 (int const *base, __m256i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv4si256 (src,
						   base,
						   (__v4di)index,
						   mask,
						   scale);
}

/* Masked 256-bit-index VPGATHERQD.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi32 (__m128i src, int const *base,
			     __m256i index, __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src,
						   base,
						   (__v4di)index,
						   (__v4si)mask,
						   scale);
}
#else /* __OPTIMIZE__ */
/* Macro forms of the double-precision gathers, used when not
   optimizing so that SCALE reaches the builtin as a literal constant.
   The "gather everything" mask is set1(-1.0); only the sign bit of
   each mask element matters to the instruction, and -1.0 has it set.  */

/* VGATHERDPD, no mask.  */
#define _mm_i32gather_pd(BASE, INDEX, SCALE)				\
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v2df)_mm_set1_pd(		\
					   (double)(long long int) -1),	\
					 (int)SCALE)

/* VGATHERDPD with explicit mask; mask-clear elements come from SRC.  */
#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)		\
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC,		\
					 (double const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v2df)(__m128d)MASK,		\
					 (int)SCALE)

/* 256-bit VGATHERDPD, no mask.  */
#define _mm256_i32gather_pd(BASE, INDEX, SCALE)				\
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v4df)_mm256_set1_pd(	\
					   (double)(long long int) -1),	\
					 (int)SCALE)

/* 256-bit VGATHERDPD with mask.  */
#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)		\
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC,		\
					 (double const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v4df)(__m256d)MASK,		\
					 (int)SCALE)

/* VGATHERQPD, no mask.  */
#define _mm_i64gather_pd(BASE, INDEX, SCALE)				\
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v2di)(__m128i)INDEX,	\
					 (__v2df)_mm_set1_pd(		\
					   (double)(long long int) -1),	\
					 (int)SCALE)

/* VGATHERQPD with mask.  */
#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)		\
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC,		\
					 (double const *)BASE,		\
					 (__v2di)(__m128i)INDEX,	\
					 (__v2df)(__m128d)MASK,		\
					 (int)SCALE)

/* 256-bit VGATHERQPD, no mask.  */
#define _mm256_i64gather_pd(BASE, INDEX, SCALE)				\
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v4di)(__m256i)INDEX,	\
					 (__v4df)_mm256_set1_pd(	\
					   (double)(long long int) -1),	\
					 (int)SCALE)

/* 256-bit VGATHERQPD with mask.  */
#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)		\
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC,		\
					 (double const *)BASE,		\
					 (__v4di)(__m256i)INDEX,	\
					 (__v4df)(__m256d)MASK,		\
					 (int)SCALE)
1720 
/* Macro forms of the single-precision gathers (non-optimized path).
   The all-elements mask is set1(-1.0f): the instructions test only the
   sign bit of each mask element.  Fixes relative to the original: the
   SRC/MASK arguments are reinterpreted through the matching __m128 /
   __m256 float types (not the double types __m128d / __m256d, which
   happened to be bit-identical but were type-incorrect), and
   _mm_i64gather_ps zeroes its source with _mm_setzero_ps () instead of
   _mm_setzero_pd ().  Behavior is unchanged; this matches the
   corrections later applied upstream in GCC.  */

/* VGATHERDPS, no mask.  */
#define _mm_i32gather_ps(BASE, INDEX, SCALE)				\
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (),	\
					(float const *)BASE,		\
					(__v4si)(__m128i)INDEX,		\
					(__v4sf)_mm_set1_ps (		\
					  (float)(int) -1),		\
					(int)SCALE)

/* VGATHERDPS with explicit mask; mask-clear elements come from SRC.  */
#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)		\
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128)SRC,		\
					(float const *)BASE,		\
					(__v4si)(__m128i)INDEX,		\
					(__v4sf)(__m128)MASK,		\
					(int)SCALE)

/* 256-bit VGATHERDPS, no mask.  */
#define _mm256_i32gather_ps(BASE, INDEX, SCALE)				\
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (),	\
					(float const *)BASE,		\
					(__v8si)(__m256i)INDEX,		\
					(__v8sf)_mm256_set1_ps (	\
					  (float)(int) -1),		\
					(int)SCALE)

/* 256-bit VGATHERDPS with mask.  */
#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)		\
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC,		\
					(float const *)BASE,		\
					(__v8si)(__m256i)INDEX,		\
					(__v8sf)(__m256)MASK,		\
					(int)SCALE)

/* VGATHERQPS, no mask (two 64-bit indices gather two floats).  */
#define _mm_i64gather_ps(BASE, INDEX, SCALE)				\
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (),	\
					(float const *)BASE,		\
					(__v2di)(__m128i)INDEX,		\
					(__v4sf)_mm_set1_ps (		\
					  (float)(int) -1),		\
					(int)SCALE)

/* VGATHERQPS with mask.  */
#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)		\
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC,		\
					(float const *)BASE,		\
					(__v2di)(__m128i)INDEX,		\
					(__v4sf)(__m128)MASK,		\
					(int)SCALE)

/* 256-bit-index VGATHERQPS, no mask: four 64-bit indices gather four
   floats into a 128-bit result.  */
#define _mm256_i64gather_ps(BASE, INDEX, SCALE)				\
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (),	\
					   (float const *)BASE,		\
					   (__v4di)(__m256i)INDEX,	\
					   (__v4sf)_mm_set1_ps(		\
					     (float)(int) -1),		\
					   (int)SCALE)

/* 256-bit-index VGATHERQPS with mask.  */
#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)		\
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC,		\
					   (float const *)BASE,		\
					   (__v4di)(__m256i)INDEX,	\
					   (__v4sf)(__m128)MASK,	\
					   (int)SCALE)
1779 
/* Macro forms of the 64-bit integer gathers (non-optimized path).
   set1_epi64x (-1) yields the literal all-ones gather-all mask.  */

/* VPGATHERDQ, no mask.  */
#define _mm_i32gather_epi64(BASE, INDEX, SCALE)				\
  (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (),	\
					 (long long const *)BASE,	\
					 (__v4si)(__m128i)INDEX,	\
					 (__v2di)_mm_set1_epi64x (-1),	\
					 (int)SCALE)

/* VPGATHERDQ with explicit mask; mask-clear elements come from SRC.  */
#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)		\
  (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC,		\
					 (long long const *)BASE,	\
					 (__v4si)(__m128i)INDEX,	\
					 (__v2di)(__m128i)MASK,		\
					 (int)SCALE)

/* 256-bit VPGATHERDQ, no mask.  */
#define _mm256_i32gather_epi64(BASE, INDEX, SCALE)			   \
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
					 (long long const *)BASE,	   \
					 (__v4si)(__m128i)INDEX,	   \
					 (__v4di)_mm256_set1_epi64x (-1),  \
					 (int)SCALE)

/* 256-bit VPGATHERDQ with mask.  */
#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	\
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC,		\
					 (long long const *)BASE,	\
					 (__v4si)(__m128i)INDEX,	\
					 (__v4di)(__m256i)MASK,		\
					 (int)SCALE)

/* VPGATHERQQ, no mask.  */
#define _mm_i64gather_epi64(BASE, INDEX, SCALE)				\
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (),	\
					 (long long const *)BASE,	\
					 (__v2di)(__m128i)INDEX,	\
					 (__v2di)_mm_set1_epi64x (-1),	\
					 (int)SCALE)

/* VPGATHERQQ with mask.  */
#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)		\
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC,		\
					 (long long const *)BASE,	\
					 (__v2di)(__m128i)INDEX,	\
					 (__v2di)(__m128i)MASK,		\
					 (int)SCALE)

/* 256-bit VPGATHERQQ, no mask.  */
#define _mm256_i64gather_epi64(BASE, INDEX, SCALE)			   \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
					 (long long const *)BASE,	   \
					 (__v4di)(__m256i)INDEX,	   \
					 (__v4di)_mm256_set1_epi64x (-1),  \
					 (int)SCALE)

/* 256-bit VPGATHERQQ with mask.  */
#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	\
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC,		\
					 (long long const *)BASE,	\
					 (__v4di)(__m256i)INDEX,	\
					 (__v4di)(__m256i)MASK,		\
					 (int)SCALE)
1835 
/* Macro forms of the 32-bit integer gathers (non-optimized path).
   set1_epi32 (-1) yields the literal all-ones gather-all mask.  */

/* VPGATHERDD, no mask.  */
#define _mm_i32gather_epi32(BASE, INDEX, SCALE)				\
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (),	\
					 (int const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v4si)_mm_set1_epi32 (-1),	\
					 (int)SCALE)

/* VPGATHERDD with explicit mask; mask-clear elements come from SRC.  */
#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE)		\
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC,		\
					 (int const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v4si)(__m128i)MASK,		\
					 (int)SCALE)

/* 256-bit VPGATHERDD, no mask.  */
#define _mm256_i32gather_epi32(BASE, INDEX, SCALE)			   \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
					 (int const *)BASE,		   \
					 (__v8si)(__m256i)INDEX,	   \
					 (__v8si)_mm256_set1_epi32 (-1),   \
					 (int)SCALE)

/* 256-bit VPGATHERDD with mask.  */
#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE)	\
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC,		\
					 (int const *)BASE,		\
					 (__v8si)(__m256i)INDEX,	\
					 (__v8si)(__m256i)MASK,		\
					 (int)SCALE)

/* VPGATHERQD, no mask (two 64-bit indices gather two dwords).  */
#define _mm_i64gather_epi32(BASE, INDEX, SCALE)				\
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (),	\
					 (int const *)BASE,		\
					 (__v2di)(__m128i)INDEX,	\
					 (__v4si)_mm_set1_epi32 (-1),	\
					 (int)SCALE)

/* VPGATHERQD with mask.  */
#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE)		\
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC,		\
					 (int const *)BASE,		\
					 (__v2di)(__m128i)INDEX,	\
					 (__v4si)(__m128i)MASK,		\
					 (int)SCALE)

/* 256-bit-index VPGATHERQD, no mask: four 64-bit indices gather four
   dwords into a 128-bit result.  */
#define _mm256_i64gather_epi32(BASE, INDEX, SCALE)			     \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
					    (int const *)BASE,		     \
					    (__v4di)(__m256i)INDEX,	     \
					    (__v4si)_mm_set1_epi32(-1),	     \
					    (int)SCALE)

/* 256-bit-index VPGATHERQD with mask.  */
#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE)	\
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC,	\
					    (int const *)BASE,		\
					    (__v4di)(__m256i)INDEX,	\
					    (__v4si)(__m128i)MASK,	\
					    (int)SCALE)
#endif /* __OPTIMIZE__ */
#define _mm256_inserti128_si256(X, Y, M)
Definition: avx2intrin.h:1104
#define _mm256_alignr_epi8(A, B, N)
Definition: avx2intrin.h:162
#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)
Definition: avx2intrin.h:1773
#define _mm_i32gather_pd(BASE, INDEX, SCALE)
Definition: avx2intrin.h:1661
#define _mm256_permute4x64_epi64(X, M)
Definition: avx2intrin.h:1067
#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)
Definition: avx2intrin.h:1669
#define _mm_i32gather_epi32(BASE, INDEX, SCALE)
Definition: avx2intrin.h:1836
#define _mm_i32gather_ps(BASE, INDEX, SCALE)
Definition: avx2intrin.h:1721
#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)
Definition: avx2intrin.h:1743
#define _mm_i64gather_pd(BASE, INDEX, SCALE)
Definition: avx2intrin.h:1691
#define _mm256_extracti128_si256(X, M)
Definition: avx2intrin.h:1092
__inline __m256i __m256i __B
Definition: avx2intrin.h:69
#define _mm256_i32gather_pd(BASE, INDEX, SCALE)
Definition: avx2intrin.h:1676
#define _mm_i64gather_epi32(BASE, INDEX, SCALE)
Definition: avx2intrin.h:1864
#define _mm256_bslli_epi128(A, N)
Definition: avx2intrin.h:654
#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE)
Definition: avx2intrin.h:1857
#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)
Definition: avx2intrin.h:1829
#define _mm256_i64gather_epi64(BASE, INDEX, SCALE)
Definition: avx2intrin.h:1822
#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE)
Definition: avx2intrin.h:1871
#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)
Definition: avx2intrin.h:1714
#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)
Definition: avx2intrin.h:1758
#define _mm256_blend_epi16(X, Y, M)
Definition: avx2intrin.h:215
__inline void __m256d __A
Definition: avxintrin.h:828
#define _CMP_EQ_OQ
Definition: avxintrin.h:51
#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)
Definition: avx2intrin.h:1801
#define _mm_i64gather_ps(BASE, INDEX, SCALE)
Definition: avx2intrin.h:1750
#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE)
Definition: avx2intrin.h:1843
#define _mm256_shufflelo_epi16(A, N)
Definition: avx2intrin.h:614
#define _mm256_i32gather_ps(BASE, INDEX, SCALE)
Definition: avx2intrin.h:1735
#define _mm256_permute4x64_pd(X, M)
Definition: avx2intrin.h:1048
#define _mm256_permute2x128_si256(X, Y, M)
Definition: avx2intrin.h:1080
__inline unsigned char unsigned int __X
Definition: adxintrin.h:33
#define _mm256_slli_si256(A, N)
Definition: avx2intrin.h:656
#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)
Definition: avx2intrin.h:1699
__inline __m256i __m256i __Y
Definition: avx2intrin.h:198
#define _mm256_i64gather_ps(BASE, INDEX, SCALE)
Definition: avx2intrin.h:1765
#define _mm256_i64gather_epi32(BASE, INDEX, SCALE)
Definition: avx2intrin.h:1878
#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)
Definition: avx2intrin.h:1684
#define _mm256_blend_epi32(X, Y, M)
Definition: avx2intrin.h:972
#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE)
Definition: avx2intrin.h:1885
#define _mm256_shufflehi_epi16(A, N)
Definition: avx2intrin.h:612
#define _mm256_srli_si256(A, N)
Definition: avx2intrin.h:747
#define _mm_blend_epi32(X, Y, M)
Definition: avx2intrin.h:957
#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)
Definition: avx2intrin.h:1815
#define _mm_i64gather_epi64(BASE, INDEX, SCALE)
Definition: avx2intrin.h:1808
#define _mm256_cmp_ps(X, Y, P)
Definition: avxintrin.h:409
__inline __m256i __m256i __m256i __M
Definition: avx2intrin.h:199
#define _mm256_bsrli_epi128(A, N)
Definition: avx2intrin.h:745
__inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_abs_epi8(__m256i __A)
Definition: avx2intrin.h:46
#define _mm256_i64gather_pd(BASE, INDEX, SCALE)
Definition: avx2intrin.h:1706
#define _mm256_mpsadbw_epu8(X, Y, M)
Definition: avx2intrin.h:40
#define _mm256_cmp_pd(X, Y, P)
Definition: avxintrin.h:405
#define _mm256_i32gather_epi32(BASE, INDEX, SCALE)
Definition: avx2intrin.h:1850
#define _mm_i32gather_epi64(BASE, INDEX, SCALE)
Definition: avx2intrin.h:1780
#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)
Definition: avx2intrin.h:1787
#define _mm256_shuffle_epi32(A, N)
Definition: avx2intrin.h:610
#define _mm256_i32gather_epi64(BASE, INDEX, SCALE)
Definition: avx2intrin.h:1794
#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)
Definition: avx2intrin.h:1728