avxintrin.h
/* Copyright (C) 2008-2013 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif
/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
                                     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
                                          __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
                                       __may_alias__));
/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ 0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS 0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS 0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q 0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ 0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US 0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US 0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q 0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ 0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US 0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US 0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ 0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ 0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS 0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS 0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ 0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS 0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ 0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ 0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S 0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US 0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ 0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ 0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S 0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US 0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ 0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ 0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS 0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS 0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ 0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ 0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US 0x1f
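/* Usage sketch (editorial addition, not part of the GCC header): the
   predicate macros above are passed as the immediate argument of the
   _mm_cmp_* / _mm256_cmp_* intrinsics defined later in this file.
   Assuming a translation unit compiled with -mavx:

     #include <immintrin.h>

     __m256d a  = _mm256_set1_pd (1.0);
     __m256d b  = _mm256_set1_pd (2.0);
     __m256d lt = _mm256_cmp_pd (a, b, _CMP_LT_OQ);  // all-ones lanes
     int mask   = _mm256_movemask_pd (lt);           // 0xf: every lane true

   "Ordered" predicates are false when either operand is NaN; the
   "signaling" variants additionally raise the invalid exception on
   quiet NaN operands.  */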
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}
/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
                                              (__v4df)__Y,
                                              __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
                                             (__v8sf)__Y,
                                             __M);
}
#else
#define _mm256_blend_pd(X, Y, M) \
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
                                        (__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M) \
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
                                       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
                                               (__v4df)__Y,
                                               (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
                                              (__v8sf)__Y,
                                              (__v8sf)__M);
}
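/* Usage sketch (editorial addition): _mm256_blend_pd selects with a
   compile-time immediate (bit i set picks element i from Y), while
   _mm256_blendv_pd selects per element from the sign bit of a runtime
   mask vector:

     __m256d x  = _mm256_set1_pd (1.0);
     __m256d y  = _mm256_set1_pd (2.0);
     __m256d r1 = _mm256_blend_pd (x, y, 0x5);        // { 2, 1, 2, 1 }
     __m256d m  = _mm256_cmp_pd (x, y, _CMP_LT_OQ);
     __m256d r2 = _mm256_blendv_pd (x, y, m);         // y where x < y
*/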
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
                                          (__v8sf)__Y,
                                          __M);
}
#else
#define _mm256_dp_ps(X, Y, M) \
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
                                    (__v8sf)(__m256)(Y), (int)(M)))
#endif
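/* Usage sketch (editorial addition): in the 8-bit mask of _mm256_dp_ps,
   the high nibble selects which elements of each 128-bit lane enter the
   product and the low nibble selects which result slots receive the sum;
   the operation never crosses the 128-bit lane boundary.

     __m256 a = _mm256_set1_ps (1.0f);
     __m256 b = _mm256_set1_ps (2.0f);
     // Per lane: sum of four products (8.0f), broadcast to all 4 slots.
     __m256 d = _mm256_dp_ps (a, b, 0xff);
*/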
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
                                             __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
                                            __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N) \
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
                                      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N) \
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
                                      (__v8sf)(__m256)(B), (int)(N)))
#endif
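/* Usage sketch (editorial addition): unlike the 128-bit shuffles, the
   256-bit forms apply the control to each 128-bit lane separately and
   never move data across lanes.  For _mm256_shuffle_pd, bits 0-1 of the
   immediate control the low lane and bits 2-3 the high lane:

     __m256d a = _mm256_set_pd (3.0, 2.0, 1.0, 0.0);   // elements 0..3
     __m256d b = _mm256_set_pd (7.0, 6.0, 5.0, 4.0);
     __m256d r = _mm256_shuffle_pd (a, b, 0x0);        // { 0, 4, 2, 6 }
*/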
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
                                            __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
                                           __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P) \
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P) \
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
                                      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P) \
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
                                     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P) \
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (int)(P)))
#endif
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}
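/* Usage sketch (editorial addition): conversions that change element
   width also change vector width, so four doubles narrow to an __m128
   and widen back from one.  The "tt" variants truncate toward zero;
   the plain variants round under the current rounding mode:

     __m256d d  = _mm256_set1_pd (1.7);
     __m128  f  = _mm256_cvtpd_ps (d);        // four floats
     __m128i ti = _mm256_cvttpd_epi32 (d);    // { 1, 1, 1, 1 }
     __m128i ri = _mm256_cvtpd_epi32 (d);     // { 2, 2, 2, 2 } by default
*/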
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N) \
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
                                                (int)(N)))

#define _mm256_extractf128_ps(X, N) \
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \
                                               (int)(N)))

#define _mm256_extractf128_si256(X, N) \
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \
                                                (int)(N)))

#define _mm256_extract_epi32(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      _mm_extract_epi32 (__Y, (N) % 4); \
    }))

#define _mm256_extract_epi16(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      _mm_extract_epi16 (__Y, (N) % 8); \
    }))

#define _mm256_extract_epi8(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      _mm_extract_epi8 (__Y, (N) % 16); \
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      _mm_extract_epi64 (__Y, (N) % 2); \
    }))
#endif
#endif
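/* Usage sketch (editorial addition): the element extracts above are
   composed from a 128-bit lane extract followed by the SSE4.1 element
   extract, so the index is split into a lane number and an index
   within the lane:

     __m256i v  = _mm256_set_epi32 (7, 6, 5, 4, 3, 2, 1, 0);
     int     e5 = _mm256_extract_epi32 (v, 5);   // lane 5 >> 2 = 1,
                                                 // slot 5 % 4 = 1 -> 5
*/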
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}
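/* Editorial note: _mm256_zeroupper clears only the upper 128 bits of
   every YMM register; it is conventionally issued before calling code
   that may run legacy (non-VEX) SSE instructions, avoiding the costly
   AVX/SSE transition state on some microarchitectures.  A sketch:

     void compute_avx_then_call_sse (void)
     {
       // ... 256-bit AVX work ...
       _mm256_zeroupper ();
       // ... call into legacy SSE library code ...
     }
*/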
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
                                                (__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
                                                   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
                                               (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
                                                  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C) \
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C) \
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C) \
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C) \
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
                                                    (__v4df)__Y,
                                                    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
                                                   (__v8sf)__Y,
                                                   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
                                                    (__v8si)__Y,
                                                    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C) \
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
                                              (__v4df)(__m256d)(Y), \
                                              (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C) \
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
                                             (__v8sf)(__m256)(Y), \
                                             (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C) \
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
                                              (__v8si)(__m256i)(Y), \
                                              (int)(C)))
#endif
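/* Usage sketch (editorial addition): _mm256_permute2f128_* is the
   lane-crossing counterpart of the in-lane permutes.  Each nibble of
   the control byte picks a source 128-bit lane (0-1 from X, 2-3 from
   Y); setting bit 3 of a nibble zeroes that output lane instead:

     __m256d x = _mm256_set_pd (3.0, 2.0, 1.0, 0.0);
     __m256d s = _mm256_permute2f128_pd (x, x, 0x01);  // swap halves
     __m256d z = _mm256_permute2f128_pd (x, x, 0x81);  // swap, zero high
*/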
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
                                                     (__v2df)__Y,
                                                     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
                                                    (__v4sf)__Y,
                                                    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
                                                     (__v4si)__Y,
                                                     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O) \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
                                               (__v2df)(__m128d)(Y), \
                                               (int)(O)))

#define _mm256_insertf128_ps(X, Y, O) \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
                                              (__v4sf)(__m128)(Y), \
                                              (int)(O)))

#define _mm256_insertf128_si256(X, Y, O) \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
                                               (__v4si)(__m128i)(Y), \
                                               (int)(O)))

#define _mm256_insert_epi32(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
    }))

#define _mm256_insert_epi16(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
    }))

#define _mm256_insert_epi8(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
    }))
#endif
#endif
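/* Usage sketch (editorial addition): a common idiom builds a 256-bit
   vector from two 128-bit halves with a cast (low half, defined later
   in this file) followed by a lane insert (high half):

     __m128 lo = _mm_set1_ps (1.0f);
     __m128 hi = _mm_set1_ps (2.0f);
     __m256 v  = _mm256_insertf128_ps (_mm256_castps128_ps256 (lo), hi, 1);
*/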
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}
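/* Usage sketch (editorial addition): the plain load/store forms require
   32-byte alignment (misaligned addresses fault), while the *u forms
   accept any alignment at some potential cost:

     double buf[8];
     __m256d v = _mm256_loadu_pd (buf + 1);        // unaligned: fine
     double aligned[4] __attribute__ ((aligned (32)));
     _mm256_store_pd (aligned, v);                 // must be 32-byte aligned
*/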
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
                                              (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
                                                 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
                                             (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
                                                (__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}
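/* Usage sketch (editorial addition): the mask intrinsics use the sign
   bit of each element of the integer mask; masked-out load elements
   read as zero and masked-out store elements leave memory untouched,
   which makes them handy for loop remainders:

     double src[3] = { 1.0, 2.0, 3.0 };
     __m256i m = _mm256_set_epi64x (0, -1, -1, -1);   // low 3 lanes on
     __m256d v = _mm256_maskload_pd (src, m);         // { 1, 2, 3, 0 }
*/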
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}
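/* Editorial note: the _mm256_stream_* intrinsics perform non-temporal
   stores that bypass the cache; the address must be 32-byte aligned,
   and a store fence (e.g. _mm_sfence) is needed before other threads
   may safely observe the data:

     float out[8] __attribute__ ((aligned (32)));
     _mm256_stream_ps (out, _mm256_set1_ps (0.0f));
     _mm_sfence ();
*/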
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}
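/* Editorial note: _mm256_rcp_ps and _mm256_rsqrt_ps return fast
   approximations (roughly 12 bits of precision).  One Newton-Raphson
   step roughly doubles the precision; a sketch for the reciprocal,
   given some __m256 a:

     __m256 x = _mm256_rcp_ps (a);                       // x ~ 1/a
     x = _mm256_mul_ps (x, _mm256_sub_ps (_mm256_set1_ps (2.0f),
                                          _mm256_mul_ps (a, x)));
*/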
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)  _mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR)
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}
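/* Usage sketch (editorial addition): the integer test intrinsics map to
   VPTEST.  testz returns 1 when (M & V) is all zero, testc returns 1
   when (~M & V) is all zero, and testnzc returns 1 when neither holds:

     __m256i v    = _mm256_set1_epi32 (0x1);
     __m256i mask = _mm256_set1_epi32 (0xff);
     int any = !_mm256_testz_si256 (mask, v);   // 1: some masked bit set
*/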
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
                                 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}
/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
               float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
                                 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
                  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
                                          __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
                  short __q11, short __q10, short __q09, short __q08,
                  short __q07, short __q06, short __q05, short __q04,
                  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
                 char __q27, char __q26, char __q25, char __q24,
                 char __q23, char __q22, char __q21, char __q20,
                 char __q19, char __q18, char __q17, char __q16,
                 char __q15, char __q14, char __q13, char __q12,
                 char __q11, char __q10, char __q09, char __q08,
                 char __q07, char __q06, char __q05, char __q04,
                 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
                   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}
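/* Editorial note: the _mm256_set_* intrinsics take arguments from the
   highest element down to the lowest, so the first argument lands in
   the highest lane (the last position in memory order):

     __m256i v = _mm256_set_epi64x (3, 2, 1, 0);
     long long lo = _mm256_extract_epi64 (v, 0);   // 0 (x86-64 only)
*/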
/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
                                 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
                                          __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
                           __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
                float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
                   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
                   short __q11, short __q10, short __q09, short __q08,
                   short __q07, short __q06, short __q05, short __q04,
                   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
                           __q04, __q05, __q06, __q07,
                           __q08, __q09, __q10, __q11,
                           __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
                  char __q27, char __q26, char __q25, char __q24,
                  char __q23, char __q22, char __q21, char __q20,
                  char __q19, char __q18, char __q17, char __q16,
                  char __q15, char __q14, char __q13, char __q12,
                  char __q11, char __q10, char __q09, char __q08,
                  char __q07, char __q06, char __q05, char __q04,
                  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
                          __q04, __q05, __q06, __q07,
                          __q08, __q09, __q10, __q11,
                          __q12, __q13, __q14, __q15,
                          __q16, __q17, __q18, __q19,
                          __q20, __q21, __q22, __q23,
                          __q24, __q25, __q26, __q27,
                          __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
                    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}
/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256 (__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}
/* When a cast is done from a 128-bit to a 256-bit type, the low 128 bits
   of the 256-bit result contain the source parameter value and the upper
   128 bits of the result are undefined.  These intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}
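/* Editorial note: these casts are free at run time, but only the low
   128 bits are meaningful afterwards.  Round-tripping through the low
   half is safe; reading the upper lane without writing it first is not:

     __m128d x = _mm_set1_pd (1.0);
     __m256d y = _mm256_castpd128_pd256 (x);
     __m128d z = _mm256_castpd256_pd128 (y);   // == x; upper lane unused
*/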