27 #ifndef _XMMINTRIN_H_INCLUDED
28 #define _XMMINTRIN_H_INCLUDED
31 # error "SSE instruction set not enabled"
/* Public vector type: four floats in 16 bytes.  __may_alias__ lets objects
   of other types be accessed through an __m128 lvalue.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Internal four-float vector type used in the casts to and from the
   builtins throughout this header.  Its declaration was missing from the
   listing; re-declaring an identical typedef is harmless in C11.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));
/* Pack four 2-bit selectors into one 8-bit immediate: fp3 lands in bits
   7:6, fp2 in 5:4, fp1 in 3:2 and fp0 in 1:0.  Used in this header as the
   mask argument of the shufps builtin.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
/* Bit fields of the control/status word read by _mm_getcsr and written by
   _mm_setcsr.  Each *_MASK value is the union of the flags listed under
   it.  */

/* Exception state flags.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

/* Exception mask bits, in the same order as the state flags above.  */
#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

/* Rounding mode field.  */
#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

/* Flush-to-zero field.  */
#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000
88 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
91 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
98 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
99 _mm_add_ss (__m128
__A, __m128
__B)
101 return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
104 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
105 _mm_sub_ss (__m128 __A, __m128 __B)
107 return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
110 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
111 _mm_mul_ss (__m128 __A, __m128 __B)
113 return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
116 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
117 _mm_div_ss (__m128 __A, __m128 __B)
119 return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
122 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
123 _mm_sqrt_ss (__m128 __A)
125 return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
128 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
129 _mm_rcp_ss (__m128 __A)
131 return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
134 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
135 _mm_rsqrt_ss (__m128 __A)
137 return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
140 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
141 _mm_min_ss (__m128 __A, __m128 __B)
143 return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
146 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
147 _mm_max_ss (__m128 __A, __m128 __B)
149 return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
154 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
155 _mm_add_ps (__m128 __A, __m128 __B)
157 return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
160 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
161 _mm_sub_ps (__m128 __A, __m128 __B)
163 return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
166 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
167 _mm_mul_ps (__m128 __A, __m128 __B)
169 return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
172 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
173 _mm_div_ps (__m128 __A, __m128 __B)
175 return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
178 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
179 _mm_sqrt_ps (__m128 __A)
181 return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
184 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
185 _mm_rcp_ps (__m128 __A)
187 return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
190 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
191 _mm_rsqrt_ps (__m128 __A)
193 return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
196 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
197 _mm_min_ps (__m128 __A, __m128 __B)
199 return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
202 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
203 _mm_max_ps (__m128 __A, __m128 __B)
205 return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
210 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
211 _mm_and_ps (__m128 __A, __m128 __B)
213 return __builtin_ia32_andps (__A, __B);
216 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
217 _mm_andnot_ps (__m128 __A, __m128 __B)
219 return __builtin_ia32_andnps (__A, __B);
222 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
223 _mm_or_ps (__m128 __A, __m128 __B)
225 return __builtin_ia32_orps (__A, __B);
228 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
229 _mm_xor_ps (__m128 __A, __m128 __B)
231 return __builtin_ia32_xorps (__A, __B);
238 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
239 _mm_cmpeq_ss (__m128 __A, __m128 __B)
241 return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
244 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
245 _mm_cmplt_ss (__m128 __A, __m128 __B)
247 return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
250 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
251 _mm_cmple_ss (__m128 __A, __m128 __B)
253 return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
256 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
257 _mm_cmpgt_ss (__m128 __A, __m128 __B)
259 return (__m128) __builtin_ia32_movss ((__v4sf) __A,
261 __builtin_ia32_cmpltss ((__v4sf) __B,
266 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
267 _mm_cmpge_ss (__m128 __A, __m128 __B)
269 return (__m128) __builtin_ia32_movss ((__v4sf) __A,
271 __builtin_ia32_cmpless ((__v4sf) __B,
276 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
277 _mm_cmpneq_ss (__m128 __A, __m128 __B)
279 return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
282 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
283 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
285 return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
288 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
289 _mm_cmpnle_ss (__m128 __A, __m128 __B)
291 return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
294 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
295 _mm_cmpngt_ss (__m128 __A, __m128 __B)
297 return (__m128) __builtin_ia32_movss ((__v4sf) __A,
299 __builtin_ia32_cmpnltss ((__v4sf) __B,
304 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
305 _mm_cmpnge_ss (__m128 __A, __m128 __B)
307 return (__m128) __builtin_ia32_movss ((__v4sf) __A,
309 __builtin_ia32_cmpnless ((__v4sf) __B,
314 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
315 _mm_cmpord_ss (__m128 __A, __m128 __B)
317 return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
320 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
321 _mm_cmpunord_ss (__m128 __A, __m128 __B)
323 return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
330 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
331 _mm_cmpeq_ps (__m128 __A, __m128 __B)
333 return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
336 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
337 _mm_cmplt_ps (__m128 __A, __m128 __B)
339 return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
342 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
343 _mm_cmple_ps (__m128 __A, __m128 __B)
345 return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
348 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
349 _mm_cmpgt_ps (__m128 __A, __m128 __B)
351 return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
354 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
355 _mm_cmpge_ps (__m128 __A, __m128 __B)
357 return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
360 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
361 _mm_cmpneq_ps (__m128 __A, __m128 __B)
363 return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
366 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
367 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
369 return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
372 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
373 _mm_cmpnle_ps (__m128 __A, __m128 __B)
375 return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
378 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
379 _mm_cmpngt_ps (__m128 __A, __m128 __B)
381 return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
384 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
385 _mm_cmpnge_ps (__m128 __A, __m128 __B)
387 return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
390 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
391 _mm_cmpord_ps (__m128 __A, __m128 __B)
393 return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
396 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
397 _mm_cmpunord_ps (__m128 __A, __m128 __B)
399 return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
405 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
406 _mm_comieq_ss (__m128 __A, __m128 __B)
408 return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
411 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
412 _mm_comilt_ss (__m128 __A, __m128 __B)
414 return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
417 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
418 _mm_comile_ss (__m128 __A, __m128 __B)
420 return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
423 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
424 _mm_comigt_ss (__m128 __A, __m128 __B)
426 return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
429 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
430 _mm_comige_ss (__m128 __A, __m128 __B)
432 return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
435 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
436 _mm_comineq_ss (__m128 __A, __m128 __B)
438 return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
441 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
442 _mm_ucomieq_ss (__m128 __A, __m128 __B)
444 return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
447 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
448 _mm_ucomilt_ss (__m128 __A, __m128 __B)
450 return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
453 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
454 _mm_ucomile_ss (__m128 __A, __m128 __B)
456 return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
459 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
460 _mm_ucomigt_ss (__m128 __A, __m128 __B)
462 return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
465 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
466 _mm_ucomige_ss (__m128 __A, __m128 __B)
468 return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
471 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
472 _mm_ucomineq_ss (__m128 __A, __m128 __B)
474 return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
479 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
480 _mm_cvtss_si32 (__m128 __A)
482 return __builtin_ia32_cvtss2si ((__v4sf) __A);
485 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
486 _mm_cvt_ss2si (__m128 __A)
488 return _mm_cvtss_si32 (__A);
496 extern __inline
long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
497 _mm_cvtss_si64 (__m128 __A)
499 return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
503 extern __inline
long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
504 _mm_cvtss_si64x (__m128 __A)
506 return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
512 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
513 _mm_cvtps_pi32 (__m128 __A)
515 return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
518 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
519 _mm_cvt_ps2pi (__m128 __A)
521 return _mm_cvtps_pi32 (__A);
525 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
526 _mm_cvttss_si32 (__m128 __A)
528 return __builtin_ia32_cvttss2si ((__v4sf) __A);
531 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
532 _mm_cvtt_ss2si (__m128 __A)
534 return _mm_cvttss_si32 (__A);
541 extern __inline
long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
542 _mm_cvttss_si64 (__m128 __A)
544 return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
548 extern __inline
long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
549 _mm_cvttss_si64x (__m128 __A)
551 return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
557 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
558 _mm_cvttps_pi32 (__m128 __A)
560 return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
563 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
564 _mm_cvtt_ps2pi (__m128 __A)
566 return _mm_cvttps_pi32 (__A);
570 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
571 _mm_cvtsi32_ss (__m128 __A,
int __B)
573 return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
576 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
577 _mm_cvt_si2ss (__m128 __A,
int __B)
579 return _mm_cvtsi32_ss (__A, __B);
586 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
587 _mm_cvtsi64_ss (__m128 __A,
long long __B)
589 return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
593 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
594 _mm_cvtsi64x_ss (__m128 __A,
long long __B)
596 return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
602 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
603 _mm_cvtpi32_ps (__m128 __A, __m64 __B)
605 return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
608 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
609 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
611 return _mm_cvtpi32_ps (__A, __B);
615 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
616 _mm_cvtpi16_ps (__m64 __A)
619 __v2si __hisi, __losi;
620 __v4sf __zero, __ra, __rb;
625 __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);
628 __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)
__A, __sign);
629 __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)
__A, __sign);
632 __zero = (__v4sf) _mm_setzero_ps ();
633 __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
634 __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);
636 return (__m128) __builtin_ia32_movlhps (__ra, __rb);
640 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
641 _mm_cvtpu16_ps (__m64 __A)
643 __v2si __hisi, __losi;
644 __v4sf __zero, __ra, __rb;
647 __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)
__A, (__v4hi)0LL);
648 __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)
__A, (__v4hi)0LL);
651 __zero = (__v4sf) _mm_setzero_ps ();
652 __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
653 __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);
655 return (__m128) __builtin_ia32_movlhps (__ra, __rb);
659 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
660 _mm_cvtpi8_ps (__m64 __A)
667 __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);
670 __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)
__A, __sign);
672 return _mm_cvtpi16_ps(__A);
676 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
677 _mm_cvtpu8_ps(__m64 __A)
679 __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)
__A, (__v8qi)0LL);
680 return _mm_cvtpu16_ps(__A);
684 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
685 _mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
687 __v4sf __zero = (__v4sf) _mm_setzero_ps ();
688 __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
689 __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
690 return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
694 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
695 _mm_cvtps_pi16(__m128 __A)
697 __v4sf __hisf = (__v4sf)__A;
698 __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
699 __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
700 __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
701 return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
705 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
706 _mm_cvtps_pi8(__m128 __A)
708 __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
709 return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
/* Forward __A, __B and __mask to the shufps builtin.  Two forms with the
   same expansion are provided: an inline function when optimizing, and a
   macro otherwise.  The conditional directives were lost from the listing
   and are restored here.  NOTE(review): confirm against the upstream
   header.  */
#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK)					\
  ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A),			\
				   (__v4sf)(__m128)(B), (int)(MASK)))
#endif
726 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
727 _mm_unpackhi_ps (__m128 __A, __m128 __B)
729 return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
733 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
734 _mm_unpacklo_ps (__m128 __A, __m128 __B)
736 return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
741 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
742 _mm_loadh_pi (__m128 __A, __m64 const *
__P)
744 return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (
const __v2sf *)__P);
748 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
749 _mm_storeh_pi (__m64 *__P, __m128 __A)
751 __builtin_ia32_storehps ((__v2sf *)__P, (__v4sf)__A);
755 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
756 _mm_movehl_ps (__m128 __A, __m128 __B)
758 return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
762 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
763 _mm_movelh_ps (__m128 __A, __m128 __B)
765 return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
770 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
771 _mm_loadl_pi (__m128 __A, __m64 const *__P)
773 return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (
const __v2sf *)__P);
777 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
778 _mm_storel_pi (__m64 *__P, __m128 __A)
780 __builtin_ia32_storelps ((__v2sf *)__P, (__v4sf)__A);
784 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
785 _mm_movemask_ps (__m128 __A)
787 return __builtin_ia32_movmskps ((__v4sf)__A);
791 extern __inline
unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
794 return __builtin_ia32_stmxcsr ();
798 extern __inline
unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
799 _MM_GET_EXCEPTION_STATE (
void)
801 return _mm_getcsr() & _MM_EXCEPT_MASK;
804 extern __inline
unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
805 _MM_GET_EXCEPTION_MASK (
void)
807 return _mm_getcsr() & _MM_MASK_MASK;
810 extern __inline
unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
811 _MM_GET_ROUNDING_MODE (
void)
813 return _mm_getcsr() & _MM_ROUND_MASK;
816 extern __inline
unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
817 _MM_GET_FLUSH_ZERO_MODE (
void)
819 return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
823 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
824 _mm_setcsr (
unsigned int __I)
826 __builtin_ia32_ldmxcsr (__I);
830 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
831 _MM_SET_EXCEPTION_STATE(
unsigned int __mask)
833 _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
836 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
837 _MM_SET_EXCEPTION_MASK (
unsigned int __mask)
839 _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
842 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
843 _MM_SET_ROUNDING_MODE (
unsigned int __mode)
845 _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
848 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
849 _MM_SET_FLUSH_ZERO_MODE (
unsigned int __mode)
851 _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
855 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
856 _mm_set_ss (
float __F)
858 return __extension__ (__m128)(__v4sf){
__F, 0.0f, 0.0f, 0.0f };
862 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
863 _mm_set1_ps (
float __F)
865 return __extension__ (__m128)(__v4sf){
__F,
__F,
__F, __F };
868 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
869 _mm_set_ps1 (
float __F)
871 return _mm_set1_ps (__F);
875 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
876 _mm_load_ss (
float const *__P)
878 return _mm_set_ss (*__P);
882 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
883 _mm_load1_ps (
float const *__P)
885 return _mm_set1_ps (*__P);
888 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
889 _mm_load_ps1 (
float const *__P)
891 return _mm_load1_ps (__P);
895 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
896 _mm_load_ps (
float const *__P)
898 return (__m128) *(__v4sf *)__P;
902 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
903 _mm_loadu_ps (
float const *__P)
905 return (__m128) __builtin_ia32_loadups (__P);
909 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
910 _mm_loadr_ps (
float const *__P)
912 __v4sf __tmp = *(__v4sf *)__P;
913 return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
917 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
918 _mm_set_ps (const
float __Z, const
float __Y, const
float __X, const
float __W)
920 return __extension__ (__m128)(__v4sf){ __W,
__X,
__Y, __Z };
924 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
925 _mm_setr_ps (
float __Z,
float __Y,
float __X,
float __W)
927 return __extension__ (__m128)(__v4sf){
__Z,
__Y,
__X, __W };
931 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
932 _mm_store_ss (
float *__P, __m128 __A)
934 *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
937 extern __inline
float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
938 _mm_cvtss_f32 (__m128 __A)
940 return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
944 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
945 _mm_store_ps (
float *__P, __m128 __A)
947 *(__v4sf *)__P = (__v4sf)
__A;
951 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
952 _mm_storeu_ps (
float *__P, __m128 __A)
954 __builtin_ia32_storeups (__P, (__v4sf)__A);
958 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
959 _mm_store1_ps (
float *__P, __m128 __A)
961 __v4sf __va = (__v4sf)__A;
962 __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
963 _mm_storeu_ps (__P, __tmp);
966 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
967 _mm_store_ps1 (
float *__P, __m128 __A)
969 _mm_store1_ps (__P, __A);
973 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
974 _mm_storer_ps (
float *__P, __m128 __A)
976 __v4sf __va = (__v4sf)__A;
977 __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
978 _mm_store_ps (__P, __tmp);
982 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
983 _mm_move_ss (__m128 __A, __m128 __B)
985 return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
/* Return lane __N of __A (viewed as __v4hi) as an int, using the
   vec_ext_v4hi builtin.  _m_pextrw is an alternate name.  Inline
   functions are used when optimizing and macros otherwise; the
   conditional directives were lost from the listing and are restored
   here.  NOTE(review): confirm against the upstream header.  */
#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N)	\
  ((int) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pextrw(A, N) _mm_extract_pi16(A, N)
#endif
/* Return __A (viewed as __v4hi) with lane __N replaced by __D, using the
   vec_set_v4hi builtin.  _m_pinsrw is an alternate name.  Inline
   functions are used when optimizing and macros otherwise; the
   conditional directives were lost from the listing and are restored
   here.  NOTE(review): confirm against the upstream header.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N)				\
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A),	\
					(int)(D), (int)(N)))

#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N)
#endif
1031 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1032 _mm_max_pi16 (__m64 __A, __m64 __B)
1034 return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
1037 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1038 _m_pmaxsw (__m64 __A, __m64 __B)
1040 return _mm_max_pi16 (__A, __B);
1044 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1045 _mm_max_pu8 (__m64 __A, __m64 __B)
1047 return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
1050 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1051 _m_pmaxub (__m64 __A, __m64 __B)
1053 return _mm_max_pu8 (__A, __B);
1057 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1058 _mm_min_pi16 (__m64 __A, __m64 __B)
1060 return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
1063 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1064 _m_pminsw (__m64 __A, __m64 __B)
1066 return _mm_min_pi16 (__A, __B);
1070 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1071 _mm_min_pu8 (__m64 __A, __m64 __B)
1073 return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
1076 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1077 _m_pminub (__m64 __A, __m64 __B)
1079 return _mm_min_pu8 (__A, __B);
1083 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1084 _mm_movemask_pi8 (__m64 __A)
1086 return __builtin_ia32_pmovmskb ((__v8qi)__A);
1089 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1090 _m_pmovmskb (__m64 __A)
1092 return _mm_movemask_pi8 (__A);
1097 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1098 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1100 return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
1103 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1104 _m_pmulhuw (__m64 __A, __m64 __B)
1106 return _mm_mulhi_pu16 (__A, __B);
/* Forward __A (viewed as __v4hi) and the selector __N to the pshufw
   builtin.  _m_pshufw is an alternate name.  Inline functions are used
   when optimizing and macros otherwise; the conditional directives were
   lost from the listing and are restored here.  NOTE(review): confirm
   against the upstream header.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pshufw(A, N) _mm_shuffle_pi16 (A, N)
#endif
1133 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1134 _mm_maskmove_si64 (__m64 __A, __m64 __N,
char *__P)
1136 __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
1139 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1140 _m_maskmovq (__m64 __A, __m64 __N,
char *__P)
1142 _mm_maskmove_si64 (__A, __N, __P);
1146 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1147 _mm_avg_pu8 (__m64 __A, __m64 __B)
1149 return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
1152 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1153 _m_pavgb (__m64 __A, __m64 __B)
1155 return _mm_avg_pu8 (__A, __B);
1159 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1160 _mm_avg_pu16 (__m64 __A, __m64 __B)
1162 return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
1165 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1166 _m_pavgw (__m64 __A, __m64 __B)
1168 return _mm_avg_pu16 (__A, __B);
1174 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1175 _mm_sad_pu8 (__m64 __A, __m64 __B)
1177 return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
1180 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1181 _m_psadbw (__m64 __A, __m64 __B)
1183 return _mm_sad_pu8 (__A, __B);
/* Issue __builtin_prefetch on __P with a read/write argument of 0 and __I
   as the locality argument.  An inline function is used when optimizing
   and a macro otherwise; the conditional directives were lost from the
   listing and are restored here.  NOTE(review): confirm against the
   upstream header; `enum _mm_hint` is not declared in the visible part of
   this file.  */
#ifdef __OPTIMIZE__
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I) \
  __builtin_prefetch ((P), 0, (I))
#endif
1200 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1201 _mm_stream_pi (__m64 *__P, __m64 __A)
1203 __builtin_ia32_movntq ((
unsigned long long *)__P, (
unsigned long long)__A);
1207 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1208 _mm_stream_ps (
float *__P, __m128 __A)
1210 __builtin_ia32_movntps (__P, (__v4sf)__A);
/* Invoke the sfence builtin.  NOTE(review): the name line is missing from
   the listing; `_mm_sfence` is the usual intrinsic name for this builtin —
   confirm against the upstream header.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}
/* Invoke the pause builtin.  NOTE(review): the name line is missing from
   the listing; `_mm_pause` is the usual intrinsic name for this builtin —
   confirm against the upstream header.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  __builtin_ia32_pause ();
}
/* Transpose the 4x4 matrix held in row0..row3, in place.  The arguments
   are both read and assigned, so each must be a modifiable lvalue.  All
   four rows are copied into temporaries before any is overwritten.  The
   body is wrapped in do { } while (0) so the macro behaves as a single
   statement; that wrapper was lost from the listing and is restored
   here.  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1);			\
  __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3);			\
  __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1);			\
  __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3);			\
  (row0) = __builtin_ia32_movlhps (__t0, __t1);				\
  (row1) = __builtin_ia32_movhlps (__t1, __t0);				\
  (row2) = __builtin_ia32_movlhps (__t2, __t3);				\
  (row3) = __builtin_ia32_movhlps (__t3, __t2);				\
} while (0)
/* Cross-reference index left over from the generated source listing.  It
   is not part of the header; each entry gives a symbol as the listing
   shows it and the location the listing reports for its definition.

     __inline unsigned char unsigned int unsigned int __Y
       Definition: adxintrin.h:33
     __inline __m256i __m256i __B
       Definition: avx2intrin.h:69
     double __v4df __attribute__((__vector_size__(32)))
       Definition: avxintrin.h:32
     __inline unsigned char unsigned int unsigned int unsigned int * __P
       Definition: adxintrin.h:35
     __inline unsigned int unsigned int unsigned __Z
       Definition: bmiintrin.h:55
     __inline void __m256d __A
       Definition: avxintrin.h:828
     __inline __m256d double double double __D
       Definition: avxintrin.h:1183
     __inline unsigned char unsigned int __X
       Definition: adxintrin.h:33
     __inline __m256 float float float float float __F
       Definition: avxintrin.h:1189
 */