STLdoc
STLdocumentation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
dvec.h
Go to the documentation of this file.
1 /***
2 *** Copyright (C) 1985-2015 Intel Corporation. All rights reserved.
3 ***
4 *** The information and source code contained herein is the exclusive
5 *** property of Intel Corporation and may not be disclosed, examined
6 *** or reproduced in whole or in part without explicit written authorization
7 *** from the company.
8 ***
9 ****/
10 
11 /*
12  * Definition of a C++ class interface to Intel(R) Pentium(R) 4 processor SSE2 intrinsics.
13  *
14  * File name : dvec.h class definitions
15  *
16  * Concept: A C++ abstraction of Intel(R) Pentium(R) 4 processor SSE2
17  * designed to improve programmer productivity. Speed and accuracy are
18  * sacrificed for utility. Facilitates an easy transition to compiler
19  * intrinsics or assembly language.
20  *
21  */
22 
23 #ifndef _DVEC_H_INCLUDED
24 #define _DVEC_H_INCLUDED
25 #ifndef RC_INVOKED
26 
27 #if !defined __cplusplus
28  #error ERROR: This file is only supported in C++ compilations!
29 #endif /* !defined __cplusplus */
30 
31 #if defined (_M_CEE_PURE)
32  #error ERROR: This file is not supported in the pure mode!
33 #else /* defined (_M_CEE_PURE) */
34 
35 #include <immintrin.h> /* SSE2 intrinsic function definition include file */
36 #include <fvec.h>
37 #include <vcruntime.h>
38 
39 #ifndef _VEC_ASSERT
40 #ifdef NDEBUG
41  #define _VEC_ASSERT(_Expression) ((void)0)
42 #else /* NDEBUG */
43 #ifdef __cplusplus
44  extern "C" {
45 #endif /* __cplusplus */
46 
47  void __cdecl _wassert(_In_z_ const wchar_t * _Message, _In_z_ const wchar_t *_File, _In_ unsigned _Line);
48 
49 #ifdef __cplusplus
50  }
51 #endif /* __cplusplus */
52 
53  #define _VEC_ASSERT(_Expression) (void)( (!!(_Expression)) || (_wassert(_CRT_WIDE(#_Expression), _CRT_WIDE(__FILE__), __LINE__), 0) )
54 #endif /* NDEBUG */
55 #endif /* _VEC_ASSERT */
56 
57 #pragma pack(push,_CRT_PACKING)
58 
59 /* Define _ENABLE_VEC_DEBUG to enable std::ostream inserters for debug output */
60 #if defined (_ENABLE_VEC_DEBUG)
61  #include <iostream>
62 #endif /* defined (_ENABLE_VEC_DEBUG) */
63 
64 #pragma pack(push,16) /* Must ensure class & union 16-B aligned */
65 
/*
 * Constant behind _f64vec2_abs_mask.
 * The int view initialises the 128 bits (ints 0 and 2 are all ones, ints 1
 * and 3 are 0x7fffffff); the __m128d view `m` is the member the macro below
 * reads.  Judging by its name the value is meant as an absolute-value mask
 * for two packed doubles.
 */
const union
{
    int i[4];
    __m128d m;
} __f64vec2_abs_mask_cheat = {-1, 0x7fffffff, -1, 0x7fffffff};

/* The mask constant viewed as an F64vec2. */
#define _f64vec2_abs_mask ((F64vec2)__f64vec2_abs_mask_cheat.m)
73 
74 /* EMM Functionality Intrinsics */
75 
/* Forward declarations of the 128-bit integer vector classes, widest element first. */
class I128vec1;   /*  1 element,  a __m128i data type */
class I64vec2;    /*  2 elements, each a signed or unsigned 64-bit integer */

class I32vec4;    /*  4 elements, each a signed or unsigned int */
class Is32vec4;   /*  4 elements, each a signed int */
class Iu32vec4;   /*  4 elements, each an unsigned int */

class I16vec8;    /*  8 elements, each a signed or unsigned short */
class Is16vec8;   /*  8 elements, each a signed short */
class Iu16vec8;   /*  8 elements, each an unsigned short */

class I8vec16;    /* 16 elements, each a signed or unsigned char */
class Is8vec16;   /* 16 elements, each a signed char */
class Iu8vec16;   /* 16 elements, each an unsigned char */
87 
/*
 * Element accessors.
 * Each macro takes the address of `vector`, reinterprets it as a pointer to
 * the named element type and yields element number `element` as an lvalue.
 * No bounds checking is done here; callers validate the index themselves.
 *
 * The arguments are parenthesised instead of being token-pasted onto `&` and
 * `+`: such a paste does not produce a single valid preprocessing token and
 * is rejected by conforming preprocessors.
 */
#define _MM_16UB(element,vector) (*((unsigned char*)&(vector) + (element)))
#define _MM_16B(element,vector) (*((signed char*)&(vector) + (element)))

#define _MM_8UW(element,vector) (*((unsigned short*)&(vector) + (element)))
#define _MM_8W(element,vector) (*((short*)&(vector) + (element)))

#define _MM_4UDW(element,vector) (*((unsigned int*)&(vector) + (element)))
#define _MM_4DW(element,vector) (*((int*)&(vector) + (element)))

#define _MM_2QW(element,vector) (*((__int64*)&(vector) + (element)))
98 
99 
100 /* We need a m128i constant, keeping performance in mind*/
101 
102 #pragma warning(push)
103 #pragma warning(disable : 4640)
104 inline const __m128i get_mask128()
105 {
106  static const __m128i _Mask128 = _mm_set1_epi64x(0xffffffffffffffffi64);
107  return _Mask128;
108 }
109 #pragma warning(pop)
110 
111 
112 //DEVDIV Remove alias created in public\sdk\inc\winnt.h
113 #ifdef M128
114 #undef M128
115 #endif /* M128 */
116 #ifdef PM128
117 #undef PM128
118 #endif /* PM128 */
119 //end DEVDIV
120 
/* M128 Class:
 * 1 element, a __m128i data type
 * Constructors & Logical Operations
 *
 * Base of every integer vector class in this file; the derived classes
 * operate directly on the protected member `vec`.
 */

class M128
{
protected:
    __m128i vec;    /* the wrapped 128-bit value */

public:
    /* Default construction leaves vec uninitialized. */
    M128() { }
    /* Wraps an existing __m128i value. */
    M128(__m128i _Mm) { vec = _Mm; }

    /* Implicit conversion back to the raw intrinsic type. */
    operator __m128i() const { return vec; }

    /* Logical Operations: combine all 128 bits with _A in place, return *this */
    M128& operator&=(const M128 &_A) { vec = _mm_and_si128(vec, _A); return *this; }
    M128& operator|=(const M128 &_A) { vec = _mm_or_si128(vec, _A); return *this; }
    M128& operator^=(const M128 &_A) { vec = _mm_xor_si128(vec, _A); return *this; }

};
143 
144 inline M128 operator&(const M128 &_A, const M128 &_B) { return _mm_and_si128(_A,_B); }
145 inline M128 operator|(const M128 &_A, const M128 &_B) { return _mm_or_si128(_A,_B); }
146 inline M128 operator^(const M128 &_A, const M128 &_B) { return _mm_xor_si128(_A,_B); }
147 inline M128 andnot(const M128 &_A, const M128 &_B) { return _mm_andnot_si128(_A,_B); }
148 
149 /* I128vec1 Class:
150  * 1 element, a __m128i data type
151  * Contains Operations which can operate on any __m6128i data type
152  */
153 
154 class I128vec1 : public M128
155 {
156 public:
157  I128vec1() { }
158  I128vec1(__m128i _Mm) : M128(_Mm) { }
159 
160  I128vec1& operator= (const M128 &_A) { return *this = (I128vec1) _A; }
161  I128vec1& operator&=(const M128 &_A) { return *this = (I128vec1) _mm_and_si128(vec,_A); }
162  I128vec1& operator|=(const M128 &_A) { return *this = (I128vec1) _mm_or_si128(vec,_A); }
163  I128vec1& operator^=(const M128 &_A) { return *this = (I128vec1) _mm_xor_si128(vec,_A); }
164 
165 };
166 
167 /* I64vec2 Class:
168  * 2 elements, each element signed or unsigned 64-bit integer
169  */
170 class I64vec2 : public M128
171 {
172 public:
173  I64vec2() { }
174  I64vec2(__m128i _Mm) : M128(_Mm) { }
175 
176  I64vec2(__m64 _Q1, __m64 _Q0)
177  {
178  _MM_2QW(0,vec) = *(__int64*)&_Q0;
179  _MM_2QW(1,vec) = *(__int64*)&_Q1;
180  }
181 
182  /* Assignment Operator */
183  I64vec2& operator= (const M128 &_A) { return *this = (I64vec2) _A; }
184 
185  /* Logical Assignment Operators */
186  I64vec2& operator&=(const M128 &_A) { return *this = (I64vec2) _mm_and_si128(vec,_A); }
187  I64vec2& operator|=(const M128 &_A) { return *this = (I64vec2) _mm_or_si128(vec,_A); }
188  I64vec2& operator^=(const M128 &_A) { return *this = (I64vec2) _mm_xor_si128(vec,_A); }
189 
190  /* Addition & Subtraction Assignment Operators */
191  I64vec2& operator +=(const I64vec2 &_A) { return *this = (I64vec2) _mm_add_epi64(vec,_A); }
192  I64vec2& operator -=(const I64vec2 &_A) { return *this = (I64vec2) _mm_sub_epi64(vec,_A); }
193 
194  /* Shift Logical Operators */
195  I64vec2 operator<<(const I64vec2 &_A) { return _mm_sll_epi64(vec,_A); }
196  I64vec2 operator<<(int _Count) { return _mm_slli_epi64(vec,_Count); }
197  I64vec2& operator<<=(const I64vec2 &_A) { return *this = (I64vec2) _mm_sll_epi64(vec,_A); }
198  I64vec2& operator<<=(int _Count) { return *this = (I64vec2) _mm_slli_epi64(vec,_Count); }
199  I64vec2 operator>>(const I64vec2 &_A) { return _mm_srl_epi64(vec,_A); }
200  I64vec2 operator>>(int _Count) { return _mm_srli_epi64(vec,_Count); }
201  I64vec2& operator>>=(const I64vec2 &_A) { return *this = (I64vec2) _mm_srl_epi64(vec,_A); }
202  I64vec2& operator>>=(int _Count) { return *this = (I64vec2) _mm_srli_epi64(vec,_Count); }
203 
204  /* Element Access for Debug, No data modified */
205  const __int64& operator[](int _I)const
206  {
207  _VEC_ASSERT(static_cast<unsigned int>(_I) < 2); /* Only 2 elements to access */
208  return _MM_2QW(_I,vec);
209  }
210 
211  /* Element Access and Assignment for Debug */
212  __int64& operator[](int _I)
213  {
214  _VEC_ASSERT(static_cast<unsigned int>(_I) < 2); /* Only 2 elements to access */
215  return _MM_2QW(_I,vec);
216  }
217 
218 
219 };
220 
221 /* Unpacks */
222 inline I64vec2 unpack_low(const I64vec2 &_A, const I64vec2 &_B) {return _mm_unpacklo_epi64(_A,_B); }
223 inline I64vec2 unpack_high(const I64vec2 &_A, const I64vec2 &_B) {return _mm_unpackhi_epi64(_A,_B); }
224 
225 /* I32vec4 Class:
226  * 4 elements, each element either a signed or unsigned int
227  */
228 class I32vec4 : public M128
229 {
230 public:
231  I32vec4() { }
232  I32vec4(__m128i _Mm) : M128(_Mm) { }
233  I32vec4(int _I3, int _I2, int _I1, int _I0) {vec = _mm_set_epi32(_I3, _I2, _I1, _I0);}
234 
235  /* Assignment Operator */
236  I32vec4& operator= (const M128 &_A) { return *this = (I32vec4) _A; }
237 
238  /* Logicals Operators */
239  I32vec4& operator&=(const M128 &_A) { return *this = (I32vec4) _mm_and_si128(vec,_A); }
240  I32vec4& operator|=(const M128 &_A) { return *this = (I32vec4) _mm_or_si128(vec,_A); }
241  I32vec4& operator^=(const M128 &_A) { return *this = (I32vec4) _mm_xor_si128(vec,_A); }
242 
243  /* Addition & Subtraction Assignment Operators */
244  I32vec4& operator +=(const I32vec4 &_A) { return *this = (I32vec4)_mm_add_epi32(vec,_A); }
245  I32vec4& operator -=(const I32vec4 &_A) { return *this = (I32vec4)_mm_sub_epi32(vec,_A); }
246 
247  /* Shift Logical Operators */
248  I32vec4 operator<<(const I32vec4 &_A) { return _mm_sll_epi32(vec,_A); }
249  I32vec4 operator<<(int _Count) { return _mm_slli_epi32(vec,_Count); }
250  I32vec4& operator<<=(const I32vec4 &_A) { return *this = (I32vec4)_mm_sll_epi32(vec,_A); }
251  I32vec4& operator<<=(int _Count) { return *this = (I32vec4)_mm_slli_epi32(vec,_Count); }
252 
253 };
254 
255 inline I32vec4 cmpeq(const I32vec4 &_A, const I32vec4 &_B) { return _mm_cmpeq_epi32(_A,_B); }
256 inline I32vec4 cmpneq(const I32vec4 &_A, const I32vec4 &_B) { return _mm_andnot_si128(_mm_cmpeq_epi32(_A,_B), get_mask128()); }
257 
258 inline I32vec4 unpack_low(const I32vec4 &_A, const I32vec4 &_B) { return _mm_unpacklo_epi32(_A,_B); }
259 inline I32vec4 unpack_high(const I32vec4 &_A, const I32vec4 &_B) { return _mm_unpackhi_epi32(_A,_B); }
260 
261 /* Is32vec4 Class:
262  * 4 elements, each element signed integer
263  */
264 class Is32vec4 : public I32vec4
265 {
266 public:
267  Is32vec4() { }
268  Is32vec4(__m128i _Mm) : I32vec4(_Mm) { }
269  Is32vec4(int _I3, int _I2, int _I1, int _I0) : I32vec4(_I3, _I2, _I1, _I0){}
270 
271  /* Assignment Operator */
272  Is32vec4& operator= (const M128 &_A) { return *this = (Is32vec4) _A; }
273 
274  /* Logical Operators */
275  Is32vec4& operator&=(const M128 &_A) { return *this = (Is32vec4) _mm_and_si128(vec,_A); }
276  Is32vec4& operator|=(const M128 &_A) { return *this = (Is32vec4) _mm_or_si128(vec,_A); }
277  Is32vec4& operator^=(const M128 &_A) { return *this = (Is32vec4) _mm_xor_si128(vec,_A); }
278 
279  /* Addition & Subtraction Assignment Operators */
280  Is32vec4& operator +=(const I32vec4 &_A) { return *this = (Is32vec4)_mm_add_epi32(vec,_A); }
281  Is32vec4& operator -=(const I32vec4 &_A) { return *this = (Is32vec4)_mm_sub_epi32(vec,_A); }
282 
283  /* Shift Logical Operators */
284  Is32vec4 operator<<(const M128 &_A) { return _mm_sll_epi32(vec,_A); }
285  Is32vec4 operator<<(int _Count) { return _mm_slli_epi32(vec,_Count); }
286  Is32vec4& operator<<=(const M128 &_A) { return *this = (Is32vec4)_mm_sll_epi32(vec,_A); }
287  Is32vec4& operator<<=(int _Count) { return *this = (Is32vec4)_mm_slli_epi32(vec,_Count); }
288  /* Shift Arithmetic Operations */
289  Is32vec4 operator>>(const M128 &_A) { return _mm_sra_epi32(vec,_A); }
290  Is32vec4 operator>>(int _Count) { return _mm_srai_epi32(vec,_Count); }
291  Is32vec4& operator>>=(const M128 &_A) { return *this = (Is32vec4) _mm_sra_epi32(vec,_A); }
292  Is32vec4& operator>>=(int _Count) { return *this = (Is32vec4) _mm_srai_epi32(vec,_Count); }
293 
294 #if defined (_ENABLE_VEC_DEBUG)
295  /* Output for Debug */
296  friend std::ostream& operator<< (std::ostream &_Os, const Is32vec4 &_A)
297  {
298  _Os << "[3]:" << _MM_4DW(3,_A)
299  << " [2]:" << _MM_4DW(2,_A)
300  << " [1]:" << _MM_4DW(1,_A)
301  << " [0]:" << _MM_4DW(0,_A);
302  return _Os;
303  }
304 #endif /* defined (_ENABLE_VEC_DEBUG) */
305 
306  /* Element Access for Debug, No data modified */
307  const int& operator[](int _I)const
308  {
309  _VEC_ASSERT(static_cast<unsigned int>(_I) < 4); /* Only 4 elements to access */
310  return _MM_4DW(_I,vec);
311  }
312 
313  /* Element Access for Debug */
314  int& operator[](int _I)
315  {
316  _VEC_ASSERT(static_cast<unsigned int>(_I) < 4); /* Only 4 elements to access */
317  return _MM_4DW(_I,vec);
318  }
319 };
320 
321 /* Compares */
322 inline Is32vec4 cmpeq(const Is32vec4 &_A, const Is32vec4 &_B) { return _mm_cmpeq_epi32(_A,_B); }
323 inline Is32vec4 cmpneq(const Is32vec4 &_A, const Is32vec4 &_B) { return _mm_andnot_si128(_mm_cmpeq_epi32(_A,_B), get_mask128()); }
324 inline Is32vec4 cmpgt(const Is32vec4 &_A, const Is32vec4 &_B) { return _mm_cmpgt_epi32(_A,_B); }
325 inline Is32vec4 cmplt(const Is32vec4 &_A, const Is32vec4 &_B) { return _mm_cmpgt_epi32(_B,_A); }
326 
327 /* Unpacks */
328 inline Is32vec4 unpack_low(const Is32vec4 &_A, const Is32vec4 &_B) { return _mm_unpacklo_epi32(_A,_B); }
329 inline Is32vec4 unpack_high(const Is32vec4 &_A, const Is32vec4 &_B) { return _mm_unpackhi_epi32(_A,_B); }
330 
331 /* Iu32vec4 Class:
332  * 4 elements, each element unsigned int
333  */
334 class Iu32vec4 : public I32vec4
335 {
336 public:
337  Iu32vec4() { }
338  Iu32vec4(__m128i _Mm) : I32vec4(_Mm) { }
339  Iu32vec4(unsigned int _Ui3, unsigned int _Ui2, unsigned int _Ui1, unsigned int _Ui0)
340  : I32vec4(_Ui3, _Ui2, _Ui1, _Ui0) { }
341 
342  /* Assignment Operator */
343  Iu32vec4& operator= (const M128 &_A) { return *this = (Iu32vec4) _A; }
344 
345  /* Logical Assignment Operators */
346  Iu32vec4& operator&=(const M128 &_A) { return *this = (Iu32vec4) _mm_and_si128(vec,_A); }
347  Iu32vec4& operator|=(const M128 &_A) { return *this = (Iu32vec4) _mm_or_si128(vec,_A); }
348  Iu32vec4& operator^=(const M128 &_A) { return *this = (Iu32vec4) _mm_xor_si128(vec,_A); }
349 
350  /* Addition & Subtraction Assignment Operators */
351  Iu32vec4& operator +=(const I32vec4 &_A) { return *this = (Iu32vec4)_mm_add_epi32(vec,_A); }
352  Iu32vec4& operator -=(const I32vec4 &_A) { return *this = (Iu32vec4)_mm_sub_epi32(vec,_A); }
353 
354  /* Shift Logical Operators */
355  Iu32vec4 operator<<(const M128 &_A) { return _mm_sll_epi32(vec,_A); }
356  Iu32vec4 operator<<(int _Count) { return _mm_slli_epi32(vec,_Count); }
357  Iu32vec4& operator<<=(const M128 &_A) { return *this = (Iu32vec4)_mm_sll_epi32(vec,_A); }
358  Iu32vec4& operator<<=(int _Count) { return *this = (Iu32vec4)_mm_slli_epi32(vec,_Count); }
359  Iu32vec4 operator>>(const M128 &_A) { return _mm_srl_epi32(vec,_A); }
360  Iu32vec4 operator>>(int _Count) { return _mm_srli_epi32(vec,_Count); }
361  Iu32vec4& operator>>=(const M128 &_A) { return *this = (Iu32vec4) _mm_srl_epi32(vec,_A); }
362  Iu32vec4& operator>>=(int _Count) { return *this = (Iu32vec4) _mm_srli_epi32(vec,_Count); }
363 
364 #if defined (_ENABLE_VEC_DEBUG)
365  /* Output for Debug */
366  friend std::ostream& operator<< (std::ostream &_Os, const Iu32vec4 &_A)
367  {
368  _Os << "[3]:" << _MM_4UDW(3,_A)
369  << " [2]:" << _MM_4UDW(2,_A)
370  << " [1]:" << _MM_4UDW(1,_A)
371  << " [0]:" << _MM_4UDW(0,_A);
372  return _Os;
373  }
374 #endif /* defined (_ENABLE_VEC_DEBUG) */
375 
376  /* Element Access for Debug, No data modified */
377  const unsigned int& operator[](int _I)const
378  {
379  _VEC_ASSERT(static_cast<unsigned int>(_I) < 4); /* Only 4 elements to access */
380  return _MM_4UDW(_I,vec);
381  }
382 
383  /* Element Access and Assignment for Debug */
384  unsigned int& operator[](int _I)
385  {
386  _VEC_ASSERT(static_cast<unsigned int>(_I) < 4); /* Only 4 elements to access */
387  return _MM_4UDW(_I,vec);
388  }
389 };
390 
391 inline I64vec2 operator*(const Iu32vec4 &_A, const Iu32vec4 &_B) { return _mm_mul_epu32(_A,_B); }
392 inline Iu32vec4 cmpeq(const Iu32vec4 &_A, const Iu32vec4 &_B) { return _mm_cmpeq_epi32(_A,_B); }
393 inline Iu32vec4 cmpneq(const Iu32vec4 &_A, const Iu32vec4 &_B) { return _mm_andnot_si128(_mm_cmpeq_epi32(_A,_B), get_mask128()); }
394 
395 inline Iu32vec4 unpack_low(const Iu32vec4 &_A, const Iu32vec4 &_B) { return _mm_unpacklo_epi32(_A,_B); }
396 inline Iu32vec4 unpack_high(const Iu32vec4 &_A, const Iu32vec4 &_B) { return _mm_unpackhi_epi32(_A,_B); }
397 
398 /* I16vec8 Class:
399  * 8 elements, each element either unsigned or signed short
400  */
401 class I16vec8 : public M128
402 {
403 public:
404  I16vec8() { }
405  I16vec8(__m128i _Mm) : M128(_Mm) { }
406  I16vec8(short _S7, short _S6, short _S5, short _S4, short _S3, short _S2, short _S1, short _S0)
407  {
408  vec = _mm_set_epi16(_S7, _S6, _S5, _S4, _S3, _S2, _S1, _S0);
409  }
410 
411  /* Assignment Operator */
412  I16vec8& operator= (const M128 &_A) { return *this = (I16vec8) _A; }
413 
414  /* Logical Assignment Operators */
415  I16vec8& operator&=(const M128 &_A) { return *this = (I16vec8) _mm_and_si128(vec,_A); }
416  I16vec8& operator|=(const M128 &_A) { return *this = (I16vec8) _mm_or_si128(vec,_A); }
417  I16vec8& operator^=(const M128 &_A) { return *this = (I16vec8) _mm_xor_si128(vec,_A); }
418 
419  /* Addition & Subtraction Assignment Operators */
420  I16vec8& operator +=(const I16vec8 &_A) { return *this = (I16vec8) _mm_add_epi16(vec,_A); }
421  I16vec8& operator -=(const I16vec8 &_A) { return *this = (I16vec8) _mm_sub_epi16(vec,_A); }
422  I16vec8& operator *=(const I16vec8 &_A) { return *this = (I16vec8) _mm_mullo_epi16(vec,_A); }
423 
424  /* Shift Logical Operators */
425  I16vec8 operator<<(const M128 &_A) { return _mm_sll_epi16(vec,_A); }
426  I16vec8 operator<<(int _Count) { return _mm_slli_epi16(vec,_Count); }
427  I16vec8& operator<<=(const M128 &_A) { return *this = (I16vec8)_mm_sll_epi16(vec,_A); }
428  I16vec8& operator<<=(int _Count) { return *this = (I16vec8)_mm_slli_epi16(vec,_Count); }
429 
430 };
431 
432 
433 inline I16vec8 operator*(const I16vec8 &_A, const I16vec8 &_B) { return _mm_mullo_epi16(_A,_B); }
434 
435 inline I16vec8 cmpeq(const I16vec8 &_A, const I16vec8 &_B) { return _mm_cmpeq_epi16(_A,_B); }
436 inline I16vec8 cmpneq(const I16vec8 &_A, const I16vec8 &_B) { return _mm_andnot_si128(_mm_cmpeq_epi16(_A,_B), get_mask128()); }
437 
438 inline I16vec8 unpack_low(const I16vec8 &_A, const I16vec8 &_B) { return _mm_unpacklo_epi16(_A,_B); }
439 inline I16vec8 unpack_high(const I16vec8 &_A, const I16vec8 &_B) { return _mm_unpackhi_epi16(_A,_B); }
440 
441 /* Is16vec8 Class:
442  * 8 elements, each element signed short
443  */
444 class Is16vec8 : public I16vec8
445 {
446 public:
447  Is16vec8() { }
448  Is16vec8(__m128i _Mm) : I16vec8(_Mm) { }
449  Is16vec8(signed short _S7, signed short _S6, signed short _S5,
450  signed short _S4, signed short _S3, signed short _S2,
451  signed short _S1, signed short _S0)
452  : I16vec8(_S7, _S6, _S5, _S4, _S3, _S2, _S1, _S0) { }
453 
454  /* Assignment Operator */
455  Is16vec8& operator= (const M128 &_A) { return *this = (Is16vec8) _A; }
456 
457  /* Logical Assignment Operators */
458  Is16vec8& operator&=(const M128 &_A) { return *this = (Is16vec8) _mm_and_si128(vec,_A); }
459  Is16vec8& operator|=(const M128 &_A) { return *this = (Is16vec8) _mm_or_si128(vec,_A); }
460  Is16vec8& operator^=(const M128 &_A) { return *this = (Is16vec8) _mm_xor_si128(vec,_A); }
461 
462  /* Addition & Subtraction Assignment Operators */
463  Is16vec8& operator +=(const I16vec8 &_A) { return *this = (Is16vec8) _mm_add_epi16(vec,_A); }
464  Is16vec8& operator -=(const I16vec8 &_A) { return *this = (Is16vec8) _mm_sub_epi16(vec,_A); }
465  Is16vec8& operator *=(const I16vec8 &_A) { return *this = (Is16vec8) _mm_mullo_epi16(vec,_A); }
466 
467  /* Shift Logical Operators */
468  Is16vec8 operator<<(const M128 &_A) { return _mm_sll_epi16(vec,_A); }
469  Is16vec8 operator<<(int _Count) { return _mm_slli_epi16(vec,_Count); }
470  Is16vec8& operator<<=(const M128 &_A) { return *this = (Is16vec8)_mm_sll_epi16(vec,_A); }
471  Is16vec8& operator<<=(int _Count) { return *this = (Is16vec8)_mm_slli_epi16(vec,_Count); }
472  /* Shift Arithmetic Operators */
473  Is16vec8 operator>>(const M128 &_A) { return _mm_sra_epi16(vec,_A); }
474  Is16vec8 operator>>(int _Count) { return _mm_srai_epi16(vec,_Count); }
475  Is16vec8& operator>>=(const M128 &_A) { return *this = (Is16vec8)_mm_sra_epi16(vec,_A); }
476  Is16vec8& operator>>=(int _Count) { return *this = (Is16vec8)_mm_srai_epi16(vec,_Count); }
477 
478 #if defined (_ENABLE_VEC_DEBUG)
479  /* Output for Debug */
480  friend std::ostream& operator<< (std::ostream &_Os, const Is16vec8 &_A)
481  {
482  _Os << "[7]:" << _MM_8W(7,_A)
483  << " [6]:" << _MM_8W(6,_A)
484  << " [5]:" << _MM_8W(5,_A)
485  << " [4]:" << _MM_8W(4,_A)
486  << " [3]:" << _MM_8W(3,_A)
487  << " [2]:" << _MM_8W(2,_A)
488  << " [1]:" << _MM_8W(1,_A)
489  << " [0]:" << _MM_8W(0,_A);
490  return _Os;
491  }
492 #endif /* defined (_ENABLE_VEC_DEBUG) */
493 
494  /* Element Access for Debug, No data modified */
495  const signed short& operator[](int _I)const
496  {
497  _VEC_ASSERT(static_cast<unsigned int>(_I) < 8); /* Only 8 elements to access */
498  return _MM_8W(_I,vec);
499  }
500 
501  /* Element Access and Assignment for Debug */
502  signed short& operator[](int _I)
503  {
504  _VEC_ASSERT(static_cast<unsigned int>(_I) < 8); /* Only 8 elements to access */
505  return _MM_8W(_I,vec);
506  }
507 };
508 
509 inline Is16vec8 operator*(const Is16vec8 &_A, const Is16vec8 &_B) { return _mm_mullo_epi16(_A,_B); }
510 
511 
512 /* Additional Is16vec8 functions: compares, unpacks, sat add/sub */
513 inline Is16vec8 cmpeq(const Is16vec8 &_A, const Is16vec8 &_B) { return _mm_cmpeq_epi16(_A,_B); }
514 inline Is16vec8 cmpneq(const Is16vec8 &_A, const Is16vec8 &_B) { return _mm_andnot_si128(_mm_cmpeq_epi16(_A,_B), get_mask128()); }
515 inline Is16vec8 cmpgt(const Is16vec8 &_A, const Is16vec8 &_B) { return _mm_cmpgt_epi16(_A,_B); }
516 inline Is16vec8 cmplt(const Is16vec8 &_A, const Is16vec8 &_B) { return _mm_cmpgt_epi16(_B,_A); }
517 
518 inline Is16vec8 unpack_low(const Is16vec8 &_A, const Is16vec8 &_B) { return _mm_unpacklo_epi16(_A,_B); }
519 inline Is16vec8 unpack_high(const Is16vec8 &_A, const Is16vec8 &_B) { return _mm_unpackhi_epi16(_A,_B); }
520 
521 inline Is16vec8 mul_high(const Is16vec8 &_A, const Is16vec8 &_B) { return _mm_mulhi_epi16(_A,_B); }
522 inline Is32vec4 mul_add(const Is16vec8 &_A, const Is16vec8 &_B) { return _mm_madd_epi16(_A,_B);}
523 
524 inline Is16vec8 sat_add(const Is16vec8 &_A, const Is16vec8 &_B) { return _mm_adds_epi16(_A,_B); }
525 inline Is16vec8 sat_sub(const Is16vec8 &_A, const Is16vec8 &_B) { return _mm_subs_epi16(_A,_B); }
526 
527 inline Is16vec8 simd_max(const Is16vec8 &_A, const Is16vec8 &_B) { return _mm_max_epi16(_A,_B); }
528 inline Is16vec8 simd_min(const Is16vec8 &_A, const Is16vec8 &_B) { return _mm_min_epi16(_A,_B); }
529 
530 
531 /* Iu16vec8 Class:
532  * 8 elements, each element unsigned short
533  */
534 class Iu16vec8 : public I16vec8
535 {
536 public:
537  Iu16vec8() { }
538  Iu16vec8(__m128i _Mm) : I16vec8(_Mm) { }
539  Iu16vec8(unsigned short _S7, unsigned short _S6, unsigned short _S5,
540  unsigned short _S4, unsigned short _S3, unsigned short _S2,
541  unsigned short _S1, unsigned short _S0)
542  : I16vec8(_S7, _S6, _S5, _S4, _S3, _S2, _S1, _S0) { }
543 
544  /* Assignment Operator */
545  Iu16vec8& operator= (const M128 &_A) { return *this = (Iu16vec8) _A; }
546  /* Logical Assignment Operators */
547  Iu16vec8& operator&=(const M128 &_A) { return *this = (Iu16vec8) _mm_and_si128(vec,_A); }
548  Iu16vec8& operator|=(const M128 &_A) { return *this = (Iu16vec8) _mm_or_si128(vec,_A); }
549  Iu16vec8& operator^=(const M128 &_A) { return *this = (Iu16vec8) _mm_xor_si128(vec,_A); }
550  /* Addition & Subtraction Assignment Operators */
551  Iu16vec8& operator +=(const I16vec8 &_A) { return *this = (Iu16vec8) _mm_add_epi16(vec,_A); }
552  Iu16vec8& operator -=(const I16vec8 &_A) { return *this = (Iu16vec8) _mm_sub_epi16(vec,_A); }
553  Iu16vec8& operator *=(const I16vec8 &_A) { return *this = (Iu16vec8) _mm_mullo_epi16(vec,_A); }
554 
555  /* Shift Logical Operators */
556  Iu16vec8 operator<<(const M128 &_A) { return _mm_sll_epi16(vec,_A); }
557  Iu16vec8 operator<<(int _Count) { return _mm_slli_epi16(vec,_Count); }
558  Iu16vec8& operator<<=(const M128 &_A) { return *this = (Iu16vec8)_mm_sll_epi16(vec,_A); }
559  Iu16vec8& operator<<=(int _Count) { return *this = (Iu16vec8)_mm_slli_epi16(vec,_Count); }
560  Iu16vec8 operator>>(const M128 &_A) { return _mm_srl_epi16(vec,_A); }
561  Iu16vec8 operator>>(int _Count) { return _mm_srli_epi16(vec,_Count); }
562  Iu16vec8& operator>>=(const M128 &_A) { return *this = (Iu16vec8) _mm_srl_epi16(vec,_A); }
563  Iu16vec8& operator>>=(int _Count) { return *this = (Iu16vec8) _mm_srli_epi16(vec,_Count); }
564 
565 
566 #if defined (_ENABLE_VEC_DEBUG)
567  /* Output for Debug */
568  friend std::ostream& operator << (std::ostream &_Os, const Iu16vec8 &_A)
569  {
570  _Os << "[7]:" << (unsigned short)(_MM_8UW(7,_A))
571  << " [6]:" << (unsigned short)(_MM_8UW(6,_A))
572  << " [5]:" << (unsigned short)(_MM_8UW(5,_A))
573  << " [4]:" << (unsigned short)(_MM_8UW(4,_A))
574  << " [3]:" << (unsigned short)(_MM_8UW(3,_A))
575  << " [2]:" << (unsigned short)(_MM_8UW(2,_A))
576  << " [1]:" << (unsigned short)(_MM_8UW(1,_A))
577  << " [0]:" << (unsigned short)(_MM_8UW(0,_A));
578  return _Os;
579  }
580 #endif /* defined (_ENABLE_VEC_DEBUG) */
581 
582  /* Element Access for Debug, No data modified */
583  const unsigned short& operator[](int _I)const
584  {
585  _VEC_ASSERT(static_cast<unsigned int>(_I) < 8); /* Only 8 elements to access */
586  return _MM_8UW(_I,vec);
587  }
588 
589  /* Element Access for Debug */
590  unsigned short& operator[](int _I)
591  {
592  _VEC_ASSERT(static_cast<unsigned int>(_I) < 8); /* Only 8 elements to access */
593  return _MM_8UW(_I,vec);
594  }
595 };
596 
597 inline Iu16vec8 operator*(const Iu16vec8 &_A, const Iu16vec8 &_B) { return _mm_mullo_epi16(_A,_B); }
598 
599 /* Additional Iu16vec8 functions: cmpeq,cmpneq, unpacks, sat add/sub */
600 inline Iu16vec8 cmpeq(const Iu16vec8 &_A, const Iu16vec8 &_B) { return _mm_cmpeq_epi16(_A,_B); }
601 inline Iu16vec8 cmpneq(const Iu16vec8 &_A, const Iu16vec8 &_B) { return _mm_andnot_si128(_mm_cmpeq_epi16(_A,_B), get_mask128()); }
602 
603 inline Iu16vec8 unpack_low(const Iu16vec8 &_A, const Iu16vec8 &_B) { return _mm_unpacklo_epi16(_A,_B); }
604 inline Iu16vec8 unpack_high(const Iu16vec8 &_A, const Iu16vec8 &_B) { return _mm_unpackhi_epi16(_A,_B); }
605 
606 inline Iu16vec8 sat_add(const Iu16vec8 &_A, const Iu16vec8 &_B) { return _mm_adds_epu16(_A,_B); }
607 inline Iu16vec8 sat_sub(const Iu16vec8 &_A, const Iu16vec8 &_B) { return _mm_subs_epu16(_A,_B); }
608 
609 inline Iu16vec8 simd_avg(const Iu16vec8 &_A, const Iu16vec8 &_B) { return _mm_avg_epu16(_A,_B); }
610 inline I16vec8 mul_high(const Iu16vec8 &_A, const Iu16vec8 &_B) { return _mm_mulhi_epu16(_A,_B); }
611 
612 /* I8vec16 Class:
613  * 16 elements, each element either unsigned or signed char
614  */
615 class I8vec16 : public M128
616 {
617 public:
618  I8vec16() { }
619  I8vec16(__m128i _Mm) : M128(_Mm) { }
620  I8vec16(char _S15, char _S14, char _S13, char _S12, char _S11, char _S10,
621  char _S9, char _S8, char _S7, char _S6, char _S5, char _S4,
622  char _S3, char _S2, char _S1, char _S0)
623  {
624  vec = _mm_set_epi8(_S15, _S14, _S13, _S12, _S11, _S10, _S9, _S8, _S7, _S6, _S5, _S4, _S3, _S2, _S1, _S0);
625  }
626 
627  /* Assignment Operator */
628  I8vec16& operator= (const M128 &_A) { return *this = (I8vec16) _A; }
629 
630  /* Logical Assignment Operators */
631  I8vec16& operator&=(const M128 &_A) { return *this = (I8vec16) _mm_and_si128(vec,_A); }
632  I8vec16& operator|=(const M128 &_A) { return *this = (I8vec16) _mm_or_si128(vec,_A); }
633  I8vec16& operator^=(const M128 &_A) { return *this = (I8vec16) _mm_xor_si128(vec,_A); }
634 
635  /* Addition & Subtraction Assignment Operators */
636  I8vec16& operator +=(const I8vec16 &_A) { return *this = (I8vec16) _mm_add_epi8(vec,_A); }
637  I8vec16& operator -=(const I8vec16 &_A) { return *this = (I8vec16) _mm_sub_epi8(vec,_A); }
638 
639 };
640 
641 inline I8vec16 cmpeq(const I8vec16 &_A, const I8vec16 &_B) { return _mm_cmpeq_epi8(_A,_B); }
642 inline I8vec16 cmpneq(const I8vec16 &_A, const I8vec16 &_B) { return _mm_andnot_si128(_mm_cmpeq_epi8(_A,_B), get_mask128()); }
643 
644 inline I8vec16 unpack_low(const I8vec16 &_A, const I8vec16 &_B) { return _mm_unpacklo_epi8(_A,_B); }
645 inline I8vec16 unpack_high(const I8vec16 &_A, const I8vec16 &_B) { return _mm_unpackhi_epi8(_A,_B); }
646 
647 /* Is8vec16 Class:
648  * 16 elements, each element a signed char
649  */
650 class Is8vec16 : public I8vec16
651 {
652 public:
653  Is8vec16() { }
654  Is8vec16(__m128i _Mm) : I8vec16(_Mm) { }
655  Is8vec16(char _S15, char _S14, char _S13, char _S12, char _S11, char _S10,
656  char _S9, char _S8, char _S7, char _S6, char _S5, char _S4,
657  char _S3, char _S2, char _S1, char _S0)
658  : I8vec16(_S15, _S14, _S13, _S12, _S11, _S10, _S9, _S8,
659  _S7, _S6, _S5, _S4, _S3, _S2, _S1, _S0) { }
660 
661  /* Assignment Operator */
662  Is8vec16& operator= (const M128 &_A) { return *this = (Is8vec16) _A; }
663 
664  /* Logical Assignment Operators */
665  Is8vec16& operator&=(const M128 &_A) { return *this = (Is8vec16) _mm_and_si128(vec,_A); }
666  Is8vec16& operator|=(const M128 &_A) { return *this = (Is8vec16) _mm_or_si128(vec,_A); }
667  Is8vec16& operator^=(const M128 &_A) { return *this = (Is8vec16) _mm_xor_si128(vec,_A); }
668 
669  /* Addition & Subtraction Assignment Operators */
670  Is8vec16& operator +=(const I8vec16 &_A) { return *this = (Is8vec16) _mm_add_epi8(vec,_A); }
671  Is8vec16& operator -=(const I8vec16 &_A) { return *this = (Is8vec16) _mm_sub_epi8(vec,_A); }
672 
673 #if defined (_ENABLE_VEC_DEBUG)
674  /* Output for Debug */
675  friend std::ostream& operator << (std::ostream &_Os, const Is8vec16 &_A)
676  {
677  _Os << "[15]:" << short(_MM_16B(15,_A))
678  << " [14]:" << short(_MM_16B(14,_A))
679  << " [13]:" << short(_MM_16B(13,_A))
680  << " [12]:" << short(_MM_16B(12,_A))
681  << " [11]:" << short(_MM_16B(11,_A))
682  << " [10]:" << short(_MM_16B(10,_A))
683  << " [9]:" << short(_MM_16B(9,_A))
684  << " [8]:" << short(_MM_16B(8,_A))
685  << " [7]:" << short(_MM_16B(7,_A))
686  << " [6]:" << short(_MM_16B(6,_A))
687  << " [5]:" << short(_MM_16B(5,_A))
688  << " [4]:" << short(_MM_16B(4,_A))
689  << " [3]:" << short(_MM_16B(3,_A))
690  << " [2]:" << short(_MM_16B(2,_A))
691  << " [1]:" << short(_MM_16B(1,_A))
692  << " [0]:" << short(_MM_16B(0,_A));
693  return _Os;
694  }
695 #endif /* defined (_ENABLE_VEC_DEBUG) */
696 
697  /* Element Access for Debug, No data modified */
698  const signed char& operator[](int _I)const
699  {
700  _VEC_ASSERT(static_cast<unsigned int>(_I) < 16); /* Only 16 elements to access */
701  return _MM_16B(_I,vec);
702  }
703 
704  /* Element Access for Debug */
705  signed char& operator[](int _I)
706  {
707  _VEC_ASSERT(static_cast<unsigned int>(_I) < 16); /* Only 16 elements to access */
708  return _MM_16B(_I,vec);
709  }
710 
711 };
712 
713 inline Is8vec16 cmpeq(const Is8vec16 &_A, const Is8vec16 &_B) { return _mm_cmpeq_epi8(_A,_B); }
714 inline Is8vec16 cmpneq(const Is8vec16 &_A, const Is8vec16 &_B) { return _mm_andnot_si128(_mm_cmpeq_epi8(_A,_B), get_mask128()); }
715 inline Is8vec16 cmpgt(const Is8vec16 &_A, const Is8vec16 &_B) { return _mm_cmpgt_epi8(_A,_B); }
716 inline Is8vec16 cmplt(const Is8vec16 &_A, const Is8vec16 &_B) { return _mm_cmplt_epi8(_A,_B); }
717 
718 inline Is8vec16 unpack_low(const Is8vec16 &_A, const Is8vec16 &_B) { return _mm_unpacklo_epi8(_A,_B); }
719 inline Is8vec16 unpack_high(const Is8vec16 &_A, const Is8vec16 &_B) { return _mm_unpackhi_epi8(_A,_B); }
720 
721 inline Is8vec16 sat_add(const Is8vec16 &_A, const Is8vec16 &_B) { return _mm_adds_epi8(_A,_B); }
722 inline Is8vec16 sat_sub(const Is8vec16 &_A, const Is8vec16 &_B) { return _mm_subs_epi8(_A,_B); }
723 
724 /* Iu8vec16 Class:
725  * 16 elements, each element a unsigned char
726  */
727 class Iu8vec16 : public I8vec16
728 {
729 public:
730  Iu8vec16() { }
731  Iu8vec16(__m128i _Mm) : I8vec16(_Mm) { }
732  Iu8vec16(unsigned char _U15, unsigned char _U14, unsigned char _U13,
733  unsigned char _U12, unsigned char _U11, unsigned char _U10,
734  unsigned char _U9, unsigned char _U8, unsigned char _U7,
735  unsigned char _U6, unsigned char _U5, unsigned char _U4,
736  unsigned char _U3, unsigned char _U2, unsigned char _U1,
737  unsigned char _U0)
738  : I8vec16(_U15, _U14, _U13, _U12, _U11, _U10, _U9, _U8,
739  _U7, _U6, _U5, _U4, _U3, _U2, _U1, _U0) { }
740 
741  /* Assignment Operator */
742  Iu8vec16& operator= (const M128 &_A) { return *this = (Iu8vec16) _A; }
743 
744  /* Logical Assignment Operators */
745  Iu8vec16& operator&=(const M128 &_A) { return *this = (Iu8vec16) _mm_and_si128(vec,_A); }
746  Iu8vec16& operator|=(const M128 &_A) { return *this = (Iu8vec16) _mm_or_si128(vec,_A); }
747  Iu8vec16& operator^=(const M128 &_A) { return *this = (Iu8vec16) _mm_xor_si128(vec,_A); }
748 
749  /* Addition & Subtraction Assignment Operators */
750  Iu8vec16& operator +=(const I8vec16 &_A) { return *this = (Iu8vec16) _mm_add_epi8(vec,_A); }
751  Iu8vec16& operator -=(const I8vec16 &_A) { return *this = (Iu8vec16) _mm_sub_epi8(vec,_A); }
752 
753 #if defined (_ENABLE_VEC_DEBUG)
754  /* Output for Debug */
755  friend std::ostream& operator << (std::ostream &_Os, const Iu8vec16 &_A)
756  {
757  _Os << "[15]:" << (unsigned char)(_MM_16UB(15,_A))
758  << " [14]:" << (unsigned char)(_MM_16UB(14,_A))
759  << " [13]:" << (unsigned char)(_MM_16UB(13,_A))
760  << " [12]:" << (unsigned char)(_MM_16UB(12,_A))
761  << " [11]:" << (unsigned char)(_MM_16UB(11,_A))
762  << " [10]:" << (unsigned char)(_MM_16UB(10,_A))
763  << " [9]:" << (unsigned char)(_MM_16UB(9,_A))
764  << " [8]:" << (unsigned char)(_MM_16UB(8,_A))
765  << " [7]:" << (unsigned char)(_MM_16UB(7,_A))
766  << " [6]:" << (unsigned char)(_MM_16UB(6,_A))
767  << " [5]:" << (unsigned char)(_MM_16UB(5,_A))
768  << " [4]:" << (unsigned char)(_MM_16UB(4,_A))
769  << " [3]:" << (unsigned char)(_MM_16UB(3,_A))
770  << " [2]:" << (unsigned char)(_MM_16UB(2,_A))
771  << " [1]:" << (unsigned char)(_MM_16UB(1,_A))
772  << " [0]:" << (unsigned char)(_MM_16UB(0,_A));
773  return _Os;
774  }
775 #endif /* defined (_ENABLE_VEC_DEBUG) */
776 
777  /* Element Access for Debug, No data modified */
778  const unsigned char& operator[](int _I)const
779  {
780  _VEC_ASSERT(static_cast<unsigned int>(_I) < 16); /* Only 16 elements to access */
781  return _MM_16UB(_I,vec);
782  }
783 
784  /* Element Access for Debug */
785  unsigned char& operator[](int _I)
786  {
787  _VEC_ASSERT(static_cast<unsigned int>(_I) < 16); /* Only 16 elements to access */
788  return _MM_16UB(_I,vec);
789  }
790 
791 };
792 
793 inline Iu8vec16 cmpeq(const Iu8vec16 &_A, const Iu8vec16 &_B) { return _mm_cmpeq_epi8(_A,_B); }
794 inline Iu8vec16 cmpneq(const Iu8vec16 &_A, const Iu8vec16 &_B) { return _mm_andnot_si128(_mm_cmpeq_epi8(_A,_B), get_mask128()); }
795 
796 inline Iu8vec16 unpack_low(const Iu8vec16 &_A, const Iu8vec16 &_B) { return _mm_unpacklo_epi8(_A,_B); }
797 inline Iu8vec16 unpack_high(const Iu8vec16 &_A, const Iu8vec16 &_B) { return _mm_unpackhi_epi8(_A,_B); }
798 
799 inline Iu8vec16 sat_add(const Iu8vec16 &_A, const Iu8vec16 &_B) { return _mm_adds_epu8(_A,_B); }
800 inline Iu8vec16 sat_sub(const Iu8vec16 &_A, const Iu8vec16 &_B) { return _mm_subs_epu8(_A,_B); }
801 
802 inline I64vec2 sum_abs(const Iu8vec16 &_A, const Iu8vec16 &_B) { return _mm_sad_epu8(_A,_B); }
803 
804 inline Iu8vec16 simd_avg(const Iu8vec16 &_A, const Iu8vec16 &_B) { return _mm_avg_epu8(_A,_B); }
805 inline Iu8vec16 simd_max(const Iu8vec16 &_A, const Iu8vec16 &_B) { return _mm_max_epu8(_A,_B); }
806 inline Iu8vec16 simd_min(const Iu8vec16 &_A, const Iu8vec16 &_B) { return _mm_min_epu8(_A,_B); }
807 
808 /* Pack & Saturates */
809 
810 inline Is16vec8 pack_sat(const Is32vec4 &_A, const Is32vec4 &_B) { return _mm_packs_epi32(_A,_B); }
811 inline Is8vec16 pack_sat(const Is16vec8 &_A, const Is16vec8 &_B) { return _mm_packs_epi16(_A,_B); }
812 inline Iu8vec16 packu_sat(const Is16vec8 &_A, const Is16vec8 &_B) { return _mm_packus_epi16(_A,_B);}
813 
814  /********************************* Logicals ****************************************/
815 #define IVEC128_LOGICALS(vect,element) \
816 inline I##vect##vec##element operator& (const I##vect##vec##element &_A, const I##vect##vec##element &_B) \
817 { return _mm_and_si128( _A,_B); } \
818 inline I##vect##vec##element operator| (const I##vect##vec##element &_A, const I##vect##vec##element &_B) \
819 { return _mm_or_si128( _A,_B); } \
820 inline I##vect##vec##element operator^ (const I##vect##vec##element &_A, const I##vect##vec##element &_B) \
821 { return _mm_xor_si128( _A,_B); } \
822 inline I##vect##vec##element andnot (const I##vect##vec##element &_A, const I##vect##vec##element &_B) \
823 { return _mm_andnot_si128( _A,_B); }
824 
825 IVEC128_LOGICALS(8,16)
826 IVEC128_LOGICALS(u8,16)
827 IVEC128_LOGICALS(s8,16)
828 IVEC128_LOGICALS(16,8)
829 IVEC128_LOGICALS(u16,8)
830 IVEC128_LOGICALS(s16,8)
831 IVEC128_LOGICALS(32,4)
832 IVEC128_LOGICALS(u32,4)
833 IVEC128_LOGICALS(s32,4)
834 IVEC128_LOGICALS(64,2)
835 IVEC128_LOGICALS(128,1)
836 #undef IVEC128_LOGICALS
837 
838  /********************************* Add & Sub ****************************************/
839 #define IVEC128_ADD_SUB(vect,element,opsize) \
840 inline I##vect##vec##element operator+ (const I##vect##vec##element &_A, const I##vect##vec##element &_B) \
841 { return _mm_add_##opsize( _A,_B); } \
842 inline I##vect##vec##element operator- (const I##vect##vec##element &_A, const I##vect##vec##element &_B) \
843 { return _mm_sub_##opsize( _A,_B); }
844 
845 IVEC128_ADD_SUB(8,16, epi8)
846 IVEC128_ADD_SUB(u8,16, epi8)
847 IVEC128_ADD_SUB(s8,16, epi8)
848 IVEC128_ADD_SUB(16,8, epi16)
849 IVEC128_ADD_SUB(u16,8, epi16)
850 IVEC128_ADD_SUB(s16,8, epi16)
851 IVEC128_ADD_SUB(32,4, epi32)
852 IVEC128_ADD_SUB(u32,4, epi32)
853 IVEC128_ADD_SUB(s32,4, epi32)
854 IVEC128_ADD_SUB(64,2, epi64)
855 #undef IVEC128_ADD_SUB
856 
857  /************************* Conditional Select ********************************
858  * version of: retval = (a OP b)? c : d; *
859  * Where OP is one of the possible comparision operators. *
860  * Example: r = select_eq(a,b,c,d); *
861  * if "member at position x of the vector a" == *
862  * "member at position x of vector b" *
863  * assign the corresponding member in r from c, else assign from d. *
864  ************************* Conditional Select ********************************/
865 
866 #define IVEC128_SELECT(vect12,vect34,element,selop) \
867  inline I##vect34##vec##element select_##selop ( \
868  const I##vect12##vec##element &_A, \
869  const I##vect12##vec##element &_B, \
870  const I##vect34##vec##element &_C, \
871  const I##vect34##vec##element &_D) \
872 { \
873  I##vect12##vec##element _Mask = cmp##selop(_A,_B); \
874  return ( I##vect34##vec##element (_Mask & _C ) | \
875  I##vect34##vec##element ((_mm_andnot_si128(_Mask, _D )))); \
876 }
877 
878 IVEC128_SELECT(8,s8,16,eq)
879 IVEC128_SELECT(8,u8,16,eq)
880 IVEC128_SELECT(8,8,16,eq)
881 IVEC128_SELECT(8,s8,16,neq)
882 IVEC128_SELECT(8,u8,16,neq)
883 IVEC128_SELECT(8,8,16,neq)
884 
885 IVEC128_SELECT(16,s16,8,eq)
886 IVEC128_SELECT(16,u16,8,eq)
887 IVEC128_SELECT(16,16,8,eq)
888 IVEC128_SELECT(16,s16,8,neq)
889 IVEC128_SELECT(16,u16,8,neq)
890 IVEC128_SELECT(16,16,8,neq)
891 
892 IVEC128_SELECT(32,s32,4,eq)
893 IVEC128_SELECT(32,u32,4,eq)
894 IVEC128_SELECT(32,32,4,eq)
895 IVEC128_SELECT(32,s32,4,neq)
896 IVEC128_SELECT(32,u32,4,neq)
897 IVEC128_SELECT(32,32,4,neq)
898 
899 IVEC128_SELECT(s8,s8,16,gt)
900 IVEC128_SELECT(s8,u8,16,gt)
901 IVEC128_SELECT(s8,8,16,gt)
902 IVEC128_SELECT(s8,s8,16,lt)
903 IVEC128_SELECT(s8,u8,16,lt)
904 IVEC128_SELECT(s8,8,16,lt)
905 
906 IVEC128_SELECT(s16,s16,8,gt)
907 IVEC128_SELECT(s16,u16,8,gt)
908 IVEC128_SELECT(s16,16,8,gt)
909 IVEC128_SELECT(s16,s16,8,lt)
910 IVEC128_SELECT(s16,u16,8,lt)
911 IVEC128_SELECT(s16,16,8,lt)
912 
913 
914 #undef IVEC128_SELECT
915 
916 
917 class F64vec2
918 {
919 protected:
921 public:
922 
923  /* Constructors: __m128d, 2 doubles */
924  F64vec2() {}
925 
926  /* initialize 2 DP FP with __m128d data type */
927  F64vec2(__m128d _M) { vec = _M;}
928 
929  /* initialize 2 DP FPs with 2 doubles */
930  F64vec2(double _D1, double _D0) { vec= _mm_set_pd(_D1,_D0); }
931 
932  /* Explicitly initialize each of 2 DP FPs with same double */
933  explicit F64vec2(double _D) { vec = _mm_set1_pd(_D); }
934 
935  /* Conversion functions */
936  operator __m128d() const { return vec; } /* Convert to __m128d */
937 
938  /* Logical Operators */
939  friend F64vec2 operator &(const F64vec2 &_A, const F64vec2 &_B) { return _mm_and_pd(_A,_B); }
940  friend F64vec2 operator |(const F64vec2 &_A, const F64vec2 &_B) { return _mm_or_pd(_A,_B); }
941  friend F64vec2 operator ^(const F64vec2 &_A, const F64vec2 &_B) { return _mm_xor_pd(_A,_B); }
942 
943  /* Arithmetic Operators */
944  friend F64vec2 operator +(const F64vec2 &_A, const F64vec2 &_B) { return _mm_add_pd(_A,_B); }
945  friend F64vec2 operator -(const F64vec2 &_A, const F64vec2 &_B) { return _mm_sub_pd(_A,_B); }
946  friend F64vec2 operator *(const F64vec2 &_A, const F64vec2 &_B) { return _mm_mul_pd(_A,_B); }
947  friend F64vec2 operator /(const F64vec2 &_A, const F64vec2 &_B) { return _mm_div_pd(_A,_B); }
948 
949  F64vec2& operator +=(const F64vec2 &_A) { return *this = _mm_add_pd(vec,_A); }
950  F64vec2& operator -=(const F64vec2 &_A) { return *this = _mm_sub_pd(vec,_A); }
951  F64vec2& operator *=(const F64vec2 &_A) { return *this = _mm_mul_pd(vec,_A); }
952  F64vec2& operator /=(const F64vec2 &_A) { return *this = _mm_div_pd(vec,_A); }
953  F64vec2& operator &=(const F64vec2 &_A) { return *this = _mm_and_pd(vec,_A); }
954  F64vec2& operator |=(const F64vec2 &_A) { return *this = _mm_or_pd(vec,_A); }
955  F64vec2& operator ^=(const F64vec2 &_A) { return *this = _mm_xor_pd(vec,_A); }
956 
957  /* Horizontal Add */
958  friend double add_horizontal(const F64vec2 &_A)
959  {
960  F64vec2 _Ftemp = _mm_add_sd(_A,_mm_shuffle_pd(_A, _A, 1));
961  return _mm_cvtsd_f64(_Ftemp);
962  }
963 
964  /* And Not */
965  friend F64vec2 andnot(const F64vec2 &_A, const F64vec2 &_B) { return _mm_andnot_pd(_A,_B); }
966 
967  /* Square Root */
968  friend F64vec2 sqrt(const F64vec2 &_A) { return _mm_sqrt_pd(_A); }
969 
970  /* Compares: Mask is returned */
971  /* Macros expand to all compare intrinsics. Example:
972  friend F64vec2 cmpeq(const F64vec2 &_A, const F64vec2 &_B)
973  { return _mm_cmpeq_ps(_A,_B);} */
974  #define F64vec2_COMP(op) \
975  friend F64vec2 cmp##op (const F64vec2 &_A, const F64vec2 &_B) { return _mm_cmp##op##_pd(_A,_B); }
976  F64vec2_COMP(eq) /* expanded to cmpeq(_A,_B) */
977  F64vec2_COMP(lt) /* expanded to cmplt(_A,_B) */
978  F64vec2_COMP(le) /* expanded to cmple(_A,_B) */
979  F64vec2_COMP(gt) /* expanded to cmpgt(_A,_B) */
980  F64vec2_COMP(ge) /* expanded to cmpge(_A,_B) */
981  F64vec2_COMP(ngt) /* expanded to cmpngt(_A,_B) */
982  F64vec2_COMP(nge) /* expanded to cmpnge(_A,_B) */
983  F64vec2_COMP(neq) /* expanded to cmpneq(_A,_B) */
984  F64vec2_COMP(nlt) /* expanded to cmpnlt(_A,_B) */
985  F64vec2_COMP(nle) /* expanded to cmpnle(_A,_B) */
986  #undef F64vec2_COMP
987 
988  /* Min and Max */
989  friend F64vec2 simd_min(const F64vec2 &_A, const F64vec2 &_B) { return _mm_min_pd(_A,_B); }
990  friend F64vec2 simd_max(const F64vec2 &_A, const F64vec2 &_B) { return _mm_max_pd(_A,_B); }
991 
992  /* Absolute value */
993  friend F64vec2 abs(const F64vec2 &_A)
994  {
995  return _mm_and_pd(_A, _f64vec2_abs_mask);
996  }
997 
998  /* Compare lower DP FP values */
999  #define F64vec2_COMI(op) \
1000  friend int comi##op (const F64vec2 &_A, const F64vec2 &_B) { return _mm_comi##op##_sd(_A,_B); }
1001  F64vec2_COMI(eq) /* expanded to comieq(_A,_B) */
1002  F64vec2_COMI(lt) /* expanded to comilt(_A,_B) */
1003  F64vec2_COMI(le) /* expanded to comile(_A,_B) */
1004  F64vec2_COMI(gt) /* expanded to comigt(_A,_B) */
1005  F64vec2_COMI(ge) /* expanded to comige(_A,_B) */
1006  F64vec2_COMI(neq) /* expanded to comineq(_A,_B) */
1007  #undef F64vec2_COMI
1008 
1009  /* Compare lower DP FP values */
1010  #define F64vec2_UCOMI(op) \
1011  friend int ucomi##op (const F64vec2 &_A, const F64vec2 &_B) { return _mm_ucomi##op##_sd(_A,_B); }
1012  F64vec2_UCOMI(eq) /* expanded to ucomieq(_A,_B) */
1013  F64vec2_UCOMI(lt) /* expanded to ucomilt(_A,_B) */
1014  F64vec2_UCOMI(le) /* expanded to ucomile(_A,_B) */
1015  F64vec2_UCOMI(gt) /* expanded to ucomigt(_A,_B) */
1016  F64vec2_UCOMI(ge) /* expanded to ucomige(_A,_B) */
1017  F64vec2_UCOMI(neq) /* expanded to ucomineq(_A,_B) */
1018  #undef F64vec2_UCOMI
1019 
1020  /* Debug Features */
1021 #if defined (_ENABLE_VEC_DEBUG)
1022  /* Output */
1023  friend std::ostream & operator<<(std::ostream & _Os, const F64vec2 &_A)
1024  {
1025  /* To use: cout << "Elements of F64vec2 fvec are: " << fvec; */
1026  double *_Dp = (double*)&_A;
1027  _Os << "[1]:" << *(_Dp+1)
1028  << " [0]:" << *_Dp;
1029  return _Os;
1030  }
1031 #endif /* defined (_ENABLE_VEC_DEBUG) */
1032  /* Element Access Only, no modifications to elements*/
1033  const double& operator[](int _I) const
1034  {
1035  /* Assert enabled only during debug /DDEBUG */
1036  _VEC_ASSERT((0 <= _I) && (_I <= 1)); /* User should only access elements 0-1 */
1037  double *_Dp = (double*)&vec;
1038  return *(_Dp+ _I);
1039  }
1040  /* Element Access and Modification*/
1041  double& operator[](int _I)
1042  {
1043  /* Assert enabled only during debug /DDEBUG */
1044  _VEC_ASSERT((0 <= _I) && (_I <= 1)); /* User should only access elements 0-1 */
1045  double *_Dp = (double*)&vec;
1046  return *(_Dp+ _I);
1047  }
1048 };
1049 
1050  /* Miscellaneous */
1051 
1052 /* Interleave low order data elements of a and b into destination */
1053 inline F64vec2 unpack_low(const F64vec2 &_A, const F64vec2 &_B)
1054 { return _mm_unpacklo_pd(_A, _B); }
1055 
1056 /* Interleave high order data elements of a and b into target */
1057 inline F64vec2 unpack_high(const F64vec2 &_A, const F64vec2 &_B)
1058 { return _mm_unpackhi_pd(_A, _B); }
1059 
1060 /* Move Mask to Integer returns 4 bit mask formed of most significant bits of a */
1061 inline int move_mask(const F64vec2 &_A)
1062 { return _mm_movemask_pd(_A);}
1063 
1064  /* Data Motion Functions */
1065 
1066 /* Load Unaligned loadu_pd: Unaligned */
1067 inline void loadu(F64vec2 &_A, double *_P)
1068 { _A = _mm_loadu_pd(_P); }
1069 
1070 /* Store Temporal storeu_pd: Unaligned */
1071 inline void storeu(double *_P, const F64vec2 &_A)
1072 { _mm_storeu_pd(_P, _A); }
1073 
1074  /* Cacheability Support */
1075 
1076 /* Non-Temporal Store */
1077 inline void store_nta(double *_P, F64vec2 &_A)
1078 { _mm_stream_pd(_P,_A);}
1079 
1080 #define F64vec2_SELECT(op) \
1081 inline F64vec2 select_##op (const F64vec2 &_A, const F64vec2 &_B, const F64vec2 &_C, const F64vec2 &_D) \
1082 { \
1083  F64vec2 _Mask = _mm_cmp##op##_pd(_A,_B); \
1084  return( (_Mask & _C) | F64vec2((_mm_andnot_pd(_Mask,_D)))); \
1085 }
1086 F64vec2_SELECT(eq) /* generates select_eq(_A,_B) */
1087 F64vec2_SELECT(lt) /* generates select_lt(_A,_B) */
1088 F64vec2_SELECT(le) /* generates select_le(_A,_B) */
1089 F64vec2_SELECT(gt) /* generates select_gt(_A,_B) */
1090 F64vec2_SELECT(ge) /* generates select_ge(_A,_B) */
1091 F64vec2_SELECT(neq) /* generates select_neq(_A,_B) */
1092 F64vec2_SELECT(nlt) /* generates select_nlt(_A,_B) */
1093 F64vec2_SELECT(nle) /* generates select_nle(_A,_B) */
1094 #undef F64vec2_SELECT
1095 
1096 /* Convert the lower DP FP value of a to a 32 bit signed integer using Truncate*/
1097 inline int F64vec2ToInt(const F64vec2 &_A)
1098 {
1099 
1100  return _mm_cvttsd_si32(_A);
1101 
1102 }
1103 
1104 /* Convert the 4 SP FP values of a to DP FP values */
1106 {
1107  return _mm_cvtps_pd(_A);
1108 }
1109 
1110 /* Convert the 2 DP FP values of a to SP FP values */
1112 {
1113  return _mm_cvtpd_ps(_A);
1114 }
1115 
1116 /* Convert the signed int in b to a DP FP value. Upper DP FP value in a passed through */
1117 inline F64vec2 IntToF64vec2(const F64vec2 &_A, int _B)
1118 {
1119  return _mm_cvtsi32_sd(_A,_B);
1120 }
1121 
1122 #pragma pack(pop) /* 16-B aligned */
1123 
1124  /******************************************************************************/
1125  /************** Interface classes for Intel(R) AVX intrinsics *****************/
1126  /******************************************************************************/
1127 
1128 /*
1129  * class F32vec8
1130  *
1131  * Represents 256-bit vector composed of 8 single precision floating point elements.
1132  */
1133 class F32vec8
1134 {
1135 protected:
1137 
1138 public:
1139 
1140  /* Constructors: __m256, 8 floats, 1 float */
1141  F32vec8() {}
1142 
1143  /* initialize 8 SP FP with __m256 data type */
1144  F32vec8(__m256 _M) { vec = _M; }
1145 
1146  /* initialize 8 SP FPs with 8 floats */
1147  F32vec8(float _F7, float _F6, float _F5, float _F4, float _F3, float _F2, float _F1, float _F0)
1148  {
1149  vec = _mm256_set_ps(_F7, _F6, _F5, _F4, _F3, _F2,_F1, _F0);
1150  }
1151 
1152  /* Explicitly initialize each of 8 SP FPs with same float */
1153  explicit F32vec8(float _F) { vec = _mm256_set1_ps(_F); }
1154 
1155  /* Explicitly initialize each of 8 SP FPs with same double */
1156  explicit F32vec8(double _D) { vec = _mm256_set1_ps((float) _D); }
1157 
1158  /* Assignment operations */
1159  F32vec8& operator =(float _F)
1160  {
1161  vec = _mm256_set1_ps(_F);
1162  return *this;
1163  }
1164 
1165  F32vec8& operator =(double _D)
1166  {
1167  vec = _mm256_set1_ps((float) _D);
1168  return *this;
1169  }
1170 
1171  /* Conversion functions */
1172  operator __m256() const { return vec; }
1173 
1174  /* Logical Operators */
1175  friend F32vec8 operator &(const F32vec8 &_A, const F32vec8 &_B) { return _mm256_and_ps(_A,_B); }
1176  friend F32vec8 operator |(const F32vec8 &_A, const F32vec8 &_B) { return _mm256_or_ps(_A,_B); }
1177  friend F32vec8 operator ^(const F32vec8 &_A, const F32vec8 &_B) { return _mm256_xor_ps(_A,_B); }
1178 
1179  /* Arithmetic Operators */
1180  friend F32vec8 operator +(const F32vec8 &_A, const F32vec8 &_B) { return _mm256_add_ps(_A,_B); }
1181  friend F32vec8 operator -(const F32vec8 &_A, const F32vec8 &_B) { return _mm256_sub_ps(_A,_B); }
1182  friend F32vec8 operator *(const F32vec8 &_A, const F32vec8 &_B) { return _mm256_mul_ps(_A,_B); }
1183  friend F32vec8 operator /(const F32vec8 &_A, const F32vec8 &_B) { return _mm256_div_ps(_A,_B); }
1184 
1185  F32vec8& operator +=(const F32vec8 &_A) { return *this = _mm256_add_ps(vec,_A); }
1186  F32vec8& operator -=(const F32vec8 &_A) { return *this = _mm256_sub_ps(vec,_A); }
1187  F32vec8& operator *=(const F32vec8 &_A) { return *this = _mm256_mul_ps(vec,_A); }
1188  F32vec8& operator /=(const F32vec8 &_A) { return *this = _mm256_div_ps(vec,_A); }
1189  F32vec8& operator &=(const F32vec8 &_A) { return *this = _mm256_and_ps(vec,_A); }
1190  F32vec8& operator |=(const F32vec8 &_A) { return *this = _mm256_or_ps(vec,_A); }
1191  F32vec8& operator ^=(const F32vec8 &_A) { return *this = _mm256_xor_ps(vec,_A); }
1192 
1193  /* Horizontal Add */
1194  friend float add_horizontal(const F32vec8 &_A)
1195  {
1196  F32vec8 _Temp = _mm256_add_ps(_A, _mm256_permute_ps(_A, 0xee));
1197  _Temp = _mm256_add_ps(_Temp, _mm256_movehdup_ps(_Temp));
1199  }
1200 
1201  /* And Not */
1202  friend F32vec8 andnot(const F32vec8 &_A, const F32vec8 &_B) { return _mm256_andnot_ps(_A,_B); }
1203 
1204  /* Square Root */
1205  friend F32vec8 sqrt(const F32vec8 &_A) { return _mm256_sqrt_ps(_A); }
1206 
1207  /* Reciprocal */
1208  friend F32vec8 rcp(const F32vec8 &_A) { return _mm256_rcp_ps(_A); }
1209 
1210  /* Reciprocal Square Root */
1211  friend F32vec8 rsqrt(const F32vec8 &_A) { return _mm256_rsqrt_ps(_A); }
1212 
1213  /*
1214  * NewtonRaphson Reciprocal
1215  * [2 * rcpps(x) - (x * rcpps(x) * rcpps(x))]
1216  */
1217  friend F32vec8 rcp_nr(const F32vec8 &_A)
1218  {
1219  F32vec8 _Ra0 = _mm256_rcp_ps(_A);
1220  return _mm256_sub_ps(_mm256_add_ps(_Ra0, _Ra0), _mm256_mul_ps(_mm256_mul_ps(_Ra0, _A), _Ra0));
1221  }
1222 
1223  /*
1224  * NewtonRaphson Reciprocal Square Root
1225  * 0.5 * rsqrtps * (3 - x * rsqrtps(x) * rsqrtps(x))
1226  */
1227  friend F32vec8 rsqrt_nr(const F32vec8 &_A)
1228  {
1229 #pragma warning(push)
1230 #pragma warning(disable:4640)
1231  static const F32vec8 fvecf0pt5(0.5f);
1232  static const F32vec8 fvecf3pt0(3.0f);
1233 #pragma warning(pop)
1234  F32vec8 _Ra0 = _mm256_rsqrt_ps(_A);
1235  return (fvecf0pt5 * _Ra0) * (fvecf3pt0 - (_A * _Ra0) * _Ra0);
1236 
1237  }
1238 
1239  /* Compares: Mask is returned */
1240  friend F32vec8 cmp_eq(const F32vec8 &_A, const F32vec8 &_B)
1241  { return _mm256_cmp_ps(_A, _B, _CMP_EQ_OQ); }
1242  friend F32vec8 cmp_lt(const F32vec8 &_A, const F32vec8 &_B)
1243  { return _mm256_cmp_ps(_A, _B, _CMP_LT_OS); }
1244  friend F32vec8 cmp_le(const F32vec8 &_A, const F32vec8 &_B)
1245  { return _mm256_cmp_ps(_A, _B, _CMP_LE_OS); }
1246  friend F32vec8 cmp_gt(const F32vec8 &_A, const F32vec8 &_B)
1247  { return _mm256_cmp_ps(_A, _B, _CMP_GT_OS); }
1248  friend F32vec8 cmp_ge(const F32vec8 &_A, const F32vec8 &_B)
1249  { return _mm256_cmp_ps(_A, _B, _CMP_GE_OS); }
1250  friend F32vec8 cmp_neq(const F32vec8 &_A, const F32vec8 &_B)
1251  { return _mm256_cmp_ps(_A, _B, _CMP_NEQ_UQ); }
1252  friend F32vec8 cmp_nlt(const F32vec8 &_A, const F32vec8 &_B)
1253  { return _mm256_cmp_ps(_A, _B, _CMP_NLT_US); }
1254  friend F32vec8 cmp_nle(const F32vec8 &_A, const F32vec8 &_B)
1255  { return _mm256_cmp_ps(_A, _B, _CMP_NLE_US); }
1256  friend F32vec8 cmp_ngt(const F32vec8 &_A, const F32vec8 &_B)
1257  { return _mm256_cmp_ps(_A, _B, _CMP_NGT_US); }
1258  friend F32vec8 cmp_nge(const F32vec8 &_A, const F32vec8 &_B)
1259  { return _mm256_cmp_ps(_A, _B, _CMP_NGE_US); }
1260 
1261  /* Min and Max */
1262  friend F32vec8 simd_min(const F32vec8 &_A, const F32vec8 &_B)
1263  { return _mm256_min_ps(_A,_B); }
1264  friend F32vec8 simd_max(const F32vec8 &_A, const F32vec8 &_B)
1265  { return _mm256_max_ps(_A,_B); }
1266 
1267  /* Absolute value */
1268  friend F32vec8 abs(const F32vec8 &_A)
1269  {
1270  static const union
1271  {
1272  int i[8];
1273  __m256 m;
1274  } __f32vec8_abs_mask = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
1275  0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
1276  return _mm256_and_ps(_A, __f32vec8_abs_mask.m);
1277  }
1278 
1279  /* Debug Features */
1280 #if defined (_ENABLE_VEC_DEBUG)
1281  /* Output */
1282  friend DVEC_STD ostream & operator<<(DVEC_STD ostream &_Os, const F32vec8 &_A)
1283  {
1284  /* To use: cout << "Elements of F32vec8 fvec are: " << fvec; */
1285  float *_Fp = (float*) &_A;
1286  _Os << "[7]:" << *(_Fp+7)
1287  << " [6]:" << *(_Fp+6)
1288  << " [5]:" << *(_Fp+5)
1289  << " [4]:" << *(_Fp+4)
1290  << " [3]:" << *(_Fp+3)
1291  << " [2]:" << *(_Fp+2)
1292  << " [1]:" << *(_Fp+1)
1293  << " [0]:" << *_Fp;
1294  return _Os;
1295  }
1296 #endif /* defined (_ENABLE_VEC_DEBUG) */
1297 
1298  /* Element Access Only, no modifications to elements*/
1299  const float& operator[](int _I) const
1300  {
1301  /* Assert enabled only during debug /DDEBUG */
1302  _VEC_ASSERT((0 <= _I) && (_I <= 7));
1303  float *_Fp = (float*)&vec;
1304  return *(_Fp+ _I);
1305  }
1306 
1307  /* Element Access and Modification*/
1308  float& operator[](int _I)
1309  {
1310  /* Assert enabled only during debug /DDEBUG */
1311  _VEC_ASSERT((0 <= _I) && (_I <= 7));
1312  float *_Fp = (float*)&vec;
1313  return *(_Fp+ _I);
1314  }
1315 };
1316 
1317  /* Miscellaneous */
1318 
1319 /* Interleave low order data elements of a and b into destination */
1320 inline F32vec8 unpack_low(const F32vec8 &_A, const F32vec8 &_B){
1321  return _mm256_unpacklo_ps(_A, _B); }
1322 
1323 /* Interleave high order data elements of a and b into target */
1324 inline F32vec8 unpack_high(const F32vec8 &_A, const F32vec8 &_B){
1325  return _mm256_unpackhi_ps(_A, _B); }
1326 
1327 /* Move Mask to Integer returns 8 bit mask formed of most significant bits of a */
1328 inline int move_mask(const F32vec8 &_A){
1329  return _mm256_movemask_ps(_A); }
1330 
1331  /* Data Motion Functions */
1332 
1333 /* Load Unaligned loadu_ps: Unaligned */
1334 inline void loadu(F32vec8 &_A, const float *_P){
1335  _A = _mm256_loadu_ps(_P); }
1336 
1337 /* Store Unaligned storeu_ps: Unaligned */
1338 inline void storeu(float *_P, const F32vec8 &_A){
1339  _mm256_storeu_ps(_P, _A); }
1340 
1341  /* Cacheability Support */
1342 
1343 /* Non-Temporal Store */
1344 inline void store_nta(float *_P, const F32vec8 &_A){
1345  _mm256_stream_ps(_P, _A); }
1346 
1347  /* Conditional moves */
1348 
1349 /* Masked load */
1350 inline void maskload(F32vec8 &_A, const float *_P, const F32vec8 &_M){
1351  _A = _mm256_maskload_ps(_P, _mm256_castps_si256(_M)); }
1352 
1353 inline void maskload(F32vec4 &_A, const float *_P, const F32vec4 &_M){
1354  _A = _mm_maskload_ps(_P, _mm_castps_si128(_M)); }
1355 
1356 /* Masked store */
1357 inline void maskstore(float *_P, const F32vec8 &_A, const F32vec8 &_M){
1359 
1360 inline void maskstore(float *_P, const F32vec4 &_A, const F32vec4 &_M){
1361  _mm_maskstore_ps(_P, _mm_castps_si128(_M), _A); }
1362 
1363  /* Conditional Selects */
1364 
1365 inline F32vec8 select_eq(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D){
1366  return _mm256_blendv_ps(_D, _C, _mm256_cmp_ps(_A, _B, _CMP_EQ_OQ)); }
1367 
1368 inline F32vec8 select_lt(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D){
1369  return _mm256_blendv_ps(_D, _C, _mm256_cmp_ps(_A, _B, _CMP_LT_OS)); }
1370 
1371 inline F32vec8 select_le(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D){
1372  return _mm256_blendv_ps(_D, _C, _mm256_cmp_ps(_A, _B, _CMP_LE_OS)); }
1373 
1374 inline F32vec8 select_gt(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D){
1375  return _mm256_blendv_ps(_D, _C, _mm256_cmp_ps(_A, _B, _CMP_GT_OS)); }
1376 
1377 inline F32vec8 select_ge(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D){
1378  return _mm256_blendv_ps(_D, _C, _mm256_cmp_ps(_A, _B, _CMP_GE_OS)); }
1379 
1380 inline F32vec8 select_neq(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D){
1381  return _mm256_blendv_ps(_D, _C, _mm256_cmp_ps(_A, _B, _CMP_NEQ_UQ)); }
1382 
1383 inline F32vec8 select_nlt(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D){
1384  return _mm256_blendv_ps(_D, _C, _mm256_cmp_ps(_A, _B, _CMP_NLT_US)); }
1385 
1386 inline F32vec8 select_nle(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D){
1387  return _mm256_blendv_ps(_D, _C, _mm256_cmp_ps(_A, _B, _CMP_NLE_US)); }
1388 
1389 inline F32vec8 select_ngt(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D){
1390  return _mm256_blendv_ps(_D, _C, _mm256_cmp_ps(_A, _B, _CMP_NGT_US)); }
1391 
1392 inline F32vec8 select_nge(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D){
1393  return _mm256_blendv_ps(_D, _C, _mm256_cmp_ps(_A, _B, _CMP_NGE_US)); }
1394 
1395 /*
1396  * class F64vec4
1397  *
1398  * Represents 256-bit vector composed of 4 double precision floating point elements.
1399  */
1400 class F64vec4
1401 {
1402 protected:
1404 
1405 public:
1406 
1407  /* Constructors: __m256d, 4 doubles */
1408  F64vec4() {}
1409 
1410  /* initialize 4 DP FP with __m256d data type */
1411  F64vec4(__m256d m) { vec = m; }
1412 
1413  /* initialize 4 DP FPs with 4 doubles */
1414  F64vec4(double _D3, double _D2, double _D1, double _D0)
1415  {
1416  vec = _mm256_set_pd(_D3,_D2,_D1,_D0);
1417  }
1418 
1419  /* Explicitly initialize each of 4 DP FPs with same double */
1420  explicit F64vec4(double _D) { vec = _mm256_set1_pd(_D); }
1421 
1422  /* Conversion functions */
1423  operator __m256d() const { return vec; }
1424 
1425  /* Logical Operators */
1426  friend F64vec4 operator &(const F64vec4 &_A, const F64vec4 &_B) { return _mm256_and_pd(_A,_B); }
1427  friend F64vec4 operator |(const F64vec4 &_A, const F64vec4 &_B) { return _mm256_or_pd(_A,_B); }
1428  friend F64vec4 operator ^(const F64vec4 &_A, const F64vec4 &_B) { return _mm256_xor_pd(_A,_B); }
1429 
1430  /* Arithmetic Operators */
1431  friend F64vec4 operator +(const F64vec4 &_A, const F64vec4 &_B) { return _mm256_add_pd(_A,_B); }
1432  friend F64vec4 operator -(const F64vec4 &_A, const F64vec4 &_B) { return _mm256_sub_pd(_A,_B); }
1433  friend F64vec4 operator *(const F64vec4 &_A, const F64vec4 &_B) { return _mm256_mul_pd(_A,_B); }
1434  friend F64vec4 operator /(const F64vec4 &_A, const F64vec4 &_B) { return _mm256_div_pd(_A,_B); }
1435 
1436  F64vec4& operator +=(const F64vec4 &_A) { return *this = _mm256_add_pd(vec,_A); }
1437  F64vec4& operator -=(const F64vec4 &_A) { return *this = _mm256_sub_pd(vec,_A); }
1438  F64vec4& operator *=(const F64vec4 &_A) { return *this = _mm256_mul_pd(vec,_A); }
1439  F64vec4& operator /=(const F64vec4 &_A) { return *this = _mm256_div_pd(vec,_A); }
1440  F64vec4& operator &=(const F64vec4 &_A) { return *this = _mm256_and_pd(vec,_A); }
1441  F64vec4& operator |=(const F64vec4 &_A) { return *this = _mm256_or_pd(vec,_A); }
1442  F64vec4& operator ^=(const F64vec4 &_A) { return *this = _mm256_xor_pd(vec,_A); }
1443 
1444  /* Horizontal Add */
1445  friend double add_horizontal(const F64vec4 &_A)
1446  {
1447  F64vec4 _Temp = _mm256_add_pd(_A, _mm256_permute_pd(_A,0x05));
1449  }
1450 
1451  /* And Not */
1452  friend F64vec4 andnot(const F64vec4 &_A, const F64vec4 &_B) { return _mm256_andnot_pd(_A,_B); }
1453 
1454  /* Square Root */
1455  friend F64vec4 sqrt(const F64vec4 &_A) { return _mm256_sqrt_pd(_A); }
1456 
1457  /* Compares: Mask is returned */
1458  friend F64vec4 cmp_eq(const F64vec4 &_A, const F64vec4 &_B)
1459  { return _mm256_cmp_pd(_A, _B, _CMP_EQ_OQ); }
1460  friend F64vec4 cmp_lt(const F64vec4 &_A, const F64vec4 &_B)
1461  { return _mm256_cmp_pd(_A, _B, _CMP_LT_OS); }
1462  friend F64vec4 cmp_le(const F64vec4 &_A, const F64vec4 &_B)
1463  { return _mm256_cmp_pd(_A, _B, _CMP_LE_OS); }
1464  friend F64vec4 cmp_gt(const F64vec4 &_A, const F64vec4 &_B)
1465  { return _mm256_cmp_pd(_A, _B, _CMP_GT_OS); }
1466  friend F64vec4 cmp_ge(const F64vec4 &_A, const F64vec4 &_B)
1467  { return _mm256_cmp_pd(_A, _B, _CMP_GE_OS); }
1468  friend F64vec4 cmp_neq(const F64vec4 &_A, const F64vec4 &_B)
1469  { return _mm256_cmp_pd(_A, _B, _CMP_NEQ_UQ); }
1470  friend F64vec4 cmp_nlt(const F64vec4 &_A, const F64vec4 &_B)
1471  { return _mm256_cmp_pd(_A, _B, _CMP_NLT_US); }
1472  friend F64vec4 cmp_nle(const F64vec4 &_A, const F64vec4 &_B)
1473  { return _mm256_cmp_pd(_A, _B, _CMP_NLE_US); }
1474  friend F64vec4 cmp_ngt(const F64vec4 &_A, const F64vec4 &_B)
1475  { return _mm256_cmp_pd(_A, _B, _CMP_NGT_US); }
1476  friend F64vec4 cmp_nge(const F64vec4 &_A, const F64vec4 &_B)
1477  { return _mm256_cmp_pd(_A, _B, _CMP_NGE_US); }
1478 
1479  /* Min and Max */
1480  friend F64vec4 simd_min(const F64vec4 &_A, const F64vec4 &_B)
1481  { return _mm256_min_pd(_A,_B); }
1482  friend F64vec4 simd_max(const F64vec4 &_A, const F64vec4 &_B)
1483  { return _mm256_max_pd(_A,_B); }
1484 
1485  /* Absolute value */
1486  friend F64vec4 abs(const F64vec4 &_A)
1487  {
1488  static const union
1489  {
1490  int i[8];
1491  __m256d m;
1492  } __f64vec4_abs_mask = { -1, 0x7fffffff, -1, 0x7fffffff,
1493  -1, 0x7fffffff, -1, 0x7fffffff};
1494  return _mm256_and_pd(_A, __f64vec4_abs_mask.m);
1495  }
1496 
1497  /* Debug Features */
1498 #if defined (_ENABLE_VEC_DEBUG)
1499  /* Output */
1500  friend DVEC_STD ostream & operator<<(DVEC_STD ostream &_Os, const F64vec4 &_A)
1501  {
1502  /* To use: cout << "Elements of F64vec4 fvec are: " << fvec; */
1503  double *_Dp = (double*) &_A;
1504  _Os << "[3]:" << *(_Dp+3)
1505  << " [2]:" << *(_Dp+2)
1506  << " [3]:" << *(_Dp+1)
1507  << " [0]:" << *_Dp;
1508  return _Os;
1509  }
1510 #endif /* defined (_ENABLE_VEC_DEBUG) */
1511 
1512  /* Element Access Only, no modifications to elements */
1513  const double& operator[](int _I) const
1514  {
1515  /* Assert enabled only during debug /DDEBUG */
1516  _VEC_ASSERT((0 <= _I) && (_I <= 3));
1517  double *_Dp = (double*)&vec;
1518  return *(_Dp+ _I);
1519  }
1520  /* Element Access and Modification*/
1521  double& operator[](int _I)
1522  {
1523  /* Assert enabled only during debug /DDEBUG */
1524  _VEC_ASSERT((0 <= _I) && (_I <= 3));
1525  double *_Dp = (double*)&vec;
1526  return *(_Dp+ _I);
1527  }
1528 };
1529 
1530  /* Miscellaneous */
1531 
1532 /* Interleave low order data elements of a and b into destination */
1533 inline F64vec4 unpack_low(const F64vec4 &_A, const F64vec4 &_B){
1534  return _mm256_unpacklo_pd(_A, _B); }
1535 
1536 /* Interleave high order data elements of a and b into target */
1537 inline F64vec4 unpack_high(const F64vec4 &_A, const F64vec4 &_B){
1538  return _mm256_unpackhi_pd(_A, _B); }
1539 
1540 /* Move Mask to Integer returns 4 bit mask formed of most significant bits of a */
1541 inline int move_mask(const F64vec4 &_A){
1542  return _mm256_movemask_pd(_A); }
1543 
1544  /* Data Motion Functions */
1545 
1546 /* Load Unaligned loadu_pd: Unaligned */
1547 inline void loadu(F64vec4 &_A, double *_P){
1548  _A = _mm256_loadu_pd(_P); }
1549 
1550 /* Store Unaligned storeu_pd: Unaligned */
1551 inline void storeu(double *_P, const F64vec4 &_A){
1552  _mm256_storeu_pd(_P, _A); }
1553 
1554  /* Cacheability Support */
1555 
1556 /* Non-Temporal Store */
1557 inline void store_nta(double *_P, const F64vec4 &_A){
1558  _mm256_stream_pd(_P, _A); }
1559 
1560  /* Conditional moves */
1561 
1562 /* Masked load */
1563 inline void maskload(F64vec4 &_A, const double *_P, const F64vec4 &_M){
1564  _A = _mm256_maskload_pd(_P, _mm256_castpd_si256(_M)); }
1565 
1566 inline void maskload(F64vec2 &_A, const double *_P, const F64vec2 &_M){
1567  _A = _mm_maskload_pd(_P, _mm_castpd_si128(_M)); }
1568 
1569 /* Masked store */
1570 inline void maskstore(double *_P, const F64vec4 &_A, const F64vec4 &_M){
1572 
1573 inline void maskstore(double *_P, const F64vec2 &_A, const F64vec2 &_M){
1574  _mm_maskstore_pd(_P, _mm_castpd_si128(_M), _A); }
1575 
1576  /* Conditional Selects */
1577 
1578 inline F64vec4 select_eq(const F64vec4 &_A, const F64vec4 &_B, const F64vec4 &_C, const F64vec4 &_D){
1579  return _mm256_blendv_pd(_D, _C, _mm256_cmp_pd(_A, _B, _CMP_EQ_OQ)); }
1580 
1581 inline F64vec4 select_lt(const F64vec4 &_A, const F64vec4 &_B, const F64vec4 &_C, const F64vec4 &_D){
1582  return _mm256_blendv_pd(_D, _C, _mm256_cmp_pd(_A, _B, _CMP_LT_OS)); }
1583 
1584 inline F64vec4 select_le(const F64vec4 &_A, const F64vec4 &_B, const F64vec4 &_C, const F64vec4 &_D){
1585  return _mm256_blendv_pd(_D, _C, _mm256_cmp_pd(_A, _B, _CMP_LE_OS)); }
1586 
1587 inline F64vec4 select_gt(const F64vec4 &_A, const F64vec4 &_B, const F64vec4 &_C, const F64vec4 &_D){
1588  return _mm256_blendv_pd(_D, _C, _mm256_cmp_pd(_A, _B, _CMP_GT_OS)); }
1589 
1590 inline F64vec4 select_ge(const F64vec4 &_A, const F64vec4 &_B, const F64vec4 &_C, const F64vec4 &_D){
1591  return _mm256_blendv_pd(_D, _C, _mm256_cmp_pd(_A, _B, _CMP_GE_OS)); }
1592 
1593 inline F64vec4 select_neq(const F64vec4 &_A, const F64vec4 &_B, const F64vec4 &_C, const F64vec4 &_D){
1594  return _mm256_blendv_pd(_D, _C, _mm256_cmp_pd(_A, _B, _CMP_NEQ_UQ)); }
1595 
1596 inline F64vec4 select_nlt(const F64vec4 &_A, const F64vec4 &_B, const F64vec4 &_C, const F64vec4 &_D){
1597  return _mm256_blendv_pd(_D, _C, _mm256_cmp_pd(_A, _B, _CMP_NLT_US)); }
1598 
1599 inline F64vec4 select_nle(const F64vec4 &_A, const F64vec4 &_B, const F64vec4 &_C, const F64vec4 &_D){
1600  return _mm256_blendv_pd(_D, _C, _mm256_cmp_pd(_A, _B, _CMP_NLE_US)); }
1601 
1602 inline F64vec4 select_ngt(const F64vec4 &_A, const F64vec4 &_B, const F64vec4 &_C, const F64vec4 &_D){
1603  return _mm256_blendv_pd(_D, _C, _mm256_cmp_pd(_A, _B, _CMP_NGT_US)); }
1604 
1605 inline F64vec4 select_nge(const F64vec4 &_A, const F64vec4 &_B, const F64vec4 &_C, const F64vec4 &_D){
1606  return _mm256_blendv_pd(_D, _C, _mm256_cmp_pd(_A, _B, _CMP_NGE_US)); }
1607 
1608  /* Conversion Functions */
1609 
1610 /* Convert the 4 SP FP values of a to 4 DP FP values */
1612  return _mm256_cvtps_pd(_A); }
1613 
1614 /* Convert the 4 DP FP values of a to 4 SP FP values */
1616  return _mm256_cvtpd_ps(_A); }
1617 
1618 #undef DVEC_DEFINE_OUTPUT_OPERATORS
1619 #undef DVEC_STD
1620 
1621 #pragma pack(pop)
1622 
1623 #endif /* defined (_M_CEE_PURE) */
1624 
1625 #endif /* RC_INVOKED */
1626 #endif /* _DVEC_H_INCLUDED */
Definition: fvec.h:78
__m128i _mm_sub_epi32(__m128i _A, __m128i _B)
I64vec2 operator<<(int _Count)
Definition: dvec.h:196
void __cdecl _mm256_storeu_pd(double *, __m256d)
__m128i _mm_adds_epi8(__m128i _A, __m128i _B)
#define _CMP_NEQ_UQ
Definition: immintrin.h:64
friend F32vec8 cmp_ge(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1248
__m128i _mm_mulhi_epi16(__m128i _A, __m128i _B)
#define _CMP_NGE_US
Definition: immintrin.h:70
__m256d __cdecl _mm256_sub_pd(__m256d, __m256d)
Is32vec4 & operator>>=(int _Count)
Definition: dvec.h:292
friend F64vec4 cmp_le(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1462
unsigned short & operator[](int _I)
Definition: dvec.h:590
void __cdecl _wassert(_In_z_ const wchar_t *_Message, _In_z_ const wchar_t *_File, _In_ unsigned _Line)
I64vec2 sum_abs(const Iu8vec16 &_A, const Iu8vec16 &_B)
Definition: dvec.h:802
Definition: dvec.h:334
const unsigned short & operator[](int _I) const
Definition: dvec.h:583
__m128i _mm_xor_si128(__m128i _A, __m128i _B)
I16vec8 & operator&=(const M128 &_A)
Definition: dvec.h:415
I8vec16 & operator&=(const M128 &_A)
Definition: dvec.h:631
I64vec2 & operator<<=(const I64vec2 &_A)
Definition: dvec.h:197
Is16vec8 & operator^=(const M128 &_A)
Definition: dvec.h:460
__m256d __cdecl _mm256_div_pd(__m256d, __m256d)
friend F32vec8 rsqrt(const F32vec8 &_A)
Definition: dvec.h:1211
F64vec2 & operator*=(const F64vec2 &_A)
Definition: dvec.h:951
M128 operator&(const M128 &_A, const M128 &_B)
Definition: dvec.h:144
F32vec8 & operator=(float _F)
Definition: dvec.h:1159
Iu8vec16(__m128i _Mm)
Definition: dvec.h:731
__m128i _mm_srai_epi32(__m128i _A, int _Count)
__m256 __cdecl _mm256_div_ps(__m256, __m256)
double & operator[](int _I)
Definition: dvec.h:1041
__m256 __cdecl _mm256_rsqrt_ps(__m256)
__m256d __cdecl _mm256_cvtps_pd(__m128)
__m128i _mm_packs_epi32(__m128i _A, __m128i _B)
Iu32vec4 & operator-=(const I32vec4 &_A)
Definition: dvec.h:352
__m128i _mm_sra_epi16(__m128i _A, __m128i _Count)
F64vec4 & operator+=(const F64vec4 &_A)
Definition: dvec.h:1436
I8vec16(__m128i _Mm)
Definition: dvec.h:619
__m256d __cdecl _mm256_andnot_pd(__m256d, __m256d)
Is16vec8 simd_min(const Is16vec8 &_A, const Is16vec8 &_B)
Definition: dvec.h:528
I64vec2 & operator>>=(int _Count)
Definition: dvec.h:202
I32vec4 & operator<<=(int _Count)
Definition: dvec.h:251
__m128d _mm_loadu_pd(double const *_Dp)
friend F32vec8 cmp_neq(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1250
Is32vec4 & operator>>=(const M128 &_A)
Definition: dvec.h:291
friend F32vec8 cmp_nlt(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1252
Is16vec8 & operator*=(const I16vec8 &_A)
Definition: dvec.h:465
I8vec16 & operator-=(const I8vec16 &_A)
Definition: dvec.h:637
#define _f64vec2_abs_mask
Definition: dvec.h:72
Iu32vec4 & operator<<=(const M128 &_A)
Definition: dvec.h:357
__m128d _mm_cvtsi32_sd(__m128d _A, int _B)
friend F32vec8 cmp_eq(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1240
I64vec2 operator*(const Iu32vec4 &_A, const Iu32vec4 &_B)
Definition: dvec.h:391
#define _MM_2QW(element, vector)
Definition: dvec.h:97
__m256d __cdecl _mm256_set_pd(double, double, double, double)
__m256 __cdecl _mm256_loadu_ps(float const *)
void __cdecl _mm256_storeu_ps(float *, __m256)
unsigned int _Count
Definition: xcomplex:668
__m128i _mm_cmpgt_epi32(__m128i _A, __m128i _B)
friend F64vec4 simd_max(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1482
F64vec2 F32vec4ToF64vec2(const F32vec4 &_A)
Definition: dvec.h:1105
I16vec8()
Definition: dvec.h:404
Iu32vec4 operator<<(int _Count)
Definition: dvec.h:356
__m256d __cdecl _mm256_and_pd(__m256d, __m256d)
const unsigned int & operator[](int _I) const
Definition: dvec.h:377
void __cdecl _mm_maskstore_ps(float *, __m128i, __m128)
I64vec2 operator>>(int _Count)
Definition: dvec.h:200
I16vec8 operator<<(const M128 &_A)
Definition: dvec.h:425
Iu16vec8 & operator>>=(int _Count)
Definition: dvec.h:563
Iu16vec8 operator>>(const M128 &_A)
Definition: dvec.h:560
__m128d _mm_set1_pd(double _A)
friend double add_horizontal(const F64vec4 &_A)
Definition: dvec.h:1445
I16vec8 operator<<(int _Count)
Definition: dvec.h:426
Is16vec8()
Definition: dvec.h:447
__m128d _mm_unpacklo_pd(__m128d _A, __m128d _B)
unsigned int & operator[](int _I)
Definition: dvec.h:384
__m256 __cdecl _mm256_sqrt_ps(__m256)
__m128i _mm_castpd_si128(__m128d)
Definition: dvec.h:727
const signed short & operator[](int _I) const
Definition: dvec.h:495
Is16vec8 & operator=(const M128 &_A)
Definition: dvec.h:455
const __m128i get_mask128()
Definition: dvec.h:104
__m256 __cdecl _mm256_xor_ps(__m256, __m256)
I8vec16 & operator|=(const M128 &_A)
Definition: dvec.h:632
Definition: dvec.h:1400
__m128i _mm_max_epi16(__m128i _A, __m128i _B)
#define _VEC_ASSERT(_Expression)
Definition: dvec.h:53
__m128d
Definition: emmintrin.h:57
__int64 & operator[](int _I)
Definition: dvec.h:212
I32vec4 & operator^=(const M128 &_A)
Definition: dvec.h:241
friend F64vec4 cmp_ge(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1466
Is16vec8 & operator<<=(const M128 &_A)
Definition: dvec.h:470
__m128i _mm_unpacklo_epi16(__m128i _A, __m128i _B)
const int & operator[](int _I) const
Definition: dvec.h:307
__m256d
Definition: immintrin.h:43
Iu16vec8 & operator&=(const M128 &_A)
Definition: dvec.h:547
I32vec4 & operator=(const M128 &_A)
Definition: dvec.h:236
void __cdecl _mm256_stream_ps(float *, __m256)
Is32vec4(int _I3, int _I2, int _I1, int _I0)
Definition: dvec.h:269
__m256d __cdecl _mm256_loadu_pd(double const *)
friend F64vec2 operator-(const F64vec2 &_A, const F64vec2 &_B)
Definition: dvec.h:945
__m128i _mm_add_epi64(__m128i _A, __m128i _B)
const unsigned char & operator[](int _I) const
Definition: dvec.h:778
double _mm_cvtsd_f64(__m128d _A)
friend F64vec4 cmp_lt(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1460
#define _CMP_NLT_US
Definition: immintrin.h:65
Is32vec4 cmplt(const Is32vec4 &_A, const Is32vec4 &_B)
Definition: dvec.h:325
#define _CMP_EQ_OQ
Definition: immintrin.h:60
__m128i _mm_set1_epi64x(__int64 i)
__m128i _mm_srli_epi16(__m128i _A, int _Count)
uint_2 operator<<(const uint_2 &_Lhs, const uint_2 &_Rhs) __GPU
Definition: amp_short_vectors.h:22866
__m128d _mm_shuffle_pd(__m128d _A, __m128d _B, int _I)
__m128d m
Definition: dvec.h:69
__m128 __cdecl _mm_maskload_ps(float const *, __m128i)
F64vec4 & operator/=(const F64vec4 &_A)
Definition: dvec.h:1439
const float & operator[](int _I) const
Definition: dvec.h:1299
Is16vec8 & operator|=(const M128 &_A)
Definition: dvec.h:459
I16vec8 & operator-=(const I16vec8 &_A)
Definition: dvec.h:421
F64vec2 & operator|=(const F64vec2 &_A)
Definition: dvec.h:954
I64vec2()
Definition: dvec.h:173
__m128i _mm_cmplt_epi8(__m128i _A, __m128i _B)
__m128i _mm_set_epi16(short _W7, short _W6, short _W5, short _W4, short _W3, short _W2, short _W1, short _W0)
__m128i _mm_packs_epi16(__m128i _A, __m128i _B)
friend F64vec4 abs(const F64vec4 &_A)
Definition: dvec.h:1486
Iu32vec4(__m128i _Mm)
Definition: dvec.h:338
__m256 __cdecl _mm256_permute_ps(__m256, int)
F32vec8 & operator&=(const F32vec8 &_A)
Definition: dvec.h:1189
__m128i _mm_add_epi8(__m128i _A, __m128i _B)
I8vec16 & operator+=(const I8vec16 &_A)
Definition: dvec.h:636
__m256 __cdecl _mm256_add_ps(__m256, __m256)
F32vec8 & operator/=(const F32vec8 &_A)
Definition: dvec.h:1188
__m128d _mm_set_pd(double _Z, double _Y)
I8vec16 & operator=(const M128 &_A)
Definition: dvec.h:628
__m128i _mm_srl_epi32(__m128i _A, __m128i _Count)
Is16vec8(__m128i _Mm)
Definition: dvec.h:448
Definition: dvec.h:401
Is16vec8 operator<<(const M128 &_A)
Definition: dvec.h:468
Iu8vec16 & operator^=(const M128 &_A)
Definition: dvec.h:747
I32vec4 & operator&=(const M128 &_A)
Definition: dvec.h:239
__m256d __cdecl _mm256_permute_pd(__m256d, int)
#define F64vec2_UCOMI(op)
#define _MM_8UW(element, vector)
Definition: dvec.h:91
Is32vec4 & operator+=(const I32vec4 &_A)
Definition: dvec.h:280
#define _MM_8W(element, vector)
Definition: dvec.h:92
M128 & operator^=(const M128 &_A)
Definition: dvec.h:140
__m128i _mm_sra_epi32(__m128i _A, __m128i _Count)
F64vec2_COMI(eq) F64vec2_COMI(lt) F64vec2_COMI(le) F64vec2_COMI(gt) F64vec2_COMI(ge) F64vec2_COMI(neq) F64vec2_UCOMI(eq) F64vec2_UCOMI(lt) F64vec2_UCOMI(le) F64vec2_UCOMI(gt) F64vec2_UCOMI(ge) F64vec2_UCOMI(neq) const double &operator[](int _I) const
Definition: dvec.h:1001
F64vec4(double _D)
Definition: dvec.h:1420
I64vec2 unpack_high(const I64vec2 &_A, const I64vec2 &_B)
Definition: dvec.h:223
Is16vec8 mul_high(const Is16vec8 &_A, const Is16vec8 &_B)
Definition: dvec.h:521
__m256d __cdecl _mm256_mul_pd(__m256d, __m256d)
Iu8vec16(unsigned char _U15, unsigned char _U14, unsigned char _U13, unsigned char _U12, unsigned char _U11, unsigned char _U10, unsigned char _U9, unsigned char _U8, unsigned char _U7, unsigned char _U6, unsigned char _U5, unsigned char _U4, unsigned char _U3, unsigned char _U2, unsigned char _U1, unsigned char _U0)
Definition: dvec.h:732
F64vec4 & operator^=(const F64vec4 &_A)
Definition: dvec.h:1442
__m128i _mm_srli_epi32(__m128i _A, int _Count)
F32vec8 & operator^=(const F32vec8 &_A)
Definition: dvec.h:1191
friend F64vec4 cmp_neq(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1468
friend F32vec8 rcp_nr(const F32vec8 &_A)
Definition: dvec.h:1217
Is32vec4 & operator<<=(int _Count)
Definition: dvec.h:287
Iu32vec4 operator>>(int _Count)
Definition: dvec.h:360
friend F32vec8 cmp_lt(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1242
__m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0)
I32vec4 & operator+=(const I32vec4 &_A)
Definition: dvec.h:244
F64vec2 & operator^=(const F64vec2 &_A)
Definition: dvec.h:955
__m128i _mm_sll_epi16(__m128i _A, __m128i _Count)
__m128i _mm_avg_epu16(__m128i _A, __m128i _B)
F64vec2()
Definition: dvec.h:924
#define _MM_16UB(element, vector)
Definition: dvec.h:88
Is16vec8 & operator&=(const M128 &_A)
Definition: dvec.h:458
F64vec2 & operator/=(const F64vec2 &_A)
Definition: dvec.h:952
F32vec8 select_lt(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D)
Definition: dvec.h:1368
__m128i _mm_min_epi16(__m128i _A, __m128i _B)
F32vec8 select_ge(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D)
Definition: dvec.h:1377
friend F64vec4 cmp_ngt(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1474
Definition: dvec.h:264
F32vec8(__m256 _M)
Definition: dvec.h:1144
friend F64vec4 cmp_eq(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1458
friend F64vec4 operator^(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1428
I16vec8(short _S7, short _S6, short _S5, short _S4, short _S3, short _S2, short _S1, short _S0)
Definition: dvec.h:406
__m256i __cdecl _mm256_castpd_si256(__m256d)
#define _CMP_LT_OS
Definition: immintrin.h:61
__m128i _mm_unpacklo_epi32(__m128i _A, __m128i _B)
Iu32vec4()
Definition: dvec.h:337
friend F32vec8 simd_min(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1262
Iu8vec16 & operator=(const M128 &_A)
Definition: dvec.h:742
Iu16vec8 & operator-=(const I16vec8 &_A)
Definition: dvec.h:552
__m128i _mm_adds_epu8(__m128i _A, __m128i _B)
friend F64vec4 operator*(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1433
I16vec8 & operator|=(const M128 &_A)
Definition: dvec.h:416
Iu8vec16 packu_sat(const Is16vec8 &_A, const Is16vec8 &_B)
Definition: dvec.h:812
F64vec4 F32vec4ToF64vec4(const F32vec4 &_A)
Definition: dvec.h:1611
I64vec2(__m128i _Mm)
Definition: dvec.h:174
I64vec2 & operator=(const M128 &_A)
Definition: dvec.h:183
friend F64vec4 operator/(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1434
friend F64vec2 operator+(const F64vec2 &_A, const F64vec2 &_B)
Definition: dvec.h:944
F32vec8(float _F7, float _F6, float _F5, float _F4, float _F3, float _F2, float _F1, float _F0)
Definition: dvec.h:1147
friend F64vec4 operator|(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1427
friend F32vec8 operator&(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1175
__m128 _mm_cvtpd_ps(__m128d _A)
#define _CMP_NGT_US
Definition: immintrin.h:72
__m128i _mm_castps_si128(__m128)
Definition: dvec.h:154
#define _CMP_GT_OS
Definition: immintrin.h:76
__m128i _mm_sll_epi32(__m128i _A, __m128i _Count)
Is16vec8 pack_sat(const Is32vec4 &_A, const Is32vec4 &_B)
Definition: dvec.h:810
Is32vec4 & operator-=(const I32vec4 &_A)
Definition: dvec.h:281
I8vec16(char _S15, char _S14, char _S13, char _S12, char _S11, char _S10, char _S9, char _S8, char _S7, char _S6, char _S5, char _S4, char _S3, char _S2, char _S1, char _S0)
Definition: dvec.h:620
__m128i _mm_unpackhi_epi16(__m128i _A, __m128i _B)
Iu8vec16 & operator+=(const I8vec16 &_A)
Definition: dvec.h:750
M128(__m128i _Mm)
Definition: dvec.h:133
__m256d __cdecl _mm256_sqrt_pd(__m256d)
friend F64vec2 operator^(const F64vec2 &_A, const F64vec2 &_B)
Definition: dvec.h:941
__m128i _mm_slli_epi16(__m128i _A, int _Count)
Is8vec16(char _S15, char _S14, char _S13, char _S12, char _S11, char _S10, char _S9, char _S8, char _S7, char _S6, char _S5, char _S4, char _S3, char _S2, char _S1, char _S0)
Definition: dvec.h:655
int _mm_movemask_pd(__m128d _A)
__m256 __cdecl _mm256_sub_ps(__m256, __m256)
int _mm_cvttsd_si32(__m128d _A)
__m128i _mm_sad_epu8(__m128i _A, __m128i _B)
F32vec8 & operator*=(const F32vec8 &_A)
Definition: dvec.h:1187
friend F32vec8 andnot(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1202
__m128i _mm_max_epu8(__m128i _A, __m128i _B)
Iu16vec8 & operator>>=(const M128 &_A)
Definition: dvec.h:562
I64vec2 operator>>(const I64vec2 &_A)
Definition: dvec.h:199
void store_nta(double *_P, F64vec2 &_A)
Definition: dvec.h:1077
Is32vec4(__m128i _Mm)
Definition: dvec.h:268
int i[4]
Definition: dvec.h:68
I8vec16 & operator^=(const M128 &_A)
Definition: dvec.h:633
__m256 __cdecl _mm256_and_ps(__m256, __m256)
Iu16vec8 & operator*=(const I16vec8 &_A)
Definition: dvec.h:553
F64vec4()
Definition: dvec.h:1408
friend F64vec4 operator+(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1431
int move_mask(const F64vec2 &_A)
Definition: dvec.h:1061
#define IVEC128_SELECT(vect12, vect34, element, selop)
Definition: dvec.h:866
F64vec4 & operator&=(const F64vec4 &_A)
Definition: dvec.h:1440
I32vec4 & operator-=(const I32vec4 &_A)
Definition: dvec.h:245
friend F32vec8 cmp_nge(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1258
F32vec8 & operator-=(const F32vec8 &_A)
Definition: dvec.h:1186
__m256 vec
Definition: dvec.h:1136
#define _In_z_
Definition: sal.h:310
Is16vec8 simd_max(const Is16vec8 &_A, const Is16vec8 &_B)
Definition: dvec.h:527
#define _In_
Definition: sal.h:305
friend F32vec8 operator*(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1182
#define F64vec2_SELECT(op)
Definition: dvec.h:1080
__m128d _mm_add_pd(__m128d _A, __m128d _B)
__m128d _mm_or_pd(__m128d _A, __m128d _B)
F64vec2(double _D)
Definition: dvec.h:933
Iu16vec8(__m128i _Mm)
Definition: dvec.h:538
I64vec2 unpack_low(const I64vec2 &_A, const I64vec2 &_B)
Definition: dvec.h:222
int __cdecl _mm256_movemask_pd(__m256d)
Is8vec16 & operator=(const M128 &_A)
Definition: dvec.h:662
Is16vec8 operator>>(int _Count)
Definition: dvec.h:474
__m256 __cdecl _mm256_movehdup_ps(__m256)
void __cdecl _mm256_maskstore_pd(double *, __m256i, __m256d)
F32vec4 F64vec2ToF32vec4(const F64vec2 &_A)
Definition: dvec.h:1111
const double & operator[](int _I) const
Definition: dvec.h:1513
F32vec8 select_eq(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D)
Definition: dvec.h:1365
__m128i _mm_mul_epu32(__m128i _A, __m128i _B)
__m256d __cdecl _mm256_unpacklo_pd(__m256d, __m256d)
__m128i _mm_min_epu8(__m128i _A, __m128i _B)
__m256d __cdecl _mm256_max_pd(__m256d, __m256d)
__m128d _mm_div_pd(__m128d _A, __m128d _B)
int __cdecl _mm256_movemask_ps(__m256)
F64vec4(double _D3, double _D2, double _D1, double _D0)
Definition: dvec.h:1414
Is32vec4 mul_add(const Is16vec8 &_A, const Is16vec8 &_B)
Definition: dvec.h:522
friend F64vec4 operator&(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1426
__m256 __cdecl _mm256_blendv_ps(__m256, __m256, __m256)
I64vec2 & operator>>=(const I64vec2 &_A)
Definition: dvec.h:201
Iu8vec16 & operator&=(const M128 &_A)
Definition: dvec.h:745
__m64
Definition: mmintrin.h:45
__m128d _mm_and_pd(__m128d _A, __m128d _B)
__m256d __cdecl _mm256_unpackhi_pd(__m256d, __m256d)
__m128i _mm_srl_epi16(__m128i _A, __m128i _Count)
__m128 __cdecl _mm256_castps256_ps128(__m256)
friend F32vec8 rcp(const F32vec8 &_A)
Definition: dvec.h:1208
F64vec2_COMP(eq) F64vec2_COMP(lt) F64vec2_COMP(le) F64vec2_COMP(gt) F64vec2_COMP(ge) F64vec2_COMP(ngt) F64vec2_COMP(nge) F64vec2_COMP(neq) F64vec2_COMP(nlt) F64vec2_COMP(nle) friend F64vec2 simd_min(const F64vec2 &_A
I16vec8 & operator<<=(const M128 &_A)
Definition: dvec.h:427
Is8vec16(__m128i _Mm)
Definition: dvec.h:654
Iu32vec4 & operator<<=(int _Count)
Definition: dvec.h:358
Is16vec8 operator>>(const M128 &_A)
Definition: dvec.h:473
__m128d __cdecl _mm_maskload_pd(double const *, __m128i)
M128 & operator&=(const M128 &_A)
Definition: dvec.h:138
I8vec16()
Definition: dvec.h:618
I32vec4 operator<<(const I32vec4 &_A)
Definition: dvec.h:248
I16vec8(__m128i _Mm)
Definition: dvec.h:405
__m128i _mm_slli_epi64(__m128i _A, int _Count)
__m128i _mm_cmpgt_epi8(__m128i _A, __m128i _B)
F32vec4 F64vec4ToF32vec8(const F64vec4 &_A)
Definition: dvec.h:1615
void __cdecl _mm_maskstore_pd(double *, __m128i, __m128d)
Is16vec8 & operator>>=(const M128 &_A)
Definition: dvec.h:475
__m256 __cdecl _mm256_set1_ps(float)
Is16vec8 sat_add(const Is16vec8 &_A, const Is16vec8 &_B)
Definition: dvec.h:524
__m128i _mm_unpackhi_epi32(__m128i _A, __m128i _B)
friend F32vec8 rsqrt_nr(const F32vec8 &_A)
Definition: dvec.h:1227
I32vec4(__m128i _Mm)
Definition: dvec.h:232
friend F64vec2 sqrt(const F64vec2 &_A)
Definition: dvec.h:968
Definition: iosfwd:631
__m128d _mm_max_pd(__m128d _A, __m128d _B)
__m256d __cdecl _mm256_min_pd(__m256d, __m256d)
basic_ostream< char, char_traits< char > > ostream
Definition: iosfwd:679
F32vec8 select_gt(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D)
Definition: dvec.h:1374
Is16vec8(signed short _S7, signed short _S6, signed short _S5, signed short _S4, signed short _S3, signed short _S2, signed short _S1, signed short _S0)
Definition: dvec.h:449
I32vec4()
Definition: dvec.h:231
__m128d __cdecl _mm256_castpd256_pd128(__m256d)
__m128i
Definition: emmintrin.h:53
__m256
Definition: immintrin.h:39
friend F32vec8 operator-(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1181
#define _CMP_GE_OS
Definition: immintrin.h:75
I128vec1 & operator^=(const M128 &_A)
Definition: dvec.h:163
__m128i _mm_srli_epi64(__m128i _A, int _Count)
Iu16vec8 & operator|=(const M128 &_A)
Definition: dvec.h:548
M128()
Definition: dvec.h:132
F64vec2 & operator-=(const F64vec2 &_A)
Definition: dvec.h:950
signed char & operator[](int _I)
Definition: dvec.h:705
__m128i _mm_unpackhi_epi64(__m128i _A, __m128i _B)
friend F64vec2 operator|(const F64vec2 &_A, const F64vec2 &_B)
Definition: dvec.h:940
F32vec8 select_le(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D)
Definition: dvec.h:1371
Definition: dvec.h:650
friend F32vec8 abs(const F32vec8 &_A)
Definition: dvec.h:1268
I32vec4 & operator<<=(const I32vec4 &_A)
Definition: dvec.h:250
F32vec8 select_nge(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D)
Definition: dvec.h:1392
I128vec1 & operator&=(const M128 &_A)
Definition: dvec.h:161
__m128d _mm_unpackhi_pd(__m128d _A, __m128d _B)
F32vec8 select_nlt(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D)
Definition: dvec.h:1383
#define _MM_16B(element, vector)
Definition: dvec.h:89
Iu8vec16()
Definition: dvec.h:730
friend F32vec8 cmp_le(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1244
void _mm_storeu_pd(double *_Dp, __m128d _A)
__m128i _mm_cmpgt_epi16(__m128i _A, __m128i _B)
F64vec4 & operator|=(const F64vec4 &_A)
Definition: dvec.h:1441
Iu32vec4 & operator|=(const M128 &_A)
Definition: dvec.h:347
__m128d _mm_xor_pd(__m128d _A, __m128d _B)
Definition: dvec.h:534
friend F32vec8 operator|(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1176
#define _CMP_NLE_US
Definition: immintrin.h:66
I64vec2 & operator|=(const M128 &_A)
Definition: dvec.h:187
F64vec4 & operator*=(const F64vec4 &_A)
Definition: dvec.h:1438
#define IVEC128_ADD_SUB(vect, element, opsize)
Definition: dvec.h:839
const union @85 __f64vec2_abs_mask_cheat
Is8vec16 & operator-=(const I8vec16 &_A)
Definition: dvec.h:671
__m128d vec
Definition: dvec.h:920
__m256 __cdecl _mm256_mul_ps(__m256, __m256)
Iu16vec8 & operator=(const M128 &_A)
Definition: dvec.h:545
M128 andnot(const M128 &_A, const M128 &_B)
Definition: dvec.h:147
Definition: dvec.h:444
Iu32vec4 & operator=(const M128 &_A)
Definition: dvec.h:343
F64vec2 IntToF64vec2(const F64vec2 &_A, int _B)
Definition: dvec.h:1117
__m128i _mm_avg_epu8(__m128i _A, __m128i _B)
friend F64vec4 sqrt(const F64vec4 &_A)
Definition: dvec.h:1455
friend F64vec4 cmp_gt(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1464
Iu16vec8()
Definition: dvec.h:537
Definition: dvec.h:228
Iu16vec8 operator<<(int _Count)
Definition: dvec.h:557
__m128i _mm_mullo_epi16(__m128i _A, __m128i _B)
friend F32vec8 operator+(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1180
const F64vec2 &_B return _mm_min_pd(_A, _B)
M128 & operator|=(const M128 &_A)
Definition: dvec.h:139
__m128i _mm_subs_epu8(__m128i _A, __m128i _B)
Iu32vec4 & operator^=(const M128 &_A)
Definition: dvec.h:348
void loadu(F64vec2 &_A, double *_P)
Definition: dvec.h:1067
I16vec8 & operator=(const M128 &_A)
Definition: dvec.h:412
Definition: dvec.h:170
Iu16vec8 simd_avg(const Iu16vec8 &_A, const Iu16vec8 &_B)
Definition: dvec.h:609
__m256d vec
Definition: dvec.h:1403
Iu16vec8(unsigned short _S7, unsigned short _S6, unsigned short _S5, unsigned short _S4, unsigned short _S3, unsigned short _S2, unsigned short _S1, unsigned short _S0)
Definition: dvec.h:539
#define _MM_4DW(element, vector)
Definition: dvec.h:95
__m128i _mm_cmpeq_epi16(__m128i _A, __m128i _B)
__m256 __cdecl _mm256_min_ps(__m256, __m256)
__m128i _mm_or_si128(__m128i _A, __m128i _B)
Iu8vec16 & operator-=(const I8vec16 &_A)
Definition: dvec.h:751
__m128i _mm_sll_epi64(__m128i _A, __m128i _Count)
__m256 __cdecl _mm256_andnot_ps(__m256, __m256)
Is16vec8 & operator<<=(int _Count)
Definition: dvec.h:471
Is32vec4()
Definition: dvec.h:267
__m128i _mm_cmpeq_epi32(__m128i _A, __m128i _B)
Iu16vec8 & operator^=(const M128 &_A)
Definition: dvec.h:549
M128 operator|(const M128 &_A, const M128 &_B)
Definition: dvec.h:145
__m128 _mm_add_ss(__m128 _A, __m128 _B)
__m256 __cdecl _mm256_unpackhi_ps(__m256, __m256)
Is16vec8 & operator+=(const I16vec8 &_A)
Definition: dvec.h:463
__m128i _mm_adds_epu16(__m128i _A, __m128i _B)
I64vec2 & operator-=(const I64vec2 &_A)
Definition: dvec.h:192
__m128i _mm_add_epi32(__m128i _A, __m128i _B)
__m256i __cdecl _mm256_castps_si256(__m256)
Iu16vec8 & operator+=(const I16vec8 &_A)
Definition: dvec.h:551
F32vec8 & operator+=(const F32vec8 &_A)
Definition: dvec.h:1185
int & operator[](int _I)
Definition: dvec.h:314
__m256d __cdecl _mm256_maskload_pd(double const *, __m256i)
friend F32vec8 sqrt(const F32vec8 &_A)
Definition: dvec.h:1205
__m128i _mm_unpacklo_epi8(__m128i _A, __m128i _B)
friend F64vec2 andnot(const F64vec2 &_A, const F64vec2 &_B)
Definition: dvec.h:965
Iu32vec4 operator>>(const M128 &_A)
Definition: dvec.h:359
Is32vec4 & operator|=(const M128 &_A)
Definition: dvec.h:276
const signed char & operator[](int _I) const
Definition: dvec.h:698
__m128i vec
Definition: dvec.h:129
__m256 __cdecl _mm256_maskload_ps(float const *, __m256i)
__m256d __cdecl _mm256_blendv_pd(__m256d, __m256d, __m256d)
F64vec4 & operator-=(const F64vec4 &_A)
Definition: dvec.h:1437
Iu16vec8 operator<<(const M128 &_A)
Definition: dvec.h:556
Is16vec8 operator<<(int _Count)
Definition: dvec.h:469
Iu32vec4 & operator>>=(const M128 &_A)
Definition: dvec.h:361
friend F64vec4 simd_min(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1480
friend double add_horizontal(const F64vec2 &_A)
Definition: dvec.h:958
M128 operator^(const M128 &_A, const M128 &_B)
Definition: dvec.h:146
Definition: dvec.h:917
__m256d __cdecl _mm256_or_pd(__m256d, __m256d)
F32vec8 select_neq(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D)
Definition: dvec.h:1380
void storeu(double *_P, const F64vec2 &_A)
Definition: dvec.h:1071
Is32vec4 operator<<(int _Count)
Definition: dvec.h:285
I64vec2(__m64 _Q1, __m64 _Q0)
Definition: dvec.h:176
const __int64 & operator[](int _I) const
Definition: dvec.h:205
__m256 __cdecl _mm256_or_ps(__m256, __m256)
friend F32vec8 cmp_ngt(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1256
friend F64vec4 cmp_nge(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1476
I32vec4 cmpneq(const I32vec4 &_A, const I32vec4 &_B)
Definition: dvec.h:256
__m128d _mm_sub_pd(__m128d _A, __m128d _B)
friend F64vec4 cmp_nle(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1472
Iu16vec8 & operator<<=(const M128 &_A)
Definition: dvec.h:558
F32vec8 select_ngt(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D)
Definition: dvec.h:1389
friend F64vec2 operator&(const F64vec2 &_A, const F64vec2 &_B)
Definition: dvec.h:939
I16vec8 & operator^=(const M128 &_A)
Definition: dvec.h:417
__m128i _mm_slli_epi32(__m128i _A, int _Count)
I128vec1(__m128i _Mm)
Definition: dvec.h:158
signed short & operator[](int _I)
Definition: dvec.h:502
__m128i _mm_sub_epi16(__m128i _A, __m128i _B)
void _mm_stream_pd(double *_Dp, __m128d _A)
__m256 __cdecl _mm256_rcp_ps(__m256)
I128vec1 & operator|=(const M128 &_A)
Definition: dvec.h:162
I64vec2 operator<<(const I64vec2 &_A)
Definition: dvec.h:195
F32vec8(float _F)
Definition: dvec.h:1153
Is32vec4 & operator<<=(const M128 &_A)
Definition: dvec.h:286
Is8vec16 & operator|=(const M128 &_A)
Definition: dvec.h:666
I128vec1 & operator=(const M128 &_A)
Definition: dvec.h:160
unsigned char & operator[](int _I)
Definition: dvec.h:785
I32vec4 cmpeq(const I32vec4 &_A, const I32vec4 &_B)
Definition: dvec.h:255
friend F32vec8 cmp_nle(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1254
I32vec4 operator<<(int _Count)
Definition: dvec.h:249
friend F32vec8 operator^(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1177
friend F64vec2 operator/(const F64vec2 &_A, const F64vec2 &_B)
Definition: dvec.h:947
Is8vec16 & operator^=(const M128 &_A)
Definition: dvec.h:667
friend F32vec8 operator/(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1183
Is32vec4 operator>>(const M128 &_A)
Definition: dvec.h:289
Iu16vec8 operator>>(int _Count)
Definition: dvec.h:561
__m128i _mm_subs_epu16(__m128i _A, __m128i _B)
__m256d __cdecl _mm256_set1_pd(double)
Iu8vec16 & operator|=(const M128 &_A)
Definition: dvec.h:746
__m128i _mm_sub_epi8(__m128i _A, __m128i _B)
Iu32vec4 & operator>>=(int _Count)
Definition: dvec.h:362
F64vec4(__m256d m)
Definition: dvec.h:1411
F64vec2 & operator&=(const F64vec2 &_A)
Definition: dvec.h:953
friend F64vec2 abs(const F64vec2 &_A)
Definition: dvec.h:993
friend F64vec4 andnot(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1452
#define IVEC128_LOGICALS(vect, element)
Definition: dvec.h:815
I64vec2 & operator^=(const M128 &_A)
Definition: dvec.h:188
__m256 __cdecl _mm256_max_ps(__m256, __m256)
__m128i _mm_unpacklo_epi64(__m128i _A, __m128i _B)
I64vec2 & operator<<=(int _Count)
Definition: dvec.h:198
__m128i _mm_andnot_si128(__m128i _A, __m128i _B)
Definition: dvec.h:1133
friend F32vec8 simd_max(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1264
Iu32vec4 & operator+=(const I32vec4 &_A)
Definition: dvec.h:351
friend F32vec8 cmp_gt(const F32vec8 &_A, const F32vec8 &_B)
Definition: dvec.h:1246
Is32vec4 & operator^=(const M128 &_A)
Definition: dvec.h:277
Is16vec8 & operator>>=(int _Count)
Definition: dvec.h:476
__m256 __cdecl _mm256_set_ps(float, float, float, float, float, float, float, float)
__m256 __cdecl _mm256_cmp_ps(__m256, __m256, const int)
Is16vec8 & operator-=(const I16vec8 &_A)
Definition: dvec.h:464
I64vec2 & operator&=(const M128 &_A)
Definition: dvec.h:186
__m128i _mm_srai_epi16(__m128i _A, int _Count)
void maskload(F32vec8 &_A, const float *_P, const F32vec8 &_M)
Definition: dvec.h:1350
friend F64vec4 operator-(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1432
__m128i _mm_subs_epi8(__m128i _A, __m128i _B)
__m256d __cdecl _mm256_cmp_pd(__m256d, __m256d, const int)
__m256d __cdecl _mm256_xor_pd(__m256d, __m256d)
I16vec8 & operator+=(const I16vec8 &_A)
Definition: dvec.h:420
friend F64vec2 operator*(const F64vec2 &_A, const F64vec2 &_B)
Definition: dvec.h:946
__m128d _mm_mul_pd(__m128d _A, __m128d _B)
Is8vec16 & operator&=(const M128 &_A)
Definition: dvec.h:665
__m256d __cdecl _mm256_add_pd(__m256d, __m256d)
Definition: dvec.h:615
float _mm_cvtss_f32(__m128 _A)
#define _CMP_LE_OS
Definition: immintrin.h:62
__m128i _mm_add_epi16(__m128i _A, __m128i _B)
Is32vec4 operator>>(int _Count)
Definition: dvec.h:290
Is32vec4 cmpgt(const Is32vec4 &_A, const Is32vec4 &_B)
Definition: dvec.h:324
I128vec1()
Definition: dvec.h:157
__m128i _mm_set_epi8(char _B15, char _B14, char _B13, char _B12, char _B11, char _B10, char _B9, char _B8, char _B7, char _B6, char _B5, char _B4, char _B3, char _B2, char _B1, char _B0)
__m128i _mm_cmpeq_epi8(__m128i _A, __m128i _B)
void maskstore(float *_P, const F32vec8 &_A, const F32vec8 &_M)
Definition: dvec.h:1357
__m128i _mm_unpackhi_epi8(__m128i _A, __m128i _B)
__m128i _mm_adds_epi16(__m128i _A, __m128i _B)
__m128 __cdecl _mm256_cvtpd_ps(__m256d)
__m128d _mm_cvtps_pd(__m128 _A)
Is8vec16()
Definition: dvec.h:653
Iu32vec4 & operator&=(const M128 &_A)
Definition: dvec.h:346
__m128d _mm_andnot_pd(__m128d _A, __m128d _B)
__m128i _mm_sub_epi64(__m128i _A, __m128i _B)
I16vec8 & operator<<=(int _Count)
Definition: dvec.h:428
#define _MM_4UDW(element, vector)
Definition: dvec.h:94
friend F64vec4 cmp_nlt(const F64vec4 &_A, const F64vec4 &_B)
Definition: dvec.h:1470
I32vec4(int _I3, int _I2, int _I1, int _I0)
Definition: dvec.h:233
__m128i _mm_packus_epi16(__m128i _A, __m128i _B)
F32vec8(double _D)
Definition: dvec.h:1156
I32vec4 & operator|=(const M128 &_A)
Definition: dvec.h:240
Iu32vec4(unsigned int _Ui3, unsigned int _Ui2, unsigned int _Ui1, unsigned int _Ui0)
Definition: dvec.h:339
__m128i _mm_mulhi_epu16(__m128i _A, __m128i _B)
F32vec8 & operator|=(const F32vec8 &_A)
Definition: dvec.h:1190
Is32vec4 operator<<(const M128 &_A)
Definition: dvec.h:284
Is32vec4 & operator=(const M128 &_A)
Definition: dvec.h:272
F32vec8()
Definition: dvec.h:1141
void __cdecl _mm256_maskstore_ps(float *, __m256i, __m256)
friend F64vec2 simd_max(const F64vec2 &_A, const F64vec2 &_B)
Definition: dvec.h:990
I64vec2 & operator+=(const I64vec2 &_A)
Definition: dvec.h:191
__m128i _mm_srl_epi64(__m128i _A, __m128i _Count)
Iu32vec4 operator<<(const M128 &_A)
Definition: dvec.h:355
Iu16vec8 & operator<<=(int _Count)
Definition: dvec.h:559
double & operator[](int _I)
Definition: dvec.h:1521
__m128d _mm_add_sd(__m128d _A, __m128d _B)
Is16vec8 sat_sub(const Is16vec8 &_A, const Is16vec8 &_B)
Definition: dvec.h:525
__m128i _mm_and_si128(__m128i _A, __m128i _B)
void __cdecl _mm256_stream_pd(double *, __m256d)
__m128i _mm_madd_epi16(__m128i _A, __m128i _B)
Is32vec4 & operator&=(const M128 &_A)
Definition: dvec.h:275
F64vec2(double _D1, double _D0)
Definition: dvec.h:930
__m128d _mm_sqrt_pd(__m128d _A)
__m128 __cdecl _mm256_extractf128_ps(__m256, const int)
F32vec8 select_nle(const F32vec8 &_A, const F32vec8 &_B, const F32vec8 &_C, const F32vec8 &_D)
Definition: dvec.h:1386
F64vec2(__m128d _M)
Definition: dvec.h:927
I16vec8 & operator*=(const I16vec8 &_A)
Definition: dvec.h:422
__m128i _mm_subs_epi16(__m128i _A, __m128i _B)
__m128d __cdecl _mm256_extractf128_pd(__m256d, const int)
friend float add_horizontal(const F32vec8 &_A)
Definition: dvec.h:1194
F64vec2 & operator+=(const F64vec2 &_A)
Definition: dvec.h:949
Is8vec16 & operator+=(const I8vec16 &_A)
Definition: dvec.h:670
__m256 __cdecl _mm256_unpacklo_ps(__m256, __m256)
float & operator[](int _I)
Definition: dvec.h:1308
Definition: dvec.h:126