dvec.h
1 /***
2 *** Copyright (C) 1985-2011 Intel Corporation. All rights reserved.
3 ***
4 *** The information and source code contained herein is the exclusive
5 *** property of Intel Corporation and may not be disclosed, examined
6 *** or reproduced in whole or in part without explicit written authorization
7 *** from the company.
8 ***
9 ****/
10 
11 /*
12  * Definition of a C++ class interface to Intel(R) Pentium(R) 4 processor SSE2 intrinsics.
13  *
14  * File name : dvec.h class definitions
15  *
16  * Concept: A C++ abstraction of Intel(R) Pentium(R) 4 processor SSE2
17  * designed to improve programmer productivity. Speed and accuracy are
18  * sacrificed for utility. Facilitates an easy transition to compiler
19  * intrinsics or assembly language.
20  *
21  */
22 
23 #ifndef _DVEC_H_INCLUDED
24 #define _DVEC_H_INCLUDED
25 #ifndef RC_INVOKED
26 
27 #if !defined __cplusplus
28  #error ERROR: This file is only supported in C++ compilations!
29 #endif /* !defined __cplusplus */
30 
31 #if defined (_M_CEE_PURE)
32  #error ERROR: This file is not supported in the pure mode!
33 #else /* defined (_M_CEE_PURE) */
34 
35 #include <immintrin.h> /* SSE2 intrinsic function definition include file */
36 #include <fvec.h>
37 #include <crtdefs.h>
38 
39 #ifndef _VEC_ASSERT
40 #ifdef NDEBUG
41  #define _VEC_ASSERT(_Expression) ((void)0)
42 #else /* NDEBUG */
43 #ifdef __cplusplus
44  extern "C" {
45 #endif /* __cplusplus */
46 
47  _CRTIMP void __cdecl _wassert(_In_z_ const wchar_t * _Message, _In_z_ const wchar_t *_File, _In_ unsigned _Line);
48 
49 #ifdef __cplusplus
50  }
51 #endif /* __cplusplus */
52 
53  #define _VEC_ASSERT(_Expression) (void)( (!!(_Expression)) || (_wassert(_CRT_WIDE(#_Expression), _CRT_WIDE(__FILE__), __LINE__), 0) )
54 #endif /* NDEBUG */
55 #endif /* _VEC_ASSERT */
56 
57 #ifdef _MSC_VER
58 #pragma pack(push,_CRT_PACKING)
59 #endif /* _MSC_VER */
60 
61 /* Define _ENABLE_VEC_DEBUG to enable std::ostream inserters for debug output */
62 #if defined (_ENABLE_VEC_DEBUG)
63  #include <iostream>
64 #endif /* defined (_ENABLE_VEC_DEBUG) */
65 
66 #pragma pack(push,16) /* Must ensure class & union 16-B aligned */
67 
68 const union
69 {
70  int i[4];
71  __m128d m;
72 } __f64vec2_abs_mask_cheat = {0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff};
73 
74 #define _f64vec2_abs_mask ((F64vec2)__f64vec2_abs_mask_cheat.m)
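/* Illustrative usage (example only, not part of the original header): the mask
 * above clears just the sign bit of each double, so ANDing with it produces
 * the absolute value of both lanes.
 *
 *     F64vec2 x(-1.5, 2.0);               // lanes: [1] = -1.5, [0] = 2.0
 *     F64vec2 y = x & _f64vec2_abs_mask;  // lanes: [1] =  1.5, [0] = 2.0
 */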
75 
76 /* EMM Functionality Intrinsics */
77 
78 class I8vec16; /* 16 elements, each element a signed or unsigned char data type */
79 class Is8vec16; /* 16 elements, each element a signed char data type */
80 class Iu8vec16; /* 16 elements, each element an unsigned char data type */
81 class I16vec8; /* 8 elements, each element a signed or unsigned short */
82 class Is16vec8; /* 8 elements, each element a signed short */
83 class Iu16vec8; /* 8 elements, each element an unsigned short */
84 class I32vec4; /* 4 elements, each element a signed or unsigned long */
85 class Is32vec4; /* 4 elements, each element a signed long */
86 class Iu32vec4; /* 4 elements, each element an unsigned long */
87 class I64vec2; /* 2 elements, each element a __m64 data type */
88 class I128vec1; /* 1 element, a __m128i data type */
89 
90 #define _MM_16UB(element,vector) (*((unsigned char*)&##vector + ##element))
91 #define _MM_16B(element,vector) (*((signed char*)&##vector + ##element))
92 
93 #define _MM_8UW(element,vector) (*((unsigned short*)&##vector + ##element))
94 #define _MM_8W(element,vector) (*((short*)&##vector + ##element))
95 
96 #define _MM_4UDW(element,vector) (*((unsigned int*)&##vector + ##element))
97 #define _MM_4DW(element,vector) (*((int*)&##vector + ##element))
98 
99 #define _MM_2QW(element,vector) (*((__int64*)&##vector + ##element))
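/* Illustrative usage (example only, not part of the original header): these
 * macros reinterpret the vector in memory and index a single lane.
 *
 *     __m128i v = _mm_set_epi32(3, 2, 1, 0);
 *     int lane2  = _MM_4DW(2, v);   // == 2
 *     __int64 hi = _MM_2QW(1, v);   // upper quadword == 0x0000000300000002
 */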
100 
101 
102 /* We need an __m128i constant, keeping performance in mind */
103 
104 #pragma warning(push)
105 #pragma warning(disable : 4640)
106 inline const __m128i get_mask128()
107 {
108  static const __m128i mask128 = _mm_set1_epi64(M64(0xffffffffffffffffi64));
109  return mask128;
110 }
111 #pragma warning(pop)
112 
113 
114 //DEVDIV Remove alias created in public\sdk\inc\winnt.h
115 #ifdef M128
116 #undef M128
117 #endif /* M128 */
118 #ifdef PM128
119 #undef PM128
120 #endif /* PM128 */
121 //end DEVDIV
122 
123 /* M128 Class:
124  * 1 element, a __m128i data type
125  * Constructors & Logical Operations
126  */
127 
128 class M128
129 {
130 protected:
131  __m128i vec;
132 
133 public:
134  M128() { }
135  M128(__m128i mm) { vec = mm; }
136 
137  operator __m128i() const { return vec; }
138 
139  /* Logical Operations */
140  M128& operator&=(const M128 &a) { return *this = (M128) _mm_and_si128(vec,a); }
141  M128& operator|=(const M128 &a) { return *this = (M128) _mm_or_si128(vec,a); }
142  M128& operator^=(const M128 &a) { return *this = (M128) _mm_xor_si128(vec,a); }
143 
144 };
145 
146 inline M128 operator&(const M128 &a, const M128 &b) { return _mm_and_si128(a,b); }
147 inline M128 operator|(const M128 &a, const M128 &b) { return _mm_or_si128(a,b); }
148 inline M128 operator^(const M128 &a, const M128 &b) { return _mm_xor_si128(a,b); }
149 inline M128 andnot(const M128 &a, const M128 &b) { return _mm_andnot_si128(a,b); }
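/* Illustrative usage (example only, not part of the original header): M128 and
 * its derived classes support the bitwise operators above; andnot(a,b)
 * computes (~a) & b.
 *
 *     M128 a = _mm_set1_epi32(0x0F0F0F0F);
 *     M128 b = _mm_set1_epi32(0x00FF00FF);
 *     M128 c = a & b;          // 0x000F000F in every 32-bit lane
 *     M128 d = andnot(a, b);   // 0x00F000F0 in every 32-bit lane
 */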
150 
151 /* I128vec1 Class:
152  * 1 element, a __m128i data type
153  * Contains operations which can operate on any __m128i data type
154  */
155 
156 class I128vec1 : public M128
157 {
158 public:
159  I128vec1() { }
160  I128vec1(__m128i mm) : M128(mm) { }
161 
162  I128vec1& operator= (const M128 &a) { return *this = (I128vec1) a; }
163  I128vec1& operator&=(const M128 &a) { return *this = (I128vec1) _mm_and_si128(vec,a); }
164  I128vec1& operator|=(const M128 &a) { return *this = (I128vec1) _mm_or_si128(vec,a); }
165  I128vec1& operator^=(const M128 &a) { return *this = (I128vec1) _mm_xor_si128(vec,a); }
166 
167 };
168 
169 /* I64vec2 Class:
170  * 2 elements, each element signed or unsigned 64-bit integer
171  */
172 class I64vec2 : public M128
173 {
174 public:
175  I64vec2() { }
176  I64vec2(__m128i mm) : M128(mm) { }
177 
178  I64vec2(__m64 q1, __m64 q0)
179  {
180  _MM_2QW(0,vec) = *(__int64*)&q0;
181  _MM_2QW(1,vec) = *(__int64*)&q1;
182  }
183 
184  /* Assignment Operator */
185  I64vec2& operator= (const M128 &a) { return *this = (I64vec2) a; }
186 
187  /* Logical Assignment Operators */
188  I64vec2& operator&=(const M128 &a) { return *this = (I64vec2) _mm_and_si128(vec,a); }
189  I64vec2& operator|=(const M128 &a) { return *this = (I64vec2) _mm_or_si128(vec,a); }
190  I64vec2& operator^=(const M128 &a) { return *this = (I64vec2) _mm_xor_si128(vec,a); }
191 
192  /* Addition & Subtraction Assignment Operators */
193  I64vec2& operator +=(const I64vec2 &a) { return *this = (I64vec2) _mm_add_epi64(vec,a); }
194  I64vec2& operator -=(const I64vec2 &a) { return *this = (I64vec2) _mm_sub_epi64(vec,a); }
195 
196  /* Shift Logical Operators */
197  I64vec2 operator<<(const I64vec2 &a) { return _mm_sll_epi64(vec,a); }
198  I64vec2 operator<<(int count) { return _mm_slli_epi64(vec,count); }
199  I64vec2& operator<<=(const I64vec2 &a) { return *this = (I64vec2) _mm_sll_epi64(vec,a); }
200  I64vec2& operator<<=(int count) { return *this = (I64vec2) _mm_slli_epi64(vec,count); }
201  I64vec2 operator>>(const I64vec2 &a) { return _mm_srl_epi64(vec,a); }
202  I64vec2 operator>>(int count) { return _mm_srli_epi64(vec,count); }
203  I64vec2& operator>>=(const I64vec2 &a) { return *this = (I64vec2) _mm_srl_epi64(vec,a); }
204  I64vec2& operator>>=(int count) { return *this = (I64vec2) _mm_srli_epi64(vec,count); }
205 
206  /* Element Access for Debug, No data modified */
207  const __int64& operator[](int i)const
208  {
209  _VEC_ASSERT(static_cast<unsigned int>(i) < 2); /* Only 2 elements to access */
210  return _MM_2QW(i,vec);
211  }
212 
213  /* Element Access and Assignment for Debug */
214  __int64& operator[](int i)
215  {
216  _VEC_ASSERT(static_cast<unsigned int>(i) < 2); /* Only 2 elements to access */
217  return _MM_2QW(i,vec);
218  }
219 
220 
221 };
222 
223 /* Unpacks */
224 inline I64vec2 unpack_low(const I64vec2 &a, const I64vec2 &b) {return _mm_unpacklo_epi64(a,b); }
225 inline I64vec2 unpack_high(const I64vec2 &a, const I64vec2 &b) {return _mm_unpackhi_epi64(a,b); }
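/* Illustrative usage (example only, not part of the original header): the
 * 64-bit lanes can be shifted logically and recombined with the unpacks.
 *
 *     I64vec2 a = _mm_set_epi32(0, 0x10, 0, 0x01);  // lanes: [1]=0x10,  [0]=0x01
 *     I64vec2 b = a << 4;                           // lanes: [1]=0x100, [0]=0x10
 *     I64vec2 lo = unpack_low(a, b);                // lanes: [1]=0x10,  [0]=0x01
 */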
226 
227 /* I32vec4 Class:
228  * 4 elements, each element either a signed or unsigned int
229  */
230 class I32vec4 : public M128
231 {
232 public:
233  I32vec4() { }
234  I32vec4(__m128i mm) : M128(mm) { }
235  I32vec4(int i3, int i2, int i1, int i0) {vec = _mm_set_epi32(i3, i2, i1, i0);}
236 
237  /* Assignment Operator */
238  I32vec4& operator= (const M128 &a) { return *this = (I32vec4) a; }
239 
240  /* Logicals Operators */
241  I32vec4& operator&=(const M128 &a) { return *this = (I32vec4) _mm_and_si128(vec,a); }
242  I32vec4& operator|=(const M128 &a) { return *this = (I32vec4) _mm_or_si128(vec,a); }
243  I32vec4& operator^=(const M128 &a) { return *this = (I32vec4) _mm_xor_si128(vec,a); }
244 
245  /* Addition & Subtraction Assignment Operators */
246  I32vec4& operator +=(const I32vec4 &a) { return *this = (I32vec4)_mm_add_epi32(vec,a); }
247  I32vec4& operator -=(const I32vec4 &a) { return *this = (I32vec4)_mm_sub_epi32(vec,a); }
248 
249  /* Shift Logical Operators */
250  I32vec4 operator<<(const I32vec4 &a) { return _mm_sll_epi32(vec,a); }
251  I32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
252  I32vec4& operator<<=(const I32vec4 &a) { return *this = (I32vec4)_mm_sll_epi32(vec,a); }
253  I32vec4& operator<<=(int count) { return *this = (I32vec4)_mm_slli_epi32(vec,count); }
254 
255 };
256 
257 inline I32vec4 cmpeq(const I32vec4 &a, const I32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
258 inline I32vec4 cmpneq(const I32vec4 &a, const I32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b), get_mask128()); }
259 
260 inline I32vec4 unpack_low(const I32vec4 &a, const I32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
261 inline I32vec4 unpack_high(const I32vec4 &a, const I32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
262 
263 /* Is32vec4 Class:
264  * 4 elements, each element signed integer
265  */
266 class Is32vec4 : public I32vec4
267 {
268 public:
269  Is32vec4() { }
270  Is32vec4(__m128i mm) : I32vec4(mm) { }
271  Is32vec4(int i3, int i2, int i1, int i0) : I32vec4(i3, i2, i1, i0){}
272 
273  /* Assignment Operator */
274  Is32vec4& operator= (const M128 &a) { return *this = (Is32vec4) a; }
275 
276  /* Logical Operators */
277  Is32vec4& operator&=(const M128 &a) { return *this = (Is32vec4) _mm_and_si128(vec,a); }
278  Is32vec4& operator|=(const M128 &a) { return *this = (Is32vec4) _mm_or_si128(vec,a); }
279  Is32vec4& operator^=(const M128 &a) { return *this = (Is32vec4) _mm_xor_si128(vec,a); }
280 
281  /* Addition & Subtraction Assignment Operators */
282  Is32vec4& operator +=(const I32vec4 &a) { return *this = (Is32vec4)_mm_add_epi32(vec,a); }
283  Is32vec4& operator -=(const I32vec4 &a) { return *this = (Is32vec4)_mm_sub_epi32(vec,a); }
284 
285  /* Shift Logical Operators */
286  Is32vec4 operator<<(const M128 &a) { return _mm_sll_epi32(vec,a); }
287  Is32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
288  Is32vec4& operator<<=(const M128 &a) { return *this = (Is32vec4)_mm_sll_epi32(vec,a); }
289  Is32vec4& operator<<=(int count) { return *this = (Is32vec4)_mm_slli_epi32(vec,count); }
290  /* Shift Arithmetic Operations */
291  Is32vec4 operator>>(const M128 &a) { return _mm_sra_epi32(vec,a); }
292  Is32vec4 operator>>(int count) { return _mm_srai_epi32(vec,count); }
293  Is32vec4& operator>>=(const M128 &a) { return *this = (Is32vec4) _mm_sra_epi32(vec,a); }
294  Is32vec4& operator>>=(int count) { return *this = (Is32vec4) _mm_srai_epi32(vec,count); }
295 
296 #if defined (_ENABLE_VEC_DEBUG)
297  /* Output for Debug */
298  friend std::ostream& operator<< (std::ostream &os, const Is32vec4 &a)
299  {
300  os << "[3]:" << _MM_4DW(3,a)
301  << " [2]:" << _MM_4DW(2,a)
302  << " [1]:" << _MM_4DW(1,a)
303  << " [0]:" << _MM_4DW(0,a);
304  return os;
305  }
306 #endif /* defined (_ENABLE_VEC_DEBUG) */
307 
308  /* Element Access for Debug, No data modified */
309  const int& operator[](int i)const
310  {
311  _VEC_ASSERT(static_cast<unsigned int>(i) < 4); /* Only 4 elements to access */
312  return _MM_4DW(i,vec);
313  }
314 
315  /* Element Access for Debug */
316  int& operator[](int i)
317  {
318  _VEC_ASSERT(static_cast<unsigned int>(i) < 4); /* Only 4 elements to access */
319  return _MM_4DW(i,vec);
320  }
321 };
322 
323 /* Compares */
324 inline Is32vec4 cmpeq(const Is32vec4 &a, const Is32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
325 inline Is32vec4 cmpneq(const Is32vec4 &a, const Is32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b), get_mask128()); }
326 inline Is32vec4 cmpgt(const Is32vec4 &a, const Is32vec4 &b) { return _mm_cmpgt_epi32(a,b); }
327 inline Is32vec4 cmplt(const Is32vec4 &a, const Is32vec4 &b) { return _mm_cmpgt_epi32(b,a); }
328 
329 /* Unpacks */
330 inline Is32vec4 unpack_low(const Is32vec4 &a, const Is32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
331 inline Is32vec4 unpack_high(const Is32vec4 &a, const Is32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
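/* Illustrative usage (example only, not part of the original header): compares
 * return all-ones in lanes where the relation holds and zero elsewhere, so the
 * result can be used directly as a bit mask.
 *
 *     Is32vec4 a(4, 3, 2, 1), b(1, 2, 3, 4);
 *     Is32vec4 gt = cmpgt(a, b);   // lanes [3],[2] = ~0, lanes [1],[0] = 0
 *     Is32vec4 keep = gt & a;      // keeps a where a > b, zero elsewhere
 */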
332 
333 /* Iu32vec4 Class:
334  * 4 elements, each element unsigned int
335  */
336 class Iu32vec4 : public I32vec4
337 {
338 public:
339  Iu32vec4() { }
340  Iu32vec4(__m128i mm) : I32vec4(mm) { }
341  Iu32vec4(unsigned int ui3, unsigned int ui2, unsigned int ui1, unsigned int ui0)
342  : I32vec4(ui3, ui2, ui1, ui0) { }
343 
344  /* Assignment Operator */
345  Iu32vec4& operator= (const M128 &a) { return *this = (Iu32vec4) a; }
346 
347  /* Logical Assignment Operators */
348  Iu32vec4& operator&=(const M128 &a) { return *this = (Iu32vec4) _mm_and_si128(vec,a); }
349  Iu32vec4& operator|=(const M128 &a) { return *this = (Iu32vec4) _mm_or_si128(vec,a); }
350  Iu32vec4& operator^=(const M128 &a) { return *this = (Iu32vec4) _mm_xor_si128(vec,a); }
351 
352  /* Addition & Subtraction Assignment Operators */
353  Iu32vec4& operator +=(const I32vec4 &a) { return *this = (Iu32vec4)_mm_add_epi32(vec,a); }
354  Iu32vec4& operator -=(const I32vec4 &a) { return *this = (Iu32vec4)_mm_sub_epi32(vec,a); }
355 
356  /* Shift Logical Operators */
357  Iu32vec4 operator<<(const M128 &a) { return _mm_sll_epi32(vec,a); }
358  Iu32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
359  Iu32vec4& operator<<=(const M128 &a) { return *this = (Iu32vec4)_mm_sll_epi32(vec,a); }
360  Iu32vec4& operator<<=(int count) { return *this = (Iu32vec4)_mm_slli_epi32(vec,count); }
361  Iu32vec4 operator>>(const M128 &a) { return _mm_srl_epi32(vec,a); }
362  Iu32vec4 operator>>(int count) { return _mm_srli_epi32(vec,count); }
363  Iu32vec4& operator>>=(const M128 &a) { return *this = (Iu32vec4) _mm_srl_epi32(vec,a); }
364  Iu32vec4& operator>>=(int count) { return *this = (Iu32vec4) _mm_srli_epi32(vec,count); }
365 
366 #if defined (_ENABLE_VEC_DEBUG)
367  /* Output for Debug */
368  friend std::ostream& operator<< (std::ostream &os, const Iu32vec4 &a)
369  {
370  os << "[3]:" << _MM_4UDW(3,a)
371  << " [2]:" << _MM_4UDW(2,a)
372  << " [1]:" << _MM_4UDW(1,a)
373  << " [0]:" << _MM_4UDW(0,a);
374  return os;
375  }
376 #endif /* defined (_ENABLE_VEC_DEBUG) */
377 
378  /* Element Access for Debug, No data modified */
379  const unsigned int& operator[](int i)const
380  {
381  _VEC_ASSERT(static_cast<unsigned int>(i) < 4); /* Only 4 elements to access */
382  return _MM_4UDW(i,vec);
383  }
384 
385  /* Element Access and Assignment for Debug */
386  unsigned int& operator[](int i)
387  {
388  _VEC_ASSERT(static_cast<unsigned int>(i) < 4); /* Only 4 elements to access */
389  return _MM_4UDW(i,vec);
390  }
391 };
392 
393 inline I64vec2 operator*(const Iu32vec4 &a, const Iu32vec4 &b) { return _mm_mul_epu32(a,b); }
394 inline Iu32vec4 cmpeq(const Iu32vec4 &a, const Iu32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
395 inline Iu32vec4 cmpneq(const Iu32vec4 &a, const Iu32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b), get_mask128()); }
396 
397 inline Iu32vec4 unpack_low(const Iu32vec4 &a, const Iu32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
398 inline Iu32vec4 unpack_high(const Iu32vec4 &a, const Iu32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
399 
400 /* I16vec8 Class:
401  * 8 elements, each element either unsigned or signed short
402  */
403 class I16vec8 : public M128
404 {
405 public:
406  I16vec8() { }
407  I16vec8(__m128i mm) : M128(mm) { }
408  I16vec8(short s7, short s6, short s5, short s4, short s3, short s2, short s1, short s0)
409  {
410  vec = _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
411  }
412 
413  /* Assignment Operator */
414  I16vec8& operator= (const M128 &a) { return *this = (I16vec8) a; }
415 
416  /* Logical Assignment Operators */
417  I16vec8& operator&=(const M128 &a) { return *this = (I16vec8) _mm_and_si128(vec,a); }
418  I16vec8& operator|=(const M128 &a) { return *this = (I16vec8) _mm_or_si128(vec,a); }
419  I16vec8& operator^=(const M128 &a) { return *this = (I16vec8) _mm_xor_si128(vec,a); }
420 
421  /* Addition & Subtraction Assignment Operators */
422  I16vec8& operator +=(const I16vec8 &a) { return *this = (I16vec8) _mm_add_epi16(vec,a); }
423  I16vec8& operator -=(const I16vec8 &a) { return *this = (I16vec8) _mm_sub_epi16(vec,a); }
424  I16vec8& operator *=(const I16vec8 &a) { return *this = (I16vec8) _mm_mullo_epi16(vec,a); }
425 
426  /* Shift Logical Operators */
427  I16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
428  I16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
429  I16vec8& operator<<=(const M128 &a) { return *this = (I16vec8)_mm_sll_epi16(vec,a); }
430  I16vec8& operator<<=(int count) { return *this = (I16vec8)_mm_slli_epi16(vec,count); }
431 
432 };
433 
434 
435 inline I16vec8 operator*(const I16vec8 &a, const I16vec8 &b) { return _mm_mullo_epi16(a,b); }
436 
437 inline I16vec8 cmpeq(const I16vec8 &a, const I16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
438 inline I16vec8 cmpneq(const I16vec8 &a, const I16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b), get_mask128()); }
439 
440 inline I16vec8 unpack_low(const I16vec8 &a, const I16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
441 inline I16vec8 unpack_high(const I16vec8 &a, const I16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
442 
443 /* Is16vec8 Class:
444  * 8 elements, each element signed short
445  */
446 class Is16vec8 : public I16vec8
447 {
448 public:
449  Is16vec8() { }
450  Is16vec8(__m128i mm) : I16vec8(mm) { }
451  Is16vec8(signed short s7, signed short s6, signed short s5,
452  signed short s4, signed short s3, signed short s2,
453  signed short s1, signed short s0)
454  : I16vec8(s7, s6, s5, s4, s3, s2, s1, s0) { }
455 
456  /* Assignment Operator */
457  Is16vec8& operator= (const M128 &a) { return *this = (Is16vec8) a; }
458 
459  /* Logical Assignment Operators */
460  Is16vec8& operator&=(const M128 &a) { return *this = (Is16vec8) _mm_and_si128(vec,a); }
461  Is16vec8& operator|=(const M128 &a) { return *this = (Is16vec8) _mm_or_si128(vec,a); }
462  Is16vec8& operator^=(const M128 &a) { return *this = (Is16vec8) _mm_xor_si128(vec,a); }
463 
464  /* Addition & Subtraction Assignment Operators */
465  Is16vec8& operator +=(const I16vec8 &a) { return *this = (Is16vec8) _mm_add_epi16(vec,a); }
466  Is16vec8& operator -=(const I16vec8 &a) { return *this = (Is16vec8) _mm_sub_epi16(vec,a); }
467  Is16vec8& operator *=(const I16vec8 &a) { return *this = (Is16vec8) _mm_mullo_epi16(vec,a); }
468 
469  /* Shift Logical Operators */
470  Is16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
471  Is16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
472  Is16vec8& operator<<=(const M128 &a) { return *this = (Is16vec8)_mm_sll_epi16(vec,a); }
473  Is16vec8& operator<<=(int count) { return *this = (Is16vec8)_mm_slli_epi16(vec,count); }
474  /* Shift Arithmetic Operators */
475  Is16vec8 operator>>(const M128 &a) { return _mm_sra_epi16(vec,a); }
476  Is16vec8 operator>>(int count) { return _mm_srai_epi16(vec,count); }
477  Is16vec8& operator>>=(const M128 &a) { return *this = (Is16vec8)_mm_sra_epi16(vec,a); }
478  Is16vec8& operator>>=(int count) { return *this = (Is16vec8)_mm_srai_epi16(vec,count); }
479 
480 #if defined (_ENABLE_VEC_DEBUG)
481  /* Output for Debug */
482  friend std::ostream& operator<< (std::ostream &os, const Is16vec8 &a)
483  {
484  os << "[7]:" << _MM_8W(7,a)
485  << " [6]:" << _MM_8W(6,a)
486  << " [5]:" << _MM_8W(5,a)
487  << " [4]:" << _MM_8W(4,a)
488  << " [3]:" << _MM_8W(3,a)
489  << " [2]:" << _MM_8W(2,a)
490  << " [1]:" << _MM_8W(1,a)
491  << " [0]:" << _MM_8W(0,a);
492  return os;
493  }
494 #endif /* defined (_ENABLE_VEC_DEBUG) */
495 
496  /* Element Access for Debug, No data modified */
497  const signed short& operator[](int i)const
498  {
499  _VEC_ASSERT(static_cast<unsigned int>(i) < 8); /* Only 8 elements to access */
500  return _MM_8W(i,vec);
501  }
502 
503  /* Element Access and Assignment for Debug */
504  signed short& operator[](int i)
505  {
506  _VEC_ASSERT(static_cast<unsigned int>(i) < 8); /* Only 8 elements to access */
507  return _MM_8W(i,vec);
508  }
509 };
510 
511 inline Is16vec8 operator*(const Is16vec8 &a, const Is16vec8 &b) { return _mm_mullo_epi16(a,b); }
512 
513 
514 /* Additional Is16vec8 functions: compares, unpacks, sat add/sub */
515 inline Is16vec8 cmpeq(const Is16vec8 &a, const Is16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
516 inline Is16vec8 cmpneq(const Is16vec8 &a, const Is16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b), get_mask128()); }
517 inline Is16vec8 cmpgt(const Is16vec8 &a, const Is16vec8 &b) { return _mm_cmpgt_epi16(a,b); }
518 inline Is16vec8 cmplt(const Is16vec8 &a, const Is16vec8 &b) { return _mm_cmpgt_epi16(b,a); }
519 
520 inline Is16vec8 unpack_low(const Is16vec8 &a, const Is16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
521 inline Is16vec8 unpack_high(const Is16vec8 &a, const Is16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
522 
523 inline Is16vec8 mul_high(const Is16vec8 &a, const Is16vec8 &b) { return _mm_mulhi_epi16(a,b); }
524 inline Is32vec4 mul_add(const Is16vec8 &a, const Is16vec8 &b) { return _mm_madd_epi16(a,b);}
525 
526 inline Is16vec8 sat_add(const Is16vec8 &a, const Is16vec8 &b) { return _mm_adds_epi16(a,b); }
527 inline Is16vec8 sat_sub(const Is16vec8 &a, const Is16vec8 &b) { return _mm_subs_epi16(a,b); }
528 
529 inline Is16vec8 simd_max(const Is16vec8 &a, const Is16vec8 &b) { return _mm_max_epi16(a,b); }
530 inline Is16vec8 simd_min(const Is16vec8 &a, const Is16vec8 &b) { return _mm_min_epi16(a,b); }
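/* Illustrative usage (example only, not part of the original header): sat_add
 * clamps to the signed 16-bit range instead of wrapping, and mul_add sums
 * adjacent 16x16 products into 32-bit lanes.
 *
 *     Is16vec8 a = _mm_set1_epi16(30000), b = _mm_set1_epi16(10000);
 *     Is16vec8 s  = sat_add(a, b);   // every lane == 32767 (no wraparound)
 *     Is32vec4 dp = mul_add(a, b);   // every lane == 2 * 30000 * 10000
 */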
531 
532 
533 /* Iu16vec8 Class:
534  * 8 elements, each element unsigned short
535  */
536 class Iu16vec8 : public I16vec8
537 {
538 public:
539  Iu16vec8() { }
540  Iu16vec8(__m128i mm) : I16vec8(mm) { }
541  Iu16vec8(unsigned short s7, unsigned short s6, unsigned short s5,
542  unsigned short s4, unsigned short s3, unsigned short s2,
543  unsigned short s1, unsigned short s0)
544  : I16vec8(s7, s6, s5, s4, s3, s2, s1, s0) { }
545 
546  /* Assignment Operator */
547  Iu16vec8& operator= (const M128 &a) { return *this = (Iu16vec8) a; }
548  /* Logical Assignment Operators */
549  Iu16vec8& operator&=(const M128 &a) { return *this = (Iu16vec8) _mm_and_si128(vec,a); }
550  Iu16vec8& operator|=(const M128 &a) { return *this = (Iu16vec8) _mm_or_si128(vec,a); }
551  Iu16vec8& operator^=(const M128 &a) { return *this = (Iu16vec8) _mm_xor_si128(vec,a); }
552  /* Addition & Subtraction Assignment Operators */
553  Iu16vec8& operator +=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_add_epi16(vec,a); }
554  Iu16vec8& operator -=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_sub_epi16(vec,a); }
555  Iu16vec8& operator *=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_mullo_epi16(vec,a); }
556 
557  /* Shift Logical Operators */
558  Iu16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
559  Iu16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
560  Iu16vec8& operator<<=(const M128 &a) { return *this = (Iu16vec8)_mm_sll_epi16(vec,a); }
561  Iu16vec8& operator<<=(int count) { return *this = (Iu16vec8)_mm_slli_epi16(vec,count); }
562  Iu16vec8 operator>>(const M128 &a) { return _mm_srl_epi16(vec,a); }
563  Iu16vec8 operator>>(int count) { return _mm_srli_epi16(vec,count); }
564  Iu16vec8& operator>>=(const M128 &a) { return *this = (Iu16vec8) _mm_srl_epi16(vec,a); }
565  Iu16vec8& operator>>=(int count) { return *this = (Iu16vec8) _mm_srli_epi16(vec,count); }
566 
567 
568 #if defined (_ENABLE_VEC_DEBUG)
569  /* Output for Debug */
570  friend std::ostream& operator << (std::ostream &os, const Iu16vec8 &a)
571  {
572  os << "[7]:" << (unsigned short)(_MM_8UW(7,a))
573  << " [6]:" << (unsigned short)(_MM_8UW(6,a))
574  << " [5]:" << (unsigned short)(_MM_8UW(5,a))
575  << " [4]:" << (unsigned short)(_MM_8UW(4,a))
576  << " [3]:" << (unsigned short)(_MM_8UW(3,a))
577  << " [2]:" << (unsigned short)(_MM_8UW(2,a))
578  << " [1]:" << (unsigned short)(_MM_8UW(1,a))
579  << " [0]:" << (unsigned short)(_MM_8UW(0,a));
580  return os;
581  }
582 #endif /* defined (_ENABLE_VEC_DEBUG) */
583 
584  /* Element Access for Debug, No data modified */
585  const unsigned short& operator[](int i)const
586  {
587  _VEC_ASSERT(static_cast<unsigned int>(i) < 8); /* Only 8 elements to access */
588  return _MM_8UW(i,vec);
589  }
590 
591  /* Element Access for Debug */
592  unsigned short& operator[](int i)
593  {
594  _VEC_ASSERT(static_cast<unsigned int>(i) < 8); /* Only 8 elements to access */
595  return _MM_8UW(i,vec);
596  }
597 };
598 
599 inline Iu16vec8 operator*(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_mullo_epi16(a,b); }
600 
601 /* Additional Iu16vec8 functions: cmpeq,cmpneq, unpacks, sat add/sub */
602 inline Iu16vec8 cmpeq(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
603 inline Iu16vec8 cmpneq(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b), get_mask128()); }
604 
605 inline Iu16vec8 unpack_low(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
606 inline Iu16vec8 unpack_high(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
607 
608 inline Iu16vec8 sat_add(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_adds_epu16(a,b); }
609 inline Iu16vec8 sat_sub(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_subs_epu16(a,b); }
610 
611 inline Iu16vec8 simd_avg(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_avg_epu16(a,b); }
612 inline I16vec8 mul_high(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_mulhi_epu16(a,b); }
613 
614 /* I8vec16 Class:
615  * 16 elements, each element either unsigned or signed char
616  */
617 class I8vec16 : public M128
618 {
619 public:
620  I8vec16() { }
621  I8vec16(__m128i mm) : M128(mm) { }
622  I8vec16(char s15, char s14, char s13, char s12, char s11, char s10,
623  char s9, char s8, char s7, char s6, char s5, char s4,
624  char s3, char s2, char s1, char s0)
625  {
626  vec = _mm_set_epi8(s15, s14, s13, s12, s11, s10, s9, s8, s7, s6, s5, s4, s3, s2, s1, s0);
627  }
628 
629  /* Assignment Operator */
630  I8vec16& operator= (const M128 &a) { return *this = (I8vec16) a; }
631 
632  /* Logical Assignment Operators */
633  I8vec16& operator&=(const M128 &a) { return *this = (I8vec16) _mm_and_si128(vec,a); }
634  I8vec16& operator|=(const M128 &a) { return *this = (I8vec16) _mm_or_si128(vec,a); }
635  I8vec16& operator^=(const M128 &a) { return *this = (I8vec16) _mm_xor_si128(vec,a); }
636 
637  /* Addition & Subtraction Assignment Operators */
638  I8vec16& operator +=(const I8vec16 &a) { return *this = (I8vec16) _mm_add_epi8(vec,a); }
639  I8vec16& operator -=(const I8vec16 &a) { return *this = (I8vec16) _mm_sub_epi8(vec,a); }
640 
641 };
642 
643 inline I8vec16 cmpeq(const I8vec16 &a, const I8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
644 inline I8vec16 cmpneq(const I8vec16 &a, const I8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b), get_mask128()); }
645 
646 inline I8vec16 unpack_low(const I8vec16 &a, const I8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
647 inline I8vec16 unpack_high(const I8vec16 &a, const I8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
648 
649 /* Is8vec16 Class:
650  * 16 elements, each element a signed char
651  */
652 class Is8vec16 : public I8vec16
653 {
654 public:
655  Is8vec16() { }
656  Is8vec16(__m128i mm) : I8vec16(mm) { }
657  Is8vec16(char s15, char s14, char s13, char s12, char s11, char s10,
658  char s9, char s8, char s7, char s6, char s5, char s4,
659  char s3, char s2, char s1, char s0)
660  : I8vec16(s15, s14, s13, s12, s11, s10, s9, s8,
661  s7, s6, s5, s4, s3, s2, s1, s0) { }
662 
663  /* Assignment Operator */
664  Is8vec16& operator= (const M128 &a) { return *this = (Is8vec16) a; }
665 
666  /* Logical Assignment Operators */
667  Is8vec16& operator&=(const M128 &a) { return *this = (Is8vec16) _mm_and_si128(vec,a); }
668  Is8vec16& operator|=(const M128 &a) { return *this = (Is8vec16) _mm_or_si128(vec,a); }
669  Is8vec16& operator^=(const M128 &a) { return *this = (Is8vec16) _mm_xor_si128(vec,a); }
670 
671  /* Addition & Subtraction Assignment Operators */
672  Is8vec16& operator +=(const I8vec16 &a) { return *this = (Is8vec16) _mm_add_epi8(vec,a); }
673  Is8vec16& operator -=(const I8vec16 &a) { return *this = (Is8vec16) _mm_sub_epi8(vec,a); }
674 
675 #if defined (_ENABLE_VEC_DEBUG)
676  /* Output for Debug */
677  friend std::ostream& operator << (std::ostream &os, const Is8vec16 &a)
678  {
679  os << "[15]:" << short(_MM_16B(15,a))
680  << " [14]:" << short(_MM_16B(14,a))
681  << " [13]:" << short(_MM_16B(13,a))
682  << " [12]:" << short(_MM_16B(12,a))
683  << " [11]:" << short(_MM_16B(11,a))
684  << " [10]:" << short(_MM_16B(10,a))
685  << " [9]:" << short(_MM_16B(9,a))
686  << " [8]:" << short(_MM_16B(8,a))
687  << " [7]:" << short(_MM_16B(7,a))
688  << " [6]:" << short(_MM_16B(6,a))
689  << " [5]:" << short(_MM_16B(5,a))
690  << " [4]:" << short(_MM_16B(4,a))
691  << " [3]:" << short(_MM_16B(3,a))
692  << " [2]:" << short(_MM_16B(2,a))
693  << " [1]:" << short(_MM_16B(1,a))
694  << " [0]:" << short(_MM_16B(0,a));
695  return os;
696  }
697 #endif /* defined (_ENABLE_VEC_DEBUG) */
698 
699  /* Element Access for Debug, No data modified */
700  const signed char& operator[](int i)const
701  {
702  _VEC_ASSERT(static_cast<unsigned int>(i) < 16); /* Only 16 elements to access */
703  return _MM_16B(i,vec);
704  }
705 
706  /* Element Access for Debug */
707  signed char& operator[](int i)
708  {
709  _VEC_ASSERT(static_cast<unsigned int>(i) < 16); /* Only 16 elements to access */
710  return _MM_16B(i,vec);
711  }
712 
713 };
714 
715 inline Is8vec16 cmpeq(const Is8vec16 &a, const Is8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
716 inline Is8vec16 cmpneq(const Is8vec16 &a, const Is8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b), get_mask128()); }
717 inline Is8vec16 cmpgt(const Is8vec16 &a, const Is8vec16 &b) { return _mm_cmpgt_epi8(a,b); }
718 inline Is8vec16 cmplt(const Is8vec16 &a, const Is8vec16 &b) { return _mm_cmplt_epi8(a,b); }
719 
720 inline Is8vec16 unpack_low(const Is8vec16 &a, const Is8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
721 inline Is8vec16 unpack_high(const Is8vec16 &a, const Is8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
722 
723 inline Is8vec16 sat_add(const Is8vec16 &a, const Is8vec16 &b) { return _mm_adds_epi8(a,b); }
724 inline Is8vec16 sat_sub(const Is8vec16 &a, const Is8vec16 &b) { return _mm_subs_epi8(a,b); }
725 
726 /* Iu8vec16 Class:
727  * 16 elements, each element an unsigned char
728  */
729 class Iu8vec16 : public I8vec16
730 {
731 public:
732  Iu8vec16() { }
733  Iu8vec16(__m128i mm) : I8vec16(mm) { }
734  Iu8vec16(unsigned char u15, unsigned char u14, unsigned char u13,
735  unsigned char u12, unsigned char u11, unsigned char u10,
736  unsigned char u9, unsigned char u8, unsigned char u7,
737  unsigned char u6, unsigned char u5, unsigned char u4,
738  unsigned char u3, unsigned char u2, unsigned char u1,
739  unsigned char u0)
740  : I8vec16(u15, u14, u13, u12, u11, u10, u9, u8,
741  u7, u6, u5, u4, u3, u2, u1, u0) { }
742 
743  /* Assignment Operator */
744  Iu8vec16& operator= (const M128 &a) { return *this = (Iu8vec16) a; }
745 
746  /* Logical Assignment Operators */
747  Iu8vec16& operator&=(const M128 &a) { return *this = (Iu8vec16) _mm_and_si128(vec,a); }
748  Iu8vec16& operator|=(const M128 &a) { return *this = (Iu8vec16) _mm_or_si128(vec,a); }
749  Iu8vec16& operator^=(const M128 &a) { return *this = (Iu8vec16) _mm_xor_si128(vec,a); }
750 
751  /* Addition & Subtraction Assignment Operators */
752  Iu8vec16& operator +=(const I8vec16 &a) { return *this = (Iu8vec16) _mm_add_epi8(vec,a); }
753  Iu8vec16& operator -=(const I8vec16 &a) { return *this = (Iu8vec16) _mm_sub_epi8(vec,a); }
754 
755 #if defined (_ENABLE_VEC_DEBUG)
756  /* Output for Debug */
757  friend std::ostream& operator << (std::ostream &os, const Iu8vec16 &a)
758  {
759  os << "[15]:" << (unsigned char)(_MM_16UB(15,a))
760  << " [14]:" << (unsigned char)(_MM_16UB(14,a))
761  << " [13]:" << (unsigned char)(_MM_16UB(13,a))
762  << " [12]:" << (unsigned char)(_MM_16UB(12,a))
763  << " [11]:" << (unsigned char)(_MM_16UB(11,a))
764  << " [10]:" << (unsigned char)(_MM_16UB(10,a))
765  << " [9]:" << (unsigned char)(_MM_16UB(9,a))
766  << " [8]:" << (unsigned char)(_MM_16UB(8,a))
767  << " [7]:" << (unsigned char)(_MM_16UB(7,a))
768  << " [6]:" << (unsigned char)(_MM_16UB(6,a))
769  << " [5]:" << (unsigned char)(_MM_16UB(5,a))
770  << " [4]:" << (unsigned char)(_MM_16UB(4,a))
771  << " [3]:" << (unsigned char)(_MM_16UB(3,a))
772  << " [2]:" << (unsigned char)(_MM_16UB(2,a))
773  << " [1]:" << (unsigned char)(_MM_16UB(1,a))
774  << " [0]:" << (unsigned char)(_MM_16UB(0,a));
775  return os;
776  }
777 #endif /* defined (_ENABLE_VEC_DEBUG) */
778 
779  /* Element Access for Debug, No data modified */
780  const unsigned char& operator[](int i)const
781  {
782  _VEC_ASSERT(static_cast<unsigned int>(i) < 16); /* Only 16 elements to access */
783  return _MM_16UB(i,vec);
784  }
785 
786  /* Element Access for Debug */
787  unsigned char& operator[](int i)
788  {
789  _VEC_ASSERT(static_cast<unsigned int>(i) < 16); /* Only 16 elements to access */
790  return _MM_16UB(i,vec);
791  }
792 
793 };
794 
795 inline Iu8vec16 cmpeq(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
796 inline Iu8vec16 cmpneq(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b), get_mask128()); }
797 
798 inline Iu8vec16 unpack_low(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
799 inline Iu8vec16 unpack_high(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
800 
801 inline Iu8vec16 sat_add(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_adds_epu8(a,b); }
802 inline Iu8vec16 sat_sub(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_subs_epu8(a,b); }
803 
804 inline I64vec2 sum_abs(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_sad_epu8(a,b); }
805 
806 inline Iu8vec16 simd_avg(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_avg_epu8(a,b); }
807 inline Iu8vec16 simd_max(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_max_epu8(a,b); }
808 inline Iu8vec16 simd_min(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_min_epu8(a,b); }
809 
810 /* Pack & Saturates */
811 
812 inline Is16vec8 pack_sat(const Is32vec4 &a, const Is32vec4 &b) { return _mm_packs_epi32(a,b); }
813 inline Is8vec16 pack_sat(const Is16vec8 &a, const Is16vec8 &b) { return _mm_packs_epi16(a,b); }
814 inline Iu8vec16 packu_sat(const Is16vec8 &a, const Is16vec8 &b) { return _mm_packus_epi16(a,b);}
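/* Illustrative usage (example only, not part of the original header): pack_sat
 * narrows with signed saturation, packu_sat narrows to unsigned bytes,
 * clamping values outside [0, 255].
 *
 *     Is16vec8 w = _mm_set1_epi16(300);
 *     Is8vec16 s = pack_sat(w, w);    // every byte == 127
 *     Iu8vec16 u = packu_sat(w, w);   // every byte == 255
 */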
815 
816  /********************************* Logicals ****************************************/
817 #define IVEC128_LOGICALS(vect,element) \
818 inline I##vect##vec##element operator& (const I##vect##vec##element &a, const I##vect##vec##element &b) \
819 { return _mm_and_si128( a,b); } \
820 inline I##vect##vec##element operator| (const I##vect##vec##element &a, const I##vect##vec##element &b) \
821 { return _mm_or_si128( a,b); } \
822 inline I##vect##vec##element operator^ (const I##vect##vec##element &a, const I##vect##vec##element &b) \
823 { return _mm_xor_si128( a,b); } \
824 inline I##vect##vec##element andnot (const I##vect##vec##element &a, const I##vect##vec##element &b) \
825 { return _mm_andnot_si128( a,b); }
826 
827 IVEC128_LOGICALS(8,16)
828 IVEC128_LOGICALS(u8,16)
829 IVEC128_LOGICALS(s8,16)
830 IVEC128_LOGICALS(16,8)
831 IVEC128_LOGICALS(u16,8)
832 IVEC128_LOGICALS(s16,8)
833 IVEC128_LOGICALS(32,4)
834 IVEC128_LOGICALS(u32,4)
835 IVEC128_LOGICALS(s32,4)
836 IVEC128_LOGICALS(64,2)
837 IVEC128_LOGICALS(128,1)
838 #undef IVEC128_LOGICALS
839 
840  /********************************* Add & Sub ****************************************/
841 #define IVEC128_ADD_SUB(vect,element,opsize) \
842 inline I##vect##vec##element operator+ (const I##vect##vec##element &a, const I##vect##vec##element &b) \
843 { return _mm_add_##opsize( a,b); } \
844 inline I##vect##vec##element operator- (const I##vect##vec##element &a, const I##vect##vec##element &b) \
845 { return _mm_sub_##opsize( a,b); }
846 
847 IVEC128_ADD_SUB(8,16, epi8)
848 IVEC128_ADD_SUB(u8,16, epi8)
849 IVEC128_ADD_SUB(s8,16, epi8)
850 IVEC128_ADD_SUB(16,8, epi16)
851 IVEC128_ADD_SUB(u16,8, epi16)
852 IVEC128_ADD_SUB(s16,8, epi16)
853 IVEC128_ADD_SUB(32,4, epi32)
854 IVEC128_ADD_SUB(u32,4, epi32)
855 IVEC128_ADD_SUB(s32,4, epi32)
856 IVEC128_ADD_SUB(64,2, epi64)
857 #undef IVEC128_ADD_SUB
858 
859  /************************* Conditional Select ********************************
860  * version of: retval = (a OP b)? c : d; *
861  * Where OP is one of the possible comparison operators. *
862  * Example: r = select_eq(a,b,c,d); *
863  * if "member at position x of the vector a" == *
864  * "member at position x of vector b" *
865  * assign the corresponding member in r from c, else assign from d. *
866  ************************* Conditional Select ********************************/
867 
868 #define IVEC128_SELECT(vect12,vect34,element,selop) \
869  inline I##vect34##vec##element select_##selop ( \
870  const I##vect12##vec##element &a, \
871  const I##vect12##vec##element &b, \
872  const I##vect34##vec##element &c, \
873  const I##vect34##vec##element &d) \
874 { \
875  I##vect12##vec##element mask = cmp##selop(a,b); \
876  return ( I##vect34##vec##element (mask & c ) | \
877  I##vect34##vec##element ((_mm_andnot_si128(mask, d )))); \
878 }
879 
880 IVEC128_SELECT(8,s8,16,eq)
881 IVEC128_SELECT(8,u8,16,eq)
882 IVEC128_SELECT(8,8,16,eq)
883 IVEC128_SELECT(8,s8,16,neq)
884 IVEC128_SELECT(8,u8,16,neq)
885 IVEC128_SELECT(8,8,16,neq)
886 
887 IVEC128_SELECT(16,s16,8,eq)
888 IVEC128_SELECT(16,u16,8,eq)
889 IVEC128_SELECT(16,16,8,eq)
890 IVEC128_SELECT(16,s16,8,neq)
891 IVEC128_SELECT(16,u16,8,neq)
892 IVEC128_SELECT(16,16,8,neq)
893 
894 IVEC128_SELECT(32,s32,4,eq)
895 IVEC128_SELECT(32,u32,4,eq)
896 IVEC128_SELECT(32,32,4,eq)
897 IVEC128_SELECT(32,s32,4,neq)
898 IVEC128_SELECT(32,u32,4,neq)
899 IVEC128_SELECT(32,32,4,neq)
900 
901 IVEC128_SELECT(s8,s8,16,gt)
902 IVEC128_SELECT(s8,u8,16,gt)
903 IVEC128_SELECT(s8,8,16,gt)
904 IVEC128_SELECT(s8,s8,16,lt)
905 IVEC128_SELECT(s8,u8,16,lt)
906 IVEC128_SELECT(s8,8,16,lt)
907 
908 IVEC128_SELECT(s16,s16,8,gt)
909 IVEC128_SELECT(s16,u16,8,gt)
910 IVEC128_SELECT(s16,16,8,gt)
911 IVEC128_SELECT(s16,s16,8,lt)
912 IVEC128_SELECT(s16,u16,8,lt)
913 IVEC128_SELECT(s16,16,8,lt)
914 
915 
916 #undef IVEC128_SELECT
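/* Illustrative usage (example only, not part of the original header): per the
 * comment above, select_gt(a,b,c,d) picks c in lanes where a > b and d
 * elsewhere; with c = a and d = b it yields a per-lane maximum.
 *
 *     Is16vec8 a(0, 0, 0, 0, 5, -2, 7,  0);   // arguments are s7..s0
 *     Is16vec8 b(0, 0, 0, 0, 3,  4, 7, -1);
 *     Is16vec8 m = select_gt(a, b, a, b);     // lower lanes: 5, 4, 7, 0
 */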
917 
918 
919 class F64vec2
920 {
921 protected:
922  __m128d vec;
923 public:
924 
925  /* Constructors: __m128d, 2 doubles */
926  F64vec2() {}
927 
928  /* initialize 2 DP FP with __m128d data type */
929  F64vec2(__m128d m) { vec = m;}
930 
931  /* initialize 2 DP FPs with 2 doubles */
932  F64vec2(double d1, double d0) { vec= _mm_set_pd(d1,d0); }
933 
934  /* Explicitly initialize each of 2 DP FPs with same double */
935  EXPLICIT F64vec2(double d) { vec = _mm_set1_pd(d); }
936 
937  /* Conversion functions */
938  operator __m128d() const { return vec; } /* Convert to __m128d */
939 
940  /* Logical Operators */
941  friend F64vec2 operator &(const F64vec2 &a, const F64vec2 &b) { return _mm_and_pd(a,b); }
942  friend F64vec2 operator |(const F64vec2 &a, const F64vec2 &b) { return _mm_or_pd(a,b); }
943  friend F64vec2 operator ^(const F64vec2 &a, const F64vec2 &b) { return _mm_xor_pd(a,b); }
944 
945  /* Arithmetic Operators */
946  friend F64vec2 operator +(const F64vec2 &a, const F64vec2 &b) { return _mm_add_pd(a,b); }
947  friend F64vec2 operator -(const F64vec2 &a, const F64vec2 &b) { return _mm_sub_pd(a,b); }
948  friend F64vec2 operator *(const F64vec2 &a, const F64vec2 &b) { return _mm_mul_pd(a,b); }
949  friend F64vec2 operator /(const F64vec2 &a, const F64vec2 &b) { return _mm_div_pd(a,b); }
950 
951  F64vec2& operator +=(const F64vec2 &a) { return *this = _mm_add_pd(vec,a); }
952  F64vec2& operator -=(const F64vec2 &a) { return *this = _mm_sub_pd(vec,a); }
953  F64vec2& operator *=(const F64vec2 &a) { return *this = _mm_mul_pd(vec,a); }
954  F64vec2& operator /=(const F64vec2 &a) { return *this = _mm_div_pd(vec,a); }
955  F64vec2& operator &=(const F64vec2 &a) { return *this = _mm_and_pd(vec,a); }
956  F64vec2& operator |=(const F64vec2 &a) { return *this = _mm_or_pd(vec,a); }
957  F64vec2& operator ^=(const F64vec2 &a) { return *this = _mm_xor_pd(vec,a); }
958 
959  /* Horizontal Add */
960  friend double add_horizontal(const F64vec2 &a)
961  {
962  F64vec2 ftemp = _mm_add_sd(a,_mm_shuffle_pd(a, a, 1));
963  return _mm_cvtsd_f64(ftemp);
964  }
965 
966  /* And Not */
967  friend F64vec2 andnot(const F64vec2 &a, const F64vec2 &b) { return _mm_andnot_pd(a,b); }
968 
969  /* Square Root */
970  friend F64vec2 sqrt(const F64vec2 &a) { return _mm_sqrt_pd(a); }
971 
972  /* Compares: Mask is returned */
973  /* Macros expand to all compare intrinsics. Example:
974  friend F64vec2 cmpeq(const F64vec2 &a, const F64vec2 &b)
975  { return _mm_cmpeq_pd(a,b);} */
976  #define F64vec2_COMP(op) \
977  friend F64vec2 cmp##op (const F64vec2 &a, const F64vec2 &b) { return _mm_cmp##op##_pd(a,b); }
978  F64vec2_COMP(eq) /* expanded to cmpeq(a,b) */
979  F64vec2_COMP(lt) /* expanded to cmplt(a,b) */
980  F64vec2_COMP(le) /* expanded to cmple(a,b) */
981  F64vec2_COMP(gt) /* expanded to cmpgt(a,b) */
982  F64vec2_COMP(ge) /* expanded to cmpge(a,b) */
983  F64vec2_COMP(ngt) /* expanded to cmpngt(a,b) */
984  F64vec2_COMP(nge) /* expanded to cmpnge(a,b) */
985  F64vec2_COMP(neq) /* expanded to cmpneq(a,b) */
986  F64vec2_COMP(nlt) /* expanded to cmpnlt(a,b) */
987  F64vec2_COMP(nle) /* expanded to cmpnle(a,b) */
988  #undef F64vec2_COMP
989 
990  /* Min and Max */
991  friend F64vec2 simd_min(const F64vec2 &a, const F64vec2 &b) { return _mm_min_pd(a,b); }
992  friend F64vec2 simd_max(const F64vec2 &a, const F64vec2 &b) { return _mm_max_pd(a,b); }
993 
994  /* Absolute value */
995  friend F64vec2 abs(const F64vec2 &a)
996  {
997  return _mm_and_pd(a, _f64vec2_abs_mask);
998  }
999 
1000  /* Compare lower DP FP values */
1001  #define F64vec2_COMI(op) \
1002  friend int comi##op (const F64vec2 &a, const F64vec2 &b) { return _mm_comi##op##_sd(a,b); }
1003  F64vec2_COMI(eq) /* expanded to comieq(a,b) */
1004  F64vec2_COMI(lt) /* expanded to comilt(a,b) */
1005  F64vec2_COMI(le) /* expanded to comile(a,b) */
1006  F64vec2_COMI(gt) /* expanded to comigt(a,b) */
1007  F64vec2_COMI(ge) /* expanded to comige(a,b) */
1008  F64vec2_COMI(neq) /* expanded to comineq(a,b) */
1009  #undef F64vec2_COMI
1010 
1011  /* Compare lower DP FP values */
1012  #define F64vec2_UCOMI(op) \
1013  friend int ucomi##op (const F64vec2 &a, const F64vec2 &b) { return _mm_ucomi##op##_sd(a,b); }
1014  F64vec2_UCOMI(eq) /* expanded to ucomieq(a,b) */
1015  F64vec2_UCOMI(lt) /* expanded to ucomilt(a,b) */
1016  F64vec2_UCOMI(le) /* expanded to ucomile(a,b) */
1017  F64vec2_UCOMI(gt) /* expanded to ucomigt(a,b) */
1018  F64vec2_UCOMI(ge) /* expanded to ucomige(a,b) */
1019  F64vec2_UCOMI(neq) /* expanded to ucomineq(a,b) */
1020  #undef F64vec2_UCOMI
1021 
1022  /* Debug Features */
1023 #if defined (_ENABLE_VEC_DEBUG)
1024  /* Output */
1025  friend std::ostream & operator<<(std::ostream & os, const F64vec2 &a)
1026  {
1027  /* To use: cout << "Elements of F64vec2 fvec are: " << fvec; */
1028  double *dp = (double*)&a;
1029  os << "[1]:" << *(dp+1)
1030  << " [0]:" << *dp;
1031  return os;
1032  }
1033 #endif /* defined (_ENABLE_VEC_DEBUG) */
1034  /* Element Access Only, no modifications to elements*/
1035  const double& operator[](int i) const
1036  {
1037  /* Assert enabled only during debug /DDEBUG */
1038  _VEC_ASSERT((0 <= i) && (i <= 1)); /* User should only access elements 0-1 */
1039  double *dp = (double*)&vec;
1040  return *(dp+i);
1041  }
1042  /* Element Access and Modification*/
1043  double& operator[](int i)
1044  {
1045  /* Assert enabled only during debug /DDEBUG */
1046  _VEC_ASSERT((0 <= i) && (i <= 1)); /* User should only access elements 0-1 */
1047  double *dp = (double*)&vec;
1048  return *(dp+i);
1049  }
1050 };
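/* Illustrative usage (example only, not part of the original header): a dot
 * product of two 2-element double vectors using the operators and
 * add_horizontal defined above.
 *
 *     F64vec2 a(3.0, 4.0), b(0.5, 2.0);
 *     double dot = add_horizontal(a * b);   // 3.0*0.5 + 4.0*2.0 == 9.5
 */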
1051 
1052  /* Miscellaneous */
1053 
1054 /* Interleave low order data elements of a and b into destination */
1055 inline F64vec2 unpack_low(const F64vec2 &a, const F64vec2 &b)
1056 { return _mm_unpacklo_pd(a, b); }
1057 
1058 /* Interleave high order data elements of a and b into target */
1059 inline F64vec2 unpack_high(const F64vec2 &a, const F64vec2 &b)
1060 { return _mm_unpackhi_pd(a, b); }
1061 
1062 /* Move Mask to Integer returns 2 bit mask formed of most significant bits of a */
1063 inline int move_mask(const F64vec2 &a)
1064 { return _mm_movemask_pd(a);}
1065 
1066  /* Data Motion Functions */
1067 
1068 /* Load Unaligned loadu_pd: Unaligned */
1069 inline void loadu(F64vec2 &a, double *p)
1070 { a = _mm_loadu_pd(p); }
1071 
1072 /* Store Temporal storeu_pd: Unaligned */
1073 inline void storeu(double *p, const F64vec2 &a)
1074 { _mm_storeu_pd(p, a); }
1075 
1076  /* Cacheability Support */
1077 
1078 /* Non-Temporal Store */
1079 inline void store_nta(double *p, F64vec2 &a)
1080 { _mm_stream_pd(p,a);}
1081 
1082 #define F64vec2_SELECT(op) \
1083 inline F64vec2 select_##op (const F64vec2 &a, const F64vec2 &b, const F64vec2 &c, const F64vec2 &d) \
1084 { \
1085  F64vec2 mask = _mm_cmp##op##_pd(a,b); \
1086  return( (mask & c) | F64vec2((_mm_andnot_pd(mask,d)))); \
1087 }
1088 F64vec2_SELECT(eq) /* generates select_eq(a,b) */
1089 F64vec2_SELECT(lt) /* generates select_lt(a,b) */
1090 F64vec2_SELECT(le) /* generates select_le(a,b) */
1091 F64vec2_SELECT(gt) /* generates select_gt(a,b) */
1092 F64vec2_SELECT(ge) /* generates select_ge(a,b) */
1093 F64vec2_SELECT(neq) /* generates select_neq(a,b) */
1094 F64vec2_SELECT(nlt) /* generates select_nlt(a,b) */
1095 F64vec2_SELECT(nle) /* generates select_nle(a,b) */
1096 #undef F64vec2_SELECT
1097 
1098 /* Convert the lower DP FP value of a to a 32 bit signed integer using Truncate*/
1099 inline int F64vec2ToInt(const F64vec2 &a)
1100 {
1101 
1102  return _mm_cvttsd_si32(a);
1103 
1104 }
1105 
1106 /* Convert the 4 SP FP values of a to DP FP values */
1107 inline F64vec2 F32vec4ToF64vec2(const F32vec4 &a)
1108 {
1109  return _mm_cvtps_pd(a);
1110 }
1111 
1112 /* Convert the 2 DP FP values of a to SP FP values */
1113 inline F32vec4 F64vec2ToF32vec4(const F64vec2 &a)
1114 {
1115  return _mm_cvtpd_ps(a);
1116 }
1117 
1118 /* Convert the signed int in b to a DP FP value. Upper DP FP value in a passed through */
1119 inline F64vec2 IntToF64vec2(const F64vec2 &a, int b)
1120 {
1121  return _mm_cvtsi32_sd(a,b);
1122 }
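/* Illustrative usage (example only, not part of the original header):
 * F64vec2ToInt truncates the lower lane toward zero, and IntToF64vec2
 * replaces only the lower lane, passing the upper lane of a through.
 *
 *     F64vec2 a(8.0, -2.7);
 *     int t = F64vec2ToInt(a);          // -2
 *     F64vec2 r = IntToF64vec2(a, 5);   // lanes: [1] = 8.0, [0] = 5.0
 */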
1123 
1124 #pragma pack(pop) /* 16-B aligned */
1125 
1126  /******************************************************************************/
1127  /************** Interface classes for Intel(R) AVX intrinsics *****************/
1128  /******************************************************************************/
1129 
1130 /*
1131  * class F32vec8
1132  *
1133  * Represents 256-bit vector composed of 8 single precision floating point elements.
1134  */
1135 class F32vec8
1136 {
1137 protected:
1138  __m256 vec;
1139 
1140 public:
1141 
1142  /* Constructors: __m256, 8 floats, 1 float */
1143  F32vec8() {}
1144 
1145  /* initialize 8 SP FP with __m256 data type */
1146  F32vec8(__m256 m) { vec = m; }
1147 
1148  /* initialize 8 SP FPs with 8 floats */
1149  F32vec8(float f7, float f6, float f5, float f4, float f3, float f2, float f1, float f0)
1150  {
1151  vec = _mm256_set_ps(f7,f6,f5,f4,f3,f2,f1,f0);
1152  }
1153 
1154  /* Explicitly initialize each of 8 SP FPs with same float */
1155  EXPLICIT F32vec8(float f) { vec = _mm256_set1_ps(f); }
1156 
1157  /* Explicitly initialize each of 8 SP FPs with same double */
1158  EXPLICIT F32vec8(double d) { vec = _mm256_set1_ps((float) d); }
1159 
1160  /* Assignment operations */
1161  F32vec8& operator =(float f)
1162  {
1163  vec = _mm256_set1_ps(f);
1164  return *this;
1165  }
1166 
1167  F32vec8& operator =(double d)
1168  {
1169  vec = _mm256_set1_ps((float) d);
1170  return *this;
1171  }
1172 
1173  /* Conversion functions */
1174  operator __m256() const { return vec; }
1175 
1176  /* Logical Operators */
1177  friend F32vec8 operator &(const F32vec8 &a, const F32vec8 &b) { return _mm256_and_ps(a,b); }
1178  friend F32vec8 operator |(const F32vec8 &a, const F32vec8 &b) { return _mm256_or_ps(a,b); }
1179  friend F32vec8 operator ^(const F32vec8 &a, const F32vec8 &b) { return _mm256_xor_ps(a,b); }
1180 
1181  /* Arithmetic Operators */
1182  friend F32vec8 operator +(const F32vec8 &a, const F32vec8 &b) { return _mm256_add_ps(a,b); }
1183  friend F32vec8 operator -(const F32vec8 &a, const F32vec8 &b) { return _mm256_sub_ps(a,b); }
1184  friend F32vec8 operator *(const F32vec8 &a, const F32vec8 &b) { return _mm256_mul_ps(a,b); }
1185  friend F32vec8 operator /(const F32vec8 &a, const F32vec8 &b) { return _mm256_div_ps(a,b); }
1186 
1187  F32vec8& operator +=(const F32vec8 &a) { return *this = _mm256_add_ps(vec,a); }
1188  F32vec8& operator -=(const F32vec8 &a) { return *this = _mm256_sub_ps(vec,a); }
1189  F32vec8& operator *=(const F32vec8 &a) { return *this = _mm256_mul_ps(vec,a); }
1190  F32vec8& operator /=(const F32vec8 &a) { return *this = _mm256_div_ps(vec,a); }
1191  F32vec8& operator &=(const F32vec8 &a) { return *this = _mm256_and_ps(vec,a); }
1192  F32vec8& operator |=(const F32vec8 &a) { return *this = _mm256_or_ps(vec,a); }
1193  F32vec8& operator ^=(const F32vec8 &a) { return *this = _mm256_xor_ps(vec,a); }
1194 
1195  /* Horizontal Add */
1196  friend float add_horizontal(const F32vec8 &a)
1197  {
1198  F32vec8 temp = _mm256_add_ps(a, _mm256_permute_ps(a, 0xee));
1199  temp = _mm256_add_ps(temp, _mm256_movehdup_ps(temp));
1200  return _mm_cvtss_f32(_mm_add_ss(_mm256_castps256_ps128(temp), _mm256_extractf128_ps(temp, 1)));
1201  }
1202 
1203  /* And Not */
1204  friend F32vec8 andnot(const F32vec8 &a, const F32vec8 &b) { return _mm256_andnot_ps(a,b); }
1205 
1206  /* Square Root */
1207  friend F32vec8 sqrt(const F32vec8 &a) { return _mm256_sqrt_ps(a); }
1208 
1209  /* Reciprocal */
1210  friend F32vec8 rcp(const F32vec8 &a) { return _mm256_rcp_ps(a); }
1211 
1212  /* Reciprocal Square Root */
1213  friend F32vec8 rsqrt(const F32vec8 &a) { return _mm256_rsqrt_ps(a); }
1214 
1215  /*
1216  * Newton-Raphson Reciprocal
1217  * [2 * rcpps(x) - (x * rcpps(x) * rcpps(x))]
1218  */
1219  friend F32vec8 rcp_nr(const F32vec8 &a)
1220  {
1221  F32vec8 Ra0 = _mm256_rcp_ps(a);
1222  return _mm256_sub_ps(_mm256_add_ps(Ra0, Ra0), _mm256_mul_ps(_mm256_mul_ps(Ra0, a), Ra0));
1223  }
1224 
1225  /*
1226  * Newton-Raphson Reciprocal Square Root
1227  * 0.5 * rsqrtps(x) * (3 - x * rsqrtps(x) * rsqrtps(x))
1228  */
1229  friend F32vec8 rsqrt_nr(const F32vec8 &a)
1230  {
1231 #pragma warning(push)
1232 #pragma warning(disable:4640)
1233  static const F32vec8 fvecf0pt5(0.5f);
1234  static const F32vec8 fvecf3pt0(3.0f);
1235 #pragma warning(pop)
1236  F32vec8 Ra0 = _mm256_rsqrt_ps(a);
1237  return (fvecf0pt5 * Ra0) * (fvecf3pt0 - (a * Ra0) * Ra0);
1238 
1239  }
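/* Note (added for clarity, not part of the original header): one Newton-Raphson
 * step roughly squares the relative error of the hardware estimate. For the
 * reciprocal, if r0 = (1/a)*(1 + e), then r0*(2 - a*r0) = (1/a)*(1 - e*e); the
 * reciprocal square-root step above behaves analogously, so the ~12-bit
 * rcpps/rsqrtps estimates are refined to roughly 22-23 bits of precision. */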
1240 
1241  /* Compares: Mask is returned */
1242  friend F32vec8 cmp_eq(const F32vec8 &a, const F32vec8 &b)
1243  { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
1244  friend F32vec8 cmp_lt(const F32vec8 &a, const F32vec8 &b)
1245  { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
1246  friend F32vec8 cmp_le(const F32vec8 &a, const F32vec8 &b)
1247  { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
1248  friend F32vec8 cmp_gt(const F32vec8 &a, const F32vec8 &b)
1249  { return _mm256_cmp_ps(a, b, _CMP_GT_OS); }
1250  friend F32vec8 cmp_ge(const F32vec8 &a, const F32vec8 &b)
1251  { return _mm256_cmp_ps(a, b, _CMP_GE_OS); }
1252  friend F32vec8 cmp_neq(const F32vec8 &a, const F32vec8 &b)
1253  { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
1254  friend F32vec8 cmp_nlt(const F32vec8 &a, const F32vec8 &b)
1255  { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
1256  friend F32vec8 cmp_nle(const F32vec8 &a, const F32vec8 &b)
1257  { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
1258  friend F32vec8 cmp_ngt(const F32vec8 &a, const F32vec8 &b)
1259  { return _mm256_cmp_ps(a, b, _CMP_NGT_US); }
1260  friend F32vec8 cmp_nge(const F32vec8 &a, const F32vec8 &b)
1261  { return _mm256_cmp_ps(a, b, _CMP_NGE_US); }
1262 
1263  /* Min and Max */
1264  friend F32vec8 simd_min(const F32vec8 &a, const F32vec8 &b)
1265  { return _mm256_min_ps(a,b); }
1266  friend F32vec8 simd_max(const F32vec8 &a, const F32vec8 &b)
1267  { return _mm256_max_ps(a,b); }
1268 
1269  /* Absolute value */
1270  friend F32vec8 abs(const F32vec8 &a)
1271  {
1272  static const union
1273  {
1274  int i[8];
1275  __m256 m;
1276  } __f32vec8_abs_mask = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
1277  0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
1278  return _mm256_and_ps(a, __f32vec8_abs_mask.m);
1279  }
1280 
1281  /* Debug Features */
1282 #if defined (_ENABLE_VEC_DEBUG)
1283  /* Output */
1284  friend DVEC_STD ostream & operator<<(DVEC_STD ostream &os, const F32vec8 &a)
1285  {
1286  /* To use: cout << "Elements of F32vec8 fvec are: " << fvec; */
1287  float *fp = (float*) &a;
1288  os << "[7]:" << *(fp+7)
1289  << " [6]:" << *(fp+6)
1290  << " [5]:" << *(fp+5)
1291  << " [4]:" << *(fp+4)
1292  << " [3]:" << *(fp+3)
1293  << " [2]:" << *(fp+2)
1294  << " [1]:" << *(fp+1)
1295  << " [0]:" << *fp;
1296  return os;
1297  }
1298 #endif /* defined (_ENABLE_VEC_DEBUG) */
1299 
1300  /* Element Access Only, no modifications to elements*/
1301  const float& operator[](int i) const
1302  {
1303  /* Assert enabled only during debug /DDEBUG */
1304  _VEC_ASSERT((0 <= i) && (i <= 7));
1305  float *fp = (float*)&vec;
1306  return *(fp+i);
1307  }
1308 
1309  /* Element Access and Modification*/
1310  float& operator[](int i)
1311  {
1312  /* Assert enabled only during debug /DDEBUG */
1313  _VEC_ASSERT((0 <= i) && (i <= 7));
1314  float *fp = (float*)&vec;
1315  return *(fp+i);
1316  }
1317 };
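/* Illustrative usage (example only, not part of the original header): an AVX
 * saxpy-style update, y = 2*x + y, on 8 floats at a time.
 *
 *     F32vec8 x(8, 7, 6, 5, 4, 3, 2, 1), y(0.5f);
 *     y += F32vec8(2.0f) * x;            // each lane: 2*x[i] + 0.5
 *     float total = add_horizontal(y);   // sum of the 8 lanes
 */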
1318 
1319  /* Miscellaneous */
1320 
1321 /* Interleave low order data elements of a and b into destination */
1322 inline F32vec8 unpack_low(const F32vec8 &a, const F32vec8 &b){
1323  return _mm256_unpacklo_ps(a, b); }
1324 
1325 /* Interleave high order data elements of a and b into target */
1326 inline F32vec8 unpack_high(const F32vec8 &a, const F32vec8 &b){
1327  return _mm256_unpackhi_ps(a, b); }
1328 
1329 /* Move Mask to Integer returns 8 bit mask formed of most significant bits of a */
1330 inline int move_mask(const F32vec8 &a){
1331  return _mm256_movemask_ps(a); }
1332 
1333  /* Data Motion Functions */
1334 
1335 /* Load Unaligned loadu_ps: Unaligned */
1336 inline void loadu(F32vec8 &a, const float *p){
1337  a = _mm256_loadu_ps(p); }
1338 
1339 /* Store Unaligned storeu_ps: Unaligned */
1340 inline void storeu(float *p, const F32vec8 &a){
1341  _mm256_storeu_ps(p, a); }
1342 
1343  /* Cacheability Support */
1344 
1345 /* Non-Temporal Store */
1346 inline void store_nta(float *p, const F32vec8 &a){
1347  _mm256_stream_ps(p, a); }
1348 
1349  /* Conditional moves */
1350 
1351 /* Masked load */
1352 inline void maskload(F32vec8 &a, const float *p, const F32vec8 &m){
1353  a = _mm256_maskload_ps(p, _mm256_castps_si256(m)); }
1354 
1355 inline void maskload(F32vec4 &a, const float *p, const F32vec4 &m){
1356  a = _mm_maskload_ps(p, _mm_castps_si128(m)); }
1357 
1358 /* Masked store */
1359 inline void maskstore(float *p, const F32vec8 &a, const F32vec8 &m){
1360  _mm256_maskstore_ps(p, _mm256_castps_si256(m), a); }
1361 
1362 inline void maskstore(float *p, const F32vec4 &a, const F32vec4 &m){
1363  _mm_maskstore_ps(p, _mm_castps_si128(m), a); }
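/* Illustrative usage (example only, not part of the original header): the sign
 * bit of each mask lane selects which lanes are touched; unselected lanes read
 * as zero on load and are left untouched in memory on store.
 *
 *     float buf[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
 *     F32vec8 m = cmp_lt(F32vec8(8, 7, 6, 5, 4, 3, 2, 1), F32vec8(4.0f));
 *     F32vec8 v;
 *     maskload(v, buf, m);        // loads only the lanes selected by m
 *     maskstore(buf, v + v, m);   // stores doubled values back to those lanes
 */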
1364 
1365  /* Conditional Selects */
1366 
1367 inline F32vec8 select_eq(const F32vec8 &a, const F32vec8 &b, const F32vec8 &c, const F32vec8 &d){
1368  return _mm256_blendv_ps(d, c, _mm256_cmp_ps(a, b, _CMP_EQ_OQ)); }
1369 
1370 inline F32vec8 select_lt(const F32vec8 &a, const F32vec8 &b, const F32vec8 &c, const F32vec8 &d){
1371  return _mm256_blendv_ps(d, c, _mm256_cmp_ps(a, b, _CMP_LT_OS)); }
1372 
1373 inline F32vec8 select_le(const F32vec8 &a, const F32vec8 &b, const F32vec8 &c, const F32vec8 &d){
1374  return _mm256_blendv_ps(d, c, _mm256_cmp_ps(a, b, _CMP_LE_OS)); }
1375 
1376 inline F32vec8 select_gt(const F32vec8 &a, const F32vec8 &b, const F32vec8 &c, const F32vec8 &d){
1377  return _mm256_blendv_ps(d, c, _mm256_cmp_ps(a, b, _CMP_GT_OS)); }
1378 
1379 inline F32vec8 select_ge(const F32vec8 &a, const F32vec8 &b, const F32vec8 &c, const F32vec8 &d){
1380  return _mm256_blendv_ps(d, c, _mm256_cmp_ps(a, b, _CMP_GE_OS)); }
1381 
1382 inline F32vec8 select_neq(const F32vec8 &a, const F32vec8 &b, const F32vec8 &c, const F32vec8 &d){
1383  return _mm256_blendv_ps(d, c, _mm256_cmp_ps(a, b, _CMP_NEQ_UQ)); }
1384 
1385 inline F32vec8 select_nlt(const F32vec8 &a, const F32vec8 &b, const F32vec8 &c, const F32vec8 &d){
1386  return _mm256_blendv_ps(d, c, _mm256_cmp_ps(a, b, _CMP_NLT_US)); }
1387 
1388 inline F32vec8 select_nle(const F32vec8 &a, const F32vec8 &b, const F32vec8 &c, const F32vec8 &d){
1389  return _mm256_blendv_ps(d, c, _mm256_cmp_ps(a, b, _CMP_NLE_US)); }
1390 
1391 inline F32vec8 select_ngt(const F32vec8 &a, const F32vec8 &b, const F32vec8 &c, const F32vec8 &d){
1392  return _mm256_blendv_ps(d, c, _mm256_cmp_ps(a, b, _CMP_NGT_US)); }
1393 
1394 inline F32vec8 select_nge(const F32vec8 &a, const F32vec8 &b, const F32vec8 &c, const F32vec8 &d){
1395  return _mm256_blendv_ps(d, c, _mm256_cmp_ps(a, b, _CMP_NGE_US)); }
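/* Illustrative sketch (not part of dvec.h): the select_* helpers above act
 * as a per-lane "?:" -- select_lt(a, b, c, d) yields c in each lane where
 * a < b and d elsewhere.  For example, a lower clamp: */
inline F32vec8 clamp_below(const F32vec8 &v, const F32vec8 &lo)
{
    return select_lt(v, lo, lo, v);   /* where v < lo take lo, else keep v */
}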
1396 
1397 /*
1398  * class F64vec4
1399  *
1400  * Represents 256-bit vector composed of 4 double precision floating point elements.
1401  */
1402 class F64vec4
1403 {
1404 protected:
1405  __m256d vec;
1406 
1407 public:
1408 
1409  /* Constructors: __m256d, 4 doubles */
1410  F64vec4() {}
1411 
1412  /* initialize 4 DP FP with __m256d data type */
1413  F64vec4(__m256d m) { vec = m; }
1414 
1415  /* initialize 4 DP FPs with 4 doubles */
1416  F64vec4(double d3, double d2, double d1, double d0)
1417  {
1418  vec = _mm256_set_pd(d3,d2,d1,d0);
1419  }
1420 
1421  /* Explicitly initialize each of 4 DP FPs with same double */
1422  EXPLICIT F64vec4(double d) { vec = _mm256_set1_pd(d); }
1423 
1424  /* Conversion functions */
1425  operator __m256d() const { return vec; }
1426 
1427  /* Logical Operators */
1428  friend F64vec4 operator &(const F64vec4 &a, const F64vec4 &b) { return _mm256_and_pd(a,b); }
1429  friend F64vec4 operator |(const F64vec4 &a, const F64vec4 &b) { return _mm256_or_pd(a,b); }
1430  friend F64vec4 operator ^(const F64vec4 &a, const F64vec4 &b) { return _mm256_xor_pd(a,b); }
1431 
1432  /* Arithmetic Operators */
1433  friend F64vec4 operator +(const F64vec4 &a, const F64vec4 &b) { return _mm256_add_pd(a,b); }
1434  friend F64vec4 operator -(const F64vec4 &a, const F64vec4 &b) { return _mm256_sub_pd(a,b); }
1435  friend F64vec4 operator *(const F64vec4 &a, const F64vec4 &b) { return _mm256_mul_pd(a,b); }
1436  friend F64vec4 operator /(const F64vec4 &a, const F64vec4 &b) { return _mm256_div_pd(a,b); }
1437 
1438  F64vec4& operator +=(const F64vec4 &a) { return *this = _mm256_add_pd(vec,a); }
1439  F64vec4& operator -=(const F64vec4 &a) { return *this = _mm256_sub_pd(vec,a); }
1440  F64vec4& operator *=(const F64vec4 &a) { return *this = _mm256_mul_pd(vec,a); }
1441  F64vec4& operator /=(const F64vec4 &a) { return *this = _mm256_div_pd(vec,a); }
1442  F64vec4& operator &=(const F64vec4 &a) { return *this = _mm256_and_pd(vec,a); }
1443  F64vec4& operator |=(const F64vec4 &a) { return *this = _mm256_or_pd(vec,a); }
1444  F64vec4& operator ^=(const F64vec4 &a) { return *this = _mm256_xor_pd(vec,a); }
1445 
1446  /* Horizontal Add */
1447  friend double add_horizontal(const F64vec4 &a)
1448  {
1449  F64vec4 temp = _mm256_add_pd(a, _mm256_permute_pd(a,0x05));
1450  return _mm_cvtsd_f64(_mm_add_sd(_mm256_castpd256_pd128(temp), _mm256_extractf128_pd(temp,1)));
1451  }
1452 
1453  /* And Not */
1454  friend F64vec4 andnot(const F64vec4 &a, const F64vec4 &b) { return _mm256_andnot_pd(a,b); }
1455 
1456  /* Square Root */
1457  friend F64vec4 sqrt(const F64vec4 &a) { return _mm256_sqrt_pd(a); }
1458 
1459  /* Compares: Mask is returned */
1460  friend F64vec4 cmp_eq(const F64vec4 &a, const F64vec4 &b)
1461  { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
1462  friend F64vec4 cmp_lt(const F64vec4 &a, const F64vec4 &b)
1463  { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
1464  friend F64vec4 cmp_le(const F64vec4 &a, const F64vec4 &b)
1465  { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
1466  friend F64vec4 cmp_gt(const F64vec4 &a, const F64vec4 &b)
1467  { return _mm256_cmp_pd(a, b, _CMP_GT_OS); }
1468  friend F64vec4 cmp_ge(const F64vec4 &a, const F64vec4 &b)
1469  { return _mm256_cmp_pd(a, b, _CMP_GE_OS); }
1470  friend F64vec4 cmp_neq(const F64vec4 &a, const F64vec4 &b)
1471  { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
1472  friend F64vec4 cmp_nlt(const F64vec4 &a, const F64vec4 &b)
1473  { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
1474  friend F64vec4 cmp_nle(const F64vec4 &a, const F64vec4 &b)
1475  { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }
1476  friend F64vec4 cmp_ngt(const F64vec4 &a, const F64vec4 &b)
1477  { return _mm256_cmp_pd(a, b, _CMP_NGT_US); }
1478  friend F64vec4 cmp_nge(const F64vec4 &a, const F64vec4 &b)
1479  { return _mm256_cmp_pd(a, b, _CMP_NGE_US); }
1480 
1481  /* Min and Max */
1482  friend F64vec4 simd_min(const F64vec4 &a, const F64vec4 &b)
1483  { return _mm256_min_pd(a,b); }
1484  friend F64vec4 simd_max(const F64vec4 &a, const F64vec4 &b)
1485  { return _mm256_max_pd(a,b); }
1486 
1487  /* Absolute value */
1488  friend F64vec4 abs(const F64vec4 &a)
1489  {
1490  static const union
1491  {
1492  int i[8];
1493  __m256d m;
1494  } __f64vec4_abs_mask = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff,
1495  0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff};
1496  return _mm256_and_pd(a, __f64vec4_abs_mask.m);
1497  }
1498 
1499  /* Debug Features */
1500 #if defined (_ENABLE_VEC_DEBUG)
1501  /* Output */
1502  friend DVEC_STD ostream & operator<<(DVEC_STD ostream &os, const F64vec4 &a)
1503  {
1504  /* To use: cout << "Elements of F64vec4 fvec are: " << fvec; */
1505  double *dp = (double*) &a;
1506  os << "[3]:" << *(dp+3)
1507  << " [2]:" << *(dp+2)
1508  << " [1]:" << *(dp+1)
1509  << " [0]:" << *dp;
1510  return os;
1511  }
1512 #endif /* defined (_ENABLE_VEC_DEBUG) */
1513 
1514  /* Element Access Only, no modifications to elements */
1515  const double& operator[](int i) const
1516  {
1517  /* Assert enabled only during debug /DDEBUG */
1518  _VEC_ASSERT((0 <= i) && (i <= 3));
1519  double *dp = (double*)&vec;
1520  return *(dp+i);
1521  }
1522  /* Element Access and Modification*/
1523  double& operator[](int i)
1524  {
1525  /* Assert enabled only during debug /DDEBUG */
1526  _VEC_ASSERT((0 <= i) && (i <= 3));
1527  double *dp = (double*)&vec;
1528  return *(dp+i);
1529  }
1530 };
1531 
1532  /* Miscellaneous */
1533 
1534 /* Interleave low order data elements of a and b into destination */
1535 inline F64vec4 unpack_low(const F64vec4 &a, const F64vec4 &b){
1536  return _mm256_unpacklo_pd(a, b); }
1537 
1538 /* Interleave high order data elements of a and b into target */
1539 inline F64vec4 unpack_high(const F64vec4 &a, const F64vec4 &b){
1540  return _mm256_unpackhi_pd(a, b); }
1541 
1542 /* Move Mask to Integer returns 4 bit mask formed of most significant bits of a */
1543 inline int move_mask(const F64vec4 &a){
1544  return _mm256_movemask_pd(a); }
1545 
1546  /* Data Motion Functions */
1547 
1548 /* Load Unaligned loadu_pd: Unaligned */
1549 inline void loadu(F64vec4 &a, double *p){
1550  a = _mm256_loadu_pd(p); }
1551 
1552 /* Store Unaligned storeu_pd: Unaligned */
1553 inline void storeu(double *p, const F64vec4 &a){
1554  _mm256_storeu_pd(p, a); }
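/* Illustrative sketch (not part of dvec.h): a double-precision dot product
 * built from the F64vec4 operators and the unaligned loads above.  Assumes
 * n is a multiple of 4; note that loadu takes a non-const double*.  The
 * function name is hypothetical. */
inline double dot4(double *x, double *y, int n)
{
    F64vec4 acc(0.0);
    for (int i = 0; i < n; i += 4) {
        F64vec4 a, b;
        loadu(a, x + i);             /* _mm256_loadu_pd */
        loadu(b, y + i);
        acc += a * b;                /* _mm256_mul_pd then _mm256_add_pd */
    }
    return add_horizontal(acc);      /* permute/extract reduction from the class above */
}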
1555 
1556  /* Cacheability Support */
1557 
1558 /* Non-Temporal Store */
1559 inline void store_nta(double *p, const F64vec4 &a){
1560  _mm256_stream_pd(p, a); }
1561 
1562  /* Conditional moves */
1563 
1564 /* Masked load */
1565 inline void maskload(F64vec4 &a, const double *p, const F64vec4 &m){
1566  a = _mm256_maskload_pd(p, _mm256_castpd_si256(m)); }
1567 
1568 inline void maskload(F64vec2 &a, const double *p, const F64vec2 &m){
1569  a = _mm_maskload_pd(p, _mm_castpd_si128(m)); }
1570 
1571 /* Masked store */
1572 inline void maskstore(double *p, const F64vec4 &a, const F64vec4 &m){
1573  _mm256_maskstore_pd(p, _mm256_castpd_si256(m), a); }
1574 
1575 inline void maskstore(double *p, const F64vec2 &a, const F64vec2 &m){
1576  _mm_maskstore_pd(p, _mm_castpd_si128(m), a); }
1577 
1578  /* Conditional Selects */
1579 
1580 inline F64vec4 select_eq(const F64vec4 &a, const F64vec4 &b, const F64vec4 &c, const F64vec4 &d){
1581  return _mm256_blendv_pd(d, c, _mm256_cmp_pd(a, b, _CMP_EQ_OQ)); }
1582 
1583 inline F64vec4 select_lt(const F64vec4 &a, const F64vec4 &b, const F64vec4 &c, const F64vec4 &d){
1584  return _mm256_blendv_pd(d, c, _mm256_cmp_pd(a, b, _CMP_LT_OS)); }
1585 
1586 inline F64vec4 select_le(const F64vec4 &a, const F64vec4 &b, const F64vec4 &c, const F64vec4 &d){
1587  return _mm256_blendv_pd(d, c, _mm256_cmp_pd(a, b, _CMP_LE_OS)); }
1588 
1589 inline F64vec4 select_gt(const F64vec4 &a, const F64vec4 &b, const F64vec4 &c, const F64vec4 &d){
1590  return _mm256_blendv_pd(d, c, _mm256_cmp_pd(a, b, _CMP_GT_OS)); }
1591 
1592 inline F64vec4 select_ge(const F64vec4 &a, const F64vec4 &b, const F64vec4 &c, const F64vec4 &d){
1593  return _mm256_blendv_pd(d, c, _mm256_cmp_pd(a, b, _CMP_GE_OS)); }
1594 
1595 inline F64vec4 select_neq(const F64vec4 &a, const F64vec4 &b, const F64vec4 &c, const F64vec4 &d){
1596  return _mm256_blendv_pd(d, c, _mm256_cmp_pd(a, b, _CMP_NEQ_UQ)); }
1597 
1598 inline F64vec4 select_nlt(const F64vec4 &a, const F64vec4 &b, const F64vec4 &c, const F64vec4 &d){
1599  return _mm256_blendv_pd(d, c, _mm256_cmp_pd(a, b, _CMP_NLT_US)); }
1600 
1601 inline F64vec4 select_nle(const F64vec4 &a, const F64vec4 &b, const F64vec4 &c, const F64vec4 &d){
1602  return _mm256_blendv_pd(d, c, _mm256_cmp_pd(a, b, _CMP_NLE_US)); }
1603 
1604 inline F64vec4 select_ngt(const F64vec4 &a, const F64vec4 &b, const F64vec4 &c, const F64vec4 &d){
1605  return _mm256_blendv_pd(d, c, _mm256_cmp_pd(a, b, _CMP_NGT_US)); }
1606 
1607 inline F64vec4 select_nge(const F64vec4 &a, const F64vec4 &b, const F64vec4 &c, const F64vec4 &d){
1608  return _mm256_blendv_pd(d, c, _mm256_cmp_pd(a, b, _CMP_NGE_US)); }
1609 
1610  /* Conversion Functions */
1611 
1612 /* Convert the 4 SP FP values of a to 4 DP FP values */
1613 inline F64vec4 F32vec4ToF64vec4(const F32vec4 &a){
1614  return _mm256_cvtps_pd(a); }
1615 
1616 /* Convert the 4 DP FP values of a to 4 SP FP values */
1617 inline F32vec4 F64vec4ToF32vec8(const F64vec4 &a){
1618  return _mm256_cvtpd_ps(a); }
1619 
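/* Illustrative sketch (not part of dvec.h): widening four floats to doubles,
 * accumulating, and narrowing back.  F32vec4ToF64vec4 / F64vec4ToF32vec8
 * wrap _mm256_cvtps_pd / _mm256_cvtpd_ps; despite its name, F64vec4ToF32vec8
 * returns an F32vec4 holding the four converted single-precision values.
 * The function name is hypothetical. */
inline F32vec4 accumulate_in_double(const F32vec4 &x, const F64vec4 &acc)
{
    F64vec4 wide = F32vec4ToF64vec4(x);     /* 4 SP FP -> 4 DP FP */
    return F64vec4ToF32vec8(wide + acc);    /* 4 DP FP -> 4 SP FP */
}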
1620 #undef DVEC_DEFINE_OUTPUT_OPERATORS
1621 #undef DVEC_STD
1622 
1623 #ifdef _MSC_VER
1624 #pragma pack(pop)
1625 #endif /* _MSC_VER */
1626 
1627 #endif /* defined (_M_CEE_PURE) */
1628 
1629 #endif /* RC_INVOKED */
1630 #endif /* _DVEC_H_INCLUDED */