STLdoc
STLdocumentation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
fvec.h
Go to the documentation of this file.
1 /***
2 *** Copyright (C) 1985-2015 Intel Corporation. All rights reserved.
3 ***
4 *** The information and source code contained herein is the exclusive
5 *** property of Intel Corporation and may not be disclosed, examined
6 *** or reproduced in whole or in part without explicit written authorization
7 *** from the company.
8 ***
9 ****/
10 
11 /*
12  * Definition of a C++ class interface to Streaming SIMD Extension intrinsics.
13  *
14  *
15  * File name : fvec.h Fvec class definitions
16  *
17  * Concept: A C++ abstraction of Streaming SIMD Extensions designed to improve
18  *
19  * programmer productivity. Speed and accuracy are sacrificed for utility.
20  *
21  * Facilitates an easy transition to compiler intrinsics
22  *
23  * or assembly language.
24  *
25  * F32vec4: 4 packed single precision
26  * 32-bit floating point numbers
27 */
28 
29 #ifndef _FVEC_H_INCLUDED
30 #define _FVEC_H_INCLUDED
31 #ifndef RC_INVOKED
32 
33 #if !defined __cplusplus
34  #error ERROR: This file is only supported in C++ compilations!
35 #endif /* !defined __cplusplus */
36 
37 #if defined (_M_CEE_PURE)
38  #error ERROR: This file is not supported in the pure mode!
39 #else /* defined (_M_CEE_PURE) */
40 
41 #include <xmmintrin.h> /* SSE Intrinsic function definition include file */
42 #include <ivec.h>
43 #include <vcruntime.h>
44 
45 #ifndef _VEC_ASSERT
46 #ifdef NDEBUG
47  #define _VEC_ASSERT(_Expression) ((void)0)
48 #else /* NDEBUG */
49 #ifdef __cplusplus
50  extern "C" {
51 #endif /* __cplusplus */
52 
53  void __cdecl _wassert(_In_z_ const wchar_t * _Message, _In_z_ const wchar_t *_File, _In_ unsigned _Line);
54 
55 #ifdef __cplusplus
56  }
57 #endif /* __cplusplus */
58 
59  #define _VEC_ASSERT(_Expression) (void)( (!!(_Expression)) || (_wassert(_CRT_WIDE(#_Expression), _CRT_WIDE(__FILE__), __LINE__), 0) )
60 #endif /* NDEBUG */
61 #endif /* _VEC_ASSERT */
62 
63 /* Define _ENABLE_VEC_DEBUG to enable std::ostream inserters for debug output */
64 #if defined (_ENABLE_VEC_DEBUG)
65  #include <iostream>
66 #endif /* defined (_ENABLE_VEC_DEBUG) */
67 
68 #pragma pack(push,16) /* Must ensure class & union 16-B aligned */
69 
/* 0x7fffffff in every 32-bit lane: AND-ing packed floats with this pattern
 * clears each sign bit.  The union lets the pattern be written as an integer
 * static initializer (first member) and read back as an __m128. */
const union
{
    int i[4];
    __m128 m;   /* the same 16 bytes viewed as a packed-float vector */
} __f32vec4_abs_mask_cheat = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};

/* The mask as an F32vec4; F32vec4 must be a complete type where this expands. */
#define _f32vec4_abs_mask ((F32vec4)__f32vec4_abs_mask_cheat.m)
77 
78 class F32vec4
79 {
80 protected:
82 public:
83 
84  /* Constructors: __m128, 4 floats, 1 float */
85  F32vec4() {}
86 
87  /* initialize 4 SP FP with __m128 data type */
88  F32vec4(__m128 _M) { vec = _M;}
89 
90  /* initialize 4 SP FPs with 4 floats */
91  F32vec4(float _F3, float _F2, float _F1, float _F0) { vec= _mm_set_ps(_F3,_F2,_F1,_F0); }
92 
93  /* Explicitly initialize each of 4 SP FPs with same float */
94  explicit F32vec4(float _F) { vec = _mm_set_ps1(_F); }
95 
96  /* Explicitly initialize each of 4 SP FPs with same double */
97  explicit F32vec4(double _D) { vec = _mm_set_ps1((float)_D); }
98 
99  /* Assignment operations */
100 
101  F32vec4& operator =(float _F) { vec = _mm_set_ps1(_F); return *this; }
102 
103  F32vec4& operator =(double _D)
104  {
105  vec = _mm_set_ps1((float)_D);
106  return *this;
107  }
108 
109  /* Conversion functions */
110  operator __m128() const { return vec; } /* Convert to __m128 */
111 
112  /* Logical Operators */
113  friend F32vec4 operator &(const F32vec4 &_A, const F32vec4 &_B) { return _mm_and_ps(_A,_B); }
114  friend F32vec4 operator |(const F32vec4 &_A, const F32vec4 &_B) { return _mm_or_ps(_A,_B); }
115  friend F32vec4 operator ^(const F32vec4 &_A, const F32vec4 &_B) { return _mm_xor_ps(_A,_B); }
116 
117  /* Arithmetic Operators */
118  friend F32vec4 operator +(const F32vec4 &_A, const F32vec4 &_B) { return _mm_add_ps(_A,_B); }
119  friend F32vec4 operator -(const F32vec4 &_A, const F32vec4 &_B) { return _mm_sub_ps(_A,_B); }
120  friend F32vec4 operator *(const F32vec4 &_A, const F32vec4 &_B) { return _mm_mul_ps(_A,_B); }
121  friend F32vec4 operator /(const F32vec4 &_A, const F32vec4 &_B) { return _mm_div_ps(_A,_B); }
122 
123  F32vec4& operator =(const F32vec4 &_A) { vec = _A.vec; return *this; }
124  F32vec4& operator =(const __m128 &_Avec) { vec = _Avec; return *this; }
125  F32vec4& operator +=(const F32vec4 &_A) { return *this = _mm_add_ps(vec,_A); }
126  F32vec4& operator -=(const F32vec4 &_A) { return *this = _mm_sub_ps(vec,_A); }
127  F32vec4& operator *=(const F32vec4 &_A) { return *this = _mm_mul_ps(vec,_A); }
128  F32vec4& operator /=(const F32vec4 &_A) { return *this = _mm_div_ps(vec,_A); }
129  F32vec4& operator &=(const F32vec4 &_A) { return *this = _mm_and_ps(vec,_A); }
130  F32vec4& operator |=(const F32vec4 &_A) { return *this = _mm_or_ps(vec,_A); }
131  F32vec4& operator ^=(const F32vec4 &_A) { return *this = _mm_xor_ps(vec,_A); }
132 
133  /* Horizontal Add */
134  friend float add_horizontal(const F32vec4 &_A)
135  {
136  F32vec4 _Ftemp = _mm_add_ps(_A, _mm_movehl_ps(_A, _A));
137  _Ftemp = _mm_add_ss(_Ftemp, _mm_shuffle_ps(_Ftemp, _Ftemp, 1));
138  return _mm_cvtss_f32(_Ftemp);
139  }
140 
141  /* Square Root */
142  friend F32vec4 sqrt(const F32vec4 &_A) { return _mm_sqrt_ps(_A); }
143  /* Reciprocal */
144  friend F32vec4 rcp(const F32vec4 &_A) { return _mm_rcp_ps(_A); }
145  /* Reciprocal Square Root */
146  friend F32vec4 rsqrt(const F32vec4 &_A) { return _mm_rsqrt_ps(_A); }
147 
148  /* NewtonRaphson Reciprocal
149  [2 * rcpps(x) - (x * rcpps(x) * rcpps(x))] */
150  friend F32vec4 rcp_nr(const F32vec4 &_A)
151  {
152  F32vec4 _Ra0 = _mm_rcp_ps(_A);
153  return _mm_sub_ps(_mm_add_ps(_Ra0, _Ra0), _mm_mul_ps(_mm_mul_ps(_Ra0, _A), _Ra0));
154  }
155 
156  /* NewtonRaphson Reciprocal Square Root
157  0.5 * rsqrtps * (3 - x * rsqrtps(x) * rsqrtps(x)) */
158 #pragma warning(push)
159 #pragma warning(disable : 4640)
160  friend F32vec4 rsqrt_nr(const F32vec4 &_A)
161  {
162  static const F32vec4 fvecf0pt5(0.5f);
163  static const F32vec4 fvecf3pt0(3.0f);
164  F32vec4 _Ra0 = _mm_rsqrt_ps(_A);
165  return (fvecf0pt5 * _Ra0) * (fvecf3pt0 - (_A * _Ra0) * _Ra0);
166  }
167 #pragma warning(pop)
168 
169  /* Compares: Mask is returned */
170  /* Macros expand to all compare intrinsics. Example:
171  friend F32vec4 cmpeq(const F32vec4 &_A, const F32vec4 &_B)
172  { return _mm_cmpeq_ps(_A,_B);} */
173  #define Fvec32s4_COMP(op) \
174  friend F32vec4 cmp##op (const F32vec4 &_A, const F32vec4 &_B) { return _mm_cmp##op##_ps(_A,_B); }
175  Fvec32s4_COMP(eq) /* expanded to cmpeq(_A,_B) */
176  Fvec32s4_COMP(lt) /* expanded to cmplt(_A,_B) */
177  Fvec32s4_COMP(le) /* expanded to cmple(_A,_B) */
178  Fvec32s4_COMP(gt) /* expanded to cmpgt(_A,_B) */
179  Fvec32s4_COMP(ge) /* expanded to cmpge(_A,_B) */
180  Fvec32s4_COMP(neq) /* expanded to cmpneq(_A,_B) */
181  Fvec32s4_COMP(nlt) /* expanded to cmpnlt(_A,_B) */
182  Fvec32s4_COMP(nle) /* expanded to cmpnle(_A,_B) */
183  Fvec32s4_COMP(ngt) /* expanded to cmpngt(_A,_B) */
184  Fvec32s4_COMP(nge) /* expanded to cmpnge(_A,_B) */
185  #undef Fvec32s4_COMP
186 
187  /* Min and Max */
188  friend F32vec4 simd_min(const F32vec4 &_A, const F32vec4 &_B) { return _mm_min_ps(_A,_B); }
189  friend F32vec4 simd_max(const F32vec4 &_A, const F32vec4 &_B) { return _mm_max_ps(_A,_B); }
190 
191  /* Absolute value */
192  friend F32vec4 abs(const F32vec4 &_A) {return _mm_and_ps(_A, _f32vec4_abs_mask); }
193 
194  /* Debug Features */
195 #if defined (_ENABLE_VEC_DEBUG)
196  /* Output */
197  friend std::ostream & operator<<(std::ostream & _Os, const F32vec4 &_A)
198  {
199  /* To use: cout << "Elements of F32vec4 fvec are: " << fvec; */
200  float *_Fp = (float*)&_A;
201  _Os << "[3]:" << *(_Fp+3)
202  << " [2]:" << *(_Fp+2)
203  << " [1]:" << *(_Fp+1)
204  << " [0]:" << *_Fp;
205  return _Os;
206  }
207 #endif /* defined (_ENABLE_VEC_DEBUG) */
208  /* Element Access Only, no modifications to elements*/
209  const float& operator[](int _I) const
210  {
211  /* Assert enabled only during debug /DDEBUG */
212  _VEC_ASSERT((0 <= _I) && (_I <= 3)); /* User should only access elements 0-3 */
213  float *_Fp = (float*)&vec;
214  return *(_Fp+ _I);
215  }
216  /* Element Access and Modification*/
217  float& operator[](int _I)
218  {
219  /* Assert enabled only during debug /DDEBUG */
220  _VEC_ASSERT((0 <= _I) && (_I <= 3)); /* User should only access elements 0-3 */
221  float *_Fp = (float*)&vec;
222  return *(_Fp+ _I);
223  }
224 };
225 
226  /* Miscellaneous */
227 
228 /* Interleave low order data elements of a and b into destination */
229 inline F32vec4 unpack_low(const F32vec4 &_A, const F32vec4 &_B)
230 { return _mm_unpacklo_ps(_A, _B); }
231 
232 /* Interleave high order data elements of a and b into target */
233 inline F32vec4 unpack_high(const F32vec4 &_A, const F32vec4 &_B)
234 { return _mm_unpackhi_ps(_A, _B); }
235 
236 /* Move Mask to Integer returns 4 bit mask formed of most significant bits of a */
237 inline int move_mask(const F32vec4 &_A)
238 { return _mm_movemask_ps(_A);}
239 
240  /* Data Motion Functions */
241 
242 /* Load Unaligned loadu_ps: Unaligned */
243 inline void loadu(F32vec4 &_A, float *_P)
244 { _A = _mm_loadu_ps(_P); }
245 
246 /* Store Temporal storeu_ps: Unaligned */
247 inline void storeu(float *_P, const F32vec4 &_A)
248 { _mm_storeu_ps(_P, _A); }
249 
250  /* Cacheability Support */
251 
252 /* Non-Temporal Store */
253 inline void store_nta(float *_P, const F32vec4 &_A)
254 { _mm_stream_ps(_P,_A);}
255 
256  /* Conditional Selects:*/
257 /*(a OP b)? c : d; where OP is any compare operator
258 Macros expand to conditional selects which use all compare intrinsics.
259 Example:
260 friend F32vec4 select_eq(const F32vec4 &_A, const F32vec4 &_B, const F32vec4 &_C, const F32vec4 &_D)
261 {
262  F32vec4 _Mask = _mm_cmpeq_ps(_A,_B);
263  return( (_Mask & _C) | F32vec4((_mm_andnot_ps(_Mask,_D))));
264 }
265 */
266 
267 #define Fvec32s4_SELECT(op) \
268 inline F32vec4 select_##op (const F32vec4 &_A, const F32vec4 &_B, const F32vec4 &_C, const F32vec4 &_D) \
269 { \
270  F32vec4 _Mask = _mm_cmp##op##_ps(_A,_B); \
271  return( (_Mask & _C) | F32vec4((_mm_andnot_ps(_Mask,_D)))); \
272 }
273 Fvec32s4_SELECT(eq) /* generates select_eq(_A,_B) */
274 Fvec32s4_SELECT(lt) /* generates select_lt(_A,_B) */
275 Fvec32s4_SELECT(le) /* generates select_le(_A,_B) */
276 Fvec32s4_SELECT(gt) /* generates select_gt(_A,_B) */
277 Fvec32s4_SELECT(ge) /* generates select_ge(_A,_B) */
278 Fvec32s4_SELECT(neq) /* generates select_neq(_A,_B) */
279 Fvec32s4_SELECT(nlt) /* generates select_nlt(_A,_B) */
280 Fvec32s4_SELECT(nle) /* generates select_nle(_A,_B) */
281 Fvec32s4_SELECT(ngt) /* generates select_ngt(_A,_B) */
282 Fvec32s4_SELECT(nge) /* generates select_nge(_A,_B) */
283 #undef Fvec32s4_SELECT
284 
/* Streaming SIMD Extensions Integer Intrinsic Functions */
/* Everything in this section is compiled only when _M_IX86 is defined. */

#if defined(_M_IX86)

/* Max and Min */
inline Is16vec4 simd_max(const Is16vec4 &_A, const Is16vec4 &_B)
{
    return _m_pmaxsw(_A, _B);
}

inline Is16vec4 simd_min(const Is16vec4 &_A, const Is16vec4 &_B)
{
    return _m_pminsw(_A, _B);
}

inline Iu8vec8 simd_max(const Iu8vec8 &_A, const Iu8vec8 &_B)
{
    return _m_pmaxub(_A, _B);
}

inline Iu8vec8 simd_min(const Iu8vec8 &_A, const Iu8vec8 &_B)
{
    return _m_pminub(_A, _B);
}

/* Average */
inline Iu16vec4 simd_avg(const Iu16vec4 &_A, const Iu16vec4 &_B)
{
    return _mm_avg_pu16(_A, _B);
}

inline Iu8vec8 simd_avg(const Iu8vec8 &_A, const Iu8vec8 &_B)
{
    return _mm_avg_pu8(_A, _B);
}

/* Move ByteMask To Int: returns mask formed from most sig bits of each element of _A */
inline int move_mask(const I8vec8 &_A)
{
    return _m_pmovmskb(_A);
}

/* Packed Multiply High Unsigned */
inline Iu16vec4 mul_high(const Iu16vec4 &_A, const Iu16vec4 &_B)
{
    return _m_pmulhuw(_A, _B);
}

/* Byte Mask Write: Write bytes if most significant bit in each corresponding byte is set */
inline void mask_move(const I8vec8 &_A, const I8vec8 &_B, char *_Addr)
{
    _m_maskmovq(_A, _B, _Addr);
}

/* Data Motion: Store Non Temporal */
inline void store_nta(__m64 *_P, const M64 &_A)
{
    _mm_stream_pi(_P, _A);
}

/* Conversions between ivec <-> fvec */

/* Convert first element of F32vec4 to int with truncation */
inline int F32vec4ToInt(const F32vec4 &_A)
{
    return _mm_cvtt_ss2si(_A);
}

/* Convert two lower SP FP values of _A to Is32vec2 with truncation */
inline Is32vec2 F32vec4ToIs32vec2 (const F32vec4 &_A)
{
    return Is32vec2(_mm_cvtt_ps2pi(_A));
}

/* Convert the 32-bit int _I to an SP FP value; the upper three SP FP values are passed through from _A. */
inline F32vec4 IntToF32vec4(const F32vec4 &_A, int _I)
{
    return F32vec4(_mm_cvt_si2ss(_A, _I));
}

/* Convert the two 32-bit integer values in _B to two SP FP values; the upper two SP FP values are passed from _A. */
inline F32vec4 Is32vec2ToF32vec4(const F32vec4 &_A, const Is32vec2 &_B)
{
    return F32vec4(_mm_cvt_pi2ps(_A, _B));
}

#endif /* defined(_M_IX86) */
349 
/*
 * F32vec1: one single-precision float carried in the lowest element of an
 * __m128.  Arithmetic, compare and min/max members forward to the scalar
 * (_ss) intrinsics; the logical operators use the full-width _ps forms.
 */
class F32vec1
{
protected:
    __m128 vec;     /* the value lives in the lowest element */
public:

    /* Constructors: 1 float */

    /* Default constructor: vec is left uninitialized */
    F32vec1() {}

    /* Convert _I to SP FP in the lowest element.  The conversion starts from
       a zeroed vector rather than from the not-yet-initialized member, so the
       upper three elements are well defined (zero). */
    F32vec1(int _I) { vec = _mm_cvt_si2ss(_mm_setzero_ps(), _I); }

    /* Initialize the lowest SP FP with a float (_mm_set_ss) */
    explicit F32vec1(float _F) { vec = _mm_set_ss(_F); }

    /* Initialize the lowest SP FP with a double, narrowed to float (_mm_set_ss) */
    explicit F32vec1(double _D) { vec = _mm_set_ss((float) _D); }

    /* initialize with __m128 data type */
    F32vec1(__m128 _M) { vec = _M; }

    /* Conversion functions */
    operator __m128() const { return vec; }     /* Convert to __m128 */

    /* Logical Operators (bitwise on the whole 128-bit pattern) */
    friend F32vec1 operator &(const F32vec1 &_A, const F32vec1 &_B) { return _mm_and_ps(_A,_B); }
    friend F32vec1 operator |(const F32vec1 &_A, const F32vec1 &_B) { return _mm_or_ps(_A,_B); }
    friend F32vec1 operator ^(const F32vec1 &_A, const F32vec1 &_B) { return _mm_xor_ps(_A,_B); }

    /* Arithmetic Operators */
    friend F32vec1 operator +(const F32vec1 &_A, const F32vec1 &_B) { return _mm_add_ss(_A,_B); }
    friend F32vec1 operator -(const F32vec1 &_A, const F32vec1 &_B) { return _mm_sub_ss(_A,_B); }
    friend F32vec1 operator *(const F32vec1 &_A, const F32vec1 &_B) { return _mm_mul_ss(_A,_B); }
    friend F32vec1 operator /(const F32vec1 &_A, const F32vec1 &_B) { return _mm_div_ss(_A,_B); }

    /* Compound assignment */
    F32vec1& operator +=(const F32vec1 &_A) { return *this = _mm_add_ss(vec,_A); }
    F32vec1& operator -=(const F32vec1 &_A) { return *this = _mm_sub_ss(vec,_A); }
    F32vec1& operator *=(const F32vec1 &_A) { return *this = _mm_mul_ss(vec,_A); }
    F32vec1& operator /=(const F32vec1 &_A) { return *this = _mm_div_ss(vec,_A); }
    F32vec1& operator &=(const F32vec1 &_A) { return *this = _mm_and_ps(vec,_A); }
    F32vec1& operator |=(const F32vec1 &_A) { return *this = _mm_or_ps(vec,_A); }
    F32vec1& operator ^=(const F32vec1 &_A) { return *this = _mm_xor_ps(vec,_A); }


    /* Square Root */
    friend F32vec1 sqrt(const F32vec1 &_A) { return _mm_sqrt_ss(_A); }
    /* Reciprocal (approximate, rcpss) */
    friend F32vec1 rcp(const F32vec1 &_A) { return _mm_rcp_ss(_A); }
    /* Reciprocal Square Root (approximate, rsqrtss) */
    friend F32vec1 rsqrt(const F32vec1 &_A) { return _mm_rsqrt_ss(_A); }

    /* NewtonRaphson Reciprocal
       [2 * rcpss(x) - (x * rcpss(x) * rcpss(x))] */
    friend F32vec1 rcp_nr(const F32vec1 &_A)
    {
        F32vec1 _Ra0 = _mm_rcp_ss(_A);
        return _mm_sub_ss(_mm_add_ss(_Ra0, _Ra0), _mm_mul_ss(_mm_mul_ss(_Ra0, _A), _Ra0));
    }

    /* NewtonRaphson Reciprocal Square Root
       0.5 * rsqrtss * (3 - x * rsqrtss(x) * rsqrtss(x)) */
#pragma warning(push)
#pragma warning(disable : 4640)
    friend F32vec1 rsqrt_nr(const F32vec1 &_A)
    {
        static const F32vec1 fvecf0pt5(0.5f);
        static const F32vec1 fvecf3pt0(3.0f);
        F32vec1 _Ra0 = _mm_rsqrt_ss(_A);
        return (fvecf0pt5 * _Ra0) * (fvecf3pt0 - (_A * _Ra0) * _Ra0);
    }
#pragma warning(pop)

    /* Compares: Mask is returned */
    /* Macros expand to all compare intrinsics.  Example:
       friend F32vec1 cmpeq(const F32vec1 &_A, const F32vec1 &_B)
       { return _mm_cmpeq_ss(_A,_B);} */
    #define Fvec32s1_COMP(op) \
    friend F32vec1 cmp##op (const F32vec1 &_A, const F32vec1 &_B) { return _mm_cmp##op##_ss(_A,_B); }
        Fvec32s1_COMP(eq)                   /* expanded to cmpeq(_A,_B) */
        Fvec32s1_COMP(lt)                   /* expanded to cmplt(_A,_B) */
        Fvec32s1_COMP(le)                   /* expanded to cmple(_A,_B) */
        Fvec32s1_COMP(gt)                   /* expanded to cmpgt(_A,_B) */
        Fvec32s1_COMP(ge)                   /* expanded to cmpge(_A,_B) */
        Fvec32s1_COMP(neq)                  /* expanded to cmpneq(_A,_B) */
        Fvec32s1_COMP(nlt)                  /* expanded to cmpnlt(_A,_B) */
        Fvec32s1_COMP(nle)                  /* expanded to cmpnle(_A,_B) */
        Fvec32s1_COMP(ngt)                  /* expanded to cmpngt(_A,_B) */
        Fvec32s1_COMP(nge)                  /* expanded to cmpnge(_A,_B) */
    #undef Fvec32s1_COMP

    /* Min and Max */
    friend F32vec1 simd_min(const F32vec1 &_A, const F32vec1 &_B) { return _mm_min_ss(_A,_B); }
    friend F32vec1 simd_max(const F32vec1 &_A, const F32vec1 &_B) { return _mm_max_ss(_A,_B); }

    /* Debug Features */
#if defined (_ENABLE_VEC_DEBUG)
    /* Output */
    friend std::ostream & operator<<(std::ostream & _Os, const F32vec1 &_A)
    {
        /* To use: cout << "Elements of F32vec1 fvec are: " << fvec; */
        /* Reads the first float of the object; _A is only read, so keep it const */
        const float *_Fp = (const float*)&_A;
        _Os << "float:" << *_Fp;
        return _Os;
    }
#endif /* defined (_ENABLE_VEC_DEBUG) */

};
456 
457  /* Conditional Selects:*/
458 /*(_A OP _B)? _C : _D; where OP is any compare operator
459 Macros expand to conditional selects which use all compare intrinsics.
460 Example:
461 friend F32vec1 select_eq(const F32vec1 &_A, const F32vec1 &_B, const F32vec1 &_C, const F32vec1 &_D)
462 {
463  F32vec1 _Mask = _mm_cmpeq_ss(_A,_B);
464  return( (_Mask & _C) | F32vec1((_mm_andnot_ps(_Mask,_D))));
465 }
466 */
467 
468 #define Fvec32s1_SELECT(op) \
469 inline F32vec1 select_##op (const F32vec1 &_A, const F32vec1 &_B, const F32vec1 &_C, const F32vec1 &_D) \
470 { \
471  F32vec1 _Mask = _mm_cmp##op##_ss(_A,_B); \
472  return( (_Mask & _C) | F32vec1((_mm_andnot_ps(_Mask,_D)))); \
473 }
474 Fvec32s1_SELECT(eq) /* generates select_eq(_A,_B) */
475 Fvec32s1_SELECT(lt) /* generates select_lt(_A,_B) */
476 Fvec32s1_SELECT(le) /* generates select_le(_A,_B) */
477 Fvec32s1_SELECT(gt) /* generates select_gt(_A,_B) */
478 Fvec32s1_SELECT(ge) /* generates select_ge(_A,_B) */
479 Fvec32s1_SELECT(neq) /* generates select_neq(_A,_B) */
480 Fvec32s1_SELECT(nlt) /* generates select_nlt(_A,_B) */
481 Fvec32s1_SELECT(nle) /* generates select_nle(_A,_B) */
482 Fvec32s1_SELECT(ngt) /* generates select_ngt(_A,_B) */
483 Fvec32s1_SELECT(nge) /* generates select_nge(_A,_B) */
484 #undef Fvec32s1_SELECT
485 
486 /* Conversions between ivec <-> fvec */
487 
488 /* Convert F32vec1 to int */
489 inline int F32vec1ToInt(const F32vec1 &_A)
490 {
491  return _mm_cvtt_ss2si(_A);
492 }
493 
494 
495 
496 #pragma pack(pop) /* 16-B aligned */
497 
498 #endif /* defined (_M_CEE_PURE) */
499 
500 #endif /* RC_INVOKED */
501 #endif /* _FVEC_H_INCLUDED */
Definition: fvec.h:78
F32vec4(__m128 _M)
Definition: fvec.h:88
__m128 _mm_set_ps(float _A, float _B, float _C, float _D)
#define Fvec32s4_SELECT(op)
Definition: fvec.h:267
uint_2 operator|(const uint_2 &_Lhs, const uint_2 &_Rhs) __GPU
Definition: amp_short_vectors.h:22852
__m128 _mm_and_ps(__m128 _A, __m128 _B)
friend F32vec4 simd_max(const F32vec4 &_A, const F32vec4 &_B)
Definition: fvec.h:189
#define Fvec32s1_SELECT(op)
Definition: fvec.h:468
__m128 _mm_movehl_ps(__m128, __m128)
friend F32vec4 operator&(const F32vec4 &_A, const F32vec4 &_B)
Definition: fvec.h:113
__m128 _mm_rsqrt_ss(__m128 _A)
float rsqrt(float _X) __GPU_ONLY
Returns the reciprocal of the square root of the argument
Definition: amp_math.h:954
__m128 _mm_sqrt_ps(__m128 _A)
__m128 _mm_max_ps(__m128 _A, __m128 _B)
Is16vec8 simd_min(const Is16vec8 &_A, const Is16vec8 &_B)
Definition: dvec.h:528
void _mm_storeu_ps(float *_V, __m128 _A)
F32vec4 unpack_high(const F32vec4 &_A, const F32vec4 &_B)
Definition: fvec.h:233
void store_nta(float *_P, const F32vec4 &_A)
Definition: fvec.h:253
float sqrt(float _X) __GPU_ONLY
Calculates the square root of the argument
Definition: amp_math.h:1100
friend F32vec4 sqrt(const F32vec4 &_A)
Definition: fvec.h:142
F32vec4 & operator^=(const F32vec4 &_A)
Definition: fvec.h:131
__m128 _mm_sqrt_ss(__m128 _A)
F32vec4 & operator/=(const F32vec4 &_A)
Definition: fvec.h:128
uint_2 operator<<(const uint_2 &_Lhs, const uint_2 &_Rhs) __GPU
Definition: amp_short_vectors.h:22866
std::enable_if< details::_Is_extent_or_index< _Tuple_type< _Rank > >::value, _Tuple_type< _Rank > >::type operator*(const _Tuple_type< _Rank > &_Lhs, typename _Tuple_type< _Rank >::value_type _Rhs) __GPU
Definition: amp.h:890
__m128 _mm_shuffle_ps(__m128 _A, __m128 _B, unsigned int _Imm8)
friend float add_horizontal(const F32vec4 &_A)
Definition: fvec.h:134
__m128 _mm_loadu_ps(float const *_A)
Is16vec8 mul_high(const Is16vec8 &_A, const Is16vec8 &_B)
Definition: dvec.h:521
friend F32vec4 rsqrt(const F32vec4 &_A)
Definition: fvec.h:146
friend F32vec4 rsqrt_nr(const F32vec4 &_A)
Definition: fvec.h:160
uint_2 operator^(const uint_2 &_Lhs, const uint_2 &_Rhs) __GPU
Definition: amp_short_vectors.h:22845
float rcp(float _X) __GPU_ONLY
Calculates a fast, approximate reciprocal of the argument
Definition: amp.h:7493
__m128 _mm_add_ps(__m128 _A, __m128 _B)
__m128 _mm_sub_ps(__m128 _A, __m128 _B)
__m128 _mm_div_ps(__m128 _A, __m128 _B)
friend F32vec4 operator*(const F32vec4 &_A, const F32vec4 &_B)
Definition: fvec.h:120
F32vec4 & operator|=(const F32vec4 &_A)
Definition: fvec.h:130
__m128 _mm_set_ss(float _A)
friend F32vec4 operator^(const F32vec4 &_A, const F32vec4 &_B)
Definition: fvec.h:115
launch & operator^=(launch &_Left, launch _Right)
Definition: future:87
const float & operator[](int _I) const
Definition: fvec.h:209
std::enable_if< details::_Is_extent_or_index< _Tuple_type< _Rank > >::value, _Tuple_type< _Rank > >::type operator-(const _Tuple_type< _Rank > &_Lhs, const _Tuple_type< _Rank > &_Rhs) __GPU
Definition: amp.h:845
#define _In_z_
Definition: sal.h:310
Is16vec8 simd_max(const Is16vec8 &_A, const Is16vec8 &_B)
Definition: dvec.h:527
#define _In_
Definition: sal.h:305
std::enable_if< details::_Is_extent_or_index< _Tuple_type< _Rank > >::value, _Tuple_type< _Rank > >::type operator+(const _Tuple_type< _Rank > &_Lhs, const _Tuple_type< _Rank > &_Rhs) __GPU
Definition: amp.h:836
friend F32vec4 operator|(const F32vec4 &_A, const F32vec4 &_B)
Definition: fvec.h:114
__m128 _mm_rcp_ps(__m128 _A)
const F32vec4 & _B
Definition: fvec.h:188
void storeu(float *_P, const F32vec4 &_A)
Definition: fvec.h:247
__m64
Definition: mmintrin.h:45
void _mm_stream_ps(float *, __m128)
friend F32vec4 operator-(const F32vec4 &_A, const F32vec4 &_B)
Definition: fvec.h:119
friend F32vec4 operator+(const F32vec4 &_A, const F32vec4 &_B)
Definition: fvec.h:118
int move_mask(const F32vec4 &_A)
Definition: fvec.h:237
friend F32vec4 rcp_nr(const F32vec4 &_A)
Definition: fvec.h:150
#define _f32vec4_abs_mask
Definition: fvec.h:76
basic_ostream< char, char_traits< char > > ostream
Definition: iosfwd:679
friend F32vec4 abs(const F32vec4 &_A)
Definition: fvec.h:192
__m128 _mm_xor_ps(__m128 _A, __m128 _B)
launch & operator&=(launch &_Left, launch _Right)
Definition: future:75
__m128
Definition: xmmintrin.h:75
F32vec4 & operator=(float _F)
Definition: fvec.h:101
__m128 m
Definition: fvec.h:73
__m128 _mm_mul_ps(__m128 _A, __m128 _B)
#define _VEC_ASSERT(_Expression)
Definition: fvec.h:59
__m128 _mm_set_ps1(float _A)
Iu16vec8 simd_avg(const Iu16vec8 &_A, const Iu16vec8 &_B)
Definition: dvec.h:609
__m128 _mm_cvt_si2ss(__m128, int)
__m128 _mm_add_ss(__m128 _A, __m128 _B)
__m128 _mm_min_ss(__m128 _A, __m128 _B)
float & operator[](int _I)
Definition: fvec.h:217
F32vec4()
Definition: fvec.h:85
__m128 _mm_min_ps(__m128 _A, __m128 _B)
int _mm_movemask_ps(__m128 _A)
friend F32vec4 operator/(const F32vec4 &_A, const F32vec4 &_B)
Definition: fvec.h:121
F32vec4 & operator+=(const F32vec4 &_A)
Definition: fvec.h:125
__m128 _mm_rcp_ss(__m128 _A)
F32vec4(double _D)
Definition: fvec.h:97
__m128 vec
Definition: fvec.h:81
__m128 _mm_sub_ss(__m128 _A, __m128 _B)
F32vec4 & operator-=(const F32vec4 &_A)
Definition: fvec.h:126
__m128 _mm_unpackhi_ps(__m128 _A, __m128 _B)
__m128 _mm_unpacklo_ps(__m128 _A, __m128 _B)
void __cdecl _wassert(_In_z_ const wchar_t *_Message, _In_z_ const wchar_t *_File, _In_ unsigned _Line)
__m128 _mm_div_ss(__m128 _A, __m128 _B)
__m128 _mm_rsqrt_ps(__m128 _A)
uint_2 operator&(const uint_2 &_Lhs, const uint_2 &_Rhs) __GPU
Definition: amp_short_vectors.h:22859
__m128 _mm_max_ss(__m128 _A, __m128 _B)
__m128 _mm_or_ps(__m128 _A, __m128 _B)
F32vec4 & operator&=(const F32vec4 &_A)
Definition: fvec.h:129
int i[4]
Definition: fvec.h:72
__m128 _mm_mul_ss(__m128 _A, __m128 _B)
float _mm_cvtss_f32(__m128 _A)
void loadu(F32vec4 &_A, float *_P)
Definition: fvec.h:243
friend F32vec4 rcp(const F32vec4 &_A)
Definition: fvec.h:144
F32vec4 unpack_low(const F32vec4 &_A, const F32vec4 &_B)
Definition: fvec.h:229
F32vec4(float _F3, float _F2, float _F1, float _F0)
Definition: fvec.h:91
int _mm_cvtt_ss2si(__m128 _A)
Fvec32s4_COMP(eq) Fvec32s4_COMP(lt) Fvec32s4_COMP(le) Fvec32s4_COMP(gt) Fvec32s4_COMP(ge) Fvec32s4_COMP(neq) Fvec32s4_COMP(nlt) Fvec32s4_COMP(nle) Fvec32s4_COMP(ngt) Fvec32s4_COMP(nge) friend F32vec4 simd_min(const F32vec4 &_A
#define Fvec32s1_COMP(op)
launch & operator|=(launch &_Left, launch _Right)
Definition: future:81
const union @87 __f32vec4_abs_mask_cheat
F32vec4(float _F)
Definition: fvec.h:94
F32vec4 & operator*=(const F32vec4 &_A)
Definition: fvec.h:127
std::enable_if< details::_Is_extent_or_index< _Tuple_type< _Rank > >::value, _Tuple_type< _Rank > >::type operator/(const _Tuple_type< _Rank > &_Lhs, typename _Tuple_type< _Rank >::value_type _Rhs) __GPU
Definition: amp.h:908