STLdoc
STLdocumentation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
Classes | Macros | Functions | Variables
fvec.h File Reference
#include <xmmintrin.h>
#include <ivec.h>
#include <vcruntime.h>

Go to the source code of this file.

Classes

class  F32vec4
 

Macros

#define _VEC_ASSERT(_Expression)   (void)( (!!(_Expression)) || (_wassert(_CRT_WIDE(#_Expression), _CRT_WIDE(__FILE__), __LINE__), 0) )
 
#define _f32vec4_abs_mask   ((F32vec4)__f32vec4_abs_mask_cheat.m)
 
#define Fvec32s4_COMP(op)   friend F32vec4 cmp##op (const F32vec4 &_A, const F32vec4 &_B) { return _mm_cmp##op##_ps(_A,_B); }
 
#define Fvec32s4_SELECT(op)
 
#define Fvec32s1_COMP(op)   friend F32vec1 cmp##op (const F32vec1 &_A, const F32vec1 &_B) { return _mm_cmp##op##_ss(_A,_B); }
 
#define Fvec32s1_SELECT(op)
 

Functions

void __cdecl _wassert (_In_z_ const wchar_t *_Message, _In_z_ const wchar_t *_File, _In_ unsigned _Line)
 
F32vec4 unpack_low (const F32vec4 &_A, const F32vec4 &_B)
 
F32vec4 unpack_high (const F32vec4 &_A, const F32vec4 &_B)
 
int move_mask (const F32vec4 &_A)
 
void loadu (F32vec4 &_A, float *_P)
 
void storeu (float *_P, const F32vec4 &_A)
 
void store_nta (float *_P, const F32vec4 &_A)
 
 Fvec32s4_SELECT (eq) Fvec32s4_SELECT(lt) Fvec32s4_SELECT(le) Fvec32s4_SELECT(gt) Fvec32s4_SELECT(ge) Fvec32s4_SELECT(neq) Fvec32s4_SELECT(nlt) Fvec32s4_SELECT(nle) Fvec32s4_SELECT(ngt) Fvec32s4_SELECT(nge) class F32vec1
 
 Fvec32s1_SELECT (eq) Fvec32s1_SELECT(lt) Fvec32s1_SELECT(le) Fvec32s1_SELECT(gt) Fvec32s1_SELECT(ge) Fvec32s1_SELECT(neq) Fvec32s1_SELECT(nlt) Fvec32s1_SELECT(nle) Fvec32s1_SELECT(ngt) Fvec32s1_SELECT(nge) inline int F32vec1ToInt(const F32vec1 &_A)
 

Variables

union {
   int   i [4]
 
   __m128   m
 
} __f32vec4_abs_mask_cheat = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}
 

Macro Definition Documentation

#define _f32vec4_abs_mask   ((F32vec4)__f32vec4_abs_mask_cheat.m)
#define _VEC_ASSERT (   _Expression)    (void)( (!!(_Expression)) || (_wassert(_CRT_WIDE(#_Expression), _CRT_WIDE(__FILE__), __LINE__), 0) )
#define Fvec32s1_COMP (   op)    friend F32vec1 cmp##op (const F32vec1 &_A, const F32vec1 &_B) { return _mm_cmp##op##_ss(_A,_B); }
#define Fvec32s1_SELECT (   op)
Value:
inline F32vec1 select_##op (const F32vec1 &_A, const F32vec1 &_B, const F32vec1 &_C, const F32vec1 &_D) \
{ \
F32vec1 _Mask = _mm_cmp##op##_ss(_A,_B); \
return( (_Mask & _C) | F32vec1((_mm_andnot_ps(_Mask,_D)))); \
}
__m128 _mm_andnot_ps(__m128 _A, __m128 _B)
return(_PAIR_TYPE(_FwdIt)(_First, _First))
#define Fvec32s4_COMP (   op)    friend F32vec4 cmp##op (const F32vec4 &_A, const F32vec4 &_B) { return _mm_cmp##op##_ps(_A,_B); }
#define Fvec32s4_SELECT (   op)
Value:
inline F32vec4 select_##op (const F32vec4 &_A, const F32vec4 &_B, const F32vec4 &_C, const F32vec4 &_D) \
{ \
F32vec4 _Mask = _mm_cmp##op##_ps(_A,_B); \
return( (_Mask & _C) | F32vec4((_mm_andnot_ps(_Mask,_D)))); \
}
Definition: fvec.h:78
__m128 _mm_andnot_ps(__m128 _A, __m128 _B)
return(_PAIR_TYPE(_FwdIt)(_First, _First))

Function Documentation

void __cdecl _wassert ( _In_z_ const wchar_t * _Message,
_In_z_ const wchar_t * _File,
_In_ unsigned  _Line 
)
Fvec32s1_SELECT ( eq  ) const
490 {
491  return _mm_cvtt_ss2si(_A);
492 }
int _mm_cvtt_ss2si(__m128 _A)
Fvec32s4_SELECT ( eq  )
289  { return _m_pmaxsw(_A,_B);}
290 inline Is16vec4 simd_min(const Is16vec4 &_A, const Is16vec4 &_B) { return _m_pminsw(_A,_B);}
291 inline Iu8vec8 simd_max(const Iu8vec8 &_A, const Iu8vec8 &_B) { return _m_pmaxub(_A,_B);}
292 inline Iu8vec8 simd_min(const Iu8vec8 &_A, const Iu8vec8 &_B) { return _m_pminub(_A,_B);}
293 
294 /* Average */
295 inline Iu16vec4 simd_avg(const Iu16vec4 &_A, const Iu16vec4 &_B) { return _mm_avg_pu16(_A,_B); }
296 inline Iu8vec8 simd_avg(const Iu8vec8 &_A, const Iu8vec8 &_B) { return _mm_avg_pu8(_A,_B); }
297 
298 /* Move ByteMask To Int: returns mask formed from most sig bits of each vec of a */
299 inline int move_mask(const I8vec8 &_A) { return _m_pmovmskb(_A);}
300 
301 /* Packed Multiply High Unsigned */
302 inline Iu16vec4 mul_high(const Iu16vec4 &_A, const Iu16vec4 &_B) { return _m_pmulhuw(_A,_B); }
303 
304 /* Byte Mask Write: Write bytes if most significant bit in each corresponding byte is set */
305 inline void mask_move(const I8vec8 &_A, const I8vec8 &_B, char *_Addr) { _m_maskmovq(_A, _B, _Addr); }
306 
307 /* Data Motion: Store Non Temporal */
308 inline void store_nta(__m64 *_P, const M64 &_A) { _mm_stream_pi(_P,_A); }
309 
310 /* Conversions between ivec <-> fvec */
311 
312 /* Convert first element of F32vec4 to int with truncation */
313 inline int F32vec4ToInt(const F32vec4 &_A)
314 {
315 
316  return _mm_cvtt_ss2si(_A);
317 
318 }
319 
320 /* Convert two lower SP FP values of a to Is32vec2 with truncation */
321 inline Is32vec2 F32vec4ToIs32vec2 (const F32vec4 &_A)
322 {
323 
324  __m64 _Result;
325  _Result = _mm_cvtt_ps2pi(_A);
326  return Is32vec2(_Result);
327 
328 }
329 
330 /* Convert the 32-bit int i to an SP FP value; the upper three SP FP values are passed through from a. */
331 inline F32vec4 IntToF32vec4(const F32vec4 &_A, int _I)
332 {
333 
334  __m128 _Result;
335  _Result = _mm_cvt_si2ss(_A, _I);
336  return F32vec4(_Result);
337 
338 }
339 
340 /* Convert the two 32-bit integer values in b to two SP FP values; the upper two SP FP values are passed from a. */
341 inline F32vec4 Is32vec2ToF32vec4(const F32vec4 &_A, const Is32vec2 &_B)
342 {
343 
344  __m128 _Result;
345  _Result = _mm_cvt_pi2ps(_A,_B);
346  return F32vec4(_Result);
347 }
348 #endif
349 
350 class F32vec1
351 {
352 protected:
353  __m128 vec;
354 public:
355 
356  /* Constructors: 1 float */
357  F32vec1() {}
358 
359  F32vec1(int _I) { vec = _mm_cvt_si2ss(vec, _I);};
360 
361  /* Initialize the lowest SP FP with the float; the upper three SP FPs are set to zero */
362  explicit F32vec1(float _F) { vec = _mm_set_ss(_F); }
363 
364  /* Initialize the lowest SP FP with the double converted to float; the upper three SP FPs are set to zero */
365  explicit F32vec1(double _D) { vec = _mm_set_ss((float) _D); }
366 
367  /* initialize with __m128 data type */
368  F32vec1(__m128 _M) { vec = _M; }
369 
370  /* Conversion functions */
371  operator __m128() const { return vec; } /* Convert to float */
372 
373  /* Logical Operators */
374  friend F32vec1 operator &(const F32vec1 &_A, const F32vec1 &_B) { return _mm_and_ps(_A,_B); }
375  friend F32vec1 operator |(const F32vec1 &_A, const F32vec1 &_B) { return _mm_or_ps(_A,_B); }
376  friend F32vec1 operator ^(const F32vec1 &_A, const F32vec1 &_B) { return _mm_xor_ps(_A,_B); }
377 
378  /* Arithmetic Operators */
379  friend F32vec1 operator +(const F32vec1 &_A, const F32vec1 &_B) { return _mm_add_ss(_A,_B); }
380  friend F32vec1 operator -(const F32vec1 &_A, const F32vec1 &_B) { return _mm_sub_ss(_A,_B); }
381  friend F32vec1 operator *(const F32vec1 &_A, const F32vec1 &_B) { return _mm_mul_ss(_A,_B); }
382  friend F32vec1 operator /(const F32vec1 &_A, const F32vec1 &_B) { return _mm_div_ss(_A,_B); }
383 
384  F32vec1& operator +=(const F32vec1 &_A) { return *this = _mm_add_ss(vec,_A); }
385  F32vec1& operator -=(const F32vec1 &_A) { return *this = _mm_sub_ss(vec,_A); }
386  F32vec1& operator *=(const F32vec1 &_A) { return *this = _mm_mul_ss(vec,_A); }
387  F32vec1& operator /=(const F32vec1 &_A) { return *this = _mm_div_ss(vec,_A); }
388  F32vec1& operator &=(const F32vec1 &_A) { return *this = _mm_and_ps(vec,_A); }
389  F32vec1& operator |=(const F32vec1 &_A) { return *this = _mm_or_ps(vec,_A); }
390  F32vec1& operator ^=(const F32vec1 &_A) { return *this = _mm_xor_ps(vec,_A); }
391 
392 
393  /* Square Root */
394  friend F32vec1 sqrt(const F32vec1 &_A) { return _mm_sqrt_ss(_A); }
395  /* Reciprocal */
396  friend F32vec1 rcp(const F32vec1 &_A) { return _mm_rcp_ss(_A); }
397  /* Reciprocal Square Root */
398  friend F32vec1 rsqrt(const F32vec1 &_A) { return _mm_rsqrt_ss(_A); }
399 
400  /* NewtonRaphson Reciprocal
401  [2 * rcpss(x) - (x * rcpss(x) * rcpss(x))] */
402  friend F32vec1 rcp_nr(const F32vec1 &_A)
403  {
404  F32vec1 _Ra0 = _mm_rcp_ss(_A);
405  return _mm_sub_ss(_mm_add_ss(_Ra0, _Ra0), _mm_mul_ss(_mm_mul_ss(_Ra0, _A), _Ra0));
406  }
407 
408  /* NewtonRaphson Reciprocal Square Root
409  0.5 * rsqrtss(x) * (3 - x * rsqrtss(x) * rsqrtss(x)) */
410 #pragma warning(push)
411 #pragma warning(disable : 4640)
412  friend F32vec1 rsqrt_nr(const F32vec1 &_A)
413  {
414  static const F32vec1 fvecf0pt5(0.5f);
415  static const F32vec1 fvecf3pt0(3.0f);
416  F32vec1 _Ra0 = _mm_rsqrt_ss(_A);
417  return (fvecf0pt5 * _Ra0) * (fvecf3pt0 - (_A * _Ra0) * _Ra0);
418  }
419 #pragma warning(pop)
420 
421  /* Compares: Mask is returned */
422  /* Macros expand to all compare intrinsics. Example:
423  friend F32vec1 cmpeq(const F32vec1 &_A, const F32vec1 &_B)
424  { return _mm_cmpeq_ss(_A,_B);} */
425  #define Fvec32s1_COMP(op) \
426  friend F32vec1 cmp##op (const F32vec1 &_A, const F32vec1 &_B) { return _mm_cmp##op##_ss(_A,_B); }
427  Fvec32s1_COMP(eq) /* expanded to cmpeq(_A,_B) */
428  Fvec32s1_COMP(lt) /* expanded to cmplt(_A,_B) */
429  Fvec32s1_COMP(le) /* expanded to cmple(_A,_B) */
430  Fvec32s1_COMP(gt) /* expanded to cmpgt(_A,_B) */
431  Fvec32s1_COMP(ge) /* expanded to cmpge(_A,_B) */
432  Fvec32s1_COMP(neq) /* expanded to cmpneq(_A,_B) */
433  Fvec32s1_COMP(nlt) /* expanded to cmpnlt(_A,_B) */
434  Fvec32s1_COMP(nle) /* expanded to cmpnle(_A,_B) */
435  Fvec32s1_COMP(ngt) /* expanded to cmpngt(_A,_B) */
436  Fvec32s1_COMP(nge) /* expanded to cmpnge(_A,_B) */
437  #undef Fvec32s1_COMP
438 
439  /* Min and Max */
440  friend F32vec1 simd_min(const F32vec1 &_A, const F32vec1 &_B) { return _mm_min_ss(_A,_B); }
441  friend F32vec1 simd_max(const F32vec1 &_A, const F32vec1 &_B) { return _mm_max_ss(_A,_B); }
442 
443  /* Debug Features */
444 #if defined (_ENABLE_VEC_DEBUG)
445  /* Output */
446  friend std::ostream & operator<<(std::ostream & _Os, const F32vec1 &_A)
447  {
448  /* To use: cout << "Elements of F32vec1 fvec are: " << fvec; */
449  float *_Fp = (float*)&_A;
450  _Os << "float:" << *_Fp;
451  return _Os;
452  }
453 #endif /* defined (_ENABLE_VEC_DEBUG) */
454 
455 };
Definition: fvec.h:78
uint_2 operator|(const uint_2 &_Lhs, const uint_2 &_Rhs) __GPU
Definition: amp_short_vectors.h:22852
__m128 _mm_and_ps(__m128 _A, __m128 _B)
__m128 _mm_rsqrt_ss(__m128 _A)
float rsqrt(float _X) __GPU_ONLY
Returns the reciprocal of the square root of the argument
Definition: amp_math.h:954
Is16vec8 simd_min(const Is16vec8 &_A, const Is16vec8 &_B)
Definition: dvec.h:528
void store_nta(float *_P, const F32vec4 &_A)
Definition: fvec.h:253
float sqrt(float _X) __GPU_ONLY
Calculates the square root of the argument
Definition: amp_math.h:1100
__m128 _mm_sqrt_ss(__m128 _A)
uint_2 operator<<(const uint_2 &_Lhs, const uint_2 &_Rhs) __GPU
Definition: amp_short_vectors.h:22866
std::enable_if< details::_Is_extent_or_index< _Tuple_type< _Rank > >::value, _Tuple_type< _Rank > >::type operator*(const _Tuple_type< _Rank > &_Lhs, typename _Tuple_type< _Rank >::value_type _Rhs) __GPU
Definition: amp.h:890
Is16vec8 mul_high(const Is16vec8 &_A, const Is16vec8 &_B)
Definition: dvec.h:521
uint_2 operator^(const uint_2 &_Lhs, const uint_2 &_Rhs) __GPU
Definition: amp_short_vectors.h:22845
float rcp(float _X) __GPU_ONLY
Calculates a fast, approximate reciprocal of the argument
Definition: amp.h:7493
__m128 _mm_set_ss(float _A)
launch & operator^=(launch &_Left, launch _Right)
Definition: future:87
std::enable_if< details::_Is_extent_or_index< _Tuple_type< _Rank > >::value, _Tuple_type< _Rank > >::type operator-(const _Tuple_type< _Rank > &_Lhs, const _Tuple_type< _Rank > &_Rhs) __GPU
Definition: amp.h:845
Is16vec8 simd_max(const Is16vec8 &_A, const Is16vec8 &_B)
Definition: dvec.h:527
std::enable_if< details::_Is_extent_or_index< _Tuple_type< _Rank > >::value, _Tuple_type< _Rank > >::type operator+(const _Tuple_type< _Rank > &_Lhs, const _Tuple_type< _Rank > &_Rhs) __GPU
Definition: amp.h:836
__m64
Definition: mmintrin.h:45
int move_mask(const F32vec4 &_A)
Definition: fvec.h:237
basic_ostream< char, char_traits< char > > ostream
Definition: iosfwd:679
__m128 _mm_xor_ps(__m128 _A, __m128 _B)
launch & operator&=(launch &_Left, launch _Right)
Definition: future:75
__m128
Definition: xmmintrin.h:75
Iu16vec8 simd_avg(const Iu16vec8 &_A, const Iu16vec8 &_B)
Definition: dvec.h:609
__m128 _mm_cvt_si2ss(__m128, int)
__m128 _mm_add_ss(__m128 _A, __m128 _B)
__m128 _mm_min_ss(__m128 _A, __m128 _B)
__m128 _mm_rcp_ss(__m128 _A)
__m128 _mm_sub_ss(__m128 _A, __m128 _B)
__m128 _mm_div_ss(__m128 _A, __m128 _B)
uint_2 operator&(const uint_2 &_Lhs, const uint_2 &_Rhs) __GPU
Definition: amp_short_vectors.h:22859
__m128 _mm_max_ss(__m128 _A, __m128 _B)
__m128 _mm_or_ps(__m128 _A, __m128 _B)
__m128 _mm_mul_ss(__m128 _A, __m128 _B)
int _mm_cvtt_ss2si(__m128 _A)
#define Fvec32s1_COMP(op)
launch & operator|=(launch &_Left, launch _Right)
Definition: future:81
std::enable_if< details::_Is_extent_or_index< _Tuple_type< _Rank > >::value, _Tuple_type< _Rank > >::type operator/(const _Tuple_type< _Rank > &_Lhs, typename _Tuple_type< _Rank >::value_type _Rhs) __GPU
Definition: amp.h:908
void loadu ( F32vec4 & _A,
float *  _P 
)
inline
244 { _A = _mm_loadu_ps(_P); }
__m128 _mm_loadu_ps(float const *_A)
int move_mask ( const F32vec4 & _A)
inline
238 { return _mm_movemask_ps(_A);}
int _mm_movemask_ps(__m128 _A)
void store_nta ( float *  _P,
const F32vec4 & _A 
)
inline
254 { _mm_stream_ps(_P,_A);}
void _mm_stream_ps(float *, __m128)
void storeu ( float *  _P,
const F32vec4 & _A 
)
inline
248 { _mm_storeu_ps(_P, _A); }
void _mm_storeu_ps(float *_V, __m128 _A)
F32vec4 unpack_high ( const F32vec4 & _A,
const F32vec4 & _B 
)
inline
234 { return _mm_unpackhi_ps(_A, _B); }
__m128 _mm_unpackhi_ps(__m128 _A, __m128 _B)
F32vec4 unpack_low ( const F32vec4 & _A,
const F32vec4 & _B 
)
inline
230 { return _mm_unpacklo_ps(_A, _B); }
__m128 _mm_unpacklo_ps(__m128 _A, __m128 _B)

Variable Documentation

const { ... } __f32vec4_abs_mask_cheat
int i[4]
__m128 m