289 {
return _m_pmaxsw(_A,_B);}
290 inline Is16vec4
simd_min(
const Is16vec4 &_A,
const Is16vec4 &_B) {
return _m_pminsw(_A,_B);}
291 inline Iu8vec8
simd_max(
const Iu8vec8 &_A,
const Iu8vec8 &_B) {
return _m_pmaxub(_A,_B);}
292 inline Iu8vec8
simd_min(
const Iu8vec8 &_A,
const Iu8vec8 &_B) {
return _m_pminub(_A,_B);}
295 inline Iu16vec4
simd_avg(
const Iu16vec4 &_A,
const Iu16vec4 &_B) {
return _mm_avg_pu16(_A,_B); }
296 inline Iu8vec8
simd_avg(
const Iu8vec8 &_A,
const Iu8vec8 &_B) {
return _mm_avg_pu8(_A,_B); }
299 inline int move_mask(
const I8vec8 &_A) {
return _m_pmovmskb(_A);}
302 inline Iu16vec4
mul_high(
const Iu16vec4 &_A,
const Iu16vec4 &_B) {
return _m_pmulhuw(_A,_B); }
305 inline void mask_move(
const I8vec8 &_A,
const I8vec8 &_B,
char *_Addr) { _m_maskmovq(_A, _B, _Addr); }
308 inline void store_nta(
__m64 *_P,
const M64 &_A) { _mm_stream_pi(_P,_A); }
313 inline int F32vec4ToInt(
const F32vec4 &_A)
321 inline Is32vec2 F32vec4ToIs32vec2 (
const F32vec4 &_A)
325 _Result = _mm_cvtt_ps2pi(_A);
326 return Is32vec2(_Result);
341 inline F32vec4 Is32vec2ToF32vec4(
const F32vec4 &_A,
const Is32vec2 &_B)
345 _Result = _mm_cvt_pi2ps(_A,_B);
362 explicit F32vec1(
float _F) { vec =
_mm_set_ss(_F); }
365 explicit F32vec1(
double _D) { vec =
_mm_set_ss((
float) _D); }
368 F32vec1(
__m128 _M) { vec = _M; }
371 operator __m128()
const {
return vec; }
375 friend F32vec1
operator |(
const F32vec1 &_A,
const F32vec1 &_B) {
return _mm_or_ps(_A,_B); }
384 F32vec1& operator +=(
const F32vec1 &_A) {
return *
this =
_mm_add_ss(vec,_A); }
385 F32vec1& operator -=(
const F32vec1 &_A) {
return *
this =
_mm_sub_ss(vec,_A); }
386 F32vec1& operator *=(
const F32vec1 &_A) {
return *
this =
_mm_mul_ss(vec,_A); }
387 F32vec1& operator /=(
const F32vec1 &_A) {
return *
this =
_mm_div_ss(vec,_A); }
396 friend F32vec1
rcp(
const F32vec1 &_A) {
return _mm_rcp_ss(_A); }
402 friend F32vec1 rcp_nr(
const F32vec1 &_A)
410 #pragma warning(push)
411 #pragma warning(disable : 4640)
412 friend F32vec1 rsqrt_nr(
const F32vec1 &_A)
414 static const F32vec1 fvecf0pt5(0.5f);
415 static const F32vec1 fvecf3pt0(3.0f);
417 return (fvecf0pt5 * _Ra0) * (fvecf3pt0 - (_A * _Ra0) * _Ra0);
// Fvec32s1_COMP(op): expands to a friend function `cmp<op>(_A, _B)` that
// returns the result of the matching `_mm_cmp<op>_ss` intrinsic as an F32vec1.
#define Fvec32s1_COMP(op)                                              \
    friend F32vec1 cmp##op(const F32vec1 &_A, const F32vec1 &_B)       \
    {                                                                  \
        return F32vec1(_mm_cmp##op##_ss(_A, _B));                      \
    }
440 friend F32vec1
simd_min(
const F32vec1 &_A,
const F32vec1 &_B) {
return _mm_min_ss(_A,_B); }
441 friend F32vec1
simd_max(
const F32vec1 &_A,
const F32vec1 &_B) {
return _mm_max_ss(_A,_B); }
444 #if defined (_ENABLE_VEC_DEBUG)
449 float *_Fp = (
float*)&_A;
450 _Os <<
"float:" << *_Fp;
uint_2 operator|(const uint_2 &_Lhs, const uint_2 &_Rhs) __GPU
Definition: amp_short_vectors.h:22852
__m128 _mm_and_ps(__m128 _A, __m128 _B)
__m128 _mm_rsqrt_ss(__m128 _A)
float rsqrt(float _X) __GPU_ONLY
Returns the reciprocal of the square root of the argument
Definition: amp_math.h:954
Is16vec8 simd_min(const Is16vec8 &_A, const Is16vec8 &_B)
Definition: dvec.h:528
void store_nta(float *_P, const F32vec4 &_A)
Definition: fvec.h:253
float sqrt(float _X) __GPU_ONLY
Calculates the square root of the argument
Definition: amp_math.h:1100
__m128 _mm_sqrt_ss(__m128 _A)
uint_2 operator<<(const uint_2 &_Lhs, const uint_2 &_Rhs) __GPU
Definition: amp_short_vectors.h:22866
std::enable_if< details::_Is_extent_or_index< _Tuple_type< _Rank > >::value, _Tuple_type< _Rank > >::type operator*(const _Tuple_type< _Rank > &_Lhs, typename _Tuple_type< _Rank >::value_type _Rhs) __GPU
Definition: amp.h:890
Is16vec8 mul_high(const Is16vec8 &_A, const Is16vec8 &_B)
Definition: dvec.h:521
uint_2 operator^(const uint_2 &_Lhs, const uint_2 &_Rhs) __GPU
Definition: amp_short_vectors.h:22845
float rcp(float _X) __GPU_ONLY
Calculates a fast, approximate reciprocal of the argument
Definition: amp.h:7493
__m128 _mm_set_ss(float _A)
launch & operator^=(launch &_Left, launch _Right)
Definition: future:87
std::enable_if< details::_Is_extent_or_index< _Tuple_type< _Rank > >::value, _Tuple_type< _Rank > >::type operator-(const _Tuple_type< _Rank > &_Lhs, const _Tuple_type< _Rank > &_Rhs) __GPU
Definition: amp.h:845
Is16vec8 simd_max(const Is16vec8 &_A, const Is16vec8 &_B)
Definition: dvec.h:527
std::enable_if< details::_Is_extent_or_index< _Tuple_type< _Rank > >::value, _Tuple_type< _Rank > >::type operator+(const _Tuple_type< _Rank > &_Lhs, const _Tuple_type< _Rank > &_Rhs) __GPU
Definition: amp.h:836
__m64
Definition: mmintrin.h:45
int move_mask(const F32vec4 &_A)
Definition: fvec.h:237
basic_ostream< char, char_traits< char > > ostream
Definition: iosfwd:679
__m128 _mm_xor_ps(__m128 _A, __m128 _B)
launch & operator&=(launch &_Left, launch _Right)
Definition: future:75
__m128
Definition: xmmintrin.h:75
Iu16vec8 simd_avg(const Iu16vec8 &_A, const Iu16vec8 &_B)
Definition: dvec.h:609
__m128 _mm_cvt_si2ss(__m128, int)
__m128 _mm_add_ss(__m128 _A, __m128 _B)
__m128 _mm_min_ss(__m128 _A, __m128 _B)
__m128 _mm_rcp_ss(__m128 _A)
__m128 _mm_sub_ss(__m128 _A, __m128 _B)
__m128 _mm_div_ss(__m128 _A, __m128 _B)
uint_2 operator&(const uint_2 &_Lhs, const uint_2 &_Rhs) __GPU
Definition: amp_short_vectors.h:22859
__m128 _mm_max_ss(__m128 _A, __m128 _B)
__m128 _mm_or_ps(__m128 _A, __m128 _B)
__m128 _mm_mul_ss(__m128 _A, __m128 _B)
int _mm_cvtt_ss2si(__m128 _A)
#define Fvec32s1_COMP(op)
launch & operator|=(launch &_Left, launch _Right)
Definition: future:81
std::enable_if< details::_Is_extent_or_index< _Tuple_type< _Rank > >::value, _Tuple_type< _Rank > >::type operator/(const _Tuple_type< _Rank > &_Lhs, typename _Tuple_type< _Rank >::value_type _Rhs) __GPU
Definition: amp.h:908