30 #ifndef GDALSSE_PRIV_H_INCLUDED 31 #define GDALSSE_PRIV_H_INCLUDED 39 #if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION) 42 #include <emmintrin.h> 46 #include <smmintrin.h> 57 XMMReg2Double(
double val) { xmm = _mm_load_sd (&val); }
58 XMMReg2Double(
const XMMReg2Double& other) : xmm(other.xmm) {}
60 static inline XMMReg2Double Zero()
67 static inline XMMReg2Double Load1ValHighAndLow(
const double* ptr)
70 reg.nsLoad1ValHighAndLow(ptr);
74 static inline XMMReg2Double Load2Val(
const double* ptr)
81 static inline XMMReg2Double Load2Val(
const float* ptr)
88 static inline XMMReg2Double Load2ValAligned(
const double* ptr)
91 reg.nsLoad2ValAligned(ptr);
95 static inline XMMReg2Double Load2Val(
const unsigned char* ptr)
102 static inline XMMReg2Double Load2Val(
const short* ptr)
109 static inline XMMReg2Double Load2Val(
const unsigned short* ptr)
116 static inline XMMReg2Double Equals(
const XMMReg2Double& expr1,
const XMMReg2Double& expr2)
119 reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
123 static inline XMMReg2Double NotEquals(
const XMMReg2Double& expr1,
const XMMReg2Double& expr2)
126 reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
130 static inline XMMReg2Double Greater(
const XMMReg2Double& expr1,
const XMMReg2Double& expr2)
133 reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
137 static inline XMMReg2Double And(
const XMMReg2Double& expr1,
const XMMReg2Double& expr2)
140 reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
144 static inline XMMReg2Double Ternary(
const XMMReg2Double& cond,
const XMMReg2Double& true_expr,
const XMMReg2Double& false_expr)
147 reg.xmm = _mm_or_pd(_mm_and_pd (cond.xmm, true_expr.xmm), _mm_andnot_pd(cond.xmm, false_expr.xmm));
151 static inline XMMReg2Double Min(
const XMMReg2Double& expr1,
const XMMReg2Double& expr2)
154 reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
158 inline void nsLoad1ValHighAndLow(
const double* ptr)
160 xmm = _mm_load1_pd(ptr);
163 inline void nsLoad2Val(
const double* ptr)
165 xmm = _mm_loadu_pd(ptr);
168 inline void nsLoad2ValAligned(
const double* pval)
170 xmm = _mm_load_pd(pval);
173 inline void nsLoad2Val(
const float* pval)
175 __m128 temp1 = _mm_load_ss(pval);
176 __m128 temp2 = _mm_load_ss(pval + 1);
177 temp1 = _mm_shuffle_ps(temp1, temp2, _MM_SHUFFLE(1,0,1,0));
178 temp1 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,3,2,0));
179 xmm = _mm_cvtps_pd(temp1);
182 inline void nsLoad2Val(
const unsigned char* ptr)
184 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS 187 __m128i xmm_i = _mm_cvtsi32_si128(s);
189 __m128i xmm_i = _mm_cvtsi32_si128(*(
unsigned short*)(ptr));
192 xmm_i = _mm_cvtepu8_epi32(xmm_i);
194 xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
195 xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
197 xmm = _mm_cvtepi32_pd(xmm_i);
200 inline void nsLoad2Val(
const short* ptr)
204 __m128i xmm_i = _mm_cvtsi32_si128(i);
206 xmm_i = _mm_cvtepi16_epi32(xmm_i);
208 xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i);
209 xmm_i = _mm_srai_epi32(xmm_i, 16);
211 xmm = _mm_cvtepi32_pd(xmm_i);
214 inline void nsLoad2Val(
const unsigned short* ptr)
218 __m128i xmm_i = _mm_cvtsi32_si128(i);
220 xmm_i = _mm_cvtepu16_epi32(xmm_i);
222 xmm_i = _mm_unpacklo_epi16(xmm_i,_mm_setzero_si128());
224 xmm = _mm_cvtepi32_pd(xmm_i);
227 static inline void Load4Val(
const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
229 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS 232 __m128i xmm_i = _mm_cvtsi32_si128(i);
234 __m128i xmm_i = _mm_cvtsi32_si128(*(
int*)(ptr));
237 xmm_i = _mm_cvtepu8_epi32(xmm_i);
239 xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
240 xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
242 low.xmm = _mm_cvtepi32_pd(xmm_i);
243 high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i,_MM_SHUFFLE(3,2,3,2)));
246 static inline void Load4Val(
const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
249 high.nsLoad2Val(ptr+2);
252 static inline void Load4Val(
const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
255 high.nsLoad2Val(ptr+2);
258 static inline void Load4Val(
const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
261 high.nsLoad2Val(ptr+2);
264 static inline void Load4Val(
const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
266 __m128 temp1 = _mm_loadu_ps(ptr);
267 __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
268 low.xmm = _mm_cvtps_pd(temp1);
269 high.xmm = _mm_cvtps_pd(temp2);
272 inline void Zeroize()
274 xmm = _mm_setzero_pd();
277 inline XMMReg2Double& operator= (
const XMMReg2Double& other)
283 inline XMMReg2Double& operator+= (
const XMMReg2Double& other)
285 xmm = _mm_add_pd(xmm, other.xmm);
289 inline XMMReg2Double& operator*= (
const XMMReg2Double& other)
291 xmm = _mm_mul_pd(xmm, other.xmm);
295 inline XMMReg2Double operator+ (
const XMMReg2Double& other)
const 298 ret.xmm = _mm_add_pd(xmm, other.xmm);
302 inline XMMReg2Double operator- (
const XMMReg2Double& other)
const 305 ret.xmm = _mm_sub_pd(xmm, other.xmm);
309 inline XMMReg2Double operator* (
const XMMReg2Double& other)
const 312 ret.xmm = _mm_mul_pd(xmm, other.xmm);
316 inline XMMReg2Double operator/ (
const XMMReg2Double& other)
const 319 ret.xmm = _mm_div_pd(xmm, other.xmm);
323 inline void AddLowAndHigh()
326 xmm2 = _mm_shuffle_pd(xmm,xmm,_MM_SHUFFLE2(0,1));
327 xmm = _mm_add_pd(xmm, xmm2);
330 inline void Store2Double(
double* pval)
const 332 _mm_storeu_pd(pval, xmm);
335 inline void Store2DoubleAligned(
double* pval)
const 337 _mm_store_pd(pval, xmm);
340 void Store2Val(
unsigned short* ptr)
const 342 __m128i tmp = _mm_cvtpd_epi32(xmm);
343 ptr[0] = (
GUInt16)_mm_extract_epi16(tmp, 0);
344 ptr[1] = (
GUInt16)_mm_extract_epi16(tmp, 2);
347 inline operator double ()
const 350 _mm_store_sd(&val, xmm);
357 #warning "Software emulation of SSE2 !" 366 XMMReg2Double(
double val) { low = val; high = 0.0; }
367 XMMReg2Double(
const XMMReg2Double& other) : low(other.low), high(other.high) {}
369 static inline XMMReg2Double Zero()
376 static inline XMMReg2Double Load1ValHighAndLow(
const double* ptr)
379 reg.nsLoad1ValHighAndLow(ptr);
383 static inline XMMReg2Double Equals(
const XMMReg2Double& expr1,
const XMMReg2Double& expr2)
387 if (expr1.low == expr2.low)
388 memset(&(reg.low), 0xFF,
sizeof(
double));
392 if (expr1.high == expr2.high)
393 memset(&(reg.high), 0xFF,
sizeof(
double));
400 static inline XMMReg2Double NotEquals(
const XMMReg2Double& expr1,
const XMMReg2Double& expr2)
404 if (expr1.low != expr2.low)
405 memset(&(reg.low), 0xFF,
sizeof(
double));
409 if (expr1.high != expr2.high)
410 memset(&(reg.high), 0xFF,
sizeof(
double));
417 static inline XMMReg2Double Greater(
const XMMReg2Double& expr1,
const XMMReg2Double& expr2)
421 if (expr1.low > expr2.low)
422 memset(&(reg.low), 0xFF,
sizeof(
double));
426 if (expr1.high > expr2.high)
427 memset(&(reg.high), 0xFF,
sizeof(
double));
434 static inline XMMReg2Double And(
const XMMReg2Double& expr1,
const XMMReg2Double& expr2)
437 int low1[2], high1[2];
438 int low2[2], high2[2];
439 memcpy(low1, &expr1.low,
sizeof(
double));
440 memcpy(high1, &expr1.high,
sizeof(
double));
441 memcpy(low2, &expr2.low,
sizeof(
double));
442 memcpy(high2, &expr2.high,
sizeof(
double));
445 high1[0] &= high2[0];
446 high1[1] &= high2[1];
447 memcpy(®.low, low1,
sizeof(
double));
448 memcpy(®.high, high1,
sizeof(
double));
452 static inline XMMReg2Double Ternary(
const XMMReg2Double& cond,
const XMMReg2Double& true_expr,
const XMMReg2Double& false_expr)
456 reg.low = true_expr.low;
458 reg.low = false_expr.low;
460 reg.high = true_expr.high;
462 reg.high = false_expr.high;
466 static inline XMMReg2Double Min(
const XMMReg2Double& expr1,
const XMMReg2Double& expr2)
469 reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.high;
470 reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.low;
474 static inline XMMReg2Double Load2Val(
const double* ptr)
481 static inline XMMReg2Double Load2ValAligned(
const double* ptr)
484 reg.nsLoad2ValAligned(ptr);
488 static inline XMMReg2Double Load2Val(
const float* ptr)
495 static inline XMMReg2Double Load2Val(
const unsigned char* ptr)
502 static inline XMMReg2Double Load2Val(
const short* ptr)
509 static inline XMMReg2Double Load2Val(
const unsigned short* ptr)
516 inline void nsLoad1ValHighAndLow(
const double* pval)
522 inline void nsLoad2Val(
const double* pval)
528 inline void nsLoad2ValAligned(
const double* pval)
534 inline void nsLoad2Val(
const float* pval)
540 inline void nsLoad2Val(
const unsigned char* ptr)
546 inline void nsLoad2Val(
const short* ptr)
552 inline void nsLoad2Val(
const unsigned short* ptr)
558 static inline void Load4Val(
const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
566 static inline void Load4Val(
const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
569 high.nsLoad2Val(ptr+2);
572 static inline void Load4Val(
const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
575 high.nsLoad2Val(ptr+2);
578 static inline void Load4Val(
const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
581 high.nsLoad2Val(ptr+2);
584 static inline void Load4Val(
const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
587 high.nsLoad2Val(ptr+2);
590 inline void Zeroize()
596 inline XMMReg2Double& operator= (
const XMMReg2Double& other)
603 inline XMMReg2Double& operator+= (
const XMMReg2Double& other)
610 inline XMMReg2Double& operator*= (
const XMMReg2Double& other)
617 inline XMMReg2Double operator+ (
const XMMReg2Double& other)
const 620 ret.low = low + other.low;
621 ret.high = high + other.high;
625 inline XMMReg2Double operator- (
const XMMReg2Double& other)
const 628 ret.low = low - other.low;
629 ret.high = high - other.high;
633 inline XMMReg2Double operator* (
const XMMReg2Double& other)
const 636 ret.low = low * other.low;
637 ret.high = high * other.high;
641 inline XMMReg2Double operator/ (
const XMMReg2Double& other)
const 644 ret.low = low / other.low;
645 ret.high = high / other.high;
649 inline void AddLowAndHigh()
651 double add = low + high;
656 inline void Store2Double(
double* pval)
const 662 inline void Store2DoubleAligned(
double* pval)
const 668 void Store2Val(
unsigned short* ptr)
const 674 inline operator double ()
const 685 XMMReg2Double low, high;
688 XMMReg4Double(
const XMMReg4Double& other) : low(other.low), high(other.high) {}
690 static inline XMMReg4Double Zero()
698 static inline XMMReg4Double Load1ValHighAndLow(
const double* ptr)
701 reg.low.nsLoad1ValHighAndLow(ptr);
706 static inline XMMReg4Double Load4Val(
const unsigned char* ptr)
709 XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
713 static inline XMMReg4Double Load4Val(
const short* ptr)
716 reg.low.nsLoad2Val(ptr);
717 reg.high.nsLoad2Val(ptr+2);
721 static inline XMMReg4Double Load4Val(
const unsigned short* ptr)
724 reg.low.nsLoad2Val(ptr);
725 reg.high.nsLoad2Val(ptr+2);
729 static inline XMMReg4Double Load4Val(
const double* ptr)
732 reg.low.nsLoad2Val(ptr);
733 reg.high.nsLoad2Val(ptr+2);
737 static inline XMMReg4Double Load4ValAligned(
const double* ptr)
740 reg.low.nsLoad2ValAligned(ptr);
741 reg.high.nsLoad2ValAligned(ptr+2);
745 static inline XMMReg4Double Load4Val(
const float* ptr)
748 XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
752 static inline XMMReg4Double Equals(
const XMMReg4Double& expr1,
const XMMReg4Double& expr2)
755 reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
756 reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
760 static inline XMMReg4Double NotEquals(
const XMMReg4Double& expr1,
const XMMReg4Double& expr2)
763 reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
764 reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
768 static inline XMMReg4Double Greater(
const XMMReg4Double& expr1,
const XMMReg4Double& expr2)
771 reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
772 reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
776 static inline XMMReg4Double And(
const XMMReg4Double& expr1,
const XMMReg4Double& expr2)
779 reg.low = XMMReg2Double::And(expr1.low, expr2.low);
780 reg.high = XMMReg2Double::And(expr1.high, expr2.high);
784 static inline XMMReg4Double Ternary(
const XMMReg4Double& cond,
const XMMReg4Double& true_expr,
const XMMReg4Double& false_expr)
787 reg.low = XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
788 reg.high = XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
792 static inline XMMReg4Double Min(
const XMMReg4Double& expr1,
const XMMReg4Double& expr2)
795 reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
796 reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
800 inline XMMReg4Double& operator= (
const XMMReg4Double& other)
807 inline XMMReg4Double& operator+= (
const XMMReg4Double& other)
814 inline XMMReg4Double& operator*= (
const XMMReg4Double& other)
821 inline XMMReg4Double operator+ (
const XMMReg4Double& other)
const 824 ret.low = low + other.low;
825 ret.high = high + other.high;
829 inline XMMReg4Double operator- (
const XMMReg4Double& other)
const 832 ret.low = low - other.low;
833 ret.high = high - other.high;
837 inline XMMReg4Double operator* (
const XMMReg4Double& other)
const 840 ret.low = low * other.low;
841 ret.high = high * other.high;
845 inline XMMReg4Double operator/ (
const XMMReg4Double& other)
const 848 ret.low = low / other.low;
849 ret.high = high / other.high;
853 inline void AddLowAndHigh()
859 inline XMMReg2Double& GetLow()
864 inline XMMReg2Double& GetHigh()
869 void Store4Val(
unsigned short* ptr)
const 872 high.Store2Val(ptr+2);
Core portability definitions for CPL.
unsigned short GUInt16
Unsigned int16 type.
Definition: cpl_port.h:205