29 #include "../precompiled.h"
43 #if defined(MACOS_X) && defined(__i386__)
45 #include <xmmintrin.h>
// Build the 8-bit immediate for SHUFPS/_mm_shuffle_ps: 2 bits per lane,
// packed as x→bits[7:6], y→[5:4], z→[3:2], w→[1:0].
47 #define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
// Reversed-argument variant: same immediate with the arguments in the
// opposite order (x→bits[1:0] ... w→[7:6]), so lanes can be listed
// low-to-high at the call site.
48 #define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
56 return "MMX & SSE & SSE2";
67 int i, cnt, pre, post;
79 if ( ((
int) src0) & 3 ) {
83 post = count - (cnt<<2);
103 src0_p = (
char *) src0;
104 _mm_prefetch(src0_p+64, _MM_HINT_NTA);
105 constant_p = (
char *) &constant;
106 xmm1 = _mm_load_ss((
float *)constant_p);
107 xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ));
133 xmm0 = _mm_loadu_ps((
float *) src0_p);
134 _mm_prefetch(src0_p+128, _MM_HINT_NTA);
135 xmm0 = _mm_cmplt_ps(xmm0, xmm1);
137 xmm0i = (__m128i) xmm0;
138 xmm0i = _mm_packs_epi32(xmm0i, xmm0i);
139 xmm0i = _mm_packs_epi16(xmm0i, xmm0i);
140 mask_l = _mm_cvtsi128_si32(xmm0i);
142 mask_l = mask_l & 0x01010101;
143 mask_l = mask_l << bitNum;
144 dst_l = *((
int *) dst_p);
145 mask_l = mask_l | dst_l;
146 *((
int *) dst_p) = mask_l;
147 src0_p = src0_p + 16;
155 aligned = (
float *) ((((
int) src0) + 15) & ~15);
156 if ( (
int)aligned > ((
int)src0) +
count ) {
161 pre = aligned - src0;
162 cnt = (count - pre) >> 2;
163 post = count - pre - (cnt<<2);
183 src0_p = (
char *) src0;
184 _mm_prefetch(src0_p+64, _MM_HINT_NTA);
185 constant_p = (
char *) &constant;
186 xmm1 = _mm_load_ss((
float *)constant_p);
187 xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ));
214 xmm0 = _mm_load_ps((
float *) src0_p);
215 _mm_prefetch(src0_p+128, _MM_HINT_NTA);
216 xmm0 = _mm_cmplt_ps(xmm0, xmm1);
218 xmm0i = (__m128i) xmm0;
219 xmm0i = _mm_packs_epi32(xmm0i, xmm0i);
220 xmm0i = _mm_packs_epi16(xmm0i, xmm0i);
221 mask_l = _mm_cvtsi128_si32(xmm0i);
223 mask_l = mask_l & 0x01010101;
224 mask_l = mask_l << bitNum;
225 dst_l = *((
int *) dst_p);
226 mask_l = mask_l | dst_l;
227 *((
int *) dst_p) = mask_l;
228 src0_p = src0_p + 16;
239 for ( i = 0; i < pre; i++ ) {
240 dst[
i] |= ( src0[
i] <
c ) << bitNum;
242 for ( i = count - post; i <
count; i++ ) {
243 dst[
i] |= ( src0[
i] <
c ) << bitNum;
247 #elif defined(_WIN32)
249 #include <xmmintrin.h>
// Build the 8-bit immediate for SHUFPS/_mm_shuffle_ps: 2 bits per lane,
// packed as x→bits[7:6], y→[5:4], z→[3:2], w→[1:0].
251 #define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
// Reversed-argument variant: arguments listed low-to-high lane order.
252 #define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
// Same idea for SHUFPD: 1 bit per double lane, x→bit 1, y→bit 0.
253 #define SHUFFLEPD( x, y ) (( (x) & 1 ) << 1 | ( (y) & 1 ))
// Reversed-argument variant for SHUFPD.
254 #define R_SHUFFLEPD( x, y ) (( (y) & 1 ) << 1 | ( (x) & 1 ))
// Declare 16-byte-aligned static constant vectors (ALIGN16 is a
// project-supplied alignment macro — presumably __declspec(align(16)) on
// MSVC; defined outside this view).
// ALIGN4_INIT1: 4 identical elements; ALIGN4_INIT4: 4 explicit elements;
// ALIGN8_INIT1: 8 identical elements (for 16-bit lanes).
257 #define ALIGN4_INIT1( X, INIT ) ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT }
258 #define ALIGN4_INIT4( X, I0, I1, I2, I3 ) ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
259 #define ALIGN8_INIT1( X, INIT ) ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }
// 16-byte-aligned constant vectors used by the SSE/SSE2 kernels below.
// NOTE(review): `1 << 31` left-shifts into the sign bit of a signed int,
// which is undefined behavior in standard C/C++ (in practice it yields
// 0x80000000 on the MSVC/x86 targets this file is written for) — a
// strictly-conforming form would be 1UL << 31.
261 ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
262 ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );
// Sign bit set in lane 0 only.
264 ALIGN4_INIT4( unsigned long SIMD_SP_singleSignBitMask, (unsigned long) ( 1 << 31 ), 0, 0, 0 );
// Sign bit set in all four lanes (0x80000000 each).
265 ALIGN4_INIT1( unsigned long SIMD_SP_signBitMask, (unsigned long) ( 1 << 31 ) );
// All bits except the sign bit (0x7FFFFFFF) — ANDing clears the sign, i.e. fabs.
266 ALIGN4_INIT1( unsigned long SIMD_SP_absMask, (unsigned long) ~( 1 << 31 ) );
// NOTE(review): ~( 1 << 23 ) is 0xFF7FFFFF, which is not the IEEE-754
// single-precision exponent mask (0x7F800000); can't tell the intended
// use from this view — verify against the call sites before changing.
267 ALIGN4_INIT1( unsigned long SIMD_SP_infinityMask, (unsigned long) ~( 1 << 23 ) );
// Common float splats.
269 ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
270 ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
271 ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
272 ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
273 ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
// 32768.0f — float counterpart of SIMD_W_maxShort above.
274 ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
// Small epsilon guard (e.g. against divide-by-zero).
275 ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
289 return "MMX & SSE & SSE2";
292 #if 0 // the SSE2 code is ungodly slow
317 #define NSKIP( n, s ) ((n<<3)|(s&7))
318 switch(
NSKIP( n, skip ) ) {
319 case NSKIP( 1, 0 ): x[0] = b[0];
321 case NSKIP( 2, 0 ): x[0] = b[0];
322 case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
324 case NSKIP( 3, 0 ): x[0] = b[0];
325 case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
326 case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
328 case NSKIP( 4, 0 ): x[0] = b[0];
329 case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
330 case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
331 case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
333 case NSKIP( 5, 0 ): x[0] = b[0];
334 case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
335 case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
336 case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
337 case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
339 case NSKIP( 6, 0 ): x[0] = b[0];
340 case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
341 case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
342 case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
343 case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
344 case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
346 case NSKIP( 7, 0 ): x[0] = b[0];
347 case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
348 case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
349 case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
350 case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
351 case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
352 case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
361 case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
362 case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
363 case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
384 cvtps2pd xmm0, [esi+ecx]
385 cvtps2pd xmm2, [edi+ecx]
387 cvtps2pd xmm1, [esi+ecx+8]
388 cvtps2pd xmm3, [edi+ecx+8]
393 cvtps2pd xmm2, [esi+ecx-(16*4)]
394 cvtps2pd xmm3, [edi+ecx-(16*4)]
395 cvtps2pd xmm4, [esi+ecx-(14*4)]
397 cvtps2pd xmm5, [edi+ecx-(14*4)]
399 cvtps2pd xmm2, [esi+ecx-(12*4)]
401 cvtps2pd xmm3, [edi+ecx-(12*4)]
403 cvtps2pd xmm4, [esi+ecx-(10*4)]
405 cvtps2pd xmm5, [edi+ecx-(10*4)]
407 cvtps2pd xmm2, [esi+ecx-(8*4)]
409 cvtps2pd xmm3, [edi+ecx-(8*4)]
411 cvtps2pd xmm4, [esi+ecx-(6*4)]
413 cvtps2pd xmm5, [edi+ecx-(6*4)]
415 cvtps2pd xmm2, [esi+ecx-(4*4)]
417 cvtps2pd xmm3, [edi+ecx-(4*4)]
419 cvtps2pd xmm4, [esi+ecx-(2*4)]
421 cvtps2pd xmm5, [edi+ecx-(2*4)]
431 cvtps2pd xmm2, [esi+ecx-(8*4)]
432 cvtps2pd xmm3, [edi+ecx-(8*4)]
433 cvtps2pd xmm7, [esi+ecx-(6*4)]
435 cvtps2pd xmm5, [edi+ecx-(6*4)]
437 cvtps2pd xmm6, [esi+ecx-(4*4)]
439 cvtps2pd xmm3, [edi+ecx-(4*4)]
441 cvtps2pd xmm4, [esi+ecx-(2*4)]
443 cvtps2pd xmm7, [edi+ecx-(2*4)]
452 cvtps2pd xmm2, [esi+ecx-(4*4)]
453 cvtps2pd xmm3, [edi+ecx-(4*4)]
454 cvtps2pd xmm4, [esi+ecx-(2*4)]
456 cvtps2pd xmm5, [edi+ecx-(2*4)]
464 shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 0 )
473 cvtss2sd xmm1, [esi-(3*4)]
474 cvtss2sd xmm2, [edi-(3*4)]
478 cvtss2sd xmm3, [esi-(2*4)]
479 cvtss2sd xmm4, [edi-(2*4)]
483 cvtss2sd xmm5, [esi-(1*4)]
484 cvtss2sd xmm6, [edi-(1*4)]
488 cvtss2sd xmm1, [ebx+eax]
520 lptr = L.ToFloatPtr();
521 nc = L.GetNumColumns();
533 x[0] = b[0] - lptr[1*nc+0] * x[1];
537 x[1] = b[1] - lptr[2*nc+1] * x[2];
538 x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
542 x[2] = b[2] - lptr[3*nc+2] * x[3];
543 x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
544 x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
548 x[3] = b[3] - lptr[4*nc+3] * x[4];
549 x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
550 x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
551 x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
555 x[4] = b[4] - lptr[5*nc+4] * x[5];
556 x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
557 x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
558 x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
559 x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
563 x[5] = b[5] - lptr[6*nc+5] * x[6];
564 x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
565 x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
566 x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
567 x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
568 x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
597 cvtps2pd xmm0, [ebx+eax*4-16]
598 cvtps2pd xmm2, [ebx+eax*4-8]
605 cvtps2pd xmm4, [edi+8]
607 cvtss2sd xmm5, [esi+4*ecx+0]
608 shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
612 cvtps2pd xmm6, [edi+8]
616 cvtss2sd xmm7, [esi+4*ecx+4]
617 shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
621 cvtps2pd xmm4, [edi+8]
625 cvtss2sd xmm5, [esi+4*ecx+8]
626 shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
630 cvtps2pd xmm6, [edi+8]
634 cvtss2sd xmm7, [esi+4*ecx+12]
635 shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
645 cvtps2pd xmm4, [edi+8]
646 cvtss2sd xmm5, [esi+4*ecx]
647 shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
659 shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 1 )
661 shufpd xmm3, xmm3, R_SHUFFLEPD( 1, 1 )
667 cvtss2sd xmm7, [edi+8]
669 cvtss2sd xmm7, [edi+4]
680 cvtss2sd xmm7, [edi+4]
705 lptr = L.ToFloatPtr() + m * L.GetNumColumns() + m - 4;
716 cvtps2pd xmm0, [ebx+eax*4-16]
717 cvtps2pd xmm2, [ebx+eax*4-8]
724 cvtps2pd xmm4, [edi+8]
726 cvtss2sd xmm5, [esi+4*ecx+0]
727 shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
731 cvtps2pd xmm6, [edi+8]
735 cvtss2sd xmm7, [esi+4*ecx+4]
736 shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
740 cvtps2pd xmm4, [edi+8]
744 cvtss2sd xmm5, [esi+4*ecx+8]
745 shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
749 cvtps2pd xmm6, [edi+8]
753 cvtss2sd xmm7, [esi+4*ecx+12]
754 shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
769 shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 1 )
771 shufpd xmm3, xmm3, R_SHUFFLEPD( 1, 1 )
777 cvtss2sd xmm7, [edi+8]
779 cvtss2sd xmm7, [edi+4]
790 cvtss2sd xmm7, [edi+4]
816 for ( i = (m&3)-1; i >= 0; i-- ) {
819 for ( j = i + 1; j < m; j++ ) {
820 s0 -= lptr[0] * x[
j];
849 movaps xmm0, [edi+eax+0*16]
850 movaps xmm1, [edi+eax+1*16]
851 movaps xmm2, [edi+eax+2*16]
852 movaps xmm3, [edi+eax+3*16]
861 prefetchnta [edi+eax+128]
868 movlps [esi-4*4*2], xmm4
869 movhps [esi-3*4*2], xmm4
870 movlps [esi-2*4*2], xmm6
871 movhps [esi-1*4*2], xmm6
virtual void VPCALL MatX_LowerTriangularSolve(const idMatX &L, float *x, const float *b, const int n, int skip=0)
static const float INFINITY
assert(prefInfo.fullscreenBtn)
const int MIXBUFFER_SAMPLES
virtual void VPCALL MixedSoundToSamples(short *samples, const float *mixBuffer, const int numSamples)
static const float HALF_PI
virtual void VPCALL MatX_LowerTriangularSolveTranspose(const idMatX &L, float *x, const float *b, const int n)
GLuint GLuint GLsizei count
int GetNumColumns(void) const
static const float TWO_PI
virtual void VPCALL CmpLT(byte *dst, const float *src0, const float constant, const int count)
virtual const char *VPCALL GetName(void) const