Simd_SSE2.cpp (Doom 3 GPL source release)
/*
===========================================================================

Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").

Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/

#include "../precompiled.h"
#pragma hdrstop

#include "Simd_Generic.h"
#include "Simd_MMX.h"
#include "Simd_SSE.h"
#include "Simd_SSE2.h"


//===============================================================
//
//	SSE2 implementation of idSIMDProcessor
//
//===============================================================
#if defined(MACOS_X) && defined(__i386__)

#include <xmmintrin.h>

#define SHUFFLEPS( x, y, z, w )		(( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
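// Worked example (comment added for clarity, not in the original release):
// R_SHUFFLEPS( 0, 0, 0, 0 ) == 0x00, which _mm_shuffle_ps uses to broadcast
// element 0 to all four lanes; R_SHUFFLEPS( 0, 1, 2, 3 ) == 0xE4, the identity.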

/*
============
idSIMD_SSE2::GetName
============
*/
const char * idSIMD_SSE2::GetName( void ) const {
	return "MMX & SSE & SSE2";
}

/*
============
idSIMD_SSE2::CmpLT

  dst[i] |= ( src0[i] < constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE2::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	int i, cnt, pre, post;
	float *aligned;
	__m128 xmm0, xmm1;
	__m128i xmm0i;
	int cnt_l;
	char *src0_p;
	char *constant_p;
	char *dst_p;
	int mask_l;
	int dst_l;

	/* if the float array is not aligned on a 4 byte boundary */
	if ( ((int) src0) & 3 ) {
		/* unaligned memory access */
		pre = 0;
		cnt = count >> 2;
		post = count - (cnt<<2);

		/*
			__asm	mov			edx, cnt
			__asm	test		edx, edx
			__asm	je			doneCmp
		*/
		cnt_l = cnt;
		if ( cnt_l != 0 ) {
			/*
				__asm	push		ebx
				__asm	neg			edx
				__asm	mov			esi, src0
				__asm	prefetchnta	[esi+64]
				__asm	movss		xmm1, constant
				__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
				__asm	mov			edi, dst
				__asm	mov			cl, bitNum
			*/
			cnt_l = -cnt_l;
			src0_p = (char *) src0;
			_mm_prefetch( src0_p+64, _MM_HINT_NTA );
			constant_p = (char *) &constant;
			xmm1 = _mm_load_ss( (float *)constant_p );
			xmm1 = _mm_shuffle_ps( xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) );
			dst_p = (char *)dst;
			/*
				__asm loopNA:
			*/
			do {
				/*
					__asm	movups		xmm0, [esi]
					__asm	prefetchnta	[esi+128]
					__asm	cmpltps		xmm0, xmm1
					__asm	movmskps	eax, xmm0
					__asm	mov			ah, al
					__asm	shr			ah, 1
					__asm	mov			bx, ax
					__asm	shl			ebx, 14
					__asm	mov			bx, ax
					__asm	and			ebx, 0x01010101
					__asm	shl			ebx, cl
					__asm	or			ebx, dword ptr [edi]
					__asm	mov			dword ptr [edi], ebx
					__asm	add			esi, 16
					__asm	add			edi, 4
					__asm	inc			edx
					__asm	jl			loopNA
					__asm	pop			ebx
				*/
				xmm0 = _mm_loadu_ps( (float *) src0_p );
				_mm_prefetch( src0_p+128, _MM_HINT_NTA );
				xmm0 = _mm_cmplt_ps( xmm0, xmm1 );
				// Simplify using SSE2: pack the four 32-bit compare masks down
				// to one byte each, replacing the movmskps bit-twiddling above
				xmm0i = _mm_castps_si128( xmm0 );
				xmm0i = _mm_packs_epi32( xmm0i, xmm0i );
				xmm0i = _mm_packs_epi16( xmm0i, xmm0i );
				mask_l = _mm_cvtsi128_si32( xmm0i );
				// End
				mask_l = mask_l & 0x01010101;
				mask_l = mask_l << bitNum;
				dst_l = *((int *) dst_p);
				mask_l = mask_l | dst_l;
				*((int *) dst_p) = mask_l;
				src0_p = src0_p + 16;
				dst_p = dst_p + 4;
				cnt_l = cnt_l + 1;
			} while ( cnt_l < 0 );
		}
	}
	else {
		/* aligned memory access */
		aligned = (float *) ((((int) src0) + 15) & ~15);
		if ( (int)aligned > ((int)src0) + count ) {
			pre = count;
			post = 0;
		}
		else {
			pre = aligned - src0;
			cnt = (count - pre) >> 2;
			post = count - pre - (cnt<<2);
			/*
				__asm	mov			edx, cnt
				__asm	test		edx, edx
				__asm	je			doneCmp
			*/
			cnt_l = cnt;
			if ( cnt_l != 0 ) {
				/*
					__asm	push		ebx
					__asm	neg			edx
					__asm	mov			esi, aligned
					__asm	prefetchnta	[esi+64]
					__asm	movss		xmm1, constant
					__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
					__asm	mov			edi, dst
					__asm	add			edi, pre
					__asm	mov			cl, bitNum
				*/
				cnt_l = -cnt_l;
				src0_p = (char *) aligned;	// was (char *) src0 in the release; the asm above starts at the aligned pointer
				_mm_prefetch( src0_p+64, _MM_HINT_NTA );
				constant_p = (char *) &constant;
				xmm1 = _mm_load_ss( (float *)constant_p );
				xmm1 = _mm_shuffle_ps( xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) );
				dst_p = (char *)dst;
				dst_p = dst_p + pre;
				/*
					__asm loopA:
				*/
				do {
					/*
						__asm	movaps		xmm0, [esi]
						__asm	prefetchnta	[esi+128]
						__asm	cmpltps		xmm0, xmm1
						__asm	movmskps	eax, xmm0
						__asm	mov			ah, al
						__asm	shr			ah, 1
						__asm	mov			bx, ax
						__asm	shl			ebx, 14
						__asm	mov			bx, ax
						__asm	and			ebx, 0x01010101
						__asm	shl			ebx, cl
						__asm	or			ebx, dword ptr [edi]
						__asm	mov			dword ptr [edi], ebx
						__asm	add			esi, 16
						__asm	add			edi, 4
						__asm	inc			edx
						__asm	jl			loopA
						__asm	pop			ebx
					*/
					xmm0 = _mm_load_ps( (float *) src0_p );
					_mm_prefetch( src0_p+128, _MM_HINT_NTA );
					xmm0 = _mm_cmplt_ps( xmm0, xmm1 );
					// Simplify using SSE2: same byte-pack trick as the unaligned loop
					xmm0i = _mm_castps_si128( xmm0 );
					xmm0i = _mm_packs_epi32( xmm0i, xmm0i );
					xmm0i = _mm_packs_epi16( xmm0i, xmm0i );
					mask_l = _mm_cvtsi128_si32( xmm0i );
					// End
					mask_l = mask_l & 0x01010101;
					mask_l = mask_l << bitNum;
					dst_l = *((int *) dst_p);
					mask_l = mask_l | dst_l;
					*((int *) dst_p) = mask_l;
					src0_p = src0_p + 16;
					dst_p = dst_p + 4;
					cnt_l = cnt_l + 1;
				} while ( cnt_l < 0 );
			}
		}
	}
	/*
	doneCmp:
	*/
	float c = constant;
	for ( i = 0; i < pre; i++ ) {
		dst[i] |= ( src0[i] < c ) << bitNum;
	}
	for ( i = count - post; i < count; i++ ) {
		dst[i] |= ( src0[i] < c ) << bitNum;
	}
}
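
/*
	Reference scalar version of CmpLT, added here only as an illustrative
	sketch of what the vector code computes (not part of the original release):

	void CmpLT_Scalar( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
		for ( int i = 0; i < count; i++ ) {
			dst[i] |= ( src0[i] < constant ) << bitNum;
		}
	}

	The SIMD path compares four floats per iteration, packs the four compare
	masks to one byte each with _mm_packs_epi32/_mm_packs_epi16, masks them to
	0x01010101 and ORs the shifted result into four dst bytes at once.
*/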

#elif defined(_WIN32)

#include <xmmintrin.h>

#define SHUFFLEPS( x, y, z, w )		(( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
#define SHUFFLEPD( x, y )			(( (x) & 1 ) << 1 | ( (y) & 1 ))
#define R_SHUFFLEPD( x, y )			(( (y) & 1 ) << 1 | ( (x) & 1 ))
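// Worked example (comment added for clarity, not in the original release):
// R_SHUFFLEPD( 1, 0 ) == 0x01, which shufpd uses to swap the two doubles of a
// register; R_SHUFFLEPD( 0, 0 ) == 0x00 broadcasts the low double to both lanes.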


#define ALIGN4_INIT1( X, INIT )				ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT }
#define ALIGN4_INIT4( X, I0, I1, I2, I3 )	ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
#define ALIGN8_INIT1( X, INIT )				ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }

ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );

ALIGN4_INIT4( unsigned long SIMD_SP_singleSignBitMask, (unsigned long) ( 1 << 31 ), 0, 0, 0 );
ALIGN4_INIT1( unsigned long SIMD_SP_signBitMask, (unsigned long) ( 1 << 31 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_absMask, (unsigned long) ~( 1 << 31 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_infinityMask, (unsigned long) ~( 1 << 23 ) );

ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY );


/*
============
idSIMD_SSE2::GetName
============
*/
const char * idSIMD_SSE2::GetName( void ) const {
	return "MMX & SSE & SSE2";
}

#if 0		// the SSE2 code is ungodly slow

/*
============
idSIMD_SSE2::MatX_LowerTriangularSolve

  solves x in Lx = b for the n * n sub-matrix of L
  if skip > 0 the first skip elements of x are assumed to be valid already
  L has to be a lower triangular matrix with (implicit) ones on the diagonal
  x == b is allowed
============
*/
void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
	int nc;
	const float *lptr;

	if ( skip >= n ) {
		return;
	}

	lptr = L[skip];
	nc = L.GetNumColumns();

	// unrolled cases for n < 8
	if ( n < 8 ) {
		#define NSKIP( n, s )	((n<<3)|(s&7))
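		// NSKIP packs the pair (n, skip) into a single switch key, e.g.
		// NSKIP( 5, 2 ) == (5<<3)|2 == 42; the cases below deliberately fall
		// through, so each row's solve builds on the rows above it.
		// (comment added for clarity, not in the original release)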
		switch( NSKIP( n, skip ) ) {
			case NSKIP( 1, 0 ): x[0] = b[0];
				return;
			case NSKIP( 2, 0 ): x[0] = b[0];
			case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
				return;
			case NSKIP( 3, 0 ): x[0] = b[0];
			case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
				return;
			case NSKIP( 4, 0 ): x[0] = b[0];
			case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
				return;
			case NSKIP( 5, 0 ): x[0] = b[0];
			case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
				return;
			case NSKIP( 6, 0 ): x[0] = b[0];
			case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
			case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
				return;
			case NSKIP( 7, 0 ): x[0] = b[0];
			case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
			case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
			case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
				return;
		}
		return;
	}

	// process first 4 rows
	switch( skip ) {
		case 0: x[0] = b[0];
		case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
		case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
		case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
				skip = 4;
	}

	lptr = L[skip];

	__asm {
		push		ebx
		mov			eax, skip				// eax = i
		shl			eax, 2					// eax = i*4
		mov			edx, n					// edx = n
		shl			edx, 2					// edx = n*4
		mov			esi, x					// esi = x
		mov			edi, lptr				// edi = lptr
		add			esi, eax
		add			edi, eax
		mov			ebx, b					// ebx = b
		// aligned
	looprow:
		mov			ecx, eax
		neg			ecx
		cvtps2pd	xmm0, [esi+ecx]
		cvtps2pd	xmm2, [edi+ecx]
		mulpd		xmm0, xmm2
		cvtps2pd	xmm1, [esi+ecx+8]
		cvtps2pd	xmm3, [edi+ecx+8]
		mulpd		xmm1, xmm3
		add			ecx, 20*4
		jg			donedot16
	dot16:
		cvtps2pd	xmm2, [esi+ecx-(16*4)]
		cvtps2pd	xmm3, [edi+ecx-(16*4)]
		cvtps2pd	xmm4, [esi+ecx-(14*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(14*4)]
		addpd		xmm0, xmm2
		cvtps2pd	xmm2, [esi+ecx-(12*4)]
		mulpd		xmm4, xmm5
		cvtps2pd	xmm3, [edi+ecx-(12*4)]
		addpd		xmm1, xmm4
		cvtps2pd	xmm4, [esi+ecx-(10*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(10*4)]
		addpd		xmm0, xmm2
		cvtps2pd	xmm2, [esi+ecx-(8*4)]
		mulpd		xmm4, xmm5
		cvtps2pd	xmm3, [edi+ecx-(8*4)]
		addpd		xmm1, xmm4
		cvtps2pd	xmm4, [esi+ecx-(6*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(6*4)]
		addpd		xmm0, xmm2
		cvtps2pd	xmm2, [esi+ecx-(4*4)]
		mulpd		xmm4, xmm5
		cvtps2pd	xmm3, [edi+ecx-(4*4)]
		addpd		xmm1, xmm4
		cvtps2pd	xmm4, [esi+ecx-(2*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(2*4)]
		addpd		xmm0, xmm2
		add			ecx, 16*4
		mulpd		xmm4, xmm5
		addpd		xmm1, xmm4
		jle			dot16
	donedot16:
		sub			ecx, 8*4
		jg			donedot8
	dot8:
		cvtps2pd	xmm2, [esi+ecx-(8*4)]
		cvtps2pd	xmm3, [edi+ecx-(8*4)]
		cvtps2pd	xmm7, [esi+ecx-(6*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(6*4)]
		addpd		xmm0, xmm2
		cvtps2pd	xmm6, [esi+ecx-(4*4)]
		mulpd		xmm7, xmm5
		cvtps2pd	xmm3, [edi+ecx-(4*4)]
		addpd		xmm1, xmm7
		cvtps2pd	xmm4, [esi+ecx-(2*4)]
		mulpd		xmm6, xmm3
		cvtps2pd	xmm7, [edi+ecx-(2*4)]
		addpd		xmm0, xmm6
		add			ecx, 8*4
		mulpd		xmm4, xmm7
		addpd		xmm1, xmm4
	donedot8:
		sub			ecx, 4*4
		jg			donedot4
	dot4:
		cvtps2pd	xmm2, [esi+ecx-(4*4)]
		cvtps2pd	xmm3, [edi+ecx-(4*4)]
		cvtps2pd	xmm4, [esi+ecx-(2*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(2*4)]
		addpd		xmm0, xmm2
		add			ecx, 4*4
		mulpd		xmm4, xmm5
		addpd		xmm1, xmm4
	donedot4:
		addpd		xmm0, xmm1
		movaps		xmm1, xmm0
		shufpd		xmm1, xmm1, R_SHUFFLEPD( 1, 0 )
		addsd		xmm0, xmm1
		sub			ecx, 4*4
		jz			dot0
		add			ecx, 4
		jz			dot1
		add			ecx, 4
		jz			dot2
	//dot3:
		cvtss2sd	xmm1, [esi-(3*4)]
		cvtss2sd	xmm2, [edi-(3*4)]
		mulsd		xmm1, xmm2
		addsd		xmm0, xmm1
	dot2:
		cvtss2sd	xmm3, [esi-(2*4)]
		cvtss2sd	xmm4, [edi-(2*4)]
		mulsd		xmm3, xmm4
		addsd		xmm0, xmm3
	dot1:
		cvtss2sd	xmm5, [esi-(1*4)]
		cvtss2sd	xmm6, [edi-(1*4)]
		mulsd		xmm5, xmm6
		addsd		xmm0, xmm5
	dot0:
		cvtss2sd	xmm1, [ebx+eax]
		subsd		xmm1, xmm0
		cvtsd2ss	xmm0, xmm1
		movss		[esi], xmm0
		add			eax, 4
		cmp			eax, edx
		jge			done
		add			esi, 4
		mov			ecx, nc
		shl			ecx, 2
		add			edi, ecx
		add			edi, 4
		jmp			looprow
		// done
	done:
		pop			ebx
	}
}
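
/*
	For reference, the scalar forward substitution the block above unrolls;
	an illustrative sketch, not part of the original release:

	void MatX_LowerTriangularSolve_Ref( const idMatX &L, float *x, const float *b, const int n, int skip ) {
		const int nc = L.GetNumColumns();
		for ( int i = skip; i < n; i++ ) {
			const float *row = L.ToFloatPtr() + i * nc;
			double s = b[i];					// accumulate the dot product in double precision, like the asm
			for ( int j = 0; j < i; j++ ) {
				s -= row[j] * x[j];
			}
			x[i] = (float) s;					// the diagonal is an implicit 1, so no divide
		}
	}
*/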

/*
============
idSIMD_SSE2::MatX_LowerTriangularSolveTranspose

  solves x in L'x = b for the n * n sub-matrix of L
  L has to be a lower triangular matrix with (implicit) ones on the diagonal
  x == b is allowed
============
*/
void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
	int nc;
	const float *lptr;

	lptr = L.ToFloatPtr();
	nc = L.GetNumColumns();

	// unrolled cases for n < 8
	if ( n < 8 ) {
		switch( n ) {
			case 0:
				return;
			case 1:
				x[0] = b[0];
				return;
			case 2:
				x[1] = b[1];
				x[0] = b[0] - lptr[1*nc+0] * x[1];
				return;
			case 3:
				x[2] = b[2];
				x[1] = b[1] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 4:
				x[3] = b[3];
				x[2] = b[2] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 5:
				x[4] = b[4];
				x[3] = b[3] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 6:
				x[5] = b[5];
				x[4] = b[4] - lptr[5*nc+4] * x[5];
				x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 7:
				x[6] = b[6];
				x[5] = b[5] - lptr[6*nc+5] * x[6];
				x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
				x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
		}
		return;
	}

	int i, j, m;
	float *xptr;
	double s0;

	// if the number of columns is not a multiple of 2 we're screwed for alignment.
	// however, if the number of columns is a multiple of 2 but the number of rows
	// to be processed is not a multiple of 2, we can still run 8 byte aligned
	m = n;
	if ( m & 1 ) {
		m--;
		x[m] = b[m];

		lptr = L[m] + m - 4;
		xptr = x + m;
		__asm {
			push		ebx
			mov			eax, m					// eax = i
			mov			esi, xptr				// esi = xptr
			mov			edi, lptr				// edi = lptr
			mov			ebx, b					// ebx = b
			mov			edx, nc					// edx = nc*sizeof(float)
			shl			edx, 2
		process4rows_1:
			cvtps2pd	xmm0, [ebx+eax*4-16]	// load b[i-4], b[i-3] (comments were swapped in the release)
			cvtps2pd	xmm2, [ebx+eax*4-8]		// load b[i-2], b[i-1]
			xor			ecx, ecx
			sub			eax, m
			neg			eax
			jz			done4x4_1
		process4x4_1:	// process 4x4 blocks
			cvtps2pd	xmm3, [edi]
			cvtps2pd	xmm4, [edi+8]
			add			edi, edx
			cvtss2sd	xmm5, [esi+4*ecx+0]
			shufpd		xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm3, xmm5
			cvtps2pd	xmm1, [edi]
			mulpd		xmm4, xmm5
			cvtps2pd	xmm6, [edi+8]
			subpd		xmm0, xmm3
			subpd		xmm2, xmm4
			add			edi, edx
			cvtss2sd	xmm7, [esi+4*ecx+4]
			shufpd		xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm1, xmm7
			cvtps2pd	xmm3, [edi]
			mulpd		xmm6, xmm7
			cvtps2pd	xmm4, [edi+8]
			subpd		xmm0, xmm1
			subpd		xmm2, xmm6
			add			edi, edx
			cvtss2sd	xmm5, [esi+4*ecx+8]
			shufpd		xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm3, xmm5
			cvtps2pd	xmm1, [edi]
			mulpd		xmm4, xmm5
			cvtps2pd	xmm6, [edi+8]
			subpd		xmm0, xmm3
			subpd		xmm2, xmm4
			add			edi, edx
			cvtss2sd	xmm7, [esi+4*ecx+12]
			shufpd		xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm1, xmm7
			add			ecx, 4
			mulpd		xmm6, xmm7
			cmp			ecx, eax
			subpd		xmm0, xmm1
			subpd		xmm2, xmm6
			jl			process4x4_1
		done4x4_1:		// process left over of the 4 rows
			cvtps2pd	xmm3, [edi]
			cvtps2pd	xmm4, [edi+8]
			cvtss2sd	xmm5, [esi+4*ecx]
			shufpd		xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm3, xmm5
			mulpd		xmm4, xmm5
			subpd		xmm0, xmm3
			subpd		xmm2, xmm4
			imul		ecx, edx
			sub			edi, ecx
			neg			eax

			add			eax, m
			sub			eax, 4
			movapd		xmm1, xmm0
			shufpd		xmm1, xmm1, R_SHUFFLEPD( 1, 1 )
			movapd		xmm3, xmm2
			shufpd		xmm3, xmm3, R_SHUFFLEPD( 1, 1 )
			sub			edi, edx
			cvtsd2ss	xmm7, xmm3
			movss		[esi-4], xmm7			// xptr[-1] = s3
			movsd		xmm4, xmm3
			movsd		xmm5, xmm3
			cvtss2sd	xmm7, [edi+8]
			mulsd		xmm3, xmm7				// lptr[-1*nc+2] * s3
			cvtss2sd	xmm7, [edi+4]
			mulsd		xmm4, xmm7				// lptr[-1*nc+1] * s3
			cvtss2sd	xmm7, [edi]
			mulsd		xmm5, xmm7				// lptr[-1*nc+0] * s3
			subsd		xmm2, xmm3
			cvtsd2ss	xmm7, xmm2
			movss		[esi-8], xmm7			// xptr[-2] = s2
			movsd		xmm6, xmm2
			sub			edi, edx
			subsd		xmm0, xmm5
			subsd		xmm1, xmm4
			cvtss2sd	xmm7, [edi+4]
			mulsd		xmm2, xmm7				// lptr[-2*nc+1] * s2
			cvtss2sd	xmm7, [edi]
			mulsd		xmm6, xmm7				// lptr[-2*nc+0] * s2
			subsd		xmm1, xmm2
			cvtsd2ss	xmm7, xmm1
			movss		[esi-12], xmm7			// xptr[-3] = s1
			subsd		xmm0, xmm6
			sub			edi, edx
			cmp			eax, 4
			cvtss2sd	xmm7, [edi]
			mulsd		xmm1, xmm7				// lptr[-3*nc+0] * s1
			subsd		xmm0, xmm1
			cvtsd2ss	xmm7, xmm0
			movss		[esi-16], xmm7			// xptr[-4] = s0
			jl			done4rows_1
			sub			edi, edx
			sub			edi, 16
			sub			esi, 16
			jmp			process4rows_1
		done4rows_1:
			pop			ebx
		}
	}
	else {
		lptr = L.ToFloatPtr() + m * L.GetNumColumns() + m - 4;
		xptr = x + m;
		__asm {
			push		ebx
			mov			eax, m					// eax = i
			mov			esi, xptr				// esi = xptr
			mov			edi, lptr				// edi = lptr
			mov			ebx, b					// ebx = b
			mov			edx, nc					// edx = nc*sizeof(float)
			shl			edx, 2
		process4rows:
			cvtps2pd	xmm0, [ebx+eax*4-16]	// load b[i-4], b[i-3] (comments were swapped in the release)
			cvtps2pd	xmm2, [ebx+eax*4-8]		// load b[i-2], b[i-1]
			sub			eax, m
			jz			done4x4
			neg			eax
			xor			ecx, ecx
		process4x4:		// process 4x4 blocks
			cvtps2pd	xmm3, [edi]
			cvtps2pd	xmm4, [edi+8]
			add			edi, edx
			cvtss2sd	xmm5, [esi+4*ecx+0]
			shufpd		xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm3, xmm5
			cvtps2pd	xmm1, [edi]
			mulpd		xmm4, xmm5
			cvtps2pd	xmm6, [edi+8]
			subpd		xmm0, xmm3
			subpd		xmm2, xmm4
			add			edi, edx
			cvtss2sd	xmm7, [esi+4*ecx+4]
			shufpd		xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm1, xmm7
			cvtps2pd	xmm3, [edi]
			mulpd		xmm6, xmm7
			cvtps2pd	xmm4, [edi+8]
			subpd		xmm0, xmm1
			subpd		xmm2, xmm6
			add			edi, edx
			cvtss2sd	xmm5, [esi+4*ecx+8]
			shufpd		xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm3, xmm5
			cvtps2pd	xmm1, [edi]
			mulpd		xmm4, xmm5
			cvtps2pd	xmm6, [edi+8]
			subpd		xmm0, xmm3
			subpd		xmm2, xmm4
			add			edi, edx
			cvtss2sd	xmm7, [esi+4*ecx+12]
			shufpd		xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm1, xmm7
			add			ecx, 4
			mulpd		xmm6, xmm7
			cmp			ecx, eax
			subpd		xmm0, xmm1
			subpd		xmm2, xmm6
			jl			process4x4
			imul		ecx, edx
			sub			edi, ecx
			neg			eax
		done4x4:		// process left over of the 4 rows
			add			eax, m
			sub			eax, 4
			movapd		xmm1, xmm0
			shufpd		xmm1, xmm1, R_SHUFFLEPD( 1, 1 )
			movapd		xmm3, xmm2
			shufpd		xmm3, xmm3, R_SHUFFLEPD( 1, 1 )
			sub			edi, edx
			cvtsd2ss	xmm7, xmm3
			movss		[esi-4], xmm7			// xptr[-1] = s3
			movsd		xmm4, xmm3
			movsd		xmm5, xmm3
			cvtss2sd	xmm7, [edi+8]
			mulsd		xmm3, xmm7				// lptr[-1*nc+2] * s3
			cvtss2sd	xmm7, [edi+4]
			mulsd		xmm4, xmm7				// lptr[-1*nc+1] * s3
			cvtss2sd	xmm7, [edi]
			mulsd		xmm5, xmm7				// lptr[-1*nc+0] * s3
			subsd		xmm2, xmm3
			cvtsd2ss	xmm7, xmm2
			movss		[esi-8], xmm7			// xptr[-2] = s2
			movsd		xmm6, xmm2
			sub			edi, edx
			subsd		xmm0, xmm5
			subsd		xmm1, xmm4
			cvtss2sd	xmm7, [edi+4]
			mulsd		xmm2, xmm7				// lptr[-2*nc+1] * s2
			cvtss2sd	xmm7, [edi]
			mulsd		xmm6, xmm7				// lptr[-2*nc+0] * s2
			subsd		xmm1, xmm2
			cvtsd2ss	xmm7, xmm1
			movss		[esi-12], xmm7			// xptr[-3] = s1
			subsd		xmm0, xmm6
			sub			edi, edx
			cmp			eax, 4
			cvtss2sd	xmm7, [edi]
			mulsd		xmm1, xmm7				// lptr[-3*nc+0] * s1
			subsd		xmm0, xmm1
			cvtsd2ss	xmm7, xmm0
			movss		[esi-16], xmm7			// xptr[-4] = s0
			jl			done4rows
			sub			edi, edx
			sub			edi, 16
			sub			esi, 16
			jmp			process4rows
		done4rows:
			pop			ebx
		}
	}

	// process left over rows
	for ( i = (m&3)-1; i >= 0; i-- ) {
		s0 = b[i];
		lptr = L[i+1] + i;
		for ( j = i + 1; j < m; j++ ) {
			s0 -= lptr[0] * x[j];
			lptr += nc;
		}
		x[i] = s0;
	}
}
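
/*
	For reference, the scalar back substitution the block above implements;
	an illustrative sketch, not part of the original release:

	void MatX_LowerTriangularSolveTranspose_Ref( const idMatX &L, float *x, const float *b, const int n ) {
		const int nc = L.GetNumColumns();
		for ( int i = n - 1; i >= 0; i-- ) {
			double s = b[i];
			for ( int j = i + 1; j < n; j++ ) {
				// column i of L read top-down is row i of the transpose L'
				s -= L.ToFloatPtr()[j*nc+i] * x[j];
			}
			x[i] = (float) s;
		}
	}
*/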

#endif

/*
============
idSIMD_SSE2::MixedSoundToSamples
============
*/
void VPCALL idSIMD_SSE2::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {

	assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );

	__asm {

		mov			eax, numSamples
		mov			edi, mixBuffer
		mov			esi, samples
		shl			eax, 2
		add			edi, eax
		neg			eax

	loop16:

		movaps		xmm0, [edi+eax+0*16]
		movaps		xmm1, [edi+eax+1*16]
		movaps		xmm2, [edi+eax+2*16]
		movaps		xmm3, [edi+eax+3*16]

		add			esi, 4*4*2

		cvtps2dq	xmm4, xmm0
		cvtps2dq	xmm5, xmm1
		cvtps2dq	xmm6, xmm2
		cvtps2dq	xmm7, xmm3

		prefetchnta	[edi+eax+128]

		packssdw	xmm4, xmm5
		packssdw	xmm6, xmm7

		add			eax, 4*16

		movlps		[esi-4*4*2], xmm4		// FIXME: should not use movlps/movhps to move integer data
		movhps		[esi-3*4*2], xmm4
		movlps		[esi-2*4*2], xmm6
		movhps		[esi-1*4*2], xmm6

		jl			loop16
	}
}
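
/*
	For reference, a scalar version of the conversion above; an illustrative
	sketch, not part of the original release.  cvtps2dq rounds each float to a
	32-bit int and packssdw saturates the result to the signed 16-bit range:

	void MixedSoundToSamples_Ref( short *samples, const float *mixBuffer, const int numSamples ) {
		for ( int i = 0; i < numSamples; i++ ) {
			int v = (int) mixBuffer[i];			// note: cvtps2dq uses round-to-nearest, a plain cast truncates
			if ( v < -32768 ) v = -32768;		// saturate like packssdw
			if ( v >  32767 ) v =  32767;
			samples[i] = (short) v;
		}
	}
*/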

#endif /* _WIN32 */