doom3/doxygen/_simd___s_s_e3_8cpp_source.html

 /*

 ===========================================================================


 Doom 3 GPL Source Code

 Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.


 This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").


 Doom 3 Source Code is free software: you can redistribute it and/or modify

 it under the terms of the GNU General Public License as published by

 the Free Software Foundation, either version 3 of the License, or

 (at your option) any later version.


 Doom 3 Source Code is distributed in the hope that it will be useful,

 but WITHOUT ANY WARRANTY; without even the implied warranty of

 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

 GNU General Public License for more details.


 You should have received a copy of the GNU General Public License

 along with Doom 3 Source Code.  If not, see <http://www.gnu.org/licenses/>.


 In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code.  If not, please request a copy in writing from id Software at the address below.


 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.


 ===========================================================================

 */


 #include "../precompiled.h"

 #pragma hdrstop


 #include "Simd_Generic.h"

 #include "Simd_MMX.h"

 #include "Simd_SSE.h"

 #include "Simd_SSE2.h"

 #include "Simd_SSE3.h"


 //===============================================================

 //

 //      SSE3 implementation of idSIMDProcessor

 //

 //===============================================================


 #if defined(MACOS_X) && defined(__i386__)


 /*
 ============
 idSIMD_SSE3::GetName

   Returns the fixed label string for this SIMD processor implementation
   (MACOS_X / __i386__ build).
 ============
 */
 const char * idSIMD_SSE3::GetName( void ) const {
         const char * const processorName = "MMX & SSE & SSE2 & SSE3";
         return processorName;
 }


 #elif defined(_WIN32)


 #include <xmmintrin.h>


 // Pack lane selectors into one byte-sized value (presumably the immediate
 // operand of shufps / shufpd - the macros are not used in the code visible here).
 // Every selector is masked to its field width first, so out-of-range values wrap.
 //   SHUFFLEPS   : x -> bits 7..6, y -> bits 5..4, z -> bits 3..2, w -> bits 1..0
 //   R_SHUFFLEPS : same fields, argument order reversed ( x lands in bits 1..0 )
 //   SHUFFLEPD   : x -> bit 1, y -> bit 0
 //   R_SHUFFLEPD : same fields, argument order reversed ( x lands in bit 0 )
 #define SHUFFLEPS( x, y, z, w )         ( ( ( (x) & 3 ) << 6 ) | ( ( (y) & 3 ) << 4 ) | ( ( (z) & 3 ) << 2 ) | ( (w) & 3 ) )
 #define R_SHUFFLEPS( x, y, z, w )       ( ( ( (w) & 3 ) << 6 ) | ( ( (z) & 3 ) << 4 ) | ( ( (y) & 3 ) << 2 ) | ( (x) & 3 ) )
 #define SHUFFLEPD( x, y )                       ( ( ( (x) & 1 ) << 1 ) | ( (y) & 1 ) )
 #define R_SHUFFLEPD( x, y )                     ( ( ( (y) & 1 ) << 1 ) | ( (x) & 1 ) )


 /*


         The first argument of an instruction macro is the destination

         and the second argument is the source operand. The destination

         operand can be _xmm0 to _xmm7 only. The source operand can be

         any one of the registers _xmm0 to _xmm7 or _eax, _ecx, _edx, _esp,

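         _ebp, _ebx, _esi, or _edi that contains the effective address.

         NOTE(review): with these macros a bare _esp or _ebp source does not
         encode [esp] / [ebp]. In the x86 operand byte those two codes select
         the SIB-byte form and the 32-bit-displacement form respectively, so
         verify the emitted bytes before using either one as a plain address.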


         For instance:  haddps   xmm0, xmm1

         becomes:       haddps( _xmm0, _xmm1 )

         and:           haddps   xmm0, [esi]

         becomes:       haddps( _xmm0, _esi )


         The ADDRESS_ADDC macro can be used when the effective source address

         is formed by adding a constant to a general purpose register.

         For instance:  haddps   xmm0, [esi+48]

         becomes:       haddps( _xmm0, ADDRESS_ADDC( _esi, 48 ) )


         The ADDRESS_ADDR macro can be used when the effective source address

         is formed by adding two general purpose registers.

         For instance:  haddps   xmm0, [esi+eax]

         becomes:       haddps( _xmm0, ADDRESS_ADDR( _esi, _eax ) )


         The ADDRESS_ADDRC macro can be used when the effective source address

         is formed by adding two general purpose registers and a constant.

         The constant must be in the range [-128, 127].

         For instance:  haddps   xmm0, [esi+eax+48]

         becomes:       haddps( _xmm0, ADDRESS_ADDRC( _esi, _eax, 48 ) )


         The ADDRESS_SCALEADDR macro can be used when the effective source address is formed

         by adding a scaled general purpose register to another general purpose register.

         The scale must be either 1, 2, 4 or 8.

         For instance:  haddps   xmm0, [esi+eax*4]

         becomes:       haddps( _xmm0, ADDRESS_SCALEADDR( _esi, _eax, 4 ) )


         The ADDRESS_SCALEADDRC macro can be used when the effective source address is formed

         by adding a scaled general purpose register to another general purpose register and

         also adding a constant. The scale must be either 1, 2, 4 or 8. The constant must

         be in the range [-128, 127].

         For instance:  haddps   xmm0, [esi+eax*4+64]

         becomes:       haddps( _xmm0, ADDRESS_SCALEADDRC( _esi, _eax, 4, 64 ) )


 */


 // Operand codes passed to the instruction-emitting macros and the ADDRESS_*
 // macros in this file.
 //
 // General purpose registers: the value is the 3-bit register number; the
 // ADDRESS_* macros extract it with "& 7". Passed directly as a source operand
 // the value is OR-ed unchanged into the emitted operand byte.
 #define _eax    0x00

 #define _ecx    0x01

 #define _edx    0x02

 #define _ebx    0x03

 #define _esp    0x04

 #define _ebp    0x05

 #define _esi    0x06

 #define _edi    0x07


 // XMM registers: 0xC0 | register number. As a destination only the low three
 // bits are used ( dst & 7 ); as a source the whole value is OR-ed into the
 // emitted operand byte, so the two top bits stay set and mark the source as a
 // register rather than an address held in a general purpose register.
 #define _xmm0   0xC0

 #define _xmm1   0xC1

 #define _xmm2   0xC2

 #define _xmm3   0xC3

 #define _xmm4   0xC4

 #define _xmm5   0xC5

 #define _xmm6   0xC6

 #define _xmm7   0xC7


 // Converts an address scale factor of 1, 2, 4 or 8 into the two top bits of the
 // byte emitted by ADDRESS_SCALEADDR / ADDRESS_SCALEADDRC:
 //   1 -> 0x00, 2 -> 0x40, 4 -> 0x80, 8 -> 0xC0
 // The argument and the whole expansion are parenthesized so that expression
 // arguments ( e.g. RSCALE( 1 + 1 ) ) and uses inside a larger expression
 // ( e.g. RSCALE( 2 ) & 0x80 ) evaluate as expected.
 #define RSCALE( s )             ( ( ( (s) & 2 ) << 5 ) | ( ( (s) & 4 ) << 5 ) | ( ( (s) & 8 ) << 3 ) | ( ( (s) & 8 ) << 4 ) )


 // Address-operand builders. Each one expands to the low bits of the operand
 // byte followed by one or more additional "_asm _emit" statements, so it can
 // only be used as the src ( last ) argument of the instruction macros, which
 // place src at the very end of their final _emit line.

 // [reg0 + constant]: contributes 0x40 | reg0 to the operand byte, then emits
 // the constant as one extra byte.
 // NOTE(review): only one byte is emitted, so the constant presumably has to
 // fit in [-128, 127] as stated for ADDRESS_ADDRC in the header comment.
 #define ADDRESS_ADDC( reg0, constant )                                          0x40 | ( reg0 & 7 )     \

         _asm _emit constant


 // [reg0 + reg1]: contributes 0x04 to the operand byte, then emits a second
 // byte holding reg1 in bits 5..3 and reg0 in bits 2..0.
 #define ADDRESS_ADDR( reg0, reg1 )                                                      0x04                            \

         _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 )


 // [reg0 + reg1 + constant]: contributes 0x44 to the operand byte, then emits
 // the reg1/reg0 byte followed by the constant byte.
 #define ADDRESS_ADDRC( reg0, reg1, constant )                           0x44                            \

         _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 )                                                         \

         _asm _emit constant


 // [reg0 + reg1*scale]: like ADDRESS_ADDR, with RSCALE( scale ) OR-ed into the
 // second byte.
 #define ADDRESS_SCALEADDR( reg0, reg1, scale )                          0x04                            \

         _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale )


 // [reg0 + reg1*scale + constant]: like ADDRESS_ADDRC, with RSCALE( scale )
 // OR-ed into the second byte.
 #define ADDRESS_SCALEADDRC( reg0, reg1, scale, constant )       0x44                            \

         _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale )                       \

         _asm _emit constant


 // Packed Single-FP Add/Subtract ( dst[0]=dst[0]-src[0], dst[1]=dst[1]+src[1], dst[2]=dst[2]-src[2], dst[3]=dst[3]+src[3] )
 // Per the Intel instruction reference ADDSUBPS subtracts in the even lanes and
 // adds in the odd lanes.
 // Emits the bytes F2 0F D0 followed by the operand byte ( ( dst & 7 ) << 3 ) | src.
 // dst is one of _xmm0.._xmm7; src is a register code or an ADDRESS_* expansion.
 #define addsubps( dst, src )                                            \

         _asm _emit 0xF2                                                                 \

         _asm _emit 0x0F                                                                 \

         _asm _emit 0xD0                                                                 \

         _asm _emit ( ( dst & 7 ) << 3 ) | src


 // Packed Double-FP Add/Subtract ( dst[0]=dst[0]-src[0], dst[1]=dst[1]+src[1] )
 // Per the Intel instruction reference ADDSUBPD subtracts in the low lane and
 // adds in the high lane.
 // Emits the bytes 66 0F D0 followed by the operand byte ( ( dst & 7 ) << 3 ) | src.
 // dst is one of _xmm0.._xmm7; src is a register code or an ADDRESS_* expansion.
 #define addsubpd( dst, src )                                            \

         _asm _emit 0x66                                                                 \

         _asm _emit 0x0F                                                                 \

         _asm _emit 0xD0                                                                 \

         _asm _emit ( ( dst & 7 ) << 3 ) | src


 // Packed Single-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=dst[2]+dst[3], dst[2]=src[0]+src[1], dst[3]=src[2]+src[3] )
 // Emits the bytes F2 0F 7C followed by the operand byte ( ( dst & 7 ) << 3 ) | src.
 // dst is one of _xmm0.._xmm7; src is a register code or an ADDRESS_* expansion.
 #define haddps( dst, src )                                                      \

         _asm _emit 0xF2                                                                 \

         _asm _emit 0x0F                                                                 \

         _asm _emit 0x7C                                                                 \

         _asm _emit ( ( dst & 7 ) << 3 ) | src


 // Packed Double-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=src[0]+src[1] )
 // Emits the bytes 66 0F 7C followed by the operand byte ( ( dst & 7 ) << 3 ) | src.
 // dst is one of _xmm0.._xmm7; src is a register code or an ADDRESS_* expansion.
 #define haddpd( dst, src )                                                      \

         _asm _emit 0x66                                                                 \

         _asm _emit 0x0F                                                                 \

         _asm _emit 0x7C                                                                 \

         _asm _emit ( ( dst & 7 ) << 3 ) | src


 // Packed Single-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=dst[2]-dst[3], dst[2]=src[0]-src[1], dst[3]=src[2]-src[3] )
 // Emits the bytes F2 0F 7D followed by the operand byte ( ( dst & 7 ) << 3 ) | src.
 // dst is one of _xmm0.._xmm7; src is a register code or an ADDRESS_* expansion.
 #define hsubps( dst, src )                                                      \

         _asm _emit 0xF2                                                                 \

         _asm _emit 0x0F                                                                 \

         _asm _emit 0x7D                                                                 \

         _asm _emit ( ( dst & 7 ) << 3 ) | src


 // Packed Double-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=src[0]-src[1] )
 // Emits the bytes 66 0F 7D followed by the operand byte ( ( dst & 7 ) << 3 ) | src.
 // dst is one of _xmm0.._xmm7; src is a register code or an ADDRESS_* expansion.
 #define hsubpd( dst, src )                                                      \

         _asm _emit 0x66                                                                 \

         _asm _emit 0x0F                                                                 \

         _asm _emit 0x7D                                                                 \

         _asm _emit ( ( dst & 7 ) << 3 ) | src


 // Move Packed Single-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0], dst[2]=src[2], dst[3]=src[2] )
 // Emits the bytes F3 0F 12 followed by the operand byte ( ( dst & 7 ) << 3 ) | src.
 // dst is one of _xmm0.._xmm7; src is a register code or an ADDRESS_* expansion.
 #define movsldup( dst, src )                                            \

         _asm _emit 0xF3                                                                 \

         _asm _emit 0x0F                                                                 \

         _asm _emit 0x12                                                                 \

         _asm _emit ( ( dst & 7 ) << 3 ) | src


 // Move One Double-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0] )
 // Emits the bytes F2 0F 12 followed by the operand byte ( ( dst & 7 ) << 3 ) | src.
 // The Intel instruction reference lists this encoding under the mnemonic MOVDDUP.
 // dst is one of _xmm0.._xmm7; src is a register code or an ADDRESS_* expansion.
 #define movdldup( dst, src )                                            \

         _asm _emit 0xF2                                                                 \

         _asm _emit 0x0F                                                                 \

         _asm _emit 0x12                                                                 \

         _asm _emit ( ( dst & 7 ) << 3 ) | src


 // Move Packed Single-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1], dst[2]=src[3], dst[3]=src[3] )
 // Emits the bytes F3 0F 16 followed by the operand byte ( ( dst & 7 ) << 3 ) | src.
 // dst is one of _xmm0.._xmm7; src is a register code or an ADDRESS_* expansion.
 #define movshdup( dst, src )                                            \

         _asm _emit 0xF3                                                                 \

         _asm _emit 0x0F                                                                 \

         _asm _emit 0x16                                                                 \

         _asm _emit ( ( dst & 7 ) << 3 ) | src


 // Move One Double-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1] )
 // Emits the bytes F2 0F 16 followed by the operand byte ( ( dst & 7 ) << 3 ) | src.
 // NOTE(review): the Intel instruction reference lists only MOVDDUP ( F2 0F 12 ),
 // MOVSLDUP and MOVSHDUP as SSE3 duplicate instructions; F2 0F 16 does not appear
 // to be a documented encoding. Verify on hardware before using this macro - it
 // is not used in the code visible here.
 #define movdhdup( dst, src )                                            \

         _asm _emit 0xF2                                                                 \

         _asm _emit 0x0F                                                                 \

         _asm _emit 0x16                                                                 \

         _asm _emit ( ( dst & 7 ) << 3 ) | src


 // Load Unaligned Integer 128 bits
 // Emits the bytes F2 0F F0 followed by the operand byte ( ( dst & 7 ) << 3 ) | src.
 // NOTE(review): the Intel instruction reference defines LDDQU with a memory
 // source only, so src should be an address operand ( general purpose register
 // code or ADDRESS_* expansion ), not an _xmmN code - confirm before use.
 #define lddqu( dst, src )                                                       \

         _asm _emit 0xF2                                                                 \

         _asm _emit 0x0F                                                                 \

         _asm _emit 0xF0                                                                 \

         _asm _emit ( ( dst & 7 ) << 3 ) | src


 // Byte sizes and member offsets hard-coded for use inside the inline assembly.
 // idSIMD_SSE3::TransformVerts asserts that DRAWVERT_SIZE, DRAWVERT_XYZ_OFFSET,
 // JOINTWEIGHT_SIZE and JOINTMAT_SIZE match sizeof( idDrawVert ), the offset of
 // idDrawVert::xyz, sizeof( idVec4 ) and sizeof( idJointMat ). The remaining
 // values are neither checked nor used in the code visible here.
 // The offsets are written as ( element index * 4 ) - presumably 4-byte floats;
 // verify against the idDrawVert declaration.
 #define DRAWVERT_SIZE                           60

 #define DRAWVERT_XYZ_OFFSET                     (0*4)

 #define DRAWVERT_ST_OFFSET                      (3*4)

 #define DRAWVERT_NORMAL_OFFSET          (5*4)

 #define DRAWVERT_TANGENT0_OFFSET        (8*4)

 #define DRAWVERT_TANGENT1_OFFSET        (11*4)

 #define DRAWVERT_COLOR_OFFSET           (14*4)


 // Sizes of the joint related structures, in bytes.
 #define JOINTQUAT_SIZE                          (7*4)

 #define JOINTMAT_SIZE                           (4*3*4)

 #define JOINTWEIGHT_SIZE                        (4*4)


 /*
 ============
 SSE3_Dot

   Returns the sum of the four component-wise products of v1 and v2, using
   one packed multiply followed by two horizontal adds.

   NOTE(review): movaps and mulps with a memory operand require 16-byte
   aligned addresses - confirm callers only pass suitably aligned idVec4s.
 ============
 */
 float SSE3_Dot( const idVec4 &v1, const idVec4 &v2 ) {
         float dot;

         __asm {
                 // eax / ecx = addresses of the two vectors
                 mov             eax, v1
                 mov             ecx, v2
                 // xmm0 = four component-wise products
                 movaps  xmm0, [eax]
                 mulps   xmm0, [ecx]
                 // first pass: lane 0 = p0+p1, lane 1 = p2+p3
                 haddps( _xmm0, _xmm0 )
                 // second pass: lane 0 = p0+p1+p2+p3
                 haddps( _xmm0, _xmm0 )
                 movss   dot, xmm0
         }

         return dot;
 }


 /*
 ============
 idSIMD_SSE3::GetName

   Returns the fixed label string for this SIMD processor implementation
   (_WIN32 build).
 ============
 */
 const char * idSIMD_SSE3::GetName( void ) const {
         const char * const processorName = "MMX & SSE & SSE2 & SSE3";
         return processorName;
 }


 /*
 ============
 idSIMD_SSE3::TransformVerts

   For each of the numVerts vertices, accumulates weight * joint-matrix-row
   products over the vertex's weights and stores the three row sums in
   verts[i].xyz. No other idDrawVert member is written.

   index is read as pairs of ints, one pair per weight:
     index[j*2+0]  byte offset of the joint matrix from the start of joints
     index[j*2+1]  non-zero on the last weight of a vertex, zero otherwise
   weights holds one idVec4 per weight, consumed in the same order.
   numWeights is not referenced by either code path.

   The "#if 1" branch is the inline assembly version; the "#else" branch is
   the equivalent scalar code.

   NOTE(review): movaps / mulps with memory operands require 16-byte aligned
   addresses - confirm weights and joints are always allocated that way.
   NOTE(review): the assembly only skips numVerts == 0 ( jz ). For a negative
   numVerts it would still run the vertex loop body once, whereas the scalar
   loop ( i < numVerts ) would not run at all.
 ============
 */
 void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {

 #if 1


         // The assembly below hard-codes these sizes and the xyz offset.
         assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );

         assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

         assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );

         assert( sizeof( idJointMat ) == JOINTMAT_SIZE );


         __asm

         {

                 // early out when there are no vertices
                 mov                     eax, numVerts

                 test            eax, eax

                 jz                      done

                 // eax = numVerts * DRAWVERT_SIZE ( total bytes of vertices )
                 imul            eax, DRAWVERT_SIZE


                 // ecx = verts, edx = index pairs, esi = weights, edi = joints
                 mov                     ecx, verts

                 mov                     edx, index

                 mov                     esi, weights

                 mov                     edi, joints


                 // ecx = end of the vertex range, eax = negative byte offset
                 // from that end; eax counts up towards zero
                 add                     ecx, eax

                 neg                     eax


         loopVert:

                 // first weight of the vertex: ebx = joint byte offset,
                 // xmm0 = xmm1 = xmm2 = weight vector; advance to the next
                 // index pair and the next weight
                 mov                     ebx, [edx]

                 movaps          xmm2, [esi]

                 add                     edx, 8

                 movaps          xmm0, xmm2

                 add                     esi, JOINTWEIGHT_SIZE

                 movaps          xmm1, xmm2


                 // multiply the weight with the three 16-byte rows of the joint matrix
                 mulps           xmm0, [edi+ebx+ 0]                                              // xmm0 = m0, m1, m2, t0

                 mulps           xmm1, [edi+ebx+16]                                              // xmm1 = m3, m4, m5, t1

                 mulps           xmm2, [edi+ebx+32]                                              // xmm2 = m6, m7, m8, t2


                 // second int of the pair just consumed: non-zero means this
                 // was the last weight of the vertex
                 cmp                     dword ptr [edx-4], 0


                 jne                     doneWeight


         loopWeight:

                 // additional weights: same as above into xmm3..xmm5
                 mov                     ebx, [edx]

                 movaps          xmm5, [esi]

                 add                     edx, 8

                 movaps          xmm3, xmm5

                 add                     esi, JOINTWEIGHT_SIZE

                 movaps          xmm4, xmm5


                 mulps           xmm3, [edi+ebx+ 0]                                              // xmm3 = m0, m1, m2, t0

                 mulps           xmm4, [edi+ebx+16]                                              // xmm4 = m3, m4, m5, t1

                 mulps           xmm5, [edi+ebx+32]                                              // xmm5 = m6, m7, m8, t2


                 // the result of this compare is consumed by the je below;
                 // only SSE adds sit in between
                 cmp                     dword ptr [edx-4], 0


                 // accumulate into the per-row sums
                 addps           xmm0, xmm3

                 addps           xmm1, xmm4

                 addps           xmm2, xmm5


                 je                      loopWeight


         doneWeight:

                 // step to the next vertex; the flags of this add are consumed
                 // by the jl at the bottom, the current vertex is addressed
                 // with -DRAWVERT_SIZE
                 add                     eax, DRAWVERT_SIZE


                 // xmm0 = pair sums of row 0 ( lanes 0,1 ) and row 1 ( lanes 2,3 )
                 haddps(         _xmm0, _xmm1 )

                 // xmm2 = pair sums of row 2 ( lanes 0,1 ), full row 0 sum ( lane 2 ),
                 // full row 1 sum ( lane 3 )
                 haddps(         _xmm2, _xmm0 )


                 // store the upper two lanes ( row 0 and row 1 sums ) at byte offsets 0 and 4
                 movhps          [ecx+eax-DRAWVERT_SIZE+0], xmm2


                 // lane 0 = full row 2 sum
                 haddps(         _xmm2, _xmm2 )


                 // store it at byte offset 8
                 movss           [ecx+eax-DRAWVERT_SIZE+8], xmm2


                 // loop while the offset is still negative
                 jl                      loopVert

         done:

         }


 #else


         // scalar version: i counts vertices, j counts weights
         int i, j;

         const byte *jointsPtr = (byte *)joints;


         for( j = i = 0; i < numVerts; i++ ) {

                 idVec3 v;


                 // first weight of the vertex; index[j*2+0] is a byte offset into joints
                 v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];

                 // index[j*2+1] == 0 means more weights follow for this vertex
                 while( index[j*2+1] == 0 ) {

                         j++;

                         v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];

                 }

                 // step past the last weight of this vertex
                 j++;


                 verts[i].xyz = v;

         }


 #endif

 }


 #endif /* _WIN32 */

idSIMD_Generic::TransformVerts
virtual void VPCALL TransformVerts(idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights)
Definition: Simd_Generic.cpp:2316

dword
unsigned int dword
Definition: Lib.h:77

assert
assert(prefInfo.fullscreenBtn)

weights
const GLbyte * weights
Definition: glext.h:3273

idJointMat
Definition: JointTransform.h:63

v
const GLdouble * v
Definition: glext.h:2936

idDrawVert
Definition: DrawVert.h:40

const
#define const
Definition: getdate.c:251

idDrawVert::xyz
idVec3 xyz
Definition: DrawVert.h:42

idVec3
Definition: Vector.h:316

Simd_SSE2.h

process.i
int i
Definition: process.py:33

test
int test(char *url)
Definition: lib500.c:3

v2
GLfloat GLfloat GLfloat v2
Definition: glext.h:3608

index
GLuint index
Definition: glext.h:3476

idVec4
Definition: Vector.h:808

Simd_MMX.h

Simd_SSE3.h

Simd_Generic.h

v1
GLfloat GLfloat v1
Definition: glext.h:3607

byte
unsigned char byte
Definition: Lib.h:75

VPCALL
#define VPCALL
Definition: Simd.h:63

j
GLint j
Definition: qgl.h:264

Simd_SSE.h

idSIMD_SSE3
Definition: Simd_SSE3.h:40

idSIMD_Generic::GetName
virtual const char *VPCALL GetName(void) const
Definition: Simd_Generic.cpp:60