doom3-gpl
Doom 3 GPL source release
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Simd_SSE3.cpp
Go to the documentation of this file.
1 /*
2 ===========================================================================
3 
4 Doom 3 GPL Source Code
5 Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
6 
7 This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
8 
9 Doom 3 Source Code is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or
12 (at your option) any later version.
13 
14 Doom 3 Source Code is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18 
19 You should have received a copy of the GNU General Public License
20 along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
21 
22 In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
23 
24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
25 
26 ===========================================================================
27 */
28 
29 #include "../precompiled.h"
30 #pragma hdrstop
31 
32 #include "Simd_Generic.h"
33 #include "Simd_MMX.h"
34 #include "Simd_SSE.h"
35 #include "Simd_SSE2.h"
36 #include "Simd_SSE3.h"
37 
38 
39 //===============================================================
40 //
41 // SSE3 implementation of idSIMDProcessor
42 //
43 //===============================================================
44 
45 #if defined(MACOS_X) && defined(__i386__)
46 
47 /*
48 ============
49 idSIMD_SSE3::GetName
50 ============
51 */
52 const char * idSIMD_SSE3::GetName( void ) const {
53  return "MMX & SSE & SSE2 & SSE3";
54 }
55 
56 #elif defined(_WIN32)
57 
58 #include <xmmintrin.h>
59 
// Selector-byte builders.
// SHUFFLEPS packs four 2-bit fields into one byte with x in bits 7..6, y in 5..4, z in 3..2, w in 1..0.
// R_SHUFFLEPS packs the same four fields in the reverse order (x in bits 1..0, w in bits 7..6).
// SHUFFLEPD / R_SHUFFLEPD do the same for two 1-bit fields (bit 1 and bit 0).
// Every argument is masked first, so an out-of-range value wraps instead of spilling into a neighbouring field.
// NOTE(review): presumably these build the immediate operand of shufps / shufpd; nothing in this file uses them - confirm against callers.
60 #define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
61 #define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
62 #define SHUFFLEPD( x, y ) (( (x) & 1 ) << 1 | ( (y) & 1 ))
63 #define R_SHUFFLEPD( x, y ) (( (y) & 1 ) << 1 | ( (x) & 1 ))
64 
65 /*
66 
67  The first argument of an instruction macro is the destination
68  and the second argument is the source operand. The destination
69  operand can be _xmm0 to _xmm7 only. The source operand can be
70  any one of the registers _xmm0 to _xmm7 or _eax, _ecx, _edx, _esp,
71  _ebp, _ebx, _esi, or _edi that contains the effective address.
72 
73  For instance: haddps xmm0, xmm1
74  becomes: haddps( _xmm0, _xmm1 )
75  and: haddps xmm0, [esi]
76  becomes: haddps( _xmm0, _esi )
77 
78  The ADDRESS_ADDC macro can be used when the effective source address
79  is formed by adding a constant to a general purpose register. The constant is emitted as a single byte, so it must be in the range [-128, 127].
80  For instance: haddps xmm0, [esi+48]
81  becomes: haddps( _xmm0, ADDRESS_ADDC( _esi, 48 ) )
82 
83  The ADDRESS_ADDR macro can be used when the effective source address
84  is formed by adding two general purpose registers.
85  For instance: haddps xmm0, [esi+eax]
86  becomes: haddps( _xmm0, ADDRESS_ADDR( _esi, _eax ) )
87 
88  The ADDRESS_ADDRC macro can be used when the effective source address
89  is formed by adding two general purpose registers and a constant.
90  The constant must be in the range [-128, 127].
91  For instance: haddps xmm0, [esi+eax+48]
92  becomes: haddps( _xmm0, ADDRESS_ADDRC( _esi, _eax, 48 ) )
93 
94  The ADDRESS_SCALEADDR macro can be used when the effective source address is formed
95  by adding a scaled general purpose register to another general purpose register.
96  The scale must be either 1, 2, 4 or 8.
97  For instance: haddps xmm0, [esi+eax*4]
98  becomes: haddps( _xmm0, ADDRESS_SCALEADDR( _esi, _eax, 4 ) )
99 
100  The ADDRESS_SCALEADDRC macro can be used when the effective source address is formed
101  by adding a scaled general purpose register to another general purpose register and
102  also adding a constant. The scale must be either 1, 2, 4 or 8. The constant must
103  be in the range [-128, 127].
104  For instance: haddps xmm0, [esi+eax*4+64]
105  becomes: haddps( _xmm0, ADDRESS_SCALEADDRC( _esi, _eax, 4, 64 ) )
106 
107 */
108 
// Operand codes for the instruction macros below. The low three bits are the register number.
// The general purpose register codes leave the top two bits clear, which the instruction macros
// use as "source is the memory at the address held in this register".
// NOTE(review): in 32-bit x86 encoding a bare _esp (4) in that position announces a following SIB byte and
// a bare _ebp (5) announces a 32-bit displacement, so passing either directly as a source would not
// address [esp] / [ebp] - confirm before relying on the usage comment's claim that any register works.
109 #define _eax 0x00
110 #define _ecx 0x01
111 #define _edx 0x02
112 #define _ebx 0x03
113 #define _esp 0x04
114 #define _ebp 0x05
115 #define _esi 0x06
116 #define _edi 0x07
117 
// The xmm codes are 0xC0 + register number: the two top bits are set, which selects the
// register-to-register form when OR'ed into the final byte of an instruction macro.
// As a destination only the low three bits are used ( dst & 7 ).
118 #define _xmm0 0xC0
119 #define _xmm1 0xC1
120 #define _xmm2 0xC2
121 #define _xmm3 0xC3
122 #define _xmm4 0xC4
123 #define _xmm5 0xC5
124 #define _xmm6 0xC6
125 #define _xmm7 0xC7
126 
// Converts an address scale factor of 1, 2, 4 or 8 into the two top bits of the
// scaled-address byte emitted by ADDRESS_SCALEADDR / ADDRESS_SCALEADDRC:
//   1 -> 0x00, 2 -> 0x40, 4 -> 0x80, 8 -> 0xC0 (8 sets both bits).
// The argument and the whole expansion are parenthesized so that expression arguments
// such as RSCALE( 1 + 1 ) and surrounding operators such as == or * bind as expected.
#define RSCALE( s )		( ( ( (s) & 2 ) << 5 ) | ( ( (s) & 4 ) << 5 ) | ( ( (s) & 8 ) << 3 ) | ( ( (s) & 8 ) << 4 ) )
128 
// Address-form builders, meant to be passed as the src argument of an instruction macro.
// The first token of each expansion is OR'ed into the instruction macro's final byte; the
// following _asm _emit lines append the extra address bytes. Because the expansion is several
// statements, it must stay unparenthesized and can only appear as the last thing on an _emit line.

// [reg0 + constant]: 0x40 | reg0, then one byte holding the constant (so it must fit in 8 bits).
129 #define ADDRESS_ADDC( reg0, constant ) 0x40 | ( reg0 & 7 ) \
130  _asm _emit constant
131 
// [reg0 + reg1]: 0x04, then one byte with reg1 in bits 5..3 and reg0 in bits 2..0.
// NOTE(review): in x86 encoding reg1 == _esp means "no index" and reg0 == _ebp without a
// displacement means "32-bit displacement, no base" - confirm those registers are never passed here.
132 #define ADDRESS_ADDR( reg0, reg1 ) 0x04 \
133  _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 )
134 
// [reg0 + reg1 + constant]: 0x44, the same register byte as ADDRESS_ADDR, then a one-byte constant.
135 #define ADDRESS_ADDRC( reg0, reg1, constant ) 0x44 \
136  _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) \
137  _asm _emit constant
138 
// [reg0 + reg1*scale]: as ADDRESS_ADDR with the RSCALE( scale ) bits OR'ed into the register byte.
139 #define ADDRESS_SCALEADDR( reg0, reg1, scale ) 0x04 \
140  _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale )
141 
// [reg0 + reg1*scale + constant]: as ADDRESS_ADDRC with the RSCALE( scale ) bits OR'ed into the register byte.
142 #define ADDRESS_SCALEADDRC( reg0, reg1, scale, constant ) 0x44 \
143  _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale ) \
144  _asm _emit constant
145 
146 
// SSE3 instruction encoders for use inside __asm blocks.
// Each macro emits three fixed opcode bytes followed by a final byte built from ( dst & 7 ) << 3
// OR'ed with src. src is an _xmmN or general purpose register code, or an ADDRESS_* macro whose
// expansion appends further bytes; that is why src is left unparenthesized on the last line.

// Packed Single-FP Add/Subtract ( dst[0]=dst[0]-src[0], dst[1]=dst[1]+src[1], dst[2]=dst[2]-src[2], dst[3]=dst[3]+src[3] )
// Even elements are subtracted and odd elements are added, per Intel's definition of ADDSUBPS.
147 // Packed Single-FP Add/Subtract
148 #define addsubps( dst, src ) \
149  _asm _emit 0xF2 \
150  _asm _emit 0x0F \
151  _asm _emit 0xD0 \
152  _asm _emit ( ( dst & 7 ) << 3 ) | src
153 
// Element 0 is subtracted and element 1 is added, per Intel's definition of ADDSUBPD.
154 // Packed Double-FP Add/Subtract ( dst[0]=dst[0]-src[0], dst[1]=dst[1]+src[1] )
155 #define addsubpd( dst, src ) \
156  _asm _emit 0x66 \
157  _asm _emit 0x0F \
158  _asm _emit 0xD0 \
159  _asm _emit ( ( dst & 7 ) << 3 ) | src
160 
161 // Packed Single-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=dst[2]+dst[3], dst[2]=src[0]+src[1], dst[3]=src[2]+src[3] )
162 #define haddps( dst, src ) \
163  _asm _emit 0xF2 \
164  _asm _emit 0x0F \
165  _asm _emit 0x7C \
166  _asm _emit ( ( dst & 7 ) << 3 ) | src
167 
168 // Packed Double-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=src[0]+src[1] )
169 #define haddpd( dst, src ) \
170  _asm _emit 0x66 \
171  _asm _emit 0x0F \
172  _asm _emit 0x7C \
173  _asm _emit ( ( dst & 7 ) << 3 ) | src
174 
175 // Packed Single-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=dst[2]-dst[3], dst[2]=src[0]-src[1], dst[3]=src[2]-src[3] )
176 #define hsubps( dst, src ) \
177  _asm _emit 0xF2 \
178  _asm _emit 0x0F \
179  _asm _emit 0x7D \
180  _asm _emit ( ( dst & 7 ) << 3 ) | src
181 
182 // Packed Double-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=src[0]-src[1] )
183 #define hsubpd( dst, src ) \
184  _asm _emit 0x66 \
185  _asm _emit 0x0F \
186  _asm _emit 0x7D \
187  _asm _emit ( ( dst & 7 ) << 3 ) | src
188 
189 // Move Packed Single-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0], dst[2]=src[2], dst[3]=src[2] )
190 #define movsldup( dst, src ) \
191  _asm _emit 0xF3 \
192  _asm _emit 0x0F \
193  _asm _emit 0x12 \
194  _asm _emit ( ( dst & 7 ) << 3 ) | src
195 
// NOTE(review): Intel's SSE3 reference names the F2 0F 12 encoding MOVDDUP; "movdldup" appears to be a local name for it.
196 // Move One Double-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0] )
197 #define movdldup( dst, src ) \
198  _asm _emit 0xF2 \
199  _asm _emit 0x0F \
200  _asm _emit 0x12 \
201  _asm _emit ( ( dst & 7 ) << 3 ) | src
202 
203 // Move Packed Single-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1], dst[2]=src[3], dst[3]=src[3] )
204 #define movshdup( dst, src ) \
205  _asm _emit 0xF3 \
206  _asm _emit 0x0F \
207  _asm _emit 0x16 \
208  _asm _emit ( ( dst & 7 ) << 3 ) | src
209 
// NOTE(review): Intel's SSE3 instruction list has no "high" counterpart to MOVDDUP and does not
// appear to define an F2 0F 16 encoding. This macro is not used in this file - verify against the
// Intel instruction set reference before using it.
210 // Move One Double-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1] )
211 #define movdhdup( dst, src ) \
212  _asm _emit 0xF2 \
213  _asm _emit 0x0F \
214  _asm _emit 0x16 \
215  _asm _emit ( ( dst & 7 ) << 3 ) | src
216 
// NOTE(review): LDDQU loads from memory, so src is presumably expected to be a general purpose
// register code or an ADDRESS_* form rather than an _xmmN code - confirm.
217 // Load Unaligned Integer 128 bits
218 #define lddqu( dst, src ) \
219  _asm _emit 0xF2 \
220  _asm _emit 0x0F \
221  _asm _emit 0xF0 \
222  _asm _emit ( ( dst & 7 ) << 3 ) | src
223 
224 
// Byte sizes and byte offsets used as address displacements by the inline assembly in this file.
// TransformVerts asserts that DRAWVERT_SIZE, DRAWVERT_XYZ_OFFSET, JOINTWEIGHT_SIZE and JOINTMAT_SIZE
// match sizeof( idDrawVert ), the offset of idDrawVert::xyz, sizeof( idVec4 ) and sizeof( idJointMat ).
// The remaining values are not checked or used in this file.
// NOTE(review): the factor 4 is presumably sizeof( float ) - confirm against the type definitions.
225 #define DRAWVERT_SIZE 60
226 #define DRAWVERT_XYZ_OFFSET (0*4)
227 #define DRAWVERT_ST_OFFSET (3*4)
228 #define DRAWVERT_NORMAL_OFFSET (5*4)
229 #define DRAWVERT_TANGENT0_OFFSET (8*4)
230 #define DRAWVERT_TANGENT1_OFFSET (11*4)
231 #define DRAWVERT_COLOR_OFFSET (14*4)
232 
233 #define JOINTQUAT_SIZE (7*4)
234 #define JOINTMAT_SIZE (4*3*4)
235 #define JOINTWEIGHT_SIZE (4*4)
236 
237 
238 /*
239 ============
240 SSE3_Dot
241 ============
242 */
// Returns the sum of the four component-wise products of v1 and v2.
// Uses esi, edi and xmm0 as scratch.
// NOTE(review): movaps and a memory operand to mulps both require a 16-byte aligned address;
// idVec4's alignment is not visible from this file - confirm callers guarantee it.
243 float SSE3_Dot( const idVec4 &v1, const idVec4 &v2 ) {
244  float d;
245  __asm {
     // the reference parameters are loaded as addresses and dereferenced below
246  mov esi, v1
247  mov edi, v2
     // xmm0 = ( p0, p1, p2, p3 ), the four component-wise products
248  movaps xmm0, [esi]
249  mulps xmm0, [edi]
     // first horizontal add: xmm0 = ( p0+p1, p2+p3, p0+p1, p2+p3 )
250  haddps( _xmm0, _xmm0 )
     // second horizontal add: every lane = p0+p1+p2+p3
251  haddps( _xmm0, _xmm0 )
     // store the low lane as the result
252  movss d, xmm0
253  }
254  return d;
255 }
256 
257 /*
258 ============
259 idSIMD_SSE3::GetName
260 ============
261 */
262 const char * idSIMD_SSE3::GetName( void ) const {
263  return "MMX & SSE & SSE2 & SSE3";
264 }
265 
266 /*
267 ============
268 idSIMD_SSE3::TransformVerts
269 ============
270 */
// Computes verts[i].xyz for numVerts vertices as a weighted sum of joint matrix rows.
//
// index holds pairs of ints, one pair per weight: the first is a byte offset that is added directly
// to the joints pointer, the second is a flag. A flag of zero means another weight follows for the
// same vertex; a non-zero flag ends the vertex. weights advances by one idVec4 per pair.
// numWeights is not referenced by either code path.
//
// The assembly path returns immediately when numVerts is zero. The vertex loop is bottom-tested,
// so a negative numVerts would still process one vertex, unlike the C++ reference path below.
// NOTE(review): movaps / mulps with memory operands require weights and the addressed joint rows
// to be 16-byte aligned - confirm the allocators guarantee this.
271 void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
272 #if 1
273 
     // the assembly below hard-codes these sizes and the xyz offset
274  assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
275  assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
276  assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
277  assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
278 
279  __asm
280  {
281  mov eax, numVerts
282  test eax, eax
283  jz done
284  imul eax, DRAWVERT_SIZE
285 
286  mov ecx, verts
287  mov edx, index
288  mov esi, weights
289  mov edi, joints
290 
     // ecx = one past the last vertex, eax = negative byte offset of the current vertex;
     // eax counts up towards zero so the loop can end on the sign of eax alone
291  add ecx, eax
292  neg eax
293 
294  loopVert:
     // first weight of the vertex: ebx = joint byte offset, xmm0..xmm2 = weight vector
295  mov ebx, [edx]
296  movaps xmm2, [esi]
297  add edx, 8
298  movaps xmm0, xmm2
299  add esi, JOINTWEIGHT_SIZE
300  movaps xmm1, xmm2
301 
     // multiply the weight vector component-wise with each of the three 16-byte rows of the joint matrix
302  mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0
303  mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1
304  mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2
305 
     // edx already points at the next pair, so [edx-4] is the flag of the pair just consumed
306  cmp dword ptr [edx-4], 0
307 
308  jne doneWeight
309 
310  loopWeight:
     // additional weights for the same vertex are accumulated into xmm0..xmm2
311  mov ebx, [edx]
312  movaps xmm5, [esi]
313  add edx, 8
314  movaps xmm3, xmm5
315  add esi, JOINTWEIGHT_SIZE
316  movaps xmm4, xmm5
317 
318  mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0
319  mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1
320  mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2
321 
     // the flags from this cmp are consumed by the je below; the addps in between are relied upon to leave them untouched
322  cmp dword ptr [edx-4], 0
323 
324  addps xmm0, xmm3
325  addps xmm1, xmm4
326  addps xmm2, xmm5
327 
328  je loopWeight
329 
330  doneWeight:
     // advance to the next vertex now; the flags from this add are consumed by the jl at the bottom,
     // and the stores below subtract DRAWVERT_SIZE again to address the vertex just computed
331  add eax, DRAWVERT_SIZE
332 
     // xmm0 = ( partial sums of row 0, partial sums of row 1 )
333  haddps( _xmm0, _xmm1 )
     // xmm2 = ( partial sums of row 2, sum of row 0, sum of row 1 )
334  haddps( _xmm2, _xmm0 )
335 
     // the upper two lanes are the finished x and y
336  movhps [ecx+eax-DRAWVERT_SIZE+0], xmm2
337 
     // low lane = sum of row 2, the finished z
338  haddps( _xmm2, _xmm2 )
339 
340  movss [ecx+eax-DRAWVERT_SIZE+8], xmm2
341 
     // loop while eax is still negative, i.e. vertices remain
342  jl loopVert
343  done:
344  }
345 
346 #else
347 
     // C++ reference path, compiled out by the #if 1 above
348  int i, j;
349  const byte *jointsPtr = (byte *)joints;
350 
351  for( j = i = 0; i < numVerts; i++ ) {
352  idVec3 v;
353 
354  v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
355  while( index[j*2+1] == 0 ) {
356  j++;
357  v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
358  }
359  j++;
360 
361  verts[i].xyz = v;
362  }
363 
364 #endif
365 }
366 
367 #endif /* _WIN32 */
virtual void VPCALL TransformVerts(idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights)
unsigned int dword
Definition: Lib.h:77
assert(prefInfo.fullscreenBtn)
const GLbyte * weights
Definition: glext.h:3273
const GLdouble * v
Definition: glext.h:2936
#define const
Definition: getdate.c:251
idVec3 xyz
Definition: DrawVert.h:42
Definition: Vector.h:316
int i
Definition: process.py:33
int test(char *url)
Definition: lib500.c:3
GLfloat GLfloat GLfloat v2
Definition: glext.h:3608
GLuint index
Definition: glext.h:3476
Definition: Vector.h:808
GLfloat GLfloat v1
Definition: glext.h:3607
unsigned char byte
Definition: Lib.h:75
#define VPCALL
Definition: Simd.h:63
GLint j
Definition: qgl.h:264
virtual const char *VPCALL GetName(void) const