Simd_SSE2.cpp (Doom 3 GPL source release)
/*
===========================================================================

Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").

Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/

#include "../precompiled.h"
#pragma hdrstop

#include "Simd_Generic.h"
#include "Simd_MMX.h"
#include "Simd_SSE.h"
#include "Simd_SSE2.h"


//===============================================================
//
//	SSE2 implementation of idSIMDProcessor
//
//===============================================================
#if defined(MACOS_X) && defined(__i386__)

#include <xmmintrin.h>

#define SHUFFLEPS( x, y, z, w )		(( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
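// Worked example (comment added for clarity, not in the original release):
// R_SHUFFLEPS( 0, 0, 0, 0 ) == 0x00, which _mm_shuffle_ps uses to broadcast
// element 0 to all four lanes; R_SHUFFLEPS( 0, 1, 2, 3 ) == 0xE4, the identity.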

/*
============
idSIMD_SSE2::GetName
============
*/
const char * idSIMD_SSE2::GetName( void ) const {
	return "MMX & SSE & SSE2";
}

/*
============
idSIMD_SSE2::CmpLT

  dst[i] |= ( src0[i] < constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE2::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	int i, cnt, pre, post;
	float *aligned;
	__m128 xmm0, xmm1;
	__m128i xmm0i;
	int cnt_l;
	char *src0_p;
	char *constant_p;
	char *dst_p;
	int mask_l;
	int dst_l;

	/* if the float array is not aligned on a 4 byte boundary */
	if ( ((int) src0) & 3 ) {
		/* unaligned memory access */
		pre = 0;
		cnt = count >> 2;
		post = count - (cnt<<2);

		/*
			__asm	mov			edx, cnt
			__asm	test		edx, edx
			__asm	je			doneCmp
		*/
		cnt_l = cnt;
		if ( cnt_l != 0 ) {
			/*
				__asm	push		ebx
				__asm	neg			edx
				__asm	mov			esi, src0
				__asm	prefetchnta	[esi+64]
				__asm	movss		xmm1, constant
				__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
				__asm	mov			edi, dst
				__asm	mov			cl, bitNum
			*/
			cnt_l = -cnt_l;
			src0_p = (char *) src0;
			_mm_prefetch( src0_p+64, _MM_HINT_NTA );
			constant_p = (char *) &constant;
			xmm1 = _mm_load_ss( (float *)constant_p );
			xmm1 = _mm_shuffle_ps( xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) );
			dst_p = (char *)dst;
			/*
				__asm loopNA:
			*/
			do {
				/*
					__asm	movups		xmm0, [esi]
					__asm	prefetchnta	[esi+128]
					__asm	cmpltps		xmm0, xmm1
					__asm	movmskps	eax, xmm0
					__asm	mov			ah, al
					__asm	shr			ah, 1
					__asm	mov			bx, ax
					__asm	shl			ebx, 14
					__asm	mov			bx, ax
					__asm	and			ebx, 0x01010101
					__asm	shl			ebx, cl
					__asm	or			ebx, dword ptr [edi]
					__asm	mov			dword ptr [edi], ebx
					__asm	add			esi, 16
					__asm	add			edi, 4
					__asm	inc			edx
					__asm	jl			loopNA
					__asm	pop			ebx
				*/
				xmm0 = _mm_loadu_ps( (float *) src0_p );
				_mm_prefetch( src0_p+128, _MM_HINT_NTA );
				xmm0 = _mm_cmplt_ps( xmm0, xmm1 );
				// Simplify using SSE2: pack the four 32-bit compare masks down
				// to one byte each, replacing the movmskps bit-twiddling above
				xmm0i = _mm_castps_si128( xmm0 );
				xmm0i = _mm_packs_epi32( xmm0i, xmm0i );
				xmm0i = _mm_packs_epi16( xmm0i, xmm0i );
				mask_l = _mm_cvtsi128_si32( xmm0i );
				// End
				mask_l = mask_l & 0x01010101;
				mask_l = mask_l << bitNum;
				dst_l = *((int *) dst_p);
				mask_l = mask_l | dst_l;
				*((int *) dst_p) = mask_l;
				src0_p = src0_p + 16;
				dst_p = dst_p + 4;
				cnt_l = cnt_l + 1;
			} while ( cnt_l < 0 );
		}
	}
	else {
		/* aligned memory access */
		aligned = (float *) ((((int) src0) + 15) & ~15);
		if ( (int)aligned > ((int)src0) + count ) {
			pre = count;
			post = 0;
		}
		else {
			pre = aligned - src0;
			cnt = (count - pre) >> 2;
			post = count - pre - (cnt<<2);
			/*
				__asm	mov			edx, cnt
				__asm	test		edx, edx
				__asm	je			doneCmp
			*/
			cnt_l = cnt;
			if ( cnt_l != 0 ) {
				/*
					__asm	push		ebx
					__asm	neg			edx
					__asm	mov			esi, aligned
					__asm	prefetchnta	[esi+64]
					__asm	movss		xmm1, constant
					__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
					__asm	mov			edi, dst
					__asm	add			edi, pre
					__asm	mov			cl, bitNum
				*/
				cnt_l = -cnt_l;
				src0_p = (char *) aligned;	// was (char *) src0 in the release; the asm above starts at the aligned pointer
				_mm_prefetch( src0_p+64, _MM_HINT_NTA );
				constant_p = (char *) &constant;
				xmm1 = _mm_load_ss( (float *)constant_p );
				xmm1 = _mm_shuffle_ps( xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) );
				dst_p = (char *)dst;
				dst_p = dst_p + pre;
				/*
					__asm loopA:
				*/
				do {
					/*
						__asm	movaps		xmm0, [esi]
						__asm	prefetchnta	[esi+128]
						__asm	cmpltps		xmm0, xmm1
						__asm	movmskps	eax, xmm0
						__asm	mov			ah, al
						__asm	shr			ah, 1
						__asm	mov			bx, ax
						__asm	shl			ebx, 14
						__asm	mov			bx, ax
						__asm	and			ebx, 0x01010101
						__asm	shl			ebx, cl
						__asm	or			ebx, dword ptr [edi]
						__asm	mov			dword ptr [edi], ebx
						__asm	add			esi, 16
						__asm	add			edi, 4
						__asm	inc			edx
						__asm	jl			loopA
						__asm	pop			ebx
					*/
					xmm0 = _mm_load_ps( (float *) src0_p );
					_mm_prefetch( src0_p+128, _MM_HINT_NTA );
					xmm0 = _mm_cmplt_ps( xmm0, xmm1 );
					// Simplify using SSE2: same byte-pack trick as the unaligned loop
					xmm0i = _mm_castps_si128( xmm0 );
					xmm0i = _mm_packs_epi32( xmm0i, xmm0i );
					xmm0i = _mm_packs_epi16( xmm0i, xmm0i );
					mask_l = _mm_cvtsi128_si32( xmm0i );
					// End
					mask_l = mask_l & 0x01010101;
					mask_l = mask_l << bitNum;
					dst_l = *((int *) dst_p);
					mask_l = mask_l | dst_l;
					*((int *) dst_p) = mask_l;
					src0_p = src0_p + 16;
					dst_p = dst_p + 4;
					cnt_l = cnt_l + 1;
				} while ( cnt_l < 0 );
			}
		}
	}
	/*
	doneCmp:
	*/
	float c = constant;
	for ( i = 0; i < pre; i++ ) {
		dst[i] |= ( src0[i] < c ) << bitNum;
	}
	for ( i = count - post; i < count; i++ ) {
		dst[i] |= ( src0[i] < c ) << bitNum;
	}
}
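
/*
	Reference scalar version of CmpLT, added here only as an illustrative
	sketch of what the vector code computes (not part of the original release):

	void CmpLT_Scalar( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
		for ( int i = 0; i < count; i++ ) {
			dst[i] |= ( src0[i] < constant ) << bitNum;
		}
	}

	The SIMD path compares four floats per iteration, packs the four compare
	masks to one byte each with _mm_packs_epi32/_mm_packs_epi16, masks them to
	0x01010101 and ORs the shifted result into four dst bytes at once.
*/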

#elif defined(_WIN32)

#include <xmmintrin.h>

#define SHUFFLEPS( x, y, z, w )		(( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
#define SHUFFLEPD( x, y )			(( (x) & 1 ) << 1 | ( (y) & 1 ))
#define R_SHUFFLEPD( x, y )			(( (y) & 1 ) << 1 | ( (x) & 1 ))
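// Worked example (comment added for clarity, not in the original release):
// R_SHUFFLEPD( 1, 0 ) == 0x01, which shufpd uses to swap the two doubles of a
// register; R_SHUFFLEPD( 0, 0 ) == 0x00 broadcasts the low double to both lanes.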


#define ALIGN4_INIT1( X, INIT )				ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT }
#define ALIGN4_INIT4( X, I0, I1, I2, I3 )	ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
#define ALIGN8_INIT1( X, INIT )				ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }

ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );

ALIGN4_INIT4( unsigned long SIMD_SP_singleSignBitMask, (unsigned long) ( 1 << 31 ), 0, 0, 0 );
ALIGN4_INIT1( unsigned long SIMD_SP_signBitMask, (unsigned long) ( 1 << 31 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_absMask, (unsigned long) ~( 1 << 31 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_infinityMask, (unsigned long) ~( 1 << 23 ) );

ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY );


/*
============
idSIMD_SSE2::GetName
============
*/
const char * idSIMD_SSE2::GetName( void ) const {
	return "MMX & SSE & SSE2";
}

#if 0		// the SSE2 code is ungodly slow

/*
============
idSIMD_SSE2::MatX_LowerTriangularSolve

  solves x in Lx = b for the n * n sub-matrix of L
  if skip > 0 the first skip elements of x are assumed to be valid already
  L has to be a lower triangular matrix with (implicit) ones on the diagonal
  x == b is allowed
============
*/
void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
	int nc;
	const float *lptr;

	if ( skip >= n ) {
		return;
	}

	lptr = L[skip];
	nc = L.GetNumColumns();

	// unrolled cases for n < 8
	if ( n < 8 ) {
		#define NSKIP( n, s )	((n<<3)|(s&7))
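		// NSKIP packs the pair (n, skip) into a single switch key, e.g.
		// NSKIP( 5, 2 ) == (5<<3)|2 == 42; the cases below deliberately fall
		// through, so each row's solve builds on the rows above it.
		// (comment added for clarity, not in the original release)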
		switch( NSKIP( n, skip ) ) {
			case NSKIP( 1, 0 ): x[0] = b[0];
				return;
			case NSKIP( 2, 0 ): x[0] = b[0];
			case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
				return;
			case NSKIP( 3, 0 ): x[0] = b[0];
			case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
				return;
			case NSKIP( 4, 0 ): x[0] = b[0];
			case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
				return;
			case NSKIP( 5, 0 ): x[0] = b[0];
			case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
				return;
			case NSKIP( 6, 0 ): x[0] = b[0];
			case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
			case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
				return;
			case NSKIP( 7, 0 ): x[0] = b[0];
			case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
			case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
			case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
				return;
		}
		return;
	}

	// process first 4 rows
	switch( skip ) {
		case 0: x[0] = b[0];
		case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
		case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
		case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
				skip = 4;
	}

	lptr = L[skip];

	__asm {
		push		ebx
		mov			eax, skip				// eax = i
		shl			eax, 2					// eax = i*4
		mov			edx, n					// edx = n
		shl			edx, 2					// edx = n*4
		mov			esi, x					// esi = x
		mov			edi, lptr				// edi = lptr
		add			esi, eax
		add			edi, eax
		mov			ebx, b					// ebx = b
		// aligned
	looprow:
		mov			ecx, eax
		neg			ecx
		cvtps2pd	xmm0, [esi+ecx]
		cvtps2pd	xmm2, [edi+ecx]
		mulpd		xmm0, xmm2
		cvtps2pd	xmm1, [esi+ecx+8]
		cvtps2pd	xmm3, [edi+ecx+8]
		mulpd		xmm1, xmm3
		add			ecx, 20*4
		jg			donedot16
	dot16:
		cvtps2pd	xmm2, [esi+ecx-(16*4)]
		cvtps2pd	xmm3, [edi+ecx-(16*4)]
		cvtps2pd	xmm4, [esi+ecx-(14*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(14*4)]
		addpd		xmm0, xmm2
		cvtps2pd	xmm2, [esi+ecx-(12*4)]
		mulpd		xmm4, xmm5
		cvtps2pd	xmm3, [edi+ecx-(12*4)]
		addpd		xmm1, xmm4
		cvtps2pd	xmm4, [esi+ecx-(10*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(10*4)]
		addpd		xmm0, xmm2
		cvtps2pd	xmm2, [esi+ecx-(8*4)]
		mulpd		xmm4, xmm5
		cvtps2pd	xmm3, [edi+ecx-(8*4)]
		addpd		xmm1, xmm4
		cvtps2pd	xmm4, [esi+ecx-(6*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(6*4)]
		addpd		xmm0, xmm2
		cvtps2pd	xmm2, [esi+ecx-(4*4)]
		mulpd		xmm4, xmm5
		cvtps2pd	xmm3, [edi+ecx-(4*4)]
		addpd		xmm1, xmm4
		cvtps2pd	xmm4, [esi+ecx-(2*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(2*4)]
		addpd		xmm0, xmm2
		add			ecx, 16*4
		mulpd		xmm4, xmm5
		addpd		xmm1, xmm4
		jle			dot16
	donedot16:
		sub			ecx, 8*4
		jg			donedot8
	dot8:
		cvtps2pd	xmm2, [esi+ecx-(8*4)]
		cvtps2pd	xmm3, [edi+ecx-(8*4)]
		cvtps2pd	xmm7, [esi+ecx-(6*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(6*4)]
		addpd		xmm0, xmm2
		cvtps2pd	xmm6, [esi+ecx-(4*4)]
		mulpd		xmm7, xmm5
		cvtps2pd	xmm3, [edi+ecx-(4*4)]
		addpd		xmm1, xmm7
		cvtps2pd	xmm4, [esi+ecx-(2*4)]
		mulpd		xmm6, xmm3
		cvtps2pd	xmm7, [edi+ecx-(2*4)]
		addpd		xmm0, xmm6
		add			ecx, 8*4
		mulpd		xmm4, xmm7
		addpd		xmm1, xmm4
	donedot8:
		sub			ecx, 4*4
		jg			donedot4
	dot4:
		cvtps2pd	xmm2, [esi+ecx-(4*4)]
		cvtps2pd	xmm3, [edi+ecx-(4*4)]
		cvtps2pd	xmm4, [esi+ecx-(2*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(2*4)]
		addpd		xmm0, xmm2
		add			ecx, 4*4
		mulpd		xmm4, xmm5
		addpd		xmm1, xmm4
	donedot4:
		addpd		xmm0, xmm1
		movaps		xmm1, xmm0
		shufpd		xmm1, xmm1, R_SHUFFLEPD( 1, 0 )
		addsd		xmm0, xmm1
		sub			ecx, 4*4
		jz			dot0
		add			ecx, 4
		jz			dot1
		add			ecx, 4
		jz			dot2
	//dot3:
		cvtss2sd	xmm1, [esi-(3*4)]
		cvtss2sd	xmm2, [edi-(3*4)]
		mulsd		xmm1, xmm2
		addsd		xmm0, xmm1
	dot2:
		cvtss2sd	xmm3, [esi-(2*4)]
		cvtss2sd	xmm4, [edi-(2*4)]
		mulsd		xmm3, xmm4
		addsd		xmm0, xmm3
	dot1:
		cvtss2sd	xmm5, [esi-(1*4)]
		cvtss2sd	xmm6, [edi-(1*4)]
		mulsd		xmm5, xmm6
		addsd		xmm0, xmm5
	dot0:
		cvtss2sd	xmm1, [ebx+eax]
		subsd		xmm1, xmm0
		cvtsd2ss	xmm0, xmm1
		movss		[esi], xmm0
		add			eax, 4
		cmp			eax, edx
		jge			done
		add			esi, 4
		mov			ecx, nc
		shl			ecx, 2
		add			edi, ecx
		add			edi, 4
		jmp			looprow
		// done
	done:
		pop			ebx
	}
}
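
/*
	For reference, the scalar forward substitution the block above unrolls;
	an illustrative sketch, not part of the original release:

	void MatX_LowerTriangularSolve_Ref( const idMatX &L, float *x, const float *b, const int n, int skip ) {
		const int nc = L.GetNumColumns();
		for ( int i = skip; i < n; i++ ) {
			const float *row = L.ToFloatPtr() + i * nc;
			double s = b[i];					// accumulate the dot product in double precision, like the asm
			for ( int j = 0; j < i; j++ ) {
				s -= row[j] * x[j];
			}
			x[i] = (float) s;					// the diagonal is an implicit 1, so no divide
		}
	}
*/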

/*
============
idSIMD_SSE2::MatX_LowerTriangularSolveTranspose

  solves x in L'x = b for the n * n sub-matrix of L
  L has to be a lower triangular matrix with (implicit) ones on the diagonal
  x == b is allowed
============
*/
void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
	int nc;
	const float *lptr;

	lptr = L.ToFloatPtr();
	nc = L.GetNumColumns();

	// unrolled cases for n < 8
	if ( n < 8 ) {
		switch( n ) {
			case 0:
				return;
			case 1:
				x[0] = b[0];
				return;
			case 2:
				x[1] = b[1];
				x[0] = b[0] - lptr[1*nc+0] * x[1];
				return;
			case 3:
				x[2] = b[2];
				x[1] = b[1] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 4:
				x[3] = b[3];
				x[2] = b[2] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 5:
				x[4] = b[4];
				x[3] = b[3] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 6:
				x[5] = b[5];
				x[4] = b[4] - lptr[5*nc+4] * x[5];
				x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 7:
				x[6] = b[6];
				x[5] = b[5] - lptr[6*nc+5] * x[6];
				x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
				x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
		}
		return;
	}

	int i, j, m;
	float *xptr;
	double s0;

	// if the number of columns is not a multiple of 2 we're screwed for alignment.
	// however, if the number of columns is a multiple of 2 but the number of rows
	// to be processed is not a multiple of 2, we can still run 8 byte aligned
	m = n;
	if ( m & 1 ) {
		m--;
		x[m] = b[m];

		lptr = L[m] + m - 4;
		xptr = x + m;
		__asm {
			push		ebx
			mov			eax, m					// eax = i
			mov			esi, xptr				// esi = xptr
			mov			edi, lptr				// edi = lptr
			mov			ebx, b					// ebx = b
			mov			edx, nc					// edx = nc*sizeof(float)
			shl			edx, 2
		process4rows_1:
			cvtps2pd	xmm0, [ebx+eax*4-16]	// load b[i-4], b[i-3] (comments were swapped in the release)
			cvtps2pd	xmm2, [ebx+eax*4-8]		// load b[i-2], b[i-1]
			xor			ecx, ecx
			sub			eax, m
			neg			eax
			jz			done4x4_1
		process4x4_1:	// process 4x4 blocks
			cvtps2pd	xmm3, [edi]
			cvtps2pd	xmm4, [edi+8]
			add			edi, edx
			cvtss2sd	xmm5, [esi+4*ecx+0]
			shufpd		xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm3, xmm5
			cvtps2pd	xmm1, [edi]
			mulpd		xmm4, xmm5
			cvtps2pd	xmm6, [edi+8]
			subpd		xmm0, xmm3
			subpd		xmm2, xmm4
			add			edi, edx
			cvtss2sd	xmm7, [esi+4*ecx+4]
			shufpd		xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm1, xmm7
			cvtps2pd	xmm3, [edi]
			mulpd		xmm6, xmm7
			cvtps2pd	xmm4, [edi+8]
			subpd		xmm0, xmm1
			subpd		xmm2, xmm6
			add			edi, edx
			cvtss2sd	xmm5, [esi+4*ecx+8]
			shufpd		xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm3, xmm5
			cvtps2pd	xmm1, [edi]
			mulpd		xmm4, xmm5
			cvtps2pd	xmm6, [edi+8]
			subpd		xmm0, xmm3
			subpd		xmm2, xmm4
			add			edi, edx
			cvtss2sd	xmm7, [esi+4*ecx+12]
			shufpd		xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm1, xmm7
			add			ecx, 4
			mulpd		xmm6, xmm7
			cmp			ecx, eax
			subpd		xmm0, xmm1
			subpd		xmm2, xmm6
			jl			process4x4_1
		done4x4_1:		// process left over of the 4 rows
			cvtps2pd	xmm3, [edi]
			cvtps2pd	xmm4, [edi+8]
			cvtss2sd	xmm5, [esi+4*ecx]
			shufpd		xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm3, xmm5
			mulpd		xmm4, xmm5
			subpd		xmm0, xmm3
			subpd		xmm2, xmm4
			imul		ecx, edx
			sub			edi, ecx
			neg			eax

			add			eax, m
			sub			eax, 4
			movapd		xmm1, xmm0
			shufpd		xmm1, xmm1, R_SHUFFLEPD( 1, 1 )
			movapd		xmm3, xmm2
			shufpd		xmm3, xmm3, R_SHUFFLEPD( 1, 1 )
			sub			edi, edx
			cvtsd2ss	xmm7, xmm3
			movss		[esi-4], xmm7			// xptr[-1] = s3
			movsd		xmm4, xmm3
			movsd		xmm5, xmm3
			cvtss2sd	xmm7, [edi+8]
			mulsd		xmm3, xmm7				// lptr[-1*nc+2] * s3
			cvtss2sd	xmm7, [edi+4]
			mulsd		xmm4, xmm7				// lptr[-1*nc+1] * s3
			cvtss2sd	xmm7, [edi]
			mulsd		xmm5, xmm7				// lptr[-1*nc+0] * s3
			subsd		xmm2, xmm3
			cvtsd2ss	xmm7, xmm2
			movss		[esi-8], xmm7			// xptr[-2] = s2
			movsd		xmm6, xmm2
			sub			edi, edx
			subsd		xmm0, xmm5
			subsd		xmm1, xmm4
			cvtss2sd	xmm7, [edi+4]
			mulsd		xmm2, xmm7				// lptr[-2*nc+1] * s2
			cvtss2sd	xmm7, [edi]
			mulsd		xmm6, xmm7				// lptr[-2*nc+0] * s2
			subsd		xmm1, xmm2
			cvtsd2ss	xmm7, xmm1
			movss		[esi-12], xmm7			// xptr[-3] = s1
			subsd		xmm0, xmm6
			sub			edi, edx
			cmp			eax, 4
			cvtss2sd	xmm7, [edi]
			mulsd		xmm1, xmm7				// lptr[-3*nc+0] * s1
			subsd		xmm0, xmm1
			cvtsd2ss	xmm7, xmm0
			movss		[esi-16], xmm7			// xptr[-4] = s0
			jl			done4rows_1
			sub			edi, edx
			sub			edi, 16
			sub			esi, 16
			jmp			process4rows_1
		done4rows_1:
			pop			ebx
		}
	}
	else {
		lptr = L.ToFloatPtr() + m * L.GetNumColumns() + m - 4;
		xptr = x + m;
		__asm {
			push		ebx
			mov			eax, m					// eax = i
			mov			esi, xptr				// esi = xptr
			mov			edi, lptr				// edi = lptr
			mov			ebx, b					// ebx = b
			mov			edx, nc					// edx = nc*sizeof(float)
			shl			edx, 2
		process4rows:
			cvtps2pd	xmm0, [ebx+eax*4-16]	// load b[i-4], b[i-3] (comments were swapped in the release)
			cvtps2pd	xmm2, [ebx+eax*4-8]		// load b[i-2], b[i-1]
			sub			eax, m
			jz			done4x4
			neg			eax
			xor			ecx, ecx
		process4x4:		// process 4x4 blocks
			cvtps2pd	xmm3, [edi]
			cvtps2pd	xmm4, [edi+8]
			add			edi, edx
			cvtss2sd	xmm5, [esi+4*ecx+0]
			shufpd		xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm3, xmm5
			cvtps2pd	xmm1, [edi]
			mulpd		xmm4, xmm5
			cvtps2pd	xmm6, [edi+8]
			subpd		xmm0, xmm3
			subpd		xmm2, xmm4
			add			edi, edx
			cvtss2sd	xmm7, [esi+4*ecx+4]
			shufpd		xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm1, xmm7
			cvtps2pd	xmm3, [edi]
			mulpd		xmm6, xmm7
			cvtps2pd	xmm4, [edi+8]
			subpd		xmm0, xmm1
			subpd		xmm2, xmm6
			add			edi, edx
			cvtss2sd	xmm5, [esi+4*ecx+8]
			shufpd		xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm3, xmm5
			cvtps2pd	xmm1, [edi]
			mulpd		xmm4, xmm5
			cvtps2pd	xmm6, [edi+8]
			subpd		xmm0, xmm3
			subpd		xmm2, xmm4
			add			edi, edx
			cvtss2sd	xmm7, [esi+4*ecx+12]
			shufpd		xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
			mulpd		xmm1, xmm7
			add			ecx, 4
			mulpd		xmm6, xmm7
			cmp			ecx, eax
			subpd		xmm0, xmm1
			subpd		xmm2, xmm6
			jl			process4x4
			imul		ecx, edx
			sub			edi, ecx
			neg			eax
		done4x4:		// process left over of the 4 rows
			add			eax, m
			sub			eax, 4
			movapd		xmm1, xmm0
			shufpd		xmm1, xmm1, R_SHUFFLEPD( 1, 1 )
			movapd		xmm3, xmm2
			shufpd		xmm3, xmm3, R_SHUFFLEPD( 1, 1 )
			sub			edi, edx
			cvtsd2ss	xmm7, xmm3
			movss		[esi-4], xmm7			// xptr[-1] = s3
			movsd		xmm4, xmm3
			movsd		xmm5, xmm3
			cvtss2sd	xmm7, [edi+8]
			mulsd		xmm3, xmm7				// lptr[-1*nc+2] * s3
			cvtss2sd	xmm7, [edi+4]
			mulsd		xmm4, xmm7				// lptr[-1*nc+1] * s3
			cvtss2sd	xmm7, [edi]
			mulsd		xmm5, xmm7				// lptr[-1*nc+0] * s3
			subsd		xmm2, xmm3
			cvtsd2ss	xmm7, xmm2
			movss		[esi-8], xmm7			// xptr[-2] = s2
			movsd		xmm6, xmm2
			sub			edi, edx
			subsd		xmm0, xmm5
			subsd		xmm1, xmm4
			cvtss2sd	xmm7, [edi+4]
			mulsd		xmm2, xmm7				// lptr[-2*nc+1] * s2
			cvtss2sd	xmm7, [edi]
			mulsd		xmm6, xmm7				// lptr[-2*nc+0] * s2
			subsd		xmm1, xmm2
			cvtsd2ss	xmm7, xmm1
			movss		[esi-12], xmm7			// xptr[-3] = s1
			subsd		xmm0, xmm6
			sub			edi, edx
			cmp			eax, 4
			cvtss2sd	xmm7, [edi]
			mulsd		xmm1, xmm7				// lptr[-3*nc+0] * s1
			subsd		xmm0, xmm1
			cvtsd2ss	xmm7, xmm0
			movss		[esi-16], xmm7			// xptr[-4] = s0
			jl			done4rows
			sub			edi, edx
			sub			edi, 16
			sub			esi, 16
			jmp			process4rows
		done4rows:
			pop			ebx
		}
	}

	// process left over rows
	for ( i = (m&3)-1; i >= 0; i-- ) {
		s0 = b[i];
		lptr = L[i+1] + i;
		for ( j = i + 1; j < m; j++ ) {
			s0 -= lptr[0] * x[j];
			lptr += nc;
		}
		x[i] = s0;
	}
}
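
/*
	For reference, the scalar back substitution the block above implements;
	an illustrative sketch, not part of the original release:

	void MatX_LowerTriangularSolveTranspose_Ref( const idMatX &L, float *x, const float *b, const int n ) {
		const int nc = L.GetNumColumns();
		for ( int i = n - 1; i >= 0; i-- ) {
			double s = b[i];
			for ( int j = i + 1; j < n; j++ ) {
				// column i of L read top-down is row i of the transpose L'
				s -= L.ToFloatPtr()[j*nc+i] * x[j];
			}
			x[i] = (float) s;
		}
	}
*/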

#endif

/*
============
idSIMD_SSE2::MixedSoundToSamples
============
*/
void VPCALL idSIMD_SSE2::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {

	assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );

	__asm {

		mov			eax, numSamples
		mov			edi, mixBuffer
		mov			esi, samples
		shl			eax, 2
		add			edi, eax
		neg			eax

	loop16:

		movaps		xmm0, [edi+eax+0*16]
		movaps		xmm1, [edi+eax+1*16]
		movaps		xmm2, [edi+eax+2*16]
		movaps		xmm3, [edi+eax+3*16]

		add			esi, 4*4*2

		cvtps2dq	xmm4, xmm0
		cvtps2dq	xmm5, xmm1
		cvtps2dq	xmm6, xmm2
		cvtps2dq	xmm7, xmm3

		prefetchnta	[edi+eax+128]

		packssdw	xmm4, xmm5
		packssdw	xmm6, xmm7

		add			eax, 4*16

		movlps		[esi-4*4*2], xmm4		// FIXME: should not use movlps/movhps to move integer data
		movhps		[esi-3*4*2], xmm4
		movlps		[esi-2*4*2], xmm6
		movhps		[esi-1*4*2], xmm6

		jl			loop16
	}
}
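
/*
	For reference, a scalar version of the conversion above; an illustrative
	sketch, not part of the original release.  cvtps2dq rounds each float to a
	32-bit int and packssdw saturates the result to the signed 16-bit range:

	void MixedSoundToSamples_Ref( short *samples, const float *mixBuffer, const int numSamples ) {
		for ( int i = 0; i < numSamples; i++ ) {
			int v = (int) mixBuffer[i];			// note: cvtps2dq uses round-to-nearest, a plain cast truncates
			if ( v < -32768 ) v = -32768;		// saturate like packssdw
			if ( v >  32767 ) v =  32767;
			samples[i] = (short) v;
		}
	}
*/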

#endif /* _WIN32 */