doom3-gpl
Doom 3 GPL source release
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Simd_SSE.cpp
Go to the documentation of this file.
1 /*
2 ===========================================================================
3 
4 Doom 3 GPL Source Code
5 Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
6 
7 This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
8 
9 Doom 3 Source Code is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or
12 (at your option) any later version.
13 
14 Doom 3 Source Code is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18 
19 You should have received a copy of the GNU General Public License
20 along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
21 
22 In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
23 
24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
25 
26 ===========================================================================
27 */
28 
29 #include "../precompiled.h"
30 #pragma hdrstop
31 
32 #include "Simd_Generic.h"
33 #include "Simd_MMX.h"
34 #include "Simd_SSE.h"
35 
36 
37 //===============================================================
38 // M
39 // SSE implementation of idSIMDProcessor MrE
40 // E
41 //===============================================================
42 
43 
44 #if defined(MACOS_X) && defined(__i386__)
45 
46 #include <xmmintrin.h>
47 
// Byte size of idDrawVert and byte offsets of its members, hard-coded so the
// SIMD code below can address vertex fields directly; the functions that use
// them assert these against the real struct layout at runtime.
#define DRAWVERT_SIZE 60
#define DRAWVERT_XYZ_OFFSET (0*4)
#define DRAWVERT_ST_OFFSET (3*4)
#define DRAWVERT_NORMAL_OFFSET (5*4)
#define DRAWVERT_TANGENT0_OFFSET (8*4)
#define DRAWVERT_TANGENT1_OFFSET (11*4)
#define DRAWVERT_COLOR_OFFSET (14*4)

// SHUFFLEPS packs four 2-bit lane selectors into a shufps immediate;
// R_SHUFFLEPS takes the same arguments listed in reversed (low-lane-first) order.
#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
58 
59 /*
60 ============
61 idSIMD_SSE::GetName
62 ============
63 */
64 const char * idSIMD_SSE::GetName( void ) const {
65  return "MMX & SSE";
66 }
67 
68 /*
69 ============
70 idSIMD_SSE::Dot
71 
72  dst[i] = constant.Normal() * src[i].xyz + constant[3];
73 ============
74 */
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
	// Computes dst[i] = constant.Normal() * src[i].xyz + constant[3].
	//
	// Intrinsics transliteration of the original MSVC inline assembly; the asm
	// is kept in /* ... */ comments so the register mapping stays visible.
	// Four verts are processed per loop iteration: their xyz components (the
	// 4x3 element matrix below) are gathered and transposed into the lane
	// vectors xmm0 = x0..x3, xmm1 = y0..y3, xmm2 = z0..z3.
	// 0, 1, 2
	// 3, 4, 5
	// 6, 7, 8
	// 9, 10, 11

	/*
	mov eax, count
	mov edi, constant
	mov edx, eax
	mov esi, src
	mov ecx, dst
	*/
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // Declare 8 xmm registers.
	int count_l4 = count; // count_l4 = eax  (rounded down to a multiple of 4 below)
	int count_l1 = count; // count_l1 = edx  (remainder for the scalar tail)
	char *constant_p = (char *)&constant; // constant_p = edi
	char *src_p = (char *) src; // src_p = esi
	char *dst_p = (char *) dst; // dst_p = ecx

	// The byte offsets hard-coded below must match the real idDrawVert layout.
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

	/*
	and eax, ~3
	movss xmm4, [edi+0]
	shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
	movss xmm5, [edi+4]
	shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
	movss xmm6, [edi+8]
	shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
	movss xmm7, [edi+12]
	shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
	*/
	// Broadcast the plane equation into all four lanes:
	// xmm4 = Nx, xmm5 = Ny, xmm6 = Nz, xmm7 = D.
	count_l4 = count_l4 & ~3;
	xmm4 = _mm_load_ss((float *) (constant_p));
	xmm4 = _mm_shuffle_ps(xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ));
	xmm5 = _mm_load_ss((float *) (constant_p + 4));
	xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
	xmm6 = _mm_load_ss((float *) (constant_p + 8));
	xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
	xmm7 = _mm_load_ss((float *) (constant_p + 12));
	xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));

	/*
	jz startVert1
	*/
	if(count_l4 != 0) {
		/*
		imul eax, DRAWVERT_SIZE
		add esi, eax
		neg eax
		*/
		// Point src_p past the 4-aligned span and count up with a negative
		// byte offset, exactly like the asm loop counter in eax.
		count_l4 = count_l4 * DRAWVERT_SIZE;
		src_p = src_p + count_l4;
		count_l4 = -count_l4;
		/*
		loopVert4:
		*/
		do {
			/*
			movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X
			movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X
			movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1
			movaps xmm1, xmm0 // 3, X, 0, 1
			*/
			// Lane comments give the element indices from the 4x3 matrix above.
			xmm0 = _mm_load_ss((float *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 3, X, X, X
			xmm2 = _mm_load_ss((float *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8)); // 2, X, X, X
			xmm0 = _mm_loadh_pi(xmm0, (__m64 *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 3, X, 0, 1
			xmm1 = xmm0; // 3, X, 0, 1

			/*
			movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1
			shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5
			*/
			xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4)); // 4, 5, 0, 1
			xmm2 = _mm_shuffle_ps(xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )); // 2, X, 4, 5

			/*
			movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X
			movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7
			shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9
			*/
			xmm3 = _mm_load_ss((float *) (src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 9, X, X, X
			xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 9, X, 6, 7
			xmm0 = _mm_shuffle_ps(xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )); // 0, 3, 6, 9
			/*
			movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7
			shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10
			*/
			xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4)); // 10, 11, 6, 7
			xmm1 = _mm_shuffle_ps(xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )); // 1, 4, 7, 10
			/*
			movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X
			shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11
			*/
			xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8)); // 10, 11, 8, X
			xmm2 = _mm_shuffle_ps(xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )); // 2, 5, 8, 11

			/*
			add ecx, 16
			add eax, 4*DRAWVERT_SIZE
			*/
			// Advance pointers before the arithmetic, mirroring the asm schedule.
			dst_p = dst_p + 16;
			count_l4 = count_l4 + 4*DRAWVERT_SIZE;

			/*
			mulps xmm0, xmm4
			mulps xmm1, xmm5
			mulps xmm2, xmm6
			addps xmm0, xmm7
			addps xmm0, xmm1
			addps xmm0, xmm2
			*/
			// Four dot products at once: x*Nx + y*Ny + z*Nz + D.
			xmm0 = _mm_mul_ps(xmm0, xmm4);
			xmm1 = _mm_mul_ps(xmm1, xmm5);
			xmm2 = _mm_mul_ps(xmm2, xmm6);
			xmm0 = _mm_add_ps(xmm0, xmm7);
			xmm0 = _mm_add_ps(xmm0, xmm1);
			xmm0 = _mm_add_ps(xmm0, xmm2);

			/*
			movlps [ecx-16+0], xmm0
			movhps [ecx-16+8], xmm0
			jl loopVert4
			*/
			_mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
			_mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
		} while(count_l4 < 0);
	}

	/*
	startVert1:
	and edx, 3
	jz done
	*/
	// Scalar tail for the remaining 0-3 verts.  count_l4 is 0 here (either the
	// loop above counted it up to 0 or it started as 0), so it doubles as the
	// running byte offset into src.
	count_l1 = count_l1 & 3;
	if(count_l1 != 0) {
		/*
		loopVert1:
		movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
		movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
		movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
		mulss xmm0, xmm4
		mulss xmm1, xmm5
		mulss xmm2, xmm6
		addss xmm0, xmm7
		add ecx, 4
		addss xmm0, xmm1
		add eax, DRAWVERT_SIZE
		addss xmm0, xmm2
		dec edx
		movss [ecx-4], xmm0
		jnz loopVert1
		*/
		do {
			xmm0 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+0));
			xmm1 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+4));
			xmm2 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+8));
			xmm0 = _mm_mul_ss(xmm0, xmm4);
			xmm1 = _mm_mul_ss(xmm1, xmm5);
			xmm2 = _mm_mul_ss(xmm2, xmm6);
			xmm0 = _mm_add_ss(xmm0, xmm7);
			dst_p = dst_p + 4;
			xmm0 = _mm_add_ss(xmm0, xmm1);
			count_l4 = count_l4 + DRAWVERT_SIZE;
			xmm0 = _mm_add_ss(xmm0, xmm2);
			count_l1 = count_l1 - 1;
			_mm_store_ss((float *) (dst_p-4), xmm0);
		} while( count_l1 != 0);
	}
	/*
	done:
	*/
}
250 
251 /*
252 ============
253 idSIMD_SSE::MinMax
254 ============
255 */
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
	// Computes the axis-aligned bounds (min/max) of the indexed vertices.
	//
	// Intrinsics transliteration of the original MSVC inline assembly (kept in
	// comments).  Two independent min/max accumulator pairs are maintained so
	// four verts can be folded per iteration without shuffling each one:
	//   xmm0/xmm1 accumulate with lane layout ( z, -, x, y )
	//   xmm2/xmm3 accumulate with lane layout ( x, -, y, z )
	// The two layouts are reconciled by the shuffles after the loops.  The '-'
	// lane is the zero left behind by _mm_load_ss and never reaches the output.

	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
	char *indexes_p;
	char *src_p;
	int count_l;
	int edx;
	char *min_p;
	char *max_p;

	/*
	movss xmm0, idMath::INFINITY
	xorps xmm1, xmm1
	shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
	subps xmm1, xmm0
	movaps xmm2, xmm0
	movaps xmm3, xmm1
	*/
	// Seed the mins with +INFINITY and the maxs with -INFINITY (0 - INF).
	xmm0 = _mm_load_ss(&idMath::INFINITY);
	// To satisfy the compiler use xmm0 instead (x ^ x == 0 regardless of contents,
	// so this matches the asm's xorps xmm1,xmm1 without reading uninitialized xmm1).
	xmm1 = _mm_xor_ps(xmm0, xmm0);
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ));
	xmm1 = _mm_sub_ps(xmm1, xmm0);
	xmm2 = xmm0;
	xmm3 = xmm1;

	/*
	mov edi, indexes
	mov esi, src
	mov eax, count
	and eax, ~3
	jz done4
	*/
	indexes_p = (char *) indexes;
	src_p = (char *) src;
	count_l = count;
	count_l = count_l & ~3;
	if(count_l != 0) {
		/*
		shl eax, 2
		add edi, eax
		neg eax
		*/
		// Walk the index array with a negative byte offset counting up to zero.
		count_l = count_l << 2;
		indexes_p = indexes_p + count_l;
		count_l = -count_l;
		/*
		loop4:
	// prefetchnta [edi+128]
	// prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
		*/
		do {
			/*
			mov edx, [edi+eax+0]
			imul edx, DRAWVERT_SIZE
			movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
			movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
			minps xmm0, xmm4
			maxps xmm1, xmm4
			*/
			// Vert 0 of the group: gather xyz as ( z, -, x, y ) into xmm4.
			edx = *((int*)(indexes_p+count_l+0));
			edx = edx * DRAWVERT_SIZE;
			xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
			xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
			xmm0 = _mm_min_ps(xmm0, xmm4);
			xmm1 = _mm_max_ps(xmm1, xmm4);

			/*
			mov edx, [edi+eax+4]
			imul edx, DRAWVERT_SIZE
			movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
			movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
			minps xmm2, xmm5
			maxps xmm3, xmm5
			*/
			// Vert 1: gather as ( x, -, y, z ) into the second accumulator pair.
			edx = *((int*)(indexes_p+count_l+4));
			edx = edx * DRAWVERT_SIZE;
			xmm5 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
			xmm5 = _mm_loadh_pi(xmm5, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
			xmm2 = _mm_min_ps(xmm2, xmm5);
			xmm3 = _mm_max_ps(xmm3, xmm5);

			/*
			mov edx, [edi+eax+8]
			imul edx, DRAWVERT_SIZE
			movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
			movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
			minps xmm0, xmm6
			maxps xmm1, xmm6
			*/
			// Vert 2: same layout as vert 0.
			edx = *((int*)(indexes_p+count_l+8));
			edx = edx * DRAWVERT_SIZE;
			xmm6 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
			xmm6 = _mm_loadh_pi(xmm6, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
			xmm0 = _mm_min_ps(xmm0, xmm6);
			xmm1 = _mm_max_ps(xmm1, xmm6);

			/*
			mov edx, [edi+eax+12]
			imul edx, DRAWVERT_SIZE
			movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
			movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
			minps xmm2, xmm7
			maxps xmm3, xmm7
			*/
			// Vert 3: same layout as vert 1.
			edx = *((int*)(indexes_p+count_l+12));
			edx = edx * DRAWVERT_SIZE;
			xmm7 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
			xmm7 = _mm_loadh_pi(xmm7, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
			xmm2 = _mm_min_ps(xmm2, xmm7);
			xmm3 = _mm_max_ps(xmm3, xmm7);

			/*
			add eax, 4*4
			jl loop4
			*/
			count_l = count_l + 4*4;
		} while (count_l < 0);
	}
	/*
	done4:
	mov eax, count
	and eax, 3
	jz done1
	*/
	// Tail: the remaining 0-3 indexed verts, one at a time.  indexes_p already
	// points past the 4-aligned span processed above.
	count_l = count;
	count_l = count_l & 3;
	if(count_l != 0) {
		/*
		shl eax, 2
		add edi, eax
		neg eax
		*/
		count_l = count_l << 2;
		indexes_p = indexes_p + count_l;
		count_l = -count_l;
		/*
		loop1:
		*/
		do{
			/*
			mov edx, [edi+eax+0]
			imul edx, DRAWVERT_SIZE;
			movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
			movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
			minps xmm0, xmm4
			maxps xmm1, xmm4
			*/
			edx = *((int*)(indexes_p+count_l+0));
			edx = edx * DRAWVERT_SIZE;
			xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
			xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
			xmm0 = _mm_min_ps(xmm0, xmm4);
			xmm1 = _mm_max_ps(xmm1, xmm4);

			/*
			add eax, 4
			jl loop1
			*/
			count_l = count_l + 4;
		} while (count_l < 0);

	}

	/*
	done1:
	shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
	shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
	minps xmm0, xmm2
	maxps xmm1, xmm3
	mov esi, min
	movhps [esi], xmm0
	movss [esi+8], xmm0
	mov edi, max
	movhps [edi], xmm1
	movss [edi+8], xmm1
	*/
	// Rotate the ( x, -, y, z ) accumulators into the ( z, -, x, y ) layout,
	// merge the two pairs, then store: high quadword -> (x, y), low lane -> z.
	xmm2 = _mm_shuffle_ps(xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ));
	xmm3 = _mm_shuffle_ps(xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ));
	xmm0 = _mm_min_ps(xmm0, xmm2);
	xmm1 = _mm_max_ps(xmm1, xmm3);
	min_p = (char *) &min;
	_mm_storeh_pi((__m64 *)(min_p), xmm0);
	_mm_store_ss((float *)(min_p+8), xmm0);
	max_p = (char *) &max;
	_mm_storeh_pi((__m64 *)(max_p), xmm1);
	_mm_store_ss((float *)(max_p+8), xmm1);
}
447 
448 /*
449 ============
450 idSIMD_SSE::Dot
451 
452  dst[i] = constant * src[i].Normal() + src[i][3];
453 ============
454 */
455 void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
456  int count_l4;
457  int count_l1;
458  char *constant_p;
459  char *src_p;
460  char *dst_p;
461  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
462 
463  /*
464  mov eax, count
465  mov edi, constant
466  mov edx, eax
467  mov esi, src
468  mov ecx, dst
469  and eax, ~3
470  */
471  count_l4 = count;
472  constant_p = (char *) &constant;
473  count_l1 = count_l4;
474  src_p = (char *) src;
475  dst_p = (char *) dst;
476  count_l4 = count_l4 & ~3;
477 
478  /*
479  movss xmm5, [edi+0]
480  shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
481  movss xmm6, [edi+4]
482  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
483  movss xmm7, [edi+8]
484  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
485  */
486  xmm5 = _mm_load_ss((float *) (constant_p+0));
487  xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
488  xmm6 = _mm_load_ss((float *) (constant_p+4));
489  xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
490  xmm7 = _mm_load_ss((float *) (constant_p+8));
491  xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));
492 
493  /*
494  jz startVert1
495  */
496  if (count != 0) {
497  /*
498  imul eax, 16
499  add esi, eax
500  neg eax
501  */
502  count_l4 = count_l4 * 16;
503  src_p = src_p + count_l4;
504  count_l4 = -count_l4;
505  /*
506  loopVert4:
507  */
508  do {
509  /*
510  movlps xmm1, [esi+eax+ 0]
511  movlps xmm3, [esi+eax+ 8]
512  movhps xmm1, [esi+eax+16]
513  movhps xmm3, [esi+eax+24]
514  movlps xmm2, [esi+eax+32]
515  movlps xmm4, [esi+eax+40]
516  movhps xmm2, [esi+eax+48]
517  movhps xmm4, [esi+eax+56]
518  movaps xmm0, xmm1
519  shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
520  shufps xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
521  movaps xmm2, xmm3
522  shufps xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
523  shufps xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
524  */
525  xmm1 = _mm_loadl_pi(xmm1, (__m64 *)(src_p+count_l4+ 0));
526  xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+ 8));
527  xmm1 = _mm_loadh_pi(xmm1, (__m64 *)(src_p+count_l4+16));
528  xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+24));
529  xmm2 = _mm_loadl_pi(xmm2, (__m64 *)(src_p+count_l4+32));
530  xmm4 = _mm_loadl_pi(xmm4, (__m64 *)(src_p+count_l4+40));
531  xmm2 = _mm_loadh_pi(xmm2, (__m64 *)(src_p+count_l4+48));
532  xmm4 = _mm_loadh_pi(xmm4, (__m64 *)(src_p+count_l4+56));
533 
534  xmm0 = xmm1;
535  xmm0 = _mm_shuffle_ps(xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ));
536  xmm1 = _mm_shuffle_ps(xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ));
537  xmm2 = xmm3;
538  xmm2 = _mm_shuffle_ps(xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ));
539  xmm3 = _mm_shuffle_ps(xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ));
540 
541  /*
542  add ecx, 16
543  add eax, 4*16
544  */
545  dst_p = dst_p + 16;
546  count_l4 = count_l4 + 4*16;
547 
548  /*
549  mulps xmm0, xmm5
550  mulps xmm1, xmm6
551  mulps xmm2, xmm7
552  addps xmm0, xmm3
553  addps xmm0, xmm1
554  addps xmm0, xmm2
555  */
556  xmm0 = _mm_mul_ps(xmm0, xmm5);
557  xmm1 = _mm_mul_ps(xmm1, xmm6);
558  xmm2 = _mm_mul_ps(xmm2, xmm7);
559  xmm0 = _mm_add_ps(xmm0, xmm3);
560  xmm0 = _mm_add_ps(xmm0, xmm1);
561  xmm0 = _mm_add_ps(xmm0, xmm2);
562 
563  /*
564  movlps [ecx-16+0], xmm0
565  movhps [ecx-16+8], xmm0
566  jl loopVert4
567  */
568  _mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
569  _mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
570  } while (count_l4 < 0);
571  }
572 
573  /*
574  startVert1:
575  and edx, 3
576  jz done
577  */
578  count_l1 = count_l1 & 3;
579 
580  if(count_l1 != 0) {
581  /*
582  loopVert1:
583  */
584  do {
585  /*
586  movss xmm0, [esi+eax+0]
587  movss xmm1, [esi+eax+4]
588  movss xmm2, [esi+eax+8]
589  mulss xmm0, xmm5
590  mulss xmm1, xmm6
591  mulss xmm2, xmm7
592  addss xmm0, [esi+eax+12]
593  add ecx, 4
594  addss xmm0, xmm1
595  add eax, 16
596  addss xmm0, xmm2
597  dec edx
598  movss [ecx-4], xmm0
599  jnz loopVert1
600  */
601  xmm0 = _mm_load_ss((float *) (src_p+count_l4+ 0));
602  xmm1 = _mm_load_ss((float *) (src_p+count_l4+ 4));
603  xmm2 = _mm_load_ss((float *) (src_p+count_l4+ 8));
604  xmm3 = _mm_load_ss((float *) (src_p+count_l4+12));
605 
606  xmm0 = _mm_mul_ss(xmm0, xmm5);
607  xmm1 = _mm_mul_ss(xmm1, xmm6);
608  xmm2 = _mm_mul_ss(xmm2, xmm7);
609 
610  xmm0 = _mm_add_ss(xmm0, xmm3);
611  dst_p = dst_p + 4;
612  xmm0 = _mm_add_ss(xmm0, xmm1);
613  count_l4 = count_l4 + 16;
614  xmm0 = _mm_add_ss(xmm0, xmm2);
615  count_l1 = count_l1 - 1;
616  _mm_store_ss((float *) (dst_p-4), xmm0);
617  } while (count_l1 != 0);
618  }
619  /*
620  done:
621  */
622 }
623 
624 #elif defined(_WIN32)
625 
626 #include <xmmintrin.h>
627 
628 #define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
629 #define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
630 
631 // transpose a 4x4 matrix loaded into 4 xmm registers (reg4 is temporary)
// transpose a 4x4 matrix loaded into 4 xmm registers (reg4 is temporary)
// The lane comments list element indices of the source matrix (row-major 0..15).
#define TRANSPOSE_4x4( reg0, reg1, reg2, reg3, reg4 ) \
	__asm movaps reg4, reg2 /* reg4 = 8, 9, 10, 11 */ \
	__asm unpcklps reg2, reg3 /* reg2 = 8, 12, 9, 13 */ \
	__asm unpckhps reg4, reg3 /* reg4 = 10, 14, 11, 15 */ \
	__asm movaps reg3, reg0 /* reg3 = 0, 1, 2, 3 */ \
	__asm unpcklps reg0, reg1 /* reg0 = 0, 4, 1, 5 */ \
	__asm unpckhps reg3, reg1 /* reg3 = 2, 6, 3, 7 */ \
	__asm movaps reg1, reg0 /* reg1 = 0, 4, 1, 5 */ \
	__asm shufps reg0, reg2, R_SHUFFLEPS( 0, 1, 0, 1 ) /* reg0 = 0, 4, 8, 12 */ \
	__asm shufps reg1, reg2, R_SHUFFLEPS( 2, 3, 2, 3 ) /* reg1 = 1, 5, 9, 13 */ \
	__asm movaps reg2, reg3 /* reg2 = 2, 6, 3, 7 */ \
	__asm shufps reg2, reg4, R_SHUFFLEPS( 0, 1, 0, 1 ) /* reg2 = 2, 6, 10, 14 */ \
	__asm shufps reg3, reg4, R_SHUFFLEPS( 2, 3, 2, 3 ) /* reg3 = 3, 7, 11, 15 */

// transpose a 4x4 matrix from memory into 4 xmm registers (reg4 is temporary)
// NOTE: the 'TRANPOSE' spelling is the original name — do not "fix" it, callers use it.
#define TRANPOSE_4x4_FROM_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \
	__asm movlps reg1, [address+ 0] /* reg1 = 0, 1, X, X */ \
	__asm movlps reg3, [address+ 8] /* reg3 = 2, 3, X, X */ \
	__asm movhps reg1, [address+16] /* reg1 = 0, 1, 4, 5 */ \
	__asm movhps reg3, [address+24] /* reg3 = 2, 3, 6, 7 */ \
	__asm movlps reg2, [address+32] /* reg2 = 8, 9, X, X */ \
	__asm movlps reg4, [address+40] /* reg4 = 10, 11, X, X */ \
	__asm movhps reg2, [address+48] /* reg2 = 8, 9, 12, 13 */ \
	__asm movhps reg4, [address+56] /* reg4 = 10, 11, 14, 15 */ \
	__asm movaps reg0, reg1 /* reg0 = 0, 1, 4, 5 */ \
	__asm shufps reg0, reg2, R_SHUFFLEPS( 0, 2, 0, 2 ) /* reg0 = 0, 4, 8, 12 */ \
	__asm shufps reg1, reg2, R_SHUFFLEPS( 1, 3, 1, 3 ) /* reg1 = 1, 5, 9, 13 */ \
	__asm movaps reg2, reg3 /* reg2 = 2, 3, 6, 7 */ \
	__asm shufps reg2, reg4, R_SHUFFLEPS( 0, 2, 0, 2 ) /* reg2 = 2, 6, 10, 14 */ \
	__asm shufps reg3, reg4, R_SHUFFLEPS( 1, 3, 1, 3 ) /* reg3 = 3, 7, 11, 15 */

// transpose a 4x4 matrix to memory from 4 xmm registers (reg4 is temporary)
#define TRANPOSE_4x4_TO_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \
	__asm movaps reg4, reg0 /* reg4 = 0, 4, 8, 12 */ \
	__asm unpcklps reg0, reg1 /* reg0 = 0, 1, 4, 5 */ \
	__asm unpckhps reg4, reg1 /* reg4 = 8, 9, 12, 13 */ \
	__asm movaps reg1, reg2 /* reg1 = 2, 6, 10, 14 */ \
	__asm unpcklps reg2, reg3 /* reg2 = 2, 3, 6, 7 */ \
	__asm unpckhps reg1, reg3 /* reg1 = 10, 11, 14, 15 */ \
	__asm movlps [address+ 0], reg0 /* mem0 = 0, 1, X, X */ \
	__asm movlps [address+ 8], reg2 /* mem0 = 0, 1, 2, 3 */ \
	__asm movhps [address+16], reg0 /* mem1 = 4, 5, X, X */ \
	__asm movhps [address+24], reg2 /* mem1 = 4, 5, 6, 7 */ \
	__asm movlps [address+32], reg4 /* mem2 = 8, 9, X, X */ \
	__asm movlps [address+40], reg1 /* mem2 = 8, 9, 10, 11 */ \
	__asm movhps [address+48], reg4 /* mem3 = 12, 13, X, X */ \
	__asm movhps [address+56], reg1 /* mem3 = 12, 13, 14, 15 */

// transpose a 4x3 matrix loaded into 3 xmm registers (reg3 is temporary)
#define TRANSPOSE_4x3( reg0, reg1, reg2, reg3 ) \
	__asm movaps reg3, reg2 /* reg3 = 8, 9, 10, 11 */ \
	__asm shufps reg3, reg1, R_SHUFFLEPS( 2, 3, 0, 1 ) /* reg3 = 10, 11, 4, 5 */ \
	__asm shufps reg2, reg0, R_SHUFFLEPS( 0, 1, 2, 3 ) /* reg2 = 8, 9, 2, 3 */ \
	__asm shufps reg1, reg0, R_SHUFFLEPS( 2, 3, 0, 1 ) /* reg1 = 6, 7, 0, 1 */ \
	__asm movaps reg0, reg1 /* reg0 = 6, 7, 0, 1 */ \
	__asm shufps reg0, reg2, R_SHUFFLEPS( 2, 0, 3, 1 ) /* reg0 = 0, 6, 3, 9 */ \
	__asm shufps reg1, reg3, R_SHUFFLEPS( 3, 1, 2, 0 ) /* reg1 = 1, 7, 4, 10 */ \
	__asm shufps reg2, reg3, R_SHUFFLEPS( 2, 0, 3, 1 ) /* reg2 = 2, 8, 5, 11 */

// transpose a 4x3 matrix from memory into 3 xmm registers (reg3 is temporary)
#define TRANSPOSE_4x3_FROM_MEMORY( address, reg0, reg1, reg2, reg3 ) \
	__asm movlps reg1, [address+ 0] /* reg1 = 0, 1, X, X */ \
	__asm movlps reg2, [address+ 8] /* reg2 = 2, 3, X, X */ \
	__asm movlps reg3, [address+16] /* reg3 = 4, 5, X, X */ \
	__asm movhps reg1, [address+24] /* reg1 = 0, 1, 6, 7 */ \
	__asm movhps reg2, [address+32] /* reg2 = 2, 3, 8, 9 */ \
	__asm movhps reg3, [address+40] /* reg3 = 4, 5, 10, 11 */ \
	__asm movaps reg0, reg1 /* reg0 = 0, 1, 6, 7 */ \
	__asm shufps reg0, reg2, R_SHUFFLEPS( 0, 2, 1, 3 ) /* reg0 = 0, 6, 3, 9 */ \
	__asm shufps reg1, reg3, R_SHUFFLEPS( 1, 3, 0, 2 ) /* reg1 = 1, 7, 4, 10 */ \
	__asm shufps reg2, reg3, R_SHUFFLEPS( 0, 2, 1, 3 ) /* reg2 = 2, 8, 5, 11 */

// transpose a 4x3 matrix to memory from 3 xmm registers (reg3 is temporary)
#define TRANSPOSE_4x3_TO_MEMORY( address, reg0, reg1, reg2, reg3 ) \
	__asm movhlps reg3, reg0 /* reg3 = 3, 9, X, X */ \
	__asm unpcklps reg0, reg1 /* reg0 = 0, 1, 6, 7 */ \
	__asm unpckhps reg1, reg2 /* reg1 = 4, 5, 10, 11 */ \
	__asm unpcklps reg2, reg3 /* reg2 = 2, 3, 8, 9 */ \
	__asm movlps [address+ 0], reg0 /* mem0 = 0, 1, X, X */ \
	__asm movlps [address+ 8], reg2 /* mem0 = 0, 1, 2, 3 */ \
	__asm movlps [address+16], reg1 /* mem1 = 4, 5, X, X */ \
	__asm movhps [address+24], reg0 /* mem1 = 4, 5, 6, 7 */ \
	__asm movhps [address+32], reg2 /* mem2 = 8, 9, X, X */ \
	__asm movhps [address+40], reg1 /* mem2 = 8, 9, 10, 11 */
716 
717 
718 // with alignment
// with alignment
// The KFLOATINIT* macros set up registers for the K* float-array kernels that
// follow.  From the code: PRE receives the number of leading elements needed
// to reach 16-byte alignment of DST (ecx = (-DST/4) & 3, clamped to COUNT),
// POST receives the remaining count modulo 8, and ebx becomes a negative byte
// counter for the unrolled loop.  On exit edx/esi point at the (biased)
// sources, edi at the destination, and eax = edx|esi so the caller can test
// source alignment with a single 'and eax,15'.
#define KFLOATINITS( SRC0, COUNT, PRE, POST ) KFLOATINITDSS( SRC0,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITD( DST, COUNT, PRE, POST ) KFLOATINITDSS( DST,DST,DST,COUNT,PRE,POST )
#define KFLOATINITDS( DST, SRC0, COUNT, PRE, POST ) KFLOATINITDSS( DST,SRC0,SRC0,COUNT,PRE,POST )

#define KFLOATINITDSS( DST, SRC0, SRC1, COUNT, PRE, POST )\
	__asm mov ecx,DST \
	__asm shr ecx,2 \
	__asm mov ebx,COUNT \
	__asm neg ecx \
	__asm mov edx,SRC0 \
	__asm and ecx,3 \
	__asm mov esi,SRC1 \
	__asm sub ebx,ecx \
	__asm jge noUnderFlow \
	__asm xor ebx,ebx \
	__asm mov ecx,COUNT \
	__asm noUnderFlow: \
	__asm mov PRE,ecx \
	__asm mov eax,ebx \
	__asm mov edi,DST \
	__asm and eax,8-1 \
	__asm mov POST,eax \
	__asm and ebx,0xfffffff8 \
	__asm jle done \
	__asm shl ebx,2 \
	__asm lea ecx,[ecx*4+ebx] \
	__asm neg ebx \
	__asm add edx,ecx \
	__asm add esi,ecx \
	__asm add edi,ecx \
	__asm mov eax,edx \
	__asm or eax,esi

// without alignment (pre==0)
// Same contract as above but never peels leading elements: PRE is forced to 0
// and the whole COUNT & ~7 span is processed by the unrolled loop.
#define KFLOATINITS_NA( SRC0, COUNT, PRE, POST ) KFLOATINITDSS_NA( SRC0,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITD_NA( DST, COUNT, PRE, POST ) KFLOATINITDSS_NA( DST,DST,DST,COUNT,PRE,POST )
#define KFLOATINITDS_NA( DST, SRC0, COUNT, PRE, POST ) KFLOATINITDSS_NA( DST,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITDSS_NA( DST, SRC0, SRC1, COUNT, PRE, POST )\
	__asm mov eax,COUNT \
	__asm mov PRE,0 \
	__asm and eax,8-1 \
	__asm mov ebx,COUNT \
	__asm mov POST,eax \
	__asm and ebx,0xfffffff8 \
	__asm je done \
	__asm shl ebx,2 \
	__asm mov edx,SRC0 \
	__asm mov esi,SRC1 \
	__asm mov edi,DST \
	__asm add edx,ebx \
	__asm add esi,ebx \
	__asm add edi,ebx \
	__asm mov eax,edx \
	__asm or eax,esi \
	__asm or eax,edi \
	__asm neg ebx \

/*
	when OPER is called:
	edx = s0
	esi = s1
	edi = d
	ebx = index*4

	xmm0 & xmm1 must not be trashed
*/
// Single-element (DS1) and four-element (DS4) move/min/max primitives used by
// KFLOATOPER below; xmm2/xmm3 are scratch, xmm0/xmm1 are reserved for the caller.
#define KMOVDS1( DST, SRC0 ) \
	__asm movss xmm2,SRC0 \
	__asm movss DST,xmm2
#define KMOVDS4( DST, SRC0 ) \
	__asm movups xmm2,SRC0 \
	__asm movups DST,xmm2
#define KMINDS1( DST, SRC0 ) \
	__asm movss xmm2,SRC0 \
	__asm minss DST,xmm2
#define KMAXDS1( DST, SRC0 ) \
	__asm movss xmm2,SRC0 \
	__asm maxss DST,xmm2

// general ALU operation: OP is the mnemonic stem (add/sub/mul), pasted with
// the ss/ps suffix via token concatenation.
#define KALUDSS1( OP, DST, SRC0, SRC1 ) \
	__asm movss xmm2,SRC0 \
	__asm OP##ss xmm2,SRC1 \
	__asm movss DST,xmm2
#define KALUDSS4( OP, DST, SRC0, SRC1 ) \
	__asm movups xmm2,SRC0 \
	__asm movups xmm3,SRC1 \
	__asm OP##ps xmm2,xmm3 \
	__asm movups DST,xmm2

#define KADDDSS1( DST, SRC0, SRC1 ) KALUDSS1( add, DST,SRC0,SRC1 )
#define KADDDSS4( DST, SRC0, SRC1 ) KALUDSS4( add, DST,SRC0,SRC1 )
#define KSUBDSS1( DST, SRC0, SRC1 ) KALUDSS1( sub, DST,SRC0,SRC1 )
#define KSUBDSS4( DST, SRC0, SRC1 ) KALUDSS4( sub, DST,SRC0,SRC1 )
#define KMULDSS1( DST, SRC0, SRC1 ) KALUDSS1( mul, DST,SRC0,SRC1 )
#define KMULDSS4( DST, SRC0, SRC1 ) KALUDSS4( mul, DST,SRC0,SRC1 )

// Division via reciprocal estimate (rcpss/rcpps) refined with one
// Newton-Raphson step: r' = 2r - x*r*r, then DST = SRC0 * r'.
#define KDIVDSS1( DST, SRC0, SRC1 ) \
	__asm movss xmm2,SRC1 \
	__asm rcpss xmm3,xmm2 \
	__asm mulss xmm2,xmm3 \
	__asm mulss xmm2,xmm3 \
	__asm addss xmm3,xmm3 \
	__asm subss xmm3,xmm2 \
	__asm mulss xmm3,SRC0 \
	__asm movss DST,xmm3
#define KDIVDSS4( DST, SRC0, SRC1 ) \
	__asm movups xmm2,SRC1 \
	__asm rcpps xmm3,xmm2 \
	__asm mulps xmm2,xmm3 \
	__asm mulps xmm2,xmm3 \
	__asm addps xmm3,xmm3 \
	__asm subps xmm3,xmm2 \
	__asm movups xmm2,SRC0 \
	__asm mulps xmm3,xmm2 \
	__asm movups DST,xmm3
// float -> int truncation through MMX (cvttps2pi writes an mm register;
// callers are responsible for any needed emms).
#define KF2IDS1( SRC0 ) \
	__asm movss xmm2,SRC0 \
	__asm cvttps2pi mm2,xmm2 \
	__asm movd [edi+ebx],mm2
#define KF2IDS4( SRC0 ) \
	__asm movups xmm2,SRC0 \
	__asm cvttps2pi mm2,xmm2 \
	__asm movq [edi+ebx+0],mm2 \
	__asm shufps xmm2,xmm2,SHUFFLEPS(1,0,3,2) \
	__asm cvttps2pi mm2,xmm2 \
	__asm movq [edi+ebx+8],mm2
// Inverse square root via rsqrt estimate plus a Newton-Raphson style
// refinement using constants preloaded in xmm0/xmm1 by the caller
// (presumably 1.5 and 0.5*x factors — verify against the call sites).
#define KISQRTDS1( DST,SRC0 ) \
	__asm movss xmm2,SRC0 \
	__asm rsqrtss xmm3,xmm2 \
	__asm mulss xmm2,xmm3 \
	__asm mulss xmm2,xmm3 \
	__asm subss xmm2,xmm1 \
	__asm mulss xmm3,xmm0 \
	__asm mulss xmm3,xmm2 \
	__asm movss DST,xmm3
#define KISQRTDS4( DST,SRC0 ) \
	__asm movups xmm2,SRC0 \
	__asm rsqrtps xmm3,xmm2 \
	__asm mulps xmm2,xmm3 \
	__asm mulps xmm2,xmm3 \
	__asm subps xmm2,xmm1 \
	__asm mulps xmm3,xmm0 \
	__asm mulps xmm3,xmm2 \
	__asm movups DST,xmm3

// this is used in vector4 implementation to shift constant V4
#define KANDREGDSV( DST, SRC0, VALUE ) \
	__asm mov DST,SRC0 \
	__asm and DST,VALUE

// this is used in vector4 code to operate with float arrays as sources
#define KEXPANDFLOAT( DST, SRC ) \
	__asm movss DST,SRC \
	__asm shufps DST,DST,0

// In-place (DST op= SRC) variants of the three-operand macros above.
#define KADDDS1( DST,SRC ) KADDDSS1( DST,DST,SRC )
#define KADDDS4( DST,SRC ) KADDDSS4( DST,DST,SRC )
#define KSUBDS1( DST,SRC ) KSUBDSS1( DST,DST,SRC )
#define KSUBDS4( DST,SRC ) KSUBDSS4( DST,DST,SRC )
#define KMULDS1( DST,SRC ) KMULDSS1( DST,DST,SRC )
#define KMULDS4( DST,SRC ) KMULDSS4( DST,DST,SRC )
#define KDIVDS1( DST,SRC ) KDIVDSS1( DST,DST,SRC )
#define KDIVDS4( DST,SRC ) KDIVDSS4( DST,DST,SRC )
883 
// handles pre & post leftovers
// Runs OPER once per 'pre' leading element (alignment peel), then OPER4 for a
// remaining group of 4 and OPER for the final singles ('post' trailing count).
// Expects the 'pre'/'post' locals written by a KFLOATINIT* macro to be in scope.
#define KFLOATOPER( OPER, OPER4, COUNT ) \
	__asm mov ecx,pre \
	__asm mov ebx,COUNT \
	__asm cmp ebx,ecx \
	__asm cmovl ecx,COUNT \
	__asm test ecx,ecx \
	__asm je preDone \
	__asm xor ebx,ebx \
	__asm lpPre: \
	OPER \
	__asm add ebx,4 \
	__asm dec ecx \
	__asm jg lpPre \
	__asm preDone: \
	__asm mov ecx,post \
	__asm mov ebx,COUNT \
	__asm sub ebx,ecx \
	__asm shl ebx,2 \
	__asm cmp ecx,4 \
	__asm jl post4Done \
	OPER4 \
	__asm sub ecx,4 \
	__asm add ebx,4*4 \
	__asm post4Done: \
	__asm test ecx,ecx \
	__asm je postDone \
	__asm lpPost: \
	OPER \
	__asm add ebx,4 \
	__asm dec ecx \
	__asm jg lpPost \
	__asm postDone:

// operate on a constant and a float array
// Broadcasts CONSTANT into xmm0, then runs an unrolled 8-float loop with an
// aligned fast path (lpA, movaps) and an unaligned path (lpNA, movups) chosen
// by the source-alignment test left in eax by KFLOATINITDS; leftovers are
// handled by KFLOATOPER.
#define KFLOAT_CA( ALUOP, DST, SRC, CONSTANT, COUNT ) \
	int pre,post; \
	__asm movss xmm0,CONSTANT \
	__asm shufps xmm0,xmm0,0 \
	KFLOATINITDS( DST, SRC, COUNT, pre, post ) \
	__asm and eax,15 \
	__asm jne lpNA \
	__asm jmp lpA \
	__asm align 16 \
	__asm lpA: \
	__asm prefetchnta [edx+ebx+64] \
	__asm movaps xmm1,xmm0 \
	__asm movaps xmm2,xmm0 \
	__asm ALUOP##ps xmm1,[edx+ebx] \
	__asm ALUOP##ps xmm2,[edx+ebx+16] \
	__asm movaps [edi+ebx],xmm1 \
	__asm movaps [edi+ebx+16],xmm2 \
	__asm add ebx,16*2 \
	__asm jl lpA \
	__asm jmp done \
	__asm align 16 \
	__asm lpNA: \
	__asm prefetchnta [edx+ebx+64] \
	__asm movaps xmm1,xmm0 \
	__asm movaps xmm2,xmm0 \
	__asm movups xmm3,[edx+ebx] \
	__asm movups xmm4,[edx+ebx+16] \
	__asm ALUOP##ps xmm1,xmm3 \
	__asm ALUOP##ps xmm2,xmm4 \
	__asm movaps [edi+ebx],xmm1 \
	__asm movaps [edi+ebx+16],xmm2 \
	__asm add ebx,16*2 \
	__asm jl lpNA \
	__asm done: \
	__asm mov edx,SRC \
	__asm mov edi,DST \
	__asm KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), \
	__asm KALUDSS4( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), COUNT )
957 
// operate on two float arrays
// Streams the aligned middle section of SRC0 <op> SRC1 -> DST eight floats
// per iteration (lpA when both sources are 16-byte aligned, lpNA otherwise);
// leftover elements on either side are handled by KFLOATOPER.
#define KFLOAT_AA( ALUOP, DST, SRC0, SRC1, COUNT ) \
	int	pre,post; \
	KFLOATINITDSS( DST, SRC0, SRC1, COUNT, pre, post ) \
	__asm		and		eax,15 \
	__asm		jne		lpNA \
	__asm		jmp		lpA \
	__asm		align	16 \
	__asm	lpA: \
	__asm		movaps	xmm1,[edx+ebx] \
	__asm		movaps	xmm2,[edx+ebx+16] \
	__asm		ALUOP##ps	xmm1,[esi+ebx] \
	__asm		ALUOP##ps	xmm2,[esi+ebx+16] \
	__asm		prefetchnta	[edx+ebx+64] \
	__asm		prefetchnta	[esi+ebx+64] \
	__asm		movaps	[edi+ebx],xmm1 \
	__asm		movaps	[edi+ebx+16],xmm2 \
	__asm		add		ebx,16*2 \
	__asm		jl		lpA \
	__asm		jmp		done \
	__asm		align	16 \
	__asm	lpNA: \
	__asm		movups	xmm1,[edx+ebx] \
	__asm		movups	xmm2,[edx+ebx+16] \
	__asm		movups	xmm3,[esi+ebx] \
	__asm		movups	xmm4,[esi+ebx+16] \
	__asm		prefetchnta	[edx+ebx+64] \
	__asm		prefetchnta	[esi+ebx+64] \
	__asm		ALUOP##ps	xmm1,xmm3 \
	__asm		ALUOP##ps	xmm2,xmm4 \
	__asm		movaps	[edi+ebx],xmm1 \
	__asm		movaps	[edi+ebx+16],xmm2 \
	__asm		add		ebx,16*2 \
	__asm		jl		lpNA \
	__asm	done: \
	__asm		mov		edx,SRC0 \
	__asm		mov		esi,SRC1 \
	__asm		mov		edi,DST \
	KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), \
		KALUDSS4( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), COUNT )
998 
999 
// Byte sizes / byte offsets hard-coded for use from inline assembly.
// NOTE(review): these must mirror the idDrawVert / idJointQuat / idJointMat
// layouts declared elsewhere in the project — verify if those structs change.
#define DRAWVERT_SIZE				60
#define DRAWVERT_XYZ_OFFSET			(0*4)
#define DRAWVERT_ST_OFFSET			(3*4)
#define DRAWVERT_NORMAL_OFFSET		(5*4)
#define DRAWVERT_TANGENT0_OFFSET	(8*4)
#define DRAWVERT_TANGENT1_OFFSET	(11*4)
#define DRAWVERT_COLOR_OFFSET		(14*4)

#define JOINTQUAT_SIZE				(7*4)
#define JOINTMAT_SIZE				(4*3*4)
#define JOINTWEIGHT_SIZE			(4*4)
1011 
1012 
// 16-byte-aligned constant tables referenced directly from the inline
// assembly below (movaps/andps/etc. require 16-byte alignment).
#define ALIGN4_INIT1( X, INIT )				ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT }
#define ALIGN4_INIT4( X, I0, I1, I2, I3 )	ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
#define ALIGN8_INIT1( X, INIT )				ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }

// packed 16-bit integer constants
ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );

// per-case byte-shuffle controls used by the matrix-to-quaternion conversion
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle0, (3<<0)|(2<<8)|(1<<16)|(0<<24) );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle1, (0<<0)|(1<<8)|(2<<16)|(3<<24) );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle2, (1<<0)|(0<<8)|(3<<16)|(2<<24) );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle3, (2<<0)|(3<<8)|(0<<16)|(1<<24) );

// single-precision bit masks (sign bit, absolute value, exponent field)
ALIGN4_INIT4( unsigned long SIMD_SP_singleSignBitMask, (unsigned long) ( 1 << 31 ), 0, 0, 0 );
ALIGN4_INIT1( unsigned long SIMD_SP_signBitMask, (unsigned long) ( 1 << 31 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_absMask, (unsigned long) ~( 1 << 31 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_infinityMask, (unsigned long) ~( 1 << 23 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_not, 0xFFFFFFFF );

// broadcast floating-point constants
ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
ALIGN4_INIT1( float SIMD_SP_half, 0.5f );
ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY );
ALIGN4_INIT4( float SIMD_SP_lastOne, 0.0f, 0.0f, 0.0f, 1.0f );

// Newton-Raphson refinement constants for rsqrtss/rsqrtps
ALIGN4_INIT1( float SIMD_SP_rsqrt_c0, 3.0f );
ALIGN4_INIT1( float SIMD_SP_rsqrt_c1, -0.5f );
ALIGN4_INIT1( float SIMD_SP_mat2quat_rsqrt_c1, -0.5f*0.5f );

// minimax polynomial coefficients for sin() on [-HALF_PI, HALF_PI]
ALIGN4_INIT1( float SIMD_SP_sin_c0, -2.39e-08f );
ALIGN4_INIT1( float SIMD_SP_sin_c1, 2.7526e-06f );
ALIGN4_INIT1( float SIMD_SP_sin_c2, -1.98409e-04f );
ALIGN4_INIT1( float SIMD_SP_sin_c3, 8.3333315e-03f );
ALIGN4_INIT1( float SIMD_SP_sin_c4, -1.666666664e-01f );

// minimax polynomial coefficients for cos() on [-HALF_PI, HALF_PI]
ALIGN4_INIT1( float SIMD_SP_cos_c0, -2.605e-07f );
ALIGN4_INIT1( float SIMD_SP_cos_c1, 2.47609e-05f );
ALIGN4_INIT1( float SIMD_SP_cos_c2, -1.3888397e-03f );
ALIGN4_INIT1( float SIMD_SP_cos_c3, 4.16666418e-02f );
ALIGN4_INIT1( float SIMD_SP_cos_c4, -4.999999963e-01f );

// minimax polynomial coefficients for atan() with |argument| <= 1
ALIGN4_INIT1( float SIMD_SP_atan_c0, 0.0028662257f );
ALIGN4_INIT1( float SIMD_SP_atan_c1, -0.0161657367f );
ALIGN4_INIT1( float SIMD_SP_atan_c2, 0.0429096138f );
ALIGN4_INIT1( float SIMD_SP_atan_c3, -0.0752896400f );
ALIGN4_INIT1( float SIMD_SP_atan_c4, 0.1065626393f );
ALIGN4_INIT1( float SIMD_SP_atan_c5, -0.1420889944f );
ALIGN4_INIT1( float SIMD_SP_atan_c6, 0.1999355085f );
ALIGN4_INIT1( float SIMD_SP_atan_c7, -0.3333314528f );
1070 
1071 /*
1072 ============
1073 SSE_InvSqrt
1074 ============
1075 */
float SSE_InvSqrt( float x ) {
	float y;

	// One Newton-Raphson refinement of the rsqrtss estimate:
	// y = r * -0.5f * ( x * r * r - 3.0f ), with r = rsqrtss( x )
	__asm {
		movss		xmm0, x
		rsqrtss		xmm1, xmm0				// xmm1 = r ~= 1 / sqrt( x )
		mulss		xmm0, xmm1				// xmm0 = x * r
		mulss		xmm0, xmm1				// xmm0 = x * r * r
		subss		xmm0, SIMD_SP_rsqrt_c0	// xmm0 = x * r * r - 3.0f
		mulss		xmm1, SIMD_SP_rsqrt_c1	// xmm1 = r * -0.5f
		mulss		xmm0, xmm1				// xmm0 = refined 1 / sqrt( x )
		movss		y, xmm0
	}
	return y;
}
1091 
1092 /*
1093 ============
1094 SSE_InvSqrt4
1095 ============
1096 */
void SSE_InvSqrt4( float x[4] ) {
	// In-place reciprocal square root of four floats; x must be 16-byte
	// aligned (movaps). Same Newton-Raphson refinement as SSE_InvSqrt.
	__asm {
		mov			edi, x
		movaps		xmm0, [edi]
		rsqrtps		xmm1, xmm0				// xmm1 = r ~= 1 / sqrt( x )
		mulps		xmm0, xmm1				// xmm0 = x * r
		mulps		xmm0, xmm1				// xmm0 = x * r * r
		subps		xmm0, SIMD_SP_rsqrt_c0	// xmm0 = x * r * r - 3.0f
		mulps		xmm1, SIMD_SP_rsqrt_c1	// xmm1 = r * -0.5f
		mulps		xmm0, xmm1				// xmm0 = refined 1 / sqrt( x )
		movaps		[edi], xmm0
	}
}
1110 
1111 /*
1112 ============
1113 SSE_SinZeroHalfPI
1114 
1115  The angle must be between zero and half PI.
1116 ============
1117 */
float SSE_SinZeroHalfPI( float a ) {
#if 1

	float t;

	assert( a >= 0.0f && a <= idMath::HALF_PI );

	// Horner evaluation of the odd minimax polynomial:
	// sin(a) ~= a * ( 1 + s*(c4 + s*(c3 + s*(c2 + s*(c1 + s*c0)))) ), s = a*a
	__asm {
		movss		xmm0, a
		movss		xmm1, xmm0
		mulss		xmm1, xmm1				// xmm1 = a * a
		movss		xmm2, SIMD_SP_sin_c0
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_sin_c1
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_sin_c2
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_sin_c3
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_sin_c4
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_one
		mulss		xmm2, xmm0				// final multiply by a
		movss		t, xmm2
	}

	return t;

#else

	// reference C implementation of the same polynomial
	float s, t;

	assert( a >= 0.0f && a <= idMath::HALF_PI );

	s = a * a;
	t = -2.39e-08f;
	t *= s;
	t += 2.7526e-06f;
	t *= s;
	t += -1.98409e-04f;
	t *= s;
	t += 8.3333315e-03f;
	t *= s;
	t += -1.666666664e-01f;
	t *= s;
	t += 1.0f;
	t *= a;

	return t;

#endif
}
1170 
1171 /*
1172 ============
1173 SSE_Sin4ZeroHalfPI
1174 
1175  The angle must be between zero and half PI.
1176 ============
1177 */
void SSE_Sin4ZeroHalfPI( float a[4], float s[4] ) {
	// Four-wide version of SSE_SinZeroHalfPI; a and s must be 16-byte
	// aligned (movaps). All four angles must be in [0, HALF_PI].
	__asm {
		mov			edi, a
		mov			esi, s
		movaps		xmm0, [edi]
		movaps		xmm1, xmm0
		mulps		xmm1, xmm1				// xmm1 = a * a
		movaps		xmm2, SIMD_SP_sin_c0	// Horner evaluation, see SSE_SinZeroHalfPI
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_sin_c1
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_sin_c2
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_sin_c3
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_sin_c4
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_one
		mulps		xmm2, xmm0
		movaps		[esi], xmm2
	}
}
1200 
1201 /*
1202 ============
1203 SSE_Sin
1204 ============
1205 */
float SSE_Sin( float a ) {
#if 1

	float t;

	__asm {
		// reduce a into [0, TWO_PI): a -= floor( a / TWO_PI ) * TWO_PI
		movss		xmm1, a
		movss		xmm2, xmm1
		movss		xmm3, xmm1
		mulss		xmm2, SIMD_SP_oneOverTwoPI
		cvttss2si	ecx, xmm2				// truncate toward zero
		cmpltss		xmm3, SIMD_SP_zero		// correction for negative a:
		andps		xmm3, SIMD_SP_one		// subtract 1 to emulate floor()
		cvtsi2ss	xmm2, ecx
		subss		xmm2, xmm3
		mulss		xmm2, SIMD_SP_twoPI
		subss		xmm1, xmm2

		// map the angle into [-HALF_PI, HALF_PI] using sin(PI - a) = sin(a)
		movss		xmm0, SIMD_SP_PI			// xmm0 = PI
		subss		xmm0, xmm1					// xmm0 = PI - a
		movss		xmm1, xmm0					// xmm1 = PI - a
		andps		xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
		movss		xmm2, xmm0					// xmm2 = PI - a
		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltss	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movss		xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps		xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
		xorps		xmm0, xmm2
		addps		xmm0, xmm3

		// Horner evaluation of the sin polynomial (see SSE_SinZeroHalfPI)
		movss		xmm1, xmm0
		mulss		xmm1, xmm1
		movss		xmm2, SIMD_SP_sin_c0
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_sin_c1
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_sin_c2
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_sin_c3
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_sin_c4
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_one
		mulss		xmm2, xmm0
		movss		t, xmm2
	}

	return t;

#else

	// reference C implementation of the same range reduction + polynomial
	float s, t;

	if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
		a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
	}

	a = idMath::PI - a;
	if ( fabs( a ) >= idMath::HALF_PI ) {
		a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
	}

	s = a * a;
	t = -2.39e-08f;
	t *= s;
	t += 2.7526e-06f;
	t *= s;
	t += -1.98409e-04f;
	t *= s;
	t += 8.3333315e-03f;
	t *= s;
	t += -1.666666664e-01f;
	t *= s;
	t += 1.0f;
	t *= a;

	return t;

#endif
}
1288 
1289 /*
1290 ============
1291 SSE_Sin4
1292 ============
1293 */
void SSE_Sin4( float a[4], float s[4] ) {
	// Four-wide SSE_Sin; a and s must be 16-byte aligned (movaps).
	__asm {
		mov			edi, a
		mov			esi, s
		// reduce each lane into [0, TWO_PI). SSE1 has no packed float->int
		// conversion, so the truncation is done scalar, lane by lane, with
		// movhlps/shufps rotating each lane into position.
		movaps		xmm1, [edi]
		movaps		xmm2, xmm1
		mulps		xmm2, SIMD_SP_oneOverTwoPI
		movhlps		xmm3, xmm2
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
		movaps		xmm3, xmm1
		cmpltps		xmm3, SIMD_SP_zero		// floor() correction for negative lanes
		andps		xmm3, SIMD_SP_one
		subps		xmm2, xmm3
		mulps		xmm2, SIMD_SP_twoPI
		subps		xmm1, xmm2

		// map each lane into [-HALF_PI, HALF_PI] using sin(PI - a) = sin(a)
		movaps		xmm0, SIMD_SP_PI			// xmm0 = PI
		subps		xmm0, xmm1					// xmm0 = PI - a
		movaps		xmm1, xmm0					// xmm1 = PI - a
		andps		xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
		movaps		xmm2, xmm0					// xmm2 = PI - a
		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltps	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movaps		xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps		xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
		xorps		xmm0, xmm2
		addps		xmm0, xmm3

		// Horner evaluation of the sin polynomial (see SSE_SinZeroHalfPI)
		movaps		xmm1, xmm0
		mulps		xmm1, xmm1
		movaps		xmm2, SIMD_SP_sin_c0
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_sin_c1
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_sin_c2
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_sin_c3
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_sin_c4
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_one
		mulps		xmm2, xmm0
		movaps		[esi], xmm2
	}
}
1351 
1352 /*
1353 ============
1354 SSE_CosZeroHalfPI
1355 
1356  The angle must be between zero and half PI.
1357 ============
1358 */
float SSE_CosZeroHalfPI( float a ) {
#if 1

	float t;

	assert( a >= 0.0f && a <= idMath::HALF_PI );

	// Horner evaluation of the even minimax polynomial:
	// cos(a) ~= 1 + s*(c4 + s*(c3 + s*(c2 + s*(c1 + s*c0)))), s = a*a
	__asm {
		movss		xmm0, a
		mulss		xmm0, xmm0				// xmm0 = a * a
		movss		xmm1, SIMD_SP_cos_c0
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_cos_c1
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_cos_c2
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_cos_c3
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_cos_c4
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_one
		movss		t, xmm1
	}

	return t;

#else

	// reference C implementation of the same polynomial
	float s, t;

	assert( a >= 0.0f && a <= idMath::HALF_PI );

	s = a * a;
	t = -2.605e-07f;
	t *= s;
	t += 2.47609e-05f;
	t *= s;
	t += -1.3888397e-03f;
	t *= s;
	t += 4.16666418e-02f;
	t *= s;
	t += -4.999999963e-01f;
	t *= s;
	t += 1.0f;

	return t;

#endif
}
1408 
1409 /*
1410 ============
1411 SSE_Cos4ZeroHalfPI
1412 
1413  The angle must be between zero and half PI.
1414 ============
1415 */
1416 void SSE_Cos4ZeroHalfPI( float a[4], float c[4] ) {
1417  __asm {
1418  mov edi, a
1419  mov esi, c
1420  movaps xmm0, [edi]
1421  mulps xmm0, xmm0
1422  movaps xmm1, SIMD_SP_cos_c0
1423  mulps xmm1, xmm0
1424  addps xmm1, SIMD_SP_cos_c1
1425  mulps xmm1, xmm0
1426  addps xmm1, SIMD_SP_cos_c2
1427  mulps xmm1, xmm0
1428  addps xmm1, SIMD_SP_cos_c3
1429  mulps xmm1, xmm0
1430  addps xmm1, SIMD_SP_cos_c4
1431  mulps xmm1, xmm0
1432  addps xmm1, SIMD_SP_one
1433  movaps [esi], xmm2
1434  }
1435 }
1436 
1437 /*
1438 ============
1439 SSE_Cos
1440 ============
1441 */
1442 float SSE_Cos( float a ) {
1443 #if 1
1444 
1445  float t;
1446 
1447  __asm {
1448  movss xmm1, a
1449  movss xmm2, xmm1
1450  movss xmm3, xmm1
1451  mulss xmm2, SIMD_SP_oneOverTwoPI
1452  cvttss2si ecx, xmm2
1453  cmpltss xmm3, SIMD_SP_zero
1454  andps xmm3, SIMD_SP_one
1455  cvtsi2ss xmm2, ecx
1456  subss xmm2, xmm3
1457  mulss xmm2, SIMD_SP_twoPI
1458  subss xmm1, xmm2
1459 
1460  movss xmm0, SIMD_SP_PI // xmm0 = PI
1461  subss xmm0, xmm1 // xmm0 = PI - a
1462  movss xmm1, xmm0 // xmm1 = PI - a
1463  andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
1464  movss xmm2, xmm0 // xmm2 = PI - a
1465  xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
1466  cmpnltss xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
1467  movss xmm3, SIMD_SP_PI // xmm3 = PI
1468  xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
1469  andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
1470  andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
1471  xorps xmm0, xmm2
1472  addps xmm0, xmm3
1473 
1474  mulss xmm0, xmm0
1475  movss xmm1, SIMD_SP_cos_c0
1476  mulss xmm1, xmm0
1477  addss xmm1, SIMD_SP_cos_c1
1478  mulss xmm1, xmm0
1479  addss xmm1, SIMD_SP_cos_c2
1480  mulss xmm1, xmm0
1481  addss xmm1, SIMD_SP_cos_c3
1482  mulss xmm1, xmm0
1483  addss xmm1, SIMD_SP_cos_c4
1484  mulss xmm1, xmm0
1485  addss xmm1, SIMD_SP_one
1486  xorps xmm2, SIMD_SP_signBitMask
1487  xorps xmm1, xmm2
1488  movss t, xmm1
1489  }
1490 
1491  return t;
1492 
1493 #else
1494 
1495  float s, t;
1496 
1497  if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
1498  a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
1499  }
1500 
1501  a = idMath::PI - a;
1502  if ( fabs( a ) >= idMath::HALF_PI ) {
1503  a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
1504  d = 1.0f;
1505  } else {
1506  d = -1.0f;
1507  }
1508 
1509  s = a * a;
1510  t = -2.605e-07f;
1511  t *= s;
1512  t += 2.47609e-05f;
1513  t *= s;
1514  t += -1.3888397e-03f;
1515  t *= s;
1516  t += 4.16666418e-02f;
1517  t *= s;
1518  t += -4.999999963e-01f;
1519  t *= s;
1520  t += 1.0f;
1521  t *= d;
1522 
1523  return t;
1524 
1525 #endif
1526 }
1527 
1528 /*
1529 ============
1530 SSE_Cos4
1531 ============
1532 */
void SSE_Cos4( float a[4], float c[4] ) {
	// Four-wide SSE_Cos; a and c must be 16-byte aligned (movaps).
	__asm {
		mov			edi, a
		mov			esi, c
		// reduce each lane into [0, TWO_PI); scalar cvttss2si per lane
		// because SSE1 has no packed float->int truncation (see SSE_Sin4)
		movaps		xmm1, [edi]
		movaps		xmm2, xmm1
		mulps		xmm2, SIMD_SP_oneOverTwoPI
		movhlps		xmm3, xmm2
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
		movaps		xmm3, xmm1
		cmpltps		xmm3, SIMD_SP_zero		// floor() correction for negative lanes
		andps		xmm3, SIMD_SP_one
		subps		xmm2, xmm3
		mulps		xmm2, SIMD_SP_twoPI
		subps		xmm1, xmm2

		// map each lane into [-HALF_PI, HALF_PI]; xmm2 records the quadrant
		// flip mask used to fix the sign of the result below
		movaps		xmm0, SIMD_SP_PI			// xmm0 = PI
		subps		xmm0, xmm1					// xmm0 = PI - a
		movaps		xmm1, xmm0					// xmm1 = PI - a
		andps		xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
		movaps		xmm2, xmm0					// xmm2 = PI - a
		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltps	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movaps		xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps		xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
		xorps		xmm0, xmm2
		addps		xmm0, xmm3

		// Horner evaluation of the cos polynomial (see SSE_CosZeroHalfPI)
		mulps		xmm0, xmm0
		movaps		xmm1, SIMD_SP_cos_c0
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c1
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c2
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c3
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c4
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_one
		xorps		xmm2, SIMD_SP_signBitMask	// negate lanes that had no quadrant flip
		xorps		xmm1, xmm2
		movaps		[esi], xmm1
	}
}
1590 
1591 /*
1592 ============
1593 SSE_SinCos
1594 ============
1595 */
1596 void SSE_SinCos( float a, float &s, float &c ) {
1597  __asm {
1598  mov edi, s
1599  mov esi, c
1600  movss xmm1, a
1601  movss xmm2, xmm1
1602  movss xmm3, xmm1
1603  mulss xmm2, SIMD_SP_oneOverTwoPI
1604  cvttss2si ecx, xmm2
1605  cmpltss xmm3, SIMD_SP_zero
1606  andps xmm3, SIMD_SP_one
1607  cvtsi2ss xmm2, ecx
1608  subss xmm2, xmm3
1609  mulss xmm2, SIMD_SP_twoPI
1610  subss xmm1, xmm2
1611 
1612  movss xmm0, SIMD_SP_PI // xmm0 = PI
1613  subss xmm0, xmm1 // xmm0 = PI - a
1614  movss xmm1, xmm0 // xmm1 = PI - a
1615  andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
1616  movss xmm2, xmm0 // xmm2 = PI - a
1617  xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
1618  cmpnltss xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
1619  movss xmm3, SIMD_SP_PI // xmm3 = PI
1620  xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
1621  andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
1622  andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
1623  xorps xmm0, xmm2
1624  addps xmm0, xmm3
1625 
1626  movss xmm1, xmm0
1627  mulss xmm1, xmm1
1628  movss xmm3, SIMD_SP_sin_c0
1629  movss xmm4, SIMD_SP_cos_c0
1630  mulss xmm3, xmm1
1631  mulss xmm4, xmm1
1632  addss xmm3, SIMD_SP_sin_c1
1633  addss xmm4, SIMD_SP_cos_c1
1634  mulss xmm3, xmm1
1635  mulss xmm4, xmm1
1636  addss xmm3, SIMD_SP_sin_c2
1637  addss xmm4, SIMD_SP_cos_c2
1638  mulss xmm3, xmm1
1639  mulss xmm4, xmm1
1640  addss xmm3, SIMD_SP_sin_c3
1641  addss xmm4, SIMD_SP_cos_c3
1642  mulss xmm3, xmm1
1643  mulss xmm4, xmm1
1644  addss xmm3, SIMD_SP_sin_c4
1645  addss xmm4, SIMD_SP_cos_c4
1646  mulss xmm3, xmm1
1647  mulss xmm4, xmm1
1648  addss xmm3, SIMD_SP_one
1649  addss xmm4, SIMD_SP_one
1650  mulss xmm3, xmm0
1651  xorps xmm2, SIMD_SP_signBitMask
1652  xorps xmm4, xmm2
1653  movss [edi], xmm2
1654  movss [esi], xmm3
1655  }
1656 }
1657 
1658 /*
1659 ============
1660 SSE_SinCos4
1661 ============
1662 */
1663 void SSE_SinCos4( float a[4], float s[4], float c[4] ) {
1664  __asm {
1665  mov eax, a
1666  mov edi, s
1667  mov esi, c
1668  movaps xmm1, [eax]
1669  movaps xmm2, xmm1
1670  mulps xmm2, SIMD_SP_oneOverTwoPI
1671  movhlps xmm3, xmm2
1672  cvttss2si ecx, xmm2
1673  cvtsi2ss xmm2, ecx
1674  cvttss2si edx, xmm3
1675  cvtsi2ss xmm3, edx
1676  shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
1677  shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
1678  cvttss2si ecx, xmm2
1679  cvtsi2ss xmm2, ecx
1680  cvttss2si edx, xmm3
1681  cvtsi2ss xmm3, edx
1682  shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
1683  movaps xmm3, xmm1
1684  cmpltps xmm3, SIMD_SP_zero
1685  andps xmm3, SIMD_SP_one
1686  subps xmm2, xmm3
1687  mulps xmm2, SIMD_SP_twoPI
1688  subps xmm1, xmm2
1689 
1690  movaps xmm0, SIMD_SP_PI // xmm0 = PI
1691  subps xmm0, xmm1 // xmm0 = PI - a
1692  movaps xmm1, xmm0 // xmm1 = PI - a
1693  andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
1694  movaps xmm2, xmm0 // xmm2 = PI - a
1695  xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
1696  cmpnltps xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
1697  movaps xmm3, SIMD_SP_PI // xmm3 = PI
1698  xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
1699  andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
1700  andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
1701  xorps xmm0, xmm2
1702  addps xmm0, xmm3
1703 
1704  movaps xmm0, [eax]
1705  movaps xmm1, xmm0
1706  mulps xmm1, xmm1
1707  movaps xmm3, SIMD_SP_sin_c0
1708  movaps xmm4, SIMD_SP_cos_c0
1709  mulps xmm3, xmm1
1710  mulps xmm4, xmm1
1711  addps xmm3, SIMD_SP_sin_c1
1712  addps xmm4, SIMD_SP_cos_c1
1713  mulps xmm3, xmm1
1714  mulps xmm4, xmm1
1715  addps xmm3, SIMD_SP_sin_c2
1716  addps xmm4, SIMD_SP_cos_c2
1717  mulps xmm3, xmm1
1718  mulps xmm4, xmm1
1719  addps xmm3, SIMD_SP_sin_c3
1720  addps xmm4, SIMD_SP_cos_c3
1721  mulps xmm3, xmm1
1722  mulps xmm4, xmm1
1723  addps xmm3, SIMD_SP_sin_c4
1724  addps xmm4, SIMD_SP_cos_c4
1725  mulps xmm3, xmm1
1726  mulps xmm4, xmm1
1727  addps xmm3, SIMD_SP_one
1728  addps xmm4, SIMD_SP_one
1729  mulps xmm3, xmm0
1730  xorps xmm2, SIMD_SP_signBitMask
1731  xorps xmm4, xmm2
1732  movaps [edi], xmm3
1733  movaps [esi], xmm4
1734  }
1735 }
1736 
1737 /*
1738 ============
1739 SSE_ATanPositive
1740 
1741  Both 'x' and 'y' must be positive.
1742 ============
1743 */
float SSE_ATanPositive( float y, float x ) {
#if 1

	float t;

	assert( y >= 0.0f && x >= 0.0f );

	// atan( y / x ) for y,x >= 0. Uses atan(y/x) = HALF_PI - atan(x/y)
	// when y > x so the polynomial argument stays in [0, 1]. The division
	// is a Newton-Raphson refined rcpss estimate.
	__asm {
		movss		xmm0, x
		movss		xmm3, xmm0
		movss		xmm1, y
		minss		xmm0, xmm1				// xmm0 = min( x, y )
		maxss		xmm1, xmm3				// xmm1 = max( x, y )
		cmpeqss		xmm3, xmm0				// xmm3 = ( x <= y ) ? 0xFFFFFFFF : 0
		rcpss		xmm2, xmm1
		mulss		xmm1, xmm2
		mulss		xmm1, xmm2
		addss		xmm2, xmm2
		subss		xmm2, xmm1				// xmm2 = 1 / y or 1 / x
		mulss		xmm0, xmm2				// xmm0 = x / y or y / x
		movss		xmm1, xmm3
		andps		xmm1, SIMD_SP_signBitMask
		xorps		xmm0, xmm1				// xmm0 = -x / y or y / x
		andps		xmm3, SIMD_SP_halfPI	// xmm3 = HALF_PI or 0.0f
		// Horner evaluation of the atan polynomial
		movss		xmm1, xmm0
		mulss		xmm1, xmm1				// xmm1 = s
		movss		xmm2, SIMD_SP_atan_c0
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c1
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c2
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c3
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c4
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c5
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c6
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c7
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_one
		mulss		xmm2, xmm0
		addss		xmm2, xmm3				// add HALF_PI for the inverted case
		movss		t, xmm2
	}

	return t;

#else

	// reference C implementation of the same argument reduction + polynomial
	float a, d, s, t;

	assert( y >= 0.0f && x >= 0.0f );

	if ( y > x ) {
		a = -x / y;
		d = idMath::HALF_PI;
	} else {
		a = y / x;
		d = 0.0f;
	}
	s = a * a;
	t = 0.0028662257f;
	t *= s;
	t += -0.0161657367f;
	t *= s;
	t += 0.0429096138f;
	t *= s;
	t += -0.0752896400f;
	t *= s;
	t += 0.1065626393f;
	t *= s;
	t += -0.1420889944f;
	t *= s;
	t += 0.1999355085f;
	t *= s;
	t += -0.3333314528f;
	t *= s;
	t += 1.0f;
	t *= a;
	t += d;

	return t;

#endif
}
1832 
1833 /*
1834 ============
1835 SSE_ATan4Positive
1836 
1837  Both 'x' and 'y' must be positive.
1838 ============
1839 */
void SSE_ATan4Positive( float y[4], float x[4], float at[4] ) {
	// Four-wide SSE_ATanPositive; y, x and at must be 16-byte aligned
	// (movaps), and all lanes of x and y must be positive.
	__asm {
		mov			esi, x
		mov			edi, y
		mov			edx, at
		movaps		xmm0, [esi]
		movaps		xmm3, xmm0
		movaps		xmm1, [edi]
		minps		xmm0, xmm1				// xmm0 = min( x, y )
		maxps		xmm1, xmm3				// xmm1 = max( x, y )
		cmpeqps		xmm3, xmm0				// xmm3 = ( x <= y ) ? 0xFFFFFFFF : 0
		rcpps		xmm2, xmm1				// Newton-Raphson refined reciprocal
		mulps		xmm1, xmm2
		mulps		xmm1, xmm2
		addps		xmm2, xmm2
		subps		xmm2, xmm1				// xmm2 = 1 / y or 1 / x
		mulps		xmm0, xmm2				// xmm0 = x / y or y / x
		movaps		xmm1, xmm3
		andps		xmm1, SIMD_SP_signBitMask
		xorps		xmm0, xmm1				// xmm0 = -x / y or y / x
		andps		xmm3, SIMD_SP_halfPI	// xmm3 = HALF_PI or 0.0f
		// Horner evaluation of the atan polynomial
		movaps		xmm1, xmm0
		mulps		xmm1, xmm1				// xmm1 = s
		movaps		xmm2, SIMD_SP_atan_c0
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c1
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c2
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c3
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c4
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c5
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c6
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c7
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_one
		mulps		xmm2, xmm0
		addps		xmm2, xmm3				// add HALF_PI for inverted lanes
		movaps		[edx], xmm2
	}
}
1885 
1886 /*
1887 ============
1888 SSE_ATan
1889 ============
1890 */
float SSE_ATan( float y, float x ) {
#if 1

	float t;

	// General two-argument arctangent of y / x. Works on the absolute
	// values like SSE_ATanPositive and restores the sign of the quotient
	// (and of the HALF_PI offset) from signbit( x ) ^ signbit( y ).
	__asm {
		movss		xmm0, x
		movss		xmm3, xmm0
		movss		xmm4, xmm0
		andps		xmm0, SIMD_SP_absMask		// xmm0 = fabs( x )
		movss		xmm1, y
		xorps		xmm4, xmm1
		andps		xmm1, SIMD_SP_absMask		// xmm1 = fabs( y )
		andps		xmm4, SIMD_SP_signBitMask	// xmm4 = signbit( x ) ^ signbit( y )
		minss		xmm0, xmm1
		maxss		xmm1, xmm3
		cmpeqss		xmm3, xmm0
		rcpss		xmm2, xmm1					// Newton-Raphson refined reciprocal
		mulss		xmm1, xmm2
		mulss		xmm1, xmm2
		addss		xmm2, xmm2
		subss		xmm2, xmm1				// xmm2 = 1 / y or 1 / x
		mulss		xmm0, xmm2				// xmm0 = x / y or y / x
		xorps		xmm0, xmm4				// restore the quotient's sign
		movss		xmm1, xmm3
		andps		xmm1, SIMD_SP_signBitMask
		xorps		xmm0, xmm1				// xmm0 = -x / y or y / x
		orps		xmm4, SIMD_SP_halfPI	// xmm4 = +/- HALF_PI
		andps		xmm3, xmm4				// xmm3 = +/- HALF_PI or 0.0f
		// Horner evaluation of the atan polynomial
		movss		xmm1, xmm0
		mulss		xmm1, xmm1				// xmm1 = s
		movss		xmm2, SIMD_SP_atan_c0
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c1
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c2
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c3
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c4
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c5
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c6
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c7
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_one
		mulss		xmm2, xmm0
		addss		xmm2, xmm3
		movss		t, xmm2
	}

	return t;

#else

	// reference C implementation of the same argument reduction + polynomial
	float a, d, s, t;

	if ( fabs( y ) > fabs( x ) ) {
		a = -x / y;
		d = idMath::HALF_PI;
		*((unsigned long *)&d) ^= ( *((unsigned long *)&x) ^ *((unsigned long *)&y) ) & (1<<31);
	} else {
		a = y / x;
		d = 0.0f;
	}

	s = a * a;
	t = 0.0028662257f;
	t *= s;
	t += -0.0161657367f;
	t *= s;
	t += 0.0429096138f;
	t *= s;
	t += -0.0752896400f;
	t *= s;
	t += 0.1065626393f;
	t *= s;
	t += -0.1420889944f;
	t *= s;
	t += 0.1999355085f;
	t *= s;
	t += -0.3333314528f;
	t *= s;
	t += 1.0f;
	t *= a;
	t += d;

	return t;

#endif
}
1984 
1985 /*
1986 ============
1987 SSE_ATan4
1988 ============
1989 */
void SSE_ATan4( float y[4], float x[4], float at[4] ) {
	// Four-wide SSE_ATan; y, x and at must be 16-byte aligned (movaps).
	__asm {
		mov			esi, x
		mov			edi, y
		mov			edx, at
		movaps		xmm0, [esi]
		movaps		xmm3, xmm0
		movaps		xmm4, xmm0
		andps		xmm0, SIMD_SP_absMask		// xmm0 = fabs( x )
		movaps		xmm1, [edi]
		xorps		xmm4, xmm1
		andps		xmm1, SIMD_SP_absMask		// xmm1 = fabs( y )
		andps		xmm4, SIMD_SP_signBitMask	// xmm4 = signbit( x ) ^ signbit( y )
		minps		xmm0, xmm1
		maxps		xmm1, xmm3
		cmpeqps		xmm3, xmm0
		rcpps		xmm2, xmm1					// Newton-Raphson refined reciprocal
		mulps		xmm1, xmm2
		mulps		xmm1, xmm2
		addps		xmm2, xmm2
		subps		xmm2, xmm1				// xmm2 = 1 / y or 1 / x
		mulps		xmm0, xmm2				// xmm0 = x / y or y / x
		xorps		xmm0, xmm4				// restore the quotient's sign
		movaps		xmm1, xmm3
		andps		xmm1, SIMD_SP_signBitMask
		xorps		xmm0, xmm1				// xmm0 = -x / y or y / x
		orps		xmm4, SIMD_SP_halfPI	// xmm4 = +/- HALF_PI
		andps		xmm3, xmm4				// xmm3 = +/- HALF_PI or 0.0f
		// Horner evaluation of the atan polynomial
		movaps		xmm1, xmm0
		mulps		xmm1, xmm1				// xmm1 = s
		movaps		xmm2, SIMD_SP_atan_c0
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c1
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c2
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c3
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c4
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c5
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c6
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c7
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_one
		mulps		xmm2, xmm0
		addps		xmm2, xmm3
		movaps		[edx], xmm2
	}
}
2042 
2043 /*
2044 ============
2045 SSE_TestTrigonometry
2046 ============
2047 */
2048 void SSE_TestTrigonometry( void ) {
2049  int i;
2050  float a, s1, s2, c1, c2;
2051 
2052  for ( i = 0; i < 100; i++ ) {
2053  a = i * idMath::HALF_PI / 100.0f;
2054 
2055  s1 = sin( a );
2056  s2 = SSE_SinZeroHalfPI( a );
2057 
2058  if ( fabs( s1 - s2 ) > 1e-7f ) {
2059  assert( 0 );
2060  }
2061 
2062  c1 = cos( a );
2063  c2 = SSE_CosZeroHalfPI( a );
2064 
2065  if ( fabs( c1 - c2 ) > 1e-7f ) {
2066  assert( 0 );
2067  }
2068  }
2069 
2070  for ( i = -200; i < 200; i++ ) {
2071  a = i * idMath::TWO_PI / 100.0f;
2072 
2073  s1 = sin( a );
2074  s2 = SSE_Sin( a );
2075 
2076  if ( fabs( s1 - s2 ) > 1e-6f ) {
2077  assert( 0 );
2078  }
2079 
2080  c1 = cos( a );
2081  c2 = SSE_Cos( a );
2082 
2083  if ( fabs( c1 - c2 ) > 1e-6f ) {
2084  assert( 0 );
2085  }
2086 
2087  SSE_SinCos( a, s2, c2 );
2088  if ( fabs( s1 - s2 ) > 1e-6f || fabs( c1 - c2 ) > 1e-6f ) {
2089  assert( 0 );
2090  }
2091  }
2092 }
2093 
2094 /*
2095 ============
2096 idSIMD_SSE::GetName
2097 ============
2098 */
const char * idSIMD_SSE::GetName( void ) const {
	// this processor class requires MMX support in addition to SSE
	return "MMX & SSE";
}
2102 
2103 /*
2104 ============
2105 idSIMD_SSE::Add
2106 
2107  dst[i] = constant + src[i];
2108 ============
2109 */
void VPCALL idSIMD_SSE::Add( float *dst, const float constant, const float *src, const int count ) {
	// dst[i] = constant + src[i]; KFLOAT_CA handles alignment and leftovers
	KFLOAT_CA( add, dst, src, constant, count )
}
2113 
2114 /*
2115 ============
2116 idSIMD_SSE::Add
2117 
2118  dst[i] = src0[i] + src1[i];
2119 ============
2120 */
void VPCALL idSIMD_SSE::Add( float *dst, const float *src0, const float *src1, const int count ) {
	// dst[i] = src0[i] + src1[i]; KFLOAT_AA handles alignment and leftovers
	KFLOAT_AA( add, dst, src0, src1, count )
}
2124 
2125 /*
2126 ============
2127 idSIMD_SSE::Sub
2128 
2129  dst[i] = constant - src[i];
2130 ============
2131 */
void VPCALL idSIMD_SSE::Sub( float *dst, const float constant, const float *src, const int count ) {
	// dst[i] = constant - src[i]; KFLOAT_CA handles alignment and leftovers
	KFLOAT_CA( sub, dst, src, constant, count )
}
2135 
2136 /*
2137 ============
2138 idSIMD_SSE::Sub
2139 
2140  dst[i] = src0[i] - src1[i];
2141 ============
2142 */
void VPCALL idSIMD_SSE::Sub( float *dst, const float *src0, const float *src1, const int count ) {
	// dst[i] = src0[i] - src1[i], expanded by the array/array SSE kernel macro
	KFLOAT_AA( sub, dst, src0, src1, count )
}
2146 
2147 /*
2148 ============
2149 idSIMD_SSE::Mul
2150 
2151  dst[i] = constant * src[i];
2152 ============
2153 */
void VPCALL idSIMD_SSE::Mul( float *dst, const float constant, const float *src, const int count ) {
	// dst[i] = constant * src[i], expanded by the constant/array SSE kernel macro
	KFLOAT_CA( mul, dst, src, constant, count )
}
2157 
2158 /*
2159 ============
2160 idSIMD_SSE::Mul
2161 
2162  dst[i] = src0[i] * src1[i];
2163 ============
2164 */
void VPCALL idSIMD_SSE::Mul( float *dst, const float *src0, const float *src1, const int count ) {
	// dst[i] = src0[i] * src1[i], expanded by the array/array SSE kernel macro
	KFLOAT_AA( mul, dst, src0, src1, count )
}
2168 
2169 /*
2170 ============
2171 idSIMD_SSE::Div
2172 
2173  dst[i] = constant / src[i];
2174 ============
2175 */
void VPCALL idSIMD_SSE::Div( float *dst, const float constant, const float *src, const int count ) {
	int pre, post;

	// 1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x));
	// i.e. one Newton-Raphson iteration refines the low-precision rcpps
	// estimate, then the reciprocal is scaled: dst[i] = constant / src[i]
	__asm
	{
		movss	xmm1,constant
		shufps	xmm1,xmm1,0					// broadcast constant to all four lanes

		KFLOATINITDS( dst, src, count, pre, post )
		and		eax,15						// dispatch on source alignment
		jne		lpNA
		jmp		lpA
		align	16
lpA:										// aligned source: 8 floats per iteration
		movaps	xmm2,[edx+ebx]
		movaps	xmm3,[edx+ebx+16]
		rcpps	xmm4,xmm2
		rcpps	xmm5,xmm3
		prefetchnta	[edx+ebx+64]
		mulps	xmm2,xmm4					// x * rcp(x) * rcp(x)
		mulps	xmm2,xmm4
		mulps	xmm3,xmm5
		mulps	xmm3,xmm5
		addps	xmm4,xmm4					// 2 * rcp(x)
		addps	xmm5,xmm5
		subps	xmm4,xmm2					// refined 1/x
		subps	xmm5,xmm3
		mulps	xmm4,xmm1					// constant / x
		mulps	xmm5,xmm1
		movaps	[edi+ebx],xmm4
		movaps	[edi+ebx+16],xmm5
		add		ebx,16*2
		jl		lpA
		jmp		done
		align	16
lpNA:										// unaligned source: same math with movups loads
		movups	xmm2,[edx+ebx]
		movups	xmm3,[edx+ebx+16]
		rcpps	xmm4,xmm2
		rcpps	xmm5,xmm3
		prefetchnta	[edx+ebx+64]
		mulps	xmm2,xmm4
		mulps	xmm2,xmm4
		mulps	xmm3,xmm5
		mulps	xmm3,xmm5
		addps	xmm4,xmm4
		addps	xmm5,xmm5
		subps	xmm4,xmm2
		subps	xmm5,xmm3
		mulps	xmm4,xmm1
		mulps	xmm5,xmm1
		movaps	[edi+ebx],xmm4
		movaps	[edi+ebx+16],xmm5
		add		ebx,16*2
		jl		lpNA
done:
		mov		edx,src
		mov		edi,dst
		// pre/post scalar elements go through the exact-divide macro
		KFLOATOPER( KDIVDSS1( [edi+ebx],xmm1,[edx+ebx] ),
					KDIVDSS4( [edi+ebx],xmm1,[edx+ebx] ), count )
	}
}
2239 
2240 /*
2241 ============
2242 idSIMD_SSE::Div
2243 
2244  dst[i] = src0[i] / src1[i];
2245 ============
2246 */
void VPCALL idSIMD_SSE::Div( float *dst, const float *src0, const float *src1, const int count ) {
	int pre,post;

	// 1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x));
	// Newton-Raphson refined reciprocal of src1, multiplied by src0:
	// dst[i] = src0[i] / src1[i]
	__asm
	{
		KFLOATINITDSS( dst, src0, src1, count, pre, post )
		and		eax,15						// dispatch on alignment
		jne		lpNA
		jmp		lpA
		align	16
lpA:										// aligned: esi = divisor array, edx = numerator array
		movaps	xmm2,[esi+ebx]
		movaps	xmm3,[esi+ebx+16]
		rcpps	xmm4,xmm2
		rcpps	xmm5,xmm3
		prefetchnta	[esi+ebx+64]
		mulps	xmm2,xmm4					// x * rcp(x) * rcp(x)
		mulps	xmm2,xmm4
		mulps	xmm3,xmm5
		mulps	xmm3,xmm5
		addps	xmm4,xmm4					// 2 * rcp(x)
		addps	xmm5,xmm5
		subps	xmm4,xmm2					// refined 1/x
		subps	xmm5,xmm3
		mulps	xmm4,[edx+ebx]				// src0 / src1
		mulps	xmm5,[edx+ebx+16]
		movaps	[edi+ebx],xmm4
		movaps	[edi+ebx+16],xmm5
		add		ebx,16*2
		jl		lpA
		jmp		done
		align	16
lpNA:										// unaligned variant using movups loads
		movups	xmm2,[esi+ebx]
		movups	xmm3,[esi+ebx+16]
		rcpps	xmm4,xmm2
		rcpps	xmm5,xmm3
		prefetchnta	[esi+ebx+64]
		mulps	xmm2,xmm4
		mulps	xmm2,xmm4
		mulps	xmm3,xmm5
		mulps	xmm3,xmm5
		addps	xmm4,xmm4
		addps	xmm5,xmm5
		subps	xmm4,xmm2
		subps	xmm5,xmm3
		movups	xmm2,[edx+ebx]
		movups	xmm3,[edx+ebx+16]
		mulps	xmm4,xmm2
		mulps	xmm5,xmm3
		movaps	[edi+ebx],xmm4
		movaps	[edi+ebx+16],xmm5
		add		ebx,16*2
		jl		lpNA
done:
		mov		edx,src0
		mov		esi,src1
		mov		edi,dst
		// pre/post scalar elements go through the exact-divide macro
		KFLOATOPER( KDIVDSS1( [edi+ebx],[edx+ebx],[esi+ebx] ),
					KDIVDSS4( [edi+ebx],[edx+ebx],[esi+ebx] ), count )
	}
}
2310 /*
2311 ============
2312 Simd_MulAdd
2313 
2314  assumes count >= 7
2315 ============
2316 */
static void Simd_MulAdd( float *dst, const float constant, const float *src, const int count ) {
	// dst[i] += constant * src[i]; the header comment above notes the
	// caller must guarantee count >= 7.
	// Strategy: classify dst/src alignment (16-byte, 8-byte, or mutually
	// misaligned), handle leading/trailing scalars on the x87 stack (the
	// constant is kept in st(0) throughout), and run the bulk through SSE.
	__asm	mov			esi, dst
	__asm	mov			edi, src
	__asm	mov			eax, count
	__asm	shl			eax, 2				// eax = byte count
	__asm	mov			ecx, esi
	__asm	mov			edx, eax
	__asm	or			ecx, edi			// combined alignment of dst and src
	__asm	fld			constant			// st(0) = constant for scalar loops
	__asm	and			ecx, 15
	__asm	jz			SimdMulAdd16		// both 16-byte aligned
	__asm	and			ecx, 3
	__asm	jnz			SimdMulAdd8			// not even 4-byte aligned together
	__asm	mov			ecx, esi
	__asm	xor			ecx, edi
	__asm	and			ecx, 15
	__asm	jnz			MulAdd8				// different 16-byte phase: use 8-byte path
	__asm	mov			ecx, esi			// same phase: peel scalars up to 16-byte boundary
	__asm	and			ecx, 15
	__asm	neg			ecx
	__asm	add			ecx, 16
	__asm	sub			eax, ecx
	__asm	add			edi, ecx
	__asm	add			esi, ecx
	__asm	neg			ecx
	__asm	mov			edx, eax
	__asm	loopPreMulAdd16:				// x87 scalar pre-loop
	__asm	fld			st
	__asm	fmul		dword ptr [edi+ecx]
	__asm	fadd		dword ptr [esi+ecx]
	__asm	fstp		dword ptr [esi+ecx]
	__asm	add			ecx, 4
	__asm	jl			loopPreMulAdd16
	__asm	SimdMulAdd16:					// aligned SSE main loop, 4 floats/iter
	__asm	and			eax, ~15
	__asm	movss		xmm1, constant
	__asm	shufps		xmm1, xmm1, 0x00
	__asm	add			esi, eax
	__asm	add			edi, eax
	__asm	neg			eax
	__asm	align		16
	__asm	loopMulAdd16:
	__asm	movaps		xmm0, [edi+eax]
	__asm	mulps		xmm0, xmm1
	__asm	addps		xmm0, [esi+eax]
	__asm	movaps		[esi+eax], xmm0
	__asm	add			eax, 16
	__asm	jl			loopMulAdd16
	__asm	jmp			postMulAdd
	__asm	MulAdd8:						// peel scalars up to an 8-byte boundary
	__asm	mov			ecx, esi
	__asm	and			ecx, 7
	__asm	jz			SimdMulAdd8
	__asm	sub			eax, ecx
	__asm	add			esi, ecx
	__asm	add			edi, ecx
	__asm	neg			ecx
	__asm	mov			edx, eax
	__asm	loopPreMulAdd8:					// x87 scalar pre-loop
	__asm	fld			st
	__asm	fmul		dword ptr [edi+ecx]
	__asm	fadd		dword ptr [esi+ecx]
	__asm	fstp		dword ptr [esi+ecx]
	__asm	add			ecx, 4
	__asm	jl			loopPreMulAdd8
	__asm	SimdMulAdd8:					// unaligned SSE loop via movlps/movhps pairs
	__asm	and			eax, ~15
	__asm	movss		xmm1, constant
	__asm	shufps		xmm1, xmm1, 0x00
	__asm	add			esi, eax
	__asm	add			edi, eax
	__asm	neg			eax
	__asm	align		16
	__asm	loopMulAdd8:
	__asm	movlps		xmm0, [edi+eax]
	__asm	movhps		xmm0, [edi+eax+8]
	__asm	mulps		xmm0, xmm1
	__asm	movlps		xmm2, [esi+eax]
	__asm	movhps		xmm2, [esi+eax+8]
	__asm	addps		xmm0, xmm2
	__asm	movlps		[esi+eax], xmm0
	__asm	movhps		[esi+eax+8], xmm0
	__asm	add			eax, 16
	__asm	jl			loopMulAdd8
	__asm	jmp			postMulAdd
	__asm	postMulAdd:						// x87 scalar tail for the 1-3 leftovers
	__asm	and			edx, 15
	__asm	jz			MulAddDone
	__asm	add			esi, edx
	__asm	add			edi, edx
	__asm	neg			edx
	__asm	loopPostMulAdd:
	__asm	fld			st
	__asm	fmul		dword ptr [edi+edx]
	__asm	fadd		dword ptr [esi+edx]
	__asm	fstp		dword ptr [esi+edx]
	__asm	add			edx, 4
	__asm	jl			loopPostMulAdd
	__asm	MulAddDone:
	__asm	fstp		st					// pop the constant off the x87 stack
}
2418 
// Fully unrolled handling for small counts (0..11): OPER is the compound
// assignment operator (+= or -=) applied element-wise as dst[i] OPER c * src[i].
// Falls through (no return) for counts >= 12 so the caller can dispatch to the
// SIMD routine. Expects a local float 'c' holding the constant.
#define MULADD_FEW( OPER ) \
switch( count ) { \
	case 0: \
		return; \
	case 1: \
		dst[0] OPER c * src[0]; \
		return; \
	case 2: \
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; \
		return; \
	case 3: \
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; \
		return; \
	case 4: \
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
		return; \
	case 5: \
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
		dst[4] OPER c * src[4]; \
		return; \
	case 6: \
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
		dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; \
		return; \
	case 7: \
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
		dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; \
		return; \
	case 8: \
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
		dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
		return; \
	case 9: \
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
		dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
		dst[8] OPER c * src[8]; \
		return; \
	case 10: \
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
		dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
		dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; \
		return; \
	case 11: \
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
		dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
		dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; dst[10] OPER c * src[10]; \
		return; \
}
2467 
2468 /*
2469 ============
2470 idSIMD_SSE::MulAdd
2471 
2472  dst[i] += constant * src[i];
2473 ============
2474 */
void VPCALL idSIMD_SSE::MulAdd( float *dst, const float constant, const float *src, const int count ) {
	// dst[i] += constant * src[i]
	// 'c' is read by the MULADD_FEW macro, which fully unrolls counts 0..11
	// and returns; larger counts fall through to the SIMD routine, satisfying
	// Simd_MulAdd's count >= 7 requirement.
	float c = constant;
	MULADD_FEW( += )
	Simd_MulAdd( dst, constant, src, count );
}
2480 
2481 /*
2482 ============
2483 idSIMD_SSE::MulAdd
2484 
2485  dst[i] += src0[i] * src1[i];
2486 ============
2487 */
2488 void VPCALL idSIMD_SSE::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
2489  for ( int i = 0; i < count; i++ ) {
2490  dst[i] += src0[i] + src1[i];
2491  }
2492 }
2493 
2494 /*
2495 ============
2496 idSIMD_SSE::MulSub
2497 
2498  dst[i] -= constant * src[i];
2499 ============
2500 */
void VPCALL idSIMD_SSE::MulSub( float *dst, const float constant, const float *src, const int count ) {
	// dst[i] -= constant * src[i]
	// MULADD_FEW handles counts 0..11 with the -= operator; larger counts
	// reuse the multiply-add routine with the constant negated.
	float c = constant;
	MULADD_FEW( -= )
	Simd_MulAdd( dst, -constant, src, count );
}
2506 
2507 /*
2508 ============
2509 idSIMD_SSE::MulSub
2510 
2511  dst[i] -= src0[i] * src1[i];
2512 ============
2513 */
2514 void VPCALL idSIMD_SSE::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
2515  for ( int i = 0; i < count; i++ ) {
2516  dst[i] -= src0[i] + src1[i];
2517  }
2518 }
2519 
2520 /*
2521 ============
2522 idSIMD_SSE::Dot
2523 
2524  dst[i] = constant * src[i];
2525 ============
2526 */
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
	// dst[i] = constant * src[i]  (3D dot product per element)
	__asm
	{
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3					// eax = count rounded down to multiple of 4

		// broadcast constant.x/y/z into xmm4/xmm5/xmm6
		movss		xmm4, [edi+0]
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm5, [edi+4]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+8]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )

		jz			done4
		imul		eax, 12					// 12 bytes per idVec3
		add			esi, eax
		neg			eax

	loop4:		// transpose 4 vectors into x/y/z lane registers, 4 dots per iteration
		movlps		xmm1, [esi+eax+ 0]
		movlps		xmm2, [esi+eax+ 8]
		movlps		xmm3, [esi+eax+16]
		movhps		xmm1, [esi+eax+24]
		movhps		xmm2, [esi+eax+32]
		movhps		xmm3, [esi+eax+40]
		movaps		xmm0, xmm1
		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
		shufps		xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
		add			ecx, 16
		add			eax, 4*12
		mulps		xmm0, xmm4
		mulps		xmm1, xmm5
		mulps		xmm2, xmm6
		addps		xmm0, xmm1
		addps		xmm0, xmm2
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )	// restore element order
		movlps		[ecx-16+0], xmm0
		movhps		[ecx-16+8], xmm0
		jl			loop4

	done4:
		and			edx, 3					// remaining 0-3 vectors
		jz			done1

	loop1:		// scalar tail: one dot product per iteration
		movss		xmm0, [esi+eax+0]
		movss		xmm1, [esi+eax+4]
		movss		xmm2, [esi+eax+8]
		mulss		xmm0, xmm4
		mulss		xmm1, xmm5
		mulss		xmm2, xmm6
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, 12
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loop1

	done1:
	}
}
2594 
2595 /*
2596 ============
2597 idSIMD_SSE::Dot
2598 
2599  dst[i] = constant * src[i].Normal() + src[i][3];
2600 ============
2601 */
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
	// dst[i] = constant * src[i].Normal() + src[i][3]
	// idPlane is 16 bytes (normal xyz + distance d), which keeps the loads regular
	__asm {
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3					// eax = count rounded down to multiple of 4

		// broadcast constant.x/y/z into xmm5/xmm6/xmm7
		movss		xmm5, [edi+0]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+4]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm7, [edi+8]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

		jz			startVert1
		imul		eax, 16					// 16 bytes per idPlane
		add			esi, eax
		neg			eax

	loopVert4:	// transpose 4 planes into x/y/z/d lane registers

		movlps		xmm1, [esi+eax+ 0]
		movlps		xmm3, [esi+eax+ 8]
		movhps		xmm1, [esi+eax+16]
		movhps		xmm3, [esi+eax+24]
		movlps		xmm2, [esi+eax+32]
		movlps		xmm4, [esi+eax+40]
		movhps		xmm2, [esi+eax+48]
		movhps		xmm4, [esi+eax+56]
		movaps		xmm0, xmm1
		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
		shufps		xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
		movaps		xmm2, xmm3
		shufps		xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
		shufps		xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )

		add			ecx, 16
		add			eax, 4*16

		mulps		xmm0, xmm5
		mulps		xmm1, xmm6
		mulps		xmm2, xmm7
		addps		xmm0, xmm3				// + plane distance term
		addps		xmm0, xmm1
		addps		xmm0, xmm2

		movlps		[ecx-16+0], xmm0
		movhps		[ecx-16+8], xmm0
		jl			loopVert4

	startVert1:
		and			edx, 3					// remaining 0-3 planes
		jz			done

	loopVert1:	// scalar tail: one plane per iteration
		movss		xmm0, [esi+eax+0]
		movss		xmm1, [esi+eax+4]
		movss		xmm2, [esi+eax+8]
		mulss		xmm0, xmm5
		mulss		xmm1, xmm6
		mulss		xmm2, xmm7
		addss		xmm0, [esi+eax+12]		// + plane distance term
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, 16
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loopVert1

	done:
	}
}
2677 
2678 /*
2679 ============
2680 idSIMD_SSE::Dot
2681 
2682  dst[i] = constant * src[i].xyz;
2683 ============
2684 */
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
	// dst[i] = constant * src[i].xyz
	// The asserts pin the struct layout the hand-scheduled loads depend on.
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

	// component index layout of 4 consecutive xyz triples:
	// 0, 1, 2
	// 3, 4, 5
	// 6, 7, 8
	// 9, 10, 11

	__asm {
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3					// eax = count rounded down to multiple of 4

		// broadcast constant.x/y/z into xmm4/xmm5/xmm6
		movss		xmm4, [edi+0]
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm5, [edi+4]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+8]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )

		jz			startVert1
		imul		eax, DRAWVERT_SIZE
		add			esi, eax
		neg			eax

	loopVert4:	// gather the strided xyz fields of 4 verts and transpose to lanes
		movss		xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  X,  X
		movss		xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	//  2,  X,  X,  X
		movhps		xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  0,  1
		movaps		xmm1, xmm0												//  3,  X,  0,  1

		movlps		xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	//  4,  5,  0,  1
		shufps		xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )					//  2,  X,  4,  5

		movss		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  X,  X
		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  6,  7
		shufps		xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )					//  0,  3,  6,  9

		movlps		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 10, 11,  6,  7
		shufps		xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )					//  1,  4,  7, 10

		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 10, 11,  8,  X
		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )					//  2,  5,  8, 11

		add			ecx, 16
		add			eax, 4*DRAWVERT_SIZE

		mulps		xmm0, xmm4
		mulps		xmm1, xmm5
		mulps		xmm2, xmm6
		addps		xmm0, xmm1
		addps		xmm0, xmm2

		movlps		[ecx-16+0], xmm0
		movhps		[ecx-16+8], xmm0
		jl			loopVert4

	startVert1:
		and			edx, 3					// remaining 0-3 verts
		jz			done

	loopVert1:	// scalar tail: one vert per iteration
		movss		xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
		movss		xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
		movss		xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
		mulss		xmm0, xmm4
		mulss		xmm1, xmm5
		mulss		xmm2, xmm6
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, DRAWVERT_SIZE
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loopVert1

	done:
	}
}
2769 
2770 /*
2771 ============
2772 idSIMD_SSE::Dot
2773 
2774  dst[i] = constant.Normal() * src[i] + constant[3];
2775 ============
2776 */
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
	// dst[i] = constant.Normal() * src[i] + constant[3]
	__asm
	{
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3					// eax = count rounded down to multiple of 4

		// broadcast plane normal x/y/z and distance d into xmm4..xmm7
		movss		xmm4, [edi+0]
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm5, [edi+4]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+8]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm7, [edi+12]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

		jz			done4
		imul		eax, 12					// 12 bytes per idVec3
		add			esi, eax
		neg			eax

	loop4:		// transpose 4 vectors into x/y/z lane registers
		movlps		xmm1, [esi+eax+ 0]
		movlps		xmm2, [esi+eax+ 8]
		movlps		xmm3, [esi+eax+16]
		movhps		xmm1, [esi+eax+24]
		movhps		xmm2, [esi+eax+32]
		movhps		xmm3, [esi+eax+40]
		movaps		xmm0, xmm1
		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
		shufps		xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )

		add			ecx, 16
		add			eax, 4*12

		mulps		xmm0, xmm4
		mulps		xmm1, xmm5
		mulps		xmm2, xmm6
		addps		xmm0, xmm7				// + plane distance
		addps		xmm0, xmm1
		addps		xmm0, xmm2
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )	// restore element order

		movlps		[ecx-16+0], xmm0
		movhps		[ecx-16+8], xmm0
		jl			loop4

	done4:
		and			edx, 3					// remaining 0-3 vectors
		jz			done1

	loop1:		// scalar tail: one distance per iteration
		movss		xmm0, [esi+eax+0]
		movss		xmm1, [esi+eax+4]
		movss		xmm2, [esi+eax+8]
		mulss		xmm0, xmm4
		mulss		xmm1, xmm5
		mulss		xmm2, xmm6
		addss		xmm0, xmm7				// + plane distance
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, 12
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loop1

	done1:
	}
}
2851 
2852 /*
2853 ============
2854 idSIMD_SSE::Dot
2855 
2856  dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
2857 ============
2858 */
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
	// dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3]
	// i.e. a full 4-component dot of the two planes. xmm4/xmm5 hold the
	// constant plane's (x,y,x,y) and (z,d,z,d) pairs for paired multiplies.

// one plane: 4-component dot via pairwise multiply + horizontal add
#define SINGLE_OP(SRC, DEST) \
	__asm	movlps		xmm0,[SRC] \
	__asm	movlps		xmm1,[SRC+8] \
	__asm	mulps		xmm0,xmm4 \
	__asm	mulps		xmm1,xmm5 \
	__asm	addps		xmm0,xmm1 \
	__asm	movaps		xmm1,xmm0 \
	__asm	shufps		xmm1,xmm1,SHUFFLEPS(1,1,1,1) \
	__asm	addss		xmm0,xmm1 \
	__asm	movss		[DEST],xmm0 \
	__asm	add			SRC,16 \
	__asm	add			DEST,4

// two planes at once, same idea with high/low halves
#define DUAL_OP(SRC, DEST) \
	__asm	movlps		xmm0,[SRC] \
	__asm	movlps		xmm1,[SRC+8] \
	__asm	movhps		xmm0,[SRC+16] \
	__asm	movhps		xmm1,[SRC+24] \
	__asm	mulps		xmm0,xmm4 \
	__asm	mulps		xmm1,xmm5 \
	__asm	addps		xmm0,xmm1 \
	__asm	shufps		xmm1,xmm0,SHUFFLEPS(2,0,1,0) \
	__asm	shufps		xmm0,xmm0,SHUFFLEPS(3,1,2,0) \
	__asm	addps		xmm0,xmm1 \
	__asm	movhps		[DEST],xmm0 \
	__asm	add			SRC,32 \
	__asm	add			DEST,8

	__asm {
		mov			edx, dst
		mov			eax, src
		mov			ebx, constant
		mov			ecx, count

		// xmm4 = (x,y,x,y), xmm5 = (z,d,z,d) of the constant plane
		movlps		xmm4, [ebx]
		shufps		xmm4, xmm4, SHUFFLEPS(1,0,1,0)
		movlps		xmm5, [ebx+8]
		shufps		xmm5, xmm5, SHUFFLEPS(1,0,1,0)

		xorps		xmm0, xmm0
		xorps		xmm1, xmm1

	_lpAlignDest:	// peel single ops until dst is 16-byte aligned
		test		edx, 0x0f
		jz			_destAligned
		SINGLE_OP(eax,edx)
		dec			ecx
		jnz			_lpAlignDest
		jmp			_vpExit

	_destAligned:
		push		ecx						// remember count for the tail

		cmp			ecx, 4
		jl			_post

		and			ecx, ~3
		shl			ecx, 2
		lea			eax, [eax+ecx*4]
		add			edx, ecx
		neg			ecx

		// software-pipelined main loop: loads for the next iteration are
		// issued before the previous result is stored
		movlps		xmm0, [eax+ecx*4]
		movhps		xmm0, [eax+ecx*4+16]
		movlps		xmm2, [eax+ecx*4+32]
		movhps		xmm2, [eax+ecx*4+48]
		jmp			_lpStart

		align		16
	_lp:
		prefetchnta	[eax+ecx*4+128]
		addps		xmm1, xmm0
		movlps		xmm0, [eax+ecx*4]
		movhps		xmm0, [eax+ecx*4+16]
		movlps		xmm2, [eax+ecx*4+32]
		movhps		xmm2, [eax+ecx*4+48]
		movaps		[edx+ecx-16],xmm1
	_lpStart:
		movlps		xmm1, [eax+ecx*4+8]
		movhps		xmm1, [eax+ecx*4+24]
		movlps		xmm3, [eax+ecx*4+40]
		movhps		xmm3, [eax+ecx*4+56]
		add			ecx, 16
		mulps		xmm1, xmm5
		mulps		xmm2, xmm4
		mulps		xmm3, xmm5
		addps		xmm2, xmm3				// y3+w3 x3+z3 y2+w2 x2+z2
		mulps		xmm0, xmm4
		addps		xmm0, xmm1				// y1+w1 x1+z1 y0+w0 x0+z0
		movaps		xmm1, xmm0
		shufps		xmm0, xmm2, SHUFFLEPS(2,0,2,0)	// x3+z3 x2+z2 x1+z1 x0+z0
		shufps		xmm1, xmm2, SHUFFLEPS(3,1,3,1)	// y3+w3 y2+w2 y1+w1 y0+w0
		js			_lp
		addps		xmm1, xmm0				// drain the pipeline
		movaps		[edx+ecx-16], xmm1
	_post:
		pop			ecx
		and			ecx, 0x3				// 0-3 planes left
		cmp			ecx, 2
		jl			_post1
		DUAL_OP(eax,edx)
		sub			ecx, 2
	_post1:
		cmp			ecx, 1
		jne			_vpExit
		SINGLE_OP(eax,edx)
	_vpExit:
	}

#undef DUAL_OP
#undef SINGLE_OP

}
2974 
2975 /*
2976 ============
2977 idSIMD_SSE::Dot
2978 
2979  dst[i] = constant.Normal() * src[i].xyz + constant[3];
2980 ============
2981 */
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
	// dst[i] = constant.Normal() * src[i].xyz + constant[3]
	// The asserts pin the struct layout the hand-scheduled loads depend on.
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

	// component index layout of 4 consecutive xyz triples:
	// 0, 1, 2
	// 3, 4, 5
	// 6, 7, 8
	// 9, 10, 11

	__asm {
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3					// eax = count rounded down to multiple of 4

		// broadcast plane normal x/y/z and distance d into xmm4..xmm7
		movss		xmm4, [edi+0]
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm5, [edi+4]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+8]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm7, [edi+12]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

		jz			startVert1
		imul		eax, DRAWVERT_SIZE
		add			esi, eax
		neg			eax

	loopVert4:	// gather the strided xyz fields of 4 verts and transpose to lanes
		movss		xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  X,  X
		movss		xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	//  2,  X,  X,  X
		movhps		xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  0,  1
		movaps		xmm1, xmm0												//  3,  X,  0,  1

		movlps		xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	//  4,  5,  0,  1
		shufps		xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )					//  2,  X,  4,  5

		movss		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  X,  X
		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  6,  7
		shufps		xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )					//  0,  3,  6,  9

		movlps		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 10, 11,  6,  7
		shufps		xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )					//  1,  4,  7, 10

		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 10, 11,  8,  X
		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )					//  2,  5,  8, 11

		add			ecx, 16
		add			eax, 4*DRAWVERT_SIZE

		mulps		xmm0, xmm4
		mulps		xmm1, xmm5
		mulps		xmm2, xmm6
		addps		xmm0, xmm7				// + plane distance
		addps		xmm0, xmm1
		addps		xmm0, xmm2

		movlps		[ecx-16+0], xmm0
		movhps		[ecx-16+8], xmm0
		jl			loopVert4

	startVert1:
		and			edx, 3					// remaining 0-3 verts
		jz			done

	loopVert1:	// scalar tail: one vert per iteration
		movss		xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
		movss		xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
		movss		xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
		mulss		xmm0, xmm4
		mulss		xmm1, xmm5
		mulss		xmm2, xmm6
		addss		xmm0, xmm7				// + plane distance
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, DRAWVERT_SIZE
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loopVert1

	done:
	}
}
3070 
3071 /*
3072 ============
3073 idSIMD_SSE::Dot
3074 
3075  dst[i] = src0[i] * src1[i];
3076 ============
3077 */
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
	// dst[i] = src0[i] * src1[i]  (element-wise 3D dot products)
	__asm
	{
		mov			eax, count
		mov			edi, src0
		mov			edx, eax
		mov			esi, src1
		mov			ecx, dst
		and			eax, ~3					// eax = count rounded down to multiple of 4

		jz			done4
		imul		eax, 12					// 12 bytes per idVec3
		add			edi, eax
		add			esi, eax
		neg			eax

	loop4:		// load 4 vectors from each array (component indices shown)
		movlps		xmm0, [esi+eax]		// 0, 1, X, X
		movlps		xmm3, [edi+eax]		// 0, 1, X, X
		movlps		xmm1, [esi+eax+8]	// 2, 3, X, X
		movlps		xmm4, [edi+eax+8]	// 2, 3, X, X
		movhps		xmm0, [esi+eax+24]	// 0, 1, 6, 7
		movhps		xmm3, [edi+eax+24]	// 0, 1, 6, 7
		movhps		xmm1, [esi+eax+32]	// 2, 3, 8, 9
		movhps		xmm4, [edi+eax+32]	// 2, 3, 8, 9
		movlps		xmm2, [esi+eax+16]	// 4, 5, X, X
		movlps		xmm5, [edi+eax+16]	// 4, 5, X, X
		movhps		xmm2, [esi+eax+40]	// 4, 5, 10, 11
		movhps		xmm5, [edi+eax+40]	// 4, 5, 10, 11

		add			ecx, 16
		add			eax, 48

		// multiply first, then shuffle the products into x/y/z lanes and sum
		mulps		xmm0, xmm3
		mulps		xmm1, xmm4
		mulps		xmm2, xmm5
		movaps		xmm7, xmm0
		shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )	// 0, 6, 3, 9
		shufps		xmm0, xmm2, R_SHUFFLEPS( 1, 3, 0, 2 )	// 1, 7, 4, 10
		shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )	// 2, 8, 5, 11
		addps		xmm7, xmm0
		addps		xmm7, xmm1
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 2, 1, 3 )	// restore element order

		movlps		[ecx-16+0], xmm7
		movhps		[ecx-16+8], xmm7
		jl			loop4

	done4:
		and			edx, 3					// remaining 0-3 vectors
		jz			done1

	loop1:		// scalar tail: one dot product per iteration
		movss		xmm0, [esi+eax+0]
		movss		xmm3, [edi+eax+0]
		movss		xmm1, [esi+eax+4]
		movss		xmm4, [edi+eax+4]
		movss		xmm2, [esi+eax+8]
		movss		xmm5, [edi+eax+8]
		mulss		xmm0, xmm3
		mulss		xmm1, xmm4
		mulss		xmm2, xmm5
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, 12
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loop1

	done1:
	}
}
3151 
3152 /*
3153 ============
3154 idSIMD_SSE::Dot
3155 
3156  dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
3157 ============
3158 */
void VPCALL idSIMD_SSE::Dot( float &dot, const float *src1, const float *src2, const int count ) {
	// dot = sum of src1[i] * src2[i] over all count elements.
	// Counts 0-3 are handled in plain C; larger counts accumulate 4 partial
	// sums in xmm0 and reduce at the end.
	// NOTE: the separate __asm blocks below rely on ecx/edx/xmm0 surviving
	// between them, which is MSVC inline-assembler-specific behavior.
	switch( count ) {
		case 0:
			dot = 0.0f;
			return;
		case 1:
			dot = src1[0] * src2[0];
			return;
		case 2:
			dot = src1[0] * src2[0] + src1[1] * src2[1];
			return;
		case 3:
			dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2];
			return;
		default:
			__asm {
				mov			ecx, src1
				mov			edx, src2
				mov			eax, ecx
				or			eax, edx
				and			eax, 15
				jz			alignedDot
				// unaligned
				mov			eax, count
				shr			eax, 2
				shl			eax, 4			// eax = (count / 4) * 16 bytes
				add			ecx, eax
				add			edx, eax
				neg			eax
				movups		xmm0, [ecx+eax]
				movups		xmm1, [edx+eax]
				mulps		xmm0, xmm1
				add			eax, 16
				jz			doneDot
			loopUnalignedDot:
				movups		xmm1, [ecx+eax]
				movups		xmm2, [edx+eax]
				mulps		xmm1, xmm2
				addps		xmm0, xmm1
				add			eax, 16
				jl			loopUnalignedDot
				jmp			doneDot
				// aligned
			alignedDot:
				mov			eax, count
				shr			eax, 2
				shl			eax, 4
				add			ecx, eax
				add			edx, eax
				neg			eax
				movaps		xmm0, [ecx+eax]
				movaps		xmm1, [edx+eax]
				mulps		xmm0, xmm1
				add			eax, 16
				jz			doneDot
			loopAlignedDot:
				movaps		xmm1, [ecx+eax]
				movaps		xmm2, [edx+eax]
				mulps		xmm1, xmm2
				addps		xmm0, xmm1
				add			eax, 16
				jl			loopAlignedDot
			doneDot:
			}
			// fold the 1-3 leftover products into the partial sums;
			// ecx/edx still point at the tail elements
			switch( count & 3 ) {
				case 1:
					__asm {
						movss	xmm1, [ecx]
						movss	xmm2, [edx]
						mulss	xmm1, xmm2
						addss	xmm0, xmm1
					}
					break;
				case 2:
					__asm {
						xorps	xmm2, xmm2
						movlps	xmm1, [ecx]
						movlps	xmm2, [edx]
						mulps	xmm1, xmm2
						addps	xmm0, xmm1
					}
					break;
				case 3:
					__asm {
						movss	xmm1, [ecx]
						movhps	xmm1, [ecx+4]
						movss	xmm2, [edx]
						movhps	xmm2, [edx+4]
						mulps	xmm1, xmm2
						addps	xmm0, xmm1
					}
					break;
			}
			// horizontal add of the four partial sums, store through &dot
			__asm {
				movhlps		xmm1, xmm0
				addps		xmm0, xmm1
				movaps		xmm1, xmm0
				shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
				addss		xmm0, xmm1
				mov			eax, dot
				movss		[eax], xmm0
			}
			return;
	}
}
3264 
3265 //
3266 // cmpeqps == Equal
3267 // cmpneqps != Not Equal
3268 // cmpltps < Less Than
3269 // cmpnltps >= Not Less Than
3270 // cmpnleps > Not Less Or Equal
3271 //
// movmskps mask post-processing hooks for the compare macros below:
// FLIP inverts the low compare-result bits (used to synthesize comparisons
// SSE has no direct predicate for), NOFLIP expands to nothing.
#define FLIP not al
#define NOFLIP
3274 
// Compares each SRC0[i] against the scalar CONSTANT and writes one byte per
// element (0 or 1) into DST. CMPSIMD is the packed SSE compare instruction,
// CMP the equivalent C operator for the scalar pre/post elements, and DOFLIP
// either FLIP or NOFLIP to invert the movmskps result. The bit-twiddling
// after movmskps spreads the 4-bit lane mask into 4 separate 0/1 bytes.
// The scalar fallback compares against a double ('c') for full float precision.
#define COMPARECONSTANT( DST, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP ) \
	int i, cnt, pre, post; \
	float *aligned; \
	\
	/* if the float array is not aligned on a 4 byte boundary */ \
	if ( ((int) SRC0) & 3 ) { \
		/* unaligned memory access */ \
		pre = 0; \
		cnt = COUNT >> 2; \
		post = COUNT - (cnt<<2); \
		__asm	mov			edx, cnt \
		__asm	test		edx, edx \
		__asm	je			doneCmp \
		__asm	push		ebx \
		__asm	neg			edx \
		__asm	mov			esi, SRC0 \
		__asm	prefetchnta	[esi+64] \
		__asm	movss		xmm1, CONSTANT \
		__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
		__asm	mov			edi, DST \
		__asm	mov			ecx, 0x01010101 \
		__asm	loopNA: \
		__asm	movups		xmm0, [esi] \
		__asm	prefetchnta	[esi+128] \
		__asm	CMPSIMD		xmm0, xmm1 \
		__asm	movmskps	eax, xmm0 \
		__asm	DOFLIP \
		__asm	mov			ah, al \
		__asm	shr			ah, 1 \
		__asm	mov			bx, ax \
		__asm	shl			ebx, 14 \
		__asm	mov			bx, ax \
		__asm	and			ebx, ecx \
		__asm	mov			dword ptr [edi], ebx \
		__asm	add			esi, 16 \
		__asm	add			edi, 4 \
		__asm	inc			edx \
		__asm	jl			loopNA \
		__asm	pop			ebx \
	} \
	else { \
		/* aligned memory access */ \
		aligned = (float *) ((((int) SRC0) + 15) & ~15); \
		if ( (int)aligned > ((int)src0) + COUNT ) { \
			pre = COUNT; \
			post = 0; \
		} \
		else { \
			pre = aligned - SRC0; \
			cnt = (COUNT - pre) >> 2; \
			post = COUNT - pre - (cnt<<2); \
			__asm	mov			edx, cnt \
			__asm	test		edx, edx \
			__asm	je			doneCmp \
			__asm	push		ebx \
			__asm	neg			edx \
			__asm	mov			esi, aligned \
			__asm	prefetchnta	[esi+64] \
			__asm	movss		xmm1, CONSTANT \
			__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
			__asm	mov			edi, DST \
			__asm	add			edi, pre \
			__asm	mov			ecx, 0x01010101 \
			__asm	loopA: \
			__asm	movaps		xmm0, [esi] \
			__asm	prefetchnta	[esi+128] \
			__asm	CMPSIMD		xmm0, xmm1 \
			__asm	movmskps	eax, xmm0 \
			__asm	DOFLIP \
			__asm	mov			ah, al \
			__asm	shr			ah, 1 \
			__asm	mov			bx, ax \
			__asm	shl			ebx, 14 \
			__asm	mov			bx, ax \
			__asm	and			ebx, ecx \
			__asm	mov			dword ptr [edi], ebx \
			__asm	add			esi, 16 \
			__asm	add			edi, 4 \
			__asm	inc			edx \
			__asm	jl			loopA \
			__asm	pop			ebx \
		} \
	} \
	doneCmp: \
	double c = constant; \
	for ( i = 0; i < pre; i++ ) { \
		dst[i] = src0[i] CMP c; \
	} \
	for ( i = count - post; i < count; i++ ) { \
		dst[i] = src0[i] CMP c; \
	}
3366 
// COMPAREBITCONSTANT: compares COUNT floats in SRC0 against the scalar CONSTANT
// and OR's the boolean result ( SRC0[i] CMP CONSTANT ) into bit BITNUM of each
// byte of DST.  CMPSIMD is the SSE packed-compare instruction implementing CMP
// (or its negation), and DOFLIP (NOFLIP/FLIP, defined earlier in this file)
// optionally inverts the 4-bit movmskps mask when CMPSIMD computes the opposite
// condition.  Elements before the first 16-byte-aligned address (pre) and the
// up-to-3 trailing elements (post) are handled by the scalar loops at the end.
// NOTE(review): the body references the literal argument names 'bitNum',
// 'constant', 'src0', 'dst' and 'count' instead of the macro parameters in a
// few places (MSVC __asm cannot expand macro parameters everywhere), so this
// macro only works when invoked with arguments of exactly those names — which
// all callers below do.
#define COMPAREBITCONSTANT( DST, BITNUM, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP )	\
	int i, cnt, pre, post;	\
	float *aligned;	\
	\
	/* if the float array is not aligned on a 4 byte boundary */	\
	if ( ((int) SRC0) & 3 ) {	\
		/* unaligned memory access */	\
		pre = 0;	\
		cnt = COUNT >> 2;	\
		post = COUNT - (cnt<<2);	\
		__asm	mov		edx, cnt	\
		__asm	test	edx, edx	\
		__asm	je		doneCmp	\
		__asm	push	ebx	\
		__asm	neg		edx	\
		__asm	mov		esi, SRC0	\
		__asm	prefetchnta	[esi+64]	\
		__asm	movss	xmm1, CONSTANT	\
		__asm	shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )	\
		__asm	mov		edi, DST	\
		__asm	mov		cl, bitNum	\
		__asm	loopNA:	\
		__asm	movups	xmm0, [esi]	\
		__asm	prefetchnta	[esi+128]	\
		__asm	CMPSIMD	xmm0, xmm1	\
		__asm	movmskps	eax, xmm0	\
		__asm	DOFLIP	\
		/* spread the 4 mask bits in al to bit 0 of the 4 bytes of ebx: */	\
		/* ah = mask>>1, then ebx = (ax<<14)|ax places mask bits 0..3 at */	\
		/* ebx bits 0, 8, 16, 24 before masking with 0x01010101 */	\
		__asm	mov		ah, al	\
		__asm	shr		ah, 1	\
		__asm	mov		bx, ax	\
		__asm	shl		ebx, 14	\
		__asm	mov		bx, ax	\
		__asm	and		ebx, 0x01010101	\
		/* shift each result bit to BITNUM and OR into the 4 dest bytes */	\
		__asm	shl		ebx, cl	\
		__asm	or		ebx, dword ptr [edi]	\
		__asm	mov		dword ptr [edi], ebx	\
		__asm	add		esi, 16	\
		__asm	add		edi, 4	\
		__asm	inc		edx	\
		__asm	jl		loopNA	\
		__asm	pop		ebx	\
	}	\
	else {	\
		/* aligned memory access */	\
		aligned = (float *) ((((int) SRC0) + 15) & ~15);	\
		if ( (int)aligned > ((int)src0) + COUNT ) {	\
			/* too few elements to reach the aligned address: all scalar */	\
			pre = COUNT;	\
			post = 0;	\
		}	\
		else {	\
			pre = aligned - SRC0;	\
			cnt = (COUNT - pre) >> 2;	\
			post = COUNT - pre - (cnt<<2);	\
			__asm	mov		edx, cnt	\
			__asm	test	edx, edx	\
			__asm	je		doneCmp	\
			__asm	push	ebx	\
			__asm	neg		edx	\
			__asm	mov		esi, aligned	\
			__asm	prefetchnta	[esi+64]	\
			__asm	movss	xmm1, CONSTANT	\
			__asm	shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )	\
			__asm	mov		edi, DST	\
			__asm	add		edi, pre	\
			__asm	mov		cl, bitNum	\
			__asm	loopA:	\
			__asm	movaps	xmm0, [esi]	\
			__asm	prefetchnta	[esi+128]	\
			__asm	CMPSIMD	xmm0, xmm1	\
			__asm	movmskps	eax, xmm0	\
			__asm	DOFLIP	\
			__asm	mov		ah, al	\
			__asm	shr		ah, 1	\
			__asm	mov		bx, ax	\
			__asm	shl		ebx, 14	\
			__asm	mov		bx, ax	\
			__asm	and		ebx, 0x01010101	\
			__asm	shl		ebx, cl	\
			__asm	or		ebx, dword ptr [edi]	\
			__asm	mov		dword ptr [edi], ebx	\
			__asm	add		esi, 16	\
			__asm	add		edi, 4	\
			__asm	inc		edx	\
			__asm	jl		loopA	\
			__asm	pop		ebx	\
		}	\
	}	\
	doneCmp:	\
	/* scalar handling of the pre/post elements the SIMD loops skipped */	\
	float c = constant;	\
	for ( i = 0; i < pre; i++ ) {	\
		dst[i] |= ( src0[i] CMP c ) << BITNUM;	\
	}	\
	for ( i = count - post; i < count; i++ ) {	\
		dst[i] |= ( src0[i] CMP c ) << BITNUM;	\
	}
3462 
3463 /*
3464 ============
3465 idSIMD_SSE::CmpGT
3466 
3467  dst[i] = src0[i] > constant;
3468 ============
3469 */
3470 void VPCALL idSIMD_SSE::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
3471  COMPARECONSTANT( dst, src0, constant, count, >, cmpnleps, NOFLIP )
3472 }
3473 
3474 /*
3475 ============
3476 idSIMD_SSE::CmpGT
3477 
3478  dst[i] |= ( src0[i] > constant ) << bitNum;
3479 ============
3480 */
3481 void VPCALL idSIMD_SSE::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
3482  COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >, cmpnleps, NOFLIP )
3483 }
3484 
3485 /*
3486 ============
3487 idSIMD_SSE::CmpGE
3488 
3489  dst[i] = src0[i] >= constant;
3490 ============
3491 */
3492 void VPCALL idSIMD_SSE::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
3493  COMPARECONSTANT( dst, src0, constant, count, >=, cmpnltps, NOFLIP )
3494 }
3495 
3496 /*
3497 ============
3498 idSIMD_SSE::CmpGE
3499 
3500  dst[i] |= ( src0[i] >= constant ) << bitNum;
3501 ============
3502 */
3503 void VPCALL idSIMD_SSE::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
3504  COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >=, cmpnltps, NOFLIP )
3505 }
3506 
3507 /*
3508 ============
3509 idSIMD_SSE::CmpLT
3510 
3511  dst[i] = src0[i] < constant;
3512 ============
3513 */
3514 void VPCALL idSIMD_SSE::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
3515  COMPARECONSTANT( dst, src0, constant, count, <, cmpltps, NOFLIP )
3516 }
3517 
3518 /*
3519 ============
3520 idSIMD_SSE::CmpLT
3521 
3522  dst[i] |= ( src0[i] < constant ) << bitNum;
3523 ============
3524 */
3525 void VPCALL idSIMD_SSE::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
3526  COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <, cmpltps, NOFLIP )
3527 }
3528 
3529 /*
3530 ============
3531 idSIMD_SSE::CmpLE
3532 
3533  dst[i] = src0[i] <= constant;
3534 ============
3535 */
3536 void VPCALL idSIMD_SSE::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
3537  COMPARECONSTANT( dst, src0, constant, count, <=, cmpnleps, FLIP )
3538 }
3539 
3540 /*
3541 ============
3542 idSIMD_SSE::CmpLE
3543 
3544  dst[i] |= ( src0[i] <= constant ) << bitNum;
3545 ============
3546 */
3547 void VPCALL idSIMD_SSE::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
3548  COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <=, cmpnleps, FLIP )
3549 }
3550 
3551 /*
3552 ============
3553 idSIMD_SSE::MinMax
3554 ============
3555 */
3556 void VPCALL idSIMD_SSE::MinMax( float &min, float &max, const float *src, const int count ) {
3557  int i, pre, post;
3558 
3559  min = idMath::INFINITY; max = -idMath::INFINITY;
3560 
3561  __asm
3562  {
3563  push ebx
3564  mov eax, min
3565  mov ebx, max
3566  movss xmm0, [eax]
3567  movss xmm1, [ebx]
3568  shufps xmm0, xmm0, 0
3569  shufps xmm1, xmm1, 0
3570 
3571  KFLOATINITS( src, count, pre, post )
3572  and eax, 15
3573  jz lpA
3574  jmp lpNA
3575  align 16
3576 lpNA:
3577  movups xmm2, [edx+ebx]
3578  movups xmm3, [edx+ebx+16]
3579  minps xmm0, xmm2
3580  maxps xmm1, xmm2
3581  prefetchnta [edx+ebx+64]
3582  minps xmm0, xmm3
3583  maxps xmm1, xmm3
3584  add ebx, 16*2
3585  jl lpNA
3586  jmp done2
3587 lpA:
3588  movaps xmm2, [edx+ebx]
3589  movaps xmm3, [edx+ebx+16]
3590  minps xmm0, xmm2
3591  maxps xmm1, xmm2
3592  prefetchnta [edx+ebx+64]
3593  minps xmm0, xmm3
3594  maxps xmm1, xmm3
3595  add ebx, 16*2
3596  jl lpA
3597  jmp done2
3598  align 16
3599 done2:
3600  movaps xmm2, xmm0
3601  movaps xmm3, xmm1
3602  shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
3603  shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
3604  minss xmm0, xmm2
3605  maxss xmm1, xmm3
3606  shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
3607  shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
3608  minss xmm0, xmm2
3609  maxss xmm1, xmm3
3610  shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
3611  shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
3612  minss xmm0, xmm2
3613  maxss xmm1, xmm3
3614  mov eax, min
3615  mov ebx, max
3616  movss [eax], xmm0
3617  movss [ebx], xmm1
3618 done:
3619  pop ebx
3620  }
3621 
3622  for ( i = 0; i < pre; i++ ) {
3623  float tmp = src[i];
3624  if ( tmp > max ) {
3625  max = tmp;
3626  }
3627  if ( tmp < min ) {
3628  min = tmp;
3629  }
3630  }
3631  for ( i = count - post; i < count; i++ ) {
3632  float tmp = src[i];
3633  if ( tmp > max ) {
3634  max = tmp;
3635  }
3636  if ( tmp < min ) {
3637  min = tmp;
3638  }
3639  }
3640 }
3641 
3642 /*
3643 ============
3644 idSIMD_SSE::MinMax
3645 ============
3646 */
3647 void VPCALL idSIMD_SSE::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
3648  __asm {
3649  mov eax, count
3650  test eax, eax
3651  movss xmm0, idMath::INFINITY
3652  xorps xmm1, xmm1
3653  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
3654  subps xmm1, xmm0
3655  jz done
3656  mov ecx, eax
3657  and ecx, 1
3658  mov esi, src
3659  jz startLoop
3660  movlps xmm2, [esi]
3661  shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
3662  dec eax
3663  add esi, 2*4
3664  minps xmm0, xmm2
3665  maxps xmm1, xmm2
3666  startLoop:
3667  imul eax, 2*4
3668  add esi, eax
3669  neg eax
3670  loopVert:
3671  movlps xmm2, [esi+eax]
3672  movhps xmm2, [esi+eax+8]
3673  add eax, 4*4
3674  minps xmm0, xmm2
3675  maxps xmm1, xmm2
3676  jl loopVert
3677  done:
3678  movaps xmm2, xmm0
3679  shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
3680  minps xmm0, xmm2
3681  mov esi, min
3682  movlps [esi], xmm0
3683  movaps xmm3, xmm1
3684  shufps xmm3, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
3685  maxps xmm1, xmm3
3686  mov edi, max
3687  movlps [edi], xmm1
3688  }
3689 }
3690 
3691 /*
3692 ============
3693 idSIMD_SSE::MinMax
3694 ============
3695 */
3696 void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
3697  __asm {
3698 
3699  movss xmm0, idMath::INFINITY
3700  xorps xmm1, xmm1
3701  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
3702  subps xmm1, xmm0
3703  movaps xmm2, xmm0
3704  movaps xmm3, xmm1
3705 
3706  mov esi, src
3707  mov eax, count
3708  and eax, ~3
3709  jz done4
3710  imul eax, 12
3711  add esi, eax
3712  neg eax
3713 
3714  loop4:
3715 // prefetchnta [esi+4*12]
3716 
3717  movss xmm4, [esi+eax+0*12+8]
3718  movhps xmm4, [esi+eax+0*12+0]
3719  minps xmm0, xmm4
3720  maxps xmm1, xmm4
3721 
3722  movss xmm5, [esi+eax+1*12+0]
3723  movhps xmm5, [esi+eax+1*12+4]
3724  minps xmm2, xmm5
3725  maxps xmm3, xmm5
3726 
3727  movss xmm6, [esi+eax+2*12+8]
3728  movhps xmm6, [esi+eax+2*12+0]
3729  minps xmm0, xmm6
3730  maxps xmm1, xmm6
3731 
3732  movss xmm7, [esi+eax+3*12+0]
3733  movhps xmm7, [esi+eax+3*12+4]
3734  minps xmm2, xmm7
3735  maxps xmm3, xmm7
3736 
3737  add eax, 4*12
3738  jl loop4
3739 
3740  done4:
3741  mov eax, count
3742  and eax, 3
3743  jz done1
3744  imul eax, 12
3745  add esi, eax
3746  neg eax
3747 
3748  loop1:
3749  movss xmm4, [esi+eax+0*12+8]
3750  movhps xmm4, [esi+eax+0*12+0]
3751  minps xmm0, xmm4
3752  maxps xmm1, xmm4
3753 
3754  add eax, 12
3755  jl loop1
3756 
3757  done1:
3758  shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
3759  shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
3760  minps xmm0, xmm2
3761  maxps xmm1, xmm3
3762  mov esi, min
3763  movhps [esi], xmm0
3764  movss [esi+8], xmm0
3765  mov edi, max
3766  movhps [edi], xmm1
3767  movss [edi+8], xmm1
3768  }
3769 }
3770 
3771 /*
3772 ============
3773 idSIMD_SSE::MinMax
3774 ============
3775 */
3776 void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
3777 
3778  assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
3779  assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
3780 
3781  __asm {
3782 
3783  movss xmm0, idMath::INFINITY
3784  xorps xmm1, xmm1
3785  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
3786  subps xmm1, xmm0
3787  movaps xmm2, xmm0
3788  movaps xmm3, xmm1
3789 
3790  mov esi, src
3791  mov eax, count
3792  and eax, ~3
3793  jz done4
3794  imul eax, DRAWVERT_SIZE
3795  add esi, eax
3796  neg eax
3797 
3798  loop4:
3799 // prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
3800 
3801  movss xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
3802  movhps xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
3803  minps xmm0, xmm4
3804  maxps xmm1, xmm4
3805 
3806  movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
3807  movhps xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
3808  minps xmm2, xmm5
3809  maxps xmm3, xmm5
3810 
3811  movss xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
3812  movhps xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
3813  minps xmm0, xmm6
3814  maxps xmm1, xmm6
3815 
3816  movss xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
3817  movhps xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
3818  minps xmm2, xmm7
3819  maxps xmm3, xmm7
3820 
3821  add eax, 4*DRAWVERT_SIZE
3822  jl loop4
3823 
3824  done4:
3825  mov eax, count
3826  and eax, 3
3827  jz done1
3828  imul eax, DRAWVERT_SIZE
3829  add esi, eax
3830  neg eax
3831 
3832  loop1:
3833  movss xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
3834  movhps xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
3835  minps xmm0, xmm4
3836  maxps xmm1, xmm4
3837 
3838  add eax, DRAWVERT_SIZE
3839  jl loop1
3840 
3841  done1:
3842  shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
3843  shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
3844  minps xmm0, xmm2
3845  maxps xmm1, xmm3
3846  mov esi, min
3847  movhps [esi], xmm0
3848  movss [esi+8], xmm0
3849  mov edi, max
3850  movhps [edi], xmm1
3851  movss [edi+8], xmm1
3852  }
3853 }
3854 
3855 /*
3856 ============
3857 idSIMD_SSE::MinMax
3858 ============
3859 */
3860 void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
3861 
3862  assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
3863  assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
3864 
3865  __asm {
3866 
3867  movss xmm0, idMath::INFINITY
3868  xorps xmm1, xmm1
3869  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
3870  subps xmm1, xmm0
3871  movaps xmm2, xmm0
3872  movaps xmm3, xmm1
3873 
3874  mov edi, indexes
3875  mov esi, src
3876  mov eax, count
3877  and eax, ~3
3878  jz done4
3879  shl eax, 2
3880  add edi, eax
3881  neg eax
3882 
3883  loop4:
3884 // prefetchnta [edi+128]
3885 // prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
3886 
3887  mov edx, [edi+eax+0]
3888  imul edx, DRAWVERT_SIZE
3889  movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
3890  movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
3891  minps xmm0, xmm4
3892  maxps xmm1, xmm4
3893 
3894  mov edx, [edi+eax+4]
3895  imul edx, DRAWVERT_SIZE
3896  movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
3897  movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
3898  minps xmm2, xmm5
3899  maxps xmm3, xmm5
3900 
3901  mov edx, [edi+eax+8]
3902  imul edx, DRAWVERT_SIZE
3903  movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
3904  movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
3905  minps xmm0, xmm6
3906  maxps xmm1, xmm6
3907 
3908  mov edx, [edi+eax+12]
3909  imul edx, DRAWVERT_SIZE
3910  movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
3911  movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
3912  minps xmm2, xmm7
3913  maxps xmm3, xmm7
3914 
3915  add eax, 4*4
3916  jl loop4
3917 
3918  done4:
3919  mov eax, count
3920  and eax, 3
3921  jz done1
3922  shl eax, 2
3923  add edi, eax
3924  neg eax
3925 
3926  loop1:
3927  mov edx, [edi+eax+0]
3928  imul edx, DRAWVERT_SIZE;
3929  movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
3930  movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
3931  minps xmm0, xmm4
3932  maxps xmm1, xmm4
3933 
3934  add eax, 4
3935  jl loop1
3936 
3937  done1:
3938  shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
3939  shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
3940  minps xmm0, xmm2
3941  maxps xmm1, xmm3
3942  mov esi, min
3943  movhps [esi], xmm0
3944  movss [esi+8], xmm0
3945  mov edi, max
3946  movhps [edi], xmm1
3947  movss [edi+8], xmm1
3948  }
3949 }
3950 
3951 /*
3952 ============
3953 idSIMD_SSE::Clamp
3954 ============
3955 */
3956 void VPCALL idSIMD_SSE::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
3957  int i, pre, post;
3958 
3959  __asm
3960  {
3961  movss xmm0,min
3962  movss xmm1,max
3963  shufps xmm0,xmm0,0
3964  shufps xmm1,xmm1,0
3965 
3966  KFLOATINITDS( dst, src, count, pre, post )
3967  and eax,15
3968  jne lpNA
3969  jmp lpA
3970  align 16
3971 lpA:
3972  movaps xmm2,[edx+ebx]
3973  movaps xmm3,[edx+ebx+16]
3974  maxps xmm2,xmm0
3975  maxps xmm3,xmm0
3976  prefetchnta [edx+ebx+64]
3977  minps xmm2,xmm1
3978  minps xmm3,xmm1
3979  movaps [edi+ebx],xmm2
3980  movaps [edi+ebx+16],xmm3
3981  add ebx,16*2
3982  jl lpA
3983  jmp done
3984 
3985  align 16
3986 lpNA:
3987  movups xmm2,[edx+ebx]
3988  movups xmm3,[edx+ebx+16]
3989  maxps xmm2,xmm0
3990  maxps xmm3,xmm0
3991  prefetchnta [edx+ebx+64]
3992  minps xmm2,xmm1
3993  minps xmm3,xmm1
3994  movaps [edi+ebx],xmm2
3995  movaps [edi+ebx+16],xmm3
3996  add ebx,16*2
3997  jl lpNA
3998 done:
3999  }
4000 
4001  for ( i = 0; i < pre; i++ ) {
4002  if ( src[i] < min )
4003  dst[i] = min;
4004  else if ( src[i] > max )
4005  dst[i] = max;
4006  else
4007  dst[i] = src[i];
4008  }
4009 
4010  for( i = count - post; i < count; i++ ) {
4011  if ( src[i] < min )
4012  dst[i] = min;
4013  else if ( src[i] > max )
4014  dst[i] = max;
4015  else
4016  dst[i] = src[i];
4017  }
4018 }
4019 
4020 /*
4021 ============
4022 idSIMD_SSE::ClampMin
4023 ============
4024 */
4025 void VPCALL idSIMD_SSE::ClampMin( float *dst, const float *src, const float min, const int count ) {
4026  int i, pre, post;
4027 
4028  __asm
4029  {
4030  movss xmm0,min
4031  shufps xmm0,xmm0,0
4032 
4033  KFLOATINITDS( dst, src, count, pre, post )
4034  and eax,15
4035  jne lpNA
4036  jmp lpA
4037  align 16
4038 lpA:
4039  movaps xmm2,[edx+ebx]
4040  movaps xmm3,[edx+ebx+16]
4041  maxps xmm2,xmm0
4042  prefetchnta [edx+ebx+64]
4043  maxps xmm3,xmm0
4044  movaps [edi+ebx],xmm2
4045  movaps [edi+ebx+16],xmm3
4046  add ebx,16*2
4047  jl lpA
4048  jmp done
4049 
4050  align 16
4051 lpNA:
4052  movups xmm2,[edx+ebx]
4053  movups xmm3,[edx+ebx+16]
4054  maxps xmm2,xmm0
4055  prefetchnta [edx+ebx+64]
4056  maxps xmm3,xmm0
4057  movaps [edi+ebx],xmm2
4058  movaps [edi+ebx+16],xmm3
4059  add ebx,16*2
4060  jl lpNA
4061 done:
4062  }
4063 
4064  for( i = 0; i < pre; i++ ) {
4065  if ( src[i] < min )
4066  dst[i] = min;
4067  else
4068  dst[i] = src[i];
4069  }
4070  for( i = count - post; i < count; i++ ) {
4071  if ( src[i] < min )
4072  dst[i] = min;
4073  else
4074  dst[i] = src[i];
4075  }
4076 }
4077 
4078 /*
4079 ============
4080 idSIMD_SSE::ClampMax
4081 ============
4082 */
4083 void VPCALL idSIMD_SSE::ClampMax( float *dst, const float *src, const float max, const int count ) {
4084  int i, pre, post;
4085 
4086  __asm
4087  {
4088  movss xmm1,max
4089  shufps xmm1,xmm1,0
4090 
4091  KFLOATINITDS( dst, src, count, pre, post )
4092  and eax,15
4093  jne lpNA
4094  jmp lpA
4095  align 16
4096 lpA:
4097  movaps xmm2,[edx+ebx]
4098  movaps xmm3,[edx+ebx+16]
4099  minps xmm2,xmm1
4100  prefetchnta [edx+ebx+64]
4101  minps xmm3,xmm1
4102  movaps [edi+ebx],xmm2
4103  movaps [edi+ebx+16],xmm3
4104  add ebx,16*2
4105  jl lpA
4106  jmp done
4107 
4108  align 16
4109 lpNA:
4110  movups xmm2,[edx+ebx]
4111  movups xmm3,[edx+ebx+16]
4112  minps xmm2,xmm1
4113  prefetchnta [edx+ebx+64]
4114  minps xmm3,xmm1
4115  movaps [edi+ebx],xmm2
4116  movaps [edi+ebx+16],xmm3
4117  add ebx,16*2
4118  jl lpNA
4119 done:
4120  }
4121 
4122  for( i = 0; i < pre; i++ ) {
4123  if ( src[i] > max )
4124  dst[i] = max;
4125  else
4126  dst[i] = src[i];
4127  }
4128 
4129  for( i = count - post; i < count; i++ ) {
4130  if ( src[i] > max )
4131  dst[i] = max;
4132  else
4133  dst[i] = src[i];
4134  }
4135 }
4136 
4137 /*
4138 ============
4139 idSIMD_SSE::Zero16
4140 ============
4141 */
4142 void VPCALL idSIMD_SSE::Zero16( float *dst, const int count ) {
4143  __asm {
4144  mov edx, dst
4145  mov eax, count
4146  add eax, 3
4147  shr eax, 2
4148  jz doneZero16
4149  shl eax, 4
4150  add edx, eax
4151  neg eax
4152  xorps xmm0, xmm0
4153  loopZero16:
4154  movaps [edx+eax], xmm0
4155  add eax, 16
4156  jl loopZero16
4157  doneZero16:
4158  }
4159 }
4160 
4161 /*
4162 ============
4163 idSIMD_SSE::Negate16
4164 ============
4165 */
4166 void VPCALL idSIMD_SSE::Negate16( float *dst, const int count ) {
4167  __asm {
4168  mov edx, dst
4169  mov eax, count
4170  add eax, 3
4171  shr eax, 2
4172  jz doneNegate16
4173  shl eax, 4
4174  add edx, eax
4175  neg eax
4176  movss xmm0, SIMD_SP_signBitMask
4177  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
4178  loopNegate16:
4179  movaps xmm1, [edx+eax]
4180  xorps xmm1, xmm0
4181  movaps [edx+eax], xmm1
4182  add eax, 16
4183  jl loopNegate16
4184  doneNegate16:
4185  }
4186 }
4187 
4188 /*
4189 ============
4190 idSIMD_SSE::Copy16
4191 ============
4192 */
4193 void VPCALL idSIMD_SSE::Copy16( float *dst, const float *src, const int count ) {
4194  __asm {
4195  mov ecx, src
4196  mov edx, dst
4197  mov eax, count
4198  add eax, 3
4199  shr eax, 2
4200  jz doneCopy16
4201  shl eax, 4
4202  add ecx, eax
4203  add edx, eax
4204  neg eax
4205  loopCopy16:
4206  movaps xmm0, [ecx+eax]
4207  movaps [edx+eax], xmm0
4208  add eax, 16
4209  jl loopCopy16
4210  doneCopy16:
4211  }
4212 }
4213 
4214 /*
4215 ============
4216 idSIMD_SSE::Add16
4217 ============
4218 */
4219 void VPCALL idSIMD_SSE::Add16( float *dst, const float *src1, const float *src2, const int count ) {
4220  __asm {
4221  mov ecx, src1
4222  mov edx, src2
4223  mov esi, dst
4224  mov eax, count
4225  add eax, 3
4226  shr eax, 2
4227  jz doneAdd16
4228  shl eax, 4
4229  add esi, eax
4230  add ecx, eax
4231  add edx, eax
4232  neg eax
4233  loopAdd16:
4234  movaps xmm0, [ecx+eax]
4235  addps xmm0, [edx+eax]
4236  movaps [esi+eax], xmm0
4237  add eax, 16
4238  jl loopAdd16
4239  doneAdd16:
4240  }
4241 }
4242 
4243 /*
4244 ============
4245 idSIMD_SSE::Sub16
4246 ============
4247 */
4248 void VPCALL idSIMD_SSE::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
4249  __asm {
4250  mov ecx, src1
4251  mov edx, src2
4252  mov esi, dst
4253  mov eax, count
4254  add eax, 3
4255  shr eax, 2
4256  jz doneSub16
4257  shl eax, 4
4258  add esi, eax
4259  add ecx, eax
4260  add edx, eax
4261  neg eax
4262  loopSub16:
4263  movaps xmm0, [ecx+eax]
4264  subps xmm0, [edx+eax]
4265  movaps [esi+eax], xmm0
4266  add eax, 16
4267  jl loopSub16
4268  doneSub16:
4269  }
4270 }
4271 
4272 /*
4273 ============
4274 idSIMD_SSE::Mul16
4275 ============
4276 */
4277 void VPCALL idSIMD_SSE::Mul16( float *dst, const float *src1, const float constant, const int count ) {
4278  __asm {
4279  mov ecx, dst
4280  mov edx, src1
4281  mov eax, count
4282  add eax, 3
4283  shr eax, 2
4284  jz doneMulScalar16
4285  movss xmm1, constant
4286  shl eax, 4
4287  add ecx, eax
4288  add edx, eax
4289  neg eax
4290  shufps xmm1, xmm1, 0x00
4291  loopMulScalar16:
4292  movaps xmm0, [edx+eax]
4293  mulps xmm0, xmm1
4294  movaps [ecx+eax], xmm0
4295  add eax, 16
4296  jl loopMulScalar16
4297  doneMulScalar16:
4298  }
4299 }
4300 
4301 /*
4302 ============
4303 idSIMD_SSE::AddAssign16
4304 ============
4305 */
4306 void VPCALL idSIMD_SSE::AddAssign16( float *dst, const float *src, const int count ) {
4307  __asm {
4308  mov ecx, dst
4309  mov edx, src
4310  mov eax, count
4311  add eax, 3
4312  shr eax, 2
4313  jz doneAddAssign16
4314  shl eax, 4
4315  add ecx, eax
4316  add edx, eax
4317  neg eax
4318  loopAddAssign16:
4319  movaps xmm0, [ecx+eax]
4320  addps xmm0, [edx+eax]
4321  movaps [ecx+eax], xmm0
4322  add eax, 16
4323  jl loopAddAssign16
4324  doneAddAssign16:
4325  }
4326 }
4327 
4328 /*
4329 ============
4330 idSIMD_SSE::SubAssign16
4331 ============
4332 */
4333 void VPCALL idSIMD_SSE::SubAssign16( float *dst, const float *src, const int count ) {
4334  __asm {
4335  mov ecx, dst
4336  mov edx, src
4337  mov eax, count
4338  add eax, 3
4339  shr eax, 2
4340  jz doneSubAssign16
4341  shl eax, 4
4342  add ecx, eax
4343  add edx, eax
4344  neg eax
4345  loopSubAssign16:
4346  movaps xmm0, [ecx+eax]
4347  subps xmm0, [edx+eax]
4348  movaps [ecx+eax], xmm0
4349  add eax, 16
4350  jl loopSubAssign16
4351  doneSubAssign16:
4352  }
4353 }
4354 
4355 /*
4356 ============
4357 idSIMD_SSE::MulAssign16
4358 ============
4359 */
4360 void VPCALL idSIMD_SSE::MulAssign16( float *dst, const float constant, const int count ) {
4361  __asm {
4362  mov ecx, dst
4363  mov eax, count
4364  add eax, 3
4365  shr eax, 2
4366  jz doneMulAssign16
4367  movss xmm1, constant
4368  shl eax, 4
4369  add ecx, eax
4370  neg eax
4371  shufps xmm1, xmm1, 0x00
4372  loopMulAssign16:
4373  movaps xmm0, [ecx+eax]
4374  mulps xmm0, xmm1
4375  movaps [ecx+eax], xmm0
4376  add eax, 16
4377  jl loopMulAssign16
4378  doneMulAssign16:
4379  }
4380 }
4381 
4382 /*
4383 ============
4384 idSIMD_SSE::MatX_MultiplyVecX
4385 
4386  optimizes the following matrix multiplications:
4387 
4388  NxN * Nx1
4389  Nx6 * 6x1
4390  6xN * Nx1
4391 
4392  with N in the range [1-6]
4393 ============
4394 */
4395 void VPCALL idSIMD_SSE::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
4396 #define STORE1( offset, reg1, reg2 ) \
4397  __asm movss [eax+offset], reg1
4398 #define STORE2LO( offset, reg1, reg2 ) \
4399  __asm movlps [eax+offset], reg1
4400 #define STORE2HI( offset, reg1, reg2 ) \
4401  __asm movhps [eax+offset], reg1
4402 #define STORE4( offset, reg1, reg2 ) \
4403  __asm movlps [eax+offset], reg1 \
4404  __asm movhps [eax+offset+8], reg1
4405 #define STOREC =
4406 
4407  int numRows;
4408  const float *mPtr, *vPtr;
4409  float *dstPtr;
4410 
4411  assert( vec.GetSize() >= mat.GetNumColumns() );
4412  assert( dst.GetSize() >= mat.GetNumRows() );
4413 
4414  mPtr = mat.ToFloatPtr();
4415  vPtr = vec.ToFloatPtr();
4416  dstPtr = dst.ToFloatPtr();
4417  numRows = mat.GetNumRows();
4418  switch( mat.GetNumColumns() ) {
4419  case 1: {
4420  switch( numRows ) {
4421  case 1: { // 1x1 * 1x1
4422  __asm {
4423  mov esi, vPtr
4424  mov edi, mPtr
4425  mov eax, dstPtr
4426  movss xmm0, [esi]
4427  mulss xmm0, [edi]
4428  STORE1( 0, xmm0, xmm1 )
4429  }
4430  return;
4431  }
4432  case 6: { // 6x1 * 1x1
4433  __asm {
4434  mov esi, vPtr
4435  mov edi, mPtr
4436  mov eax, dstPtr
4437  movss xmm0, [esi]
4438  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
4439  movaps xmm1, xmm0
4440  mulps xmm0, [edi]
4441  mulps xmm1, [edi+16]
4442  STORE4( 0, xmm0, xmm2 )
4443  STORE2LO( 16, xmm1, xmm2 )
4444  }
4445  return;
4446  }
4447  default: {
4448  for ( int i = 0; i < numRows; i++ ) {
4449  dstPtr[i] STOREC mPtr[0] * vPtr[0];
4450  mPtr++;
4451  }
4452  return;
4453  }
4454  }
4455  break;
4456  }
4457  case 2: {
4458  switch( numRows ) {
4459  case 2: { // 2x2 * 2x1
4460  __asm {
4461  mov esi, vPtr
4462  mov edi, mPtr
4463  mov eax, dstPtr
4464  movss xmm0, [esi]
4465  movss xmm1, [esi+4]
4466  movss xmm2, [edi]
4467  mulss xmm2, xmm0
4468  movss xmm3, [edi+4]
4469  mulss xmm3, xmm1
4470  addss xmm2, xmm3
4471  STORE1( 0, xmm2, xmm4 )
4472  mulss xmm0, [edi+8]
4473  mulss xmm1, [edi+8+4]
4474  addss xmm0, xmm1
4475  STORE1( 4, xmm0, xmm4 )
4476  }
4477  return;
4478  }
4479  case 6: { // 6x2 * 2x1
4480  __asm {
4481  mov esi, vPtr
4482  mov edi, mPtr
4483  mov eax, dstPtr
4484  movlps xmm7, [esi]
4485  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
4486  movaps xmm0, [edi]
4487  mulps xmm0, xmm7
4488  movaps xmm1, [edi+16]
4489  mulps xmm1, xmm7
4490  movaps xmm2, xmm0
4491  shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
4492  shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
4493  movaps xmm3, [edi+32]
4494  addps xmm0, xmm2
4495  mulps xmm3, xmm7
4496  STORE4( 0, xmm0, xmm4 )
4497  shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
4498  movhlps xmm1, xmm3
4499  addps xmm3, xmm1
4500  STORE2LO( 16, xmm3, xmm4 )
4501  }
4502  return;
4503  }
4504  default: {
4505  for ( int i = 0; i < numRows; i++ ) {
4506  dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
4507  mPtr += 2;
4508  }
4509  return;
4510  }
4511  }
4512  break;
4513  }
4514  case 3: {
4515  switch( numRows ) {
4516  case 3: { // 3x3 * 3x1
4517  __asm {
4518  mov esi, vPtr
4519  mov edi, mPtr
4520  mov eax, dstPtr
4521  movss xmm0, [esi]
4522  movss xmm4, [edi]
4523  mulss xmm4, xmm0
4524  movss xmm1, [esi+4]
4525  movss xmm5, [edi+4]
4526  mulss xmm5, xmm1
4527  addss xmm4, xmm5
4528  movss xmm2, [esi+8]
4529  movss xmm6, [edi+8]
4530  mulss xmm6, xmm2
4531  addss xmm4, xmm6
4532  movss xmm3, [edi+12]
4533  mulss xmm3, xmm0
4534  STORE1( 0, xmm4, xmm7 );
4535  movss xmm5, [edi+12+4]
4536  mulss xmm5, xmm1
4537  addss xmm3, xmm5
4538  movss xmm6, [edi+12+8]
4539  mulss xmm6, xmm2
4540  addss xmm3, xmm6
4541  mulss xmm0, [edi+24]
4542  mulss xmm1, [edi+24+4]
4543  STORE1( 4, xmm3, xmm7 );
4544  addss xmm0, xmm1
4545  mulss xmm2, [edi+24+8]
4546  addss xmm0, xmm2
4547  STORE1( 8, xmm0, xmm7 );
4548  }
4549  return;
4550  }
4551  case 6: { // 6x3 * 3x1
4552  __asm {
4553  mov esi, vPtr
4554  mov edi, mPtr
4555  mov eax, dstPtr
4556  movss xmm5, [esi]
4557  shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
4558  movss xmm6, [esi+4]
4559  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
4560  movss xmm7, [esi+8]
4561  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
4562  movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3
4563  movlps xmm1, [edi+4*4]
4564  shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2
4565  movlps xmm2, [edi+6*4]
4566  movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9
4567  shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9
4568  mulps xmm0, xmm5
4569  movlps xmm3, [edi+10*4]
4570  shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11
4571  movaps xmm3, xmm1
4572  shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10
4573  mulps xmm1, xmm6
4574  shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11
4575  mulps xmm3, xmm7
4576  addps xmm0, xmm1
4577  addps xmm0, xmm3
4578  STORE4( 0, xmm0, xmm4 )
4579  movss xmm1, [edi+12*4]
4580  mulss xmm1, xmm5
4581  movss xmm2, [edi+13*4]
4582  mulss xmm2, xmm6
4583  movss xmm3, [edi+14*4]
4584  mulss xmm3, xmm7
4585  addss xmm1, xmm2
4586  addss xmm1, xmm3
4587  STORE1( 16, xmm1, xmm4 )
4588  mulss xmm5, [edi+15*4]
4589  mulss xmm6, [edi+16*4]
4590  mulss xmm7, [edi+17*4]
4591  addss xmm5, xmm6
4592  addss xmm5, xmm7
4593  STORE1( 20, xmm5, xmm4 )
4594  }
4595  return;
4596  }
4597  default: {
4598  for ( int i = 0; i < numRows; i++ ) {
4599  dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
4600  mPtr += 3;
4601  }
4602  return;
4603  }
4604  }
4605  break;
4606  }
4607  case 4: {
4608  switch( numRows ) {
4609  case 4: { // 4x4 * 4x1
4610  __asm {
4611  mov esi, vPtr
4612  mov edi, mPtr
4613  mov eax, dstPtr
4614  movlps xmm6, qword ptr [esi ]
4615  movlps xmm0, qword ptr [edi ]
4616  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
4617  movhps xmm0, qword ptr [edi+16]
4618  mulps xmm0, xmm6
4619  movlps xmm7, qword ptr [esi+ 8]
4620  movlps xmm2, qword ptr [edi+ 8]
4621  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
4622  movhps xmm2, qword ptr [edi+24]
4623  mulps xmm2, xmm7
4624  movlps xmm1, qword ptr [edi+32]
4625  movhps xmm1, qword ptr [edi+48]
4626  mulps xmm1, xmm6
4627  movlps xmm3, qword ptr [edi+40]
4628  addps xmm0, xmm2
4629  movhps xmm3, qword ptr [edi+56]
4630  mulps xmm3, xmm7
4631  movaps xmm4, xmm0
4632  addps xmm1, xmm3
4633  shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
4634  shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
4635  addps xmm0, xmm4
4636  STORE4( 0, xmm0, xmm2 )
4637  }
4638  return;
4639  }
4640  case 6: { // 6x4 * 4x1
4641  __asm {
4642  mov esi, vPtr
4643  mov edi, mPtr
4644  mov eax, dstPtr
4645  movlps xmm6, qword ptr [esi+ 0]
4646  movlps xmm0, qword ptr [edi+ 0]
4647  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
4648  movhps xmm0, qword ptr [edi+16]
4649  mulps xmm0, xmm6
4650  movlps xmm7, qword ptr [esi+ 8]
4651  movlps xmm2, qword ptr [edi+ 8]
4652  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
4653  movhps xmm2, qword ptr [edi+24]
4654  mulps xmm2, xmm7
4655  movlps xmm1, qword ptr [edi+32]
4656  movhps xmm1, qword ptr [edi+48]
4657  mulps xmm1, xmm6
4658  movlps xmm3, qword ptr [edi+40]
4659  addps xmm0, xmm2
4660  movhps xmm3, qword ptr [edi+56]
4661  mulps xmm3, xmm7
4662  movaps xmm4, xmm0
4663  addps xmm1, xmm3
4664  shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
4665  shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
4666  addps xmm0, xmm4
4667  movlps xmm1, qword ptr [edi+64]
4668  movhps xmm1, qword ptr [edi+80]
4669  STORE4( 0, xmm0, xmm4 )
4670  mulps xmm1, xmm6
4671  movlps xmm2, qword ptr [edi+72]
4672  movhps xmm2, qword ptr [edi+88]
4673  mulps xmm2, xmm7
4674  addps xmm1, xmm2
4675  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
4676  movhlps xmm3, xmm1
4677  addps xmm1, xmm3
4678  STORE2LO( 16, xmm1, xmm4 )
4679  }
4680  return;
4681  }
4682  default: {
4683  for ( int i = 0; i < numRows; i++ ) {
4684  dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
4685  mPtr += 4;
4686  }
4687  return;
4688  }
4689  }
4690  break;
4691  }
4692  case 5: {
4693  switch( numRows ) {
4694  case 5: { // 5x5 * 5x1
4695  __asm {
4696  mov esi, vPtr
4697  mov edi, mPtr
4698  mov eax, dstPtr
4699  movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X
4700  movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1
4701  movss xmm5, [edi+15*4] // xmm4 = 15, X, X, X
4702  movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11
4703  movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1
4704  shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15
4705  movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1
4706  movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11
4707  movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1
4708  shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16
4709  movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3
4710  movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13
4711  movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3
4712  shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17
4713  movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3
4714  movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13
4715  movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X
4716  movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9
4717  shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18
4718  movhps xmm5, [edi+14*4] // xmm6 = 18, 19, 14, 15
4719  shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19
4720  movss xmm7, [esi+0*4]
4721  shufps xmm7, xmm7, 0
4722  mulps xmm0, xmm7
4723  movss xmm5, [esi+1*4]
4724  shufps xmm5, xmm5, 0
4725  mulps xmm1, xmm5
4726  addps xmm0, xmm1
4727  movss xmm6, [esi+2*4]
4728  shufps xmm6, xmm6, 0
4729  mulps xmm2, xmm6
4730  addps xmm0, xmm2
4731  movss xmm1, [esi+3*4]
4732  shufps xmm1, xmm1, 0
4733  mulps xmm3, xmm1
4734  addps xmm0, xmm3
4735  movss xmm2, [esi+4*4]
4736  shufps xmm2, xmm2, 0
4737  mulps xmm4, xmm2
4738  addps xmm0, xmm4
4739  mulss xmm7, [edi+20*4]
4740  mulss xmm5, [edi+21*4]
4741  addps xmm7, xmm5
4742  mulss xmm6, [edi+22*4]
4743  addps xmm7, xmm6
4744  mulss xmm1, [edi+23*4]
4745  addps xmm7, xmm1
4746  mulss xmm2, [edi+24*4]
4747  addps xmm7, xmm2
4748  STORE4( 0, xmm0, xmm3 )
4749  STORE1( 16, xmm7, xmm4 )
4750  }
4751  return;
4752  }
4753  case 6: { // 6x5 * 5x1
4754  __asm {
4755  mov esi, vPtr
4756  mov edi, mPtr
4757  mov eax, dstPtr
4758  movlps xmm6, [esi]
4759  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
4760  movlps xmm7, [esi+8]
4761  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
4762  movlps xmm0, [edi]
4763  movhps xmm3, [edi+8]
4764  movaps xmm1, [edi+16]
4765  movlps xmm2, [edi+32]
4766  shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6
4767  shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9
4768  shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8
4769  mulps xmm0, xmm6
4770  mulps xmm3, xmm7
4771  movlps xmm2, [edi+40]
4772  addps xmm0, xmm3 // xmm0 + xmm1
4773  movhps xmm5, [edi+40+8]
4774  movlps xmm3, [edi+40+16]
4775  movhps xmm3, [edi+40+24]
4776  movlps xmm4, [edi+40+32]
4777  shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16
4778  shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19
4779  shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18
4780  mulps xmm2, xmm6
4781  mulps xmm5, xmm7
4782  addps xmm2, xmm5 // xmm2 + xmm3
4783  movss xmm5, [esi+16]
4784  shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
4785  movaps xmm4, xmm0
4786  shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
4787  shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
4788  shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
4789  addps xmm0, xmm4
4790  mulps xmm1, xmm5
4791  addps xmm0, xmm1
4792  STORE4( 0, xmm0, xmm2 )
4793  movlps xmm4, [edi+80]
4794  movhps xmm3, [edi+80+8]
4795  movaps xmm1, [edi+80+16]
4796  movlps xmm2, [edi+80+32]
4797  shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26
4798  shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29
4799  shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28
4800  mulps xmm4, xmm6
4801  mulps xmm3, xmm7
4802  mulps xmm1, xmm5
4803  addps xmm4, xmm3 // xmm4 + xmm1
4804  shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
4805  shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
4806  addps xmm4, xmm1
4807  shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
4808  addps xmm4, xmm1
4809  STORE2LO( 16, xmm4, xmm2 )
4810  }
4811  return;
4812  }
4813  default: {
4814  for ( int i = 0; i < numRows; i++ ) {
4815  dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
4816  mPtr += 5;
4817  }
4818  return;
4819  }
4820  }
4821  break;
4822  }
4823  case 6: {
4824  switch( numRows ) {
4825  case 1: { // 1x6 * 6x1
4826  __asm {
4827  mov esi, vPtr
4828  mov edi, mPtr
4829  mov eax, dstPtr
4830  movss xmm0, [esi]
4831  mulss xmm0, [edi]
4832  movss xmm1, [esi+4]
4833  mulss xmm1, [edi+4]
4834  movss xmm2, [esi+8]
4835  addss xmm0, xmm1
4836  mulss xmm2, [edi+8]
4837  movss xmm3, [esi+12]
4838  addss xmm0, xmm2
4839  mulss xmm3, [edi+12]
4840  movss xmm4, [esi+16]
4841  addss xmm0, xmm3
4842  mulss xmm4, [edi+16]
4843  movss xmm5, [esi+20]
4844  addss xmm0, xmm4
4845  mulss xmm5, [edi+20]
4846  movss xmm6, [esi+24]
4847  addss xmm0, xmm5
4848  mulss xmm6, [edi+24]
4849  addss xmm0, xmm6
4850  STORE1( 0, xmm0, xmm7 )
4851  }
4852  return;
4853  }
4854  case 2: { // 2x6 * 6x1
4855  __asm {
4856  mov esi, vPtr
4857  mov edi, mPtr
4858  mov eax, dstPtr
4859  // load idVecX
4860  movlps xmm4, [esi]
4861  movhps xmm4, [esi+8]
4862  movlps xmm5, [esi+16]
4863  movlhps xmm5, xmm4
4864  movhlps xmm6, xmm4
4865  movlhps xmm6, xmm5
4866  // row 0 and 1
4867  movaps xmm0, [edi]
4868  movaps xmm1, [edi+16]
4869  movaps xmm2, [edi+32]
4870  mulps xmm0, xmm4
4871  mulps xmm1, xmm5
4872  mulps xmm2, xmm6
4873  movhlps xmm3, xmm0
4874  movlhps xmm3, xmm2
4875  addps xmm1, xmm3
4876  shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
4877  addps xmm1, xmm0
4878  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
4879  movhlps xmm0, xmm1
4880  addps xmm0, xmm1
4881  STORE2LO( 0, xmm0, xmm3 )
4882  }
4883  return;
4884  }
4885  case 3: { // 3x6 * 6x1
4886  __asm {
4887  mov esi, vPtr
4888  mov edi, mPtr
4889  mov eax, dstPtr
4890  // load idVecX
4891  movlps xmm4, [esi]
4892  movhps xmm4, [esi+8]
4893  movlps xmm5, [esi+16]
4894  movlhps xmm5, xmm4
4895  movhlps xmm6, xmm4
4896  movlhps xmm6, xmm5
4897  // row 0 and 1
4898  movaps xmm0, [edi]
4899  movaps xmm1, [edi+16]
4900  movaps xmm2, [edi+32]
4901  mulps xmm0, xmm4
4902  mulps xmm1, xmm5
4903  mulps xmm2, xmm6
4904  movhlps xmm3, xmm0
4905  movlhps xmm3, xmm2
4906  addps xmm1, xmm3
4907  shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
4908  addps xmm1, xmm0
4909  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
4910  movhlps xmm0, xmm1
4911  addps xmm0, xmm1
4912  STORE2LO( 0, xmm0, xmm3 )
4913  // row 2
4914  movaps xmm0, [edi+48]
4915  movaps xmm1, [edi+48+16]
4916  mulps xmm0, xmm4
4917  mulps xmm1, xmm5
4918  addps xmm0, xmm1
4919  movhlps xmm1, xmm0
4920  addps xmm0, xmm1
4921  movaps xmm1, xmm0
4922  shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
4923  addss xmm0, xmm1
4924  STORE1( 8, xmm0, xmm3 )
4925  }
4926  return;
4927  }
4928  case 4: { // 4x6 * 6x1
4929  __asm {
4930  mov esi, vPtr
4931  mov edi, mPtr
4932  mov eax, dstPtr
4933  // load idVecX
4934  movlps xmm4, [esi]
4935  movhps xmm4, [esi+8]
4936  movlps xmm5, [esi+16]
4937  movlhps xmm5, xmm4
4938  movhlps xmm6, xmm4
4939  movlhps xmm6, xmm5
4940  // row 0 and 1
4941  movaps xmm0, [edi]
4942  movaps xmm1, [edi+16]
4943  movaps xmm2, [edi+32]
4944  mulps xmm0, xmm4
4945  mulps xmm1, xmm5
4946  mulps xmm2, xmm6
4947  movhlps xmm7, xmm0
4948  movlhps xmm7, xmm2
4949  addps xmm7, xmm1
4950  shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
4951  addps xmm7, xmm0
4952  // row 2 and 3
4953  movaps xmm0, [edi+48]
4954  movaps xmm1, [edi+48+16]
4955  movaps xmm2, [edi+48+32]
4956  mulps xmm0, xmm4
4957  mulps xmm1, xmm5
4958  mulps xmm2, xmm6
4959  movhlps xmm3, xmm0
4960  movlhps xmm3, xmm2
4961  addps xmm1, xmm3
4962  shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
4963  addps xmm1, xmm0
4964  // last 4 additions for the first 4 rows and store result
4965  movaps xmm0, xmm7
4966  shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
4967  shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
4968  addps xmm0, xmm7
4969  STORE4( 0, xmm0, xmm4 )
4970  }
4971  return;
4972  }
4973  case 5: { // 5x6 * 6x1
4974  __asm {
4975  mov esi, vPtr
4976  mov edi, mPtr
4977  mov eax, dstPtr
4978  // load idVecX
4979  movlps xmm4, [esi]
4980  movhps xmm4, [esi+8]
4981  movlps xmm5, [esi+16]
4982  movlhps xmm5, xmm4
4983  movhlps xmm6, xmm4
4984  movlhps xmm6, xmm5
4985  // row 0 and 1
4986  movaps xmm0, [edi]
4987  movaps xmm1, [edi+16]
4988  movaps xmm2, [edi+32]
4989  mulps xmm0, xmm4
4990  mulps xmm1, xmm5
4991  mulps xmm2, xmm6
4992  movhlps xmm7, xmm0
4993  movlhps xmm7, xmm2
4994  addps xmm7, xmm1
4995  shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
4996  addps xmm7, xmm0
4997  // row 2 and 3
4998  movaps xmm0, [edi+48]
4999  movaps xmm1, [edi+48+16]
5000  movaps xmm2, [edi+48+32]
5001  mulps xmm0, xmm4
5002  mulps xmm1, xmm5
5003  mulps xmm2, xmm6
5004  movhlps xmm3, xmm0
5005  movlhps xmm3, xmm2
5006  addps xmm1, xmm3
5007  shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
5008  addps xmm1, xmm0
5009  // last 4 additions for the first 4 rows and store result
5010  movaps xmm0, xmm7
5011  shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
5012  shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
5013  addps xmm0, xmm7
5014  STORE4( 0, xmm0, xmm3 )
5015  // row 5
5016  movaps xmm0, [edi+96]
5017  movaps xmm1, [edi+96+16]
5018  mulps xmm0, xmm4
5019  mulps xmm1, xmm5
5020  addps xmm0, xmm1
5021  movhlps xmm1, xmm0
5022  addps xmm0, xmm1
5023  movaps xmm1, xmm0
5024  shufps xmm1, xmm1, 0x01
5025  addss xmm0, xmm1
5026  STORE1( 16, xmm0, xmm3 )
5027  }
5028  return;
5029  }
5030  case 6: { // 6x6 * 6x1
5031  __asm {
5032  mov esi, vPtr
5033  mov edi, mPtr
5034  mov eax, dstPtr
5035  movlps xmm7, qword ptr [esi]
5036  movlps xmm6, qword ptr [esi+8]
5037  shufps xmm7, xmm7, 0x44
5038  shufps xmm6, xmm6, 0x44
5039  movlps xmm0, qword ptr [edi ]
5040  movhps xmm0, qword ptr [edi+ 24]
5041  mulps xmm0, xmm7
5042  movlps xmm3, qword ptr [edi+ 8]
5043  movhps xmm3, qword ptr [edi+ 32]
5044  mulps xmm3, xmm6
5045  movlps xmm1, qword ptr [edi+ 48]
5046  movhps xmm1, qword ptr [edi+ 72]
5047  mulps xmm1, xmm7
5048  movlps xmm2, qword ptr [edi+ 96]
5049  movhps xmm2, qword ptr [edi+120]
5050  mulps xmm2, xmm7
5051  movlps xmm4, qword ptr [edi+ 56]
5052  movhps xmm4, qword ptr [edi+ 80]
5053  movlps xmm5, qword ptr [edi+104]
5054  movhps xmm5, qword ptr [edi+128]
5055  mulps xmm4, xmm6
5056  movlps xmm7, qword ptr [esi+16]
5057  addps xmm0, xmm3
5058  shufps xmm7, xmm7, 0x44
5059  mulps xmm5, xmm6
5060  addps xmm1, xmm4
5061  movlps xmm3, qword ptr [edi+ 16]
5062  movhps xmm3, qword ptr [edi+ 40]
5063  addps xmm2, xmm5
5064  movlps xmm4, qword ptr [edi+ 64]
5065  movhps xmm4, qword ptr [edi+ 88]
5066  mulps xmm3, xmm7
5067  movlps xmm5, qword ptr [edi+112]
5068  movhps xmm5, qword ptr [edi+136]
5069  addps xmm0, xmm3
5070  mulps xmm4, xmm7
5071  mulps xmm5, xmm7
5072  addps xmm1, xmm4
5073  addps xmm2, xmm5
5074  movaps xmm6, xmm0
5075  shufps xmm0, xmm1, 0x88
5076  shufps xmm6, xmm1, 0xDD
5077  movaps xmm7, xmm2
5078  shufps xmm7, xmm2, 0x88
5079  shufps xmm2, xmm2, 0xDD
5080  addps xmm0, xmm6
5081  addps xmm2, xmm7
5082  STORE4( 0, xmm0, xmm3 )
5083  STORE2LO( 16, xmm2, xmm4 )
5084  }
5085  return;
5086  }
5087  default: {
5088  for ( int i = 0; i < numRows; i++ ) {
5089  dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
5090  mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
5091  mPtr += 6;
5092  }
5093  return;
5094  }
5095  }
5096  break;
5097  }
5098  default: {
5099  int numColumns = mat.GetNumColumns();
5100  for ( int i = 0; i < numRows; i++ ) {
5101  float sum = mPtr[0] * vPtr[0];
5102  for ( int j = 1; j < numColumns; j++ ) {
5103  sum += mPtr[j] * vPtr[j];
5104  }
5105  dstPtr[i] STOREC sum;
5106  mPtr += numColumns;
5107  }
5108  break;
5109  }
5110  }
5111 
5112 #undef STOREC
5113 #undef STORE4
5114 #undef STORE2HI
5115 #undef STORE2LO
5116 #undef STORE1
5117 }
5118 
5119 /*
5120 ============
5121 idSIMD_SSE::MatX_MultiplyAddVecX
5122 
5123  optimizes the following matrix multiplications:
5124 
5125  NxN * Nx1
5126  Nx6 * 6x1
5127  6xN * Nx1
5128 
5129  with N in the range [1-6]
5130 ============
5131 */
void VPCALL idSIMD_SSE::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	// dst += mat * vec.  The STORE* macros below are what make this the
	// "MultiplyAdd" variant: each one loads the current destination value(s),
	// adds the freshly computed product and writes the sum back (reg2 is a
	// scratch register the caller guarantees is free at that point).
	// NOTE(review): the movaps loads below require the idMatX/idVecX float
	// data to be 16-byte aligned — presumably guaranteed by the idMatX/idVecX
	// allocators; verify if the allocation strategy ever changes.

// store a single float: dst[offset/4] += low scalar of reg1
#define STORE1( offset, reg1, reg2 ) \
	__asm movss		reg2, [eax+offset] \
	__asm addss		reg2, reg1 \
	__asm movss		[eax+offset], reg2
// store two floats from the low half of reg1
#define STORE2LO( offset, reg1, reg2 ) \
	__asm movlps	reg2, [eax+offset] \
	__asm addps		reg2, reg1 \
	__asm movlps	[eax+offset], reg2
// store two floats from the high half of reg1
#define STORE2HI( offset, reg1, reg2 ) \
	__asm movhps	reg2, [eax+offset] \
	__asm addps		reg2, reg1 \
	__asm movhps	[eax+offset], reg2
// store four floats (unaligned-safe: two 8-byte moves instead of movups)
#define STORE4( offset, reg1, reg2 ) \
	__asm movlps	reg2, [eax+offset] \
	__asm movhps	reg2, [eax+offset+8] \
	__asm addps		reg2, reg1 \
	__asm movlps	[eax+offset], reg2 \
	__asm movhps	[eax+offset+8], reg2
// C fallback path accumulates into the destination as well
#define STOREC		+=

	int numRows;
	const float *mPtr, *vPtr;
	float *dstPtr;

	assert( vec.GetSize() >= mat.GetNumColumns() );
	assert( dst.GetSize() >= mat.GetNumRows() );

	mPtr = mat.ToFloatPtr();
	vPtr = vec.ToFloatPtr();
	dstPtr = dst.ToFloatPtr();
	numRows = mat.GetNumRows();
	// dispatch on column count; each case has hand-scheduled SSE for the
	// common row counts (NxN and 6xN) and a scalar fallback otherwise
	switch( mat.GetNumColumns() ) {
		case 1: {
			switch( numRows ) {
				case 1: {		// 1x1 * 1x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm0, [esi]
						mulss		xmm0, [edi]
						STORE1( 0, xmm0, xmm1 )
					}
					return;
				}
				case 6: {		// 6x1 * 1x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// broadcast the single vector element to all 4 lanes
						movss		xmm0, [esi]
						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
						movaps		xmm1, xmm0
						mulps		xmm0, [edi]
						mulps		xmm1, [edi+16]
						STORE4( 0, xmm0, xmm2 )
						STORE2LO( 16, xmm1, xmm2 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0];
						mPtr++;
					}
					return;
				}
			}
			break;
		}
		case 2: {
			switch( numRows ) {
				case 2: {		// 2x2 * 2x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm0, [esi]
						movss		xmm1, [esi+4]
						// row 0
						movss		xmm2, [edi]
						mulss		xmm2, xmm0
						movss		xmm3, [edi+4]
						mulss		xmm3, xmm1
						addss		xmm2, xmm3
						STORE1( 0, xmm2, xmm4 )
						// row 1
						mulss		xmm0, [edi+8]
						mulss		xmm1, [edi+8+4]
						addss		xmm0, xmm1
						STORE1( 4, xmm0, xmm4 )
					}
					return;
				}
				case 6: {		// 6x2 * 2x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// xmm7 = v0, v1, v0, v1 — two matrix rows per multiply
						movlps		xmm7, [esi]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
						movaps		xmm0, [edi]
						mulps		xmm0, xmm7
						movaps		xmm1, [edi+16]
						mulps		xmm1, xmm7
						// horizontal add of row pairs via even/odd shuffles
						movaps		xmm2, xmm0
						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						movaps		xmm3, [edi+32]
						addps		xmm0, xmm2
						mulps		xmm3, xmm7
						STORE4( 0, xmm0, xmm4 )
						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
						movhlps		xmm1, xmm3
						addps		xmm3, xmm1
						STORE2LO( 16, xmm3, xmm4 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
						mPtr += 2;
					}
					return;
				}
			}
			break;
		}
		case 3: {
			switch( numRows ) {
				case 3: {		// 3x3 * 3x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// scalar dot products, one per row, interleaved to
						// hide latency
						movss		xmm0, [esi]
						movss		xmm4, [edi]
						mulss		xmm4, xmm0
						movss		xmm1, [esi+4]
						movss		xmm5, [edi+4]
						mulss		xmm5, xmm1
						addss		xmm4, xmm5
						movss		xmm2, [esi+8]
						movss		xmm6, [edi+8]
						mulss		xmm6, xmm2
						addss		xmm4, xmm6
						movss		xmm3, [edi+12]
						mulss		xmm3, xmm0
						STORE1( 0, xmm4, xmm7 );
						movss		xmm5, [edi+12+4]
						mulss		xmm5, xmm1
						addss		xmm3, xmm5
						movss		xmm6, [edi+12+8]
						mulss		xmm6, xmm2
						addss		xmm3, xmm6
						mulss		xmm0, [edi+24]
						mulss		xmm1, [edi+24+4]
						STORE1( 4, xmm3, xmm7 );
						addss		xmm0, xmm1
						mulss		xmm2, [edi+24+8]
						addss		xmm0, xmm2
						STORE1( 8, xmm0, xmm7 );
					}
					return;
				}
				case 6: {		// 6x3 * 3x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// broadcast the three vector elements
						movss		xmm5, [esi]
						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
						movss		xmm6, [esi+4]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
						movss		xmm7, [esi+8]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
						// gather the first columns of rows 0-3 (matrix is
						// dense row-major, 3 floats per row)
						movaps		xmm0, [edi]								// xmm0 = 0, 1, 2, 3
						movlps		xmm1, [edi+4*4]
						shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm1 = 4, 5, 1, 2
						movlps		xmm2, [edi+6*4]
						movhps		xmm2, [edi+8*4]							// xmm2 = 6, 7, 8, 9
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 )	// xmm0 = 0, 3, 6, 9
						mulps		xmm0, xmm5
						movlps		xmm3, [edi+10*4]
						shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 )	// xmm2 = 7, 8, 10, 11
						movaps		xmm3, xmm1
						shufps		xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 )	// xmm1 = 1, 4, 7, 10
						mulps		xmm1, xmm6
						shufps		xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 )	// xmm3 = 2, 5, 8, 11
						mulps		xmm3, xmm7
						addps		xmm0, xmm1
						addps		xmm0, xmm3
						STORE4( 0, xmm0, xmm4 )
						// rows 4 and 5 done with scalar math
						movss		xmm1, [edi+12*4]
						mulss		xmm1, xmm5
						movss		xmm2, [edi+13*4]
						mulss		xmm2, xmm6
						movss		xmm3, [edi+14*4]
						mulss		xmm3, xmm7
						addss		xmm1, xmm2
						addss		xmm1, xmm3
						STORE1( 16, xmm1, xmm4 )
						mulss		xmm5, [edi+15*4]
						mulss		xmm6, [edi+16*4]
						mulss		xmm7, [edi+17*4]
						addss		xmm5, xmm6
						addss		xmm5, xmm7
						STORE1( 20, xmm5, xmm4 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
						mPtr += 3;
					}
					return;
				}
			}
			break;
		}
		case 4: {
			switch( numRows ) {
				case 4: {		// 4x4 * 4x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// xmm6 = v0, v1, v0, v1 / xmm7 = v2, v3, v2, v3
						movlps		xmm6, qword ptr [esi ]
						movlps		xmm0, qword ptr [edi ]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
						movhps		xmm0, qword ptr [edi+16]
						mulps		xmm0, xmm6
						movlps		xmm7, qword ptr [esi+ 8]
						movlps		xmm2, qword ptr [edi+ 8]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
						movhps		xmm2, qword ptr [edi+24]
						mulps		xmm2, xmm7
						movlps		xmm1, qword ptr [edi+32]
						movhps		xmm1, qword ptr [edi+48]
						mulps		xmm1, xmm6
						movlps		xmm3, qword ptr [edi+40]
						addps		xmm0, xmm2
						movhps		xmm3, qword ptr [edi+56]
						mulps		xmm3, xmm7
						movaps		xmm4, xmm0
						addps		xmm1, xmm3
						// horizontal add across the four rows
						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						addps		xmm0, xmm4
						STORE4( 0, xmm0, xmm2 )
					}
					return;
				}
				case 6: {		// 6x4 * 4x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// rows 0-3: same scheme as the 4x4 case
						movlps		xmm6, qword ptr [esi+ 0]
						movlps		xmm0, qword ptr [edi+ 0]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
						movhps		xmm0, qword ptr [edi+16]
						mulps		xmm0, xmm6
						movlps		xmm7, qword ptr [esi+ 8]
						movlps		xmm2, qword ptr [edi+ 8]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
						movhps		xmm2, qword ptr [edi+24]
						mulps		xmm2, xmm7
						movlps		xmm1, qword ptr [edi+32]
						movhps		xmm1, qword ptr [edi+48]
						mulps		xmm1, xmm6
						movlps		xmm3, qword ptr [edi+40]
						addps		xmm0, xmm2
						movhps		xmm3, qword ptr [edi+56]
						mulps		xmm3, xmm7
						movaps		xmm4, xmm0
						addps		xmm1, xmm3
						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						addps		xmm0, xmm4
						// rows 4 and 5
						movlps		xmm1, qword ptr [edi+64]
						movhps		xmm1, qword ptr [edi+80]
						STORE4( 0, xmm0, xmm4 )
						mulps		xmm1, xmm6
						movlps		xmm2, qword ptr [edi+72]
						movhps		xmm2, qword ptr [edi+88]
						mulps		xmm2, xmm7
						addps		xmm1, xmm2
						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
						movhlps		xmm3, xmm1
						addps		xmm1, xmm3
						STORE2LO( 16, xmm1, xmm4 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
						mPtr += 4;
					}
					return;
				}
			}
			break;
		}
		case 5: {
			switch( numRows ) {
				case 5: {		// 5x5 * 5x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// transpose the first 4x5 sub-matrix into columns so
						// the dot products become vertical mul/adds; numbers
						// in the comments are flat element indices
						movss		xmm0, [edi+5*4]							// xmm0 =  5,  X,  X,  X
						movhps		xmm0, [edi+0*4]							// xmm0 =  5,  X,  0,  1
						movss		xmm5, [edi+15*4]						// xmm5 = 15,  X,  X,  X
						movhps		xmm5, [edi+10*4]						// xmm5 = 15,  X, 10, 11
						movaps		xmm1, xmm0								// xmm1 =  5,  X,  0,  1
						shufps		xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 )	// xmm0 =  0,  5, 10, 15
						movlps		xmm1, [edi+6*4]							// xmm1 =  6,  7,  0,  1
						movlps		xmm5, [edi+16*4]						// xmm5 = 16, 17, 10, 11
						movaps		xmm2, xmm1								// xmm2 =  6,  7,  0,  1
						shufps		xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm1 =  1,  6, 11, 16
						movhps		xmm2, [edi+2*4]							// xmm2 =  6,  7,  2,  3
						movhps		xmm5, [edi+12*4]						// xmm5 = 16, 17, 12, 13
						movaps		xmm3, xmm2								// xmm3 =  6,  7,  2,  3
						shufps		xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 )	// xmm2 =  2,  7, 12, 17
						movlps		xmm3, [edi+8*4]							// xmm3 =  8,  9,  2,  3
						movlps		xmm5, [edi+18*4]						// xmm5 = 18, 19, 12, 13
						movss		xmm4, [edi+4*4]							// xmm4 =  4,  X,  X,  X
						movlhps		xmm4, xmm3								// xmm4 =  4,  X,  8,  9
						shufps		xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm3 =  3,  8, 13, 18
						movhps		xmm5, [edi+14*4]						// xmm5 = 18, 19, 14, 15
						shufps		xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 )	// xmm4 =  4,  9, 14, 19
						// multiply each column by the broadcast vector element
						movss		xmm7, [esi+0*4]
						shufps		xmm7, xmm7, 0
						mulps		xmm0, xmm7
						movss		xmm5, [esi+1*4]
						shufps		xmm5, xmm5, 0
						mulps		xmm1, xmm5
						addps		xmm0, xmm1
						movss		xmm6, [esi+2*4]
						shufps		xmm6, xmm6, 0
						mulps		xmm2, xmm6
						addps		xmm0, xmm2
						movss		xmm1, [esi+3*4]
						shufps		xmm1, xmm1, 0
						mulps		xmm3, xmm1
						addps		xmm0, xmm3
						movss		xmm2, [esi+4*4]
						shufps		xmm2, xmm2, 0
						mulps		xmm4, xmm2
						addps		xmm0, xmm4
						// last row scalar: reuses the broadcast registers
						mulss		xmm7, [edi+20*4]
						mulss		xmm5, [edi+21*4]
						addps		xmm7, xmm5
						mulss		xmm6, [edi+22*4]
						addps		xmm7, xmm6
						mulss		xmm1, [edi+23*4]
						addps		xmm7, xmm1
						mulss		xmm2, [edi+24*4]
						addps		xmm7, xmm2
						STORE4( 0, xmm0, xmm3 )
						STORE1( 16, xmm7, xmm4 )
					}
					return;
				}
				case 6: {		// 6x5 * 5x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// xmm6 = v0, v1, v0, v1 / xmm7 = v2, v3, v2, v3
						movlps		xmm6, [esi]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
						movlps		xmm7, [esi+8]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
						// rows 0 and 1 (comments give flat element indices)
						movlps		xmm0, [edi]
						movhps		xmm3, [edi+8]
						movaps		xmm1, [edi+16]
						movlps		xmm2, [edi+32]
						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm0 = 0, 1, 5, 6
						shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 4, 7, 8, 9
						shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 2, 3, 7, 8
						mulps		xmm0, xmm6
						mulps		xmm3, xmm7
						// rows 2 and 3
						movlps		xmm2, [edi+40]
						addps		xmm0, xmm3								// partial sums for rows 0/1
						movhps		xmm5, [edi+40+8]
						movlps		xmm3, [edi+40+16]
						movhps		xmm3, [edi+40+24]
						movlps		xmm4, [edi+40+32]
						shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm2 = 10, 11, 15, 16
						shufps		xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm3 = 14, 17, 18, 19
						shufps		xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm5 = 12, 13, 17, 18
						mulps		xmm2, xmm6
						mulps		xmm5, xmm7
						addps		xmm2, xmm5								// partial sums for rows 2/3
						// fold in the 5th column (v4) and store rows 0-3
						movss		xmm5, [esi+16]
						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
						movaps		xmm4, xmm0
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
						shufps		xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
						addps		xmm0, xmm4
						mulps		xmm1, xmm5
						addps		xmm0, xmm1
						STORE4( 0, xmm0, xmm2 )
						// rows 4 and 5
						movlps		xmm4, [edi+80]
						movhps		xmm3, [edi+80+8]
						movaps		xmm1, [edi+80+16]
						movlps		xmm2, [edi+80+32]
						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm4 = 20, 21, 25, 26
						shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 24, 27, 28, 29
						shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 22, 23, 27, 28
						mulps		xmm4, xmm6
						mulps		xmm3, xmm7
						mulps		xmm1, xmm5
						addps		xmm4, xmm3								// partial sums for rows 4/5
						shufps		xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
						shufps		xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
						addps		xmm4, xmm1
						shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
						addps		xmm4, xmm1
						STORE2LO( 16, xmm4, xmm2 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
						mPtr += 5;
					}
					return;
				}
			}
			break;
		}
		case 6: {
			switch( numRows ) {
				case 1: {		// 1x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// plain scalar 6-element dot product
						movss		xmm0, [esi]
						mulss		xmm0, [edi]
						movss		xmm1, [esi+4]
						mulss		xmm1, [edi+4]
						movss		xmm2, [esi+8]
						addss		xmm0, xmm1
						mulss		xmm2, [edi+8]
						movss		xmm3, [esi+12]
						addss		xmm0, xmm2
						mulss		xmm3, [edi+12]
						movss		xmm4, [esi+16]
						addss		xmm0, xmm3
						mulss		xmm4, [edi+16]
						movss		xmm5, [esi+20]
						addss		xmm0, xmm4
						mulss		xmm5, [edi+20]
						movss		xmm6, [esi+24]
						addss		xmm0, xmm5
						mulss		xmm6, [edi+24]
						addss		xmm0, xmm6
						STORE1( 0, xmm0, xmm7 )
					}
					return;
				}
				case 2: {		// 2x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// load idVecX: xmm4/xmm5/xmm6 hold the 6-vector in
						// three rotated 4-lane views so two rows (12 floats)
						// line up with three packed multiplies
						movlps		xmm4, [esi]
						movhps		xmm4, [esi+8]
						movlps		xmm5, [esi+16]
						movlhps		xmm5, xmm4
						movhlps		xmm6, xmm4
						movlhps		xmm6, xmm5
						// row 0 and 1
						movaps		xmm0, [edi]
						movaps		xmm1, [edi+16]
						movaps		xmm2, [edi+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm3, xmm0
						movlhps		xmm3, xmm2
						addps		xmm1, xmm3
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm1, xmm0
						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
						movhlps		xmm0, xmm1
						addps		xmm0, xmm1
						STORE2LO( 0, xmm0, xmm3 )
					}
					return;
				}
				case 3: {		// 3x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// load idVecX (see 2x6 case for the rotation scheme)
						movlps		xmm4, [esi]
						movhps		xmm4, [esi+8]
						movlps		xmm5, [esi+16]
						movlhps		xmm5, xmm4
						movhlps		xmm6, xmm4
						movlhps		xmm6, xmm5
						// row 0 and 1
						movaps		xmm0, [edi]
						movaps		xmm1, [edi+16]
						movaps		xmm2, [edi+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm3, xmm0
						movlhps		xmm3, xmm2
						addps		xmm1, xmm3
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm1, xmm0
						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
						movhlps		xmm0, xmm1
						addps		xmm0, xmm1
						STORE2LO( 0, xmm0, xmm3 )
						// row 2
						movaps		xmm0, [edi+48]
						movaps		xmm1, [edi+48+16]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						addps		xmm0, xmm1
						movhlps		xmm1, xmm0
						addps		xmm0, xmm1
						movaps		xmm1, xmm0
						shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
						addss		xmm0, xmm1
						STORE1( 8, xmm0, xmm3 )
					}
					return;
				}
				case 4: {		// 4x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// load idVecX (see 2x6 case for the rotation scheme)
						movlps		xmm4, [esi]
						movhps		xmm4, [esi+8]
						movlps		xmm5, [esi+16]
						movlhps		xmm5, xmm4
						movhlps		xmm6, xmm4
						movlhps		xmm6, xmm5
						// row 0 and 1
						movaps		xmm0, [edi]
						movaps		xmm1, [edi+16]
						movaps		xmm2, [edi+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm7, xmm0
						movlhps		xmm7, xmm2
						addps		xmm7, xmm1
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm7, xmm0
						// row 2 and 3
						movaps		xmm0, [edi+48]
						movaps		xmm1, [edi+48+16]
						movaps		xmm2, [edi+48+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm3, xmm0
						movlhps		xmm3, xmm2
						addps		xmm1, xmm3
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm1, xmm0
						// last 4 additions for the first 4 rows and store result
						movaps		xmm0, xmm7
						shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						addps		xmm0, xmm7
						STORE4( 0, xmm0, xmm4 )
					}
					return;
				}
				case 5: {		// 5x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// load idVecX (see 2x6 case for the rotation scheme)
						movlps		xmm4, [esi]
						movhps		xmm4, [esi+8]
						movlps		xmm5, [esi+16]
						movlhps		xmm5, xmm4
						movhlps		xmm6, xmm4
						movlhps		xmm6, xmm5
						// row 0 and 1
						movaps		xmm0, [edi]
						movaps		xmm1, [edi+16]
						movaps		xmm2, [edi+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm7, xmm0
						movlhps		xmm7, xmm2
						addps		xmm7, xmm1
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm7, xmm0
						// row 2 and 3
						movaps		xmm0, [edi+48]
						movaps		xmm1, [edi+48+16]
						movaps		xmm2, [edi+48+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm3, xmm0
						movlhps		xmm3, xmm2
						addps		xmm1, xmm3
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm1, xmm0
						// last 4 additions for the first 4 rows and store result
						movaps		xmm0, xmm7
						shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						addps		xmm0, xmm7
						STORE4( 0, xmm0, xmm3 )
						// row 4 (the fifth and last row)
						movaps		xmm0, [edi+96]
						movaps		xmm1, [edi+96+16]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						addps		xmm0, xmm1
						movhlps		xmm1, xmm0
						addps		xmm0, xmm1
						movaps		xmm1, xmm0
						shufps		xmm1, xmm1, 0x01
						addss		xmm0, xmm1
						STORE1( 16, xmm0, xmm3 )
					}
					return;
				}
				case 6: {		// 6x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// xmm7 = v0, v1, v0, v1 / xmm6 = v2, v3, v2, v3
						// (0x44 == R_SHUFFLEPS( 0, 1, 0, 1 ))
						movlps		xmm7, qword ptr [esi]
						movlps		xmm6, qword ptr [esi+8]
						shufps		xmm7, xmm7, 0x44
						shufps		xmm6, xmm6, 0x44
						// partial dot products for all six 24-byte rows,
						// two rows per register pair
						movlps		xmm0, qword ptr [edi    ]
						movhps		xmm0, qword ptr [edi+ 24]
						mulps		xmm0, xmm7
						movlps		xmm3, qword ptr [edi+  8]
						movhps		xmm3, qword ptr [edi+ 32]
						mulps		xmm3, xmm6
						movlps		xmm1, qword ptr [edi+ 48]
						movhps		xmm1, qword ptr [edi+ 72]
						mulps		xmm1, xmm7
						movlps		xmm2, qword ptr [edi+ 96]
						movhps		xmm2, qword ptr [edi+120]
						mulps		xmm2, xmm7
						movlps		xmm4, qword ptr [edi+ 56]
						movhps		xmm4, qword ptr [edi+ 80]
						movlps		xmm5, qword ptr [edi+104]
						movhps		xmm5, qword ptr [edi+128]
						mulps		xmm4, xmm6
						// xmm7 is re-used for v4, v5, v4, v5
						movlps		xmm7, qword ptr [esi+16]
						addps		xmm0, xmm3
						shufps		xmm7, xmm7, 0x44
						mulps		xmm5, xmm6
						addps		xmm1, xmm4
						movlps		xmm3, qword ptr [edi+ 16]
						movhps		xmm3, qword ptr [edi+ 40]
						addps		xmm2, xmm5
						movlps		xmm4, qword ptr [edi+ 64]
						movhps		xmm4, qword ptr [edi+ 88]
						mulps		xmm3, xmm7
						movlps		xmm5, qword ptr [edi+112]
						movhps		xmm5, qword ptr [edi+136]
						addps		xmm0, xmm3
						mulps		xmm4, xmm7
						mulps		xmm5, xmm7
						addps		xmm1, xmm4
						addps		xmm2, xmm5
						// horizontal adds (0x88 = even lanes, 0xDD = odd)
						movaps		xmm6, xmm0
						shufps		xmm0, xmm1, 0x88
						shufps		xmm6, xmm1, 0xDD
						movaps		xmm7, xmm2
						shufps		xmm7, xmm2, 0x88
						shufps		xmm2, xmm2, 0xDD
						addps		xmm0, xmm6
						addps		xmm2, xmm7
						STORE4( 0, xmm0, xmm3 )
						STORE2LO( 16, xmm2, xmm4 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
									mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
						mPtr += 6;
					}
					return;
				}
			}
			break;
		}
		default: {
			// generic fallback for arbitrary column counts
			int numColumns = mat.GetNumColumns();
			for ( int i = 0; i < numRows; i++ ) {
				float sum = mPtr[0] * vPtr[0];
				for ( int j = 1; j < numColumns; j++ ) {
					sum += mPtr[j] * vPtr[j];
				}
				dstPtr[i] STOREC sum;
				mPtr += numColumns;
			}
			break;
		}
	}

#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
5864 
5865 /*
5866 ============
5867 idSIMD_SSE::MatX_MultiplySubVecX
5868 
5869  optimizes the following matrix multiplications:
5870 
5871  NxN * Nx1
5872  Nx6 * 6x1
5873  6xN * Nx1
5874 
5875  with N in the range [1-6]
5876 ============
5877 */
void VPCALL idSIMD_SSE::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	// Computes dst -= mat * vec.
	// The STORE* macros implement the subtract part: they load the current
	// destination value(s) into the caller-chosen scratch register reg2,
	// subtract the computed product reg1 and write the result back to
	// [eax+offset].  All asm paths use esi = vec, edi = mat, eax = dst.
#define STORE1( offset, reg1, reg2 ) \
	__asm movss reg2, [eax+offset] \
	__asm subss reg2, reg1 \
	__asm movss [eax+offset], reg2
#define STORE2LO( offset, reg1, reg2 ) \
	__asm movlps reg2, [eax+offset] \
	__asm subps reg2, reg1 \
	__asm movlps [eax+offset], reg2
#define STORE2HI( offset, reg1, reg2 ) \
	__asm movhps reg2, [eax+offset] \
	__asm subps reg2, reg1 \
	__asm movhps [eax+offset], reg2
#define STORE4( offset, reg1, reg2 ) \
	__asm movlps reg2, [eax+offset] \
	__asm movhps reg2, [eax+offset+8] \
	__asm subps reg2, reg1 \
	__asm movlps [eax+offset], reg2 \
	__asm movhps [eax+offset+8], reg2
	// C fallback paths accumulate with "-=" to match the asm STORE* macros.
#define STOREC -=

	int numRows;
	const float *mPtr, *vPtr;
	float *dstPtr;

	assert( vec.GetSize() >= mat.GetNumColumns() );
	assert( dst.GetSize() >= mat.GetNumRows() );

	mPtr = mat.ToFloatPtr();
	vPtr = vec.ToFloatPtr();
	dstPtr = dst.ToFloatPtr();
	numRows = mat.GetNumRows();
	// Dispatch on the matrix dimensions; each (columns, rows) pair in
	// [1-6]x[1-6] that matters gets a hand-scheduled SSE path, everything
	// else falls through to the scalar loops.
	switch( mat.GetNumColumns() ) {
		case 1: {
			switch( numRows ) {
				case 1: {		// 1x1 * 1x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						movss xmm0, [esi]
						mulss xmm0, [edi]
						STORE1( 0, xmm0, xmm1 )
					}
					return;
				}
				case 6: {		// 6x1 * 1x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						movss xmm0, [esi]
						shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )	// broadcast v[0] to all four lanes
						movaps xmm1, xmm0
						mulps xmm0, [edi]
						mulps xmm1, [edi+16]
						STORE4( 0, xmm0, xmm2 )
						STORE2LO( 16, xmm1, xmm2 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0];
						mPtr++;
					}
					return;
				}
			}
			break;
		}
		case 2: {
			switch( numRows ) {
				case 2: {		// 2x2 * 2x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						movss xmm0, [esi]
						movss xmm1, [esi+4]
						movss xmm2, [edi]
						mulss xmm2, xmm0
						movss xmm3, [edi+4]
						mulss xmm3, xmm1
						addss xmm2, xmm3
						STORE1( 0, xmm2, xmm4 )
						mulss xmm0, [edi+8]
						mulss xmm1, [edi+8+4]
						addss xmm0, xmm1
						STORE1( 4, xmm0, xmm4 )
					}
					return;
				}
				case 6: {		// 6x2 * 2x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						movlps xmm7, [esi]
						shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )	// xmm7 = v0, v1, v0, v1
						movaps xmm0, [edi]
						mulps xmm0, xmm7
						movaps xmm1, [edi+16]
						mulps xmm1, xmm7
						movaps xmm2, xmm0
						shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						movaps xmm3, [edi+32]
						addps xmm0, xmm2
						mulps xmm3, xmm7
						STORE4( 0, xmm0, xmm4 )
						shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
						movhlps xmm1, xmm3
						addps xmm3, xmm1
						STORE2LO( 16, xmm3, xmm4 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
						mPtr += 2;
					}
					return;
				}
			}
			break;
		}
		case 3: {
			switch( numRows ) {
				case 3: {		// 3x3 * 3x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						movss xmm0, [esi]
						movss xmm4, [edi]
						mulss xmm4, xmm0
						movss xmm1, [esi+4]
						movss xmm5, [edi+4]
						mulss xmm5, xmm1
						addss xmm4, xmm5
						movss xmm2, [esi+8]
						movss xmm6, [edi+8]
						mulss xmm6, xmm2
						addss xmm4, xmm6
						movss xmm3, [edi+12]
						mulss xmm3, xmm0
						STORE1( 0, xmm4, xmm7 );
						movss xmm5, [edi+12+4]
						mulss xmm5, xmm1
						addss xmm3, xmm5
						movss xmm6, [edi+12+8]
						mulss xmm6, xmm2
						addss xmm3, xmm6
						mulss xmm0, [edi+24]
						mulss xmm1, [edi+24+4]
						STORE1( 4, xmm3, xmm7 );
						addss xmm0, xmm1
						mulss xmm2, [edi+24+8]
						addss xmm0, xmm2
						STORE1( 8, xmm0, xmm7 );
					}
					return;
				}
				case 6: {		// 6x3 * 3x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						// broadcast the three vector components
						movss xmm5, [esi]
						shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
						movss xmm6, [esi+4]
						shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
						movss xmm7, [esi+8]
						shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
						// transpose the first four 3-element rows into column vectors
						movaps xmm0, [edi]								// xmm0 = 0, 1, 2, 3
						movlps xmm1, [edi+4*4]
						shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm1 = 4, 5, 1, 2
						movlps xmm2, [edi+6*4]
						movhps xmm2, [edi+8*4]							// xmm2 = 6, 7, 8, 9
						shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 )	// xmm0 = 0, 3, 6, 9
						mulps xmm0, xmm5
						movlps xmm3, [edi+10*4]
						shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 )	// xmm2 = 7, 8, 10, 11
						movaps xmm3, xmm1
						shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 )	// xmm1 = 1, 4, 7, 10
						mulps xmm1, xmm6
						shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 )	// xmm3 = 2, 5, 8, 11
						mulps xmm3, xmm7
						addps xmm0, xmm1
						addps xmm0, xmm3
						STORE4( 0, xmm0, xmm4 )
						// rows 4 and 5 done with scalar math
						movss xmm1, [edi+12*4]
						mulss xmm1, xmm5
						movss xmm2, [edi+13*4]
						mulss xmm2, xmm6
						movss xmm3, [edi+14*4]
						mulss xmm3, xmm7
						addss xmm1, xmm2
						addss xmm1, xmm3
						STORE1( 16, xmm1, xmm4 )
						mulss xmm5, [edi+15*4]
						mulss xmm6, [edi+16*4]
						mulss xmm7, [edi+17*4]
						addss xmm5, xmm6
						addss xmm5, xmm7
						STORE1( 20, xmm5, xmm4 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
						mPtr += 3;
					}
					return;
				}
			}
			break;
		}
		case 4: {
			switch( numRows ) {
				case 4: {		// 4x4 * 4x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						movlps xmm6, qword ptr [esi]
						movlps xmm0, qword ptr [edi]
						shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )	// xmm6 = v0, v1, v0, v1
						movhps xmm0, qword ptr [edi+16]
						mulps xmm0, xmm6
						movlps xmm7, qword ptr [esi+ 8]
						movlps xmm2, qword ptr [edi+ 8]
						shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )	// xmm7 = v2, v3, v2, v3
						movhps xmm2, qword ptr [edi+24]
						mulps xmm2, xmm7
						movlps xmm1, qword ptr [edi+32]
						movhps xmm1, qword ptr [edi+48]
						mulps xmm1, xmm6
						movlps xmm3, qword ptr [edi+40]
						addps xmm0, xmm2
						movhps xmm3, qword ptr [edi+56]
						mulps xmm3, xmm7
						movaps xmm4, xmm0
						addps xmm1, xmm3
						shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						addps xmm0, xmm4
						STORE4( 0, xmm0, xmm2 )
					}
					return;
				}
				case 6: {		// 6x4 * 4x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						movlps xmm6, qword ptr [esi+ 0]
						movlps xmm0, qword ptr [edi+ 0]
						shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )	// xmm6 = v0, v1, v0, v1
						movhps xmm0, qword ptr [edi+16]
						mulps xmm0, xmm6
						movlps xmm7, qword ptr [esi+ 8]
						movlps xmm2, qword ptr [edi+ 8]
						shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )	// xmm7 = v2, v3, v2, v3
						movhps xmm2, qword ptr [edi+24]
						mulps xmm2, xmm7
						movlps xmm1, qword ptr [edi+32]
						movhps xmm1, qword ptr [edi+48]
						mulps xmm1, xmm6
						movlps xmm3, qword ptr [edi+40]
						addps xmm0, xmm2
						movhps xmm3, qword ptr [edi+56]
						mulps xmm3, xmm7
						movaps xmm4, xmm0
						addps xmm1, xmm3
						shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						addps xmm0, xmm4
						movlps xmm1, qword ptr [edi+64]
						movhps xmm1, qword ptr [edi+80]
						STORE4( 0, xmm0, xmm4 )
						// rows 4 and 5
						mulps xmm1, xmm6
						movlps xmm2, qword ptr [edi+72]
						movhps xmm2, qword ptr [edi+88]
						mulps xmm2, xmm7
						addps xmm1, xmm2
						shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
						movhlps xmm3, xmm1
						addps xmm1, xmm3
						STORE2LO( 16, xmm1, xmm4 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
						mPtr += 4;
					}
					return;
				}
			}
			break;
		}
		case 5: {
			switch( numRows ) {
				case 5: {		// 5x5 * 5x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						// gather the 5x5 matrix into column vectors
						// (register comments name flat matrix indices)
						movss xmm0, [edi+5*4]							// xmm0 = 5, X, X, X
						movhps xmm0, [edi+0*4]							// xmm0 = 5, X, 0, 1
						movss xmm5, [edi+15*4]							// xmm5 = 15, X, X, X
						movhps xmm5, [edi+10*4]							// xmm5 = 15, X, 10, 11
						movaps xmm1, xmm0								// xmm1 = 5, X, 0, 1
						shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 )	// xmm0 = 0, 5, 10, 15
						movlps xmm1, [edi+6*4]							// xmm1 = 6, 7, 0, 1
						movlps xmm5, [edi+16*4]							// xmm5 = 16, 17, 10, 11
						movaps xmm2, xmm1								// xmm2 = 6, 7, 0, 1
						shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm1 = 1, 6, 11, 16
						movhps xmm2, [edi+2*4]							// xmm2 = 6, 7, 2, 3
						movhps xmm5, [edi+12*4]							// xmm5 = 16, 17, 12, 13
						movaps xmm3, xmm2								// xmm3 = 6, 7, 2, 3
						shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 )	// xmm2 = 2, 7, 12, 17
						movlps xmm3, [edi+8*4]							// xmm3 = 8, 9, 2, 3
						movlps xmm5, [edi+18*4]							// xmm5 = 18, 19, 12, 13
						movss xmm4, [edi+4*4]							// xmm4 = 4, X, X, X
						movlhps xmm4, xmm3								// xmm4 = 4, X, 8, 9
						shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm3 = 3, 8, 13, 18
						movhps xmm5, [edi+14*4]							// xmm5 = 18, 19, 14, 15
						shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 )	// xmm4 = 4, 9, 14, 19
						// multiply the columns with the broadcast vector components
						movss xmm7, [esi+0*4]
						shufps xmm7, xmm7, 0
						mulps xmm0, xmm7
						movss xmm5, [esi+1*4]
						shufps xmm5, xmm5, 0
						mulps xmm1, xmm5
						addps xmm0, xmm1
						movss xmm6, [esi+2*4]
						shufps xmm6, xmm6, 0
						mulps xmm2, xmm6
						addps xmm0, xmm2
						movss xmm1, [esi+3*4]
						shufps xmm1, xmm1, 0
						mulps xmm3, xmm1
						addps xmm0, xmm3
						movss xmm2, [esi+4*4]
						shufps xmm2, xmm2, 0
						mulps xmm4, xmm2
						addps xmm0, xmm4
						// last row with scalar math
						mulss xmm7, [edi+20*4]
						mulss xmm5, [edi+21*4]
						addps xmm7, xmm5
						mulss xmm6, [edi+22*4]
						addps xmm7, xmm6
						mulss xmm1, [edi+23*4]
						addps xmm7, xmm1
						mulss xmm2, [edi+24*4]
						addps xmm7, xmm2
						STORE4( 0, xmm0, xmm3 )
						STORE1( 16, xmm7, xmm4 )
					}
					return;
				}
				case 6: {		// 6x5 * 5x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						movlps xmm6, [esi]
						shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )	// xmm6 = v0, v1, v0, v1
						movlps xmm7, [esi+8]
						shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )	// xmm7 = v2, v3, v2, v3
						movlps xmm0, [edi]
						movhps xmm3, [edi+8]
						movaps xmm1, [edi+16]
						movlps xmm2, [edi+32]
						shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm0 = 0, 1, 5, 6
						shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 4, 7, 8, 9
						shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 2, 3, 7, 8
						mulps xmm0, xmm6
						mulps xmm3, xmm7
						movlps xmm2, [edi+40]
						addps xmm0, xmm3								// xmm0 + xmm1
						movhps xmm5, [edi+40+8]
						movlps xmm3, [edi+40+16]
						movhps xmm3, [edi+40+24]
						movlps xmm4, [edi+40+32]
						shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm2 = 10, 11, 15, 16
						shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm3 = 14, 17, 18, 19
						shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm5 = 12, 13, 17, 18
						mulps xmm2, xmm6
						mulps xmm5, xmm7
						addps xmm2, xmm5								// xmm2 + xmm3
						movss xmm5, [esi+16]
						shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	// broadcast v4
						movaps xmm4, xmm0
						shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
						shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
						addps xmm0, xmm4
						mulps xmm1, xmm5
						addps xmm0, xmm1
						STORE4( 0, xmm0, xmm2 )
						// rows 4 and 5
						movlps xmm4, [edi+80]
						movhps xmm3, [edi+80+8]
						movaps xmm1, [edi+80+16]
						movlps xmm2, [edi+80+32]
						shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm4 = 20, 21, 25, 26
						shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 24, 27, 28, 29
						shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 22, 23, 27, 28
						mulps xmm4, xmm6
						mulps xmm3, xmm7
						mulps xmm1, xmm5
						addps xmm4, xmm3								// xmm4 + xmm1
						shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
						shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
						addps xmm4, xmm1
						shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
						addps xmm4, xmm1
						STORE2LO( 16, xmm4, xmm2 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
						mPtr += 5;
					}
					return;
				}
			}
			break;
		}
		case 6: {
			switch( numRows ) {
				case 1: {		// 1x6 * 6x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						movss xmm0, [esi]
						mulss xmm0, [edi]
						movss xmm1, [esi+4]
						mulss xmm1, [edi+4]
						movss xmm2, [esi+8]
						addss xmm0, xmm1
						mulss xmm2, [edi+8]
						movss xmm3, [esi+12]
						addss xmm0, xmm2
						mulss xmm3, [edi+12]
						movss xmm4, [esi+16]
						addss xmm0, xmm3
						mulss xmm4, [edi+16]
						movss xmm5, [esi+20]
						addss xmm0, xmm4
						mulss xmm5, [edi+20]
						// NOTE(review): the two loads below access a 7th element
						// ([esi+24]/[edi+24]) of a 6-element row/vector; this
						// appears to rely on idVecX/idMatX padding beyond the
						// logical size being allocated and zeroed — verify.
						movss xmm6, [esi+24]
						addss xmm0, xmm5
						mulss xmm6, [edi+24]
						addss xmm0, xmm6
						STORE1( 0, xmm0, xmm7 )
					}
					return;
				}
				case 2: {		// 2x6 * 6x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						// load idVecX
						movlps xmm4, [esi]
						movhps xmm4, [esi+8]		// xmm4 = v0, v1, v2, v3
						movlps xmm5, [esi+16]
						movlhps xmm5, xmm4			// xmm5 = v4, v5, v0, v1
						movhlps xmm6, xmm4
						movlhps xmm6, xmm5			// xmm6 = v2, v3, v4, v5
						// row 0 and 1
						movaps xmm0, [edi]
						movaps xmm1, [edi+16]
						movaps xmm2, [edi+32]
						mulps xmm0, xmm4
						mulps xmm1, xmm5
						mulps xmm2, xmm6
						movhlps xmm3, xmm0
						movlhps xmm3, xmm2
						addps xmm1, xmm3
						shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps xmm1, xmm0
						shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
						movhlps xmm0, xmm1
						addps xmm0, xmm1
						STORE2LO( 0, xmm0, xmm3 )
					}
					return;
				}
				case 3: {		// 3x6 * 6x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						// load idVecX
						movlps xmm4, [esi]
						movhps xmm4, [esi+8]
						movlps xmm5, [esi+16]
						movlhps xmm5, xmm4
						movhlps xmm6, xmm4
						movlhps xmm6, xmm5
						// row 0 and 1
						movaps xmm0, [edi]
						movaps xmm1, [edi+16]
						movaps xmm2, [edi+32]
						mulps xmm0, xmm4
						mulps xmm1, xmm5
						mulps xmm2, xmm6
						movhlps xmm3, xmm0
						movlhps xmm3, xmm2
						addps xmm1, xmm3
						shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps xmm1, xmm0
						shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
						movhlps xmm0, xmm1
						addps xmm0, xmm1
						STORE2LO( 0, xmm0, xmm3 )
						// row 2
						movaps xmm0, [edi+48]
						movaps xmm1, [edi+48+16]
						mulps xmm0, xmm4
						mulps xmm1, xmm5
						addps xmm0, xmm1
						movhlps xmm1, xmm0
						addps xmm0, xmm1
						movaps xmm1, xmm0
						shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
						addss xmm0, xmm1
						STORE1( 8, xmm0, xmm3 )
					}
					return;
				}
				case 4: {		// 4x6 * 6x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						// load idVecX
						movlps xmm4, [esi]
						movhps xmm4, [esi+8]
						movlps xmm5, [esi+16]
						movlhps xmm5, xmm4
						movhlps xmm6, xmm4
						movlhps xmm6, xmm5
						// row 0 and 1
						movaps xmm0, [edi]
						movaps xmm1, [edi+16]
						movaps xmm2, [edi+32]
						mulps xmm0, xmm4
						mulps xmm1, xmm5
						mulps xmm2, xmm6
						movhlps xmm7, xmm0
						movlhps xmm7, xmm2
						addps xmm7, xmm1
						shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps xmm7, xmm0
						// row 2 and 3
						movaps xmm0, [edi+48]
						movaps xmm1, [edi+48+16]
						movaps xmm2, [edi+48+32]
						mulps xmm0, xmm4
						mulps xmm1, xmm5
						mulps xmm2, xmm6
						movhlps xmm3, xmm0
						movlhps xmm3, xmm2
						addps xmm1, xmm3
						shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps xmm1, xmm0
						// last 4 additions for the first 4 rows and store result
						movaps xmm0, xmm7
						shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						addps xmm0, xmm7
						STORE4( 0, xmm0, xmm4 )
					}
					return;
				}
				case 5: {		// 5x6 * 6x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						// load idVecX
						movlps xmm4, [esi]
						movhps xmm4, [esi+8]
						movlps xmm5, [esi+16]
						movlhps xmm5, xmm4
						movhlps xmm6, xmm4
						movlhps xmm6, xmm5
						// row 0 and 1
						movaps xmm0, [edi]
						movaps xmm1, [edi+16]
						movaps xmm2, [edi+32]
						mulps xmm0, xmm4
						mulps xmm1, xmm5
						mulps xmm2, xmm6
						movhlps xmm7, xmm0
						movlhps xmm7, xmm2
						addps xmm7, xmm1
						shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps xmm7, xmm0
						// row 2 and 3
						movaps xmm0, [edi+48]
						movaps xmm1, [edi+48+16]
						movaps xmm2, [edi+48+32]
						mulps xmm0, xmm4
						mulps xmm1, xmm5
						mulps xmm2, xmm6
						movhlps xmm3, xmm0
						movlhps xmm3, xmm2
						addps xmm1, xmm3
						shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps xmm1, xmm0
						// last 4 additions for the first 4 rows and store result
						movaps xmm0, xmm7
						shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						addps xmm0, xmm7
						STORE4( 0, xmm0, xmm3 )
						// row 4 (last row)
						movaps xmm0, [edi+96]
						movaps xmm1, [edi+96+16]
						mulps xmm0, xmm4
						mulps xmm1, xmm5
						addps xmm0, xmm1
						movhlps xmm1, xmm0
						addps xmm0, xmm1
						movaps xmm1, xmm0
						shufps xmm1, xmm1, 0x01			// 0x01 = R_SHUFFLEPS( 1, 0, 0, 0 )
						addss xmm0, xmm1
						STORE1( 16, xmm0, xmm3 )
					}
					return;
				}
				case 6: {		// 6x6 * 6x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						movlps xmm7, qword ptr [esi]
						movlps xmm6, qword ptr [esi+8]
						shufps xmm7, xmm7, 0x44			// 0x44 = R_SHUFFLEPS( 0, 1, 0, 1 )
						shufps xmm6, xmm6, 0x44
						movlps xmm0, qword ptr [edi ]
						movhps xmm0, qword ptr [edi+ 24]
						mulps xmm0, xmm7
						movlps xmm3, qword ptr [edi+ 8]
						movhps xmm3, qword ptr [edi+ 32]
						mulps xmm3, xmm6
						movlps xmm1, qword ptr [edi+ 48]
						movhps xmm1, qword ptr [edi+ 72]
						mulps xmm1, xmm7
						movlps xmm2, qword ptr [edi+ 96]
						movhps xmm2, qword ptr [edi+120]
						mulps xmm2, xmm7
						movlps xmm4, qword ptr [edi+ 56]
						movhps xmm4, qword ptr [edi+ 80]
						movlps xmm5, qword ptr [edi+104]
						movhps xmm5, qword ptr [edi+128]
						mulps xmm4, xmm6
						movlps xmm7, qword ptr [esi+16]
						addps xmm0, xmm3
						shufps xmm7, xmm7, 0x44
						mulps xmm5, xmm6
						addps xmm1, xmm4
						movlps xmm3, qword ptr [edi+ 16]
						movhps xmm3, qword ptr [edi+ 40]
						addps xmm2, xmm5
						movlps xmm4, qword ptr [edi+ 64]
						movhps xmm4, qword ptr [edi+ 88]
						mulps xmm3, xmm7
						movlps xmm5, qword ptr [edi+112]
						movhps xmm5, qword ptr [edi+136]
						addps xmm0, xmm3
						mulps xmm4, xmm7
						mulps xmm5, xmm7
						addps xmm1, xmm4
						addps xmm2, xmm5
						movaps xmm6, xmm0
						shufps xmm0, xmm1, 0x88			// 0x88 = R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps xmm6, xmm1, 0xDD			// 0xDD = R_SHUFFLEPS( 1, 3, 1, 3 )
						movaps xmm7, xmm2
						shufps xmm7, xmm2, 0x88
						shufps xmm2, xmm2, 0xDD
						addps xmm0, xmm6
						addps xmm2, xmm7
						STORE4( 0, xmm0, xmm3 )
						STORE2LO( 16, xmm2, xmm4 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
									mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
						mPtr += 6;
					}
					return;
				}
			}
			break;
		}
		default: {
			// generic scalar fallback for any other column count
			int numColumns = mat.GetNumColumns();
			for ( int i = 0; i < numRows; i++ ) {
				float sum = mPtr[0] * vPtr[0];
				for ( int j = 1; j < numColumns; j++ ) {
					sum += mPtr[j] * vPtr[j];
				}
				dstPtr[i] STOREC sum;
				mPtr += numColumns;
			}
			break;
		}
	}

#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
6610 
6611 /*
6612 ============
6613 idSIMD_SSE::MatX_TransposeMultiplyVecX
6614 
6615  optimizes the following matrix multiplications:
6616 
6617  Nx6 * Nx1
6618  6xN * 6x1
6619 
6620  with N in the range [1-6]
6621 ============
6622 */
6623 void VPCALL idSIMD_SSE::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
6624 #define STORE1( offset, reg1, reg2 ) \
6625  __asm movss [eax+offset], reg1
6626 #define STORE2LO( offset, reg1, reg2 ) \
6627  __asm movlps [eax+offset], reg1
6628 #define STORE2HI( offset, reg1, reg2 ) \
6629  __asm movhps [eax+offset], reg1
6630 #define STORE4( offset, reg1, reg2 ) \
6631  __asm movlps [eax+offset], reg1 \
6632  __asm movhps [eax+offset+8], reg1
6633 #define STOREC =
6634 
6635  int numColumns;
6636  const float *mPtr, *vPtr;
6637  float *dstPtr;
6638 
6639  assert( vec.GetSize() >= mat.GetNumRows() );
6640  assert( dst.GetSize() >= mat.GetNumColumns() );
6641 
6642  mPtr = mat.ToFloatPtr();
6643  vPtr = vec.ToFloatPtr();
6644  dstPtr = dst.ToFloatPtr();
6645  numColumns = mat.GetNumColumns();
6646  switch( mat.GetNumRows() ) {
6647  case 1:
6648  switch( numColumns ) {
6649  case 6: { // 1x6 * 1x1
6650  __asm {
6651  mov esi, vPtr
6652  mov edi, mPtr
6653  mov eax, dstPtr
6654  movss xmm0, [esi]
6655  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
6656  movaps xmm1, xmm0
6657  mulps xmm0, [edi]
6658  mulps xmm1, [edi+16]
6659  STORE4( 0, xmm0, xmm2 )
6660  STORE2LO( 16, xmm1, xmm3 )
6661  }
6662  return;
6663  }
6664  default: {
6665  for ( int i = 0; i < numColumns; i++ ) {
6666  dstPtr[i] STOREC *(mPtr) * vPtr[0];
6667  mPtr++;
6668  }
6669  return;
6670  }
6671  }
6672  break;
6673  case 2:
6674  switch( numColumns ) {
6675  case 6: { // 2x6 * 2x1
6676  __asm {
6677  mov esi, vPtr
6678  mov edi, mPtr
6679  mov eax, dstPtr
6680  movlps xmm0, [esi]
6681  movaps xmm1, xmm0
6682  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
6683  shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
6684  movaps xmm2, [edi]
6685  mulps xmm2, xmm0
6686  movlps xmm3, [edi+24]
6687  movhps xmm3, [edi+32]
6688  mulps xmm3, xmm1
6689  addps xmm2, xmm3
6690  shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
6691  movlps xmm4, [edi+16]
6692  movhps xmm4, [edi+40]
6693  mulps xmm4, xmm0
6694  movhlps xmm3, xmm4
6695  addps xmm3, xmm4
6696  STORE4( 0, xmm2, xmm5 )
6697  STORE2LO( 16, xmm3, xmm6 )
6698  }
6699  return;
6700  }
6701  default: {
6702  for ( int i = 0; i < numColumns; i++ ) {
6703  dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
6704  mPtr++;
6705  }
6706  return;
6707  }
6708  }
6709  break;
6710  case 3:
6711  switch( numColumns ) {
6712  case 6: { // 3x6 * 3x1
6713  __asm {
6714  mov esi, vPtr
6715  mov edi, mPtr
6716  mov eax, dstPtr
6717  movlps xmm0, [esi+0*4]
6718  movss xmm1, [esi+2*4]
6719  movlps xmm3, [edi+(0*6+0)*4]
6720  movhps xmm3, [edi+(0*6+2)*4]
6721  movaps xmm4, xmm0
6722  shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
6723  mulps xmm3, xmm4
6724  movlps xmm5, [edi+(1*6+0)*4]
6725  movhps xmm5, [edi+(1*6+2)*4]
6726  movaps xmm6, xmm0
6727  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
6728  mulps xmm5, xmm6
6729  addps xmm3, xmm5
6730  movlps xmm4, [edi+(2*6+0)*4]
6731  movhps xmm4, [edi+(2*6+2)*4]
6732  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
6733  mulps xmm4, xmm1
6734  addps xmm3, xmm4
6735  STORE4( 0, xmm3, xmm7 )
6736  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
6737  movlps xmm3, [edi+(0*6+4)*4]
6738  movhps xmm3, [edi+(1*6+4)*4]
6739  mulps xmm3, xmm0
6740  movhlps xmm4, xmm3
6741  addps xmm3, xmm4
6742  movlps xmm5, [edi+(2*6+4)*4]
6743  mulps xmm5, xmm1
6744  addps xmm3, xmm5
6745  STORE2LO( 16, xmm3, xmm7 )
6746  }
6747  return;
6748  }
6749  default: {
6750  for ( int i = 0; i < numColumns; i++ ) {
6751  dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
6752  mPtr++;
6753  }
6754  return;
6755  }
6756  }
6757  break;
6758  case 4:
6759  switch( numColumns ) {
6760  case 6: { // 4x6 * 4x1
6761  __asm {
6762  mov esi, vPtr
6763  mov edi, mPtr
6764  mov eax, dstPtr
6765  movlps xmm0, [esi+0*4]
6766  movlps xmm1, [esi+2*4]
6767  movaps xmm3, xmm0
6768  shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
6769  mulps xmm3, [edi+(0*6+0)*4]
6770  movlps xmm5, [edi+(1*6+0)*4]
6771  movhps xmm5, [edi+(1*6+2)*4]
6772  movaps xmm6, xmm0
6773  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
6774  mulps xmm5, xmm6
6775  addps xmm3, xmm5
6776  movlps xmm4, [edi+(2*6+0)*4]
6777  movhps xmm4, [edi+(2*6+2)*4]
6778  movaps xmm6, xmm1
6779  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6780  mulps xmm4, xmm6
6781  addps xmm3, xmm4
6782  movlps xmm5, [edi+(3*6+0)*4]
6783  movhps xmm5, [edi+(3*6+2)*4]
6784  movaps xmm6, xmm1
6785  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
6786  mulps xmm5, xmm6
6787  addps xmm3, xmm5
6788  STORE4( 0, xmm3, xmm7 )
6789  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
6790  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
6791  movlps xmm3, [edi+(0*6+4)*4]
6792  movhps xmm3, [edi+(1*6+4)*4]
6793  mulps xmm3, xmm0
6794  movlps xmm4, [edi+(2*6+4)*4]
6795  movhps xmm4, [edi+(3*6+4)*4]
6796  mulps xmm4, xmm1
6797  addps xmm3, xmm4
6798  movhlps xmm4, xmm3
6799  addps xmm3, xmm4
6800  STORE2LO( 16, xmm3, xmm7 )
6801  }
6802  return;
6803  }
6804  default: {
6805  for ( int i = 0; i < numColumns; i++ ) {
6806  dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
6807  *(mPtr+3*numColumns) * vPtr[3];
6808  mPtr++;
6809  }
6810  return;
6811  }
6812  }
6813  break;
6814  case 5:
6815  switch( numColumns ) {
6816  case 6: { // 5x6 * 5x1
6817  __asm {
6818  mov esi, vPtr
6819  mov edi, mPtr
6820  mov eax, dstPtr
6821  movlps xmm0, [esi+0*4]
6822  movlps xmm1, [esi+2*4]
6823  movss xmm2, [esi+4*4]
6824  movaps xmm3, xmm0
6825  shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
6826  mulps xmm3, [edi+(0*6+0)*4]
6827  movlps xmm5, [edi+(1*6+0)*4]
6828  movhps xmm5, [edi+(1*6+2)*4]
6829  movaps xmm6, xmm0
6830  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
6831  mulps xmm5, xmm6
6832  addps xmm3, xmm5
6833  movaps xmm6, xmm1
6834  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6835  mulps xmm6, [edi+(2*6+0)*4]
6836  addps xmm3, xmm6
6837  movlps xmm5, [edi+(3*6+0)*4]
6838  movhps xmm5, [edi+(3*6+2)*4]
6839  movaps xmm6, xmm1
6840  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
6841  mulps xmm5, xmm6
6842  addps xmm3, xmm5
6843  shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
6844  movaps xmm4, xmm2
6845  mulps xmm4, [edi+(4*6+0)*4]
6846  addps xmm3, xmm4
6847  STORE4( 0, xmm3, xmm7 )
6848  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
6849  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
6850  movlps xmm3, [edi+(0*6+4)*4]
6851  movhps xmm3, [edi+(1*6+4)*4]
6852  mulps xmm3, xmm0
6853  movlps xmm4, [edi+(2*6+4)*4]
6854  movhps xmm4, [edi+(3*6+4)*4]
6855  mulps xmm4, xmm1
6856  addps xmm3, xmm4
6857  movhlps xmm4, xmm3
6858  addps xmm3, xmm4
6859  movlps xmm5, [edi+(4*6+4)*4]
6860  mulps xmm5, xmm2
6861  addps xmm3, xmm5
6862  STORE2LO( 16, xmm3, xmm7 )
6863  }
6864  return;
6865  }
6866  default: {
6867  for ( int i = 0; i < numColumns; i++ ) {
6868  dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
6869  *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
6870  mPtr++;
6871  }
6872  return;
6873  }
6874  }
6875  break;
6876  case 6:
6877  switch( numColumns ) {
6878  case 1: { // 6x1 * 6x1
6879  __asm {
6880  mov esi, vPtr
6881  mov edi, mPtr
6882  mov eax, dstPtr
6883  movlps xmm0, [esi]
6884  movhps xmm0, [esi+8]
6885  movlps xmm1, [esi+16]
6886  mulps xmm0, [edi]
6887  mulps xmm1, [edi+16]
6888  shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
6889  addps xmm0, xmm1
6890  movhlps xmm2, xmm0
6891  addss xmm2, xmm0
6892  shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
6893  addss xmm2, xmm0
6894  STORE1( 0, xmm2, xmm3 )
6895  }
6896  return;
6897  }
6898  case 2: { // 6x2 * 6x1
6899  __asm {
6900  mov esi, vPtr
6901  mov edi, mPtr
6902  mov eax, dstPtr
6903  movlps xmm0, [esi+0*4]
6904  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
6905  movaps xmm6, [edi+0*4]
6906  mulps xmm6, xmm0
6907  movlps xmm1, [esi+2*4]
6908  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
6909  movaps xmm7, [edi+4*4]
6910  mulps xmm7, xmm1
6911  addps xmm6, xmm7
6912  movlps xmm2, [esi+4*4]
6913  shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
6914  movaps xmm7, [edi+8*4]
6915  mulps xmm7, xmm2
6916  addps xmm6, xmm7
6917  movhlps xmm3, xmm6
6918  addps xmm3, xmm6
6919  STORE2LO( 0, xmm3, xmm7 )
6920  }
6921  return;
6922  }
6923  case 3: { // 6x3 * 6x1
6924  __asm {
6925  mov esi, vPtr
6926  mov edi, mPtr
6927  mov eax, dstPtr
6928  movss xmm0, [edi+(0*3+2)*4]
6929  movhps xmm0, [edi+(0*3+0)*4]
6930  shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
6931  movss xmm6, [esi+0*4]
6932  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6933  mulps xmm6, xmm0
6934  movss xmm1, [edi+(1*3+0)*4]
6935  movhps xmm1, [edi+(1*3+1)*4]
6936  movss xmm7, [esi+1*4]
6937  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6938  mulps xmm7, xmm1
6939  addps xmm6, xmm7
6940  movss xmm2, [edi+(2*3+2)*4]
6941  movhps xmm2, [edi+(2*3+0)*4]
6942  shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
6943  movss xmm7, [esi+2*4]
6944  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6945  mulps xmm7, xmm2
6946  addps xmm6, xmm7
6947  movss xmm3, [edi+(3*3+0)*4]
6948  movhps xmm3, [edi+(3*3+1)*4]
6949  movss xmm7, [esi+3*4]
6950  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6951  mulps xmm7, xmm3
6952  addps xmm6, xmm7
6953  movss xmm4, [edi+(4*3+2)*4]
6954  movhps xmm4, [edi+(4*3+0)*4]
6955  shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
6956  movss xmm7, [esi+4*4]
6957  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6958  mulps xmm7, xmm4
6959  addps xmm6, xmm7
6960  movss xmm5, [edi+(5*3+0)*4]
6961  movhps xmm5, [edi+(5*3+1)*4]
6962  movss xmm7, [esi+5*4]
6963  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6964  mulps xmm7, xmm5
6965  addps xmm6, xmm7
6966  STORE1( 0, xmm6, xmm7 )
6967  STORE2HI( 4, xmm6, xmm7 )
6968  }
6969  return;
6970  }
6971  case 4: { // 6x4 * 6x1
6972  __asm {
6973  mov esi, vPtr
6974  mov edi, mPtr
6975  mov eax, dstPtr
6976  movlps xmm3, [edi+(0*4+0)*4]
6977  movhps xmm3, [edi+(0*4+2)*4]
6978  movss xmm4, [esi+0*4]
6979  shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
6980  mulps xmm3, xmm4
6981  movlps xmm5, [edi+(1*4+0)*4]
6982  movhps xmm5, [edi+(1*4+2)*4]
6983  movss xmm6, [esi+1*4]
6984  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6985  mulps xmm5, xmm6
6986  addps xmm3, xmm5
6987  movlps xmm4, [edi+(2*4+0)*4]
6988  movhps xmm4, [edi+(2*4+2)*4]
6989  movss xmm6, [esi+2*4]
6990  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6991  mulps xmm4, xmm6
6992  addps xmm3, xmm4
6993  movlps xmm5, [edi+(3*4+0)*4]
6994  movhps xmm5, [edi+(3*4+2)*4]
6995  movss xmm6, [esi+3*4]
6996  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6997  mulps xmm5, xmm6
6998  addps xmm3, xmm5
6999  movlps xmm4, [edi+(4*4+0)*4]
7000  movhps xmm4, [edi+(4*4+2)*4]
7001  movss xmm6, [esi+4*4]
7002  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7003  mulps xmm4, xmm6
7004  addps xmm3, xmm4
7005  movlps xmm5, [edi+(5*4+0)*4]
7006  movhps xmm5, [edi+(5*4+2)*4]
7007  movss xmm6, [esi+5*4]
7008  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7009  mulps xmm5, xmm6
7010  addps xmm3, xmm5
7011  STORE4( 0, xmm3, xmm7 )
7012  }
7013  return;
7014  }
7015  case 5: { // 6x5 * 6x1
7016  __asm {
7017  mov esi, vPtr
7018  mov edi, mPtr
7019  mov eax, dstPtr
7020  movlps xmm6, [edi+(0*5+0)*4]
7021  movhps xmm6, [edi+(0*5+2)*4]
7022  movss xmm0, [esi+0*4]
7023  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
7024  mulps xmm6, xmm0
7025  movlps xmm7, [edi+(1*5+0)*4]
7026  movhps xmm7, [edi+(1*5+2)*4]
7027  movss xmm1, [esi+1*4]
7028  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
7029  mulps xmm7, xmm1
7030  addps xmm6, xmm7
7031  movlps xmm7, [edi+(2*5+0)*4]
7032  movhps xmm7, [edi+(2*5+2)*4]
7033  movss xmm2, [esi+2*4]
7034  shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
7035  mulps xmm7, xmm2
7036  addps xmm6, xmm7
7037  movlps xmm7, [edi+(3*5+0)*4]
7038  movhps xmm7, [edi+(3*5+2)*4]
7039  movss xmm3, [esi+3*4]
7040  shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7041  mulps xmm7, xmm3
7042  addps xmm6, xmm7
7043  movlps xmm7, [edi+(4*5+0)*4]
7044  movhps xmm7, [edi+(4*5+2)*4]
7045  movss xmm4, [esi+4*4]
7046  shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
7047  mulps xmm7, xmm4
7048  addps xmm6, xmm7
7049  movlps xmm7, [edi+(5*5+0)*4]
7050  movhps xmm7, [edi+(5*5+2)*4]
7051  movss xmm5, [esi+5*4]
7052  shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
7053  mulps xmm7, xmm5
7054  addps xmm6, xmm7
7055  STORE4( 0, xmm6, xmm7 )
7056  movss xmm6, [edi+(0*5+4)*4]
7057  mulss xmm6, xmm0
7058  movss xmm7, [edi+(1*5+4)*4]
7059  mulss xmm7, xmm1
7060  addss xmm6, xmm7
7061  movss xmm7, [edi+(2*5+4)*4]
7062  mulss xmm7, xmm2
7063  addss xmm6, xmm7
7064  movss xmm7, [edi+(3*5+4)*4]
7065  mulss xmm7, xmm3
7066  addss xmm6, xmm7
7067  movss xmm7, [edi+(4*5+4)*4]
7068  mulss xmm7, xmm4
7069  addss xmm6, xmm7
7070  movss xmm7, [edi+(5*5+4)*4]
7071  mulss xmm7, xmm5
7072  addss xmm6, xmm7
7073  STORE1( 16, xmm6, xmm7 )
7074  }
7075  return;
7076  }
7077  case 6: { // 6x6 * 6x1
7078  __asm {
7079  mov esi, vPtr
7080  mov edi, mPtr
7081  mov eax, dstPtr
7082  movlps xmm0, [esi+0*4]
7083  movlps xmm1, [esi+2*4]
7084  movlps xmm2, [esi+4*4]
7085  movaps xmm3, xmm0
7086  shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7087  mulps xmm3, [edi+(0*6+0)*4]
7088  movlps xmm5, [edi+(1*6+0)*4]
7089  movhps xmm5, [edi+(1*6+2)*4]
7090  movaps xmm6, xmm0
7091  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7092  mulps xmm5, xmm6
7093  addps xmm3, xmm5
7094  movaps xmm6, xmm1
7095  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7096  mulps xmm6, [edi+(2*6+0)*4]
7097  addps xmm3, xmm6
7098  movaps xmm6, xmm1
7099  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7100  movlps xmm5, [edi+(3*6+0)*4]
7101  movhps xmm5, [edi+(3*6+2)*4]
7102  mulps xmm5, xmm6
7103  addps xmm3, xmm5
7104  movaps xmm6, xmm2
7105  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7106  mulps xmm6, [edi+(4*6+0)*4]
7107  addps xmm3, xmm6
7108  movaps xmm6, xmm2
7109  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7110  movlps xmm5, [edi+(5*6+0)*4]
7111  movhps xmm5, [edi+(5*6+2)*4]
7112  mulps xmm5, xmm6
7113  addps xmm3, xmm5
7114  STORE4( 0, xmm3, xmm7 )
7115  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7116  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7117  shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
7118  movlps xmm3, [edi+(0*6+4)*4]
7119  movhps xmm3, [edi+(1*6+4)*4]
7120  mulps xmm3, xmm0
7121  movlps xmm4, [edi+(2*6+4)*4]
7122  movhps xmm4, [edi+(3*6+4)*4]
7123  mulps xmm4, xmm1
7124  addps xmm3, xmm4
7125  movlps xmm5, [edi+(4*6+4)*4]
7126  movhps xmm5, [edi+(5*6+4)*4]
7127  mulps xmm5, xmm2
7128  addps xmm3, xmm5
7129  movhlps xmm4, xmm3
7130  addps xmm3, xmm4
7131  STORE2LO( 16, xmm3, xmm7 )
7132  }
7133  return;
7134  }
7135  default: {
7136  for ( int i = 0; i < numColumns; i++ ) {
7137  dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
7138  *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
7139  mPtr++;
7140  }
7141  return;
7142  }
7143  }
7144  break;
7145  default:
7146  int numRows = mat.GetNumRows();
7147  for ( int i = 0; i < numColumns; i++ ) {
7148  mPtr = mat.ToFloatPtr() + i;
7149  float sum = mPtr[0] * vPtr[0];
7150  for ( int j = 1; j < numRows; j++ ) {
7151  mPtr += numColumns;
7152  sum += mPtr[0] * vPtr[j];
7153  }
7154  dstPtr[i] STOREC sum;
7155  }
7156  break;
7157  }
7158 
7159 #undef STOREC
7160 #undef STORE4
7161 #undef STORE2HI
7162 #undef STORE2LO
7163 #undef STORE1
7164 }
7165 
7166 /*
7167 ============
7168 idSIMD_SSE::MatX_TransposeMultiplyAddVecX
7169 
7170  optimizes the following matrix multiplications:
7171 
7172  Nx6 * Nx1
7173  6xN * 6x1
7174 
7175  with N in the range [1-6]
7176 ============
7177 */
void VPCALL idSIMD_SSE::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
// Computes dst += Transpose( mat ) * vec, i.e. dst[i] += dot( column i of mat, vec ).
// The STORE* macros below perform the read-modify-write accumulate into dst
// (load current dst values, add the computed partial result, store back);
// STOREC is the matching "+=" operator used by the scalar fallback paths.
// Register convention in all __asm blocks: esi = vec data, edi = matrix data,
// eax = dst data. reg2 in each STORE macro is a scratch register.
#define STORE1( offset, reg1, reg2 ) \
	__asm movss reg2, [eax+offset] \
	__asm addss reg2, reg1 \
	__asm movss [eax+offset], reg2
#define STORE2LO( offset, reg1, reg2 ) \
	__asm movlps reg2, [eax+offset] \
	__asm addps reg2, reg1 \
	__asm movlps [eax+offset], reg2
#define STORE2HI( offset, reg1, reg2 ) \
	__asm movhps reg2, [eax+offset] \
	__asm addps reg2, reg1 \
	__asm movhps [eax+offset], reg2
#define STORE4( offset, reg1, reg2 ) \
	__asm movlps reg2, [eax+offset] \
	__asm movhps reg2, [eax+offset+8] \
	__asm addps reg2, reg1 \
	__asm movlps [eax+offset], reg2 \
	__asm movhps [eax+offset+8], reg2
#define STOREC +=

	int numColumns;
	const float *mPtr, *vPtr;
	float *dstPtr;

	// vec must supply one element per matrix row; dst must have room for one
	// result per matrix column (the transpose swaps the roles of rows/columns).
	assert( vec.GetSize() >= mat.GetNumRows() );
	assert( dst.GetSize() >= mat.GetNumColumns() );

	mPtr = mat.ToFloatPtr();
	vPtr = vec.ToFloatPtr();
	dstPtr = dst.ToFloatPtr();
	numColumns = mat.GetNumColumns();
	// Dispatch on the row count; each row count has a hand-optimized SSE path
	// for the common 6-column case (and, for 6 rows, for columns 1-6), plus a
	// scalar fallback for all other column counts.
	// NOTE(review): R_SHUFFLEPS presumably packs the four lane selectors into
	// the shufps immediate byte -- defined in Simd_SSE.h; confirm there.
	switch( mat.GetNumRows() ) {
	case 1:
		switch( numColumns ) {
		case 6: {		// 1x6 * 1x1
			__asm {
				mov esi, vPtr
				mov edi, mPtr
				mov eax, dstPtr
				// broadcast the single vector element and scale all 6 columns
				movss xmm0, [esi]
				shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
				movaps xmm1, xmm0
				mulps xmm0, [edi]
				mulps xmm1, [edi+16]
				STORE4( 0, xmm0, xmm2 )
				STORE2LO( 16, xmm1, xmm3 )
			}
			return;
		}
		default: {
			// scalar fallback: dst[i] += M[0][i] * v[0]
			for ( int i = 0; i < numColumns; i++ ) {
				dstPtr[i] STOREC *(mPtr) * vPtr[0];
				mPtr++;
			}
			return;
		}
		}
		break;
	case 2:
		switch( numColumns ) {
		case 6: {		// 2x6 * 2x1
			__asm {
				mov esi, vPtr
				mov edi, mPtr
				mov eax, dstPtr
				// xmm0 = v[0] splatted, xmm1 = v[1] splatted
				movlps xmm0, [esi]
				movaps xmm1, xmm0
				shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
				shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
				movaps xmm2, [edi]
				mulps xmm2, xmm0
				movlps xmm3, [edi+24]
				movhps xmm3, [edi+32]
				mulps xmm3, xmm1
				addps xmm2, xmm3
				// remaining two columns: combine both rows, then fold halves
				shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
				movlps xmm4, [edi+16]
				movhps xmm4, [edi+40]
				mulps xmm4, xmm0
				movhlps xmm3, xmm4
				addps xmm3, xmm4
				STORE4( 0, xmm2, xmm5 )
				STORE2LO( 16, xmm3, xmm6 )
			}
			return;
		}
		default: {
			for ( int i = 0; i < numColumns; i++ ) {
				dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
				mPtr++;
			}
			return;
		}
		}
		break;
	case 3:
		switch( numColumns ) {
		case 6: {		// 3x6 * 3x1
			__asm {
				mov esi, vPtr
				mov edi, mPtr
				mov eax, dstPtr
				movlps xmm0, [esi+0*4]
				movss xmm1, [esi+2*4]
				// columns 0-3: sum of the three rows scaled by v[0..2]
				movlps xmm3, [edi+(0*6+0)*4]
				movhps xmm3, [edi+(0*6+2)*4]
				movaps xmm4, xmm0
				shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm3, xmm4
				movlps xmm5, [edi+(1*6+0)*4]
				movhps xmm5, [edi+(1*6+2)*4]
				movaps xmm6, xmm0
				shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
				mulps xmm5, xmm6
				addps xmm3, xmm5
				movlps xmm4, [edi+(2*6+0)*4]
				movhps xmm4, [edi+(2*6+2)*4]
				shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm4, xmm1
				addps xmm3, xmm4
				STORE4( 0, xmm3, xmm7 )
				// columns 4-5
				shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
				movlps xmm3, [edi+(0*6+4)*4]
				movhps xmm3, [edi+(1*6+4)*4]
				mulps xmm3, xmm0
				movhlps xmm4, xmm3
				addps xmm3, xmm4
				movlps xmm5, [edi+(2*6+4)*4]
				mulps xmm5, xmm1
				addps xmm3, xmm5
				STORE2LO( 16, xmm3, xmm7 )
			}
			return;
		}
		default: {
			for ( int i = 0; i < numColumns; i++ ) {
				dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
				mPtr++;
			}
			return;
		}
		}
		break;
	case 4:
		switch( numColumns ) {
		case 6: {		// 4x6 * 4x1
			__asm {
				mov esi, vPtr
				mov edi, mPtr
				mov eax, dstPtr
				movlps xmm0, [esi+0*4]
				movlps xmm1, [esi+2*4]
				// columns 0-3: accumulate the four rows scaled by v[0..3]
				movaps xmm3, xmm0
				shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm3, [edi+(0*6+0)*4]
				movlps xmm5, [edi+(1*6+0)*4]
				movhps xmm5, [edi+(1*6+2)*4]
				movaps xmm6, xmm0
				shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
				mulps xmm5, xmm6
				addps xmm3, xmm5
				movlps xmm4, [edi+(2*6+0)*4]
				movhps xmm4, [edi+(2*6+2)*4]
				movaps xmm6, xmm1
				shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm4, xmm6
				addps xmm3, xmm4
				movlps xmm5, [edi+(3*6+0)*4]
				movhps xmm5, [edi+(3*6+2)*4]
				movaps xmm6, xmm1
				shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
				mulps xmm5, xmm6
				addps xmm3, xmm5
				STORE4( 0, xmm3, xmm7 )
				// columns 4-5: pairwise layout (v0,v0,v1,v1), folded via movhlps
				shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
				shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
				movlps xmm3, [edi+(0*6+4)*4]
				movhps xmm3, [edi+(1*6+4)*4]
				mulps xmm3, xmm0
				movlps xmm4, [edi+(2*6+4)*4]
				movhps xmm4, [edi+(3*6+4)*4]
				mulps xmm4, xmm1
				addps xmm3, xmm4
				movhlps xmm4, xmm3
				addps xmm3, xmm4
				STORE2LO( 16, xmm3, xmm7 )
			}
			return;
		}
		default: {
			for ( int i = 0; i < numColumns; i++ ) {
				dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
							*(mPtr+3*numColumns) * vPtr[3];
				mPtr++;
			}
			return;
		}
		}
		break;
	case 5:
		switch( numColumns ) {
		case 6: {		// 5x6 * 5x1
			__asm {
				mov esi, vPtr
				mov edi, mPtr
				mov eax, dstPtr
				movlps xmm0, [esi+0*4]
				movlps xmm1, [esi+2*4]
				movss xmm2, [esi+4*4]
				// columns 0-3
				movaps xmm3, xmm0
				shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm3, [edi+(0*6+0)*4]
				movlps xmm5, [edi+(1*6+0)*4]
				movhps xmm5, [edi+(1*6+2)*4]
				movaps xmm6, xmm0
				shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
				mulps xmm5, xmm6
				addps xmm3, xmm5
				movaps xmm6, xmm1
				shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm6, [edi+(2*6+0)*4]
				addps xmm3, xmm6
				movlps xmm5, [edi+(3*6+0)*4]
				movhps xmm5, [edi+(3*6+2)*4]
				movaps xmm6, xmm1
				shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
				mulps xmm5, xmm6
				addps xmm3, xmm5
				shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
				movaps xmm4, xmm2
				mulps xmm4, [edi+(4*6+0)*4]
				addps xmm3, xmm4
				STORE4( 0, xmm3, xmm7 )
				// columns 4-5: fold rows 0-3 pairwise, then add row 4
				shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
				shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
				movlps xmm3, [edi+(0*6+4)*4]
				movhps xmm3, [edi+(1*6+4)*4]
				mulps xmm3, xmm0
				movlps xmm4, [edi+(2*6+4)*4]
				movhps xmm4, [edi+(3*6+4)*4]
				mulps xmm4, xmm1
				addps xmm3, xmm4
				movhlps xmm4, xmm3
				addps xmm3, xmm4
				movlps xmm5, [edi+(4*6+4)*4]
				mulps xmm5, xmm2
				addps xmm3, xmm5
				STORE2LO( 16, xmm3, xmm7 )
			}
			return;
		}
		default: {
			for ( int i = 0; i < numColumns; i++ ) {
				dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
							*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
				mPtr++;
			}
			return;
		}
		}
		break;
	case 6:
		switch( numColumns ) {
		case 1: {		// 6x1 * 6x1
			// single dot product of two 6-element vectors with a horizontal add
			__asm {
				mov esi, vPtr
				mov edi, mPtr
				mov eax, dstPtr
				movlps xmm0, [esi]
				movhps xmm0, [esi+8]
				movlps xmm1, [esi+16]
				mulps xmm0, [edi]
				mulps xmm1, [edi+16]
				shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
				addps xmm0, xmm1
				movhlps xmm2, xmm0
				addss xmm2, xmm0
				shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
				addss xmm2, xmm0
				STORE1( 0, xmm2, xmm3 )
			}
			return;
		}
		case 2: {		// 6x2 * 6x1
			// processes the 6x2 matrix two rows (= one xmm register) at a time
			__asm {
				mov esi, vPtr
				mov edi, mPtr
				mov eax, dstPtr
				movlps xmm0, [esi+0*4]
				shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
				movaps xmm6, [edi+0*4]
				mulps xmm6, xmm0
				movlps xmm1, [esi+2*4]
				shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
				movaps xmm7, [edi+4*4]
				mulps xmm7, xmm1
				addps xmm6, xmm7
				movlps xmm2, [esi+4*4]
				shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
				movaps xmm7, [edi+8*4]
				mulps xmm7, xmm2
				addps xmm6, xmm7
				movhlps xmm3, xmm6
				addps xmm3, xmm6
				STORE2LO( 0, xmm3, xmm7 )
			}
			return;
		}
		case 3: {		// 6x3 * 6x1
			// gathers each 3-element matrix row via movss+movhps (+shufps to
			// reorder), broadcasts the matching vector element, and accumulates
			__asm {
				mov esi, vPtr
				mov edi, mPtr
				mov eax, dstPtr
				movss xmm0, [edi+(0*3+2)*4]
				movhps xmm0, [edi+(0*3+0)*4]
				shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
				movss xmm6, [esi+0*4]
				shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm6, xmm0
				movss xmm1, [edi+(1*3+0)*4]
				movhps xmm1, [edi+(1*3+1)*4]
				movss xmm7, [esi+1*4]
				shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm7, xmm1
				addps xmm6, xmm7
				movss xmm2, [edi+(2*3+2)*4]
				movhps xmm2, [edi+(2*3+0)*4]
				shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
				movss xmm7, [esi+2*4]
				shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm7, xmm2
				addps xmm6, xmm7
				movss xmm3, [edi+(3*3+0)*4]
				movhps xmm3, [edi+(3*3+1)*4]
				movss xmm7, [esi+3*4]
				shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm7, xmm3
				addps xmm6, xmm7
				movss xmm4, [edi+(4*3+2)*4]
				movhps xmm4, [edi+(4*3+0)*4]
				shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
				movss xmm7, [esi+4*4]
				shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm7, xmm4
				addps xmm6, xmm7
				movss xmm5, [edi+(5*3+0)*4]
				movhps xmm5, [edi+(5*3+1)*4]
				movss xmm7, [esi+5*4]
				shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm7, xmm5
				addps xmm6, xmm7
				// result is split across the low scalar and the high pair
				STORE1( 0, xmm6, xmm7 )
				STORE2HI( 4, xmm6, xmm7 )
			}
			return;
		}
		case 4: {		// 6x4 * 6x1
			// one full xmm register per 4-element matrix row
			__asm {
				mov esi, vPtr
				mov edi, mPtr
				mov eax, dstPtr
				movlps xmm3, [edi+(0*4+0)*4]
				movhps xmm3, [edi+(0*4+2)*4]
				movss xmm4, [esi+0*4]
				shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm3, xmm4
				movlps xmm5, [edi+(1*4+0)*4]
				movhps xmm5, [edi+(1*4+2)*4]
				movss xmm6, [esi+1*4]
				shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm5, xmm6
				addps xmm3, xmm5
				movlps xmm4, [edi+(2*4+0)*4]
				movhps xmm4, [edi+(2*4+2)*4]
				movss xmm6, [esi+2*4]
				shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm4, xmm6
				addps xmm3, xmm4
				movlps xmm5, [edi+(3*4+0)*4]
				movhps xmm5, [edi+(3*4+2)*4]
				movss xmm6, [esi+3*4]
				shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm5, xmm6
				addps xmm3, xmm5
				movlps xmm4, [edi+(4*4+0)*4]
				movhps xmm4, [edi+(4*4+2)*4]
				movss xmm6, [esi+4*4]
				shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm4, xmm6
				addps xmm3, xmm4
				movlps xmm5, [edi+(5*4+0)*4]
				movhps xmm5, [edi+(5*4+2)*4]
				movss xmm6, [esi+5*4]
				shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm5, xmm6
				addps xmm3, xmm5
				STORE4( 0, xmm3, xmm7 )
			}
			return;
		}
		case 5: {		// 6x5 * 6x1
			// first 4 columns vectorized; the 5th column is a scalar dot product
			__asm {
				mov esi, vPtr
				mov edi, mPtr
				mov eax, dstPtr
				movlps xmm6, [edi+(0*5+0)*4]
				movhps xmm6, [edi+(0*5+2)*4]
				movss xmm0, [esi+0*4]
				shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm6, xmm0
				movlps xmm7, [edi+(1*5+0)*4]
				movhps xmm7, [edi+(1*5+2)*4]
				movss xmm1, [esi+1*4]
				shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm7, xmm1
				addps xmm6, xmm7
				movlps xmm7, [edi+(2*5+0)*4]
				movhps xmm7, [edi+(2*5+2)*4]
				movss xmm2, [esi+2*4]
				shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm7, xmm2
				addps xmm6, xmm7
				movlps xmm7, [edi+(3*5+0)*4]
				movhps xmm7, [edi+(3*5+2)*4]
				movss xmm3, [esi+3*4]
				shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm7, xmm3
				addps xmm6, xmm7
				movlps xmm7, [edi+(4*5+0)*4]
				movhps xmm7, [edi+(4*5+2)*4]
				movss xmm4, [esi+4*4]
				shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm7, xmm4
				addps xmm6, xmm7
				movlps xmm7, [edi+(5*5+0)*4]
				movhps xmm7, [edi+(5*5+2)*4]
				movss xmm5, [esi+5*4]
				shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm7, xmm5
				addps xmm6, xmm7
				STORE4( 0, xmm6, xmm7 )
				// 5th column: scalar accumulate using the splatted v[0..5]
				// still held in xmm0-xmm5
				movss xmm6, [edi+(0*5+4)*4]
				mulss xmm6, xmm0
				movss xmm7, [edi+(1*5+4)*4]
				mulss xmm7, xmm1
				addss xmm6, xmm7
				movss xmm7, [edi+(2*5+4)*4]
				mulss xmm7, xmm2
				addss xmm6, xmm7
				movss xmm7, [edi+(3*5+4)*4]
				mulss xmm7, xmm3
				addss xmm6, xmm7
				movss xmm7, [edi+(4*5+4)*4]
				mulss xmm7, xmm4
				addss xmm6, xmm7
				movss xmm7, [edi+(5*5+4)*4]
				mulss xmm7, xmm5
				addss xmm6, xmm7
				STORE1( 16, xmm6, xmm7 )
			}
			return;
		}
		case 6: {		// 6x6 * 6x1
			__asm {
				mov esi, vPtr
				mov edi, mPtr
				mov eax, dstPtr
				movlps xmm0, [esi+0*4]
				movlps xmm1, [esi+2*4]
				movlps xmm2, [esi+4*4]
				// columns 0-3: six row*scalar products accumulated into xmm3
				movaps xmm3, xmm0
				shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm3, [edi+(0*6+0)*4]
				movlps xmm5, [edi+(1*6+0)*4]
				movhps xmm5, [edi+(1*6+2)*4]
				movaps xmm6, xmm0
				shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
				mulps xmm5, xmm6
				addps xmm3, xmm5
				movaps xmm6, xmm1
				shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm6, [edi+(2*6+0)*4]
				addps xmm3, xmm6
				movaps xmm6, xmm1
				shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
				movlps xmm5, [edi+(3*6+0)*4]
				movhps xmm5, [edi+(3*6+2)*4]
				mulps xmm5, xmm6
				addps xmm3, xmm5
				movaps xmm6, xmm2
				shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
				mulps xmm6, [edi+(4*6+0)*4]
				addps xmm3, xmm6
				movaps xmm6, xmm2
				shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
				movlps xmm5, [edi+(5*6+0)*4]
				movhps xmm5, [edi+(5*6+2)*4]
				mulps xmm5, xmm6
				addps xmm3, xmm5
				STORE4( 0, xmm3, xmm7 )
				// columns 4-5: rows combined pairwise (v0,v0,v1,v1) etc.,
				// then the two halves folded with movhlps
				shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
				shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
				shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
				movlps xmm3, [edi+(0*6+4)*4]
				movhps xmm3, [edi+(1*6+4)*4]
				mulps xmm3, xmm0
				movlps xmm4, [edi+(2*6+4)*4]
				movhps xmm4, [edi+(3*6+4)*4]
				mulps xmm4, xmm1
				addps xmm3, xmm4
				movlps xmm5, [edi+(4*6+4)*4]
				movhps xmm5, [edi+(5*6+4)*4]
				mulps xmm5, xmm2
				addps xmm3, xmm5
				movhlps xmm4, xmm3
				addps xmm3, xmm4
				STORE2LO( 16, xmm3, xmm7 )
			}
			return;
		}
		default: {
			for ( int i = 0; i < numColumns; i++ ) {
				dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
							*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
				mPtr++;
			}
			return;
		}
		}
		break;
	default:
		// generic path for more than 6 rows: plain column-by-column dot product
		int numRows = mat.GetNumRows();
		for ( int i = 0; i < numColumns; i++ ) {
			mPtr = mat.ToFloatPtr() + i;
			float sum = mPtr[0] * vPtr[0];
			for ( int j = 1; j < numRows; j++ ) {
				mPtr += numColumns;
				sum += mPtr[0] * vPtr[j];
			}
			dstPtr[i] STOREC sum;
		}
		break;
	}

#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
7729 
7730 /*
7731 ============
idSIMD_SSE::MatX_TransposeMultiplySubVecX
7733 
7734  optimizes the following matrix multiplications:
7735 
7736  Nx6 * Nx1
7737  6xN * 6x1
7738 
7739  with N in the range [1-6]
7740 ============
7741 */
7742 void VPCALL idSIMD_SSE::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
7743 #define STORE1( offset, reg1, reg2 ) \
7744  __asm movss reg2, [eax+offset] \
7745  __asm subss reg2, reg1 \
7746  __asm movss [eax+offset], reg2
7747 #define STORE2LO( offset, reg1, reg2 ) \
7748  __asm movlps reg2, [eax+offset] \
7749  __asm subps reg2, reg1 \
7750  __asm movlps [eax+offset], reg2
7751 #define STORE2HI( offset, reg1, reg2 ) \
7752  __asm movhps reg2, [eax+offset] \
7753  __asm subps reg2, reg1 \
7754  __asm movhps [eax+offset], reg2
7755 #define STORE4( offset, reg1, reg2 ) \
7756  __asm movlps reg2, [eax+offset] \
7757  __asm movhps reg2, [eax+offset+8] \
7758  __asm subps reg2, reg1 \
7759  __asm movlps [eax+offset], reg2 \
7760  __asm movhps [eax+offset+8], reg2
7761 #define STOREC -=
7762 
7763  int numColumns;
7764  const float *mPtr, *vPtr;
7765  float *dstPtr;
7766 
7767  assert( vec.GetSize() >= mat.GetNumRows() );
7768  assert( dst.GetSize() >= mat.GetNumColumns() );
7769 
7770  mPtr = mat.ToFloatPtr();
7771  vPtr = vec.ToFloatPtr();
7772  dstPtr = dst.ToFloatPtr();
7773  numColumns = mat.GetNumColumns();
7774  switch( mat.GetNumRows() ) {
7775  case 1:
7776  switch( numColumns ) {
7777  case 6: { // 1x6 * 1x1
7778  __asm {
7779  mov esi, vPtr
7780  mov edi, mPtr
7781  mov eax, dstPtr
7782  movss xmm0, [esi]
7783  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
7784  movaps xmm1, xmm0
7785  mulps xmm0, [edi]
7786  mulps xmm1, [edi+16]
7787  STORE4( 0, xmm0, xmm2 )
7788  STORE2LO( 16, xmm1, xmm3 )
7789  }
7790  return;
7791  }
7792  default: {
7793  for ( int i = 0; i < numColumns; i++ ) {
7794  dstPtr[i] STOREC *(mPtr) * vPtr[0];
7795  mPtr++;
7796  }
7797  return;
7798  }
7799  }
7800  break;
7801  case 2:
7802  switch( numColumns ) {
7803  case 6: { // 2x6 * 2x1
7804  __asm {
7805  mov esi, vPtr
7806  mov edi, mPtr
7807  mov eax, dstPtr
7808  movlps xmm0, [esi]
7809  movaps xmm1, xmm0
7810  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
7811  shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
7812  movaps xmm2, [edi]
7813  mulps xmm2, xmm0
7814  movlps xmm3, [edi+24]
7815  movhps xmm3, [edi+32]
7816  mulps xmm3, xmm1
7817  addps xmm2, xmm3
7818  shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
7819  movlps xmm4, [edi+16]
7820  movhps xmm4, [edi+40]
7821  mulps xmm4, xmm0
7822  movhlps xmm3, xmm4
7823  addps xmm3, xmm4
7824  STORE4( 0, xmm2, xmm5 )
7825  STORE2LO( 16, xmm3, xmm6 )
7826  }
7827  return;
7828  }
7829  default: {
7830  for ( int i = 0; i < numColumns; i++ ) {
7831  dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
7832  mPtr++;
7833  }
7834  return;
7835  }
7836  }
7837  break;
7838  case 3:
7839  switch( numColumns ) {
7840  case 6: { // 3x6 * 3x1
7841  __asm {
7842  mov esi, vPtr
7843  mov edi, mPtr
7844  mov eax, dstPtr
7845  movlps xmm0, [esi+0*4]
7846  movss xmm1, [esi+2*4]
7847  movlps xmm3, [edi+(0*6+0)*4]
7848  movhps xmm3, [edi+(0*6+2)*4]
7849  movaps xmm4, xmm0
7850  shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
7851  mulps xmm3, xmm4
7852  movlps xmm5, [edi+(1*6+0)*4]
7853  movhps xmm5, [edi+(1*6+2)*4]
7854  movaps xmm6, xmm0
7855  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7856  mulps xmm5, xmm6
7857  addps xmm3, xmm5
7858  movlps xmm4, [edi+(2*6+0)*4]
7859  movhps xmm4, [edi+(2*6+2)*4]
7860  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
7861  mulps xmm4, xmm1
7862  addps xmm3, xmm4
7863  STORE4( 0, xmm3, xmm7 )
7864  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7865  movlps xmm3, [edi+(0*6+4)*4]
7866  movhps xmm3, [edi+(1*6+4)*4]
7867  mulps xmm3, xmm0
7868  movhlps xmm4, xmm3
7869  addps xmm3, xmm4
7870  movlps xmm5, [edi+(2*6+4)*4]
7871  mulps xmm5, xmm1
7872  addps xmm3, xmm5
7873  STORE2LO( 16, xmm3, xmm7 )
7874  }
7875  return;
7876  }
7877  default: {
7878  for ( int i = 0; i < numColumns; i++ ) {
7879  dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
7880  mPtr++;
7881  }
7882  return;
7883  }
7884  }
7885  break;
7886  case 4:
7887  switch( numColumns ) {
7888  case 6: { // 4x6 * 4x1
7889  __asm {
7890  mov esi, vPtr
7891  mov edi, mPtr
7892  mov eax, dstPtr
7893  movlps xmm0, [esi+0*4]
7894  movlps xmm1, [esi+2*4]
7895  movaps xmm3, xmm0
7896  shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7897  mulps xmm3, [edi+(0*6+0)*4]
7898  movlps xmm5, [edi+(1*6+0)*4]
7899  movhps xmm5, [edi+(1*6+2)*4]
7900  movaps xmm6, xmm0
7901  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7902  mulps xmm5, xmm6
7903  addps xmm3, xmm5
7904  movlps xmm4, [edi+(2*6+0)*4]
7905  movhps xmm4, [edi+(2*6+2)*4]
7906  movaps xmm6, xmm1
7907  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7908  mulps xmm4, xmm6
7909  addps xmm3, xmm4
7910  movlps xmm5, [edi+(3*6+0)*4]
7911  movhps xmm5, [edi+(3*6+2)*4]
7912  movaps xmm6, xmm1
7913  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7914  mulps xmm5, xmm6
7915  addps xmm3, xmm5
7916  STORE4( 0, xmm3, xmm7 )
7917  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7918  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7919  movlps xmm3, [edi+(0*6+4)*4]
7920  movhps xmm3, [edi+(1*6+4)*4]
7921  mulps xmm3, xmm0
7922  movlps xmm4, [edi+(2*6+4)*4]
7923  movhps xmm4, [edi+(3*6+4)*4]
7924  mulps xmm4, xmm1
7925  addps xmm3, xmm4
7926  movhlps xmm4, xmm3
7927  addps xmm3, xmm4
7928  STORE2LO( 16, xmm3, xmm7 )
7929  }
7930  return;
7931  }
7932  default: {
7933  for ( int i = 0; i < numColumns; i++ ) {
7934  dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
7935  *(mPtr+3*numColumns) * vPtr[3];
7936  mPtr++;
7937  }
7938  return;
7939  }
7940  }
7941  break;
7942  case 5:
7943  switch( numColumns ) {
7944  case 6: { // 5x6 * 5x1
7945  __asm {
7946  mov esi, vPtr
7947  mov edi, mPtr
7948  mov eax, dstPtr
7949  movlps xmm0, [esi+0*4]
7950  movlps xmm1, [esi+2*4]
7951  movss xmm2, [esi+4*4]
7952  movaps xmm3, xmm0
7953  shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7954  mulps xmm3, [edi+(0*6+0)*4]
7955  movlps xmm5, [edi+(1*6+0)*4]
7956  movhps xmm5, [edi+(1*6+2)*4]
7957  movaps xmm6, xmm0
7958  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7959  mulps xmm5, xmm6
7960  addps xmm3, xmm5
7961  movaps xmm6, xmm1
7962  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7963  mulps xmm6, [edi+(2*6+0)*4]
7964  addps xmm3, xmm6
7965  movlps xmm5, [edi+(3*6+0)*4]
7966  movhps xmm5, [edi+(3*6+2)*4]
7967  movaps xmm6, xmm1
7968  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7969  mulps xmm5, xmm6
7970  addps xmm3, xmm5
7971  shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
7972  movaps xmm4, xmm2
7973  mulps xmm4, [edi+(4*6+0)*4]
7974  addps xmm3, xmm4
7975  STORE4( 0, xmm3, xmm7 )
7976  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7977  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7978  movlps xmm3, [edi+(0*6+4)*4]
7979  movhps xmm3, [edi+(1*6+4)*4]
7980  mulps xmm3, xmm0
7981  movlps xmm4, [edi+(2*6+4)*4]
7982  movhps xmm4, [edi+(3*6+4)*4]
7983  mulps xmm4, xmm1
7984  addps xmm3, xmm4
7985  movhlps xmm4, xmm3
7986  addps xmm3, xmm4
7987  movlps xmm5, [edi+(4*6+4)*4]
7988  mulps xmm5, xmm2
7989  addps xmm3, xmm5
7990  STORE2LO( 16, xmm3, xmm7 )
7991  }
7992  return;
7993  }
7994  default: {
7995  for ( int i = 0; i < numColumns; i++ ) {
7996  dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
7997  *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
7998  mPtr++;
7999  }
8000  return;
8001  }
8002  }
8003  break;
8004  case 6:
8005  switch( numColumns ) {
8006  case 1: { // 6x1 * 6x1
8007  __asm {
8008  mov esi, vPtr
8009  mov edi, mPtr
8010  mov eax, dstPtr
8011  movlps xmm0, [esi]
8012  movhps xmm0, [esi+8]
8013  movlps xmm1, [esi+16]
8014  mulps xmm0, [edi]
8015  mulps xmm1, [edi+16]
8016  shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
8017  addps xmm0, xmm1
8018  movhlps xmm2, xmm0
8019  addss xmm2, xmm0
8020  shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
8021  addss xmm2, xmm0
8022  STORE1( 0, xmm2, xmm3 )
8023  }
8024  return;
8025  }
8026  case 2: { // 6x2 * 6x1
8027  __asm {
8028  mov esi, vPtr
8029  mov edi, mPtr
8030  mov eax, dstPtr
8031  movlps xmm0, [esi+0*4]
8032  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
8033  movaps xmm6, [edi+0*4]
8034  mulps xmm6, xmm0
8035  movlps xmm1, [esi+2*4]
8036  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
8037  movaps xmm7, [edi+4*4]
8038  mulps xmm7, xmm1
8039  addps xmm6, xmm7
8040  movlps xmm2, [esi+4*4]
8041  shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
8042  movaps xmm7, [edi+8*4]
8043  mulps xmm7, xmm2
8044  addps xmm6, xmm7
8045  movhlps xmm3, xmm6
8046  addps xmm3, xmm6
8047  STORE2LO( 0, xmm3, xmm7 )
8048  }
8049  return;
8050  }
8051  case 3: { // 6x3 * 6x1
8052  __asm {
8053  mov esi, vPtr
8054  mov edi, mPtr
8055  mov eax, dstPtr
8056  movss xmm0, [edi+(0*3+2)*4]
8057  movhps xmm0, [edi+(0*3+0)*4]
8058  shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
8059  movss xmm6, [esi+0*4]
8060  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8061  mulps xmm6, xmm0
8062  movss xmm1, [edi+(1*3+0)*4]
8063  movhps xmm1, [edi+(1*3+1)*4]
8064  movss xmm7, [esi+1*4]
8065  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
8066  mulps xmm7, xmm1
8067  addps xmm6, xmm7
8068  movss xmm2, [edi+(2*3+2)*4]
8069  movhps xmm2, [edi+(2*3+0)*4]
8070  shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
8071  movss xmm7, [esi+2*4]
8072  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
8073  mulps xmm7, xmm2
8074  addps xmm6, xmm7
8075  movss xmm3, [edi+(3*3+0)*4]
8076  movhps xmm3, [edi+(3*3+1)*4]
8077  movss xmm7, [esi+3*4]
8078  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
8079  mulps xmm7, xmm3
8080  addps xmm6, xmm7
8081  movss xmm4, [edi+(4*3+2)*4]
8082  movhps xmm4, [edi+(4*3+0)*4]
8083  shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
8084  movss xmm7, [esi+4*4]
8085  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
8086  mulps xmm7, xmm4
8087  addps xmm6, xmm7
8088  movss xmm5, [edi+(5*3+0)*4]
8089  movhps xmm5, [edi+(5*3+1)*4]
8090  movss xmm7, [esi+5*4]
8091  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
8092  mulps xmm7, xmm5
8093  addps xmm6, xmm7
8094  STORE1( 0, xmm6, xmm7 )
8095  STORE2HI( 4, xmm6, xmm7 )
8096  }
8097  return;
8098  }
8099  case 4: { // 6x4 * 6x1
8100  __asm {
8101  mov esi, vPtr
8102  mov edi, mPtr
8103  mov eax, dstPtr
8104  movlps xmm3, [edi+(0*4+0)*4]
8105  movhps xmm3, [edi+(0*4+2)*4]
8106  movss xmm4, [esi+0*4]
8107  shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
8108  mulps xmm3, xmm4
8109  movlps xmm5, [edi+(1*4+0)*4]
8110  movhps xmm5, [edi+(1*4+2)*4]
8111  movss xmm6, [esi+1*4]
8112  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8113  mulps xmm5, xmm6
8114  addps xmm3, xmm5
8115  movlps xmm4, [edi+(2*4+0)*4]
8116  movhps xmm4, [edi+(2*4+2)*4]
8117  movss xmm6, [esi+2*4]
8118  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8119  mulps xmm4, xmm6
8120  addps xmm3, xmm4
8121  movlps xmm5, [edi+(3*4+0)*4]
8122  movhps xmm5, [edi+(3*4+2)*4]
8123  movss xmm6, [esi+3*4]
8124  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8125  mulps xmm5, xmm6
8126  addps xmm3, xmm5
8127  movlps xmm4, [edi+(4*4+0)*4]
8128  movhps xmm4, [edi+(4*4+2)*4]
8129  movss xmm6, [esi+4*4]
8130  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8131  mulps xmm4, xmm6
8132  addps xmm3, xmm4
8133  movlps xmm5, [edi+(5*4+0)*4]
8134  movhps xmm5, [edi+(5*4+2)*4]
8135  movss xmm6, [esi+5*4]
8136  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8137  mulps xmm5, xmm6
8138  addps xmm3, xmm5
8139  STORE4( 0, xmm3, xmm7 )
8140  }
8141  return;
8142  }
8143  case 5: { // 6x5 * 6x1
8144  __asm {
8145  mov esi, vPtr
8146  mov edi, mPtr
8147  mov eax, dstPtr
8148  movlps xmm6, [edi+(0*5+0)*4]
8149  movhps xmm6, [edi+(0*5+2)*4]
8150  movss xmm0, [esi+0*4]
8151  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
8152  mulps xmm6, xmm0
8153  movlps xmm7, [edi+(1*5+0)*4]
8154  movhps xmm7, [edi+(1*5+2)*4]
8155  movss xmm1, [esi+1*4]
8156  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
8157  mulps xmm7, xmm1
8158  addps xmm6, xmm7
8159  movlps xmm7, [edi+(2*5+0)*4]
8160  movhps xmm7, [edi+(2*5+2)*4]
8161  movss xmm2, [esi+2*4]
8162  shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
8163  mulps xmm7, xmm2
8164  addps xmm6, xmm7
8165  movlps xmm7, [edi+(3*5+0)*4]
8166  movhps xmm7, [edi+(3*5+2)*4]
8167  movss xmm3, [esi+3*4]
8168  shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
8169  mulps xmm7, xmm3
8170  addps xmm6, xmm7
8171  movlps xmm7, [edi+(4*5+0)*4]
8172  movhps xmm7, [edi+(4*5+2)*4]
8173  movss xmm4, [esi+4*4]
8174  shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
8175  mulps xmm7, xmm4
8176  addps xmm6, xmm7
8177  movlps xmm7, [edi+(5*5+0)*4]
8178  movhps xmm7, [edi+(5*5+2)*4]
8179  movss xmm5, [esi+5*4]
8180  shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
8181  mulps xmm7, xmm5
8182  addps xmm6, xmm7
8183  STORE4( 0, xmm6, xmm7 )
8184  movss xmm6, [edi+(0*5+4)*4]
8185  mulss xmm6, xmm0
8186  movss xmm7, [edi+(1*5+4)*4]
8187  mulss xmm7, xmm1
8188  addss xmm6, xmm7
8189  movss xmm7, [edi+(2*5+4)*4]
8190  mulss xmm7, xmm2
8191  addss xmm6, xmm7
8192  movss xmm7, [edi+(3*5+4)*4]
8193  mulss xmm7, xmm3
8194  addss xmm6, xmm7
8195  movss xmm7, [edi+(4*5+4)*4]
8196  mulss xmm7, xmm4
8197  addss xmm6, xmm7
8198  movss xmm7, [edi+(5*5+4)*4]
8199  mulss xmm7, xmm5
8200  addss xmm6, xmm7
8201  STORE1( 16, xmm6, xmm7 )
8202  }
8203  return;
8204  }
8205  case 6: { // 6x6 * 6x1
8206  __asm {
8207  mov esi, vPtr
8208  mov edi, mPtr
8209  mov eax, dstPtr
8210  movlps xmm0, [esi+0*4]
8211  movlps xmm1, [esi+2*4]
8212  movlps xmm2, [esi+4*4]
8213  movaps xmm3, xmm0
8214  shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
8215  mulps xmm3, [edi+(0*6+0)*4]
8216  movlps xmm5, [edi+(1*6+0)*4]
8217  movhps xmm5, [edi+(1*6+2)*4]
8218  movaps xmm6, xmm0
8219  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
8220  mulps xmm5, xmm6
8221  addps xmm3, xmm5
8222  movaps xmm6, xmm1
8223  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8224  mulps xmm6, [edi+(2*6+0)*4]
8225  addps xmm3, xmm6
8226  movaps xmm6, xmm1
8227  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
8228  movlps xmm5, [edi+(3*6+0)*4]
8229  movhps xmm5, [edi+(3*6+2)*4]
8230  mulps xmm5, xmm6
8231  addps xmm3, xmm5
8232  movaps xmm6, xmm2
8233  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8234  mulps xmm6, [edi+(4*6+0)*4]
8235  addps xmm3, xmm6
8236  movaps xmm6, xmm2
8237  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
8238  movlps xmm5, [edi+(5*6+0)*4]
8239  movhps xmm5, [edi+(5*6+2)*4]
8240  mulps xmm5, xmm6
8241  addps xmm3, xmm5
8242  STORE4( 0, xmm3, xmm7 )
8243  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
8244  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
8245  shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
8246  movlps xmm3, [edi+(0*6+4)*4]
8247  movhps xmm3, [edi+(1*6+4)*4]
8248  mulps xmm3, xmm0
8249  movlps xmm4, [edi+(2*6+4)*4]
8250  movhps xmm4, [edi+(3*6+4)*4]
8251  mulps xmm4, xmm1
8252  addps xmm3, xmm4
8253  movlps xmm5, [edi+(4*6+4)*4]
8254  movhps xmm5, [edi+(5*6+4)*4]
8255  mulps xmm5, xmm2
8256  addps xmm3, xmm5
8257  movhlps xmm4, xmm3
8258  addps xmm3, xmm4
8259  STORE2LO( 16, xmm3, xmm7 )
8260  }
8261  return;
8262  }
8263  default: {
8264  for ( int i = 0; i < numColumns; i++ ) {
8265  dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
8266  *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
8267  mPtr++;
8268  }
8269  return;
8270  }
8271  }
8272  break;
8273  default:
8274  int numRows = mat.GetNumRows();
8275  for ( int i = 0; i < numColumns; i++ ) {
8276  mPtr = mat.ToFloatPtr() + i;
8277  float sum = mPtr[0] * vPtr[0];
8278  for ( int j = 1; j < numRows; j++ ) {
8279  mPtr += numColumns;
8280  sum += mPtr[0] * vPtr[j];
8281  }
8282  dstPtr[i] STOREC sum;
8283  }
8284  break;
8285  }
8286 
8287 #undef STOREC
8288 #undef STORE4
8289 #undef STORE2HI
8290 #undef STORE2LO
8291 #undef STORE1
8292 }
8293 
8294 /*
8295 ============
8296 idSIMD_SSE::MatX_MultiplyMatX
8297 
8298  optimizes the following matrix multiplications:
8299 
8300  NxN * Nx6
8301  6xN * Nx6
8302  Nx6 * 6xN
8303  6x6 * 6xN
8304 
8305  with N in the range [1-6].
8306 
 8307  The hot cache clock cycle counts are generally better for the SIMD version than the
 8308  FPU version. At times up to 40% fewer clock cycles on a P3. In practice, however,
 8309  the results are poor, probably due to memory access.
8310 ============
8311 */
8312 void VPCALL idSIMD_SSE::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
8313  int i, j, k, l, n;
8314  float *dstPtr;
8315  const float *m1Ptr, *m2Ptr;
8316  double sum;
8317 
8318  assert( m1.GetNumColumns() == m2.GetNumRows() );
8319 
8320  dstPtr = dst.ToFloatPtr();
8321  m1Ptr = m1.ToFloatPtr();
8322  m2Ptr = m2.ToFloatPtr();
8323  k = m1.GetNumRows();
8324  l = m2.GetNumColumns();
8325  n = m1.GetNumColumns();
8326 
8327  switch( n ) {
8328  case 1: {
8329  if ( !(l^6) ) {
8330  switch( k ) {
8331  case 1: { // 1x1 * 1x6, no precision loss compared to FPU version
8332  __asm {
8333  mov esi, m2Ptr
8334  mov edi, m1Ptr
8335  mov eax, dstPtr
8336  movss xmm0, [edi]
8337  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
8338  movaps xmm1, [esi]
8339  mulps xmm1, xmm0
8340  movaps [eax], xmm1
8341  movlps xmm2, [esi+16]
8342  mulps xmm2, xmm0
8343  movlps [eax+16], xmm2
8344  }
8345  return;
8346  }
8347  case 6: { // 6x1 * 1x6, no precision loss compared to FPU version
8348  __asm {
8349  mov esi, m2Ptr
8350  mov edi, m1Ptr
8351  mov eax, dstPtr
8352  xorps xmm1, xmm1
8353  movaps xmm0, [edi]
8354  movlps xmm1, [edi+16]
8355  movlhps xmm1, xmm0
8356  movhlps xmm2, xmm0
8357  movlhps xmm2, xmm1
8358  // row 0 and 1
8359  movaps xmm3, [esi]
8360  movaps xmm4, xmm3
8361  shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
8362  movaps xmm5, xmm3
8363  shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
8364  movaps xmm6, xmm3
8365  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
8366  mulps xmm4, xmm0
8367  mulps xmm5, xmm1
8368  mulps xmm6, xmm2
8369  movaps [eax], xmm4
8370  movaps [eax+16], xmm5
8371  movaps [eax+32], xmm6
8372  // row 2 and 3
8373  movaps xmm4, xmm3
8374  shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 )
8375  movaps xmm5, xmm3
8376  shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 )
8377  shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
8378  mulps xmm4, xmm0
8379  mulps xmm5, xmm1
8380  mulps xmm3, xmm2
8381  movaps [eax+48], xmm4
8382  movaps [eax+64], xmm5
8383  movaps [eax+80], xmm3
8384  // row 4 and 5
8385  movlps xmm3, [esi+16]
8386  movaps xmm4, xmm3
8387  shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
8388  movaps xmm5, xmm3
8389  shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
8390  shufps xmm3, xmm3, R_SHUFFLEPS( 1, 1, 1, 1 )
8391  mulps xmm4, xmm0
8392  mulps xmm5, xmm1
8393  mulps xmm3, xmm2
8394  movaps [eax+96], xmm4
8395  movaps [eax+112], xmm5
8396  movaps [eax+128], xmm3
8397  }
8398  return;
8399  }
8400  }
8401  }
8402  for ( i = 0; i < k; i++ ) {
8403  m2Ptr = m2.ToFloatPtr();
8404  for ( j = 0; j < l; j++ ) {
8405  *dstPtr++ = m1Ptr[0] * m2Ptr[0];
8406  m2Ptr++;
8407  }
8408  m1Ptr++;
8409  }
8410  break;
8411  }
8412  case 2: {
8413  if ( !(l^6) ) {
8414  switch( k ) {
8415  case 2: { // 2x2 * 2x6
8416 
8417  #define MUL_Nx2_2x6_INIT \
8418  __asm mov esi, m2Ptr \
8419  __asm mov edi, m1Ptr \
8420  __asm mov eax, dstPtr \
8421  __asm movaps xmm0, [esi] \
8422  __asm movlps xmm1, [esi+16] \
8423  __asm movhps xmm1, [esi+40] \
8424  __asm movlps xmm2, [esi+24] \
8425  __asm movhps xmm2, [esi+32]
8426 
8427  #define MUL_Nx2_2x6_ROW2( row ) \
8428  __asm movaps xmm3, [edi+row*16] \
8429  __asm movaps xmm5, xmm0 \
8430  __asm movaps xmm4, xmm3 \
8431  __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8432  __asm mulps xmm5, xmm4 \
8433  __asm movaps xmm4, xmm3 \
8434  __asm movaps xmm6, xmm2 \
8435  __asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 1, 1 ) \
8436  __asm mulps xmm6, xmm4 \
8437  __asm addps xmm5, xmm6 \
8438  __asm movaps [eax+row*48], xmm5 \
8439  __asm movaps xmm4, xmm3 \
8440  __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \
8441  __asm movaps xmm7, xmm1 \
8442  __asm mulps xmm7, xmm4 \
8443  __asm movaps xmm4, xmm3 \
8444  __asm movaps xmm5, xmm0 \
8445  __asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 ) \
8446  __asm mulps xmm5, xmm4 \
8447  __asm movaps xmm4, xmm3 \
8448  __asm movaps xmm6, xmm2 \
8449  __asm shufps xmm4, xmm4, R_SHUFFLEPS( 3, 3, 3, 3 ) \
8450  __asm mulps xmm6, xmm4 \
8451  __asm addps xmm5, xmm6 \
8452  __asm shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 ) \
8453  __asm movaps xmm6, xmm1 \
8454  __asm mulps xmm6, xmm3 \
8455  __asm movaps xmm4, xmm7 \
8456  __asm movlhps xmm7, xmm6 \
8457  __asm movhlps xmm6, xmm4 \
8458  __asm addps xmm6, xmm7 \
8459  __asm movlps [eax+row*48+16], xmm6 \
8460  __asm movlps [eax+row*48+24], xmm5 \
8461  __asm movhps [eax+row*48+32], xmm5 \
8462  __asm movhps [eax+row*48+40], xmm6
8463 
8464  MUL_Nx2_2x6_INIT
8465  MUL_Nx2_2x6_ROW2( 0 )
8466 
8467  return;
8468  }
8469  case 6: { // 6x2 * 2x6
8470 
8471  MUL_Nx2_2x6_INIT
8472  MUL_Nx2_2x6_ROW2( 0 )
8473  MUL_Nx2_2x6_ROW2( 1 )
8474  MUL_Nx2_2x6_ROW2( 2 )
8475 
8476  return;
8477  }
8478  }
8479  }
8480  for ( i = 0; i < k; i++ ) {
8481  m2Ptr = m2.ToFloatPtr();
8482  for ( j = 0; j < l; j++ ) {
8483  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l];
8484  m2Ptr++;
8485  }
8486  m1Ptr += 2;
8487  }
8488  break;
8489  }
8490  case 3: {
8491  if ( !(l^6) ) {
8492  switch( k ) {
8493  case 3: { // 3x3 * 3x6
8494  __asm {
8495  mov esi, m2Ptr
8496  mov edi, m1Ptr
8497  mov eax, dstPtr
8498  movaps xmm5, xmmword ptr [esi]
8499  movlps xmm6, qword ptr [esi+24]
8500  movhps xmm6, qword ptr [esi+32]
8501  movaps xmm7, xmmword ptr [esi+48]
8502  movss xmm0, dword ptr [edi]
8503  shufps xmm0, xmm0, 0
8504  mulps xmm0, xmm5
8505  movss xmm1, dword ptr [edi+4]
8506  shufps xmm1, xmm1, 0
8507  mulps xmm1, xmm6
8508  movss xmm2, dword ptr [edi+8]
8509  shufps xmm2, xmm2, 0
8510  mulps xmm2, xmm7
8511  addps xmm0, xmm1
8512  addps xmm0, xmm2
8513  movaps xmmword ptr [eax], xmm0
8514  movss xmm3, dword ptr [edi+12]
8515  shufps xmm3, xmm3, 0
8516  mulps xmm3, xmm5
8517  movss xmm4, dword ptr [edi+16]
8518  shufps xmm4, xmm4, 0
8519  mulps xmm4, xmm6
8520  movss xmm0, dword ptr [edi+20]
8521  shufps xmm0, xmm0, 0
8522  mulps xmm0, xmm7
8523  addps xmm3, xmm4
8524  addps xmm0, xmm3
8525  movlps qword ptr [eax+24], xmm0
8526  movhps qword ptr [eax+32], xmm0
8527  movss xmm1, dword ptr [edi+24]
8528  shufps xmm1, xmm1, 0
8529  mulps xmm1, xmm5
8530  movss xmm2, dword ptr [edi+28]
8531  shufps xmm2, xmm2, 0
8532  mulps xmm2, xmm6
8533  movss xmm3, dword ptr [edi+32]
8534  shufps xmm3, xmm3, 0
8535  mulps xmm3, xmm7
8536  addps xmm1, xmm2
8537  addps xmm1, xmm3
8538  movaps xmmword ptr [eax+48], xmm1
8539  movlps xmm5, qword ptr [esi+16]
8540  movlps xmm6, qword ptr [esi+40]
8541  movlps xmm7, qword ptr [esi+64]
8542  shufps xmm5, xmm5, 0x44
8543  shufps xmm6, xmm6, 0x44
8544  shufps xmm7, xmm7, 0x44
8545  movaps xmm3, xmmword ptr [edi]
8546  movlps xmm4, qword ptr [edi+16]
8547  movaps xmm0, xmm3
8548  shufps xmm0, xmm0, 0xF0
8549  mulps xmm0, xmm5
8550  movaps xmm1, xmm3
8551  shufps xmm1, xmm4, 0x05
8552  mulps xmm1, xmm6
8553  shufps xmm3, xmm4, 0x5A
8554  mulps xmm3, xmm7
8555  addps xmm1, xmm0
8556  addps xmm1, xmm3
8557  movlps qword ptr [eax+16], xmm1
8558  movhps qword ptr [eax+40], xmm1
8559  movss xmm0, dword ptr [edi+24]
8560  shufps xmm0, xmm0, 0
8561  mulps xmm0, xmm5
8562  movss xmm2, dword ptr [edi+28]
8563  shufps xmm2, xmm2, 0
8564  mulps xmm2, xmm6
8565  movss xmm4, dword ptr [edi+32]
8566  shufps xmm4, xmm4, 0
8567  mulps xmm4, xmm7
8568  addps xmm0, xmm2
8569  addps xmm0, xmm4
8570  movlps qword ptr [eax+64], xmm0
8571  }
8572  return;
8573  }
8574  case 6: { // 6x3 * 3x6
8575  #define MUL_Nx3_3x6_FIRST4COLUMNS_INIT \
8576  __asm mov esi, m2Ptr \
8577  __asm mov edi, m1Ptr \
8578  __asm mov eax, dstPtr \
8579  __asm movlps xmm0, [esi+ 0*4] \
8580  __asm movhps xmm0, [esi+ 2*4] \
8581  __asm movlps xmm1, [esi+ 6*4] \
8582  __asm movhps xmm1, [esi+ 8*4] \
8583  __asm movlps xmm2, [esi+12*4] \
8584  __asm movhps xmm2, [esi+14*4]
8585 
8586  #define MUL_Nx3_3x6_FIRST4COLUMNS_ROW( row ) \
8587  __asm movss xmm3, [edi+(row*3+0)*4] \
8588  __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8589  __asm mulps xmm3, xmm0 \
8590  __asm movss xmm4, [edi+(row*3+1)*4] \
8591  __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8592  __asm mulps xmm4, xmm1 \
8593  __asm addps xmm3, xmm4 \
8594  __asm movss xmm5, [edi+(row*3+2)*4] \
8595  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8596  __asm mulps xmm5, xmm2 \
8597  __asm addps xmm3, xmm5 \
8598  __asm movlps [eax+(row*6+0)*4], xmm3 \
8599  __asm movhps [eax+(row*6+2)*4], xmm3
8600 
8601  #define MUL_Nx3_3x6_LAST2COLUMNS_ROW6 \
8602  __asm movlps xmm0, [esi+ 4*4] \
8603  __asm movlps xmm1, [esi+10*4] \
8604  __asm movlps xmm2, [esi+16*4] \
8605  __asm shufps xmm0, xmm0, 0x44 \
8606  __asm shufps xmm1, xmm1, 0x44 \
8607  __asm shufps xmm2, xmm2, 0x44 \
8608  __asm movlps xmm3, [edi+0*4] \
8609  __asm movhps xmm3, [edi+2*4] \
8610  __asm movaps xmm4, xmm3 \
8611  __asm movaps xmm5, xmm3 \
8612  __asm shufps xmm3, xmm3, 0xF0 \
8613  __asm mulps xmm3, xmm0 \
8614  __asm movlps xmm6, [edi+4*4] \
8615  __asm movhps xmm6, [edi+6*4] \
8616  __asm shufps xmm4, xmm6, 0x05 \
8617  __asm mulps xmm4, xmm1 \
8618  __asm addps xmm3, xmm4 \
8619  __asm shufps xmm5, xmm6, 0x5A \
8620  __asm mulps xmm5, xmm2 \
8621  __asm addps xmm3, xmm5 \
8622  __asm movlps [eax+4*4], xmm3 \
8623  __asm movhps [eax+10*4], xmm3 \
8624  __asm movaps xmm5, xmm6 \
8625  __asm movlps xmm3, [edi+8*4] \
8626  __asm movhps xmm3, [edi+10*4] \
8627  __asm movaps xmm4, xmm3 \
8628  __asm shufps xmm5, xmm3, 0x5A \
8629  __asm mulps xmm5, xmm0 \
8630  __asm shufps xmm6, xmm3, 0xAF \
8631  __asm mulps xmm6, xmm1 \
8632  __asm addps xmm5, xmm6 \
8633  __asm shufps xmm4, xmm4, 0xF0 \
8634  __asm mulps xmm4, xmm2 \
8635  __asm addps xmm4, xmm5 \
8636  __asm movlps [eax+16*4], xmm4 \
8637  __asm movhps [eax+22*4], xmm4 \
8638  __asm movlps xmm6, [edi+12*4] \
8639  __asm movhps xmm6, [edi+14*4] \
8640  __asm movaps xmm5, xmm6 \
8641  __asm movaps xmm4, xmm6 \
8642  __asm shufps xmm6, xmm6, 0xF0 \
8643  __asm mulps xmm6, xmm0 \
8644  __asm movlps xmm3, [edi+16*4] \
8645  __asm shufps xmm5, xmm3, 0x05 \
8646  __asm mulps xmm5, xmm1 \
8647  __asm addps xmm5, xmm6 \
8648  __asm shufps xmm4, xmm3, 0x5A \
8649  __asm mulps xmm4, xmm2 \
8650  __asm addps xmm4, xmm5 \
8651  __asm movlps [eax+28*4], xmm4 \
8652  __asm movhps [eax+34*4], xmm4
8653 
8654  MUL_Nx3_3x6_FIRST4COLUMNS_INIT
8655  MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 0 )
8656  MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 1 )
8657  MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 2 )
8658  MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 3 )
8659  MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 4 )
8660  MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 5 )
8661  MUL_Nx3_3x6_LAST2COLUMNS_ROW6
8662 
8663  return;
8664  }
8665  }
8666  }
8667  for ( i = 0; i < k; i++ ) {
8668  m2Ptr = m2.ToFloatPtr();
8669  for ( j = 0; j < l; j++ ) {
8670  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l];
8671  m2Ptr++;
8672  }
8673  m1Ptr += 3;
8674  }
8675  break;
8676  }
8677  case 4: {
8678  if ( !(l^6) ) {
8679  switch( k ) {
8680  case 4: { // 4x4 * 4x6
8681 
8682  #define MUL_Nx4_4x6_FIRST4COLUMNS_INIT \
8683  __asm mov esi, m2Ptr \
8684  __asm mov edi, m1Ptr \
8685  __asm mov eax, dstPtr \
8686  __asm movlps xmm0, [esi+ 0*4] \
8687  __asm movhps xmm0, [esi+ 2*4] \
8688  __asm movlps xmm1, [esi+ 6*4] \
8689  __asm movhps xmm1, [esi+ 8*4] \
8690  __asm movlps xmm2, [esi+12*4] \
8691  __asm movhps xmm2, [esi+14*4] \
8692  __asm movlps xmm3, [esi+18*4] \
8693  __asm movhps xmm3, [esi+20*4]
8694 
8695  #define MUL_Nx4_4x6_FIRST4COLUMNS_ROW( row ) \
8696  __asm movss xmm4, [edi+row*16+0*4] \
8697  __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8698  __asm mulps xmm4, xmm0 \
8699  __asm movss xmm5, [edi+row*16+1*4] \
8700  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8701  __asm mulps xmm5, xmm1 \
8702  __asm addps xmm4, xmm5 \
8703  __asm movss xmm6, [edi+row*16+2*4] \
8704  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8705  __asm mulps xmm6, xmm2 \
8706  __asm addps xmm4, xmm6 \
8707  __asm movss xmm7, [edi+row*16+3*4] \
8708  __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8709  __asm mulps xmm7, xmm3 \
8710  __asm addps xmm4, xmm7 \
8711  __asm movlps [eax+row*24+0], xmm4 \
8712  __asm movhps [eax+row*24+8], xmm4
8713 
8714  #define MUL_Nx4_4x6_LAST2COLUMNS_INIT \
8715  __asm movlps xmm0, [esi+ 4*4] \
8716  __asm movlps xmm1, [esi+10*4] \
8717  __asm movlps xmm2, [esi+16*4] \
8718  __asm movlps xmm3, [esi+22*4] \
8719  __asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
8720  __asm shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
8721  __asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
8722  __asm shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
8723 
8724  #define MUL_Nx4_4x6_LAST2COLUMNS_ROW2( row ) \
8725  __asm movlps xmm7, [edi+row*32+ 0*4] \
8726  __asm movhps xmm7, [edi+row*32+ 4*4] \
8727  __asm movaps xmm6, xmm7 \
8728  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 3, 3 ) \
8729  __asm mulps xmm6, xmm0 \
8730  __asm shufps xmm7, xmm7, R_SHUFFLEPS( 1, 1, 2, 2 ) \
8731  __asm mulps xmm7, xmm1 \
8732  __asm addps xmm6, xmm7 \
8733  __asm movlps xmm4, [edi+row*32+ 2*4] \
8734  __asm movhps xmm4, [edi+row*32+ 6*4] \
8735  __asm movaps xmm5, xmm4 \
8736  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 3, 3 ) \
8737  __asm mulps xmm5, xmm2 \
8738  __asm addps xmm6, xmm5 \
8739  __asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 2, 2 ) \
8740  __asm mulps xmm4, xmm3 \
8741  __asm addps xmm6, xmm4 \
8742  __asm movlps [eax+row*48+ 4*4], xmm6 \
8743  __asm movhps [eax+row*48+10*4], xmm6
8744 
8745  MUL_Nx4_4x6_FIRST4COLUMNS_INIT
8746  MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
8747  MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
8748  MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
8749  MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
8750  MUL_Nx4_4x6_LAST2COLUMNS_INIT
8751  MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
8752  MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
8753 
8754  return;
8755  }
8756  case 6: { // 6x4 * 4x6
8757 
8758  MUL_Nx4_4x6_FIRST4COLUMNS_INIT
8759  MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
8760  MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
8761  MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
8762  MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
8763  MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 4 )
8764  MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 5 )
8765  MUL_Nx4_4x6_LAST2COLUMNS_INIT
8766  MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
8767  MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
8768  MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 2 )
8769 
8770  return;
8771  }
8772  }
8773  }
8774  for ( i = 0; i < k; i++ ) {
8775  m2Ptr = m2.ToFloatPtr();
8776  for ( j = 0; j < l; j++ ) {
8777  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
8778  m1Ptr[3] * m2Ptr[3*l];
8779  m2Ptr++;
8780  }
8781  m1Ptr += 4;
8782  }
8783  break;
8784  }
8785  case 5: {
8786  if ( !(l^6) ) {
8787  switch( k ) {
8788  case 5: { // 5x5 * 5x6
8789 
8790  #define MUL_Nx5_5x6_FIRST4COLUMNS_INIT \
8791  __asm mov esi, m2Ptr \
8792  __asm mov edi, m1Ptr \
8793  __asm mov eax, dstPtr \
8794  __asm movlps xmm0, [esi+ 0*4] \
8795  __asm movhps xmm0, [esi+ 2*4] \
8796  __asm movlps xmm1, [esi+ 6*4] \
8797  __asm movhps xmm1, [esi+ 8*4] \
8798  __asm movlps xmm2, [esi+12*4] \
8799  __asm movhps xmm2, [esi+14*4] \
8800  __asm movlps xmm3, [esi+18*4] \
8801  __asm movhps xmm3, [esi+20*4] \
8802  __asm movlps xmm4, [esi+24*4] \
8803  __asm movhps xmm4, [esi+26*4]
8804 
8805  #define MUL_Nx5_5x6_FIRST4COLUMNS_ROW( row ) \
8806  __asm movss xmm6, [edi+row*20+0*4] \
8807  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8808  __asm mulps xmm6, xmm0 \
8809  __asm movss xmm5, [edi+row*20+1*4] \
8810  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8811  __asm mulps xmm5, xmm1 \
8812  __asm addps xmm6, xmm5 \
8813  __asm movss xmm5, [edi+row*20+2*4] \
8814  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8815  __asm mulps xmm5, xmm2 \
8816  __asm addps xmm6, xmm5 \
8817  __asm movss xmm5, [edi+row*20+3*4] \
8818  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8819  __asm mulps xmm5, xmm3 \
8820  __asm addps xmm6, xmm5 \
8821  __asm movss xmm5, [edi+row*20+4*4] \
8822  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8823  __asm mulps xmm5, xmm4 \
8824  __asm addps xmm6, xmm5 \
8825  __asm movlps [eax+row*24+0], xmm6 \
8826  __asm movhps [eax+row*24+8], xmm6
8827 
8828  #define MUL_Nx5_5x6_LAST2COLUMNS_INIT \
8829  __asm movlps xmm0, [esi+ 4*4] \
8830  __asm movlps xmm1, [esi+10*4] \
8831  __asm movlps xmm2, [esi+16*4] \
8832  __asm movlps xmm3, [esi+22*4] \
8833  __asm movlps xmm4, [esi+28*4] \
8834  __asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
8835  __asm shufps xmm1, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \
8836  __asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
8837  __asm shufps xmm3, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \
8838  __asm shufps xmm4, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )
8839 
8840  #define MUL_Nx5_5x6_LAST2COLUMNS_ROW2( row ) \
8841  __asm movlps xmm7, [edi+row*40+ 0*4] \
8842  __asm movhps xmm7, [edi+row*40+ 6*4] \
8843  __asm movaps xmm6, xmm7 \
8844  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 2, 2 ) \
8845  __asm mulps xmm6, xmm0 \
8846  __asm movaps xmm5, xmm7 \
8847  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \
8848  __asm mulps xmm5, xmm1 \
8849  __asm addps xmm6, xmm5 \
8850  __asm movlps xmm7, [edi+row*40+ 2*4] \
8851  __asm movhps xmm7, [edi+row*40+ 8*4] \
8852  __asm movaps xmm5, xmm7 \
8853  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 2, 2 ) \
8854  __asm mulps xmm5, xmm2 \
8855  __asm addps xmm6, xmm5 \
8856  __asm movaps xmm5, xmm7 \
8857  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \
8858  __asm mulps xmm5, xmm3 \
8859  __asm addps xmm6, xmm5 \
8860  __asm movlps xmm5, [edi+row*40+ 4*4] \
8861  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \
8862  __asm mulps xmm5, xmm4 \
8863  __asm addps xmm6, xmm5 \
8864  __asm movlps [eax+row*48+ 4*4], xmm6 \
8865  __asm movhps [eax+row*48+10*4], xmm6
8866 
8867  #define MUL_Nx5_5x6_LAST2COLUMNS_ROW( row ) \
8868  __asm movlps xmm6, [edi+20*4+0*4] \
8869  __asm unpcklps xmm6, xmm6 \
8870  __asm mulps xmm6, xmm0 \
8871  __asm movlps xmm5, [edi+20*4+2*4] \
8872  __asm unpcklps xmm5, xmm5 \
8873  __asm mulps xmm5, xmm2 \
8874  __asm addps xmm6, xmm5 \
8875  __asm movss xmm5, [edi+20*4+4*4] \
8876  __asm unpcklps xmm5, xmm5 \
8877  __asm mulps xmm5, xmm4 \
8878  __asm addps xmm6, xmm5 \
8879  __asm movhlps xmm7, xmm6 \
8880  __asm addps xmm6, xmm7 \
8881  __asm movlps [eax+row*24+4*4], xmm6
8882 
8883  MUL_Nx5_5x6_FIRST4COLUMNS_INIT
8884  MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
8885  MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
8886  MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
8887  MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
8888  MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
8889  MUL_Nx5_5x6_LAST2COLUMNS_INIT
8890  MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
8891  MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
8892  MUL_Nx5_5x6_LAST2COLUMNS_ROW( 4 )
8893 
8894  return;
8895  }
8896  case 6: { // 6x5 * 5x6
8897 
8898  MUL_Nx5_5x6_FIRST4COLUMNS_INIT
8899  MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
8900  MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
8901  MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
8902  MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
8903  MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
8904  MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 5 )
8905  MUL_Nx5_5x6_LAST2COLUMNS_INIT
8906  MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
8907  MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
8908  MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 2 )
8909 
8910  return;
8911  }
8912  }
8913  }
8914  for ( i = 0; i < k; i++ ) {
8915  m2Ptr = m2.ToFloatPtr();
8916  for ( j = 0; j < l; j++ ) {
8917  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
8918  m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l];
8919  m2Ptr++;
8920  }
8921  m1Ptr += 5;
8922  }
8923  break;
8924  }
8925  case 6: {
8926  switch( k ) {
8927  case 1: {
8928  if ( !(l^1) ) { // 1x6 * 6x1
8929  dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] +
8930  m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5];
8931  return;
8932  }
8933  break;
8934  }
8935  case 2: {
8936  if ( !(l^2) ) { // 2x6 * 6x2
8937 
8938  #define MUL_Nx6_6x2_INIT \
8939  __asm mov esi, m2Ptr \
8940  __asm mov edi, m1Ptr \
8941  __asm mov eax, dstPtr \
8942  __asm movaps xmm0, [esi] \
8943  __asm movaps xmm1, [esi+16] \
8944  __asm movaps xmm2, [esi+32]
8945 
8946  #define MUL_Nx6_6x2_ROW2( row ) \
8947  __asm movaps xmm7, [edi+row*48+0*4] \
8948  __asm movaps xmm6, xmm7 \
8949  __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
8950  __asm mulps xmm7, xmm0 \
8951  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 2, 2, 3, 3 ) \
8952  __asm mulps xmm6, xmm1 \
8953  __asm addps xmm7, xmm6 \
8954  __asm movaps xmm6, [edi+row*48+4*4] \
8955  __asm movaps xmm5, xmm6 \
8956  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
8957  __asm mulps xmm6, xmm2 \
8958  __asm addps xmm7, xmm6 \
8959  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 ) \
8960  __asm mulps xmm5, xmm0 \
8961  __asm movaps xmm6, [edi+row*48+24+2*4] \
8962  __asm movaps xmm4, xmm6 \
8963  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
8964  __asm mulps xmm6, xmm1 \
8965  __asm addps xmm5, xmm6 \
8966  __asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 3, 3 ) \
8967  __asm mulps xmm4, xmm2 \
8968  __asm addps xmm5, xmm4 \
8969  __asm movaps xmm4, xmm5 \
8970  __asm movhlps xmm5, xmm7 \
8971  __asm movlhps xmm7, xmm4 \
8972  __asm addps xmm7, xmm5 \
8973  __asm movaps [eax+row*16], xmm7
8974 
8975  MUL_Nx6_6x2_INIT
8976  MUL_Nx6_6x2_ROW2( 0 )
8977 
8978  return;
8979  }
8980  break;
8981  }
8982  case 3: {
8983  if ( !(l^3) ) { // 3x6 * 6x3
8984 
8985  #define MUL_Nx6_6x3_INIT \
8986  __asm mov esi, m2Ptr \
8987  __asm mov edi, m1Ptr \
8988  __asm mov eax, dstPtr \
8989  __asm movss xmm0, [esi+ 0*4] \
8990  __asm movhps xmm0, [esi+ 1*4] \
8991  __asm movss xmm1, [esi+ 3*4] \
8992  __asm movhps xmm1, [esi+ 4*4] \
8993  __asm movss xmm2, [esi+ 6*4] \
8994  __asm movhps xmm2, [esi+ 7*4] \
8995  __asm movss xmm3, [esi+ 9*4] \
8996  __asm movhps xmm3, [esi+10*4] \
8997  __asm movss xmm4, [esi+12*4] \
8998  __asm movhps xmm4, [esi+13*4] \
8999  __asm movss xmm5, [esi+15*4] \
9000  __asm movhps xmm5, [esi+16*4]
9001 
9002  #define MUL_Nx6_6x3_ROW( row ) \
9003  __asm movss xmm7, [edi+row*24+0] \
9004  __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9005  __asm mulps xmm7, xmm0 \
9006  __asm movss xmm6, [edi+row*24+4] \
9007  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9008  __asm mulps xmm6, xmm1 \
9009  __asm addps xmm7, xmm6 \
9010  __asm movss xmm6, [edi+row*24+8] \
9011  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9012  __asm mulps xmm6, xmm2 \
9013  __asm addps xmm7, xmm6 \
9014  __asm movss xmm6, [edi+row*24+12] \
9015  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9016  __asm mulps xmm6, xmm3 \
9017  __asm addps xmm7, xmm6 \
9018  __asm movss xmm6, [edi+row*24+16] \
9019  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9020  __asm mulps xmm6, xmm4 \
9021  __asm addps xmm7, xmm6 \
9022  __asm movss xmm6, [edi+row*24+20] \
9023  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9024  __asm mulps xmm6, xmm5 \
9025  __asm addps xmm7, xmm6 \
9026  __asm movss [eax+row*12+0], xmm7 \
9027  __asm movhps [eax+row*12+4], xmm7
9028 
9029  MUL_Nx6_6x3_INIT
9030  MUL_Nx6_6x3_ROW( 0 )
9031  MUL_Nx6_6x3_ROW( 1 )
9032  MUL_Nx6_6x3_ROW( 2 )
9033 
9034  return;
9035  }
9036  break;
9037  }
9038  case 4: {
9039  if ( !(l^4) ) { // 4x6 * 6x4
9040 
9041  #define MUL_Nx6_6x4_INIT \
9042  __asm mov esi, m2Ptr \
9043  __asm mov edi, m1Ptr \
9044  __asm mov eax, dstPtr \
9045  __asm movaps xmm0, [esi] \
9046  __asm movaps xmm1, [esi+16] \
9047  __asm movaps xmm2, [esi+32] \
9048  __asm movaps xmm3, [esi+48] \
9049  __asm movaps xmm4, [esi+64] \
9050  __asm movaps xmm5, [esi+80]
9051 
9052  #define MUL_Nx6_6x4_ROW( row ) \
9053  __asm movss xmm7, [edi+row*24+0] \
9054  __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9055  __asm mulps xmm7, xmm0 \
9056  __asm movss xmm6, [edi+row*24+4] \
9057  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9058  __asm mulps xmm6, xmm1 \
9059  __asm addps xmm7, xmm6 \
9060  __asm movss xmm6, [edi+row*24+8] \
9061  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9062  __asm mulps xmm6, xmm2 \
9063  __asm addps xmm7, xmm6 \
9064  __asm movss xmm6, [edi+row*24+12] \
9065  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9066  __asm mulps xmm6, xmm3 \
9067  __asm addps xmm7, xmm6 \
9068  __asm movss xmm6, [edi+row*24+16] \
9069  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9070  __asm mulps xmm6, xmm4 \
9071  __asm addps xmm7, xmm6 \
9072  __asm movss xmm6, [edi+row*24+20] \
9073  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9074  __asm mulps xmm6, xmm5 \
9075  __asm addps xmm7, xmm6 \
9076  __asm movaps [eax+row*16], xmm7
9077 
9078  MUL_Nx6_6x4_INIT
9079  MUL_Nx6_6x4_ROW( 0 )
9080  MUL_Nx6_6x4_ROW( 1 )
9081  MUL_Nx6_6x4_ROW( 2 )
9082  MUL_Nx6_6x4_ROW( 3 )
9083 
9084  return;
9085  }
9086  break;
9087  }
9088  case 5: {
9089  if ( !(l^5) ) { // 5x6 * 6x5
9090 
9091  #define MUL_Nx6_6x5_INIT \
9092  __asm mov esi, m2Ptr \
9093  __asm mov edi, m1Ptr \
9094  __asm mov eax, dstPtr \
9095  __asm movaps xmm0, [esi] \
9096  __asm movlps xmm1, [esi+20] \
9097  __asm movhps xmm1, [esi+28] \
9098  __asm movlps xmm2, [esi+40] \
9099  __asm movhps xmm2, [esi+48] \
9100  __asm movlps xmm3, [esi+60] \
9101  __asm movhps xmm3, [esi+68] \
9102  __asm movaps xmm4, [esi+80] \
9103  __asm movlps xmm5, [esi+100] \
9104  __asm movhps xmm5, [esi+108]
9105 
9106  #define MUL_Nx6_6x5_ROW( row ) \
9107  __asm movss xmm7, [edi+row*24+0] \
9108  __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9109  __asm mulps xmm7, xmm0 \
9110  __asm fld dword ptr [edi+(row*6+0)*4] \
9111  __asm fmul dword ptr [esi+(4+0*5)*4] \
9112  __asm movss xmm6, [edi+row*24+4] \
9113  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9114  __asm mulps xmm6, xmm1 \
9115  __asm addps xmm7, xmm6 \
9116  __asm fld dword ptr [edi+(row*6+1)*4] \
9117  __asm fmul dword ptr [esi+(4+1*5)*4] \
9118  __asm faddp st(1),st \
9119  __asm movss xmm6, [edi+row*24+8] \
9120  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9121  __asm mulps xmm6, xmm2 \
9122  __asm addps xmm7, xmm6 \
9123  __asm fld dword ptr [edi+(row*6+2)*4] \
9124  __asm fmul dword ptr [esi+(4+2*5)*4] \
9125  __asm faddp st(1),st \
9126  __asm movss xmm6, [edi+row*24+12] \
9127  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9128  __asm mulps xmm6, xmm3 \
9129  __asm addps xmm7, xmm6 \
9130  __asm fld dword ptr [edi+(row*6+3)*4] \
9131  __asm fmul dword ptr [esi+(4+3*5)*4] \
9132  __asm faddp st(1),st \
9133  __asm movss xmm6, [edi+row*24+16] \
9134  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9135  __asm mulps xmm6, xmm4 \
9136  __asm addps xmm7, xmm6 \
9137  __asm fld dword ptr [edi+(row*6+4)*4] \
9138  __asm fmul dword ptr [esi+(4+4*5)*4] \
9139  __asm faddp st(1),st \
9140  __asm movss xmm6, [edi+row*24+20] \
9141  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9142  __asm mulps xmm6, xmm5 \
9143  __asm addps xmm7, xmm6 \
9144  __asm fld dword ptr [edi+(row*6+5)*4] \
9145  __asm fmul dword ptr [esi+(4+5*5)*4] \
9146  __asm faddp st(1),st \
9147  __asm fstp dword ptr [eax+(row*5+4)*4] \
9148  __asm movlps [eax+row*20], xmm7 \
9149  __asm movhps [eax+row*20+8], xmm7
9150 
9151  MUL_Nx6_6x5_INIT
9152  MUL_Nx6_6x5_ROW( 0 )
9153  MUL_Nx6_6x5_ROW( 1 )
9154  MUL_Nx6_6x5_ROW( 2 )
9155  MUL_Nx6_6x5_ROW( 3 )
9156  MUL_Nx6_6x5_ROW( 4 )
9157 
9158  return;
9159  }
9160  break;
9161  }
9162  case 6: {
9163  switch( l ) {
9164  case 1: { // 6x6 * 6x1
9165  __asm {
9166  mov esi, m2Ptr
9167  mov edi, m1Ptr
9168  mov eax, dstPtr
9169  movlps xmm7, qword ptr [esi]
9170  movlps xmm6, qword ptr [esi+8]
9171  shufps xmm7, xmm7, 0x44
9172  shufps xmm6, xmm6, 0x44
9173  movlps xmm0, qword ptr [edi ]
9174  movhps xmm0, qword ptr [edi+ 24]
9175  mulps xmm0, xmm7
9176  movlps xmm3, qword ptr [edi+ 8]
9177  movhps xmm3, qword ptr [edi+ 32]
9178  mulps xmm3, xmm6
9179  movlps xmm1, qword ptr [edi+ 48]
9180  movhps xmm1, qword ptr [edi+ 72]
9181  mulps xmm1, xmm7
9182  movlps xmm2, qword ptr [edi+ 96]
9183  movhps xmm2, qword ptr [edi+120]
9184  mulps xmm2, xmm7
9185  movlps xmm4, qword ptr [edi+ 56]
9186  movhps xmm4, qword ptr [edi+ 80]
9187  movlps xmm5, qword ptr [edi+104]
9188  movhps xmm5, qword ptr [edi+128]
9189  mulps xmm4, xmm6
9190  movlps xmm7, qword ptr [esi+16]
9191  addps xmm0, xmm3
9192  shufps xmm7, xmm7, 0x44
9193  mulps xmm5, xmm6
9194  addps xmm1, xmm4
9195  movlps xmm3, qword ptr [edi+ 16]
9196  movhps xmm3, qword ptr [edi+ 40]
9197  addps xmm2, xmm5
9198  movlps xmm4, qword ptr [edi+ 64]
9199  movhps xmm4, qword ptr [edi+ 88]
9200  mulps xmm3, xmm7
9201  movlps xmm5, qword ptr [edi+112]
9202  movhps xmm5, qword ptr [edi+136]
9203  addps xmm0, xmm3
9204  mulps xmm4, xmm7
9205  mulps xmm5, xmm7
9206  addps xmm1, xmm4
9207  addps xmm2, xmm5
9208  movaps xmm6, xmm0
9209  shufps xmm0, xmm1, 0x88
9210  shufps xmm6, xmm1, 0xDD
9211  movaps xmm7, xmm2
9212  shufps xmm7, xmm2, 0x88
9213  shufps xmm2, xmm2, 0xDD
9214  addps xmm0, xmm6
9215  addps xmm2, xmm7
9216  movlps [eax], xmm0
9217  movhps [eax+8], xmm0
9218  movlps [eax+16], xmm2
9219  }
9220  return;
9221  }
9222  case 2: { // 6x6 * 6x2
9223 
9224  MUL_Nx6_6x2_INIT
9225  MUL_Nx6_6x2_ROW2( 0 )
9226  MUL_Nx6_6x2_ROW2( 1 )
9227  MUL_Nx6_6x2_ROW2( 2 )
9228 
9229  return;
9230  }
9231  case 3: { // 6x6 * 6x3
9232 
9233  MUL_Nx6_6x3_INIT
9234  MUL_Nx6_6x3_ROW( 0 )
9235  MUL_Nx6_6x3_ROW( 1 )
9236  MUL_Nx6_6x3_ROW( 2 )
9237  MUL_Nx6_6x3_ROW( 3 )
9238  MUL_Nx6_6x3_ROW( 4 )
9239  MUL_Nx6_6x3_ROW( 5 )
9240 
9241  return;
9242  }
9243  case 4: { // 6x6 * 6x4
9244 
9245  MUL_Nx6_6x4_INIT
9246  MUL_Nx6_6x4_ROW( 0 )
9247  MUL_Nx6_6x4_ROW( 1 )
9248  MUL_Nx6_6x4_ROW( 2 )
9249  MUL_Nx6_6x4_ROW( 3 )
9250  MUL_Nx6_6x4_ROW( 4 )
9251  MUL_Nx6_6x4_ROW( 5 )
9252 
9253  return;
9254  }
9255  case 5: { // 6x6 * 6x5
9256 
9257  MUL_Nx6_6x5_INIT
9258  MUL_Nx6_6x5_ROW( 0 )
9259  MUL_Nx6_6x5_ROW( 1 )
9260  MUL_Nx6_6x5_ROW( 2 )
9261  MUL_Nx6_6x5_ROW( 3 )
9262  MUL_Nx6_6x5_ROW( 4 )
9263  MUL_Nx6_6x5_ROW( 5 )
9264 
9265  return;
9266  }
9267  case 6: { // 6x6 * 6x6
9268  __asm {
9269  mov ecx, dword ptr m2Ptr
9270  movlps xmm3, qword ptr [ecx+72]
9271  mov edx, dword ptr m1Ptr
9272  // Loading first 4 columns (upper 4 rows) of m2Ptr.
9273  movaps xmm0, xmmword ptr [ecx]
9274  movlps xmm1, qword ptr [ecx+24]
9275  movhps xmm1, qword ptr [ecx+32]
9276  movaps xmm2, xmmword ptr [ecx+48]
9277  movhps xmm3, qword ptr [ecx+80]
9278  // Calculating first 4 elements in the first row of the destination matrix.
9279  movss xmm4, dword ptr [edx]
9280  movss xmm5, dword ptr [edx+4]
9281  mov eax, dword ptr dstPtr
9282  shufps xmm4, xmm4, 0
9283  movss xmm6, dword ptr [edx+8]
9284  shufps xmm5, xmm5, 0
9285  movss xmm7, dword ptr [edx+12]
9286  mulps xmm4, xmm0
9287  shufps xmm6, xmm6, 0
9288  shufps xmm7, xmm7, 0
9289  mulps xmm5, xmm1
9290  mulps xmm6, xmm2
9291  addps xmm5, xmm4
9292  mulps xmm7, xmm3
9293  addps xmm6, xmm5
9294  addps xmm7, xmm6
9295  movaps xmmword ptr [eax], xmm7
9296  // Calculating first 4 elements in the second row of the destination matrix.
9297  movss xmm4, dword ptr [edx+24]
9298  shufps xmm4, xmm4, 0
9299  mulps xmm4, xmm0
9300  movss xmm5, dword ptr [edx+28]
9301  shufps xmm5, xmm5, 0
9302  mulps xmm5, xmm1
9303  movss xmm6, dword ptr [edx+32]
9304  shufps xmm6, xmm6, 0
9305  movss xmm7, dword ptr [edx+36]
9306  shufps xmm7, xmm7, 0
9307  mulps xmm6, xmm2
9308  mulps xmm7, xmm3
9309  addps xmm7, xmm6
9310  addps xmm5, xmm4
9311  addps xmm7, xmm5
9312  // Calculating first 4 elements in the third row of the destination matrix.
9313  movss xmm4, dword ptr [edx+48]
9314  movss xmm5, dword ptr [edx+52]
9315  movlps qword ptr [eax+24], xmm7 ; save 2nd
9316  movhps qword ptr [eax+32], xmm7 ; row
9317  movss xmm6, dword ptr [edx+56]
9318  movss xmm7, dword ptr [edx+60]
9319  shufps xmm4, xmm4, 0
9320  shufps xmm5, xmm5, 0
9321  shufps xmm6, xmm6, 0
9322  shufps xmm7, xmm7, 0
9323  mulps xmm4, xmm0
9324  mulps xmm5, xmm1
9325  mulps xmm6, xmm2
9326  mulps xmm7, xmm3
9327  addps xmm5, xmm4
9328  addps xmm7, xmm6
9329  addps xmm7, xmm5
9330  movaps xmmword ptr [eax+48], xmm7
9331  // Calculating first 4 elements in the fourth row of the destination matrix.
9332  movss xmm4, dword ptr [edx+72]
9333  movss xmm5, dword ptr [edx+76]
9334  movss xmm6, dword ptr [edx+80]
9335  movss xmm7, dword ptr [edx+84]
9336  shufps xmm4, xmm4, 0
9337  shufps xmm5, xmm5, 0
9338  shufps xmm6, xmm6, 0
9339  shufps xmm7, xmm7, 0
9340  mulps xmm4, xmm0
9341  mulps xmm5, xmm1
9342  mulps xmm6, xmm2
9343  mulps xmm7, xmm3
9344  addps xmm4, xmm5
9345  addps xmm6, xmm4
9346  addps xmm7, xmm6
9347  movlps qword ptr [eax+72], xmm7
9348  movhps qword ptr [eax+80], xmm7
9349  // Calculating first 4 elements in the fifth row of the destination matrix.
9350  movss xmm4, dword ptr [edx+96]
9351  movss xmm5, dword ptr [edx+100]
9352  movss xmm6, dword ptr [edx+104]
9353  movss xmm7, dword ptr [edx+108]
9354  shufps xmm4, xmm4, 0
9355  shufps xmm5, xmm5, 0
9356  shufps xmm6, xmm6, 0
9357  shufps xmm7, xmm7, 0
9358  mulps xmm4, xmm0
9359  mulps xmm5, xmm1
9360  mulps xmm6, xmm2
9361  mulps xmm7, xmm3
9362  addps xmm5, xmm4
9363  addps xmm7, xmm6
9364  addps xmm7, xmm5
9365  movaps xmmword ptr [eax+96], xmm7
9366  // Calculating first 4 elements in the sixth row of the destination matrix.
9367  movss xmm4, dword ptr [edx+120]
9368  movss xmm5, dword ptr [edx+124]
9369  movss xmm6, dword ptr [edx+128]
9370  movss xmm7, dword ptr [edx+132]
9371  shufps xmm4, xmm4, 0
9372  shufps xmm5, xmm5, 0
9373  shufps xmm6, xmm6, 0
9374  shufps xmm7, xmm7, 0
9375  mulps xmm4, xmm0
9376  mulps xmm5, xmm1
9377  mulps xmm6, xmm2
9378  mulps xmm7, xmm3
9379  addps xmm4, xmm5
9380  addps xmm6, xmm4
9381  addps xmm7, xmm6
9382  movhps qword ptr [eax+128], xmm7
9383  movlps qword ptr [eax+120], xmm7
9384  // Loading first 4 columns (lower 2 rows) of m2Ptr.
9385  movlps xmm0, qword ptr [ecx+96]
9386  movhps xmm0, qword ptr [ecx+104]
9387  movlps xmm1, qword ptr [ecx+120]
9388  movhps xmm1, qword ptr [ecx+128]
9389  // Calculating first 4 elements in the first row of the destination matrix.
9390  movss xmm2, dword ptr [edx+16]
9391  shufps xmm2, xmm2, 0
9392  movss xmm4, dword ptr [edx+40]
9393  movss xmm3, dword ptr [edx+20]
9394  movss xmm5, dword ptr [edx+44]
9395  movaps xmm6, xmmword ptr [eax]
9396  movlps xmm7, qword ptr [eax+24]
9397  shufps xmm3, xmm3, 0
9398  shufps xmm5, xmm5, 0
9399  movhps xmm7, qword ptr [eax+32]
9400  shufps xmm4, xmm4, 0
9401  mulps xmm5, xmm1
9402  mulps xmm2, xmm0
9403  mulps xmm3, xmm1
9404  mulps xmm4, xmm0
9405  addps xmm6, xmm2
9406  addps xmm7, xmm4
9407  addps xmm7, xmm5
9408  addps xmm6, xmm3
9409  movlps qword ptr [eax+24], xmm7
9410  movaps xmmword ptr [eax], xmm6
9411  movhps qword ptr [eax+32], xmm7
9412  // Calculating first 4 elements in the third row of the destination matrix.
9413  movss xmm2, dword ptr [edx+64]
9414  movss xmm4, dword ptr [edx+88]
9415  movss xmm5, dword ptr [edx+92]
9416  movss xmm3, dword ptr [edx+68]
9417  movaps xmm6, xmmword ptr [eax+48]
9418  movlps xmm7, qword ptr [eax+72]
9419  movhps xmm7, qword ptr [eax+80]
9420  shufps xmm2, xmm2, 0
9421  shufps xmm4, xmm4, 0
9422  shufps xmm5, xmm5, 0
9423  shufps xmm3, xmm3, 0
9424  mulps xmm2, xmm0
9425  mulps xmm4, xmm0
9426  mulps xmm5, xmm1
9427  mulps xmm3, xmm1
9428  addps xmm6, xmm2
9429  addps xmm6, xmm3
9430  addps xmm7, xmm4
9431  addps xmm7, xmm5
9432  movlps qword ptr [eax+72], xmm7
9433  movaps xmmword ptr [eax+48], xmm6
9434  movhps qword ptr [eax+80], xmm7
9435  // Calculating first 4 elements in the fifth row of the destination matrix.
9436  movss xmm2, dword ptr [edx+112]
9437  movss xmm3, dword ptr [edx+116]
9438  movaps xmm6, xmmword ptr [eax+96]
9439  shufps xmm2, xmm2, 0
9440  shufps xmm3, xmm3, 0
9441  mulps xmm2, xmm0
9442  mulps xmm3, xmm1
9443  addps xmm6, xmm2
9444  addps xmm6, xmm3
9445  movaps xmmword ptr [eax+96], xmm6
9446  // Calculating first 4 elements in the sixth row of the destination matrix.
9447  movss xmm4, dword ptr [edx+136]
9448  movss xmm5, dword ptr [edx+140]
9449  movhps xmm7, qword ptr [eax+128]
9450  movlps xmm7, qword ptr [eax+120]
9451  shufps xmm4, xmm4, 0
9452  shufps xmm5, xmm5, 0
9453  mulps xmm4, xmm0
9454  mulps xmm5, xmm1
9455  addps xmm7, xmm4
9456  addps xmm7, xmm5
9457  // Calculating last 2 columns of the destination matrix.
9458  movlps xmm0, qword ptr [ecx+16]
9459  movhps xmm0, qword ptr [ecx+40]
9460  movhps qword ptr [eax+128], xmm7
9461  movlps qword ptr [eax+120], xmm7
9462  movlps xmm2, qword ptr [ecx+64]
9463  movhps xmm2, qword ptr [ecx+88]
9464  movaps xmm3, xmm2
9465  shufps xmm3, xmm3, 4Eh
9466  movlps xmm4, qword ptr [ecx+112]
9467  movhps xmm4, qword ptr [ecx+136]
9468  movaps xmm5, xmm4
9469  shufps xmm5, xmm5, 4Eh
9470  movlps xmm6, qword ptr [edx]
9471  movhps xmm6, qword ptr [edx+24]
9472  movaps xmm7, xmm6
9473  shufps xmm7, xmm7, 0F0h
9474  mulps xmm7, xmm0
9475  shufps xmm6, xmm6, 0A5h
9476  movaps xmm1, xmm0
9477  shufps xmm1, xmm1, 4Eh
9478  mulps xmm1, xmm6
9479  addps xmm7, xmm1
9480  movlps xmm6, qword ptr [edx+8]
9481  movhps xmm6, qword ptr [edx+32]
9482  movaps xmm1, xmm6
9483  shufps xmm1, xmm1, 0F0h
9484  shufps xmm6, xmm6, 0A5h
9485  mulps xmm1, xmm2
9486  mulps xmm6, xmm3
9487  addps xmm7, xmm1
9488  addps xmm7, xmm6
9489  movhps xmm6, qword ptr [edx+40]
9490  movlps xmm6, qword ptr [edx+16]
9491  movaps xmm1, xmm6
9492  shufps xmm1, xmm1, 0F0h
9493  shufps xmm6, xmm6, 0A5h
9494  mulps xmm1, xmm4
9495  mulps xmm6, xmm5
9496  addps xmm7, xmm1
9497  addps xmm7, xmm6
9498  movlps qword ptr [eax+16], xmm7
9499  movhps qword ptr [eax+40], xmm7
9500  movlps xmm6, qword ptr [edx+48]
9501  movhps xmm6, qword ptr [edx+72]
9502  movaps xmm7, xmm6
9503  shufps xmm7, xmm7, 0F0h
9504  mulps xmm7, xmm0
9505  shufps xmm6, xmm6, 0A5h
9506  movaps xmm1, xmm0
9507  shufps xmm1, xmm1, 4Eh
9508  mulps xmm1, xmm6
9509  addps xmm7, xmm1
9510  movhps xmm6, qword ptr [edx+80]
9511  movlps xmm6, qword ptr [edx+56]
9512  movaps xmm1, xmm6
9513  shufps xmm1, xmm1, 0F0h
9514  shufps xmm6, xmm6, 0A5h
9515  mulps xmm1, xmm2
9516  mulps xmm6, xmm3
9517  addps xmm7, xmm1
9518  addps xmm7, xmm6
9519  movlps xmm6, qword ptr [edx+64]
9520  movhps xmm6, qword ptr [edx+88]
9521  movaps xmm1, xmm6
9522  shufps xmm1, xmm1, 0F0h
9523  shufps xmm6, xmm6, 0A5h
9524  mulps xmm1, xmm4
9525  mulps xmm6, xmm5
9526  addps xmm7, xmm1
9527  addps xmm7, xmm6
9528  movlps qword ptr [eax+64], xmm7
9529  movhps qword ptr [eax+88], xmm7
9530  movlps xmm6, qword ptr [edx+96]
9531  movhps xmm6, qword ptr [edx+120]
9532  movaps xmm7, xmm6
9533  shufps xmm7, xmm7, 0F0h
9534  mulps xmm7, xmm0
9535  shufps xmm6, xmm6, 0A5h
9536  movaps xmm1, xmm0
9537  shufps xmm1, xmm1, 4Eh
9538  mulps xmm1, xmm6
9539  addps xmm7, xmm1
9540  movlps xmm6, qword ptr [edx+104]
9541  movhps xmm6, qword ptr [edx+128]
9542  movaps xmm1, xmm6
9543  shufps xmm1, xmm1, 0F0h
9544  shufps xmm6, xmm6, 0A5h
9545  mulps xmm1, xmm2
9546  mulps xmm6, xmm3
9547  addps xmm7, xmm1
9548  addps xmm7, xmm6
9549  movlps xmm6, qword ptr [edx+112]
9550  movhps xmm6, qword ptr [edx+136]
9551  movaps xmm1, xmm6
9552  shufps xmm1, xmm1, 0F0h
9553  shufps xmm6, xmm6, 0A5h
9554  mulps xmm1, xmm4
9555  mulps xmm6, xmm5
9556  addps xmm7, xmm1
9557  addps xmm7, xmm6
9558  movlps qword ptr [eax+112], xmm7
9559  movhps qword ptr [eax+136], xmm7
9560  }
9561  return;
9562  }
9563  }
9564  }
9565  }
9566  for ( i = 0; i < k; i++ ) {
9567  m2Ptr = m2.ToFloatPtr();
9568  for ( j = 0; j < l; j++ ) {
9569  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
9570  m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l];
9571  m2Ptr++;
9572  }
9573  m1Ptr += 6;
9574  }
9575  break;
9576  }
9577  default: {
9578  for ( i = 0; i < k; i++ ) {
9579  for ( j = 0; j < l; j++ ) {
9580  m2Ptr = m2.ToFloatPtr() + j;
9581  sum = m1Ptr[0] * m2Ptr[0];
9582  for ( n = 1; n < m1.GetNumColumns(); n++ ) {
9583  m2Ptr += l;
9584  sum += m1Ptr[n] * m2Ptr[0];
9585  }
9586  *dstPtr++ = sum;
9587  }
9588  m1Ptr += m1.GetNumColumns();
9589  }
9590  break;
9591  }
9592  }
9593 }
9594 
9595 /*
9596 ============
9597 idSIMD_SSE::MatX_TransposeMultiplyMatX
9598 
9599  optimizes the following transpose matrix multiplications:
9600 
9601  Nx6 * NxN
9602  6xN * 6x6
9603 
9604  with N in the range [1-6].
9605 ============
9606 */
9607 void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
9608  int i, j, k, l, n;
9609  float *dstPtr;
9610  const float *m1Ptr, *m2Ptr;
9611  double sum;
9612 
9613  assert( m1.GetNumRows() == m2.GetNumRows() );
9614 
9615  m1Ptr = m1.ToFloatPtr();
9616  m2Ptr = m2.ToFloatPtr();
9617  dstPtr = dst.ToFloatPtr();
      // Computes dst = Transpose(m1) * m2.
      // k = columns of m1 (= rows of dst), l = columns of m2 (= columns of dst).
9618  k = m1.GetNumColumns();
9619  l = m2.GetNumColumns();
9620 
      // Each case handles one value of m1.GetNumRows(). The "AxB * CxD" comments give
      // the raw (untransposed) operand dimensions; the multiply itself uses Transpose(m1).
      // Shapes without a hand-written SSE path fall through to generic C loops per case.
9621  switch( m1.GetNumRows() ) {
9622  case 1:
      // dst is 6x1: broadcast the single m2 scalar and scale the 6 floats of m1.
9623  if ( !((k^6)|(l^1)) ) { // 1x6 * 1x1
9624  __asm {
9625  mov esi, m2Ptr
9626  mov edi, m1Ptr
9627  mov eax, dstPtr
9628  movss xmm0, [esi]
9629  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
9630  movaps xmm1, xmm0
9631  mulps xmm0, [edi]
9632  mulps xmm1, [edi+16]
9633  movaps [eax], xmm0
9634  movlps [eax+16], xmm1
9635  }
9636  return;
9637  }
      // generic 1-row path: dst[i][j] = m1[0][i] * m2[0][j]
9638  for ( i = 0; i < k; i++ ) {
9639  m2Ptr = m2.ToFloatPtr();
9640  for ( j = 0; j < l; j++ ) {
9641  *dstPtr++ = m1Ptr[0] * m2Ptr[0];
9642  m2Ptr++;
9643  }
9644  m1Ptr++;
9645  }
9646  break;
9647  case 2:
9648  if ( !((k^6)|(l^2)) ) { // 2x6 * 2x2
      // MUL_2xN_2x2_INIT: caches the two 2-element rows of the 2x2 m2 in xmm0/xmm1,
      // each duplicated into both lane pairs so two dst rows can be formed at once.
9649  #define MUL_2xN_2x2_INIT \
9650  __asm mov esi, m2Ptr \
9651  __asm mov edi, m1Ptr \
9652  __asm mov eax, dstPtr \
9653  __asm movlps xmm0, [esi] \
9654  __asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
9655  __asm movlps xmm1, [esi+8] \
9656  __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )
9657 
      // MUL_2xN_2x2_ROW2: computes two adjacent 2-wide dst rows (row and row+1) per call;
      // N is the column count of m1, so m1[1][x] lives N floats after m1[0][x].
9658  #define MUL_2xN_2x2_ROW2( N, row ) \
9659  __asm movlps xmm6, [edi+(row+0*N)*4] \
9660  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9661  __asm movlps xmm7, [edi+(row+1*N)*4] \
9662  __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9663  __asm mulps xmm6, xmm0 \
9664  __asm mulps xmm7, xmm1 \
9665  __asm addps xmm6, xmm7 \
9666  __asm movaps [eax+(row*2)*4], xmm6
9667 
9668  MUL_2xN_2x2_INIT
9669  MUL_2xN_2x2_ROW2( 6, 0 )
9670  MUL_2xN_2x2_ROW2( 6, 2 )
9671  MUL_2xN_2x2_ROW2( 6, 4 )
9672 
9673  return;
9674  }
      // generic 2-row path: dot the i-th column of m1 with the j-th column of m2
9675  for ( i = 0; i < k; i++ ) {
9676  m2Ptr = m2.ToFloatPtr();
9677  for ( j = 0; j < l; j++ ) {
9678  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l];
9679  m2Ptr++;
9680  }
9681  m1Ptr++;
9682  }
9683  break;
9684  case 3:
9685  if ( !((k^6)|(l^3)) ) { // 3x6 * 3x3
9686 
      // MUL_3xN_3x3_INIT: loads the three 3-element rows of the 3x3 m2 into xmm0-xmm2
      // (element 0 via movss, elements 1-2 into the upper lanes via movhps).
9687  #define MUL_3xN_3x3_INIT \
9688  __asm mov esi, m2Ptr \
9689  __asm mov edi, m1Ptr \
9690  __asm mov eax, dstPtr \
9691  __asm movss xmm0, [esi+(0*3+0)*4] \
9692  __asm movhps xmm0, [esi+(0*3+1)*4] \
9693  __asm movss xmm1, [esi+(1*3+0)*4] \
9694  __asm movhps xmm1, [esi+(1*3+1)*4] \
9695  __asm movss xmm2, [esi+(2*3+0)*4] \
9696  __asm movhps xmm2, [esi+(2*3+1)*4]
9697 
      // MUL_3xN_3x3_INIT_ROW4: re-shuffles the cached m2 rows into the lane layout
      // the 4-rows-at-a-time macro below expects.
9698  #define MUL_3xN_3x3_INIT_ROW4 \
9699  __asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 3, 0 ) \
9700  __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 0 ) \
9701  __asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 2, 3, 0 )
9702 
      // MUL_3xN_3x3_ROW4: produces 12 consecutive dst floats (four 3-wide dst rows,
      // rows row..row+3), rotating the m2 lanes between the three aligned 16-byte stores.
9703  #define MUL_3xN_3x3_ROW4( N, row ) \
9704  __asm movlps xmm3, [edi+(row+0*N+0)*4] \
9705  __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 1 ) \
9706  __asm movlps xmm4, [edi+(row+1*N+0)*4] \
9707  __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 1 ) \
9708  __asm movlps xmm5, [edi+(row+2*N+0)*4] \
9709  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 1 ) \
9710  __asm mulps xmm3, xmm0 \
9711  __asm mulps xmm4, xmm1 \
9712  __asm mulps xmm5, xmm2 \
9713  __asm addps xmm3, xmm4 \
9714  __asm addps xmm3, xmm5 \
9715  __asm movaps [eax+(row*3+0)*4], xmm3 \
9716  __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 ) \
9717  __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 ) \
9718  __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 ) \
9719  __asm movlps xmm3, [edi+(row+0*N+1)*4] \
9720  __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9721  __asm movlps xmm4, [edi+(row+1*N+1)*4] \
9722  __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9723  __asm movlps xmm5, [edi+(row+2*N+1)*4] \
9724  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9725  __asm mulps xmm3, xmm0 \
9726  __asm mulps xmm4, xmm1 \
9727  __asm mulps xmm5, xmm2 \
9728  __asm addps xmm3, xmm4 \
9729  __asm addps xmm3, xmm5 \
9730  __asm movaps [eax+(row*3+4)*4], xmm3 \
9731  __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 ) \
9732  __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 ) \
9733  __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 ) \
9734  __asm movlps xmm3, [edi+(row+0*N+2)*4] \
9735  __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 1, 1, 1 ) \
9736  __asm movlps xmm4, [edi+(row+1*N+2)*4] \
9737  __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 1, 1, 1 ) \
9738  __asm movlps xmm5, [edi+(row+2*N+2)*4] \
9739  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 1, 1 ) \
9740  __asm mulps xmm3, xmm0 \
9741  __asm mulps xmm4, xmm1 \
9742  __asm mulps xmm5, xmm2 \
9743  __asm addps xmm3, xmm4 \
9744  __asm addps xmm3, xmm5 \
9745  __asm movaps [eax+(row*3+8)*4], xmm3
9746 
      // MUL_3xN_3x3_INIT_ROW4_ROW4: rotates the m2 lanes back so ROW4 can run again
      // (unused for N == 6 but kept for other N).
9747  #define MUL_3xN_3x3_INIT_ROW4_ROW4 \
9748  __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) \
9749  __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) \
9750  __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
9751 
      // MUL_3xN_3x3_INIT_ROW4_ROW: lane fixup used between a ROW4 and the
      // single-row macro below.
9752  #define MUL_3xN_3x3_INIT_ROW4_ROW \
9753  __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 1, 2, 3 ) \
9754  __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 2, 3 ) \
9755  __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 1, 2, 3 )
9756 
      // MUL_3xN_3x3_ROW: one 3-wide dst row; movss stores element 0, movhps stores
      // elements 1-2 (exactly three floats written).
9757  #define MUL_3xN_3x3_ROW( N, row ) \
9758  __asm movss xmm3, [edi+(row+0*N)*4] \
9759  __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9760  __asm movss xmm4, [edi+(row+1*N)*4] \
9761  __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9762  __asm movss xmm5, [edi+(row+2*N)*4] \
9763  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9764  __asm mulps xmm3, xmm0 \
9765  __asm mulps xmm4, xmm1 \
9766  __asm mulps xmm5, xmm2 \
9767  __asm addps xmm3, xmm4 \
9768  __asm addps xmm3, xmm5 \
9769  __asm movss [eax+(row*3+0)*4], xmm3 \
9770  __asm movhps [eax+(row*3+1)*4], xmm3
9771 
9772  MUL_3xN_3x3_INIT
9773  MUL_3xN_3x3_INIT_ROW4
9774  MUL_3xN_3x3_ROW4( 6, 0 )
9775  MUL_3xN_3x3_INIT_ROW4_ROW
9776  MUL_3xN_3x3_ROW( 6, 4 )
9777  MUL_3xN_3x3_ROW( 6, 5 )
9778 
9779  return;
9780  }
      // generic 3-row path
9781  for ( i = 0; i < k; i++ ) {
9782  m2Ptr = m2.ToFloatPtr();
9783  for ( j = 0; j < l; j++ ) {
9784  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l];
9785  m2Ptr++;
9786  }
9787  m1Ptr++;
9788  }
9789  break;
9790  case 4:
9791  if ( !((k^6)|(l^4)) ) { // 4x6 * 4x4
9792 
      // MUL_4xN_4x4_INIT: loads the four aligned 16-byte rows of the 4x4 m2 into xmm0-xmm3.
9793  #define MUL_4xN_4x4_INIT \
9794  __asm mov esi, m2Ptr \
9795  __asm mov edi, m1Ptr \
9796  __asm mov eax, dstPtr \
9797  __asm movaps xmm0, [esi] \
9798  __asm movaps xmm1, [esi+16] \
9799  __asm movaps xmm2, [esi+32] \
9800  __asm movaps xmm3, [esi+48]
9801 
      // MUL_4xN_4x4_ROW: dst row 'row' = sum_i m1[i][row] * m2-row-i; each scalar from
      // column 'row' of m1 is broadcast and used to scale the corresponding m2 row.
9802  #define MUL_4xN_4x4_ROW( N, row ) \
9803  __asm movss xmm7, [edi+(row+0*N)*4] \
9804  __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9805  __asm mulps xmm7, xmm0 \
9806  __asm movss xmm6, [edi+(row+1*N)*4] \
9807  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9808  __asm mulps xmm6, xmm1 \
9809  __asm addps xmm7, xmm6 \
9810  __asm movss xmm6, [edi+(row+2*N)*4] \
9811  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9812  __asm mulps xmm6, xmm2 \
9813  __asm addps xmm7, xmm6 \
9814  __asm movss xmm6, [edi+(row+3*N)*4] \
9815  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9816  __asm mulps xmm6, xmm3 \
9817  __asm addps xmm7, xmm6 \
9818  __asm movaps [eax+row*16], xmm7
9819 
9820  MUL_4xN_4x4_INIT
9821  MUL_4xN_4x4_ROW( 6, 0 )
9822  MUL_4xN_4x4_ROW( 6, 1 )
9823  MUL_4xN_4x4_ROW( 6, 2 )
9824  MUL_4xN_4x4_ROW( 6, 3 )
9825  MUL_4xN_4x4_ROW( 6, 4 )
9826  MUL_4xN_4x4_ROW( 6, 5 )
9827 
9828  return;
9829  }
      // generic 4-row path
9830  for ( i = 0; i < k; i++ ) {
9831  m2Ptr = m2.ToFloatPtr();
9832  for ( j = 0; j < l; j++ ) {
9833  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
9834  m1Ptr[3*k] * m2Ptr[3*l];
9835  m2Ptr++;
9836  }
9837  m1Ptr++;
9838  }
9839  break;
9840  case 5:
9841  if ( !((k^6)|(l^5)) ) { // 5x6 * 5x5
9842 
      // MUL_5xN_5x5_INIT: loads elements 0-3 of each of the five 5-element m2 rows
      // into xmm0-xmm4; the fifth element of each row is handled on the x87 stack below.
9843  #define MUL_5xN_5x5_INIT \
9844  __asm mov esi, m2Ptr \
9845  __asm mov edi, m1Ptr \
9846  __asm mov eax, dstPtr \
9847  __asm movlps xmm0, [esi+ 0*4] \
9848  __asm movhps xmm0, [esi+ 2*4] \
9849  __asm movlps xmm1, [esi+ 5*4] \
9850  __asm movhps xmm1, [esi+ 7*4] \
9851  __asm movlps xmm2, [esi+10*4] \
9852  __asm movhps xmm2, [esi+12*4] \
9853  __asm movlps xmm3, [esi+15*4] \
9854  __asm movhps xmm3, [esi+17*4] \
9855  __asm movlps xmm4, [esi+20*4] \
9856  __asm movhps xmm4, [esi+22*4]
9857 
      // MUL_5xN_5x5_ROW: first four dst elements of the row via SSE while the fifth
      // column (m2[i][4] at [esi+(5i+4)*4]) is accumulated in parallel on the x87 stack,
      // stored with fstp into dst element row*5+4.
9858  #define MUL_5xN_5x5_ROW( N, row ) \
9859  __asm movss xmm6, [edi+(row+0*N)*4] \
9860  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9861  __asm mulps xmm6, xmm0 \
9862  __asm fld dword ptr [edi+(row+0*N)*4] \
9863  __asm fmul dword ptr [esi+ 4*4] \
9864  __asm movss xmm5, [edi+(row+1*N)*4] \
9865  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9866  __asm mulps xmm5, xmm1 \
9867  __asm addps xmm6, xmm5 \
9868  __asm fld dword ptr [edi+(row+1*N)*4] \
9869  __asm fmul dword ptr [esi+ 9*4] \
9870  __asm faddp st(1),st \
9871  __asm movss xmm5, [edi+(row+2*N)*4] \
9872  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9873  __asm mulps xmm5, xmm2 \
9874  __asm addps xmm6, xmm5 \
9875  __asm fld dword ptr [edi+(row+2*N)*4] \
9876  __asm fmul dword ptr [esi+14*4] \
9877  __asm faddp st(1),st \
9878  __asm movss xmm5, [edi+(row+3*N)*4] \
9879  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9880  __asm mulps xmm5, xmm3 \
9881  __asm addps xmm6, xmm5 \
9882  __asm fld dword ptr [edi+(row+3*N)*4] \
9883  __asm fmul dword ptr [esi+19*4] \
9884  __asm faddp st(1),st \
9885  __asm movss xmm5, [edi+(row+4*N)*4] \
9886  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9887  __asm mulps xmm5, xmm4 \
9888  __asm addps xmm6, xmm5 \
9889  __asm fld dword ptr [edi+(row+4*N)*4] \
9890  __asm fmul dword ptr [esi+24*4] \
9891  __asm faddp st(1),st \
9892  __asm fstp dword ptr [eax+(row*5+4)*4] \
9893  __asm movlps [eax+(row*5+0)*4], xmm6 \
9894  __asm movhps [eax+(row*5+2)*4], xmm6
9895 
9896  MUL_5xN_5x5_INIT
9897  MUL_5xN_5x5_ROW( 6, 0 )
9898  MUL_5xN_5x5_ROW( 6, 1 )
9899  MUL_5xN_5x5_ROW( 6, 2 )
9900  MUL_5xN_5x5_ROW( 6, 3 )
9901  MUL_5xN_5x5_ROW( 6, 4 )
9902  MUL_5xN_5x5_ROW( 6, 5 )
9903 
9904  return;
9905  }
      // generic 5-row path
9906  for ( i = 0; i < k; i++ ) {
9907  m2Ptr = m2.ToFloatPtr();
9908  for ( j = 0; j < l; j++ ) {
9909  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
9910  m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l];
9911  m2Ptr++;
9912  }
9913  m1Ptr++;
9914  }
9915  break;
9916  case 6:
      // 6 rows: any 6xN (N in 1..6) times a 6x6 m2 is specialized. The dst row's
      // first four elements and last two elements are produced by separate passes.
9917  if ( !(l^6) ) {
9918  switch( k ) {
9919  case 1: { // 6x1 * 6x6
      // MUL_6xN_6x6_FIRST4COLUMNS_INIT: caches elements 0-3 of all six m2 rows in xmm0-xmm5.
9920  #define MUL_6xN_6x6_FIRST4COLUMNS_INIT \
9921  __asm mov esi, m2Ptr \
9922  __asm mov edi, m1Ptr \
9923  __asm mov eax, dstPtr \
9924  __asm movlps xmm0, [esi+ 0*4] \
9925  __asm movhps xmm0, [esi+ 2*4] \
9926  __asm movlps xmm1, [esi+ 6*4] \
9927  __asm movhps xmm1, [esi+ 8*4] \
9928  __asm movlps xmm2, [esi+12*4] \
9929  __asm movhps xmm2, [esi+14*4] \
9930  __asm movlps xmm3, [esi+18*4] \
9931  __asm movhps xmm3, [esi+20*4] \
9932  __asm movlps xmm4, [esi+24*4] \
9933  __asm movhps xmm4, [esi+26*4] \
9934  __asm movlps xmm5, [esi+30*4] \
9935  __asm movhps xmm5, [esi+32*4]
9936 
      // MUL_6xN_6x6_FIRST4COLUMNS_ROW: dst elements row*6 .. row*6+3; broadcasts each
      // scalar of m1 column 'row' and accumulates against the cached m2 rows.
9937  #define MUL_6xN_6x6_FIRST4COLUMNS_ROW( N, row ) \
9938  __asm movss xmm7, [edi+(row+0*N)*4] \
9939  __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9940  __asm mulps xmm7, xmm0 \
9941  __asm movss xmm6, [edi+(row+1*N)*4] \
9942  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9943  __asm mulps xmm6, xmm1 \
9944  __asm addps xmm7, xmm6 \
9945  __asm movss xmm6, [edi+(row+2*N)*4] \
9946  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9947  __asm mulps xmm6, xmm2 \
9948  __asm addps xmm7, xmm6 \
9949  __asm movss xmm6, [edi+(row+3*N)*4] \
9950  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9951  __asm mulps xmm6, xmm3 \
9952  __asm addps xmm7, xmm6 \
9953  __asm movss xmm6, [edi+(row+4*N)*4] \
9954  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9955  __asm mulps xmm6, xmm4 \
9956  __asm addps xmm7, xmm6 \
9957  __asm movss xmm6, [edi+(row+5*N)*4] \
9958  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9959  __asm mulps xmm6, xmm5 \
9960  __asm addps xmm7, xmm6 \
9961  __asm movlps [eax+(row*6+0)*4], xmm7 \
9962  __asm movhps [eax+(row*6+2)*4], xmm7
9963 
      // MUL_6xN_6x6_LAST2COLUMNS_INIT: reloads xmm0-xmm5 with elements 4-5 of each
      // m2 row, pair-duplicated so two dst rows' tails can be formed per register op.
9964  #define MUL_6xN_6x6_LAST2COLUMNS_INIT \
9965  __asm movlps xmm0, [esi+ 4*4] \
9966  __asm movlps xmm1, [esi+10*4] \
9967  __asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
9968  __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
9969  __asm movlps xmm2, [esi+16*4] \
9970  __asm movlps xmm3, [esi+22*4] \
9971  __asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \
9972  __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
9973  __asm movlps xmm4, [esi+28*4] \
9974  __asm movlps xmm5, [esi+34*4] \
9975  __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \
9976  __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
9977 
      // MUL_6xN_6x6_LAST2COLUMNS_ROW2: last two dst elements of two consecutive dst
      // rows (2*row and 2*row+1) at once; stores land at offsets row*12+4 and row*12+10.
9978  #define MUL_6xN_6x6_LAST2COLUMNS_ROW2( N, row ) \
9979  __asm movlps xmm7, [edi+(row*2+0*N)*4] \
9980  __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9981  __asm mulps xmm7, xmm0 \
9982  __asm movlps xmm6, [edi+(row*2+1*N)*4] \
9983  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9984  __asm mulps xmm6, xmm1 \
9985  __asm addps xmm7, xmm6 \
9986  __asm movlps xmm6, [edi+(row*2+2*N)*4] \
9987  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9988  __asm mulps xmm6, xmm2 \
9989  __asm addps xmm7, xmm6 \
9990  __asm movlps xmm6, [edi+(row*2+3*N)*4] \
9991  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9992  __asm mulps xmm6, xmm3 \
9993  __asm addps xmm7, xmm6 \
9994  __asm movlps xmm6, [edi+(row*2+4*N)*4] \
9995  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9996  __asm mulps xmm6, xmm4 \
9997  __asm addps xmm7, xmm6 \
9998  __asm movlps xmm6, [edi+(row*2+5*N)*4] \
9999  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
10000  __asm mulps xmm6, xmm5 \
10001  __asm addps xmm7, xmm6 \
10002  __asm movlps [eax+(row*12+ 4)*4], xmm7 \
10003  __asm movhps [eax+(row*12+10)*4], xmm7
10004 
      // MUL_6xN_6x6_LAST2COLUMNS_ROW: last two dst elements of the final (odd) dst row.
      // Only ever invoked with row == N-1, which is why the loads use the fixed index
      // (i*N-1) == (row + (i-1)*N) instead of the 'row' parameter.
10005  #define MUL_6xN_6x6_LAST2COLUMNS_ROW( N, row ) \
10006  __asm movss xmm7, [edi+(1*N-1)*4] \
10007  __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
10008  __asm mulps xmm7, xmm0 \
10009  __asm movss xmm6, [edi+(2*N-1)*4] \
10010  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
10011  __asm mulps xmm6, xmm1 \
10012  __asm addps xmm7, xmm6 \
10013  __asm movss xmm6, [edi+(3*N-1)*4] \
10014  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
10015  __asm mulps xmm6, xmm2 \
10016  __asm addps xmm7, xmm6 \
10017  __asm movss xmm6, [edi+(4*N-1)*4] \
10018  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
10019  __asm mulps xmm6, xmm3 \
10020  __asm addps xmm7, xmm6 \
10021  __asm movss xmm6, [edi+(5*N-1)*4] \
10022  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
10023  __asm mulps xmm6, xmm4 \
10024  __asm addps xmm7, xmm6 \
10025  __asm movss xmm6, [edi+(6*N-1)*4] \
10026  __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
10027  __asm mulps xmm6, xmm5 \
10028  __asm addps xmm7, xmm6 \
10029  __asm movlps [eax+(row*6+4)*4], xmm7
10030 
10031  MUL_6xN_6x6_FIRST4COLUMNS_INIT
10032  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 1, 0 )
10033  MUL_6xN_6x6_LAST2COLUMNS_INIT
10034  MUL_6xN_6x6_LAST2COLUMNS_ROW( 1, 0 )
10035 
10036  return;
10037  }
10038  case 2: { // 6x2 * 6x6
10039 
10040  MUL_6xN_6x6_FIRST4COLUMNS_INIT
10041  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 0 )
10042  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 1 )
10043  MUL_6xN_6x6_LAST2COLUMNS_INIT
10044  MUL_6xN_6x6_LAST2COLUMNS_ROW2( 2, 0 )
10045 
10046  return;
10047  }
10048  case 3: { // 6x3 * 6x6
10049 
10050  MUL_6xN_6x6_FIRST4COLUMNS_INIT
10051  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 0 )
10052  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 1 )
10053  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 2 )
10054  MUL_6xN_6x6_LAST2COLUMNS_INIT
10055  MUL_6xN_6x6_LAST2COLUMNS_ROW2( 3, 0 )
10056  MUL_6xN_6x6_LAST2COLUMNS_ROW( 3, 2 )
10057 
10058  return;
10059  }
10060  case 4: { // 6x4 * 6x6
10061 
10062  MUL_6xN_6x6_FIRST4COLUMNS_INIT
10063  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 0 )
10064  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 1 )
10065  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 2 )
10066  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 3 )
10067  MUL_6xN_6x6_LAST2COLUMNS_INIT
10068  MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 0 )
10069  MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 1 )
10070 
10071  return;
10072  }
10073  case 5: { // 6x5 * 6x6
10074 
10075  MUL_6xN_6x6_FIRST4COLUMNS_INIT
10076  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 0 )
10077  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 1 )
10078  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 2 )
10079  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 3 )
10080  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 4 )
10081  MUL_6xN_6x6_LAST2COLUMNS_INIT
10082  MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 0 )
10083  MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 1 )
10084  MUL_6xN_6x6_LAST2COLUMNS_ROW( 5, 4 )
10085 
10086  return;
10087  }
10088  case 6: { // 6x6 * 6x6
10089 
10090  MUL_6xN_6x6_FIRST4COLUMNS_INIT
10091  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 0 )
10092  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 1 )
10093  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 2 )
10094  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 3 )
10095  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 4 )
10096  MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 5 )
10097  MUL_6xN_6x6_LAST2COLUMNS_INIT
10098  MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 0 )
10099  MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 1 )
10100  MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 2 )
10101 
10102  return;
10103  }
10104  }
10105  }
      // generic 6-row path (m2 not 6x6)
10106  for ( i = 0; i < k; i++ ) {
10107  m2Ptr = m2.ToFloatPtr();
10108  for ( j = 0; j < l; j++ ) {
10109  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
10110  m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l];
10111  m2Ptr++;
10112  }
10113  m1Ptr++;
10114  }
10115  break;
10116  default:
      // General fallback: dst[i][j] = dot( column i of m1, column j of m2 ),
      // accumulated in double precision ('sum') before narrowing to float.
10117  for ( i = 0; i < k; i++ ) {
10118  for ( j = 0; j < l; j++ ) {
10119  m1Ptr = m1.ToFloatPtr() + i;
10120  m2Ptr = m2.ToFloatPtr() + j;
10121  sum = m1Ptr[0] * m2Ptr[0];
10122  for ( n = 1; n < m1.GetNumRows(); n++ ) {
10123  m1Ptr += k;
10124  m2Ptr += l;
10125  sum += m1Ptr[0] * m2Ptr[0];
10126  }
10127  *dstPtr++ = sum;
10128  }
10129  }
10130  break;
10131  }
10132 }
10133 
10134 /*
10135 ============
10136 idSIMD_SSE::MatX_LowerTriangularSolve
10137 
10138  solves x in Lx = b for the n * n sub-matrix of L
10139  if skip > 0 the first skip elements of x are assumed to be valid already
10140  L has to be a lower triangular matrix with (implicit) ones on the diagonal
10141  x == b is allowed
10142 ============
10143 */
void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
	int nc;
	const float *lptr;

	// all requested elements are already valid: nothing to solve
	if ( skip >= n ) {
		return;
	}

	lptr = L.ToFloatPtr();
	nc = L.GetNumColumns();		// row stride of L in floats (may exceed n)

	// unrolled cases for n < 8
	if ( n < 8 ) {
		// pack (n, skip) into one switch key: n in bits 3+, skip in the low 3 bits
		// NOTE: macro is function-local by convention only; it is not #undef'ed
		#define NSKIP( n, s ) ((n<<3)|(s&7))
		switch( NSKIP( n, skip ) ) {
			// cases deliberately fall through: entering at row 'skip' computes
			// every remaining row of the forward substitution
			case NSKIP( 1, 0 ): x[0] = b[0];
				return;
			case NSKIP( 2, 0 ): x[0] = b[0];
			case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
				return;
			case NSKIP( 3, 0 ): x[0] = b[0];
			case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
				return;
			case NSKIP( 4, 0 ): x[0] = b[0];
			case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
				return;
			case NSKIP( 5, 0 ): x[0] = b[0];
			case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
				return;
			case NSKIP( 6, 0 ): x[0] = b[0];
			case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
			case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
				return;
			case NSKIP( 7, 0 ): x[0] = b[0];
			case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
			case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
			case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
				return;
		}
		return;
	}

	// process first 4 rows (fall-through again: start at row 'skip')
	switch( skip ) {
		case 0: x[0] = b[0];
		case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
		case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
		case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			skip = 4;
	}

	lptr = L[skip];

	// this code assumes n > 4
	//
	// for each remaining row i: x[i] = b[i] - dot( L[i][0..i-1], x[0..i-1] )
	// eax holds i*4 (byte offset); ecx runs from -i*4 up toward 0 so that
	// [esi+ecx]/[edi+ecx] walk x[] and the L row from element 0.
	// Two copies of the loop: movaps when x, lptr and the row stride are all
	// 16-byte aligned, movups otherwise.
	__asm {
		push	ebx
		mov		eax, skip				// eax = i
		shl		eax, 2					// eax = i*4
		mov		edx, n					// edx = n
		shl		edx, 2					// edx = n*4
		mov		esi, x					// esi = x
		mov		edi, lptr				// edi = lptr
		add		esi, eax
		add		edi, eax
		mov		ebx, b					// ebx = b

		// check for aligned memory
		mov		ecx, nc
		shl		ecx, 2
		or		ecx, esi
		or		ecx, edi
		and		ecx, 15
		jnz		loopurow

		// aligned
	looprow:
		mov		ecx, eax
		neg		ecx
		movaps	xmm0, [esi+ecx]
		mulps	xmm0, [edi+ecx]
		add		ecx, 12*4
		jg		donedot8
	dot8:									// two 4-wide mul-adds per iteration
		movaps	xmm1, [esi+ecx-(8*4)]
		mulps	xmm1, [edi+ecx-(8*4)]
		addps	xmm0, xmm1
		movaps	xmm3, [esi+ecx-(4*4)]
		mulps	xmm3, [edi+ecx-(4*4)]
		addps	xmm0, xmm3
		add		ecx, 8*4
		jle		dot8
	donedot8:
		sub		ecx, 4*4
		jg		donedot4
		//dot4:
		movaps	xmm1, [esi+ecx-(4*4)]
		mulps	xmm1, [edi+ecx-(4*4)]
		addps	xmm0, xmm1
		add		ecx, 4*4
	donedot4:
		// horizontal add of the four partial sums in xmm0
		movhlps	xmm1, xmm0
		addps	xmm0, xmm1
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
		addss	xmm0, xmm1
		// handle the 0-3 scalar elements left over from the 4-wide loop
		sub		ecx, 4*4
		jz		dot0
		add		ecx, 4
		jz		dot1
		add		ecx, 4
		jz		dot2
		//dot3:
		movss	xmm1, [esi-(3*4)]
		mulss	xmm1, [edi-(3*4)]
		addss	xmm0, xmm1
	dot2:
		movss	xmm3, [esi-(2*4)]
		mulss	xmm3, [edi-(2*4)]
		addss	xmm0, xmm3
	dot1:
		movss	xmm5, [esi-(1*4)]
		mulss	xmm5, [edi-(1*4)]
		addss	xmm0, xmm5
	dot0:
		movss	xmm1, [ebx+eax]			// x[i] = b[i] - dot
		subss	xmm1, xmm0
		movss	[esi], xmm1
		add		eax, 4
		cmp		eax, edx
		jge		done
		add		esi, 4					// advance to x[i+1] and the next L row
		mov		ecx, nc
		shl		ecx, 2
		add		edi, ecx
		add		edi, 4
		jmp		looprow

		// unaligned (same algorithm, movups loads)
	loopurow:
		mov		ecx, eax
		neg		ecx
		movups	xmm0, [esi+ecx]
		movups	xmm1, [edi+ecx]
		mulps	xmm0, xmm1
		add		ecx, 12*4
		jg		doneudot8
	udot8:
		movups	xmm1, [esi+ecx-(8*4)]
		movups	xmm2, [edi+ecx-(8*4)]
		mulps	xmm1, xmm2
		addps	xmm0, xmm1
		movups	xmm3, [esi+ecx-(4*4)]
		movups	xmm4, [edi+ecx-(4*4)]
		mulps	xmm3, xmm4
		addps	xmm0, xmm3
		add		ecx, 8*4
		jle		udot8
	doneudot8:
		sub		ecx, 4*4
		jg		doneudot4
		//udot4:
		movups	xmm1, [esi+ecx-(4*4)]
		movups	xmm2, [edi+ecx-(4*4)]
		mulps	xmm1, xmm2
		addps	xmm0, xmm1
		add		ecx, 4*4
	doneudot4:
		movhlps	xmm1, xmm0
		addps	xmm0, xmm1
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
		addss	xmm0, xmm1
		sub		ecx, 4*4
		jz		udot0
		add		ecx, 4
		jz		udot1
		add		ecx, 4
		jz		udot2
		//udot3:
		movss	xmm1, [esi-(3*4)]
		movss	xmm2, [edi-(3*4)]
		mulss	xmm1, xmm2
		addss	xmm0, xmm1
	udot2:
		movss	xmm3, [esi-(2*4)]
		movss	xmm4, [edi-(2*4)]
		mulss	xmm3, xmm4
		addss	xmm0, xmm3
	udot1:
		movss	xmm5, [esi-(1*4)]
		movss	xmm6, [edi-(1*4)]
		mulss	xmm5, xmm6
		addss	xmm0, xmm5
	udot0:
		movss	xmm1, [ebx+eax]			// x[i] = b[i] - dot
		subss	xmm1, xmm0
		movss	[esi], xmm1
		add		eax, 4
		cmp		eax, edx
		jge		done
		add		esi, 4
		mov		ecx, nc
		shl		ecx, 2
		add		edi, ecx
		add		edi, 4
		jmp		loopurow
	done:
		pop		ebx
	}
}
10366 
10367 /*
10368 ============
10369 idSIMD_SSE::MatX_LowerTriangularSolveTranspose
10370 
10371  solves x in L'x = b for the n * n sub-matrix of L
10372  L has to be a lower triangular matrix with (implicit) ones on the diagonal
10373  x == b is allowed
10374 ============
10375 */
void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
	int nc;
	const float *lptr;

	lptr = L.ToFloatPtr();
	nc = L.GetNumColumns();		// row stride of L in floats

	// unrolled cases for n < 8 (back substitution written out in full)
	if ( n < 8 ) {
		switch( n ) {
			case 0:
				return;
			case 1:
				x[0] = b[0];
				return;
			case 2:
				x[1] = b[1];
				x[0] = b[0] - lptr[1*nc+0] * x[1];
				return;
			case 3:
				x[2] = b[2];
				x[1] = b[1] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 4:
				x[3] = b[3];
				x[2] = b[2] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 5:
				x[4] = b[4];
				x[3] = b[3] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 6:
				x[5] = b[5];
				x[4] = b[4] - lptr[5*nc+4] * x[5];
				x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 7:
				x[6] = b[6];
				x[5] = b[5] - lptr[6*nc+5] * x[6];
				x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
				x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
		}
		return;
	}

#if 1

	int i, j, m;
	float *xptr;
	double s0;

	// if the number of columns is not a multiple of 2 we're screwed for alignment.
	// however, if the number of columns is a multiple of 2 but the number of to be
	// processed rows is not a multiple of 2 we can still run 8 byte aligned
	m = n;
	if ( m & 1 ) {

		// odd n: peel the last row (x[m] = b[m], diagonal is implicit 1),
		// then solve the remaining even number of rows 4 at a time
		m--;
		x[m] = b[m];

		lptr = L.ToFloatPtr() + m * nc + m - 4;
		xptr = x + m;
		// each process4rows_1 pass computes x[i-4..i-1]:
		// subtract the 4x4 column blocks of already-solved x values, then
		// resolve the 4 rows against each other (upper-triangular back-solve)
		__asm {
			push	ebx
			mov		eax, m					// eax = i
			mov		esi, xptr				// esi = xptr
			mov		edi, lptr				// edi = lptr
			mov		ebx, b					// ebx = b
			mov		edx, nc					// edx = nc*sizeof(float)
			shl		edx, 2
		process4rows_1:
			movlps	xmm0, [ebx+eax*4-16]	// load b[i-4], b[i-3]
			movhps	xmm0, [ebx+eax*4-8]		// load b[i-2], b[i-1]
			xor		ecx, ecx
			sub		eax, m
			neg		eax
			jz		done4x4_1
		process4x4_1:	// process 4x4 blocks
			movlps	xmm2, [edi+0]
			movhps	xmm2, [edi+8]
			add		edi, edx
			movss	xmm1, [esi+4*ecx+0]
			shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			movlps	xmm3, [edi+0]
			movhps	xmm3, [edi+8]
			add		edi, edx
			mulps	xmm1, xmm2
			subps	xmm0, xmm1
			movss	xmm1, [esi+4*ecx+4]
			shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			movlps	xmm4, [edi+0]
			movhps	xmm4, [edi+8]
			add		edi, edx
			mulps	xmm1, xmm3
			subps	xmm0, xmm1
			movss	xmm1, [esi+4*ecx+8]
			shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			movlps	xmm5, [edi+0]
			movhps	xmm5, [edi+8]
			add		edi, edx
			mulps	xmm1, xmm4
			subps	xmm0, xmm1
			movss	xmm1, [esi+4*ecx+12]
			shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			add		ecx, 4
			cmp		ecx, eax
			mulps	xmm1, xmm5
			subps	xmm0, xmm1
			jl		process4x4_1
		done4x4_1:		// process left over of the 4 rows (the peeled x[m] column)
			movlps	xmm2, [edi+0]
			movhps	xmm2, [edi+8]
			movss	xmm1, [esi+4*ecx]
			shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			mulps	xmm1, xmm2
			subps	xmm0, xmm1
			imul	ecx, edx
			sub		edi, ecx
			neg		eax

			add		eax, m
			sub		eax, 4
			// broadcast the 4 partial results s0..s3, then back-solve the 4x4
			// diagonal block: s3 is final, s2 depends on s3, etc.
			movaps	xmm1, xmm0
			shufps	xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
			movaps	xmm2, xmm0
			shufps	xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
			movaps	xmm3, xmm0
			shufps	xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
			sub		edi, edx
			movss	[esi-4], xmm3			// xptr[-1] = s3
			movss	xmm4, xmm3
			movss	xmm5, xmm3
			mulss	xmm3, [edi+8]			// lptr[-1*nc+2] * s3
			mulss	xmm4, [edi+4]			// lptr[-1*nc+1] * s3
			mulss	xmm5, [edi+0]			// lptr[-1*nc+0] * s3
			subss	xmm2, xmm3
			movss	[esi-8], xmm2			// xptr[-2] = s2
			movss	xmm6, xmm2
			sub		edi, edx
			subss	xmm0, xmm5
			subss	xmm1, xmm4
			mulss	xmm2, [edi+4]			// lptr[-2*nc+1] * s2
			mulss	xmm6, [edi+0]			// lptr[-2*nc+0] * s2
			subss	xmm1, xmm2
			movss	[esi-12], xmm1			// xptr[-3] = s1
			subss	xmm0, xmm6
			sub		edi, edx
			cmp		eax, 4
			mulss	xmm1, [edi+0]			// lptr[-3*nc+0] * s1
			subss	xmm0, xmm1
			movss	[esi-16], xmm0			// xptr[-4] = s0
			jl		done4rows_1
			sub		edi, edx
			sub		edi, 16
			sub		esi, 16
			jmp		process4rows_1
		done4rows_1:
			pop		ebx
		}

	} else {

		// even n: same algorithm without the peeled row
		lptr = L.ToFloatPtr() + m * nc + m - 4;
		xptr = x + m;
		__asm {
			push	ebx
			mov		eax, m					// eax = i
			mov		esi, xptr				// esi = xptr
			mov		edi, lptr				// edi = lptr
			mov		ebx, b					// ebx = b
			mov		edx, nc					// edx = nc*sizeof(float)
			shl		edx, 2
		process4rows:
			movlps	xmm0, [ebx+eax*4-16]	// load b[i-4], b[i-3]
			movhps	xmm0, [ebx+eax*4-8]		// load b[i-2], b[i-1]
			sub		eax, m
			jz		done4x4
			neg		eax
			xor		ecx, ecx
		process4x4:		// process 4x4 blocks
			movlps	xmm2, [edi+0]
			movhps	xmm2, [edi+8]
			add		edi, edx
			movss	xmm1, [esi+4*ecx+0]
			shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			movlps	xmm3, [edi+0]
			movhps	xmm3, [edi+8]
			add		edi, edx
			mulps	xmm1, xmm2
			subps	xmm0, xmm1
			movss	xmm1, [esi+4*ecx+4]
			shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			movlps	xmm4, [edi+0]
			movhps	xmm4, [edi+8]
			add		edi, edx
			mulps	xmm1, xmm3
			subps	xmm0, xmm1
			movss	xmm1, [esi+4*ecx+8]
			shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			movlps	xmm5, [edi+0]
			movhps	xmm5, [edi+8]
			add		edi, edx
			mulps	xmm1, xmm4
			subps	xmm0, xmm1
			movss	xmm1, [esi+4*ecx+12]
			shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			add		ecx, 4
			cmp		ecx, eax
			mulps	xmm1, xmm5
			subps	xmm0, xmm1
			jl		process4x4
			imul	ecx, edx
			sub		edi, ecx
			neg		eax
		done4x4:		// process left over of the 4 rows
			add		eax, m
			sub		eax, 4
			movaps	xmm1, xmm0
			shufps	xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
			movaps	xmm2, xmm0
			shufps	xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
			movaps	xmm3, xmm0
			shufps	xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
			sub		edi, edx
			movss	[esi-4], xmm3			// xptr[-1] = s3
			movss	xmm4, xmm3
			movss	xmm5, xmm3
			mulss	xmm3, [edi+8]			// lptr[-1*nc+2] * s3
			mulss	xmm4, [edi+4]			// lptr[-1*nc+1] * s3
			mulss	xmm5, [edi+0]			// lptr[-1*nc+0] * s3
			subss	xmm2, xmm3
			movss	[esi-8], xmm2			// xptr[-2] = s2
			movss	xmm6, xmm2
			sub		edi, edx
			subss	xmm0, xmm5
			subss	xmm1, xmm4
			mulss	xmm2, [edi+4]			// lptr[-2*nc+1] * s2
			mulss	xmm6, [edi+0]			// lptr[-2*nc+0] * s2
			subss	xmm1, xmm2
			movss	[esi-12], xmm1			// xptr[-3] = s1
			subss	xmm0, xmm6
			sub		edi, edx
			cmp		eax, 4
			mulss	xmm1, [edi+0]			// lptr[-3*nc+0] * s1
			subss	xmm0, xmm1
			movss	[esi-16], xmm0			// xptr[-4] = s0
			jl		done4rows
			sub		edi, edx
			sub		edi, 16
			sub		esi, 16
			jmp		process4rows
		done4rows:
			pop		ebx
		}
	}

	// process left over rows (the m&3 rows the 4-wide loop did not cover)
	for ( i = (m&3)-1; i >= 0; i-- ) {
		s0 = b[i];
		lptr = L[0] + i;
		for ( j = i + 1; j < n; j++ ) {
			s0 -= lptr[j*nc] * x[j];
		}
		x[i] = s0;
	}

#else

	// reference C implementation of the same blocked back substitution
	// (dead code: the #if 1 SSE path above is compiled instead)
	int i, j, m;
	double s0, s1, s2, s3, t;
	const float *lptr2;
	float *xptr, *xptr2;

	m = n;
	if ( m & 1 ) {

		m--;
		x[m] = b[m];

		lptr = L.ToFloatPtr() + m * nc + m - 4;
		xptr = x + m;
		// process 4 rows at a time
		for ( i = m; i >= 4; i -= 4 ) {
			s0 = b[i-4];
			s1 = b[i-3];
			s2 = b[i-2];
			s3 = b[i-1];
			// process 4x4 blocks
			xptr2 = xptr;	// x + i;
			lptr2 = lptr;	// ptr = L[i] + i - 4;
			for ( j = 0; j < m-i; j += 4 ) {
				t = xptr2[0];
				s0 -= lptr2[0] * t;
				s1 -= lptr2[1] * t;
				s2 -= lptr2[2] * t;
				s3 -= lptr2[3] * t;
				lptr2 += nc;
				xptr2++;
				t = xptr2[0];
				s0 -= lptr2[0] * t;
				s1 -= lptr2[1] * t;
				s2 -= lptr2[2] * t;
				s3 -= lptr2[3] * t;
				lptr2 += nc;
				xptr2++;
				t = xptr2[0];
				s0 -= lptr2[0] * t;
				s1 -= lptr2[1] * t;
				s2 -= lptr2[2] * t;
				s3 -= lptr2[3] * t;
				lptr2 += nc;
				xptr2++;
				t = xptr2[0];
				s0 -= lptr2[0] * t;
				s1 -= lptr2[1] * t;
				s2 -= lptr2[2] * t;
				s3 -= lptr2[3] * t;
				lptr2 += nc;
				xptr2++;
			}
			// the peeled x[m] column
			t = xptr2[0];
			s0 -= lptr2[0] * t;
			s1 -= lptr2[1] * t;
			s2 -= lptr2[2] * t;
			s3 -= lptr2[3] * t;
			// process left over of the 4 rows
			lptr -= nc;
			s0 -= lptr[0] * s3;
			s1 -= lptr[1] * s3;
			s2 -= lptr[2] * s3;
			lptr -= nc;
			s0 -= lptr[0] * s2;
			s1 -= lptr[1] * s2;
			lptr -= nc;
			s0 -= lptr[0] * s1;
			lptr -= nc;
			// store result
			xptr[-4] = s0;
			xptr[-3] = s1;
			xptr[-2] = s2;
			xptr[-1] = s3;
			// update pointers for next four rows
			lptr -= 4;
			xptr -= 4;
		}

	} else {

		lptr = L.ToFloatPtr() + m * nc + m - 4;
		xptr = x + m;
		// process 4 rows at a time
		for ( i = m; i >= 4; i -= 4 ) {
			s0 = b[i-4];
			s1 = b[i-3];
			s2 = b[i-2];
			s3 = b[i-1];
			// process 4x4 blocks
			xptr2 = xptr;	// x + i;
			lptr2 = lptr;	// ptr = L[i] + i - 4;
			for ( j = 0; j < m-i; j += 4 ) {
				t = xptr2[0];
				s0 -= lptr2[0] * t;
				s1 -= lptr2[1] * t;
				s2 -= lptr2[2] * t;
				s3 -= lptr2[3] * t;
				lptr2 += nc;
				xptr2++;
				t = xptr2[0];
				s0 -= lptr2[0] * t;
				s1 -= lptr2[1] * t;
				s2 -= lptr2[2] * t;
				s3 -= lptr2[3] * t;
				lptr2 += nc;
				xptr2++;
				t = xptr2[0];
				s0 -= lptr2[0] * t;
				s1 -= lptr2[1] * t;
				s2 -= lptr2[2] * t;
				s3 -= lptr2[3] * t;
				lptr2 += nc;
				xptr2++;
				t = xptr2[0];
				s0 -= lptr2[0] * t;
				s1 -= lptr2[1] * t;
				s2 -= lptr2[2] * t;
				s3 -= lptr2[3] * t;
				lptr2 += nc;
				xptr2++;
			}
			// process left over of the 4 rows
			lptr -= nc;
			s0 -= lptr[0] * s3;
			s1 -= lptr[1] * s3;
			s2 -= lptr[2] * s3;
			lptr -= nc;
			s0 -= lptr[0] * s2;
			s1 -= lptr[1] * s2;
			lptr -= nc;
			s0 -= lptr[0] * s1;
			lptr -= nc;
			// store result
			xptr[-4] = s0;
			xptr[-3] = s1;
			xptr[-2] = s2;
			xptr[-1] = s3;
			// update pointers for next four rows
			lptr -= 4;
			xptr -= 4;
		}
	}
	// process left over rows
	for ( i--; i >= 0; i-- ) {
		s0 = b[i];
		lptr = L[0] + i;
		// NOTE(review): the active #if path above bounds this loop with j < n;
		// with odd n (m == n-1) the j < m bound here would skip the x[m] term —
		// TODO confirm, though this branch is dead code under #if 1
		for ( j = i + 1; j < m; j++ ) {
			s0 -= lptr[j*nc] * x[j];
		}
		x[i] = s0;
	}

#endif
}
10811 
10812 /*
10813 ============
10814 idSIMD_SSE::MatX_LDLTFactor
10815 
10816  in-place factorization LDL' of the n * n sub-matrix of mat
10817  the reciprocal of the diagonal elements are stored in invDiag
10818  currently assumes the number of columns of mat is a multiple of 4
10819 ============
10820 */
bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
#if 1

	int j, nc;
	float *v, *diag, *invDiagPtr, *mptr;
	double s0, s1, s2, sum, d;

	// scratch: v[k] = diag[k] * mptr[k] (the D*L' column needed for row updates),
	// diag[k] = D[k][k]; both 16-byte aligned for movaps in the asm below
	v = (float *) _alloca16( n * sizeof( float ) );
	diag = (float *) _alloca16( n * sizeof( float ) );
	invDiagPtr = invDiag.ToFloatPtr();

	nc = mat.GetNumColumns();

	assert( ( nc & 3 ) == 0 );

	if ( n <= 0 ) {
		return true;
	}

	// rows 0..3 are factored in plain C (the asm loop needs i >= 4)

	mptr = mat[0];

	sum = mptr[0];

	if ( sum == 0.0f ) {
		return false;
	}

	diag[0] = sum;
	invDiagPtr[0] = d = 1.0f / sum;

	if ( n <= 1 ) {
		return true;
	}

	mptr = mat[0];
	for ( j = 1; j < n; j++ ) {
		mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
	}

	mptr = mat[1];

	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	sum = mptr[1] - s0;

	if ( sum == 0.0f ) {
		return false;
	}

	mat[1][1] = sum;
	diag[1] = sum;
	invDiagPtr[1] = d = 1.0f / sum;

	if ( n <= 2 ) {
		return true;
	}

	mptr = mat[0];
	for ( j = 2; j < n; j++ ) {
		mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
	}

	mptr = mat[2];

	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
	sum = mptr[2] - s0 - s1;

	if ( sum == 0.0f ) {
		return false;
	}

	mat[2][2] = sum;
	diag[2] = sum;
	invDiagPtr[2] = d = 1.0f / sum;

	if ( n <= 3 ) {
		return true;
	}

	mptr = mat[0];
	for ( j = 3; j < n; j++ ) {
		mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
	}

	mptr = mat[3];

	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
	v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
	sum = mptr[3] - s0 - s1 - s2;

	if ( sum == 0.0f ) {
		return false;
	}

	mat[3][3] = sum;
	diag[3] = sum;
	invDiagPtr[3] = d = 1.0f / sum;

	if ( n <= 4 ) {
		return true;
	}

	mptr = mat[0];
	for ( j = 4; j < n; j++ ) {
		mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
	}

	int ncf = nc * sizeof( float );
	mptr = mat[0];

	// rows 4..n-1: for each row i compute
	//   v[k]  = diag[k] * mat[i][k]           (stored via the first mulps)
	//   sum   = dot( v[0..i-1], mat[i][0..i-1] )  (the second mulps squares in mptr)
	//   diag[i] = mat[i][i] - sum, invDiag[i] = 1/diag[i]
	// then update column i of every row below: mat[j][i] = (mat[j][i] - dot(v, mat[j])) * invDiag[i]
	__asm {
		// clear upper lanes so the partial movlps/movss tails below
		// don't accumulate garbage from previous iterations
		xorps	xmm2, xmm2
		xorps	xmm3, xmm3
		xorps	xmm4, xmm4

		push	ebx
		mov		ebx, 4

	loopRow:
			cmp		ebx, n
			jge		done

			mov		ecx, ebx				// ecx = i
			shl		ecx, 2					// ecx = i * 4
			mov		edx, diag				// edx = diag
			add		edx, ecx				// edx = &diag[i]
			mov		edi, ebx				// edi = i
			imul	edi, ncf				// edi = i * nc * sizeof( float )
			add		edi, mptr				// edi = mat[i]
			add		edi, ecx				// edi = &mat[i][i]
			mov		esi, v					// esi = v
			add		esi, ecx				// esi = &v[i]
			mov		eax, invDiagPtr			// eax = invDiagPtr
			add		eax, ecx				// eax = &invDiagPtr[i]
			neg		ecx

			movaps	xmm0, [edx+ecx]
			mulps	xmm0, [edi+ecx]			// v = diag * mat[i]
			movaps	[esi+ecx], xmm0
			mulps	xmm0, [edi+ecx]			// second multiply: accumulate v * mat[i]
			add		ecx, 12*4
			jg		doneDot8
		dot8:
			movaps	xmm1, [edx+ecx-(8*4)]
			mulps	xmm1, [edi+ecx-(8*4)]
			movaps	[esi+ecx-(8*4)], xmm1
			mulps	xmm1, [edi+ecx-(8*4)]
			addps	xmm0, xmm1
			movaps	xmm2, [edx+ecx-(4*4)]
			mulps	xmm2, [edi+ecx-(4*4)]
			movaps	[esi+ecx-(4*4)], xmm2
			mulps	xmm2, [edi+ecx-(4*4)]
			addps	xmm0, xmm2
			add		ecx, 8*4
			jle		dot8
		doneDot8:
			sub		ecx, 4*4
			jg		doneDot4
			movaps	xmm1, [edx+ecx-(4*4)]
			mulps	xmm1, [edi+ecx-(4*4)]
			movaps	[esi+ecx-(4*4)], xmm1
			mulps	xmm1, [edi+ecx-(4*4)]
			addps	xmm0, xmm1
			add		ecx, 4*4
		doneDot4:
			sub		ecx, 2*4
			jg		doneDot2
			movlps	xmm3, [edx+ecx-(2*4)]
			movlps	xmm4, [edi+ecx-(2*4)]
			mulps	xmm3, xmm4
			movlps	[esi+ecx-(2*4)], xmm3
			mulps	xmm3, xmm4
			addps	xmm0, xmm3
			add		ecx, 2*4
		doneDot2:
			sub		ecx, 1*4
			jg		doneDot1
			movss	xmm3, [edx+ecx-(1*4)]
			movss	xmm4, [edi+ecx-(1*4)]
			mulss	xmm3, xmm4
			movss	[esi+ecx-(1*4)], xmm3
			mulss	xmm3, xmm4
			addss	xmm0, xmm3
		doneDot1:
			// horizontal add of the four partial sums
			movhlps	xmm2, xmm0
			addps	xmm0, xmm2
			movaps	xmm2, xmm0
			shufps	xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
			addss	xmm0, xmm2
			movss	xmm1, [edi]
			subss	xmm1, xmm0
			movss	[edi], xmm1				// mptr[i] = sum;
			movss	[edx], xmm1				// diag[i] = sum;

			// NOTE: unlike the C reference path (which returns false on a zero
			// pivot), here a zero sum is replaced by SIMD_SP_tiny so the
			// reciprocal below stays finite and the loop continues
			movaps	xmm2, xmm1
			cmpeqss	xmm2, SIMD_SP_zero
			andps	xmm2, SIMD_SP_tiny
			orps	xmm1, xmm2

			// rcpss estimate refined with one Newton-Raphson step: r = 2*r - x*r*r
			rcpss	xmm7, xmm1
			mulss	xmm1, xmm7
			mulss	xmm1, xmm7
			addss	xmm7, xmm7
			subss	xmm7, xmm1
			movss	[eax], xmm7				// invDiagPtr[i] = 1.0f / sum;

			mov		edx, n					// edx = n
			sub		edx, ebx				// edx = n - i
			dec		edx						// edx = n - i - 1
			jle		doneSubRow				// if ( i + 1 >= n ) return true;

			mov		eax, ebx				// eax = i
			shl		eax, 2					// eax = i * 4
			neg		eax

		loopSubRow:
				// mat[j][i] = ( mat[j][i] - dot( v, mat[j][0..i-1] ) ) * invDiag[i]
				add		edi, ncf
				mov		ecx, eax
				movaps	xmm0, [esi+ecx]
				mulps	xmm0, [edi+ecx]
				add		ecx, 12*4
				jg		doneSubDot8
			subDot8:
				movaps	xmm1, [esi+ecx-(8*4)]
				mulps	xmm1, [edi+ecx-(8*4)]
				addps	xmm0, xmm1
				movaps	xmm2, [esi+ecx-(4*4)]
				mulps	xmm2, [edi+ecx-(4*4)]
				addps	xmm0, xmm2
				add		ecx, 8*4
				jle		subDot8
			doneSubDot8:
				sub		ecx, 4*4
				jg		doneSubDot4
				movaps	xmm1, [esi+ecx-(4*4)]
				mulps	xmm1, [edi+ecx-(4*4)]
				addps	xmm0, xmm1
				add		ecx, 4*4
			doneSubDot4:
				sub		ecx, 2*4
				jg		doneSubDot2
				movlps	xmm3, [esi+ecx-(2*4)]
				movlps	xmm4, [edi+ecx-(2*4)]
				mulps	xmm3, xmm4
				addps	xmm0, xmm3
				add		ecx, 2*4
			doneSubDot2:
				sub		ecx, 1*4
				jg		doneSubDot1
				movss	xmm3, [esi+ecx-(1*4)]
				movss	xmm4, [edi+ecx-(1*4)]
				mulss	xmm3, xmm4
				addss	xmm0, xmm3
			doneSubDot1:
				movhlps	xmm2, xmm0
				addps	xmm0, xmm2
				movaps	xmm2, xmm0
				shufps	xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
				addss	xmm0, xmm2
				movss	xmm1, [edi]
				subss	xmm1, xmm0
				mulss	xmm1, xmm7
				movss	[edi], xmm1
				dec		edx
				jg		loopSubRow
		doneSubRow:
			inc		ebx
			jmp		loopRow
	done:
		pop		ebx
	}

	return true;

#else

	// reference C implementation (dead code: the #if 1 SSE path is compiled)
	int i, j, k, nc;
	float *v, *diag, *mptr;
	double s0, s1, s2, s3, sum, d;

	v = (float *) _alloca16( n * sizeof( float ) );
	diag = (float *) _alloca16( n * sizeof( float ) );

	nc = mat.GetNumColumns();

	if ( n <= 0 ) {
		return true;
	}

	mptr = mat[0];

	sum = mptr[0];

	if ( sum == 0.0f ) {
		return false;
	}

	diag[0] = sum;
	invDiag[0] = d = 1.0f / sum;

	if ( n <= 1 ) {
		return true;
	}

	mptr = mat[0];
	for ( j = 1; j < n; j++ ) {
		mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
	}

	mptr = mat[1];

	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	sum = mptr[1] - s0;

	if ( sum == 0.0f ) {
		return false;
	}

	mat[1][1] = sum;
	diag[1] = sum;
	invDiag[1] = d = 1.0f / sum;

	if ( n <= 2 ) {
		return true;
	}

	mptr = mat[0];
	for ( j = 2; j < n; j++ ) {
		mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
	}

	mptr = mat[2];

	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
	sum = mptr[2] - s0 - s1;

	if ( sum == 0.0f ) {
		return false;
	}

	mat[2][2] = sum;
	diag[2] = sum;
	invDiag[2] = d = 1.0f / sum;

	if ( n <= 3 ) {
		return true;
	}

	mptr = mat[0];
	for ( j = 3; j < n; j++ ) {
		mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
	}

	mptr = mat[3];

	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
	v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
	sum = mptr[3] - s0 - s1 - s2;

	if ( sum == 0.0f ) {
		return false;
	}

	mat[3][3] = sum;
	diag[3] = sum;
	invDiag[3] = d = 1.0f / sum;

	if ( n <= 4 ) {
		return true;
	}

	mptr = mat[0];
	for ( j = 4; j < n; j++ ) {
		mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
	}

	for ( i = 4; i < n; i++ ) {

		mptr = mat[i];

		v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
		v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
		v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
		v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
		for ( k = 4; k < i-3; k += 4 ) {
			v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
			v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
			v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
			v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
		}
		// leftover elements; which partial sum receives each term is
		// irrelevant since s0..s3 are summed together below
		switch( i - k ) {
			case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
			case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
			case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
		}
		sum = s3;
		sum += s2;
		sum += s1;
		sum += s0;
		sum = mptr[i] - sum;

		if ( sum == 0.0f ) {
			return false;
		}

		mat[i][i] = sum;
		diag[i] = sum;
		invDiag[i] = d = 1.0f / sum;

		if ( i + 1 >= n ) {
			return true;
		}

		mptr = mat[i+1];
		for ( j = i+1; j < n; j++ ) {
			s0 = mptr[0] * v[0];
			s1 = mptr[1] * v[1];
			s2 = mptr[2] * v[2];
			s3 = mptr[3] * v[3];
			for ( k = 4; k < i-7; k += 8 ) {
				s0 += mptr[k+0] * v[k+0];
				s1 += mptr[k+1] * v[k+1];
				s2 += mptr[k+2] * v[k+2];
				s3 += mptr[k+3] * v[k+3];
				s0 += mptr[k+4] * v[k+4];
				s1 += mptr[k+5] * v[k+5];
				s2 += mptr[k+6] * v[k+6];
				s3 += mptr[k+7] * v[k+7];
			}
			switch( i - k ) {
				case 7: s0 += mptr[k+6] * v[k+6];
				case 6: s1 += mptr[k+5] * v[k+5];
				case 5: s2 += mptr[k+4] * v[k+4];
				case 4: s3 += mptr[k+3] * v[k+3];
				case 3: s0 += mptr[k+2] * v[k+2];
				case 2: s1 += mptr[k+1] * v[k+1];
				case 1: s2 += mptr[k+0] * v[k+0];
			}
			sum = s3;
			sum += s2;
			sum += s1;
			sum += s0;
			mptr[i] = ( mptr[i] - sum ) * d;
			mptr += nc;
		}
	}

	return true;

#endif
}
11276 
11277 /*
11278 ============
11279 idSIMD_SSE::BlendJoints
11280 ============
11281 */
11282 #define REFINE_BLENDJOINTS_RECIPROCAL
11283 
// Spherically blends each indexed joint in 'joints' towards the corresponding
// joint in 'blendJoints' by fraction 'lerp' (0..1).
//   joints      - in/out: joint translations/rotations to be blended in place
//   blendJoints - target pose to blend towards
//   lerp        - blend fraction; <= 0 leaves 'joints' untouched, >= 1 copies the target
//   index       - list of joint numbers to blend
//   numJoints   - number of entries in 'index'
// Translations are linearly interpolated; quaternions are slerped (shortest arc,
// sign-flip handled via the cosine's sign bit). Joints are processed four at a
// time in struct-of-arrays form by the SSE path; the remainder is done scalar.
void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
	int i;

	// trivial cases: no blend at all, or full copy of the target pose
	if ( lerp <= 0.0f ) {
		return;
	} else if ( lerp >= 1.0f ) {
		for ( i = 0; i < numJoints; i++ ) {
			int j = index[i];
			joints[j] = blendJoints[j];
		}
		return;
	}

	// blocked path: four joints per iteration, transposed into SoA arrays
	for ( i = 0; i <= numJoints - 4; i += 4 ) {
		ALIGN16( float jointVert0[4] );
		ALIGN16( float jointVert1[4] );
		ALIGN16( float jointVert2[4] );
		ALIGN16( float blendVert0[4] );
		ALIGN16( float blendVert1[4] );
		ALIGN16( float blendVert2[4] );
		ALIGN16( float jointQuat0[4] );
		ALIGN16( float jointQuat1[4] );
		ALIGN16( float jointQuat2[4] );
		ALIGN16( float jointQuat3[4] );
		ALIGN16( float blendQuat0[4] );
		ALIGN16( float blendQuat1[4] );
		ALIGN16( float blendQuat2[4] );
		ALIGN16( float blendQuat3[4] );

		// gather: transpose 4 joints (AoS) into per-component arrays (SoA)
		for ( int j = 0; j < 4; j++ ) {
			int n = index[i+j];

			jointVert0[j] = joints[n].t[0];
			jointVert1[j] = joints[n].t[1];
			jointVert2[j] = joints[n].t[2];

			blendVert0[j] = blendJoints[n].t[0];
			blendVert1[j] = blendJoints[n].t[1];
			blendVert2[j] = blendJoints[n].t[2];

			jointQuat0[j] = joints[n].q[0];
			jointQuat1[j] = joints[n].q[1];
			jointQuat2[j] = joints[n].q[2];
			jointQuat3[j] = joints[n].q[3];

			blendQuat0[j] = blendJoints[n].q[0];
			blendQuat1[j] = blendJoints[n].q[1];
			blendQuat2[j] = blendJoints[n].q[2];
			blendQuat3[j] = blendJoints[n].q[3];
		}

#if 1
		__asm {
			// lerp translation
			movss		xmm7, lerp
			shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
			movaps		xmm0, blendVert0
			subps		xmm0, jointVert0
			mulps		xmm0, xmm7
			addps		xmm0, jointVert0
			movaps		jointVert0, xmm0
			movaps		xmm1, blendVert1
			subps		xmm1, jointVert1
			mulps		xmm1, xmm7
			addps		xmm1, jointVert1
			movaps		jointVert1, xmm1
			movaps		xmm2, blendVert2
			subps		xmm2, jointVert2
			mulps		xmm2, xmm7
			addps		xmm2, jointVert2
			movaps		jointVert2, xmm2

			// lerp quaternions
			// 4-wide dot product of the two quaternions
			movaps		xmm0, jointQuat0
			mulps		xmm0, blendQuat0
			movaps		xmm1, jointQuat1
			mulps		xmm1, blendQuat1
			addps		xmm0, xmm1
			movaps		xmm2, jointQuat2
			mulps		xmm2, blendQuat2
			addps		xmm0, xmm2
			movaps		xmm3, jointQuat3
			mulps		xmm3, blendQuat3
			addps		xmm0, xmm3				// xmm0 = cosom

			// take |cosom| and remember its sign so the blend quat can be negated
			// later (shortest-arc slerp)
			movaps		xmm1, xmm0
			movaps		xmm2, xmm0
			andps		xmm1, SIMD_SP_signBitMask	// xmm1 = signBit
			xorps		xmm0, xmm1
			mulps		xmm2, xmm2

			xorps		xmm4, xmm4
			movaps		xmm3, SIMD_SP_one
			subps		xmm3, xmm2				// xmm3 = scale0
			cmpeqps		xmm4, xmm3
			andps		xmm4, SIMD_SP_tiny			// if values are zero replace them with a tiny number
			andps		xmm3, SIMD_SP_absMask			// make sure the values are positive
			orps		xmm3, xmm4

#ifdef REFINE_BLENDJOINTS_RECIPROCAL
			// rsqrtps followed by one refinement step using the rsqrt_c0/c1 constants
			movaps		xmm2, xmm3
			rsqrtps		xmm4, xmm2
			mulps		xmm2, xmm4
			mulps		xmm2, xmm4
			subps		xmm2, SIMD_SP_rsqrt_c0
			mulps		xmm4, SIMD_SP_rsqrt_c1
			mulps		xmm2, xmm4
#else
			rsqrtps		xmm2, xmm3				// xmm2 = sinom
#endif
			mulps		xmm3, xmm2				// xmm3 = sqrt( scale0 )

			// omega0 = atan2( xmm3, xmm0 )
			// order the operands so the ratio stays <= 1, remember which was larger
			movaps		xmm4, xmm0
			minps		xmm0, xmm3
			maxps		xmm3, xmm4
			cmpeqps		xmm4, xmm0

#ifdef REFINE_BLENDJOINTS_RECIPROCAL
			// rcpps with one refinement step: 1/x ~= rcp(x) * ( 2 - x * rcp(x) )
			rcpps		xmm5, xmm3
			mulps		xmm3, xmm5
			mulps		xmm3, xmm5
			addps		xmm5, xmm5
			subps		xmm5, xmm3				// xmm5 = 1 / y or 1 / x
			mulps		xmm0, xmm5				// xmm0 = x / y or y / x
#else
			rcpps		xmm3, xmm3				// xmm3 = 1 / y or 1 / x
			mulps		xmm0, xmm3				// xmm0 = x / y or y / x
#endif
			// fold the argument back into the right octant
			movaps		xmm3, xmm4
			andps		xmm3, SIMD_SP_signBitMask
			xorps		xmm0, xmm3				// xmm0 = -x / y or y / x
			andps		xmm4, SIMD_SP_halfPI			// xmm4 = HALF_PI or 0.0f
			// polynomial approximation of atan using the SIMD_SP_atan_c0..c7 coefficients
			movaps		xmm3, xmm0
			mulps		xmm3, xmm3				// xmm3 = s
			movaps		xmm5, SIMD_SP_atan_c0
			mulps		xmm5, xmm3
			addps		xmm5, SIMD_SP_atan_c1
			mulps		xmm5, xmm3
			addps		xmm5, SIMD_SP_atan_c2
			mulps		xmm5, xmm3
			addps		xmm5, SIMD_SP_atan_c3
			mulps		xmm5, xmm3
			addps		xmm5, SIMD_SP_atan_c4
			mulps		xmm5, xmm3
			addps		xmm5, SIMD_SP_atan_c5
			mulps		xmm5, xmm3
			addps		xmm5, SIMD_SP_atan_c6
			mulps		xmm5, xmm3
			addps		xmm5, SIMD_SP_atan_c7
			mulps		xmm5, xmm3
			addps		xmm5, SIMD_SP_one
			mulps		xmm5, xmm0
			addps		xmm5, xmm4				// xmm5 = omega0

			// split omega into the two slerp angles
			movaps		xmm6, xmm7				// xmm6 = lerp
			mulps		xmm6, xmm5				// xmm6 = omega1
			subps		xmm5, xmm6				// xmm5 = omega0

			// scale0 = sin( xmm5 ) * xmm2
			// scale1 = sin( xmm6 ) * xmm2
			// both sines evaluated with the same SIMD_SP_sin_c0..c4 polynomial
			movaps		xmm3, xmm5
			movaps		xmm7, xmm6
			mulps		xmm3, xmm3
			mulps		xmm7, xmm7
			movaps		xmm4, SIMD_SP_sin_c0
			movaps		xmm0, SIMD_SP_sin_c0
			mulps		xmm4, xmm3
			mulps		xmm0, xmm7
			addps		xmm4, SIMD_SP_sin_c1
			addps		xmm0, SIMD_SP_sin_c1
			mulps		xmm4, xmm3
			mulps		xmm0, xmm7
			addps		xmm4, SIMD_SP_sin_c2
			addps		xmm0, SIMD_SP_sin_c2
			mulps		xmm4, xmm3
			mulps		xmm0, xmm7
			addps		xmm4, SIMD_SP_sin_c3
			addps		xmm0, SIMD_SP_sin_c3
			mulps		xmm4, xmm3
			mulps		xmm0, xmm7
			addps		xmm4, SIMD_SP_sin_c4
			addps		xmm0, SIMD_SP_sin_c4
			mulps		xmm4, xmm3
			mulps		xmm0, xmm7
			addps		xmm4, SIMD_SP_one
			addps		xmm0, SIMD_SP_one
			mulps		xmm5, xmm4
			mulps		xmm6, xmm0
			mulps		xmm5, xmm2				// xmm5 = scale0
			mulps		xmm6, xmm2				// xmm6 = scale1

			// apply the saved sign of cosom to scale1 (negates the blend quat if needed)
			xorps		xmm6, xmm1

			// q = scale0 * jointQuat + scale1 * blendQuat, per component
			movaps		xmm0, jointQuat0
			mulps		xmm0, xmm5
			movaps		xmm1, blendQuat0
			mulps		xmm1, xmm6
			addps		xmm0, xmm1
			movaps		jointQuat0, xmm0

			movaps		xmm1, jointQuat1
			mulps		xmm1, xmm5
			movaps		xmm2, blendQuat1
			mulps		xmm2, xmm6
			addps		xmm1, xmm2
			movaps		jointQuat1, xmm1

			movaps		xmm2, jointQuat2
			mulps		xmm2, xmm5
			movaps		xmm3, blendQuat2
			mulps		xmm3, xmm6
			addps		xmm2, xmm3
			movaps		jointQuat2, xmm2

			movaps		xmm3, jointQuat3
			mulps		xmm3, xmm5
			movaps		xmm4, blendQuat3
			mulps		xmm4, xmm6
			addps		xmm3, xmm4
			movaps		jointQuat3, xmm3
		}

#else

		// reference C implementation of the same 4-wide SoA blend (kept for
		// debugging / non-asm builds)
		jointVert0[0] += lerp * ( blendVert0[0] - jointVert0[0] );
		jointVert0[1] += lerp * ( blendVert0[1] - jointVert0[1] );
		jointVert0[2] += lerp * ( blendVert0[2] - jointVert0[2] );
		jointVert0[3] += lerp * ( blendVert0[3] - jointVert0[3] );

		jointVert1[0] += lerp * ( blendVert1[0] - jointVert1[0] );
		jointVert1[1] += lerp * ( blendVert1[1] - jointVert1[1] );
		jointVert1[2] += lerp * ( blendVert1[2] - jointVert1[2] );
		jointVert1[3] += lerp * ( blendVert1[3] - jointVert1[3] );

		jointVert2[0] += lerp * ( blendVert2[0] - jointVert2[0] );
		jointVert2[1] += lerp * ( blendVert2[1] - jointVert2[1] );
		jointVert2[2] += lerp * ( blendVert2[2] - jointVert2[2] );
		jointVert2[3] += lerp * ( blendVert2[3] - jointVert2[3] );

		ALIGN16( float cosom[4] );
		ALIGN16( float sinom[4] );
		ALIGN16( float omega0[4] );
		ALIGN16( float omega1[4] );
		ALIGN16( float scale0[4] );
		ALIGN16( float scale1[4] );
		ALIGN16( unsigned long signBit[4] );

		cosom[0] = jointQuat0[0] * blendQuat0[0];
		cosom[1] = jointQuat0[1] * blendQuat0[1];
		cosom[2] = jointQuat0[2] * blendQuat0[2];
		cosom[3] = jointQuat0[3] * blendQuat0[3];

		cosom[0] += jointQuat1[0] * blendQuat1[0];
		cosom[1] += jointQuat1[1] * blendQuat1[1];
		cosom[2] += jointQuat1[2] * blendQuat1[2];
		cosom[3] += jointQuat1[3] * blendQuat1[3];

		cosom[0] += jointQuat2[0] * blendQuat2[0];
		cosom[1] += jointQuat2[1] * blendQuat2[1];
		cosom[2] += jointQuat2[2] * blendQuat2[2];
		cosom[3] += jointQuat2[3] * blendQuat2[3];

		cosom[0] += jointQuat3[0] * blendQuat3[0];
		cosom[1] += jointQuat3[1] * blendQuat3[1];
		cosom[2] += jointQuat3[2] * blendQuat3[2];
		cosom[3] += jointQuat3[3] * blendQuat3[3];

		// remember the sign of the dot product and work with its absolute value
		signBit[0] = (*(unsigned long *)&cosom[0]) & ( 1 << 31 );
		signBit[1] = (*(unsigned long *)&cosom[1]) & ( 1 << 31 );
		signBit[2] = (*(unsigned long *)&cosom[2]) & ( 1 << 31 );
		signBit[3] = (*(unsigned long *)&cosom[3]) & ( 1 << 31 );

		(*(unsigned long *)&cosom[0]) ^= signBit[0];
		(*(unsigned long *)&cosom[1]) ^= signBit[1];
		(*(unsigned long *)&cosom[2]) ^= signBit[2];
		(*(unsigned long *)&cosom[3]) ^= signBit[3];

		scale0[0] = 1.0f - cosom[0] * cosom[0];
		scale0[1] = 1.0f - cosom[1] * cosom[1];
		scale0[2] = 1.0f - cosom[2] * cosom[2];
		scale0[3] = 1.0f - cosom[3] * cosom[3];

		// avoid division by zero when the quaternions are (nearly) identical
		scale0[0] = ( scale0[0] <= 0.0f ) ? SIMD_SP_tiny[0] : scale0[0];
		scale0[1] = ( scale0[1] <= 0.0f ) ? SIMD_SP_tiny[1] : scale0[1];
		scale0[2] = ( scale0[2] <= 0.0f ) ? SIMD_SP_tiny[2] : scale0[2];
		scale0[3] = ( scale0[3] <= 0.0f ) ? SIMD_SP_tiny[3] : scale0[3];

		sinom[0] = idMath::RSqrt( scale0[0] );
		sinom[1] = idMath::RSqrt( scale0[1] );
		sinom[2] = idMath::RSqrt( scale0[2] );
		sinom[3] = idMath::RSqrt( scale0[3] );

		scale0[0] *= sinom[0];
		scale0[1] *= sinom[1];
		scale0[2] *= sinom[2];
		scale0[3] *= sinom[3];

		omega0[0] = SSE_ATanPositive( scale0[0], cosom[0] );
		omega0[1] = SSE_ATanPositive( scale0[1], cosom[1] );
		omega0[2] = SSE_ATanPositive( scale0[2], cosom[2] );
		omega0[3] = SSE_ATanPositive( scale0[3], cosom[3] );

		omega1[0] = lerp * omega0[0];
		omega1[1] = lerp * omega0[1];
		omega1[2] = lerp * omega0[2];
		omega1[3] = lerp * omega0[3];

		omega0[0] -= omega1[0];
		omega0[1] -= omega1[1];
		omega0[2] -= omega1[2];
		omega0[3] -= omega1[3];

		scale0[0] = SSE_SinZeroHalfPI( omega0[0] ) * sinom[0];
		scale0[1] = SSE_SinZeroHalfPI( omega0[1] ) * sinom[1];
		scale0[2] = SSE_SinZeroHalfPI( omega0[2] ) * sinom[2];
		scale0[3] = SSE_SinZeroHalfPI( omega0[3] ) * sinom[3];

		scale1[0] = SSE_SinZeroHalfPI( omega1[0] ) * sinom[0];
		scale1[1] = SSE_SinZeroHalfPI( omega1[1] ) * sinom[1];
		scale1[2] = SSE_SinZeroHalfPI( omega1[2] ) * sinom[2];
		scale1[3] = SSE_SinZeroHalfPI( omega1[3] ) * sinom[3];

		// re-apply the saved sign to scale1
		(*(unsigned long *)&scale1[0]) ^= signBit[0];
		(*(unsigned long *)&scale1[1]) ^= signBit[1];
		(*(unsigned long *)&scale1[2]) ^= signBit[2];
		(*(unsigned long *)&scale1[3]) ^= signBit[3];

		jointQuat0[0] = scale0[0] * jointQuat0[0] + scale1[0] * blendQuat0[0];
		jointQuat0[1] = scale0[1] * jointQuat0[1] + scale1[1] * blendQuat0[1];
		jointQuat0[2] = scale0[2] * jointQuat0[2] + scale1[2] * blendQuat0[2];
		jointQuat0[3] = scale0[3] * jointQuat0[3] + scale1[3] * blendQuat0[3];

		jointQuat1[0] = scale0[0] * jointQuat1[0] + scale1[0] * blendQuat1[0];
		jointQuat1[1] = scale0[1] * jointQuat1[1] + scale1[1] * blendQuat1[1];
		jointQuat1[2] = scale0[2] * jointQuat1[2] + scale1[2] * blendQuat1[2];
		jointQuat1[3] = scale0[3] * jointQuat1[3] + scale1[3] * blendQuat1[3];

		jointQuat2[0] = scale0[0] * jointQuat2[0] + scale1[0] * blendQuat2[0];
		jointQuat2[1] = scale0[1] * jointQuat2[1] + scale1[1] * blendQuat2[1];
		jointQuat2[2] = scale0[2] * jointQuat2[2] + scale1[2] * blendQuat2[2];
		jointQuat2[3] = scale0[3] * jointQuat2[3] + scale1[3] * blendQuat2[3];

		jointQuat3[0] = scale0[0] * jointQuat3[0] + scale1[0] * blendQuat3[0];
		jointQuat3[1] = scale0[1] * jointQuat3[1] + scale1[1] * blendQuat3[1];
		jointQuat3[2] = scale0[2] * jointQuat3[2] + scale1[2] * blendQuat3[2];
		jointQuat3[3] = scale0[3] * jointQuat3[3] + scale1[3] * blendQuat3[3];

#endif

		// scatter: write the blended SoA data back to the 4 joints
		for ( int j = 0; j < 4; j++ ) {
			int n = index[i+j];

			joints[n].t[0] = jointVert0[j];
			joints[n].t[1] = jointVert1[j];
			joints[n].t[2] = jointVert2[j];

			joints[n].q[0] = jointQuat0[j];
			joints[n].q[1] = jointQuat1[j];
			joints[n].q[2] = jointQuat2[j];
			joints[n].q[3] = jointQuat3[j];
		}
	}

	// scalar path for the remaining 0-3 joints
	for ( ; i < numJoints; i++ ) {
		int n = index[i];

		idVec3 &jointVert = joints[n].t;
		const idVec3 &blendVert = blendJoints[n].t;

		jointVert[0] += lerp * ( blendVert[0] - jointVert[0] );
		jointVert[1] += lerp * ( blendVert[1] - jointVert[1] );
		jointVert[2] += lerp * ( blendVert[2] - jointVert[2] );

		idQuat &jointQuat = joints[n].q;
		const idQuat &blendQuat = blendJoints[n].q;

		float cosom;
		float sinom;
		float omega;
		float scale0;
		float scale1;
		unsigned long signBit;

		cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;

		// shortest-arc: strip the sign here, re-apply it to scale1 below
		signBit = (*(unsigned long *)&cosom) & ( 1 << 31 );

		(*(unsigned long *)&cosom) ^= signBit;

		scale0 = 1.0f - cosom * cosom;
		scale0 = ( scale0 <= 0.0f ) ? SIMD_SP_tiny[0] : scale0;	// guard against 1/0 for identical quats
		sinom = idMath::InvSqrt( scale0 );
		omega = idMath::ATan16( scale0 * sinom, cosom );
		scale0 = idMath::Sin16( ( 1.0f - lerp ) * omega ) * sinom;
		scale1 = idMath::Sin16( lerp * omega ) * sinom;

		(*(unsigned long *)&scale1) ^= signBit;

		jointQuat.x = scale0 * jointQuat.x + scale1 * blendQuat.x;
		jointQuat.y = scale0 * jointQuat.y + scale1 * blendQuat.y;
		jointQuat.z = scale0 * jointQuat.z + scale1 * blendQuat.z;
		jointQuat.w = scale0 * jointQuat.w + scale1 * blendQuat.w;
	}
}
11689 
11690 /*
11691 ============
11692 idSIMD_SSE::ConvertJointQuatsToJointMats
11693 ============
11694 */
11695 void VPCALL idSIMD_SSE::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
11696 
11697  assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
11698  assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
11699  assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );
11700 
11701  for ( int i = 0; i < numJoints; i++ ) {
11702 
11703  const float *q = jointQuats[i].q.ToFloatPtr();
11704  float *m = jointMats[i].ToFloatPtr();
11705 
11706  m[0*4+3] = q[4];
11707  m[1*4+3] = q[5];
11708  m[2*4+3] = q[6];
11709 
11710  float x2 = q[0] + q[0];
11711  float y2 = q[1] + q[1];
11712  float z2 = q[2] + q[2];
11713 
11714  {
11715  float xx = q[0] * x2;
11716  float yy = q[1] * y2;
11717  float zz = q[2] * z2;
11718 
11719  m[0*4+0] = 1.0f - yy - zz;
11720  m[1*4+1] = 1.0f - xx - zz;
11721  m[2*4+2] = 1.0f - xx - yy;
11722  }
11723 
11724  {
11725  float yz = q[1] * z2;
11726  float wx = q[3] * x2;
11727 
11728  m[2*4+1] = yz - wx;
11729  m[1*4+2] = yz + wx;
11730  }
11731 
11732  {
11733  float xy = q[0] * y2;
11734  float wz = q[3] * z2;
11735 
11736  m[1*4+0] = xy - wz;
11737  m[0*4+1] = xy + wz;
11738  }
11739 
11740  {
11741  float xz = q[0] * z2;
11742  float wy = q[3] * y2;
11743 
11744  m[0*4+2] = xz - wy;
11745  m[2*4+0] = xz + wy;
11746  }
11747  }
11748 }
11749 
11750 /*
11751 ============
11752 idSIMD_SSE::ConvertJointMatsToJointQuats
11753 ============
11754 */
// Converts an array of 3x4 joint matrices back to joint quaternions using
// Shoemake-style matrix-to-quaternion extraction: the largest of the trace /
// diagonal elements selects one of four cases; the asm path encodes the case
// as a branchless component shuffle table plus sign masks.
//   jointQuats - out: numJoints quaternions (translation in q[4..6])
//   jointMats  - in:  numJoints matrices
//   numJoints  - number of joints to convert
void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {

	// layout assumptions: fixed struct sizes and the translation packed
	// immediately behind the quaternion
	assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
	assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );

#if 1

	// per-lane byte table of destination component indices k0..k3, filled by
	// the branchless case selection below
	ALIGN16( byte shuffle[16] );

	__asm {
		mov			eax, numJoints
		mov			esi, jointMats
		mov			edi, jointQuats
		and			eax, ~3				// process joints four at a time first
		jz			done4
		imul		eax, JOINTMAT_SIZE
		add			esi, eax
		neg			eax					// eax counts up towards zero

	loopMat4:
		// gather the diagonal elements m00/m11/m22 of 4 matrices into xmm5/6/7,
		// rotating each register so lane order matches joint order
		movss		xmm5, [esi+eax+3*JOINTMAT_SIZE+0*16+0*4]
		movss		xmm6, [esi+eax+3*JOINTMAT_SIZE+1*16+1*4]
		movss		xmm7, [esi+eax+3*JOINTMAT_SIZE+2*16+2*4]

		shufps		xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )

		movss		xmm0, [esi+eax+2*JOINTMAT_SIZE+0*16+0*4]
		movss		xmm1, [esi+eax+2*JOINTMAT_SIZE+1*16+1*4]
		movss		xmm2, [esi+eax+2*JOINTMAT_SIZE+2*16+2*4]

		movss		xmm5, xmm0
		movss		xmm6, xmm1
		movss		xmm7, xmm2

		shufps		xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )

		movss		xmm0, [esi+eax+1*JOINTMAT_SIZE+0*16+0*4]
		movss		xmm1, [esi+eax+1*JOINTMAT_SIZE+1*16+1*4]
		movss		xmm2, [esi+eax+1*JOINTMAT_SIZE+2*16+2*4]

		movss		xmm5, xmm0
		movss		xmm6, xmm1
		movss		xmm7, xmm2

		shufps		xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )

		movss		xmm0, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
		movss		xmm1, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
		movss		xmm2, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]

		movss		xmm5, xmm0
		movss		xmm6, xmm1
		movss		xmm7, xmm2

		// -------------------
		// branchless selection of the Shoemake case per lane

		movaps		xmm0, xmm5
		addps		xmm0, xmm6
		addps		xmm0, xmm7
		cmpnltps	xmm0, SIMD_SP_zero					// xmm0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f

		movaps		xmm1, xmm5
		movaps		xmm2, xmm5
		cmpnltps	xmm1, xmm6
		cmpnltps	xmm2, xmm7
		andps		xmm2, xmm1							// xmm2 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2]

		movaps		xmm4, xmm6
		cmpnltps	xmm4, xmm7							// xmm4 = m[1 * 4 + 1] > m[2 * 4 + 2]

		// turn the three comparisons into four mutually exclusive case masks
		movaps		xmm1, xmm0
		andnps		xmm1, xmm2
		orps		xmm2, xmm0
		movaps		xmm3, xmm2
		andnps		xmm2, xmm4
		orps		xmm3, xmm2
		xorps		xmm3, SIMD_SP_not

		// select the component-index shuffle bytes for each case
		andps		xmm0, SIMD_DW_mat2quatShuffle0
		movaps		xmm4, xmm1
		andps		xmm4, SIMD_DW_mat2quatShuffle1
		orps		xmm0, xmm4
		movaps		xmm4, xmm2
		andps		xmm4, SIMD_DW_mat2quatShuffle2
		orps		xmm0, xmm4
		movaps		xmm4, xmm3
		andps		xmm4, SIMD_DW_mat2quatShuffle3
		orps		xmm4, xmm0

		movaps		shuffle, xmm4

		// build the sign masks s0/s1/s2 for each case
		movaps		xmm0, xmm2
		orps		xmm0, xmm3							// xmm0 = xmm2 | xmm3 = s0
		orps		xmm2, xmm1							// xmm2 = xmm1 | xmm2 = s2
		orps		xmm1, xmm3							// xmm1 = xmm1 | xmm3 = s1

		andps		xmm0, SIMD_SP_signBitMask
		andps		xmm1, SIMD_SP_signBitMask
		andps		xmm2, SIMD_SP_signBitMask

		// t = s0 * m00 + s1 * m11 + s2 * m22 + 1
		xorps		xmm5, xmm0
		xorps		xmm6, xmm1
		xorps		xmm7, xmm2
		addps		xmm5, xmm6
		addps		xmm7, SIMD_SP_one
		addps		xmm5, xmm7							// xmm5 = t

		// s = 0.5 / sqrt( t ): rsqrt plus one refinement step via the
		// rsqrt_c0 / mat2quat_rsqrt_c1 constants
		movaps		xmm7, xmm5							// xmm7 = t
		rsqrtps		xmm6, xmm5
		mulps		xmm5, xmm6
		mulps		xmm5, xmm6
		subps		xmm5, SIMD_SP_rsqrt_c0
		mulps		xmm6, SIMD_SP_mat2quat_rsqrt_c1
		mulps		xmm6, xmm5							// xmm6 = s

		mulps		xmm7, xmm6							// xmm7 = s * t
		xorps		xmm6, SIMD_SP_signBitMask			// xmm6 = -s

		// -------------------
		// scatter the four quaternions; shuffle[] redirects each value to its
		// case-dependent component slot

		add			edi, 4*JOINTQUAT_SIZE

		movzx		ecx, byte ptr shuffle[0*4+0]		// ecx = k0
		movss		[edi+ecx*4-4*JOINTQUAT_SIZE], xmm7	// q[k0] = s * t;

		movzx		edx, byte ptr shuffle[0*4+1]		// edx = k1
		movss		xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
		xorps		xmm4, xmm2
		subss		xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-4*JOINTQUAT_SIZE], xmm4	// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

		movzx		ecx, byte ptr shuffle[0*4+2]		// ecx = k2
		movss		xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
		xorps		xmm3, xmm1
		subss		xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
		mulss		xmm3, xmm6
		movss		[edi+ecx*4-4*JOINTQUAT_SIZE], xmm3	// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

		movzx		edx, byte ptr shuffle[0*4+3]		// edx = k3
		movss		xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
		xorps		xmm4, xmm0
		subss		xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-4*JOINTQUAT_SIZE], xmm4	// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

		// copy the translation column as raw 32-bit words
		mov			ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
		mov			[edi-4*JOINTQUAT_SIZE+16], ecx		// q[4] = m[0 * 4 + 3];
		mov			edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
		mov			[edi-4*JOINTQUAT_SIZE+20], edx		// q[5] = m[1 * 4 + 3];
		mov			ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
		mov			[edi-4*JOINTQUAT_SIZE+24], ecx		// q[6] = m[2 * 4 + 3];

		// rotate the per-lane values into position for the next joint
		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movzx		ecx, byte ptr shuffle[1*4+0]		// ecx = k0
		movss		[edi+ecx*4-3*JOINTQUAT_SIZE], xmm7	// q[k0] = s * t;

		movzx		edx, byte ptr shuffle[1*4+1]		// edx = k1
		movss		xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+0*4]
		xorps		xmm4, xmm2
		subss		xmm4, [esi+eax+1*JOINTMAT_SIZE+0*16+1*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-3*JOINTQUAT_SIZE], xmm4	// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

		movzx		ecx, byte ptr shuffle[1*4+2]		// ecx = k2
		movss		xmm3, [esi+eax+1*JOINTMAT_SIZE+0*16+2*4]
		xorps		xmm3, xmm1
		subss		xmm3, [esi+eax+1*JOINTMAT_SIZE+2*16+0*4]
		mulss		xmm3, xmm6
		movss		[edi+ecx*4-3*JOINTQUAT_SIZE], xmm3	// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

		movzx		edx, byte ptr shuffle[1*4+3]		// edx = k3
		movss		xmm4, [esi+eax+1*JOINTMAT_SIZE+2*16+1*4]
		xorps		xmm4, xmm0
		subss		xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+2*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-3*JOINTQUAT_SIZE], xmm4	// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

		mov			ecx, [esi+eax+1*JOINTMAT_SIZE+0*16+3*4]
		mov			[edi-3*JOINTQUAT_SIZE+16], ecx		// q[4] = m[0 * 4 + 3];
		mov			edx, [esi+eax+1*JOINTMAT_SIZE+1*16+3*4]
		mov			[edi-3*JOINTQUAT_SIZE+20], edx		// q[5] = m[1 * 4 + 3];
		mov			ecx, [esi+eax+1*JOINTMAT_SIZE+2*16+3*4]
		mov			[edi-3*JOINTQUAT_SIZE+24], ecx		// q[6] = m[2 * 4 + 3];

		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movzx		ecx, byte ptr shuffle[2*4+0]		// ecx = k0
		movss		[edi+ecx*4-2*JOINTQUAT_SIZE], xmm7	// q[k0] = s * t;

		movzx		edx, byte ptr shuffle[2*4+1]		// edx = k1
		movss		xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+0*4]
		xorps		xmm4, xmm2
		subss		xmm4, [esi+eax+2*JOINTMAT_SIZE+0*16+1*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-2*JOINTQUAT_SIZE], xmm4	// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

		movzx		ecx, byte ptr shuffle[2*4+2]		// ecx = k2
		movss		xmm3, [esi+eax+2*JOINTMAT_SIZE+0*16+2*4]
		xorps		xmm3, xmm1
		subss		xmm3, [esi+eax+2*JOINTMAT_SIZE+2*16+0*4]
		mulss		xmm3, xmm6
		movss		[edi+ecx*4-2*JOINTQUAT_SIZE], xmm3	// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

		movzx		edx, byte ptr shuffle[2*4+3]		// edx = k3
		movss		xmm4, [esi+eax+2*JOINTMAT_SIZE+2*16+1*4]
		xorps		xmm4, xmm0
		subss		xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+2*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-2*JOINTQUAT_SIZE], xmm4	// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

		mov			ecx, [esi+eax+2*JOINTMAT_SIZE+0*16+3*4]
		mov			[edi-2*JOINTQUAT_SIZE+16], ecx		// q[4] = m[0 * 4 + 3];
		mov			edx, [esi+eax+2*JOINTMAT_SIZE+1*16+3*4]
		mov			[edi-2*JOINTQUAT_SIZE+20], edx		// q[5] = m[1 * 4 + 3];
		mov			ecx, [esi+eax+2*JOINTMAT_SIZE+2*16+3*4]
		mov			[edi-2*JOINTQUAT_SIZE+24], ecx		// q[6] = m[2 * 4 + 3];

		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movzx		ecx, byte ptr shuffle[3*4+0]		// ecx = k0
		movss		[edi+ecx*4-1*JOINTQUAT_SIZE], xmm7	// q[k0] = s * t;

		movzx		edx, byte ptr shuffle[3*4+1]		// edx = k1
		movss		xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+0*4]
		xorps		xmm4, xmm2
		subss		xmm4, [esi+eax+3*JOINTMAT_SIZE+0*16+1*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-1*JOINTQUAT_SIZE], xmm4	// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

		movzx		ecx, byte ptr shuffle[3*4+2]		// ecx = k2
		movss		xmm3, [esi+eax+3*JOINTMAT_SIZE+0*16+2*4]
		xorps		xmm3, xmm1
		subss		xmm3, [esi+eax+3*JOINTMAT_SIZE+2*16+0*4]
		mulss		xmm3, xmm6
		movss		[edi+ecx*4-1*JOINTQUAT_SIZE], xmm3	// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

		movzx		edx, byte ptr shuffle[3*4+3]		// edx = k3
		movss		xmm4, [esi+eax+3*JOINTMAT_SIZE+2*16+1*4]
		xorps		xmm4, xmm0
		subss		xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+2*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-1*JOINTQUAT_SIZE], xmm4	// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

		mov			ecx, [esi+eax+3*JOINTMAT_SIZE+0*16+3*4]
		mov			[edi-1*JOINTQUAT_SIZE+16], ecx		// q[4] = m[0 * 4 + 3];
		mov			edx, [esi+eax+3*JOINTMAT_SIZE+1*16+3*4]
		mov			[edi-1*JOINTQUAT_SIZE+20], edx		// q[5] = m[1 * 4 + 3];
		mov			ecx, [esi+eax+3*JOINTMAT_SIZE+2*16+3*4]
		mov			[edi-1*JOINTQUAT_SIZE+24], ecx		// q[6] = m[2 * 4 + 3];

		add			eax, 4*JOINTMAT_SIZE
		jl			loopMat4

	done4:
		// remainder loop: same algorithm, one joint (scalar ops) at a time
		mov			eax, numJoints
		and			eax, 3
		jz			done1
		imul		eax, JOINTMAT_SIZE
		add			esi, eax
		neg			eax

	loopMat1:
		movss		xmm5, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
		movss		xmm6, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
		movss		xmm7, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]

		// -------------------

		movaps		xmm0, xmm5
		addss		xmm0, xmm6
		addss		xmm0, xmm7
		cmpnltss	xmm0, SIMD_SP_zero					// xmm0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f

		movaps		xmm1, xmm5
		movaps		xmm2, xmm5
		cmpnltss	xmm1, xmm6
		cmpnltss	xmm2, xmm7
		andps		xmm2, xmm1							// xmm2 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2]

		movaps		xmm4, xmm6
		cmpnltss	xmm4, xmm7							// xmm4 = m[1 * 4 + 1] > m[2 * 4 + 2]

		movaps		xmm1, xmm0
		andnps		xmm1, xmm2
		orps		xmm2, xmm0
		movaps		xmm3, xmm2
		andnps		xmm2, xmm4
		orps		xmm3, xmm2
		xorps		xmm3, SIMD_SP_not

		andps		xmm0, SIMD_DW_mat2quatShuffle0
		movaps		xmm4, xmm1
		andps		xmm4, SIMD_DW_mat2quatShuffle1
		orps		xmm0, xmm4
		movaps		xmm4, xmm2
		andps		xmm4, SIMD_DW_mat2quatShuffle2
		orps		xmm0, xmm4
		movaps		xmm4, xmm3
		andps		xmm4, SIMD_DW_mat2quatShuffle3
		orps		xmm4, xmm0

		movss		shuffle, xmm4

		movaps		xmm0, xmm2
		orps		xmm0, xmm3							// xmm0 = xmm2 | xmm3 = s0
		orps		xmm2, xmm1							// xmm2 = xmm1 | xmm2 = s2
		orps		xmm1, xmm3							// xmm1 = xmm1 | xmm3 = s1

		andps		xmm0, SIMD_SP_signBitMask
		andps		xmm1, SIMD_SP_signBitMask
		andps		xmm2, SIMD_SP_signBitMask

		xorps		xmm5, xmm0
		xorps		xmm6, xmm1
		xorps		xmm7, xmm2
		addss		xmm5, xmm6
		addss		xmm7, SIMD_SP_one
		addss		xmm5, xmm7							// xmm5 = t

		movss		xmm7, xmm5							// xmm7 = t
		rsqrtss		xmm6, xmm5
		mulss		xmm5, xmm6
		mulss		xmm5, xmm6
		subss		xmm5, SIMD_SP_rsqrt_c0
		mulss		xmm6, SIMD_SP_mat2quat_rsqrt_c1
		mulss		xmm6, xmm5							// xmm6 = s

		mulss		xmm7, xmm6							// xmm7 = s * t
		xorps		xmm6, SIMD_SP_signBitMask			// xmm6 = -s

		// -------------------

		movzx		ecx, byte ptr shuffle[0]			// ecx = k0
		add			edi, JOINTQUAT_SIZE
		movss		[edi+ecx*4-1*JOINTQUAT_SIZE], xmm7	// q[k0] = s * t;

		movzx		edx, byte ptr shuffle[1]			// edx = k1
		movss		xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
		xorps		xmm4, xmm2
		subss		xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-1*JOINTQUAT_SIZE], xmm4	// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

		movzx		ecx, byte ptr shuffle[2]			// ecx = k2
		movss		xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
		xorps		xmm3, xmm1
		subss		xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
		mulss		xmm3, xmm6
		movss		[edi+ecx*4-1*JOINTQUAT_SIZE], xmm3	// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

		movzx		edx, byte ptr shuffle[3]			// edx = k3
		movss		xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
		xorps		xmm4, xmm0
		subss		xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-1*JOINTQUAT_SIZE], xmm4	// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

		mov			ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
		mov			[edi-1*JOINTQUAT_SIZE+16], ecx		// q[4] = m[0 * 4 + 3];
		mov			edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
		mov			[edi-1*JOINTQUAT_SIZE+20], edx		// q[5] = m[1 * 4 + 3];
		mov			ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
		mov			[edi-1*JOINTQUAT_SIZE+24], ecx		// q[6] = m[2 * 4 + 3];

		add			eax, JOINTMAT_SIZE
		jl			loopMat1

	done1:
	}

#elif 0

	// reference C implementation mirroring the asm path: the case decides the
	// destination indices k0..k3 and the signs s0..s2 of the diagonal terms
	for ( int i = 0; i < numJoints; i++ ) {
		float s0, s1, s2;
		int k0, k1, k2, k3;

		float *q = jointQuats[i].q.ToFloatPtr();
		const float *m = jointMats[i].ToFloatPtr();

		if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) {

			k0 = 3;
			k1 = 2;
			k2 = 1;
			k3 = 0;
			s0 = 1.0f;
			s1 = 1.0f;
			s2 = 1.0f;

		} else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) {

			k0 = 0;
			k1 = 1;
			k2 = 2;
			k3 = 3;
			s0 = 1.0f;
			s1 = -1.0f;
			s2 = -1.0f;

		} else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) {

			k0 = 1;
			k1 = 0;
			k2 = 3;
			k3 = 2;
			s0 = -1.0f;
			s1 = 1.0f;
			s2 = -1.0f;

		} else {

			k0 = 2;
			k1 = 3;
			k2 = 0;
			k3 = 1;
			s0 = -1.0f;
			s1 = -1.0f;
			s2 = 1.0f;

		}

		float t = s0 * m[0 * 4 + 0] + s1 * m[1 * 4 + 1] + s2 * m[2 * 4 + 2] + 1.0f;
		float s = idMath::InvSqrt( t ) * 0.5f;

		q[k0] = s * t;
		q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
		q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
		q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

		q[4] = m[0 * 4 + 3];
		q[5] = m[1 * 4 + 3];
		q[6] = m[2 * 4 + 3];
	}

#elif 1

	// fully unrolled reference C implementation with the four cases written out
	for ( int i = 0; i < numJoints; i++ ) {

		float *q = jointQuats[i].q.ToFloatPtr();
		const float *m = jointMats[i].ToFloatPtr();

		if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) {

			float t = + m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f;
			float s = idMath::InvSqrt( t ) * 0.5f;

			q[3] = s * t;
			q[2] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
			q[1] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
			q[0] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;

		} else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) {

			float t = + m[0 * 4 + 0] - m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f;
			float s = idMath::InvSqrt( t ) * 0.5f;

			q[0] = s * t;
			q[1] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s;
			q[2] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s;
			q[3] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;

		} else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) {

			float t = - m[0 * 4 + 0] + m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f;
			float s = idMath::InvSqrt( t ) * 0.5f;

			q[1] = s * t;
			q[0] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s;
			q[3] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
			q[2] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s;

		} else {

			float t = - m[0 * 4 + 0] - m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f;
			float s = idMath::InvSqrt( t ) * 0.5f;

			q[2] = s * t;
			q[3] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
			q[0] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s;
			q[1] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s;

		}

		q[4] = m[0 * 4 + 3];
		q[5] = m[1 * 4 + 3];
		q[6] = m[2 * 4 + 3];
	}

#endif
}
12266 
12267 /*
12268 ============
12269 idSIMD_SSE::TransformJoints
12270 ============
12271 */
12272 void VPCALL idSIMD_SSE::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
// Concatenates joint matrices in place for joints [firstJoint..lastJoint]:
// jointMats[i] = jointMats[i] * jointMats[parents[i]] (see the C fallback below).
// Parents must already be in the target space, i.e. parents[i] < i for every
// joint in the range, so each parent is fully transformed before its children.
12273 #if 1
12274 
12275  assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
12276 
12277  __asm {
12278 
// Register setup: eax = negative byte index into parents[] (counts up to 0),
// ecx = byte offset of the current joint matrix (JOINTMAT_SIZE = 48 = 4 * 12),
// edi = end of the parents[] range, esi = base of the joint matrix array.
12279  mov ecx, firstJoint
12280  mov eax, lastJoint
12281  sub eax, ecx
12282  jl done
12283  imul ecx, 4
12284  mov edi, parents
12285  add edi, ecx
12286  imul ecx, 12
12287  mov esi, jointMats
12288  imul eax, 4
12289  add edi, eax
12290  neg eax
12291 
12292  loopJoint:
12293 
// Load the three 4-float rows (3x3 rotation + translation) of the child matrix.
12294  movaps xmm0, [esi+ecx+ 0] // xmm0 = m0, m1, m2, t0
12295  mov edx, [edi+eax]
12296  movaps xmm1, [esi+ecx+16] // xmm1 = m2, m3, m4, t1
12297  imul edx, JOINTMAT_SIZE // edx = byte offset of the parent joint matrix
12298  movaps xmm2, [esi+ecx+32] // xmm2 = m5, m6, m7, t2
12299 
// Result row 0: splat each scalar of the parent's row 0 and accumulate
// parent[0][j] * childRow[j]; the parent translation is folded in below.
12300  movss xmm4, [esi+edx+ 0]
12301  shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
12302  mulps xmm4, xmm0
12303 
12304  movss xmm5, [esi+edx+ 4]
12305  shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12306  mulps xmm5, xmm1
12307  addps xmm4, xmm5
12308  movss xmm6, [esi+edx+ 8]
12309  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12310  mulps xmm6, xmm2
12311  addps xmm4, xmm6
12312 
12313  movss xmm5, [esi+edx+16]
12314  shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12315  mulps xmm5, xmm0
12316 
// The R_SHUFFLEPS( 1, 2, 3, 0 ) moves the parent translation scalar into the
// w lane only, so it is added just to the t component of the row.
12317  movss xmm7, [esi+edx+12]
12318  shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
12319  addps xmm4, xmm7
12320 
12321  movaps [esi+ecx+ 0], xmm4
12322 
// Result row 1 (parent row 1 scalars at byte offsets 16, 20, 24; t1 at 28).
12323  movss xmm6, [esi+edx+20]
12324  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12325  mulps xmm6, xmm1
12326  addps xmm5, xmm6
12327  movss xmm7, [esi+edx+24]
12328  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
12329  mulps xmm7, xmm2
12330  addps xmm5, xmm7
12331 
12332  movss xmm6, [esi+edx+32]
12333  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12334  mulps xmm6, xmm0
12335 
12336  movss xmm3, [esi+edx+28]
12337  shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
12338  addps xmm5, xmm3
12339 
12340  movaps [esi+ecx+16], xmm5
12341 
// Result row 2 (parent row 2 scalars at byte offsets 32, 36, 40; t2 at 44).
12342  movss xmm7, [esi+edx+36]
12343  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
12344  mulps xmm7, xmm1
12345  addps xmm6, xmm7
12346  movss xmm3, [esi+edx+40]
12347  shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
12348  mulps xmm3, xmm2
12349  addps xmm6, xmm3
12350 
12351  movss xmm7, [esi+edx+44]
12352  shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
12353  addps xmm6, xmm7
12354 
12355  movaps [esi+ecx+32], xmm6
12356 
// Advance to the next joint; loop runs while eax <= 0 (inclusive range).
12357  add ecx, JOINTMAT_SIZE
12358  add eax, 4
12359  jle loopJoint
12360  done:
12361  }
12362 
12363 #else
12364 
// Reference C implementation of the same operation.
12365  int i;
12366 
12367  for( i = firstJoint; i <= lastJoint; i++ ) {
12368  assert( parents[i] < i );
12369  jointMats[i] *= jointMats[parents[i]];
12370  }
12371 
12372 #endif
12373 }
12374 
12375 /*
12376 ============
12377 idSIMD_SSE::UntransformJoints
12378 ============
12379 */
12380 void VPCALL idSIMD_SSE::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
// Inverse of TransformJoints: converts joints [firstJoint..lastJoint] back to
// parent-relative space, in place (jointMats[i] /= jointMats[parents[i]], see
// the C fallback below). Joints are processed from lastJoint DOWN to firstJoint
// so each parent is still in the untransformed (global) space when its
// children are processed.
12381 #if 1
12382 
12383  assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
12384 
12385  __asm {
12386 
// ecx = byte offset of the current joint matrix (starts at lastJoint, walks
// down), eax = byte index into parents[] relative to firstJoint (walks down),
// edi = &parents[firstJoint], esi = base of the joint matrix array.
12387  mov edx, firstJoint
12388  mov eax, lastJoint
12389  mov ecx, eax
12390  sub eax, edx
12391  jl done
12392  mov esi, jointMats
12393  imul ecx, JOINTMAT_SIZE
12394  imul edx, 4
12395  mov edi, parents
12396  add edi, edx
12397  imul eax, 4
12398 
12399  loopJoint:
12400 
12401  movaps xmm0, [esi+ecx+ 0] // xmm0 = m0, m1, m2, t0
12402  mov edx, [edi+eax]
12403  movaps xmm1, [esi+ecx+16] // xmm1 = m2, m3, m4, t1
12404  imul edx, JOINTMAT_SIZE // edx = byte offset of the parent joint matrix
12405  movaps xmm2, [esi+ecx+32] // xmm2 = m5, m6, m7, t2
12406 
// Subtract the parent's translation from the child's translation lane only
// (R_SHUFFLEPS( 1, 2, 3, 0 ) moves the scalar into the w lane).
12407  movss xmm6, [esi+edx+12]
12408  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
12409  subps xmm0, xmm6
12410  movss xmm7, [esi+edx+28]
12411  shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
12412  subps xmm1, xmm7
12413  movss xmm3, [esi+edx+44]
12414  shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
12415  subps xmm2, xmm3
12416 
// Multiply by the transpose of the parent's rotation (its inverse for an
// orthonormal matrix): note the splatted scalars come from a parent COLUMN
// (byte offsets 0, 16, 32 = element 0 of each row), unlike TransformJoints.
12417  movss xmm4, [esi+edx+ 0]
12418  shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
12419  mulps xmm4, xmm0
12420  movss xmm5, [esi+edx+16]
12421  shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12422  mulps xmm5, xmm1
12423  addps xmm4, xmm5
12424  movss xmm6, [esi+edx+32]
12425  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12426  mulps xmm6, xmm2
12427  addps xmm4, xmm6
12428 
12429  movaps [esi+ecx+ 0], xmm4
12430 
// Second result row: parent column 1 (byte offsets 4, 20, 36).
12431  movss xmm5, [esi+edx+ 4]
12432  shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12433  mulps xmm5, xmm0
12434  movss xmm6, [esi+edx+20]
12435  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12436  mulps xmm6, xmm1
12437  addps xmm5, xmm6
12438  movss xmm7, [esi+edx+36]
12439  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
12440  mulps xmm7, xmm2
12441  addps xmm5, xmm7
12442 
12443  movaps [esi+ecx+16], xmm5
12444 
// Third result row: parent column 2 (byte offsets 8, 24, 40).
12445  movss xmm6, [esi+edx+ 8]
12446  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12447  mulps xmm6, xmm0
12448  movss xmm7, [esi+edx+24]
12449  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
12450  mulps xmm7, xmm1
12451  addps xmm6, xmm7
12452  movss xmm3, [esi+edx+40]
12453  shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
12454  mulps xmm3, xmm2
12455  addps xmm6, xmm3
12456 
12457  movaps [esi+ecx+32], xmm6
12458 
// Step backwards one joint; loop runs while eax >= 0 (inclusive range).
12459  sub ecx, JOINTMAT_SIZE
12460  sub eax, 4
12461  jge loopJoint
12462  done:
12463  }
12464 
12465 #else
12466 
// Reference C implementation of the same operation.
12467  int i;
12468 
12469  for( i = lastJoint; i >= firstJoint; i-- ) {
12470  assert( parents[i] < i );
12471  jointMats[i] /= jointMats[parents[i]];
12472  }
12473 
12474 #endif
12475 }
12476 
12477 /*
12478 ============
12479 idSIMD_SSE::TransformVerts
12480 ============
12481 */
12482 void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
// Skins vertex positions: for each vertex, accumulates weighted joint-matrix
// transforms and writes the result to verts[i].xyz. The index array holds
// pairs per weight: index[j*2+0] = byte offset of the joint matrix,
// index[j*2+1] = non-zero when this is the LAST weight of the current vertex
// (see the C fallback below). numWeights itself is not read by this path;
// iteration is driven by numVerts and the last-weight flags.
12483 #if 1
12484 
12485  assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
12486  assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
12487  assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
12488  assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
12489 
12490  __asm
12491  {
12492  mov eax, numVerts
12493  test eax, eax
12494  jz done
12495  imul eax, DRAWVERT_SIZE
12496 
12497  mov ecx, verts
12498  mov edx, index
12499  mov esi, weights
12500  mov edi, joints
12501 
// ecx points past the last vertex; eax is a negative byte offset counting up.
12502  add ecx, eax
12503  neg eax
12504 
12505  loopVert:
// First weight of the vertex: weight vector times the three joint matrix rows.
12506  mov ebx, [edx]
12507  movaps xmm2, [esi]
12508  add edx, 8
12509  movaps xmm0, xmm2
12510  add esi, JOINTWEIGHT_SIZE
12511  movaps xmm1, xmm2
12512 
12513  mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0
12514  mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1
12515  mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2
12516 
// index[j*2+1] != 0 means this was the only/last weight -> finish the vertex.
12517  cmp dword ptr [edx-4], 0
12518 
12519  jne doneWeight
12520 
12521  loopWeight:
// Additional weights: same per-weight product, accumulated into xmm0..xmm2.
12522  mov ebx, [edx]
12523  movaps xmm5, [esi]
12524  add edx, 8
12525  movaps xmm3, xmm5
12526  add esi, JOINTWEIGHT_SIZE
12527  movaps xmm4, xmm5
12528 
12529  mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0
12530  mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1
12531  mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2
12532 
12533  cmp dword ptr [edx-4], 0
12534 
12535  addps xmm0, xmm3
12536  addps xmm1, xmm4
12537  addps xmm2, xmm5
12538 
12539  je loopWeight
12540 
12541  doneWeight:
12542  add eax, DRAWVERT_SIZE
12543 
// Horizontal add of the three row dot products (x, y, z of the skinned point).
12544  movaps xmm6, xmm0 // xmm6 = m0, m1, m2, t0
12545  unpcklps xmm6, xmm1 // xmm6 = m0, m3, m1, m4
12546  unpckhps xmm0, xmm1 // xmm1 = m2, m5, t0, t1
12547  addps xmm6, xmm0 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1
12548 
12549  movaps xmm7, xmm2 // xmm7 = m6, m7, m8, t2
12550  movlhps xmm2, xmm6 // xmm2 = m6, m7, m0+m2, m3+m5
12551  movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1
12552  addps xmm6, xmm2 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1
12553 
// Store x and y (the high two floats of xmm6) into verts[i].xyz[0..1].
12554  movhps [ecx+eax-DRAWVERT_SIZE+0], xmm6
12555 
// z = sum of the two low floats of xmm6; store into verts[i].xyz[2].
12556  movaps xmm5, xmm6 // xmm5 = m6+m8, m7+t2
12557  shufps xmm5, xmm5, R_SHUFFLEPS( 1, 0, 2, 3 ) // xmm5 = m7+t2, m6+m8
12558  addss xmm5, xmm6 // xmm5 = m6+m8+m7+t2
12559 
12560  movss [ecx+eax-DRAWVERT_SIZE+8], xmm5
12561 
12562  jl loopVert
12563  done:
12564  }
12565 
12566 #else
12567 
// Reference C implementation of the same operation.
12568  int i, j;
12569  const byte *jointsPtr = (byte *)joints;
12570 
12571  for( j = i = 0; i < numVerts; i++ ) {
12572  idVec3 v;
12573 
12574  v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
12575  while( index[j*2+1] == 0 ) {
12576  j++;
12577  v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
12578  }
12579  j++;
12580 
12581  verts[i].xyz = v;
12582  }
12583 
12584 #endif
12585 }
12586 
12587 /*
12588 ============
12589 idSIMD_SSE::TracePointCull
12590 ============
12591 */
12592 void VPCALL idSIMD_SSE::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
// For each vertex, computes its signed distance d to 4 planes and packs 8 cull
// bits into cullBits[i]: bits 0-3 set when d > -radius, bits 4-7 set when
// d < radius (matching the C fallback after its 0x0F flip). totalOr receives
// the OR of all per-vertex bit masks.
12593 #if 1
12594 
12595  assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
12596  assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
12597 
12598  __asm {
12599  push ebx
12600  mov eax, numVerts
12601  test eax, eax
12602  jz done
12603 
// Transpose the 4 planes so xmm0..xmm3 each hold one component (a, b, c, d)
// of all four planes, enabling 4 plane distances per vertex in parallel.
12604  mov edi, planes
12605  movlps xmm1, [edi] // xmm1 = 0, 1, X, X
12606  movhps xmm1, [edi+16] // xmm1 = 0, 1, 4, 5
12607  movlps xmm3, [edi+8] // xmm3 = 2, 3, X, X
12608  movhps xmm3, [edi+24] // xmm3 = 2, 3, 6, 7
12609  movlps xmm4, [edi+32] // xmm4 = 8, 9, X, X
12610  movhps xmm4, [edi+48] // xmm4 = 8, 9, 12, 13
12611  movlps xmm5, [edi+40] // xmm5 = 10, 11, X, X
12612  movhps xmm5, [edi+56] // xmm5 = 10, 11, 14, 15
12613  movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5
12614  shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12
12615  shufps xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13
12616  movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7
12617  shufps xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14
12618  shufps xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15
12619  movss xmm7, radius
12620  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) // xmm7 = radius in all 4 lanes
12621 
// edx accumulates the running OR of all cull bytes.
12622  xor edx, edx
12623  mov esi, verts
12624  mov edi, cullBits
12625  imul eax, DRAWVERT_SIZE
12626  add esi, eax
12627  neg eax
12628 
12629  loopVert:
// xmm4 = 4 plane distances: a*x + b*y + c*z + d for each plane.
12630  movss xmm4, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
12631  shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
12632  movss xmm5, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
12633  mulps xmm4, xmm0
12634  shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12635  movss xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
12636  mulps xmm5, xmm1
12637  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12638  addps xmm4, xmm5
12639  mulps xmm6, xmm2
12640  addps xmm4, xmm3
12641  addps xmm4, xmm6
// xmm5 = -d (sign bit flip); d < radius -> bits 4-7, -d < radius -> bits 0-3.
12642  movaps xmm5, xmm4
12643  xorps xmm5, SIMD_SP_signBitMask
12644  cmpltps xmm4, xmm7
12645  movmskps ecx, xmm4
12646  cmpltps xmm5, xmm7
12647  movmskps ebx, xmm5
12648  shl cx, 4
12649  or cl, bl
12650  inc edi
12651  or dl, cl // accumulate into the running total OR
12652  add eax, DRAWVERT_SIZE
12653  mov byte ptr [edi-1], cl
12654  jl loopVert
12655 
12656  done:
// Write back the accumulated OR of all cull bytes.
12657  mov esi, totalOr
12658  mov byte ptr [esi], dl
12659  pop ebx
12660  }
12661 
12662 #else
12663 
// Reference C implementation of the same operation.
12664  int i;
12665  byte tOr;
12666 
12667  tOr = 0;
12668 
12669  for ( i = 0; i < numVerts; i++ ) {
12670  byte bits;
12671  float d0, d1, d2, d3, t;
12672  const idVec3 &v = verts[i].xyz;
12673 
12674  d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
12675  d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
12676  d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
12677  d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
12678 
12679  t = d0 + radius;
12680  bits = FLOATSIGNBITSET( t ) << 0;
12681  t = d1 + radius;
12682  bits |= FLOATSIGNBITSET( t ) << 1;
12683  t = d2 + radius;
12684  bits |= FLOATSIGNBITSET( t ) << 2;
12685  t = d3 + radius;
12686  bits |= FLOATSIGNBITSET( t ) << 3;
12687 
12688  t = d0 - radius;
12689  bits |= FLOATSIGNBITSET( t ) << 4;
12690  t = d1 - radius;
12691  bits |= FLOATSIGNBITSET( t ) << 5;
12692  t = d2 - radius;
12693  bits |= FLOATSIGNBITSET( t ) << 6;
12694  t = d3 - radius;
12695  bits |= FLOATSIGNBITSET( t ) << 7;
12696 
12697  bits ^= 0x0F; // flip lower four bits
12698 
12699  tOr |= bits;
12700  cullBits[i] = bits;
12701  }
12702 
12703  totalOr = tOr;
12704 
12705 #endif
12706 }
12707 
12708 /*
12709 ============
12710 idSIMD_SSE::DecalPointCull
12711 ============
12712 */
12713 void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
// For each vertex, tests it against 6 decal planes and packs 6 bits into
// cullBits[i]: bit k is set when the vertex is on or in front of plane k
// (distance >= 0, matching the C fallback after its 0x3F flip). The main loop
// handles two vertices per iteration; a scalar tail handles an odd count.
12714 #if 1
12715 
// Stack copies of the transposed plane data so the loop can reload them
// without keeping all constants live in registers.
12716  ALIGN16( float p0[4] );
12717  ALIGN16( float p1[4] );
12718  ALIGN16( float p2[4] );
12719  ALIGN16( float p3[4] );
12720  ALIGN16( float p4[4] );
12721  ALIGN16( float p5[4] );
12722  ALIGN16( float p6[4] );
12723  ALIGN16( float p7[4] );
12724 
12725  assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
12726  assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
12727 
12728  __asm {
// Transpose planes 0-3 so p0..p3 each hold one component (a, b, c, d) of all
// four planes.
12729  mov ecx, planes
12730  movlps xmm1, [ecx] // xmm1 = 0, 1, X, X
12731  movhps xmm1, [ecx+16] // xmm1 = 0, 1, 4, 5
12732  movlps xmm3, [ecx+8] // xmm3 = 2, 3, X, X
12733  movhps xmm3, [ecx+24] // xmm3 = 2, 3, 6, 7
12734  movlps xmm4, [ecx+32] // xmm4 = 8, 9, X, X
12735  movhps xmm4, [ecx+48] // xmm4 = 8, 9, 12, 13
12736  movlps xmm5, [ecx+40] // xmm5 = 10, 11, X, X
12737  movhps xmm5, [ecx+56] // xmm5 = 10, 11, 14, 15
12738  movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5
12739  shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12
12740  shufps xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13
12741  movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7
12742  shufps xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14
12743  shufps xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15
12744 
12745  movaps p0, xmm0
12746  movaps p1, xmm1
12747  movaps p2, xmm2
12748  movaps p3, xmm3
12749 
// Planes 4 and 5 are interleaved pairwise so p4..p7 can evaluate both planes
// for two vertices at once in the main loop.
12750  movlps xmm4, [ecx+64] // xmm4 = p40, p41, X, X
12751  movhps xmm4, [ecx+80] // xmm4 = p40, p41, p50, p51
12752  movaps xmm5, xmm4 // xmm5 = p40, p41, p50, p51
12753  shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm4 = p40, p50, p40, p50
12754  shufps xmm5, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm5 = p41, p51, p41, p51
12755  movlps xmm6, [ecx+72] // xmm6 = p42, p43, X, X
12756  movhps xmm6, [ecx+88] // xmm6 = p42, p43, p52, p53
12757  movaps xmm7, xmm6 // xmm7 = p42, p43, p52, p53
12758  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm6 = p42, p52, p42, p52
12759  shufps xmm7, xmm7, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm7 = p43, p53, p43, p53
12760 
12761  movaps p4, xmm4
12762  movaps p5, xmm5
12763  movaps p6, xmm6
12764  movaps p7, xmm7
12765 
12766  mov esi, verts
12767  mov edi, cullBits
12768  mov eax, numVerts
12769  and eax, ~1 // main loop handles an even number of verts
12770  jz done2
12771  imul eax, DRAWVERT_SIZE
12772  add esi, eax
12773  neg eax
12774 
12775  loopVert2:
// First vertex: distances to planes 0-3 -> low 4 bits of ecx.
12776  movaps xmm6, p0
12777  movss xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
12778  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
12779  mulps xmm6, xmm0
12780  movaps xmm7, p1
12781  movss xmm1, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
12782  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
12783  mulps xmm7, xmm1
12784  addps xmm6, xmm7
12785  movaps xmm7, p2
12786  movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
12787  shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
12788  mulps xmm7, xmm2
12789  addps xmm6, xmm7
12790  addps xmm6, p3
12791 
// cmpnltps with zero: lane mask set when distance >= 0 (front side).
12792  cmpnltps xmm6, SIMD_SP_zero
12793  movmskps ecx, xmm6
12794 
// Second vertex: distances to planes 0-3 -> bits 8-11 via ch.
12795  movaps xmm6, p0
12796  movss xmm3, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
12797  shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
12798  mulps xmm6, xmm3
12799  movaps xmm7, p1
12800  movss xmm4, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
12801  shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
12802  mulps xmm7, xmm4
12803  addps xmm6, xmm7
12804  movaps xmm7, p2
12805  movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
12806  shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12807  mulps xmm7, xmm5
12808  addps xmm6, xmm7
12809  addps xmm6, p3
12810 
12811  cmpnltps xmm6, SIMD_SP_zero
12812  movmskps edx, xmm6
12813  mov ch, dl
12814 
// Both vertices against planes 4 and 5 in one 4-lane evaluation, using the
// interleaved p4..p7 constants (lanes: v0/p4, v0/p5, v1/p4, v1/p5).
12815  shufps xmm0, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
12816  mulps xmm0, p4
12817  shufps xmm1, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
12818  mulps xmm1, p5
12819  addps xmm0, xmm1
12820  shufps xmm2, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12821  mulps xmm2, p6
12822  addps xmm0, xmm2
12823  addps xmm0, p7
12824 
12825  cmpnltps xmm0, SIMD_SP_zero
12826  movmskps edx, xmm0
12827 
12828  add edi, 2
12829 
// Scatter the 4 plane-4/5 bits into bits 4-5 of each vertex's byte.
12830  mov dh, dl
12831  shl dl, 4
12832  shl dh, 2
12833  and edx, (3<<4)|(3<<12)
12834  or ecx, edx
12835 
12836  add eax, 2*DRAWVERT_SIZE
12837  mov word ptr [edi-2], cx // store cull bytes for both vertices at once
12838  jl loopVert2
12839 
12840  done2:
12841 
// Tail: handle the last vertex when numVerts is odd (esi/edi already point
// past the even part).
12842  mov eax, numVerts
12843  and eax, 1
12844  jz done
12845 
12846  movaps xmm6, p0
12847  movss xmm0, [esi+DRAWVERT_XYZ_OFFSET+0]
12848  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
12849  mulps xmm6, xmm0
12850  movaps xmm7, p1
12851  movss xmm1, [esi+DRAWVERT_XYZ_OFFSET+4]
12852  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
12853  mulps xmm7, xmm1
12854  addps xmm6, xmm7
12855  movaps xmm7, p2
12856  movss xmm2, [esi+DRAWVERT_XYZ_OFFSET+8]
12857  shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
12858  mulps xmm7, xmm2
12859  addps xmm6, xmm7
12860  addps xmm6, p3
12861 
12862  cmpnltps xmm6, SIMD_SP_zero
12863  movmskps ecx, xmm6
12864 
12865  mulps xmm0, p4
12866  mulps xmm1, p5
12867  addps xmm0, xmm1
12868  mulps xmm2, p6
12869  addps xmm0, xmm2
12870  addps xmm0, p7
12871 
12872  cmpnltps xmm0, SIMD_SP_zero
12873  movmskps edx, xmm0
12874 
12875  and edx, 3
12876  shl edx, 4
12877  or ecx, edx
12878 
12879  mov byte ptr [edi], cl
12880 
12881  done:
12882  }
12883 
12884 
12885 #else
12886 
// Reference C implementation of the same operation.
12887  int i;
12888 
12889  for ( i = 0; i < numVerts; i += 2 ) {
12890  unsigned short bits0, bits1;
12891  float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11;
12892  const idVec3 &v0 = verts[i+0].xyz;
12893  const idVec3 &v1 = verts[i+1].xyz;
12894 
12895  d0 = planes[0][0] * v0[0] + planes[0][1] * v0[1] + planes[0][2] * v0[2] + planes[0][3];
12896  d1 = planes[1][0] * v0[0] + planes[1][1] * v0[1] + planes[1][2] * v0[2] + planes[1][3];
12897  d2 = planes[2][0] * v0[0] + planes[2][1] * v0[1] + planes[2][2] * v0[2] + planes[2][3];
12898  d3 = planes[3][0] * v0[0] + planes[3][1] * v0[1] + planes[3][2] * v0[2] + planes[3][3];
12899 
12900  d4 = planes[4][0] * v0[0] + planes[4][1] * v0[1] + planes[4][2] * v0[2] + planes[4][3];
12901  d5 = planes[5][0] * v0[0] + planes[5][1] * v0[1] + planes[5][2] * v0[2] + planes[5][3];
12902  d10 = planes[4][0] * v1[0] + planes[4][1] * v1[1] + planes[4][2] * v1[2] + planes[4][3];
12903  d11 = planes[5][0] * v1[0] + planes[5][1] * v1[1] + planes[5][2] * v1[2] + planes[5][3];
12904 
12905  d6 = planes[0][0] * v1[0] + planes[0][1] * v1[1] + planes[0][2] * v1[2] + planes[0][3];
12906  d7 = planes[1][0] * v1[0] + planes[1][1] * v1[1] + planes[1][2] * v1[2] + planes[1][3];
12907  d8 = planes[2][0] * v1[0] + planes[2][1] * v1[1] + planes[2][2] * v1[2] + planes[2][3];
12908  d9 = planes[3][0] * v1[0] + planes[3][1] * v1[1] + planes[3][2] * v1[2] + planes[3][3];
12909 
12910  bits0 = FLOATSIGNBITSET( d0 ) << (0+0);
12911  bits0 |= FLOATSIGNBITSET( d1 ) << (0+1);
12912  bits0 |= FLOATSIGNBITSET( d2 ) << (0+2);
12913  bits0 |= FLOATSIGNBITSET( d3 ) << (0+3);
12914  bits0 |= FLOATSIGNBITSET( d4 ) << (0+4);
12915  bits0 |= FLOATSIGNBITSET( d5 ) << (0+5);
12916 
12917  bits1 = FLOATSIGNBITSET( d6 ) << (8+0);
12918  bits1 |= FLOATSIGNBITSET( d7 ) << (8+1);
12919  bits1 |= FLOATSIGNBITSET( d8 ) << (8+2);
12920  bits1 |= FLOATSIGNBITSET( d9 ) << (8+3);
12921  bits1 |= FLOATSIGNBITSET( d10 ) << (8+4);
12922  bits1 |= FLOATSIGNBITSET( d11 ) << (8+5);
12923 
12924  *(unsigned short *)(cullBits + i) = ( bits0 | bits1 ) ^ 0x3F3F;
12925  }
12926 
12927  if ( numVerts & 1 ) {
12928  byte bits;
12929  float d0, d1, d2, d3, d4, d5;
12930  const idVec3 &v = verts[numVerts - 1].xyz;
12931 
12932  d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
12933  d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
12934  d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
12935  d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
12936 
12937  d4 = planes[4][0] * v[0] + planes[4][1] * v[1] + planes[4][2] * v[2] + planes[4][3];
12938  d5 = planes[5][0] * v[0] + planes[5][1] * v[1] + planes[5][2] * v[2] + planes[5][3];
12939 
12940  bits = FLOATSIGNBITSET( d0 ) << 0;
12941  bits |= FLOATSIGNBITSET( d1 ) << 1;
12942  bits |= FLOATSIGNBITSET( d2 ) << 2;
12943  bits |= FLOATSIGNBITSET( d3 ) << 3;
12944 
12945  bits |= FLOATSIGNBITSET( d4 ) << 4;
12946  bits |= FLOATSIGNBITSET( d5 ) << 5;
12947 
12948  cullBits[numVerts - 1] = bits ^ 0x3F; // flip lower 6 bits
12949  }
12950 
12951 #endif
12952 }
12953 
12954 /*
12955 ============
12956 idSIMD_SSE::OverlayPointCull
12957 ============
12958 */
12959 void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
// Projects each vertex onto two texture planes, writing (d0, d1) to
// texCoords[i], and packs 4 cull bits into cullBits[i]: bits 0-1 = sign bits
// of d0, d1 and bits 2-3 = sign bits of 1-d0, 1-d1 (i.e. out-of-range on
// either side of the 0..1 overlay range). Two vertices per loop iteration,
// scalar tail for an odd count. See the C fallback below for the contract.
12960 #if 1
12961 
12962  assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
12963  assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
12964 
12965  __asm {
12966  mov eax, numVerts
12967  mov edx, verts
12968  mov esi, texCoords
12969  mov edi, cullBits
12970 
// Broadcast each plane component pairwise: xmm4..xmm7 hold (p0c, p1c, p0c, p1c)
// for components a, b, c, d so both planes are evaluated for two verts at once.
12971  mov ecx, planes
12972  movss xmm4, [ecx+ 0]
12973  movss xmm5, [ecx+16]
12974  shufps xmm4, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12975  shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
12976  movss xmm5, [ecx+ 4]
12977  movss xmm6, [ecx+20]
12978  shufps xmm5, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12979  shufps xmm5, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 )
12980  movss xmm6, [ecx+ 8]
12981  movss xmm7, [ecx+24]
12982  shufps xmm6, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
12983  shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 )
12984  movss xmm7, [ecx+12]
12985  movss xmm0, [ecx+28]
12986  shufps xmm7, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
12987  shufps xmm7, xmm7, R_SHUFFLEPS( 0, 2, 0, 2 )
12988 
12989  and eax, ~1 // main loop handles an even number of verts
12990  jz done2
12991  add edi, eax
12992  neg eax
12993 
12994  loopVert2:
// xmm0 = (v0.d0, v0.d1, v1.d0, v1.d1): both verts against both planes.
12995  movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
12996  movss xmm1, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
12997  shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
12998  mulps xmm0, xmm4
12999  movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
13000  movss xmm2, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
13001  shufps xmm1, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
13002  mulps xmm1, xmm5
13003  movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
13004  movss xmm3, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
13005  shufps xmm2, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
13006  mulps xmm2, xmm6
13007  addps xmm0, xmm1
13008  addps xmm0, xmm2
13009  addps xmm0, xmm7
// Store texcoords for both verts in one aligned 16-byte write.
// NOTE(review): movaps assumes texCoords is 16-byte aligned — confirm callers.
13010  movaps [esi], xmm0
// xmm2 = 1 - d; gather (d0, d1, 1-d0, 1-d1) per vertex and take sign bits.
13011  movaps xmm1, xmm0
13012  movaps xmm2, SIMD_SP_one
13013  subps xmm2, xmm0
13014  shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
13015  shufps xmm1, xmm2, R_SHUFFLEPS( 2, 3, 2, 3 )
13016  add edx, 2*DRAWVERT_SIZE
13017  movmskps ecx, xmm0
13018  mov byte ptr [edi+eax+0], cl
13019  add esi, 4*4
13020  movmskps ecx, xmm1
13021  mov byte ptr [edi+eax+1], cl
13022  add eax, 2
13023  jl loopVert2
13024 
13025  done2:
// Tail: handle the last vertex when numVerts is odd (edx/esi/edi already
// advanced past the even part).
13026  mov eax, numVerts
13027  and eax, 1
13028  jz done
13029 
13030  movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
13031  shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
13032  mulps xmm0, xmm4
13033  movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
13034  shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
13035  mulps xmm1, xmm5
13036  movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
13037  shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
13038  mulps xmm2, xmm6
13039  addps xmm0, xmm1
13040  addps xmm0, xmm2
13041  addps xmm0, xmm7
13042  movlps [esi], xmm0 // store (d0, d1) for the final vertex
13043  movaps xmm1, xmm0
13044  movaps xmm2, SIMD_SP_one
13045  subps xmm2, xmm0
13046  shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
13047  movmskps ecx, xmm0
13048  mov byte ptr [edi], cl
13049 
13050  done:
13051  }
13052 
13053 #else
13054 
// Reference C implementation of the same operation.
13055  const idPlane &p0 = planes[0];
13056  const idPlane &p1 = planes[1];
13057 
13058  for ( int i = 0; i < numVerts - 1; i += 2 ) {
13059  unsigned short bits;
13060  float d0, d1, d2, d3;
13061 
13062  const idVec3 &v0 = verts[i+0].xyz;
13063  const idVec3 &v1 = verts[i+1].xyz;
13064 
13065  d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
13066  d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
13067  d2 = p0[0] * v1[0] + p0[1] * v1[1] + p0[2] * v1[2] + p0[3];
13068  d3 = p1[0] * v1[0] + p1[1] * v1[1] + p1[2] * v1[2] + p1[3];
13069 
13070  texCoords[i+0][0] = d0;
13071  texCoords[i+0][1] = d1;
13072  texCoords[i+1][0] = d2;
13073  texCoords[i+1][1] = d3;
13074 
13075  bits = FLOATSIGNBITSET( d0 ) << 0;
13076  bits |= FLOATSIGNBITSET( d1 ) << 1;
13077  bits |= FLOATSIGNBITSET( d2 ) << 8;
13078  bits |= FLOATSIGNBITSET( d3 ) << 9;
13079 
13080  d0 = 1.0f - d0;
13081  d1 = 1.0f - d1;
13082  d2 = 1.0f - d2;
13083  d3 = 1.0f - d3;
13084 
13085  bits |= FLOATSIGNBITSET( d0 ) << 2;
13086  bits |= FLOATSIGNBITSET( d1 ) << 3;
13087  bits |= FLOATSIGNBITSET( d2 ) << 10;
13088  bits |= FLOATSIGNBITSET( d3 ) << 11;
13089 
13090  *(unsigned short *)(cullBits + i) = bits;
13091  }
13092 
13093  if ( numVerts & 1 ) {
13094  byte bits;
13095  float d0, d1;
13096 
13097  const idPlane &p0 = planes[0];
13098  const idPlane &p1 = planes[1];
13099  const idVec3 &v0 = verts[numVerts - 1].xyz;
13100 
13101  d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
13102  d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
13103 
// NOTE(review): 'i' here relies on the old MSVC for-scope extension leaking
// the loop variable (which equals numVerts - 1 on exit for odd counts);
// conforming compilers reject this — verify if this branch is ever enabled.
13104  texCoords[i][0] = d0;
13105  texCoords[i][1] = d1;
13106 
13107  bits = FLOATSIGNBITSET( d0 ) << 0;
13108  bits |= FLOATSIGNBITSET( d1 ) << 1;
13109 
13110  d0 = 1.0f - d0;
13111  d1 = 1.0f - d1;
13112 
13113  bits |= FLOATSIGNBITSET( d0 ) << 2;
13114  bits |= FLOATSIGNBITSET( d1 ) << 3;
13115 
13116  cullBits[numVerts - 1] = bits;
13117  }
13118 
13119 #endif
13120 }
13121 
13122 /*
13123 ============
13124 idSIMD_SSE::DeriveTriPlanes
13125 ============
13126 */
13127 void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
13128 #if 1
13129 
13130  assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
13131  assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
13132 
13133  __asm {
13134  mov eax, numIndexes
13135  shl eax, 2
13136  mov esi, verts
13137  mov edi, indexes
13138  mov edx, planes
13139 
13140  add edi, eax
13141  neg eax
13142 
13143  add eax, 4*12
13144  jge done4
13145 
13146  loopPlane4:
13147  mov ebx, [edi+eax-4*12+4]
13148  imul ebx, DRAWVERT_SIZE
13149  mov ecx, [edi+eax-4*12+0]
13150  imul ecx, DRAWVERT_SIZE
13151 
13152  movss xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13153  subss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13154 
13155  movss xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13156  subss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13157 
13158  movss xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13159  subss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13160 
13161  mov ebx, [edi+eax-4*12+8]
13162  imul ebx, DRAWVERT_SIZE
13163 
13164  shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
13165  shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
13166  shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
13167 
13168  movss xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13169  subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13170 
13171  movss xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13172  subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13173 
13174  movss xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13175  subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13176 
13177  mov ebx, [edi+eax-3*12+4]
13178  imul ebx, DRAWVERT_SIZE
13179  mov ecx, [edi+eax-3*12+0]
13180  imul ecx, DRAWVERT_SIZE
13181 
13182  shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
13183  shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
13184  shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
13185 
13186  movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13187  subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13188  movss xmm0, xmm6
13189 
13190  movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13191  subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13192  movss xmm1, xmm7
13193 
13194  movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13195  subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13196  movss xmm2, xmm6
13197 
13198  mov ebx, [edi+eax-3*12+8]
13199  imul ebx, DRAWVERT_SIZE
13200 
13201  shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
13202  shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
13203  shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
13204 
13205  movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13206  subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13207  movss xmm3, xmm7
13208 
13209  movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13210  subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13211  movss xmm4, xmm6
13212 
13213  movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13214  subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13215  movss xmm5, xmm7
13216 
13217  mov ebx, [edi+eax-2*12+4]
13218  imul ebx, DRAWVERT_SIZE
13219  mov ecx, [edi+eax-2*12+0]
13220  imul ecx, DRAWVERT_SIZE
13221 
13222  shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
13223  shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
13224  shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
13225 
13226  movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13227  subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13228  movss xmm0, xmm6
13229 
13230  movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13231  subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13232  movss xmm1, xmm7
13233 
13234  movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13235  subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13236  movss xmm2, xmm6
13237 
13238  mov ebx, [edi+eax-2*12+8]
13239  imul ebx, DRAWVERT_SIZE
13240 
13241  shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
13242  shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
13243  shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
13244 
13245  movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13246  subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13247  movss xmm3, xmm7
13248 
13249  movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13250  subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13251  movss xmm4, xmm6
13252 
13253  movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13254  subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13255  movss xmm5, xmm7
13256 
13257  mov ebx, [edi+eax-1*12+4]
13258  imul ebx, DRAWVERT_SIZE
13259  mov ecx, [edi+eax-1*12+0]
13260  imul ecx, DRAWVERT_SIZE
13261 
13262  shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
13263  shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
13264  shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
13265 
13266  movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13267  subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13268  movss xmm0, xmm6
13269 
13270  movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13271  subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13272  movss xmm1, xmm7
13273 
13274  movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13275  subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13276  movss xmm2, xmm6
13277 
13278  mov ebx, [edi+eax-1*12+8]
13279  imul ebx, DRAWVERT_SIZE
13280 
13281  movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13282  subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13283  movss xmm3, xmm7
13284 
13285  movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13286  subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13287  movss xmm4, xmm6
13288 
13289  movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13290  subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13291  movss xmm5, xmm7
13292 
13293  movaps xmm6, xmm4
13294  mulps xmm6, xmm2
13295  movaps xmm7, xmm5
13296  mulps xmm7, xmm1
13297  subps xmm6, xmm7
13298 
13299  mulps xmm5, xmm0
13300  mulps xmm2, xmm3
13301  subps xmm5, xmm2
13302 
13303  mulps xmm3, xmm1
13304  mulps xmm4, xmm0
13305  subps xmm3, xmm4
13306 
13307  movaps xmm0, xmm6
13308  mulps xmm6, xmm6
13309  movaps xmm1, xmm5
13310  mulps xmm5, xmm5
13311  movaps xmm2, xmm3
13312  mulps xmm3, xmm3
13313 
13314  addps xmm3, xmm5
13315  addps xmm3, xmm6
13316  rsqrtps xmm3, xmm3
13317 
13318  add edx, 4*16
13319  mov ecx, [edi+eax-1*12+0]
13320  imul ecx, DRAWVERT_SIZE
13321 
13322  mulps xmm0, xmm3
13323  mulps xmm1, xmm3
13324  mulps xmm2, xmm3
13325 
13326  movss [edx-1*16+0], xmm0
13327  movss [edx-1*16+4], xmm1
13328  movss [edx-1*16+8], xmm2
13329 
13330  mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13331  mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13332  mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13333 
13334  xorps xmm0, SIMD_SP_singleSignBitMask
13335  subss xmm0, xmm1
13336  subss xmm0, xmm2
13337  movss [edx-1*16+12], xmm0
13338 
13339  mov ecx, [edi+eax-2*12+0]
13340  imul ecx, DRAWVERT_SIZE
13341 
13342  shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
13343  shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
13344  shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
13345 
13346  movss [edx-2*16+0], xmm0
13347  movss [edx-2*16+4], xmm1
13348  movss [edx-2*16+8], xmm2
13349 
13350  mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13351  mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13352  mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13353 
13354  xorps xmm0, SIMD_SP_singleSignBitMask
13355  subss xmm0, xmm1
13356  subss xmm0, xmm2
13357  movss [edx-2*16+12], xmm0
13358 
13359  mov ecx, [edi+eax-3*12+0]
13360  imul ecx, DRAWVERT_SIZE
13361 
13362  shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
13363  shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
13364  shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
13365 
13366  movss [edx-3*16+0], xmm0
13367  movss [edx-3*16+4], xmm1
13368  movss [edx-3*16+8], xmm2
13369 
13370  mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13371  mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13372  mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13373 
13374  xorps xmm0, SIMD_SP_singleSignBitMask
13375  subss xmm0, xmm1
13376  subss xmm0, xmm2
13377  movss [edx-3*16+12], xmm0
13378 
13379  mov ecx, [edi+eax-4*12+0]
13380  imul ecx, DRAWVERT_SIZE
13381 
13382  shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
13383  shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
13384  shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
13385 
13386  movss [edx-4*16+0], xmm0
13387  movss [edx-4*16+4], xmm1
13388  movss [edx-4*16+8], xmm2
13389 
13390  mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13391  mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13392  mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13393 
13394  xorps xmm0, SIMD_SP_singleSignBitMask
13395  subss xmm0, xmm1
13396  subss xmm0, xmm2
13397  movss [edx-4*16+12], xmm0
13398 
13399  add eax, 4*12
13400  jle loopPlane4
13401 
13402  done4:
13403 
13404  sub eax, 4*12
13405  jge done
13406 
13407  loopPlane1:
13408  mov ebx, [edi+eax+4]
13409  imul ebx, DRAWVERT_SIZE
13410  mov ecx, [edi+eax+0]
13411  imul ecx, DRAWVERT_SIZE
13412 
13413  movss xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13414  subss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13415 
13416  movss xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13417  subss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13418 
13419  movss xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13420  subss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13421 
13422  mov ebx, [edi+eax+8]
13423  imul ebx, DRAWVERT_SIZE
13424 
13425  movss xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13426  subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13427 
13428  movss xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13429  subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13430 
13431  movss xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13432  subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13433 
13434  movss xmm6, xmm4
13435  mulss xmm6, xmm2
13436  movss xmm7, xmm5
13437  mulss xmm7, xmm1
13438  subss xmm6, xmm7
13439 
13440  mulss xmm5, xmm0
13441  mulss xmm2, xmm3
13442  subss xmm5, xmm2
13443 
13444  mulss xmm3, xmm1
13445  mulss xmm4, xmm0
13446  subss xmm3, xmm4
13447 
13448  movss xmm0, xmm6
13449  mulss xmm6, xmm6
13450  movss xmm1, xmm5
13451  mulss xmm5, xmm5
13452  movss xmm2, xmm3
13453  mulss xmm3, xmm3
13454 
13455  addss xmm3, xmm5
13456  addss xmm3, xmm6
13457  rsqrtss xmm3, xmm3
13458 
13459  add edx, 1*16
13460 
13461  mulss xmm0, xmm3
13462  mulss xmm1, xmm3
13463  mulss xmm2, xmm3
13464 
13465  movss [edx-1*16+0], xmm0
13466  movss [edx-1*16+4], xmm1
13467  movss [edx-1*16+8], xmm2
13468 
13469  mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13470  mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13471  mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13472 
13473  xorps xmm0, SIMD_SP_singleSignBitMask
13474  subss xmm0, xmm1
13475  subss xmm0, xmm2
13476  movss [edx-1*16+12], xmm0
13477 
13478  add eax, 1*12
13479  jl loopPlane1
13480 
13481  done:
13482  }
13483 
13484 #else
13485 
13486  int i, j;
13487 
13488  for ( i = 0; i <= numIndexes - 12; i += 12 ) {
13489  ALIGN16( float d0[4] );
13490  ALIGN16( float d1[4] );
13491  ALIGN16( float d2[4] );
13492  ALIGN16( float d3[4] );
13493  ALIGN16( float d4[4] );
13494  ALIGN16( float d5[4] );
13495  ALIGN16( float n0[4] );
13496  ALIGN16( float n1[4] );
13497  ALIGN16( float n2[4] );
13498 
13499  for ( j = 0; j < 4; j++ ) {
13500  const idDrawVert *a, *b, *c;
13501 
13502  a = verts + indexes[i + j * 3 + 0];
13503  b = verts + indexes[i + j * 3 + 1];
13504  c = verts + indexes[i + j * 3 + 2];
13505 
13506  d0[j] = b->xyz[0] - a->xyz[0];
13507  d1[j] = b->xyz[1] - a->xyz[1];
13508  d2[j] = b->xyz[2] - a->xyz[2];
13509 
13510  d3[j] = c->xyz[0] - a->xyz[0];
13511  d4[j] = c->xyz[1] - a->xyz[1];
13512  d5[j] = c->xyz[2] - a->xyz[2];
13513  }
13514 
13515  ALIGN16( float tmp[4] );
13516 
13517  n0[0] = d4[0] * d2[0];
13518  n0[1] = d4[1] * d2[1];
13519  n0[2] = d4[2] * d2[2];
13520  n0[3] = d4[3] * d2[3];
13521 
13522  n0[0] -= d5[0] * d1[0];
13523  n0[1] -= d5[1] * d1[1];
13524  n0[2] -= d5[2] * d1[2];
13525  n0[3] -= d5[3] * d1[3];
13526 
13527  n1[0] = d5[0] * d0[0];
13528  n1[1] = d5[1] * d0[1];
13529  n1[2] = d5[2] * d0[2];
13530  n1[3] = d5[3] * d0[3];
13531 
13532  n1[0] -= d3[0] * d2[0];
13533  n1[1] -= d3[1] * d2[1];
13534  n1[2] -= d3[2] * d2[2];
13535  n1[3] -= d3[3] * d2[3];
13536 
13537  n2[0] = d3[0] * d1[0];
13538  n2[1] = d3[1] * d1[1];
13539  n2[2] = d3[2] * d1[2];
13540  n2[3] = d3[3] * d1[3];
13541 
13542  n2[0] -= d4[0] * d0[0];
13543  n2[1] -= d4[1] * d0[1];
13544  n2[2] -= d4[2] * d0[2];
13545  n2[3] -= d4[3] * d0[3];
13546 
13547  tmp[0] = n0[0] * n0[0];
13548  tmp[1] = n0[1] * n0[1];
13549  tmp[2] = n0[2] * n0[2];
13550  tmp[3] = n0[3] * n0[3];
13551 
13552  tmp[0] += n1[0] * n1[0];
13553  tmp[1] += n1[1] * n1[1];
13554  tmp[2] += n1[2] * n1[2];
13555  tmp[3] += n1[3] * n1[3];
13556 
13557  tmp[0] += n2[0] * n2[0];
13558  tmp[1] += n2[1] * n2[1];
13559  tmp[2] += n2[2] * n2[2];
13560  tmp[3] += n2[3] * n2[3];
13561 
13562  tmp[0] = idMath::RSqrt( tmp[0] );
13563  tmp[1] = idMath::RSqrt( tmp[1] );
13564  tmp[2] = idMath::RSqrt( tmp[2] );
13565  tmp[3] = idMath::RSqrt( tmp[3] );
13566 
13567  n0[0] *= tmp[0];
13568  n0[1] *= tmp[1];
13569  n0[2] *= tmp[2];
13570  n0[3] *= tmp[3];
13571 
13572  n1[0] *= tmp[0];
13573  n1[1] *= tmp[1];
13574  n1[2] *= tmp[2];
13575  n1[3] *= tmp[3];
13576 
13577  n2[0] *= tmp[0];
13578  n2[1] *= tmp[1];
13579  n2[2] *= tmp[2];
13580  n2[3] *= tmp[3];
13581 
13582 
13583  for ( j = 0; j < 4; j++ ) {
13584  const idDrawVert *a;
13585 
13586  a = verts + indexes[i + j * 3];
13587 
13588  planes->Normal()[0] = n0[j];
13589  planes->Normal()[1] = n1[j];
13590  planes->Normal()[2] = n2[j];
13591  planes->FitThroughPoint( a->xyz );
13592  planes++;
13593  }
13594  }
13595 
13596  for ( ; i < numIndexes; i += 3 ) {
13597  const idDrawVert *a, *b, *c;
13598  float d0, d1, d2, d3, d4, d5;
13599  float n0, n1, n2;
13600 
13601  a = verts + indexes[i + 0];
13602  b = verts + indexes[i + 1];
13603  c = verts + indexes[i + 2];
13604 
13605  d0 = b->xyz[0] - a->xyz[0];
13606  d1 = b->xyz[1] - a->xyz[1];
13607  d2 = b->xyz[2] - a->xyz[2];
13608 
13609  d3 = c->xyz[0] - a->xyz[0];
13610  d4 = c->xyz[1] - a->xyz[1];
13611  d5 = c->xyz[2] - a->xyz[2];
13612 
13613  float tmp;
13614 
13615  n0 = d4 * d2 - d5 * d1;
13616  n1 = d5 * d0 - d3 * d2;
13617  n2 = d3 * d1 - d4 * d0;
13618 
13619  tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 );
13620 
13621  n0 *= tmp;
13622  n1 *= tmp;
13623  n2 *= tmp;
13624 
13625  planes->Normal()[0] = n0;
13626  planes->Normal()[1] = n1;
13627  planes->Normal()[2] = n2;
13628  planes->FitThroughPoint( a->xyz );
13629  planes++;
13630  }
13631 
13632 #endif
13633 }
13634 
13635 /*
13636 ============
13637 idSIMD_SSE::DeriveTangents
13638 ============
13639 */
13640 //#define REFINE_TANGENT_SQUAREROOT
13641 #define FIX_DEGENERATE_TANGENT
13642 
// Derives a plane for every triangle and per-vertex normals and tangent-space
// vectors from the triangle geometry and texture coordinates.  For each vertex
// the contributions of all referencing triangles are summed (first reference
// assigns, later references accumulate); the sums are NOT re-normalized here —
// presumably a later pass normalizes them (TODO confirm against callers).
// Triangles are processed four at a time with SSE; a scalar tail loop handles
// the remainder.  One idPlane is written per triangle, in triangle order.
//
//   planes     - receives numIndexes/3 planes, one per triangle
//   verts      - vertices; normal and tangents[0..1] are written/accumulated
//   numVerts   - number of vertices (used to size the 'used' scratch array)
//   indexes    - triangle index list, 3 indexes per triangle
//   numIndexes - number of indexes (3 * number of triangles)
void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
	int i;

	// the SSE paths in this file rely on this exact idDrawVert layout
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
	assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
	assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );

	assert( planes != NULL );
	assert( verts != NULL );
	assert( numVerts >= 0 );

#ifdef REFINE_TANGENT_SQUAREROOT
	// preload the constants used by the rsqrt refinement step; the asm blocks
	// below assume xmm6/xmm7 keep these values for the whole 4-wide loop
	__asm {
		movaps xmm6, SIMD_SP_rsqrt_c0
		movaps xmm7, SIMD_SP_rsqrt_c1
	}
#endif

	// used[v] == true once vertex v has received its first triangle
	// contribution; later triangles accumulate instead of assign
	bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
	memset( used, 0, numVerts * sizeof( used[0] ) );

	// process four triangles (12 indexes) per iteration; lane j of each
	// ALIGN16 array holds the value for triangle j of the current group
	for ( i = 0; i <= numIndexes - 12; i += 12 ) {
		idDrawVert *a, *b, *c;
		ALIGN16( unsigned long signBit[4] );
		ALIGN16( float d0[4] );		// d0..d2 = xyz edge a->b
		ALIGN16( float d1[4] );
		ALIGN16( float d2[4] );
		ALIGN16( float d3[4] );		// d3..d4 = st delta a->b
		ALIGN16( float d4[4] );
		ALIGN16( float d5[4] );		// d5..d7 = xyz edge a->c
		ALIGN16( float d6[4] );
		ALIGN16( float d7[4] );
		ALIGN16( float d8[4] );		// d8..d9 = st delta a->c
		ALIGN16( float d9[4] );
		ALIGN16( float n0[4] );		// normalized triangle normal
		ALIGN16( float n1[4] );
		ALIGN16( float n2[4] );
		ALIGN16( float t0[4] );		// first tangent (s direction)
		ALIGN16( float t1[4] );
		ALIGN16( float t2[4] );
		ALIGN16( float t3[4] );		// second tangent (t direction)
		ALIGN16( float t4[4] );
		ALIGN16( float t5[4] );

		// gather edge and texcoord deltas for the four triangles
		for ( int j = 0; j < 4; j++ ) {

			a = verts + indexes[i + j * 3 + 0];
			b = verts + indexes[i + j * 3 + 1];
			c = verts + indexes[i + j * 3 + 2];

			d0[j] = b->xyz[0] - a->xyz[0];
			d1[j] = b->xyz[1] - a->xyz[1];
			d2[j] = b->xyz[2] - a->xyz[2];
			d3[j] = b->st[0] - a->st[0];
			d4[j] = b->st[1] - a->st[1];

			d5[j] = c->xyz[0] - a->xyz[0];
			d6[j] = c->xyz[1] - a->xyz[1];
			d7[j] = c->xyz[2] - a->xyz[2];
			d8[j] = c->st[0] - a->st[0];
			d9[j] = c->st[1] - a->st[1];
		}

#if 1

		__asm {
			// normal: cross product of the two edges, (d5,d6,d7) x (d0,d1,d2)
			// component-wise over all four triangle lanes
			movaps xmm0, d6
			mulps xmm0, d2
			movaps xmm1, d7
			mulps xmm1, d1
			subps xmm0, xmm1				// n0 = d6*d2 - d7*d1

			movaps xmm1, d7
			mulps xmm1, d0
			movaps xmm2, d5
			mulps xmm2, d2
			subps xmm1, xmm2				// n1 = d7*d0 - d5*d2

			movaps xmm2, d5
			mulps xmm2, d1
			movaps xmm3, d6
			mulps xmm3, d0
			subps xmm2, xmm3				// n2 = d5*d1 - d6*d0

			// squared length of the normal in xmm3
			movaps xmm3, xmm0
			movaps xmm4, xmm1
			movaps xmm5, xmm2

			mulps xmm3, xmm3
			mulps xmm4, xmm4
			mulps xmm5, xmm5

			addps xmm3, xmm4
			addps xmm3, xmm5

#ifdef FIX_DEGENERATE_TANGENT
			xorps xmm4, xmm4
			cmpeqps xmm4, xmm3
			andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
			andps xmm3, SIMD_SP_absMask // make sure the values are positive
			orps xmm3, xmm4
#endif

#ifdef REFINE_TANGENT_SQUAREROOT
			// one refinement step on the rsqrt estimate using the
			// precomputed constants in xmm6/xmm7
			rsqrtps xmm4, xmm3
			mulps xmm3, xmm4
			mulps xmm3, xmm4
			subps xmm3, xmm6
			mulps xmm4, xmm7
			mulps xmm3, xmm4
#else
			rsqrtps xmm3, xmm3				// approximate 1/sqrt(len^2)
#endif
			// normalize and store the four normals
			mulps xmm0, xmm3
			movaps n0, xmm0
			mulps xmm1, xmm3
			movaps n1, xmm1
			mulps xmm2, xmm3
			movaps n2, xmm2

			// area sign bit: sign of the texture-space area d3*d9 - d4*d8,
			// kept as a bare sign bit so it can be xor'ed into the
			// normalization factor to flip mirrored (negative-area) tangents
			movaps xmm0, d3
			mulps xmm0, d9
			movaps xmm1, d4
			mulps xmm1, d8
			subps xmm0, xmm1
			andps xmm0, SIMD_SP_signBitMask
			movaps signBit, xmm0

			// first tangent: ti = di*d9 - d4*d(5+i), unnormalized
			movaps xmm0, d0
			mulps xmm0, d9
			movaps xmm1, d4
			mulps xmm1, d5
			subps xmm0, xmm1

			movaps xmm1, d1
			mulps xmm1, d9
			movaps xmm2, d4
			mulps xmm2, d6
			subps xmm1, xmm2

			movaps xmm2, d2
			mulps xmm2, d9
			movaps xmm3, d4
			mulps xmm3, d7
			subps xmm2, xmm3

			// squared length in xmm3
			movaps xmm3, xmm0
			movaps xmm4, xmm1
			movaps xmm5, xmm2

			mulps xmm3, xmm3
			mulps xmm4, xmm4
			mulps xmm5, xmm5

			addps xmm3, xmm4
			addps xmm3, xmm5

#ifdef FIX_DEGENERATE_TANGENT
			xorps xmm4, xmm4
			cmpeqps xmm4, xmm3
			andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
			andps xmm3, SIMD_SP_absMask // make sure the values are positive
			orps xmm3, xmm4
#endif

#ifdef REFINE_TANGENT_SQUAREROOT
			rsqrtps xmm4, xmm3
			mulps xmm3, xmm4
			mulps xmm3, xmm4
			subps xmm3, xmm6
			mulps xmm4, xmm7
			mulps xmm3, xmm4
#else
			rsqrtps xmm3, xmm3
#endif
			// fold the area sign into the scale so mirrored triangles
			// get a flipped tangent
			xorps xmm3, signBit

			mulps xmm0, xmm3
			movaps t0, xmm0
			mulps xmm1, xmm3
			movaps t1, xmm1
			mulps xmm2, xmm3
			movaps t2, xmm2

			// second tangent: t(3+i) = d3*d(5+i) - di*d8, unnormalized
			movaps xmm0, d3
			mulps xmm0, d5
			movaps xmm1, d0
			mulps xmm1, d8
			subps xmm0, xmm1

			movaps xmm1, d3
			mulps xmm1, d6
			movaps xmm2, d1
			mulps xmm2, d8
			subps xmm1, xmm2

			movaps xmm2, d3
			mulps xmm2, d7
			movaps xmm3, d2
			mulps xmm3, d8
			subps xmm2, xmm3

			// squared length in xmm3
			movaps xmm3, xmm0
			movaps xmm4, xmm1
			movaps xmm5, xmm2

			mulps xmm3, xmm3
			mulps xmm4, xmm4
			mulps xmm5, xmm5

			addps xmm3, xmm4
			addps xmm3, xmm5

#ifdef FIX_DEGENERATE_TANGENT
			xorps xmm4, xmm4
			cmpeqps xmm4, xmm3
			andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
			andps xmm3, SIMD_SP_absMask // make sure the values are positive
			orps xmm3, xmm4
#endif

#ifdef REFINE_TANGENT_SQUAREROOT
			rsqrtps xmm4, xmm3
			mulps xmm3, xmm4
			mulps xmm3, xmm4
			subps xmm3, xmm6
			mulps xmm4, xmm7
			mulps xmm3, xmm4
#else
			rsqrtps xmm3, xmm3
#endif
			xorps xmm3, signBit

			mulps xmm0, xmm3
			movaps t3, xmm0
			mulps xmm1, xmm3
			movaps t4, xmm1
			mulps xmm2, xmm3
			movaps t5, xmm2
		}

#else

		// scalar reference implementation of the asm block above;
		// note: this path lacks the FIX_DEGENERATE_TANGENT zero-length guard
		ALIGN16( float tmp[4] );

		// normal
		n0[0] = d6[0] * d2[0];
		n0[1] = d6[1] * d2[1];
		n0[2] = d6[2] * d2[2];
		n0[3] = d6[3] * d2[3];

		n0[0] -= d7[0] * d1[0];
		n0[1] -= d7[1] * d1[1];
		n0[2] -= d7[2] * d1[2];
		n0[3] -= d7[3] * d1[3];

		n1[0] = d7[0] * d0[0];
		n1[1] = d7[1] * d0[1];
		n1[2] = d7[2] * d0[2];
		n1[3] = d7[3] * d0[3];

		n1[0] -= d5[0] * d2[0];
		n1[1] -= d5[1] * d2[1];
		n1[2] -= d5[2] * d2[2];
		n1[3] -= d5[3] * d2[3];

		n2[0] = d5[0] * d1[0];
		n2[1] = d5[1] * d1[1];
		n2[2] = d5[2] * d1[2];
		n2[3] = d5[3] * d1[3];

		n2[0] -= d6[0] * d0[0];
		n2[1] -= d6[1] * d0[1];
		n2[2] -= d6[2] * d0[2];
		n2[3] -= d6[3] * d0[3];

		tmp[0] = n0[0] * n0[0];
		tmp[1] = n0[1] * n0[1];
		tmp[2] = n0[2] * n0[2];
		tmp[3] = n0[3] * n0[3];

		tmp[0] += n1[0] * n1[0];
		tmp[1] += n1[1] * n1[1];
		tmp[2] += n1[2] * n1[2];
		tmp[3] += n1[3] * n1[3];

		tmp[0] += n2[0] * n2[0];
		tmp[1] += n2[1] * n2[1];
		tmp[2] += n2[2] * n2[2];
		tmp[3] += n2[3] * n2[3];

		tmp[0] = idMath::RSqrt( tmp[0] );
		tmp[1] = idMath::RSqrt( tmp[1] );
		tmp[2] = idMath::RSqrt( tmp[2] );
		tmp[3] = idMath::RSqrt( tmp[3] );

		n0[0] *= tmp[0];
		n0[1] *= tmp[1];
		n0[2] *= tmp[2];
		n0[3] *= tmp[3];

		n1[0] *= tmp[0];
		n1[1] *= tmp[1];
		n1[2] *= tmp[2];
		n1[3] *= tmp[3];

		n2[0] *= tmp[0];
		n2[1] *= tmp[1];
		n2[2] *= tmp[2];
		n2[3] *= tmp[3];

		// area sign bit
		tmp[0] = d3[0] * d9[0];
		tmp[1] = d3[1] * d9[1];
		tmp[2] = d3[2] * d9[2];
		tmp[3] = d3[3] * d9[3];

		tmp[0] -= d4[0] * d8[0];
		tmp[1] -= d4[1] * d8[1];
		tmp[2] -= d4[2] * d8[2];
		tmp[3] -= d4[3] * d8[3];

		signBit[0] = ( *(unsigned long *)&tmp[0] ) & ( 1 << 31 );
		signBit[1] = ( *(unsigned long *)&tmp[1] ) & ( 1 << 31 );
		signBit[2] = ( *(unsigned long *)&tmp[2] ) & ( 1 << 31 );
		signBit[3] = ( *(unsigned long *)&tmp[3] ) & ( 1 << 31 );

		// first tangent
		t0[0] = d0[0] * d9[0];
		t0[1] = d0[1] * d9[1];
		t0[2] = d0[2] * d9[2];
		t0[3] = d0[3] * d9[3];

		t0[0] -= d4[0] * d5[0];
		t0[1] -= d4[1] * d5[1];
		t0[2] -= d4[2] * d5[2];
		t0[3] -= d4[3] * d5[3];

		t1[0] = d1[0] * d9[0];
		t1[1] = d1[1] * d9[1];
		t1[2] = d1[2] * d9[2];
		t1[3] = d1[3] * d9[3];

		t1[0] -= d4[0] * d6[0];
		t1[1] -= d4[1] * d6[1];
		t1[2] -= d4[2] * d6[2];
		t1[3] -= d4[3] * d6[3];

		t2[0] = d2[0] * d9[0];
		t2[1] = d2[1] * d9[1];
		t2[2] = d2[2] * d9[2];
		t2[3] = d2[3] * d9[3];

		t2[0] -= d4[0] * d7[0];
		t2[1] -= d4[1] * d7[1];
		t2[2] -= d4[2] * d7[2];
		t2[3] -= d4[3] * d7[3];

		tmp[0] = t0[0] * t0[0];
		tmp[1] = t0[1] * t0[1];
		tmp[2] = t0[2] * t0[2];
		tmp[3] = t0[3] * t0[3];

		tmp[0] += t1[0] * t1[0];
		tmp[1] += t1[1] * t1[1];
		tmp[2] += t1[2] * t1[2];
		tmp[3] += t1[3] * t1[3];

		tmp[0] += t2[0] * t2[0];
		tmp[1] += t2[1] * t2[1];
		tmp[2] += t2[2] * t2[2];
		tmp[3] += t2[3] * t2[3];

		tmp[0] = idMath::RSqrt( tmp[0] );
		tmp[1] = idMath::RSqrt( tmp[1] );
		tmp[2] = idMath::RSqrt( tmp[2] );
		tmp[3] = idMath::RSqrt( tmp[3] );

		// xor the area sign into the scale to flip mirrored tangents
		*(unsigned long *)&tmp[0] ^= signBit[0];
		*(unsigned long *)&tmp[1] ^= signBit[1];
		*(unsigned long *)&tmp[2] ^= signBit[2];
		*(unsigned long *)&tmp[3] ^= signBit[3];

		t0[0] *= tmp[0];
		t0[1] *= tmp[1];
		t0[2] *= tmp[2];
		t0[3] *= tmp[3];

		t1[0] *= tmp[0];
		t1[1] *= tmp[1];
		t1[2] *= tmp[2];
		t1[3] *= tmp[3];

		t2[0] *= tmp[0];
		t2[1] *= tmp[1];
		t2[2] *= tmp[2];
		t2[3] *= tmp[3];

		// second tangent
		t3[0] = d3[0] * d5[0];
		t3[1] = d3[1] * d5[1];
		t3[2] = d3[2] * d5[2];
		t3[3] = d3[3] * d5[3];

		t3[0] -= d0[0] * d8[0];
		t3[1] -= d0[1] * d8[1];
		t3[2] -= d0[2] * d8[2];
		t3[3] -= d0[3] * d8[3];

		t4[0] = d3[0] * d6[0];
		t4[1] = d3[1] * d6[1];
		t4[2] = d3[2] * d6[2];
		t4[3] = d3[3] * d6[3];

		t4[0] -= d1[0] * d8[0];
		t4[1] -= d1[1] * d8[1];
		t4[2] -= d1[2] * d8[2];
		t4[3] -= d1[3] * d8[3];

		t5[0] = d3[0] * d7[0];
		t5[1] = d3[1] * d7[1];
		t5[2] = d3[2] * d7[2];
		t5[3] = d3[3] * d7[3];

		t5[0] -= d2[0] * d8[0];
		t5[1] -= d2[1] * d8[1];
		t5[2] -= d2[2] * d8[2];
		t5[3] -= d2[3] * d8[3];

		tmp[0] = t3[0] * t3[0];
		tmp[1] = t3[1] * t3[1];
		tmp[2] = t3[2] * t3[2];
		tmp[3] = t3[3] * t3[3];

		tmp[0] += t4[0] * t4[0];
		tmp[1] += t4[1] * t4[1];
		tmp[2] += t4[2] * t4[2];
		tmp[3] += t4[3] * t4[3];

		tmp[0] += t5[0] * t5[0];
		tmp[1] += t5[1] * t5[1];
		tmp[2] += t5[2] * t5[2];
		tmp[3] += t5[3] * t5[3];

		tmp[0] = idMath::RSqrt( tmp[0] );
		tmp[1] = idMath::RSqrt( tmp[1] );
		tmp[2] = idMath::RSqrt( tmp[2] );
		tmp[3] = idMath::RSqrt( tmp[3] );

		*(unsigned long *)&tmp[0] ^= signBit[0];
		*(unsigned long *)&tmp[1] ^= signBit[1];
		*(unsigned long *)&tmp[2] ^= signBit[2];
		*(unsigned long *)&tmp[3] ^= signBit[3];

		t3[0] *= tmp[0];
		t3[1] *= tmp[1];
		t3[2] *= tmp[2];
		t3[3] *= tmp[3];

		t4[0] *= tmp[0];
		t4[1] *= tmp[1];
		t4[2] *= tmp[2];
		t4[3] *= tmp[3];

		t5[0] *= tmp[0];
		t5[1] *= tmp[1];
		t5[2] *= tmp[2];
		t5[3] *= tmp[3];

#endif

		// write one plane per triangle and scatter the normal/tangent
		// contributions to the three vertices of each triangle
		for ( int j = 0; j < 4; j++ ) {

			const int v0 = indexes[i + j * 3 + 0];
			const int v1 = indexes[i + j * 3 + 1];
			const int v2 = indexes[i + j * 3 + 2];

			a = verts + v0;
			b = verts + v1;
			c = verts + v2;

			// plane through vertex a with the triangle normal
			planes->Normal()[0] = n0[j];
			planes->Normal()[1] = n1[j];
			planes->Normal()[2] = n2[j];
			planes->FitThroughPoint( a->xyz );
			planes++;

			// accumulate on vertices already touched, assign on first touch
			if ( used[v0] ) {
				a->normal[0] += n0[j];
				a->normal[1] += n1[j];
				a->normal[2] += n2[j];

				a->tangents[0][0] += t0[j];
				a->tangents[0][1] += t1[j];
				a->tangents[0][2] += t2[j];

				a->tangents[1][0] += t3[j];
				a->tangents[1][1] += t4[j];
				a->tangents[1][2] += t5[j];
			} else {
				a->normal[0] = n0[j];
				a->normal[1] = n1[j];
				a->normal[2] = n2[j];

				a->tangents[0][0] = t0[j];
				a->tangents[0][1] = t1[j];
				a->tangents[0][2] = t2[j];

				a->tangents[1][0] = t3[j];
				a->tangents[1][1] = t4[j];
				a->tangents[1][2] = t5[j];

				used[v0] = true;
			}

			if ( used[v1] ) {
				b->normal[0] += n0[j];
				b->normal[1] += n1[j];
				b->normal[2] += n2[j];

				b->tangents[0][0] += t0[j];
				b->tangents[0][1] += t1[j];
				b->tangents[0][2] += t2[j];

				b->tangents[1][0] += t3[j];
				b->tangents[1][1] += t4[j];
				b->tangents[1][2] += t5[j];
			} else {
				b->normal[0] = n0[j];
				b->normal[1] = n1[j];
				b->normal[2] = n2[j];

				b->tangents[0][0] = t0[j];
				b->tangents[0][1] = t1[j];
				b->tangents[0][2] = t2[j];

				b->tangents[1][0] = t3[j];
				b->tangents[1][1] = t4[j];
				b->tangents[1][2] = t5[j];

				used[v1] = true;
			}

			if ( used[v2] ) {
				c->normal[0] += n0[j];
				c->normal[1] += n1[j];
				c->normal[2] += n2[j];

				c->tangents[0][0] += t0[j];
				c->tangents[0][1] += t1[j];
				c->tangents[0][2] += t2[j];

				c->tangents[1][0] += t3[j];
				c->tangents[1][1] += t4[j];
				c->tangents[1][2] += t5[j];
			} else {
				c->normal[0] = n0[j];
				c->normal[1] = n1[j];
				c->normal[2] = n2[j];

				c->tangents[0][0] = t0[j];
				c->tangents[0][1] = t1[j];
				c->tangents[0][2] = t2[j];

				c->tangents[1][0] = t3[j];
				c->tangents[1][1] = t4[j];
				c->tangents[1][2] = t5[j];

				used[v2] = true;
			}
		}
	}

	// tail loop: remaining triangles (fewer than four), one at a time,
	// same math as above using scalar SSE instructions
	for ( ; i < numIndexes; i += 3 ) {
		idDrawVert *a, *b, *c;
		ALIGN16( unsigned long signBit[4] );
		float d0, d1, d2, d3, d4;	// b - a : xyz edge and st delta
		float d5, d6, d7, d8, d9;	// c - a : xyz edge and st delta
		float n0, n1, n2;			// normalized triangle normal
		float t0, t1, t2;			// first tangent
		float t3, t4, t5;			// second tangent

		const int v0 = indexes[i + 0];
		const int v1 = indexes[i + 1];
		const int v2 = indexes[i + 2];

		a = verts + v0;
		b = verts + v1;
		c = verts + v2;

		d0 = b->xyz[0] - a->xyz[0];
		d1 = b->xyz[1] - a->xyz[1];
		d2 = b->xyz[2] - a->xyz[2];
		d3 = b->st[0] - a->st[0];
		d4 = b->st[1] - a->st[1];

		d5 = c->xyz[0] - a->xyz[0];
		d6 = c->xyz[1] - a->xyz[1];
		d7 = c->xyz[2] - a->xyz[2];
		d8 = c->st[0] - a->st[0];
		d9 = c->st[1] - a->st[1];

#if 1

		__asm {
			// normal: cross product of the two edges (scalar lanes)
			movss xmm0, d6
			mulss xmm0, d2
			movss xmm1, d7
			mulss xmm1, d1
			subss xmm0, xmm1

			movss xmm1, d7
			mulss xmm1, d0
			movss xmm2, d5
			mulss xmm2, d2
			subss xmm1, xmm2

			movss xmm2, d5
			mulss xmm2, d1
			movss xmm3, d6
			mulss xmm3, d0
			subss xmm2, xmm3

			// squared length in xmm3
			movss xmm3, xmm0
			movss xmm4, xmm1
			movss xmm5, xmm2

			mulss xmm3, xmm3
			mulss xmm4, xmm4
			mulss xmm5, xmm5

			addss xmm3, xmm4
			addss xmm3, xmm5

#ifdef FIX_DEGENERATE_TANGENT
			xorps xmm4, xmm4
			cmpeqps xmm4, xmm3
			andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
			andps xmm3, SIMD_SP_absMask // make sure the values are positive
			orps xmm3, xmm4
#endif

#ifdef REFINE_TANGENT_SQUAREROOT
			rsqrtss xmm4, xmm3
			mulss xmm3, xmm4
			mulss xmm3, xmm4
			subss xmm3, xmm6
			mulss xmm4, xmm7
			mulss xmm3, xmm4
#else
			rsqrtss xmm3, xmm3				// approximate 1/sqrt(len^2)
#endif
			mulss xmm0, xmm3
			movss n0, xmm0
			mulss xmm1, xmm3
			movss n1, xmm1
			mulss xmm2, xmm3
			movss n2, xmm2

			// area sign bit: sign of the texture-space area d3*d9 - d4*d8
			movss xmm0, d3
			mulss xmm0, d9
			movss xmm1, d4
			mulss xmm1, d8
			subss xmm0, xmm1
			andps xmm0, SIMD_SP_signBitMask
			movaps signBit, xmm0

			// first tangent
			movss xmm0, d0
			mulss xmm0, d9
			movss xmm1, d4
			mulss xmm1, d5
			subss xmm0, xmm1

			movss xmm1, d1
			mulss xmm1, d9
			movss xmm2, d4
			mulss xmm2, d6
			subss xmm1, xmm2

			movss xmm2, d2
			mulss xmm2, d9
			movss xmm3, d4
			mulss xmm3, d7
			subss xmm2, xmm3

			movss xmm3, xmm0
			movss xmm4, xmm1
			movss xmm5, xmm2

			mulss xmm3, xmm3
			mulss xmm4, xmm4
			mulss xmm5, xmm5

			addss xmm3, xmm4
			addss xmm3, xmm5

#ifdef FIX_DEGENERATE_TANGENT
			xorps xmm4, xmm4
			cmpeqps xmm4, xmm3
			andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
			andps xmm3, SIMD_SP_absMask // make sure the values are positive
			orps xmm3, xmm4
#endif

#ifdef REFINE_TANGENT_SQUAREROOT
			rsqrtss xmm4, xmm3
			mulss xmm3, xmm4
			mulss xmm3, xmm4
			subss xmm3, xmm6
			mulss xmm4, xmm7
			mulss xmm3, xmm4
#else
			rsqrtss xmm3, xmm3
#endif
			// flip the tangent for mirrored (negative texture-area) triangles
			xorps xmm3, signBit

			mulss xmm0, xmm3
			movss t0, xmm0
			mulss xmm1, xmm3
			movss t1, xmm1
			mulss xmm2, xmm3
			movss t2, xmm2

			// second tangent
			movss xmm0, d3
			mulss xmm0, d5
			movss xmm1, d0
			mulss xmm1, d8
			subss xmm0, xmm1

			movss xmm1, d3
			mulss xmm1, d6
			movss xmm2, d1
			mulss xmm2, d8
			subss xmm1, xmm2

			movss xmm2, d3
			mulss xmm2, d7
			movss xmm3, d2
			mulss xmm3, d8
			subss xmm2, xmm3

			movss xmm3, xmm0
			movss xmm4, xmm1
			movss xmm5, xmm2

			mulss xmm3, xmm3
			mulss xmm4, xmm4
			mulss xmm5, xmm5

			addss xmm3, xmm4
			addss xmm3, xmm5

#ifdef FIX_DEGENERATE_TANGENT
			xorps xmm4, xmm4
			cmpeqps xmm4, xmm3
			andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
			andps xmm3, SIMD_SP_absMask // make sure the values are positive
			orps xmm3, xmm4
#endif

#ifdef REFINE_TANGENT_SQUAREROOT
			rsqrtss xmm4, xmm3
			mulss xmm3, xmm4
			mulss xmm3, xmm4
			subss xmm3, xmm6
			mulss xmm4, xmm7
			mulss xmm3, xmm4
#else
			rsqrtss xmm3, xmm3
#endif
			xorps xmm3, signBit

			mulss xmm0, xmm3
			movss t3, xmm0
			mulss xmm1, xmm3
			movss t4, xmm1
			mulss xmm2, xmm3
			movss t5, xmm2
		}

#else

		// scalar reference implementation of the asm block above;
		// note: this path lacks the FIX_DEGENERATE_TANGENT zero-length guard
		float tmp;

		// normal
		n0 = d6 * d2 - d7 * d1;
		n1 = d7 * d0 - d5 * d2;
		n2 = d5 * d1 - d6 * d0;

		tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 );

		n0 *= tmp;
		n1 *= tmp;
		n2 *= tmp;

		// area sign bit
		tmp = d3 * d9 - d4 * d8;
		signBit[0] = ( *(unsigned long *)&tmp ) & ( 1 << 31 );

		// first tangent
		t0 = d0 * d9 - d4 * d5;
		t1 = d1 * d9 - d4 * d6;
		t2 = d2 * d9 - d4 * d7;

		tmp = idMath::RSqrt( t0 * t0 + t1 * t1 + t2 * t2 );
		*(unsigned long *)&tmp ^= signBit[0];

		t0 *= tmp;
		t1 *= tmp;
		t2 *= tmp;

		// second tangent
		t3 = d3 * d5 - d0 * d8;
		t4 = d3 * d6 - d1 * d8;
		t5 = d3 * d7 - d2 * d8;

		tmp = idMath::RSqrt( t3 * t3 + t4 * t4 + t5 * t5 );
		*(unsigned long *)&tmp ^= signBit[0];

		t3 *= tmp;
		t4 *= tmp;
		t5 *= tmp;

#endif

		// plane through vertex a with the triangle normal
		planes->Normal()[0] = n0;
		planes->Normal()[1] = n1;
		planes->Normal()[2] = n2;
		planes->FitThroughPoint( a->xyz );
		planes++;

		// accumulate on vertices already touched, assign on first touch
		if ( used[v0] ) {
			a->normal[0] += n0;
			a->normal[1] += n1;
			a->normal[2] += n2;

			a->tangents[0][0] += t0;
			a->tangents[0][1] += t1;
			a->tangents[0][2] += t2;

			a->tangents[1][0] += t3;
			a->tangents[1][1] += t4;
			a->tangents[1][2] += t5;
		} else {
			a->normal[0] = n0;
			a->normal[1] = n1;
			a->normal[2] = n2;

			a->tangents[0][0] = t0;
			a->tangents[0][1] = t1;
			a->tangents[0][2] = t2;

			a->tangents[1][0] = t3;
			a->tangents[1][1] = t4;
			a->tangents[1][2] = t5;

			used[v0] = true;
		}

		if ( used[v1] ) {
			b->normal[0] += n0;
			b->normal[1] += n1;
			b->normal[2] += n2;

			b->tangents[0][0] += t0;
			b->tangents[0][1] += t1;
			b->tangents[0][2] += t2;

			b->tangents[1][0] += t3;
			b->tangents[1][1] += t4;
			b->tangents[1][2] += t5;
		} else {
			b->normal[0] = n0;
			b->normal[1] = n1;
			b->normal[2] = n2;

			b->tangents[0][0] = t0;
			b->tangents[0][1] = t1;
			b->tangents[0][2] = t2;

			b->tangents[1][0] = t3;
			b->tangents[1][1] = t4;
			b->tangents[1][2] = t5;

			used[v1] = true;
		}

		if ( used[v2] ) {
			c->normal[0] += n0;
			c->normal[1] += n1;
			c->normal[2] += n2;

			c->tangents[0][0] += t0;
			c->tangents[0][1] += t1;
			c->tangents[0][2] += t2;

			c->tangents[1][0] += t3;
			c->tangents[1][1] += t4;
			c->tangents[1][2] += t5;
		} else {
			c->normal[0] = n0;
			c->normal[1] = n1;
			c->normal[2] = n2;

			c->tangents[0][0] = t0;
			c->tangents[0][1] = t1;
			c->tangents[0][2] = t2;

			c->tangents[1][0] = t3;
			c->tangents[1][1] = t4;
			c->tangents[1][2] = t5;

			used[v2] = true;
		}
	}
}
14568 
14569 /*
14570 ============
14571 idSIMD_SSE::DeriveUnsmoothedTangents
14572 ============
14573 */
14574 #define DERIVE_UNSMOOTHED_BITANGENT
14575 
void VPCALL idSIMD_SSE::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
	// Derives the tangent space (normal + two tangents) for each vertex from a
	// single precomputed "dominant" triangle (dt.v2 / dt.v3 are the other two
	// corners) instead of averaging over all triangles that share the vertex.
	// The lengths of the resulting vectors are controlled by the precomputed
	// dt.normalizationScale factors, so no square roots are taken here.
	// With DERIVE_UNSMOOTHED_BITANGENT defined (it is, above), the second
	// tangent is built as the cross product of the first tangent and the
	// normal rather than from the texture-coordinate gradients.
	// Vertices are processed four at a time with SSE (gathered into SoA
	// arrays); a scalar loop at the bottom handles the remaining 0-3 verts.
	int i, j;

	for ( i = 0; i <= numVerts - 4; i += 4 ) {
		// SoA staging buffers: one lane per vertex of the 4-wide batch.
		ALIGN16( float s0[4] );			// normalizationScale[0] (first tangent scale)
		ALIGN16( float s1[4] );			// normalizationScale[1] (second tangent scale)
		ALIGN16( float s2[4] );			// normalizationScale[2] (normal scale)
		ALIGN16( float d0[4] );			// b - a deltas: xyz (d0..d2) and st (d3,d4)
		ALIGN16( float d1[4] );
		ALIGN16( float d2[4] );
		ALIGN16( float d3[4] );
		ALIGN16( float d4[4] );
		ALIGN16( float d5[4] );			// c - a deltas: xyz (d5..d7) and st (d8,d9)
		ALIGN16( float d6[4] );
		ALIGN16( float d7[4] );
		ALIGN16( float d8[4] );
		ALIGN16( float d9[4] );
		ALIGN16( float n0[4] );			// resulting normal
		ALIGN16( float n1[4] );
		ALIGN16( float n2[4] );
		ALIGN16( float t0[4] );			// resulting first tangent
		ALIGN16( float t1[4] );
		ALIGN16( float t2[4] );
		ALIGN16( float t3[4] );			// resulting second tangent (bitangent)
		ALIGN16( float t4[4] );
		ALIGN16( float t5[4] );

		// gather: scatter the 4 dominant triangles' edge deltas into SoA form
		for ( j = 0; j < 4; j++ ) {
			const idDrawVert *a, *b, *c;

			const dominantTri_s &dt = dominantTris[i+j];

			s0[j] = dt.normalizationScale[0];
			s1[j] = dt.normalizationScale[1];
			s2[j] = dt.normalizationScale[2];

			a = verts + i + j;
			b = verts + dt.v2;
			c = verts + dt.v3;

			d0[j] = b->xyz[0] - a->xyz[0];
			d1[j] = b->xyz[1] - a->xyz[1];
			d2[j] = b->xyz[2] - a->xyz[2];
			d3[j] = b->st[0] - a->st[0];
			d4[j] = b->st[1] - a->st[1];

			d5[j] = c->xyz[0] - a->xyz[0];
			d6[j] = c->xyz[1] - a->xyz[1];
			d7[j] = c->xyz[2] - a->xyz[2];
			d8[j] = c->st[0] - a->st[0];
			d9[j] = c->st[1] - a->st[1];
		}

#if 1

		__asm {

			// normal: n = s2 * ( (c-a) x (b-a) ), i.e. n0 = s2*(d6*d2 - d7*d1) etc.
			movaps xmm0, d6
			mulps xmm0, d2
			movaps xmm1, d7
			mulps xmm1, d1

			movaps xmm2, d7
			mulps xmm2, d0
			movaps xmm3, d5
			mulps xmm3, d2

			movaps xmm4, d5
			mulps xmm4, d1
			movaps xmm5, d6
			mulps xmm5, d0

			subps xmm0, xmm1
			subps xmm2, xmm3
			movaps xmm7, s2
			subps xmm4, xmm5

			mulps xmm0, xmm7
			movaps n0, xmm0
			mulps xmm2, xmm7
			movaps n1, xmm2
			mulps xmm4, xmm7
			movaps n2, xmm4

			// first tangent: t = s0 * ( d9*(b-a).xyz - d4*(c-a).xyz )
			movaps xmm0, d0
			mulps xmm0, d9
			movaps xmm1, d4
			mulps xmm1, d5

			movaps xmm2, d1
			mulps xmm2, d9
			movaps xmm3, d4
			mulps xmm3, d6

			movaps xmm4, d2
			mulps xmm4, d9
			movaps xmm5, d4
			mulps xmm5, d7

			subps xmm0, xmm1
			subps xmm2, xmm3
			movaps xmm7, s0
			subps xmm4, xmm5

			mulps xmm0, xmm7
			movaps t0, xmm0
			mulps xmm2, xmm7
			movaps t1, xmm2
			mulps xmm4, xmm7
			movaps t2, xmm4

#ifndef DERIVE_UNSMOOTHED_BITANGENT
			// second tangent from texture gradients: s1 * ( d3*(c-a).xyz - d8*(b-a).xyz )
			movaps xmm0, d3
			mulps xmm0, d5
			movaps xmm1, d0
			mulps xmm1, d8

			movaps xmm2, d3
			mulps xmm2, d6
			movaps xmm3, d1
			mulps xmm3, d8

			movaps xmm4, d3
			mulps xmm4, d7
			movaps xmm5, d2
			mulps xmm5, d8
#else
			// second tangent as cross product: s1 * ( t x n )
			movaps xmm0, n2
			mulps xmm0, t1
			movaps xmm1, n1
			mulps xmm1, t2

			movaps xmm2, n0
			mulps xmm2, t2
			movaps xmm3, n2
			mulps xmm3, t0

			movaps xmm4, n1
			mulps xmm4, t0
			movaps xmm5, n0
			mulps xmm5, t1
#endif
			subps xmm0, xmm1
			subps xmm2, xmm3
			movaps xmm7, s1
			subps xmm4, xmm5

			mulps xmm0, xmm7
			movaps t3, xmm0
			mulps xmm2, xmm7
			movaps t4, xmm2
			mulps xmm4, xmm7
			movaps t5, xmm4
		}

#else

		// scalar reference implementation of the SSE block above

		n0[0] = d6[0] * d2[0];
		n0[1] = d6[1] * d2[1];
		n0[2] = d6[2] * d2[2];
		n0[3] = d6[3] * d2[3];

		n1[0] = d7[0] * d0[0];
		n1[1] = d7[1] * d0[1];
		n1[2] = d7[2] * d0[2];
		n1[3] = d7[3] * d0[3];

		n2[0] = d5[0] * d1[0];
		n2[1] = d5[1] * d1[1];
		n2[2] = d5[2] * d1[2];
		n2[3] = d5[3] * d1[3];

		n0[0] -= d7[0] * d1[0];
		n0[1] -= d7[1] * d1[1];
		n0[2] -= d7[2] * d1[2];
		n0[3] -= d7[3] * d1[3];

		n1[0] -= d5[0] * d2[0];
		n1[1] -= d5[1] * d2[1];
		n1[2] -= d5[2] * d2[2];
		n1[3] -= d5[3] * d2[3];

		n2[0] -= d6[0] * d0[0];
		n2[1] -= d6[1] * d0[1];
		n2[2] -= d6[2] * d0[2];
		n2[3] -= d6[3] * d0[3];

		n0[0] *= s2[0];
		n0[1] *= s2[1];
		n0[2] *= s2[2];
		n0[3] *= s2[3];

		n1[0] *= s2[0];
		n1[1] *= s2[1];
		n1[2] *= s2[2];
		n1[3] *= s2[3];

		n2[0] *= s2[0];
		n2[1] *= s2[1];
		n2[2] *= s2[2];
		n2[3] *= s2[3];

		t0[0] = d0[0] * d9[0];
		t0[1] = d0[1] * d9[1];
		t0[2] = d0[2] * d9[2];
		t0[3] = d0[3] * d9[3];

		t1[0] = d1[0] * d9[0];
		t1[1] = d1[1] * d9[1];
		t1[2] = d1[2] * d9[2];
		t1[3] = d1[3] * d9[3];

		t2[0] = d2[0] * d9[0];
		t2[1] = d2[1] * d9[1];
		t2[2] = d2[2] * d9[2];
		t2[3] = d2[3] * d9[3];

		t0[0] -= d4[0] * d5[0];
		t0[1] -= d4[1] * d5[1];
		t0[2] -= d4[2] * d5[2];
		t0[3] -= d4[3] * d5[3];

		t1[0] -= d4[0] * d6[0];
		t1[1] -= d4[1] * d6[1];
		t1[2] -= d4[2] * d6[2];
		t1[3] -= d4[3] * d6[3];

		t2[0] -= d4[0] * d7[0];
		t2[1] -= d4[1] * d7[1];
		t2[2] -= d4[2] * d7[2];
		t2[3] -= d4[3] * d7[3];

		t0[0] *= s0[0];
		t0[1] *= s0[1];
		t0[2] *= s0[2];
		t0[3] *= s0[3];

		t1[0] *= s0[0];
		t1[1] *= s0[1];
		t1[2] *= s0[2];
		t1[3] *= s0[3];

		t2[0] *= s0[0];
		t2[1] *= s0[1];
		t2[2] *= s0[2];
		t2[3] *= s0[3];

#ifndef DERIVE_UNSMOOTHED_BITANGENT
		t3[0] = d3[0] * d5[0];
		t3[1] = d3[1] * d5[1];
		t3[2] = d3[2] * d5[2];
		t3[3] = d3[3] * d5[3];

		t4[0] = d3[0] * d6[0];
		t4[1] = d3[1] * d6[1];
		t4[2] = d3[2] * d6[2];
		t4[3] = d3[3] * d6[3];

		t5[0] = d3[0] * d7[0];
		t5[1] = d3[1] * d7[1];
		t5[2] = d3[2] * d7[2];
		t5[3] = d3[3] * d7[3];

		t3[0] -= d0[0] * d8[0];
		t3[1] -= d0[1] * d8[1];
		t3[2] -= d0[2] * d8[2];
		t3[3] -= d0[3] * d8[3];

		t4[0] -= d1[0] * d8[0];
		t4[1] -= d1[1] * d8[1];
		t4[2] -= d1[2] * d8[2];
		t4[3] -= d1[3] * d8[3];

		t5[0] -= d2[0] * d8[0];
		t5[1] -= d2[1] * d8[1];
		t5[2] -= d2[2] * d8[2];
		t5[3] -= d2[3] * d8[3];
#else
		t3[0] = n2[0] * t1[0];
		t3[1] = n2[1] * t1[1];
		t3[2] = n2[2] * t1[2];
		t3[3] = n2[3] * t1[3];

		t4[0] = n0[0] * t2[0];
		t4[1] = n0[1] * t2[1];
		t4[2] = n0[2] * t2[2];
		t4[3] = n0[3] * t2[3];

		t5[0] = n1[0] * t0[0];
		t5[1] = n1[1] * t0[1];
		t5[2] = n1[2] * t0[2];
		t5[3] = n1[3] * t0[3];

		t3[0] -= n1[0] * t2[0];
		t3[1] -= n1[1] * t2[1];
		t3[2] -= n1[2] * t2[2];
		t3[3] -= n1[3] * t2[3];

		t4[0] -= n2[0] * t0[0];
		t4[1] -= n2[1] * t0[1];
		t4[2] -= n2[2] * t0[2];
		t4[3] -= n2[3] * t0[3];

		t5[0] -= n0[0] * t1[0];
		t5[1] -= n0[1] * t1[1];
		t5[2] -= n0[2] * t1[2];
		t5[3] -= n0[3] * t1[3];
#endif
		t3[0] *= s1[0];
		t3[1] *= s1[1];
		t3[2] *= s1[2];
		t3[3] *= s1[3];

		t4[0] *= s1[0];
		t4[1] *= s1[1];
		t4[2] *= s1[2];
		t4[3] *= s1[3];

		t5[0] *= s1[0];
		t5[1] *= s1[1];
		t5[2] *= s1[2];
		t5[3] *= s1[3];

#endif

		// scatter: write the 4 computed tangent spaces back to the verts
		for ( j = 0; j < 4; j++ ) {
			idDrawVert *a;

			a = verts + i + j;

			a->normal[0] = n0[j];
			a->normal[1] = n1[j];
			a->normal[2] = n2[j];

			a->tangents[0][0] = t0[j];
			a->tangents[0][1] = t1[j];
			a->tangents[0][2] = t2[j];

			a->tangents[1][0] = t3[j];
			a->tangents[1][1] = t4[j];
			a->tangents[1][2] = t5[j];
		}
	}

	// scalar tail: handle the remaining numVerts % 4 vertices one at a time
	for ( ; i < numVerts; i++ ) {
		idDrawVert *a, *b, *c;
		float d0, d1, d2, d3, d4;		// b - a deltas: xyz and st
		float d5, d6, d7, d8, d9;		// c - a deltas: xyz and st
		float s0, s1, s2;				// normalization scales
		float n0, n1, n2;				// normal
		float t0, t1, t2;				// first tangent
		float t3, t4, t5;				// second tangent

		const dominantTri_s &dt = dominantTris[i];

		s0 = dt.normalizationScale[0];
		s1 = dt.normalizationScale[1];
		s2 = dt.normalizationScale[2];

		a = verts + i;
		b = verts + dt.v2;
		c = verts + dt.v3;

		d0 = b->xyz[0] - a->xyz[0];
		d1 = b->xyz[1] - a->xyz[1];
		d2 = b->xyz[2] - a->xyz[2];
		d3 = b->st[0] - a->st[0];
		d4 = b->st[1] - a->st[1];

		d5 = c->xyz[0] - a->xyz[0];
		d6 = c->xyz[1] - a->xyz[1];
		d7 = c->xyz[2] - a->xyz[2];
		d8 = c->st[0] - a->st[0];
		d9 = c->st[1] - a->st[1];

#if 1

		// scalar-SSE version of the 4-wide block above; see the #else branch
		// below for the plain C equivalent
		__asm {

			movss xmm0, d6
			mulss xmm0, d2
			movss xmm1, d7
			mulss xmm1, d1

			movss xmm2, d7
			mulss xmm2, d0
			movss xmm3, d5
			mulss xmm3, d2

			movss xmm4, d5
			mulss xmm4, d1
			movss xmm5, d6
			mulss xmm5, d0

			subss xmm0, xmm1
			subss xmm2, xmm3
			movss xmm7, s2
			subss xmm4, xmm5

			mulss xmm0, xmm7
			movss n0, xmm0
			mulss xmm2, xmm7
			movss n1, xmm2
			mulss xmm4, xmm7
			movss n2, xmm4

			movss xmm0, d0
			mulss xmm0, d9
			movss xmm1, d4
			mulss xmm1, d5

			movss xmm2, d1
			mulss xmm2, d9
			movss xmm3, d4
			mulss xmm3, d6

			movss xmm4, d2
			mulss xmm4, d9
			movss xmm5, d4
			mulss xmm5, d7

			subss xmm0, xmm1
			subss xmm2, xmm3
			movss xmm7, s0
			subss xmm4, xmm5

			mulss xmm0, xmm7
			movss t0, xmm0
			mulss xmm2, xmm7
			movss t1, xmm2
			mulss xmm4, xmm7
			movss t2, xmm4

#ifndef DERIVE_UNSMOOTHED_BITANGENT
			movss xmm0, d3
			mulss xmm0, d5
			movss xmm1, d0
			mulss xmm1, d8

			movss xmm2, d3
			mulss xmm2, d6
			movss xmm3, d1
			mulss xmm3, d8

			movss xmm4, d3
			mulss xmm4, d7
			movss xmm5, d2
			mulss xmm5, d8
#else
			movss xmm0, n2
			mulss xmm0, t1
			movss xmm1, n1
			mulss xmm1, t2

			movss xmm2, n0
			mulss xmm2, t2
			movss xmm3, n2
			mulss xmm3, t0

			movss xmm4, n1
			mulss xmm4, t0
			movss xmm5, n0
			mulss xmm5, t1
#endif
			subss xmm0, xmm1
			subss xmm2, xmm3
			movss xmm7, s1
			subss xmm4, xmm5

			mulss xmm0, xmm7
			movss t3, xmm0
			mulss xmm2, xmm7
			movss t4, xmm2
			mulss xmm4, xmm7
			movss t5, xmm4
		}

#else

		n0 = s2 * ( d6 * d2 - d7 * d1 );
		n1 = s2 * ( d7 * d0 - d5 * d2 );
		n2 = s2 * ( d5 * d1 - d6 * d0 );

		t0 = s0 * ( d0 * d9 - d4 * d5 );
		t1 = s0 * ( d1 * d9 - d4 * d6 );
		t2 = s0 * ( d2 * d9 - d4 * d7 );

#ifndef DERIVE_UNSMOOTHED_BITANGENT
		t3 = s1 * ( d3 * d5 - d0 * d8 );
		t4 = s1 * ( d3 * d6 - d1 * d8 );
		t5 = s1 * ( d3 * d7 - d2 * d8 );
#else
		t3 = s1 * ( n2 * t1 - n1 * t2 );
		t4 = s1 * ( n0 * t2 - n2 * t0 );
		t5 = s1 * ( n1 * t0 - n0 * t1 );
#endif

#endif

		a->normal[0] = n0;
		a->normal[1] = n1;
		a->normal[2] = n2;

		a->tangents[0][0] = t0;
		a->tangents[0][1] = t1;
		a->tangents[0][2] = t2;

		a->tangents[1][0] = t3;
		a->tangents[1][1] = t4;
		a->tangents[1][2] = t5;
	}
}
15088 
15089 /*
15090 ============
15091 idSIMD_SSE::NormalizeTangents
15092 ============
15093 */
void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
	// For every vertex: normalizes the normal, then makes each of the two
	// tangents orthogonal to it ( t -= (t.n) * n ) and normalizes them too.
	// The main loop processes 4 verts at a time in SoA form; the normalized
	// normals are cached in the scratch array below (x[4], y[4], z[4]) so the
	// two tangent projections can reuse them. A scalar loop (loopVert1)
	// handles the remaining 0-3 verts. Throughout the asm, eax is a negative
	// byte offset from the end of the vertex array (esi), counting up to 0.
	// With REFINE_TANGENT_SQUAREROOT defined, the rsqrt estimate is refined
	// with one Newton-Raphson step using the SIMD_SP_rsqrt_c0/c1 constants
	// kept in xmm6/xmm7; otherwise the raw rsqrt approximation is used.
	ALIGN16( float normal[12] );		// 4 normalized normals: x[0..3], y[0..3], z[0..3]

	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
	assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
	assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );

	assert( verts != NULL );
	assert( numVerts >= 0 );

	__asm {
		mov eax, numVerts
		test eax, eax
		jz done
#ifdef REFINE_TANGENT_SQUAREROOT
		movaps xmm6, SIMD_SP_rsqrt_c0
		movaps xmm7, SIMD_SP_rsqrt_c1
#endif
		mov esi, verts
		imul eax, DRAWVERT_SIZE
		add esi, eax						// esi = one past the end of the vert array
		neg eax								// eax = -numVerts * DRAWVERT_SIZE
		add eax, DRAWVERT_SIZE*4
		jle loopVert4						// at least 4 verts left

		sub eax, DRAWVERT_SIZE*4
		jl loopVert1						// fewer than 4 verts total

	loopVert4:

		sub eax, DRAWVERT_SIZE*4

		// normalize 4 idDrawVert::normal

		// gather the 4 normals and transpose them into SoA form (x, y, z lanes)
		movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0]	// 0, X, X, X
		movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0]	// 0, X, 3, 4
		movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8]	// 5, X, X, X
		movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4]	// 5, X, 1, 2
		movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0]	// 6, X, X, X
		movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0]	// 6, X, 9, 10
		movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8]	// 11, X, X, X
		movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4]	// 11, X, 7, 8

		movaps xmm1, xmm0
		movaps xmm5, xmm2
		shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )	// 0, 3, 6, 9
		shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )	// 2, 5, 8, 11
		shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 )	// 4, 4, 1, 1
		shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 )	// 10, 10, 7, 7
		shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 )	// 1, 4, 7, 10

		// squared length of each normal
		movaps xmm3, xmm0
		movaps xmm4, xmm1
		movaps xmm5, xmm2

		mulps xmm3, xmm3
		mulps xmm4, xmm4
		mulps xmm5, xmm5
		addps xmm3, xmm4
		addps xmm3, xmm5

#ifdef REFINE_TANGENT_SQUAREROOT
		rsqrtps xmm4, xmm3
		mulps xmm3, xmm4
		mulps xmm3, xmm4
		subps xmm3, xmm6
		mulps xmm4, xmm7
		mulps xmm3, xmm4
#else
		rsqrtps xmm3, xmm3
#endif

		mulps xmm0, xmm3
		mulps xmm1, xmm3
		mulps xmm2, xmm3

		// save the 4 idDrawVert::normal to project the tangents

		movaps [normal+ 0], xmm0
		movaps [normal+16], xmm1
		movaps [normal+32], xmm2

		// scatter the 4 normalized normals back (rotate lanes between stores)
		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+8], xmm2

		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8], xmm2

		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+8], xmm2

		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8], xmm2

		// project and normalize 4 idDrawVert::tangent[0]

		movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0]	// 0, X, X, X
		movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0]	// 0, X, 3, 4
		movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8]	// 5, X, X, X
		movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4]	// 5, X, 1, 2
		movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0]	// 6, X, X, X
		movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0]	// 6, X, 9, 10
		movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8]	// 11, X, X, X
		movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4]	// 11, X, 7, 8

		movaps xmm1, xmm0
		movaps xmm5, xmm2
		shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )	// 0, 3, 6, 9
		shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )	// 2, 5, 8, 11
		shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 )	// 4, 4, 1, 1
		shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 )	// 10, 10, 7, 7
		shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 )	// 1, 4, 7, 10

		// xmm3 = dot( tangent, normal )
		movaps xmm3, xmm0
		movaps xmm4, xmm1
		movaps xmm5, xmm2

		mulps xmm3, [normal+ 0]
		mulps xmm4, [normal+16]
		mulps xmm5, [normal+32]
		addps xmm3, xmm4
		addps xmm3, xmm5

		// tangent -= dot * normal (Gram-Schmidt projection)
		movaps xmm4, xmm3
		movaps xmm5, xmm3
		mulps xmm3, [normal+ 0]
		mulps xmm4, [normal+16]
		mulps xmm5, [normal+32]
		subps xmm0, xmm3
		subps xmm1, xmm4
		subps xmm2, xmm5

		movaps xmm3, xmm0
		movaps xmm4, xmm1
		movaps xmm5, xmm2

		mulps xmm3, xmm3
		mulps xmm4, xmm4
		mulps xmm5, xmm5
		addps xmm3, xmm4
		addps xmm3, xmm5

#ifdef REFINE_TANGENT_SQUAREROOT
		rsqrtps xmm4, xmm3
		mulps xmm3, xmm4
		mulps xmm3, xmm4
		subps xmm3, xmm6
		mulps xmm4, xmm7
		mulps xmm3, xmm4
#else
		rsqrtps xmm3, xmm3
#endif

		mulps xmm0, xmm3
		mulps xmm1, xmm3
		mulps xmm2, xmm3

		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+8], xmm2

		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8], xmm2

		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+8], xmm2

		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8], xmm2

		// project and normalize 4 idDrawVert::tangent[1]

		movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0]	// 0, X, X, X
		movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0]	// 0, X, 3, 4
		movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8]	// 5, X, X, X
		movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4]	// 5, X, 1, 2
		movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0]	// 6, X, X, X
		movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0]	// 6, X, 9, 10
		movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8]	// 11, X, X, X
		movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4]	// 11, X, 7, 8

		movaps xmm1, xmm0
		movaps xmm5, xmm2
		shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )	// 0, 3, 6, 9
		shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )	// 2, 5, 8, 11
		shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 )	// 4, 4, 1, 1
		shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 )	// 10, 10, 7, 7
		shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 )	// 1, 4, 7, 10

		movaps xmm3, xmm0
		movaps xmm4, xmm1
		movaps xmm5, xmm2

		mulps xmm3, [normal+ 0]
		mulps xmm4, [normal+16]
		mulps xmm5, [normal+32]
		addps xmm3, xmm4
		addps xmm3, xmm5

		movaps xmm4, xmm3
		movaps xmm5, xmm3
		mulps xmm3, [normal+ 0]
		mulps xmm4, [normal+16]
		mulps xmm5, [normal+32]
		subps xmm0, xmm3
		subps xmm1, xmm4
		subps xmm2, xmm5

		movaps xmm3, xmm0
		movaps xmm4, xmm1
		movaps xmm5, xmm2

		mulps xmm3, xmm3
		mulps xmm4, xmm4
		mulps xmm5, xmm5
		addps xmm3, xmm4
		addps xmm3, xmm5

#ifdef REFINE_TANGENT_SQUAREROOT
		rsqrtps xmm4, xmm3
		mulps xmm3, xmm4
		mulps xmm3, xmm4
		subps xmm3, xmm6
		mulps xmm4, xmm7
		mulps xmm3, xmm4
#else
		rsqrtps xmm3, xmm3
#endif

		mulps xmm0, xmm3
		mulps xmm1, xmm3
		mulps xmm2, xmm3

		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+8], xmm2

		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8], xmm2

		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+8], xmm2

		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8], xmm2

		add eax, DRAWVERT_SIZE*8			// undo the sub at loop top and advance 4 verts

		jle loopVert4

		sub eax, DRAWVERT_SIZE*4
		jge done

	loopVert1:

		// normalize one idDrawVert::normal

		movss xmm0, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
		movss xmm1, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
		movss xmm2, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
		movss xmm3, xmm0
		movss xmm4, xmm1
		movss xmm5, xmm2

		mulss xmm3, xmm3
		mulss xmm4, xmm4
		mulss xmm5, xmm5
		addss xmm3, xmm4
		addss xmm3, xmm5

#ifdef REFINE_TANGENT_SQUAREROOT
		rsqrtss xmm4, xmm3
		mulss xmm3, xmm4
		mulss xmm3, xmm4
		subss xmm3, xmm6
		mulss xmm4, xmm7
		mulss xmm3, xmm4
#else
		rsqrtss xmm3, xmm3
#endif

		mulss xmm0, xmm3
		mulss xmm1, xmm3
		mulss xmm2, xmm3

		movss [esi+eax+DRAWVERT_NORMAL_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_NORMAL_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_NORMAL_OFFSET+8], xmm2

		// project and normalize one idDrawVert::tangent[0]

		movss xmm0, [esi+eax+DRAWVERT_TANGENT0_OFFSET+0]
		movss xmm1, [esi+eax+DRAWVERT_TANGENT0_OFFSET+4]
		movss xmm2, [esi+eax+DRAWVERT_TANGENT0_OFFSET+8]
		movss xmm3, xmm0
		movss xmm4, xmm1
		movss xmm5, xmm2

		mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
		mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
		mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
		addss xmm3, xmm4
		addss xmm3, xmm5

		movss xmm4, xmm3
		movss xmm5, xmm3
		mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
		mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
		mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
		subss xmm0, xmm3
		subss xmm1, xmm4
		subss xmm2, xmm5

		movss xmm3, xmm0
		movss xmm4, xmm1
		movss xmm5, xmm2

		mulss xmm3, xmm3
		mulss xmm4, xmm4
		mulss xmm5, xmm5
		addss xmm3, xmm4
		addss xmm3, xmm5

#ifdef REFINE_TANGENT_SQUAREROOT
		rsqrtss xmm4, xmm3
		mulss xmm3, xmm4
		mulss xmm3, xmm4
		subss xmm3, xmm6
		mulss xmm4, xmm7
		mulss xmm3, xmm4
#else
		rsqrtss xmm3, xmm3
#endif

		mulss xmm0, xmm3
		mulss xmm1, xmm3
		mulss xmm2, xmm3

		movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+8], xmm2

		// project and normalize one idDrawVert::tangent[1]

		movss xmm0, [esi+eax+DRAWVERT_TANGENT1_OFFSET+0]
		movss xmm1, [esi+eax+DRAWVERT_TANGENT1_OFFSET+4]
		movss xmm2, [esi+eax+DRAWVERT_TANGENT1_OFFSET+8]
		movss xmm3, xmm0
		movss xmm4, xmm1
		movss xmm5, xmm2

		mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
		mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
		mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
		addss xmm3, xmm4
		addss xmm3, xmm5

		movss xmm4, xmm3
		movss xmm5, xmm3
		mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
		mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
		mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
		subss xmm0, xmm3
		subss xmm1, xmm4
		subss xmm2, xmm5

		movss xmm3, xmm0
		movss xmm4, xmm1
		movss xmm5, xmm2

		mulss xmm3, xmm3
		mulss xmm4, xmm4
		mulss xmm5, xmm5
		addss xmm3, xmm4
		addss xmm3, xmm5

#ifdef REFINE_TANGENT_SQUAREROOT
		rsqrtss xmm4, xmm3
		mulss xmm3, xmm4
		mulss xmm3, xmm4
		subss xmm3, xmm6
		mulss xmm4, xmm7
		mulss xmm3, xmm4
#else
		rsqrtss xmm3, xmm3
#endif

		mulss xmm0, xmm3
		mulss xmm1, xmm3
		mulss xmm2, xmm3

		movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+8], xmm2

		add eax, DRAWVERT_SIZE

		jl loopVert1
	done:
	}
}
15542 
15543 /*
15544 ============
15545 idSIMD_SSE::CreateTextureSpaceLightVectors
15546 ============
15547 */
15548 void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
15549 
15550  assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
15551  assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
15552  assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
15553  assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
15554  assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
15555 
15556  bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
15557  memset( used, 0, numVerts * sizeof( used[0] ) );
15558 
15559  for ( int i = numIndexes - 1; i >= 0; i-- ) {
15560  used[indexes[i]] = true;
15561  }
15562 
15563 #if 0
15564 
15565  __asm {
15566 
15567  mov eax, numVerts
15568 
15569  mov esi, used
15570  add esi, eax
15571 
15572  mov edi, verts
15573  sub edi, DRAWVERT_SIZE
15574 
15575  neg eax
15576  dec eax
15577 
15578  mov ecx, lightOrigin
15579  movss xmm7, [ecx+0]
15580  movhps xmm7, [ecx+4]
15581 
15582  mov ecx, lightVectors
15583  sub ecx, 3*4
15584 
15585  loopVert:
15586  inc eax
15587  jge done
15588 
15589  add edi, DRAWVERT_SIZE
15590  add ecx, 3*4
15591 
15592  cmp byte ptr [esi+eax], 0
15593  je loopVert
15594 
15595  movaps xmm0, xmm7
15596  movss xmm1, [edi+DRAWVERT_XYZ_OFFSET+0]
15597  movhps xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
15598  subps xmm0, xmm1
15599 
15600  // 0, X, 1, 2
15601  // 3, X, 4, 5
15602  // 6, X, 7, 8
15603 
15604  movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
15605  movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
15606  mulps xmm2, xmm0
15607 
15608  movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
15609  movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
15610  mulps xmm3, xmm0
15611 
15612  movaps xmm5, xmm2 // xmm5 = 0, X, 1, 2
15613  unpcklps xmm5, xmm3 // xmm5 = 0, 3, X, X
15614  unpckhps xmm2, xmm3 // xmm2 = 1, 4, 2, 5
15615 
15616  movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
15617  movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
15618  mulps xmm4, xmm0
15619 
15620  movlhps xmm5, xmm4 // xmm5 = 0, 3, 6, X
15621  movhlps xmm4, xmm2 // xmm4 = 2, 5, 7, 8
15622  shufps xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 ) // xmm2 = 2, 5, 8, 7
15623 
15624  addps xmm5, xmm4
15625  addps xmm5, xmm2
15626  movlps [ecx+0], xmm5
15627  shufps xmm5, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
15628  movss [ecx+8], xmm5
15629 
15630  jmp loopVert
15631 
15632  done:
15633  }
15634 
15635 #elif 1
15636 
15637  for ( int i = 0; i < numVerts; i++ ) {
15638  if ( !used[i] ) {
15639  continue;
15640  }
15641 
15642  const idDrawVert *v = &verts[i];
15643  idVec3 lightDir;
15644 
15645  lightDir[0] = lightOrigin[0] - v->xyz[0];
15646  lightDir[1] = lightOrigin[1] - v->xyz[1];
15647  lightDir[2] = lightOrigin[2] - v->xyz[2];
15648 
15649  lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
15650  lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
15651  lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
15652  }
15653 
15654 #elif 1
15655 
15656  ALIGN16( int usedVertNums[4] );
15657  ALIGN16( float lightDir0[4] );
15658  ALIGN16( float lightDir1[4] );
15659  ALIGN16( float lightDir2[4] );
15660  ALIGN16( float normal0[4] );
15661  ALIGN16( float normal1[4] );
15662  ALIGN16( float normal2[4] );
15663  ALIGN16( float tangent0[4] );
15664  ALIGN16( float tangent1[4] );
15665  ALIGN16( float tangent2[4] );
15666  ALIGN16( float tangent3[4] );
15667  ALIGN16( float tangent4[4] );
15668  ALIGN16( float tangent5[4] );
15669  idVec3 localLightOrigin = lightOrigin;
15670 
15671  __asm {
15672 
15673  xor ecx, ecx
15674  mov eax, numVerts
15675 
15676  mov esi, used
15677  add esi, eax
15678 
15679  mov edi, verts
15680  sub edi, DRAWVERT_SIZE
15681 
15682  neg eax
15683  dec eax
15684 
15685  loopVert4:
15686  inc eax
15687  jge done4
15688 
15689  add edi, DRAWVERT_SIZE
15690 
15691  cmp byte ptr [esi+eax], 0
15692  je loopVert4
15693 
15694  mov usedVertNums[ecx*4], eax
15695 
15696  inc ecx
15697  cmp ecx, 4
15698 
15699  movss xmm0, localLightOrigin[0]
15700  movss xmm1, localLightOrigin[4]
15701  movss xmm2, localLightOrigin[8]
15702 
15703  subss xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
15704  subss xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
15705  subss xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
15706 
15707  movss lightDir0[ecx*4-4], xmm0
15708  movss lightDir1[ecx*4-4], xmm1
15709  movss lightDir2[ecx*4-4], xmm2
15710 
15711  movss xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
15712  movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
15713  movss xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
15714 
15715  movss normal0[ecx*4-4], xmm3
15716  movss normal1[ecx*4-4], xmm4
15717  movss normal2[ecx*4-4], xmm5
15718 
15719  movss xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
15720  movss xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
15721  movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
15722 
15723  movss tangent0[ecx*4-4], xmm0
15724  movss tangent1[ecx*4-4], xmm1
15725  movss tangent2[ecx*4-4], xmm2
15726 
15727  movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
15728  movss xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
15729  movss xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
15730 
15731  movss tangent3[ecx*4-4], xmm3
15732  movss tangent4[ecx*4-4], xmm4
15733  movss tangent5[ecx*4-4], xmm5
15734 
15735  jl loopVert4
15736 
15737  movaps xmm0, lightDir0
15738  movaps xmm1, lightDir1
15739  movaps xmm2, lightDir2
15740 
15741  movaps xmm3, tangent0
15742  mulps xmm3, xmm0
15743  movaps xmm4, tangent1
15744  mulps xmm4, xmm1
15745  movaps xmm5, tangent2
15746  mulps xmm5, xmm2
15747 
15748  addps xmm3, xmm4
15749  addps xmm5, xmm3
15750 
15751  movaps xmm3, tangent3
15752  mulps xmm3, xmm0
15753  movaps xmm4, tangent4
15754  mulps xmm4, xmm1
15755  movaps xmm6, tangent5
15756  mulps xmm6, xmm2
15757 
15758  addps xmm3, xmm4
15759  addps xmm6, xmm3
15760 
15761  mulps xmm0, normal0
15762  mulps xmm1, normal1
15763  mulps xmm2, normal2
15764 
15765  addps xmm0, xmm1
15766  addps xmm0, xmm2
15767 
15768  mov ecx, numVerts
15769  imul ecx, 12
15770  mov edx, usedVertNums[0]
15771  add ecx, lightVectors
15772  imul edx, 12
15773 
15774  movss [ecx+edx+0], xmm5
15775  movss [ecx+edx+4], xmm6
15776  movss [ecx+edx+8], xmm0
15777 
15778  shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
15779  mov edx, usedVertNums[4]
15780  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
15781  imul edx, 12
15782  shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15783 
15784  movss [ecx+edx+0], xmm5
15785  movss [ecx+edx+4], xmm6
15786  movss [ecx+edx+8], xmm0
15787 
15788  shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
15789  mov edx, usedVertNums[8]
15790  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
15791  imul edx, 12
15792  shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15793 
15794  movss [ecx+edx+0], xmm5
15795  movss [ecx+edx+4], xmm6
15796  movss [ecx+edx+8], xmm0
15797 
15798  shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
15799  mov edx, usedVertNums[12]
15800  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
15801  imul edx, 12
15802  shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15803 
15804  movss [ecx+edx+0], xmm5
15805  movss [ecx+edx+4], xmm6
15806  movss [ecx+edx+8], xmm0
15807 
15808  xor ecx, ecx
15809  jmp loopVert4
15810 
15811  done4:
15812  test ecx, ecx
15813  jz done
15814  xor eax, eax
15815  mov edi, numVerts
15816  imul edi, 12
15817  add edi, lightVectors
15818 
15819  loopVert1:
15820  movss xmm0, lightDir0[eax*4]
15821  movss xmm1, lightDir1[eax*4]
15822  movss xmm2, lightDir2[eax*4]
15823 
15824  mov edx, usedVertNums[eax*4]
15825  imul edx, 12
15826 
15827  movss xmm3, tangent0[eax*4]
15828  mulss xmm3, xmm0
15829  movss xmm4, tangent1[eax*4]
15830  mulss xmm4, xmm1
15831  movss xmm5, tangent2[eax*4]
15832  mulss xmm5, xmm2
15833 
15834  addss xmm3, xmm4
15835  addss xmm5, xmm3
15836  movss [edi+edx+0], xmm5
15837 
15838  movss xmm3, tangent3[eax*4]
15839  mulss xmm3, xmm0
15840  movss xmm4, tangent4[eax*4]
15841  mulss xmm4, xmm1
15842  movss xmm6, tangent5[eax*4]
15843  mulss xmm6, xmm2
15844 
15845  addss xmm3, xmm4
15846  addss xmm6, xmm3
15847  movss [edi+edx+4], xmm6
15848 
15849  mulss xmm0, normal0[eax*4]
15850  mulss xmm1, normal1[eax*4]
15851  mulss xmm2, normal2[eax*4]
15852 
15853  addss xmm0, xmm1
15854  addss xmm0, xmm2
15855  movss [edi+edx+8], xmm0
15856 
15857  inc eax
15858  dec ecx
15859  jg loopVert1
15860 
15861  done:
15862  }
15863 
15864 #else
15865 
15866  ALIGN16( float lightVectors0[4] );
15867  ALIGN16( float lightVectors1[4] );
15868  ALIGN16( float lightVectors2[4] );
15869  int numUsedVerts = 0;
15870 
15871  for ( int i = 0; i < numVerts; i++ ) {
15872  if ( !used[i] ) {
15873  continue;
15874  }
15875 
15876  const idDrawVert *v = &verts[i];
15877 
15878  lightDir0[numUsedVerts] = lightOrigin[0] - v->xyz[0];
15879  lightDir1[numUsedVerts] = lightOrigin[1] - v->xyz[1];
15880  lightDir2[numUsedVerts] = lightOrigin[2] - v->xyz[2];
15881 
15882  normal0[numUsedVerts] = v->normal[0];
15883  normal1[numUsedVerts] = v->normal[1];
15884  normal2[numUsedVerts] = v->normal[2];
15885 
15886  tangent0[numUsedVerts] = v->tangents[0][0];
15887  tangent1[numUsedVerts] = v->tangents[0][1];
15888  tangent2[numUsedVerts] = v->tangents[0][2];
15889 
15890  tangent3[numUsedVerts] = v->tangents[1][0];
15891  tangent4[numUsedVerts] = v->tangents[1][1];
15892  tangent5[numUsedVerts] = v->tangents[1][2];
15893 
15894  usedVertNums[numUsedVerts++] = i;
15895  if ( numUsedVerts < 4 ) {
15896  continue;
15897  }
15898 
15899  lightVectors0[0] = lightDir0[0] * tangent0[0];
15900  lightVectors0[1] = lightDir0[1] * tangent0[1];
15901  lightVectors0[2] = lightDir0[2] * tangent0[2];
15902  lightVectors0[3] = lightDir0[3] * tangent0[3];
15903 
15904  lightVectors0[0] += lightDir1[0] * tangent1[0];
15905  lightVectors0[1] += lightDir1[1] * tangent1[1];
15906  lightVectors0[2] += lightDir1[2] * tangent1[2];
15907  lightVectors0[3] += lightDir1[3] * tangent1[3];
15908 
15909  lightVectors0[0] += lightDir2[0] * tangent2[0];
15910  lightVectors0[1] += lightDir2[1] * tangent2[1];
15911  lightVectors0[2] += lightDir2[2] * tangent2[2];
15912  lightVectors0[3] += lightDir2[3] * tangent2[3];
15913 
15914  lightVectors1[0] = lightDir0[0] * tangent3[0];
15915  lightVectors1[1] = lightDir0[1] * tangent3[1];
15916  lightVectors1[2] = lightDir0[2] * tangent3[2];
15917  lightVectors1[3] = lightDir0[3] * tangent3[3];
15918 
15919  lightVectors1[0] += lightDir1[0] * tangent4[0];
15920  lightVectors1[1] += lightDir1[1] * tangent4[1];
15921  lightVectors1[2] += lightDir1[2] * tangent4[2];
15922  lightVectors1[3] += lightDir1[3] * tangent4[3];
15923 
15924  lightVectors1[0] += lightDir2[0] * tangent5[0];
15925  lightVectors1[1] += lightDir2[1] * tangent5[1];
15926  lightVectors1[2] += lightDir2[2] * tangent5[2];
15927  lightVectors1[3] += lightDir2[3] * tangent5[3];
15928 
15929  lightVectors2[0] = lightDir0[0] * normal0[0];
15930  lightVectors2[1] = lightDir0[1] * normal0[1];
15931  lightVectors2[2] = lightDir0[2] * normal0[2];
15932  lightVectors2[3] = lightDir0[3] * normal0[3];
15933 
15934  lightVectors2[0] += lightDir1[0] * normal1[0];
15935  lightVectors2[1] += lightDir1[1] * normal1[1];
15936  lightVectors2[2] += lightDir1[2] * normal1[2];
15937  lightVectors2[3] += lightDir1[3] * normal1[3];
15938 
15939  lightVectors2[0] += lightDir2[0] * normal2[0];
15940  lightVectors2[1] += lightDir2[1] * normal2[1];
15941  lightVectors2[2] += lightDir2[2] * normal2[2];
15942  lightVectors2[3] += lightDir2[3] * normal2[3];
15943 
15944 
15945  for ( int j = 0; j < 4; j++ ) {
15946  int n = usedVertNums[j];
15947 
15948  lightVectors[n][0] = lightVectors0[j];
15949  lightVectors[n][1] = lightVectors1[j];
15950  lightVectors[n][2] = lightVectors2[j];
15951  }
15952 
15953  numUsedVerts = 0;
15954  }
15955 
15956  for ( int i = 0; i < numUsedVerts; i++ ) {
15957 
15958  lightVectors0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
15959  lightVectors1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
15960  lightVectors2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];
15961 
15962  int n = usedVertNums[i];
15963  lightVectors[n][0] = lightVectors0[i];
15964  lightVectors[n][1] = lightVectors1[i];
15965  lightVectors[n][2] = lightVectors2[i];
15966  }
15967 
15968 #endif
15969 }
15970 
15971 /*
15972 ============
15973 idSIMD_SSE::CreateSpecularTextureCoords
15974 ============
15975 */
15976 void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
15977 
15978  assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
15979  assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
15980  assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
15981  assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
15982  assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
15983 
15984  bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
15985  memset( used, 0, numVerts * sizeof( used[0] ) );
15986 
15987  for ( int i = numIndexes - 1; i >= 0; i-- ) {
15988  used[indexes[i]] = true;
15989  }
15990 
15991 #if 0
15992 
15993  __asm {
15994 
15995  mov eax, numVerts
15996 
15997  mov esi, used
15998  add esi, eax
15999 
16000  mov edi, verts
16001  sub edi, DRAWVERT_SIZE
16002 
16003  neg eax
16004  dec eax
16005 
16006  mov ecx, viewOrigin
16007  movss xmm6, [ecx+0]
16008  movhps xmm6, [ecx+4]
16009 
16010  mov ecx, lightOrigin
16011  movss xmm7, [ecx+0]
16012  movhps xmm7, [ecx+4]
16013 
16014  mov ecx, texCoords
16015  sub ecx, 4*4
16016 
16017  loopVert:
16018  inc eax
16019  jge done
16020 
16021  add edi, DRAWVERT_SIZE
16022  add ecx, 4*4
16023 
16024  cmp byte ptr [esi+eax], 0
16025  je loopVert
16026 
16027  movaps xmm0, xmm7
16028  movaps xmm1, xmm6
16029  movss xmm2, [edi+DRAWVERT_XYZ_OFFSET+0]
16030  movhps xmm2, [edi+DRAWVERT_XYZ_OFFSET+4]
16031  subps xmm0, xmm2
16032  subps xmm1, xmm2
16033 
16034  movaps xmm3, xmm0
16035  movaps xmm4, xmm1
16036  mulps xmm3, xmm3
16037  mulps xmm4, xmm4
16038 
16039  // 0, X, 1, 2
16040  // 3, X, 4, 5
16041 
16042  movaps xmm5, xmm3 // xmm5 = 0, X, 1, 2
16043  unpcklps xmm5, xmm4 // xmm5 = 0, 3, X, X
16044  unpckhps xmm3, xmm4 // xmm3 = 1, 4, 2, 5
16045  movhlps xmm4, xmm3 // xmm4 = 2, 5, 4, 5
16046 
16047  addps xmm5, xmm3
16048  addps xmm5, xmm4
16049  shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
16050  rsqrtps xmm5, xmm5
16051 
16052  movaps xmm4, xmm5
16053  shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
16054  shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 1, 1 )
16055 
16056  mulps xmm0, xmm4
16057  mulps xmm1, xmm5
16058  addps xmm0, xmm1
16059 
16060  movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
16061  movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
16062  mulps xmm2, xmm0
16063 
16064  movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
16065  movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
16066  mulps xmm3, xmm0
16067 
16068  movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
16069  movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
16070  mulps xmm4, xmm0
16071 
16072  movaps xmm5, xmm2 // xmm5 = 0, X, 1, 2
16073  unpcklps xmm5, xmm3 // xmm5 = 0, 3, X, X
16074  unpckhps xmm2, xmm3 // xmm2 = 1, 4, 2, 5
16075 
16076  movlhps xmm5, xmm4 // xmm5 = 0, 3, 6, X
16077  movhlps xmm4, xmm2 // xmm4 = 2, 5, 7, 8
16078  shufps xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 ) // xmm2 = 2, 5, 8, 7
16079 
16080  movaps xmm3, SIMD_SP_one
16081 
16082  addps xmm5, xmm4
16083  addps xmm5, xmm2
16084  movaps [ecx+0], xmm5
16085  movss [ecx+12], xmm3
16086 
16087  jmp loopVert
16088 
16089  done:
16090  }
16091 
16092 #elif 0
16093 
16094  for ( int i = 0; i < numVerts; i++ ) {
16095  if ( !used[i] ) {
16096  continue;
16097  }
16098 
16099  const idDrawVert *v = &verts[i];
16100 
16101  idVec3 lightDir = lightOrigin - v->xyz;
16102  idVec3 viewDir = viewOrigin - v->xyz;
16103 
16104  float ilength;
16105 
16106  ilength = idMath::RSqrt( lightDir[0] * lightDir[0] + lightDir[1] * lightDir[1] + lightDir[2] * lightDir[2] );
16107  lightDir[0] *= ilength;
16108  lightDir[1] *= ilength;
16109  lightDir[2] *= ilength;
16110 
16111  ilength = idMath::RSqrt( viewDir[0] * viewDir[0] + viewDir[1] * viewDir[1] + viewDir[2] * viewDir[2] );
16112  viewDir[0] *= ilength;
16113  viewDir[1] *= ilength;
16114  viewDir[2] *= ilength;
16115 
16116  lightDir += viewDir;
16117 
16118  texCoords[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
16119  texCoords[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
16120  texCoords[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
16121  texCoords[i][3] = 1.0f;
16122  }
16123 
16124 
16125 #elif 1
16126 
16127  ALIGN16( int usedVertNums[4] );
16128  ALIGN16( float lightDir0[4] );
16129  ALIGN16( float lightDir1[4] );
16130  ALIGN16( float lightDir2[4] );
16131  ALIGN16( float viewDir0[4] );
16132  ALIGN16( float viewDir1[4] );
16133  ALIGN16( float viewDir2[4] );
16134  ALIGN16( float normal0[4] );
16135  ALIGN16( float normal1[4] );
16136  ALIGN16( float normal2[4] );
16137  ALIGN16( float tangent0[4] );
16138  ALIGN16( float tangent1[4] );
16139  ALIGN16( float tangent2[4] );
16140  ALIGN16( float tangent3[4] );
16141  ALIGN16( float tangent4[4] );
16142  ALIGN16( float tangent5[4] );
16143  idVec3 localLightOrigin = lightOrigin;
16144  idVec3 localViewOrigin = viewOrigin;
16145 
16146  __asm {
16147 
16148  xor ecx, ecx
16149  mov eax, numVerts
16150 
16151  mov esi, used
16152  add esi, eax
16153 
16154  mov edi, verts
16155  sub edi, DRAWVERT_SIZE
16156 
16157  neg eax
16158  dec eax
16159 
16160  loopVert4:
16161  inc eax
16162  jge done4
16163 
16164  add edi, DRAWVERT_SIZE
16165 
16166  cmp byte ptr [esi+eax], 0
16167  je loopVert4
16168 
16169  mov usedVertNums[ecx*4], eax
16170 
16171  inc ecx
16172  cmp ecx, 4
16173 
16174  movss xmm3, localLightOrigin[0]
16175  movss xmm4, localLightOrigin[4]
16176  movss xmm5, localLightOrigin[8]
16177 
16178  subss xmm3, [edi+DRAWVERT_XYZ_OFFSET+0]
16179  subss xmm4, [edi+DRAWVERT_XYZ_OFFSET+4]
16180  subss xmm5, [edi+DRAWVERT_XYZ_OFFSET+8]
16181 
16182  movss lightDir0[ecx*4-4], xmm3
16183  movss lightDir1[ecx*4-4], xmm4
16184  movss lightDir2[ecx*4-4], xmm5
16185 
16186  movss xmm0, localViewOrigin[0]
16187  movss xmm1, localViewOrigin[4]
16188  movss xmm2, localViewOrigin[8]
16189 
16190  subss xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
16191  subss xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
16192  subss xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
16193 
16194  movss viewDir0[ecx*4-4], xmm0
16195  movss viewDir1[ecx*4-4], xmm1
16196  movss viewDir2[ecx*4-4], xmm2
16197 
16198  movss xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
16199  movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
16200  movss xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
16201 
16202  movss normal0[ecx*4-4], xmm3
16203  movss normal1[ecx*4-4], xmm4
16204  movss normal2[ecx*4-4], xmm5
16205 
16206  movss xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
16207  movss xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
16208  movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
16209 
16210  movss tangent0[ecx*4-4], xmm0
16211  movss tangent1[ecx*4-4], xmm1
16212  movss tangent2[ecx*4-4], xmm2
16213 
16214  movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
16215  movss xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
16216  movss xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
16217 
16218  movss tangent3[ecx*4-4], xmm3
16219  movss tangent4[ecx*4-4], xmm4
16220  movss tangent5[ecx*4-4], xmm5
16221 
16222  jl loopVert4
16223 
16224  movaps xmm6, lightDir0
16225  movaps xmm0, xmm6
16226  mulps xmm6, xmm6
16227  movaps xmm7, lightDir1
16228  movaps xmm1, xmm7
16229  mulps xmm7, xmm7
16230  addps xmm6, xmm7
16231  movaps xmm5, lightDir2
16232  movaps xmm2, xmm5
16233  mulps xmm5, xmm5
16234  addps xmm6, xmm5
16235  rsqrtps xmm6, xmm6
16236 
16237  mulps xmm0, xmm6
16238  mulps xmm1, xmm6
16239  mulps xmm2, xmm6
16240 
16241  movaps xmm3, viewDir0
16242  movaps xmm7, xmm3
16243  mulps xmm7, xmm7
16244  movaps xmm4, viewDir1
16245  movaps xmm6, xmm4
16246  mulps xmm6, xmm6
16247  addps xmm7, xmm6
16248  movaps xmm5, viewDir2
16249  movaps xmm6, xmm5
16250  mulps xmm6, xmm6
16251  addps xmm7, xmm6
16252  rsqrtps xmm7, xmm7
16253 
16254  mulps xmm3, xmm7
16255  addps xmm0, xmm3
16256  mulps xmm4, xmm7
16257  addps xmm1, xmm4
16258  mulps xmm5, xmm7
16259  addps xmm2, xmm5
16260 
16261  movaps xmm3, tangent0
16262  mulps xmm3, xmm0
16263  movaps xmm4, tangent1
16264  mulps xmm4, xmm1
16265  addps xmm3, xmm4
16266  movaps xmm5, tangent2
16267  mulps xmm5, xmm2
16268  addps xmm5, xmm3
16269 
16270  movaps xmm3, tangent3
16271  mulps xmm3, xmm0
16272  movaps xmm4, tangent4
16273  mulps xmm4, xmm1
16274  addps xmm3, xmm4
16275  movaps xmm6, tangent5
16276  mulps xmm6, xmm2
16277  addps xmm6, xmm3
16278 
16279  mulps xmm0, normal0
16280  mulps xmm1, normal1
16281  addps xmm0, xmm1
16282  mulps xmm2, normal2
16283  addps xmm0, xmm2
16284 
16285  mov ecx, numVerts
16286  shl ecx, 4
16287  mov edx, usedVertNums[0]
16288  add ecx, texCoords
16289  shl edx, 4
16290  movss xmm3, SIMD_SP_one
16291 
16292  movss [ecx+edx+0], xmm5
16293  movss [ecx+edx+4], xmm6
16294  movss [ecx+edx+8], xmm0
16295  movss [ecx+edx+12], xmm3
16296 
16297  shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
16298  mov edx, usedVertNums[4]
16299  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
16300  shl edx, 4
16301  shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
16302 
16303  movss [ecx+edx+0], xmm5
16304  movss [ecx+edx+4], xmm6
16305  movss [ecx+edx+8], xmm0
16306  movss [ecx+edx+12], xmm3
16307 
16308  shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
16309  mov edx, usedVertNums[8]
16310  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
16311  shl edx, 4
16312  shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
16313 
16314  movss [ecx+edx+0], xmm5
16315  movss [ecx+edx+4], xmm6
16316  movss [ecx+edx+8], xmm0
16317  movss [ecx+edx+12], xmm3
16318 
16319  shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
16320  mov edx, usedVertNums[12]
16321  shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
16322  shl edx, 4
16323  shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
16324 
16325  movss [ecx+edx+0], xmm5
16326  movss [ecx+edx+4], xmm6
16327  movss [ecx+edx+8], xmm0
16328  movss [ecx+edx+12], xmm3
16329 
16330  xor ecx, ecx
16331  jmp loopVert4
16332 
16333  done4:
16334  test ecx, ecx
16335  jz done
16336  xor eax, eax
16337  mov edi, numVerts
16338  shl edi, 4
16339  add edi, texCoords
16340 
16341  loopVert1:
16342  movss xmm6, lightDir0[eax*4]
16343  movss xmm0, xmm6
16344  mulss xmm6, xmm6
16345  movss xmm7, lightDir1[eax*4]
16346  movss xmm1, xmm7
16347  mulss xmm7, xmm7
16348  addss xmm6, xmm7
16349  movss xmm5, lightDir2[eax*4]
16350  movss xmm2, xmm5
16351  mulss xmm5, xmm5
16352  addss xmm6, xmm5
16353  rsqrtss xmm6, xmm6
16354 
16355  mulss xmm0, xmm6
16356  mulss xmm1, xmm6
16357  mulss xmm2, xmm6
16358 
16359  movss xmm3, viewDir0[eax*4]
16360  movss xmm7, xmm3
16361  mulss xmm7, xmm7
16362  movss xmm4, viewDir1[eax*4]
16363  movss xmm6, xmm4
16364  mulss xmm6, xmm6
16365  addss xmm7, xmm6
16366  movss xmm5, viewDir2[eax*4]
16367  movss xmm6, xmm5
16368  mulss xmm6, xmm6
16369  addss xmm7, xmm6
16370  rsqrtss xmm7, xmm7
16371 
16372  mulss xmm3, xmm7
16373  addss xmm0, xmm3
16374  mulss xmm4, xmm7
16375  addss xmm1, xmm4
16376  mulss xmm5, xmm7
16377  addss xmm2, xmm5
16378 
16379  mov edx, usedVertNums[eax*4]
16380  shl edx, 4
16381 
16382  movss xmm3, tangent0[eax*4]
16383  mulss xmm3, xmm0
16384  movss xmm4, tangent1[eax*4]
16385  mulss xmm4, xmm1
16386  addss xmm3, xmm4
16387  movss xmm5, tangent2[eax*4]
16388  mulss xmm5, xmm2
16389  addss xmm5, xmm3
16390  movss [edi+edx+0], xmm5
16391 
16392  movss xmm3, tangent3[eax*4]
16393  mulss xmm3, xmm0
16394  movss xmm4, tangent4[eax*4]
16395  mulss xmm4, xmm1
16396  addss xmm3, xmm4
16397  movss xmm6, tangent5[eax*4]
16398  mulss xmm6, xmm2
16399  addss xmm6, xmm3
16400  movss [edi+edx+4], xmm6
16401 
16402  mulss xmm0, normal0[eax*4]
16403  mulss xmm1, normal1[eax*4]
16404  addss xmm0, xmm1
16405  mulss xmm2, normal2[eax*4]
16406  addss xmm0, xmm2
16407  movss [edi+edx+8], xmm0
16408 
16409  movss xmm3, SIMD_SP_one
16410  movss [edi+edx+12], xmm3
16411 
16412  inc eax
16413  dec ecx
16414  jg loopVert1
16415 
16416  done:
16417  }
16418 
16419 #else
16420 
16421  ALIGN16( int usedVertNums[4] );
16422  ALIGN16( float lightDir0[4] );
16423  ALIGN16( float lightDir1[4] );
16424  ALIGN16( float lightDir2[4] );
16425  ALIGN16( float viewDir0[4] );
16426  ALIGN16( float viewDir1[4] );
16427  ALIGN16( float viewDir2[4] );
16428  ALIGN16( float normal0[4] );
16429  ALIGN16( float normal1[4] );
16430  ALIGN16( float normal2[4] );
16431  ALIGN16( float tangent0[4] );
16432  ALIGN16( float tangent1[4] );
16433  ALIGN16( float tangent2[4] );
16434  ALIGN16( float tangent3[4] );
16435  ALIGN16( float tangent4[4] );
16436  ALIGN16( float tangent5[4] );
16437  ALIGN16( float texCoords0[4] );
16438  ALIGN16( float texCoords1[4] );
16439  ALIGN16( float texCoords2[4] );
16440  idVec3 localLightOrigin = lightOrigin;
16441  idVec3 localViewOrigin = viewOrigin;
16442  int numUsedVerts = 0;
16443 
16444  for ( int i = 0; i < numVerts; i++ ) {
16445  if ( !used[i] ) {
16446  continue;
16447  }
16448 
16449  const idDrawVert *v = &verts[i];
16450 
16451  lightDir0[numUsedVerts] = localLightOrigin[0] - v->xyz[0];
16452  lightDir1[numUsedVerts] = localLightOrigin[1] - v->xyz[1];
16453  lightDir2[numUsedVerts] = localLightOrigin[2] - v->xyz[2];
16454 
16455  viewDir0[numUsedVerts] = localViewOrigin[0] - v->xyz[0];
16456  viewDir1[numUsedVerts] = localViewOrigin[1] - v->xyz[1];
16457  viewDir2[numUsedVerts] = localViewOrigin[2] - v->xyz[2];
16458 
16459  normal0[numUsedVerts] = v->normal[0];
16460  normal1[numUsedVerts] = v->normal[1];
16461  normal2[numUsedVerts] = v->normal[2];
16462 
16463  tangent0[numUsedVerts] = v->tangents[0][0];
16464  tangent1[numUsedVerts] = v->tangents[0][1];
16465  tangent2[numUsedVerts] = v->tangents[0][2];
16466 
16467  tangent3[numUsedVerts] = v->tangents[1][0];
16468  tangent4[numUsedVerts] = v->tangents[1][1];
16469  tangent5[numUsedVerts] = v->tangents[1][2];
16470 
16471  usedVertNums[numUsedVerts++] = i;
16472  if ( numUsedVerts < 4 ) {
16473  continue;
16474  }
16475 
16476  ALIGN16( float temp[4] );
16477 
16478  temp[0] = lightDir0[0] * lightDir0[0];
16479  temp[1] = lightDir0[1] * lightDir0[1];
16480  temp[2] = lightDir0[2] * lightDir0[2];
16481  temp[3] = lightDir0[3] * lightDir0[3];
16482 
16483  temp[0] += lightDir1[0] * lightDir1[0];
16484  temp[1] += lightDir1[1] * lightDir1[1];
16485  temp[2] += lightDir1[2] * lightDir1[2];
16486  temp[3] += lightDir1[3] * lightDir1[3];
16487 
16488  temp[0] += lightDir2[0] * lightDir2[0];
16489  temp[1] += lightDir2[1] * lightDir2[1];
16490  temp[2] += lightDir2[2] * lightDir2[2];
16491  temp[3] += lightDir2[3] * lightDir2[3];
16492 
16493  temp[0] = idMath::RSqrt( temp[0] );
16494  temp[1] = idMath::RSqrt( temp[1] );
16495  temp[2] = idMath::RSqrt( temp[2] );
16496  temp[3] = idMath::RSqrt( temp[3] );
16497 
16498  lightDir0[0] *= temp[0];
16499  lightDir0[1] *= temp[1];
16500  lightDir0[2] *= temp[2];
16501  lightDir0[3] *= temp[3];
16502 
16503  lightDir1[0] *= temp[0];
16504  lightDir1[1] *= temp[1];
16505  lightDir1[2] *= temp[2];
16506  lightDir1[3] *= temp[3];
16507 
16508  lightDir2[0] *= temp[0];
16509  lightDir2[1] *= temp[1];
16510  lightDir2[2] *= temp[2];
16511  lightDir2[3] *= temp[3];
16512 
16513  temp[0] = viewDir0[0] * viewDir0[0];
16514  temp[1] = viewDir0[1] * viewDir0[1];
16515  temp[2] = viewDir0[2] * viewDir0[2];
16516  temp[3] = viewDir0[3] * viewDir0[3];
16517 
16518  temp[0] += viewDir1[0] * viewDir1[0];
16519  temp[1] += viewDir1[1] * viewDir1[1];
16520  temp[2] += viewDir1[2] * viewDir1[2];
16521  temp[3] += viewDir1[3] * viewDir1[3];
16522 
16523  temp[0] += viewDir2[0] * viewDir2[0];
16524  temp[1] += viewDir2[1] * viewDir2[1];
16525  temp[2] += viewDir2[2] * viewDir2[2];
16526  temp[3] += viewDir2[3] * viewDir2[3];
16527 
16528  temp[0] = idMath::RSqrt( temp[0] );
16529  temp[1] = idMath::RSqrt( temp[1] );
16530  temp[2] = idMath::RSqrt( temp[2] );
16531  temp[3] = idMath::RSqrt( temp[3] );
16532 
16533  viewDir0[0] *= temp[0];
16534  viewDir0[1] *= temp[1];
16535  viewDir0[2] *= temp[2];
16536  viewDir0[3] *= temp[3];
16537 
16538  viewDir1[0] *= temp[0];
16539  viewDir1[1] *= temp[1];
16540  viewDir1[2] *= temp[2];
16541  viewDir1[3] *= temp[3];
16542 
16543  viewDir2[0] *= temp[0];
16544  viewDir2[1] *= temp[1];
16545  viewDir2[2] *= temp[2];
16546  viewDir2[3] *= temp[3];
16547 
16548  lightDir0[0] += viewDir0[0];
16549  lightDir0[1] += viewDir0[1];
16550  lightDir0[2] += viewDir0[2];
16551  lightDir0[3] += viewDir0[3];
16552 
16553  lightDir1[0] += viewDir1[0];
16554  lightDir1[1] += viewDir1[1];
16555  lightDir1[2] += viewDir1[2];
16556  lightDir1[3] += viewDir1[3];
16557 
16558  lightDir2[0] += viewDir2[0];
16559  lightDir2[1] += viewDir2[1];
16560  lightDir2[2] += viewDir2[2];
16561  lightDir2[3] += viewDir2[3];
16562 
16563  texCoords0[0] = lightDir0[0] * tangent0[0];
16564  texCoords0[1] = lightDir0[1] * tangent0[1];
16565  texCoords0[2] = lightDir0[2] * tangent0[2];
16566  texCoords0[3] = lightDir0[3] * tangent0[3];
16567 
16568  texCoords0[0] += lightDir1[0] * tangent1[0];
16569  texCoords0[1] += lightDir1[1] * tangent1[1];
16570  texCoords0[2] += lightDir1[2] * tangent1[2];
16571  texCoords0[3] += lightDir1[3] * tangent1[3];
16572 
16573  texCoords0[0] += lightDir2[0] * tangent2[0];
16574  texCoords0[1] += lightDir2[1] * tangent2[1];
16575  texCoords0[2] += lightDir2[2] * tangent2[2];
16576  texCoords0[3] += lightDir2[3] * tangent2[3];
16577 
16578  texCoords1[0] = lightDir0[0] * tangent3[0];
16579  texCoords1[1] = lightDir0[1] * tangent3[1];
16580  texCoords1[2] = lightDir0[2] * tangent3[2];
16581  texCoords1[3] = lightDir0[3] * tangent3[3];
16582 
16583  texCoords1[0] += lightDir1[0] * tangent4[0];
16584  texCoords1[1] += lightDir1[1] * tangent4[1];
16585  texCoords1[2] += lightDir1[2] * tangent4[2];
16586  texCoords1[3] += lightDir1[3] * tangent4[3];
16587 
16588  texCoords1[0] += lightDir2[0] * tangent5[0];
16589  texCoords1[1] += lightDir2[1] * tangent5[1];
16590  texCoords1[2] += lightDir2[2] * tangent5[2];
16591  texCoords1[3] += lightDir2[3] * tangent5[3];
16592 
16593  texCoords2[0] = lightDir0[0] * normal0[0];
16594  texCoords2[1] = lightDir0[1] * normal0[1];
16595  texCoords2[2] = lightDir0[2] * normal0[2];
16596  texCoords2[3] = lightDir0[3] * normal0[3];
16597 
16598  texCoords2[0] += lightDir1[0] * normal1[0];
16599  texCoords2[1] += lightDir1[1] * normal1[1];
16600  texCoords2[2] += lightDir1[2] * normal1[2];
16601  texCoords2[3] += lightDir1[3] * normal1[3];
16602 
16603  texCoords2[0] += lightDir2[0] * normal2[0];
16604  texCoords2[1] += lightDir2[1] * normal2[1];
16605  texCoords2[2] += lightDir2[2] * normal2[2];
16606  texCoords2[3] += lightDir2[3] * normal2[3];
16607 
16608  for ( int j = 0; j < 4; j++ ) {
16609  int n = usedVertNums[j];
16610 
16611  texCoords[n][0] = texCoords0[j];
16612  texCoords[n][1] = texCoords1[j];
16613  texCoords[n][2] = texCoords2[j];
16614  texCoords[n][3] = 1.0f;
16615  }
16616 
16617  numUsedVerts = 0;
16618  }
16619 
16620  for ( int i = 0; i < numUsedVerts; i++ ) {
16621  float temp;
16622 
16623  temp = lightDir0[i] * lightDir0[i] + lightDir1[i] * lightDir1[i] + lightDir2[i] * lightDir2[i];
16624  temp = idMath::RSqrt( temp );
16625 
16626  lightDir0[i] *= temp;
16627  lightDir1[i] *= temp;
16628  lightDir2[i] *= temp;
16629 
16630  temp = viewDir0[i] * viewDir0[i] + viewDir1[i] * viewDir1[i] + viewDir2[i] * viewDir2[i];
16631  temp = idMath::RSqrt( temp );
16632 
16633  viewDir0[i] *= temp;
16634  viewDir1[i] *= temp;
16635  viewDir2[i] *= temp;
16636 
16637  lightDir0[i] += viewDir0[i];
16638  lightDir1[i] += viewDir1[i];
16639  lightDir2[i] += viewDir2[i];
16640 
16641  texCoords0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
16642  texCoords1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
16643  texCoords2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];
16644 
16645  int n = usedVertNums[i];
16646  texCoords[n][0] = texCoords0;
16647  texCoords[n][1] = texCoords1;
16648  texCoords[n][2] = texCoords2;
16649  texCoords[n][3] = 1.0f;
16650  }
16651 
16652 #endif
16653 }
16654 
16655 /*
16656 ============
16657 idSIMD_SSE::CreateShadowCache
16658 ============
16659 */
int VPCALL idSIMD_SSE::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
#if 1
	// SSE path.  For every vertex whose vertRemap entry is still zero this emits
	// two cache entries: the near-cap vertex ( x, y, z, 1 ) and the far-projected
	// vertex ( x - lightOrigin, 0 ), records the cache index in vertRemap[i], and
	// returns the total number of idVec4s written (see the C fallback below for
	// the reference semantics).
	int outVerts;

	__asm {
		push		ebx

		// build xmm6/xmm7 = ( lightOrigin.x, lightOrigin.y, lightOrigin.z, 1.0f ):
		// subtracting this from a ( x, y, z, 1 ) vertex yields the w = 0 projected point
		mov			esi, lightOrigin
		movaps		xmm5, SIMD_SP_lastOne			// ( 0, 0, 0, 1.0f ) mask, forces w = 1 via orps
		movss		xmm6, [esi+0]
		movhps		xmm6, [esi+4]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 2, 3, 1 )
		orps		xmm6, SIMD_SP_lastOne
		movaps		xmm7, xmm6						// duplicate so alternating unroll steps have their own copy

		xor			ebx, ebx						// ebx = 0: "not yet remapped" marker
		xor			ecx, ecx						// ecx = outVerts, incremented by 2 per emitted vertex pair

		mov			edx, vertRemap
		mov			esi, verts
		mov			edi, vertexCache

		// main loop: 4 vertices per iteration; eax walks the remap array
		// from -byteCount up to 0
		mov			eax, numVerts
		and			eax, ~3
		jz			done4
		shl			eax, 2
		add			edx, eax
		neg			eax

	loop4:
		prefetchnta	[edx+128]
		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]

		cmp			dword ptr [edx+eax+0], ebx		// skip if vertRemap[i] != 0 (already cached)
		jne			skip1

		mov			dword ptr [edx+eax+0], ecx		// vertRemap[i] = outVerts
		// load xyz as ( z | -, x, y ) then shuffle to ( x, y, z, 0 ); movss zeroes the upper lanes
		movss		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		add			ecx, 2
		shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
		orps		xmm0, xmm5						// w = 1
		movaps		[edi+0*16], xmm0				// near cap vertex
		subps		xmm0, xmm6						// ( x - lx, y - ly, z - lz, 0 )
		movaps		[edi+1*16], xmm0				// projected vertex
		add			edi, 2*16

	skip1:
		cmp			dword ptr [edx+eax+4], ebx
		jne			skip2

		mov			dword ptr [edx+eax+4], ecx
		// alternate load order ( x | y, z ) with matching shuffle — same ( x, y, z, 0 ) result
		movss		xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps		xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		add			ecx, 2
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
		orps		xmm1, xmm5
		movaps		[edi+0*16], xmm1
		subps		xmm1, xmm7
		movaps		[edi+1*16], xmm1
		add			edi, 2*16

	skip2:
		cmp			dword ptr [edx+eax+8], ebx
		jne			skip3

		mov			dword ptr [edx+eax+8], ecx
		movss		xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		add			ecx, 2
		shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 );
		orps		xmm2, xmm5
		movaps		[edi+0*16], xmm2
		subps		xmm2, xmm6
		movaps		[edi+1*16], xmm2
		add			edi, 2*16

	skip3:
		cmp			dword ptr [edx+eax+12], ebx
		jne			skip4

		mov			dword ptr [edx+eax+12], ecx
		movss		xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps		xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		add			ecx, 2
		shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
		orps		xmm3, xmm5
		movaps		[edi+0*16], xmm3
		subps		xmm3, xmm7
		movaps		[edi+1*16], xmm3
		add			edi, 2*16

	skip4:
		add			esi, 4*DRAWVERT_SIZE
		add			eax, 4*4
		jl			loop4

	done4:
		// remainder loop: 0-3 leftover vertices, one at a time
		mov			eax, numVerts
		and			eax, 3
		jz			done1
		shl			eax, 2
		add			edx, eax
		neg			eax

	loop1:
		cmp			dword ptr [edx+eax+0], ebx
		jne			skip0

		mov			dword ptr [edx+eax+0], ecx
		movss		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		add			ecx, 2
		shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
		orps		xmm0, xmm5
		movaps		[edi+0*16], xmm0
		subps		xmm0, xmm6
		movaps		[edi+1*16], xmm0
		add			edi, 2*16

	skip0:

		add			esi, DRAWVERT_SIZE
		add			eax, 4
		jl			loop1

	done1:
		pop			ebx
		mov			outVerts, ecx
	}
	return outVerts;

#else

	int outVerts = 0;
	for ( int i = 0; i < numVerts; i++ ) {
		if ( vertRemap[i] ) {
			continue;
		}
		const float *v = verts[i].xyz.ToFloatPtr();
		vertexCache[outVerts+0][0] = v[0];
		vertexCache[outVerts+0][1] = v[1];
		vertexCache[outVerts+0][2] = v[2];
		vertexCache[outVerts+0][3] = 1.0f;

		// R_SetupProjection() builds the projection matrix with a slight crunch
		// for depth, which keeps this w=0 division from rasterizing right at the
		// wrap around point and causing depth fighting with the rear caps
		vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
		vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
		vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
		vertexCache[outVerts+1][3] = 0.0f;
		vertRemap[i] = outVerts;
		outVerts += 2;
	}
	return outVerts;

#endif
}
16818 
16819 /*
16820 ============
16821 idSIMD_SSE::CreateVertexProgramShadowCache
16822 ============
16823 */
int VPCALL idSIMD_SSE::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
#if 1
	// SSE path.  Writes two idVec4s per vertex — ( x, y, z, 1 ) and ( x, y, z, 0 ) —
	// for the vertex-program shadow path (see the C fallback below).  The w = 0
	// copy comes for free: movss zeroes the upper lanes on load, so the shuffled
	// register already has w = 0 before orps forces w = 1 for the first copy.
	__asm {
		movaps		xmm4, SIMD_SP_lastOne			// ( 0, 0, 0, 1.0f ) w-mask
		movaps		xmm5, xmm4						// copies so the unrolled steps have independent sources
		movaps		xmm6, xmm4
		movaps		xmm7, xmm4

		mov			esi, verts
		mov			edi, vertexCache

		// main loop: 4 vertices (8 output idVec4s = 128 bytes) per iteration;
		// eax walks the output from -byteCount up to 0
		mov			eax, numVerts
		and			eax, ~3
		jz			done4
		shl			eax, 5
		add			edi, eax
		neg			eax

	loop4:
		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]

		// load xyz as ( z | -, x, y ), shuffle to ( x, y, z, 0 )
		movss		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
		movaps		[edi+eax+1*16], xmm0			// second copy: w = 0
		orps		xmm0, xmm4
		movaps		[edi+eax+0*16], xmm0			// first copy: w = 1

		// alternate load order ( x | y, z ) with matching shuffle
		movss		xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps		xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
		movaps		[edi+eax+3*16], xmm1
		orps		xmm1, xmm5
		movaps		[edi+eax+2*16], xmm1

		movss		xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 );
		movaps		[edi+eax+5*16], xmm2
		orps		xmm2, xmm6
		movaps		[edi+eax+4*16], xmm2

		movss		xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps		xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
		movaps		[edi+eax+7*16], xmm3
		orps		xmm3, xmm7
		movaps		[edi+eax+6*16], xmm3

		add			esi, 4*DRAWVERT_SIZE
		add			eax, 4*8*4
		jl			loop4

	done4:
		// remainder loop: 0-3 leftover vertices
		mov			eax, numVerts
		and			eax, 3
		jz			done1
		shl			eax, 5
		add			edi, eax
		neg			eax

	loop1:
		movss		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
		movaps		[edi+eax+1*16], xmm0
		orps		xmm0, xmm4
		movaps		[edi+eax+0*16], xmm0

		add			esi, DRAWVERT_SIZE
		add			eax, 8*4
		jl			loop1

	done1:
	}
	return numVerts * 2;

#else

	for ( int i = 0; i < numVerts; i++ ) {
		const float *v = verts[i].xyz.ToFloatPtr();
		vertexCache[i*2+0][0] = v[0];
		vertexCache[i*2+0][1] = v[1];
		vertexCache[i*2+0][2] = v[2];
		vertexCache[i*2+0][3] = 1.0f;

		vertexCache[i*2+1][0] = v[0];
		vertexCache[i*2+1][1] = v[1];
		vertexCache[i*2+1][2] = v[2];
		vertexCache[i*2+1][3] = 0.0f;
	}
	return numVerts * 2;

#endif
}
16919 
16920 /*
16921 ============
16922 SSE_UpSample11kHzMonoPCMTo44kHz
16923 ============
16924 */
static void SSE_UpSample11kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	// Upsample 11.025kHz mono 16-bit PCM to 44.1kHz float by writing each source
	// sample 4 times.  Output floats keep the raw 16-bit sample range (no scaling).
	__asm {
		mov			esi, src
		mov			edi, dest

		// main loop: 2 source samples -> 8 output floats per iteration;
		// eax walks the source from -byteCount up to 0
		mov			eax, numSamples
		and			eax, ~1
		jz			done2
		shl			eax, 1
		add			esi, eax
		neg			eax

		align		16
	loop2:
		add			edi, 2*4*4

		movsx		ecx, word ptr [esi+eax+0]
		cvtsi2ss	xmm0, ecx						// short -> float
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )	// broadcast to all 4 lanes
		movlps		[edi-2*4*4+0], xmm0
		movhps		[edi-2*4*4+8], xmm0

		movsx		edx, word ptr [esi+eax+2]
		cvtsi2ss	xmm1, edx
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps		[edi-1*4*4+0], xmm1
		movhps		[edi-1*4*4+8], xmm1

		add			eax, 2*2
		jl			loop2

	done2:
		// odd trailing sample, if any: 4 more output floats
		mov			eax, numSamples
		and			eax, 1
		jz			done

		movsx		ecx, word ptr [esi]
		cvtsi2ss	xmm0, ecx
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps		[edi+0], xmm0
		movhps		[edi+8], xmm0

	done:
	}
}
16970 
16971 /*
16972 ============
16973 SSE_UpSample11kHzStereoPCMTo44kHz
16974 ============
16975 */
static void SSE_UpSample11kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	// Upsample 11.025kHz interleaved-stereo 16-bit PCM to 44.1kHz float by writing
	// each L/R frame 4 times.  numSamples counts individual samples across both
	// channels, so it is expected to be even — there is no odd-sample tail here.
	__asm {
		mov			esi, src
		mov			edi, dest

		mov			eax, numSamples
		test		eax, ~1							// flags only — bail if fewer than 2 samples (one frame)
		jz			done2
		shl			eax, 1
		add			esi, eax
		neg			eax								// walk source from -byteCount up to 0

		align		16
	loop2:
		add			edi, 8*4						// 8 output floats = 4 duplicated stereo frames

		movsx		ecx, word ptr [esi+eax+0]		// left sample
		cvtsi2ss	xmm0, ecx

		movsx		edx, word ptr [esi+eax+2]		// right sample
		cvtsi2ss	xmm1, edx

		unpcklps	xmm0, xmm1						// xmm0 = ( L, R, -, - )

		movlps		[edi-8*4+0], xmm0				// four copies of the ( L, R ) pair
		movlps		[edi-8*4+8], xmm0
		movlps		[edi-4*4+0], xmm0
		movlps		[edi-4*4+8], xmm0

		add			eax, 2*2
		jl			loop2

	done2:
	}
}
17011 
17012 /*
17013 ============
17014 SSE_UpSample22kHzMonoPCMTo44kHz
17015 ============
17016 */
static void SSE_UpSample22kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	// Upsample 22.05kHz mono 16-bit PCM to 44.1kHz float by writing each source
	// sample twice.  Output floats keep the raw 16-bit sample range.
	__asm {
		mov			esi, src
		mov			edi, dest

		// main loop: 2 source samples -> 4 output floats per iteration
		mov			eax, numSamples
		and			eax, ~1
		jz			done2
		shl			eax, 1
		add			esi, eax
		neg			eax

		align		16
	loop2:
		add			edi, 4*4

		movsx		ecx, word ptr [esi+eax+0]
		cvtsi2ss	xmm0, ecx

		movsx		edx, word ptr [esi+eax+2]
		cvtsi2ss	xmm1, edx

		shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm0 = ( s0, s0, s1, s1 )
		movlps		[edi-4*4+0], xmm0
		movhps		[edi-4*4+8], xmm0

		add			eax, 2*2
		jl			loop2

	done2:
		// odd trailing sample, if any: 2 more output floats
		mov			eax, numSamples
		and			eax, 1
		jz			done

		movsx		ecx, word ptr [esi]
		cvtsi2ss	xmm0, ecx
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps		[edi], xmm0

	done:
	}
}
17059 
17060 /*
17061 ============
17062 SSE_UpSample22kHzStereoPCMTo44kHz
17063 ============
17064 */
static void SSE_UpSample22kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	// Upsample 22.05kHz interleaved-stereo 16-bit PCM to 44.1kHz float by writing
	// each L/R frame twice.  numSamples counts samples across both channels and is
	// expected to be even — no odd-sample tail.
	__asm {
		mov			esi, src
		mov			edi, dest

		mov			eax, numSamples
		test		eax, ~1							// flags only — bail if fewer than one frame
		jz			done2
		shl			eax, 1
		add			esi, eax
		neg			eax

		align		16
	loop2:
		add			edi, 4*4						// 4 output floats = 2 duplicated stereo frames

		movsx		ecx, word ptr [esi+eax+0]		// left sample -> slots 0 and 2
		cvtsi2ss	xmm0, ecx
		movss		[edi-4*4], xmm0
		movss		[edi-2*4], xmm0

		movsx		edx, word ptr [esi+eax+2]		// right sample -> slots 1 and 3
		cvtsi2ss	xmm1, edx
		movss		[edi-3*4], xmm1
		movss		[edi-1*4], xmm1

		add			eax, 2*2
		jl			loop2

	done2:
	}
}
17097 
17098 /*
17099 ============
17100 SSE_UpSample44kHzMonoPCMTo44kHz
17101 ============
17102 */
static void SSE_UpSample44kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	// 44.1kHz input needs no duplication: straight 16-bit PCM -> float conversion,
	// one output float per input sample.
	__asm {
		mov			esi, src
		mov			edi, dest

		// main loop: 2 samples per iteration
		mov			eax, numSamples
		and			eax, ~1
		jz			done2
		shl			eax, 1
		add			esi, eax
		neg			eax

		align		16
	loop2:
		add			edi, 2*4

		movsx		ecx, word ptr [esi+eax+0]
		cvtsi2ss	xmm0, ecx
		movss		[edi-2*4], xmm0

		movsx		edx, word ptr [esi+eax+2]
		cvtsi2ss	xmm1, edx
		movss		[edi-1*4], xmm1

		add			eax, 2*2
		jl			loop2

	done2:
		// odd trailing sample, if any
		mov			eax, numSamples
		and			eax, 1
		jz			done

		movsx		ecx, word ptr [esi]
		cvtsi2ss	xmm0, ecx
		movss		[edi], xmm0

	done:
	}
}
17142 
17143 /*
17144 ============
17145 idSIMD_SSE::UpSamplePCMTo44kHz
17146 
17147  Duplicate samples for 44kHz output.
17148 ============
17149 */
17150 void idSIMD_SSE::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
17151  if ( kHz == 11025 ) {
17152  if ( numChannels == 1 ) {
17153  SSE_UpSample11kHzMonoPCMTo44kHz( dest, src, numSamples );
17154  } else {
17155  SSE_UpSample11kHzStereoPCMTo44kHz( dest, src, numSamples );
17156  }
17157  } else if ( kHz == 22050 ) {
17158  if ( numChannels == 1 ) {
17159  SSE_UpSample22kHzMonoPCMTo44kHz( dest, src, numSamples );
17160  } else {
17161  SSE_UpSample22kHzStereoPCMTo44kHz( dest, src, numSamples );
17162  }
17163  } else if ( kHz == 44100 ) {
17164  SSE_UpSample44kHzMonoPCMTo44kHz( dest, src, numSamples );
17165  } else {
17166  assert( 0 );
17167  }
17168 }
17169 
17170 /*
17171 ============
17172 SSE_UpSample11kHzMonoOGGTo44kHz
17173 ============
17174 */
static void SSE_UpSample11kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
	// Upsample 11.025kHz mono OGG floats to 44.1kHz: each source sample is scaled
	// by 32768 (decoded OGG values into the 16-bit PCM range) and written 4 times.
	float constant = 32768.0f;
	__asm {
		mov			esi, src
		mov			edi, dest
		movss		xmm7, constant
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	// broadcast scale factor

		// main loop: 2 source samples -> 8 output floats per iteration
		mov			eax, numSamples
		and			eax, ~1
		jz			done2
		shl			eax, 2
		add			esi, eax
		neg			eax

		align		16
	loop2:
		add			edi, 2*16

		movss		xmm0, [esi+eax+0]
		mulss		xmm0, xmm7
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )	// 4 copies of the scaled sample
		movlps		[edi-32], xmm0
		movlps		[edi-24], xmm0

		movss		xmm1, [esi+eax+4]
		mulss		xmm1, xmm7
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps		[edi-16], xmm1
		movlps		[edi- 8], xmm1

		add			eax, 2*4
		jl			loop2

	done2:
		// odd trailing sample, if any
		mov			eax, numSamples
		and			eax, 1
		jz			done

		movss		xmm0, [esi]
		mulss		xmm0, xmm7
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps		[edi+0], xmm0
		movlps		[edi+8], xmm0

	done:
	}
}
17223 
17224 /*
17225 ============
17226 SSE_UpSample11kHzStereoOGGTo44kHz
17227 ============
17228 */
static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
	// Upsample 11.025kHz stereo OGG (two separate channel arrays, src[0]/src[1])
	// to 44.1kHz interleaved float, scaling by 32768 and writing each L/R frame
	// 4 times.  numSamples counts samples across both channels, so each channel
	// array holds numSamples/2 floats — hence the shl 1 byte count below.
	float constant = 32768.0f;
	__asm {
		mov			esi, src
		mov			ecx, [esi+0]					// left channel array
		mov			edx, [esi+4]					// right channel array
		mov			edi, dest
		movss		xmm7, constant
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

		mov			eax, numSamples
		and			eax, ~1
		jz			done2
		shl			eax, 1							// (numSamples/2) * 4 bytes per channel
		add			ecx, eax
		add			edx, eax
		neg			eax

		align		16
	loop2:
		add			edi, 4*16						// 16 output floats = 8 stereo frames

		movlps		xmm0, [ecx+eax]					// 2 left samples
		movlps		xmm1, [edx+eax]					// 2 right samples
		unpcklps	xmm0, xmm1						// ( L0, R0, L1, R1 )
		mulps		xmm0, xmm7
		movlps		[edi-8*8], xmm0					// frame 0 repeated 4 times
		movlps		[edi-7*8], xmm0
		movlps		[edi-6*8], xmm0
		movlps		[edi-5*8], xmm0
		movhps		[edi-4*8], xmm0					// frame 1 repeated 4 times
		movhps		[edi-3*8], xmm0
		movhps		[edi-2*8], xmm0
		movhps		[edi-1*8], xmm0

		add			eax, 2*4
		jl			loop2

	done2:
		// trailing single per-channel sample, if any
		mov			eax, numSamples
		and			eax, 1
		jz			done

		movss		xmm0, [ecx]
		movss		xmm1, [edx]
		unpcklps	xmm0, xmm1
		mulps		xmm0, xmm7
		movlps		[edi+0*8], xmm0
		movlps		[edi+1*8], xmm0
		movlps		[edi+2*8], xmm0
		movlps		[edi+3*8], xmm0

	done:
	}
}
17284 
17285 /*
17286 ============
17287 SSE_UpSample22kHzMonoOGGTo44kHz
17288 ============
17289 */
static void SSE_UpSample22kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
	// Upsample 22.05kHz mono OGG floats to 44.1kHz: scale each sample by 32768
	// and write it twice.
	float constant = 32768.0f;
	__asm {
		mov			esi, src
		mov			edi, dest
		movss		xmm7, constant
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

		// main loop: 2 source samples -> 4 output floats per iteration
		mov			eax, numSamples
		and			eax, ~1
		jz			done2
		shl			eax, 2
		add			esi, eax
		neg			eax

		align		16
	loop2:
		add			edi, 2*8

		movss		xmm0, [esi+eax+0]
		movss		xmm1, [esi+eax+4]
		shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )	// ( s0, s0, s1, s1 )
		mulps		xmm0, xmm7
		movlps		[edi-16], xmm0
		movhps		[edi- 8], xmm0

		add			eax, 2*4
		jl			loop2

	done2:
		// odd trailing sample, if any
		mov			eax, numSamples
		and			eax, 1
		jz			done

		movss		xmm0, [esi]
		mulss		xmm0, xmm7
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps		[edi+0], xmm0

	done:
	}
}
17332 
17333 /*
17334 ============
17335 SSE_UpSample22kHzStereoOGGTo44kHz
17336 ============
17337 */
static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
	// Upsample 22.05kHz stereo OGG (separate channel arrays) to 44.1kHz
	// interleaved float, scaling by 32768 and writing each L/R frame twice.
	// numSamples counts samples across both channels (numSamples/2 per array).
	float constant = 32768.0f;
	__asm {
		mov			esi, src
		mov			ecx, [esi+0]					// left channel array
		mov			edx, [esi+4]					// right channel array
		mov			edi, dest
		movss		xmm7, constant
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

		mov			eax, numSamples
		and			eax, ~1
		jz			done2
		shl			eax, 1							// (numSamples/2) * 4 bytes per channel
		add			ecx, eax
		add			edx, eax
		neg			eax

		align		16
	loop2:
		add			edi, 2*16						// 8 output floats = 4 stereo frames

		movlps		xmm0, [ecx+eax]					// 2 left samples
		movlps		xmm1, [edx+eax]					// 2 right samples
		unpcklps	xmm0, xmm1						// ( L0, R0, L1, R1 )
		mulps		xmm0, xmm7
		movlps		[edi-4*8], xmm0					// frame 0 twice
		movlps		[edi-3*8], xmm0
		movhps		[edi-2*8], xmm0					// frame 1 twice
		movhps		[edi-1*8], xmm0

		add			eax, 2*4
		jl			loop2

	done2:
		// trailing single per-channel sample, if any
		mov			eax, numSamples
		and			eax, 1
		jz			done

		movss		xmm0, [ecx]
		movss		xmm1, [edx]
		unpcklps	xmm0, xmm1
		mulps		xmm0, xmm7
		movlps		[edi+0*8], xmm0
		movlps		[edi+1*8], xmm0

	done:
	}
}
17387 
17388 /*
17389 ============
17390 SSE_UpSample44kHzMonoOGGTo44kHz
17391 ============
17392 */
static void SSE_UpSample44kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
	// 44.1kHz input needs no duplication — only the 32768 scaling into 16-bit
	// PCM range.  KFLOAT_CA is an SSE helper macro defined earlier in this file;
	// with "mul" it presumably performs dest[i] = src[i] * constant for numSamples
	// elements (matches the scaling done by the other OGG upsamplers here).
	float constant = 32768.0f;
	KFLOAT_CA( mul, dest, src, constant, numSamples )
}
17397 
17398 /*
17399 ============
17400 SSE_UpSample44kHzStereoOGGTo44kHz
17401 ============
17402 */
static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
	// 44.1kHz stereo OGG: interleave the two channel arrays into L/R frames and
	// scale by 32768 — no duplication needed at the target rate.
	// numSamples counts samples across both channels (numSamples/2 per array).
	float constant = 32768.0f;
	__asm {
		mov			esi, src
		mov			ecx, [esi+0]					// left channel array
		mov			edx, [esi+4]					// right channel array
		mov			edi, dest
		movss		xmm7, constant
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

		mov			eax, numSamples
		and			eax, ~1
		jz			done2
		shl			eax, 1							// (numSamples/2) * 4 bytes per channel
		add			ecx, eax
		add			edx, eax
		neg			eax

		align		16
	loop2:
		add			edi, 16							// 4 output floats = 2 stereo frames

		movlps		xmm0, [ecx+eax]					// 2 left samples
		movlps		xmm1, [edx+eax]					// 2 right samples
		unpcklps	xmm0, xmm1						// ( L0, R0, L1, R1 )
		mulps		xmm0, xmm7
		movlps		[edi-2*8], xmm0
		movhps		[edi-1*8], xmm0

		add			eax, 2*4
		jl			loop2

	done2:
		// trailing single per-channel sample, if any
		mov			eax, numSamples
		and			eax, 1
		jz			done

		movss		xmm0, [ecx]
		movss		xmm1, [edx]
		unpcklps	xmm0, xmm1
		mulps		xmm0, xmm7
		movlps		[edi+0*8], xmm0

	done:
	}
}
17449 
17450 /*
17451 ============
17452 idSIMD_SSE::UpSampleOGGTo44kHz
17453 
17454  Duplicate samples for 44kHz output.
17455 ============
17456 */
17457 void idSIMD_SSE::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
17458  if ( kHz == 11025 ) {
17459  if ( numChannels == 1 ) {
17460  SSE_UpSample11kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
17461  } else {
17462  SSE_UpSample11kHzStereoOGGTo44kHz( dest, ogg, numSamples );
17463  }
17464  } else if ( kHz == 22050 ) {
17465  if ( numChannels == 1 ) {
17466  SSE_UpSample22kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
17467  } else {
17468  SSE_UpSample22kHzStereoOGGTo44kHz( dest, ogg, numSamples );
17469  }
17470  } else if ( kHz == 44100 ) {
17471  if ( numChannels == 1 ) {
17472  SSE_UpSample44kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
17473  } else {
17474  SSE_UpSample44kHzStereoOGGTo44kHz( dest, ogg, numSamples );
17475  }
17476  } else {
17477  assert( 0 );
17478  }
17479 }
17480 
17481 /*
17482 ============
17483 idSIMD_SSE::MixSoundTwoSpeakerMono
17484 ============
17485 */
void VPCALL idSIMD_SSE::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
#if 1

	// SSE path.  Mixes a mono sample stream into an interleaved stereo mix buffer
	// while linearly ramping the per-channel volumes from lastV to currentV over
	// the buffer (see the C fallback below for reference semantics).
	ALIGN16( float incs[2] );

	assert( numSamples == MIXBUFFER_SAMPLES );

	// per-sample volume increments for the linear ramp
	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov			eax, MIXBUFFER_SAMPLES
		mov			edi, mixBuffer
		mov			esi, samples
		shl			eax, 2
		add			esi, eax
		neg			eax								// walk source from -byteCount up to 0

		// xmm6 = current volumes for two consecutive frames ( L0, R0, L1, R1 );
		// xmm7 = per-two-frame increment ( 2*incL, 2*incR, 2*incL, 2*incR )
		mov			ecx, lastV
		movlps		xmm6, [ecx]
		xorps		xmm7, xmm7
		movhps		xmm7, incs
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
		addps		xmm6, xmm7						// second frame's volumes lead by one increment
		shufps		xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
		addps		xmm7, xmm7

		// 8 mono samples -> 16 stereo floats per iteration
	loop16:
		add			edi, 4*4*4

		movaps		xmm0, [esi+eax+0*4*4]
		movaps		xmm1, xmm0
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )	// ( s0, s0, s1, s1 ) for L/R of frames 0,1
		mulps		xmm0, xmm6
		addps		xmm0, [edi-4*4*4]				// accumulate into mix buffer
		addps		xmm6, xmm7						// advance volume ramp by two frames
		movaps		[edi-4*4*4], xmm0

		shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )	// frames 2,3
		mulps		xmm1, xmm6
		addps		xmm1, [edi-3*4*4]
		addps		xmm6, xmm7
		movaps		[edi-3*4*4], xmm1

		movaps		xmm2, [esi+eax+1*4*4]
		movaps		xmm3, xmm2
		shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )	// frames 4,5
		mulps		xmm2, xmm6
		addps		xmm2, [edi-2*4*4]
		addps		xmm6, xmm7
		movaps		[edi-2*4*4], xmm2

		shufps		xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 )	// frames 6,7
		mulps		xmm3, xmm6
		addps		xmm3, [edi-1*4*4]
		addps		xmm6, xmm7
		movaps		[edi-1*4*4], xmm3

		add			eax, 2*4*4

		jl			loop16
	}

#else

	int i;
	float incL;
	float incR;
	float sL0, sL1;
	float sR0, sR1;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sR0 = lastV[1];
	sL1 = lastV[0] + incL;
	sR1 = lastV[1] + incR;

	incL *= 2;
	incR *= 2;

	for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
		mixBuffer[i*2+0] += samples[i+0] * sL0;
		mixBuffer[i*2+1] += samples[i+0] * sR0;
		mixBuffer[i*2+2] += samples[i+1] * sL1;
		mixBuffer[i*2+3] += samples[i+1] * sR1;
		sL0 += incL;
		sR0 += incR;
		sL1 += incL;
		sR1 += incR;
	}

#endif
}
17583 
17584 /*
17585 ============
17586 idSIMD_SSE::MixSoundTwoSpeakerStereo
17587 ============
17588 */
void VPCALL idSIMD_SSE::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
#if 1

	// SSE path.  Mixes an interleaved-stereo sample stream into an interleaved
	// stereo mix buffer while linearly ramping the per-channel volumes from
	// lastV to currentV over the buffer (see the C fallback below).
	ALIGN16( float incs[2] );

	assert( numSamples == MIXBUFFER_SAMPLES );

	// per-sample volume increments for the linear ramp
	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov			eax, MIXBUFFER_SAMPLES
		mov			edi, mixBuffer
		mov			esi, samples
		shl			eax, 3							// 8 bytes per stereo frame
		add			esi, eax
		neg			eax

		// xmm6 = current volumes for two consecutive frames ( L0, R0, L1, R1 );
		// xmm7 = per-two-frame increment ( 2*incL, 2*incR, 2*incL, 2*incR )
		mov			ecx, lastV
		movlps		xmm6, [ecx]
		xorps		xmm7, xmm7
		movhps		xmm7, incs
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
		addps		xmm6, xmm7
		shufps		xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
		addps		xmm7, xmm7

		// 8 stereo frames (16 floats in, 16 floats out) per iteration;
		// no shuffles needed since input is already interleaved L/R
	loop16:
		add			edi, 4*4*4

		movaps		xmm0, [esi+eax+0*4*4]
		mulps		xmm0, xmm6
		addps		xmm0, [edi-4*4*4]				// accumulate into mix buffer
		addps		xmm6, xmm7						// advance volume ramp by two frames
		movaps		[edi-4*4*4], xmm0

		movaps		xmm2, [esi+eax+1*4*4]
		mulps		xmm2, xmm6
		addps		xmm2, [edi-3*4*4]
		addps		xmm6, xmm7
		movaps		[edi-3*4*4], xmm2

		movaps		xmm3, [esi+eax+2*4*4]
		mulps		xmm3, xmm6
		addps		xmm3, [edi-2*4*4]
		addps		xmm6, xmm7
		movaps		[edi-2*4*4], xmm3

		movaps		xmm4, [esi+eax+3*4*4]
		mulps		xmm4, xmm6
		addps		xmm4, [edi-1*4*4]
		addps		xmm6, xmm7
		movaps		[edi-1*4*4], xmm4

		add			eax, 4*4*4

		jl			loop16
	}

#else

	int i;
	float incL;
	float incR;
	float sL0, sL1;
	float sR0, sR1;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sR0 = lastV[1];
	sL1 = lastV[0] + incL;
	sR1 = lastV[1] + incR;

	incL *= 2;
	incR *= 2;

	for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
		mixBuffer[i*2+0] += samples[i*2+0] * sL0;
		mixBuffer[i*2+1] += samples[i*2+1] * sR0;
		mixBuffer[i*2+2] += samples[i*2+2] * sL1;
		mixBuffer[i*2+3] += samples[i*2+3] * sR1;
		sL0 += incL;
		sR0 += incR;
		sL1 += incL;
		sR1 += incR;
	}

#endif
}
17682 
17683 /*
17684 ============
17685 idSIMD_SSE::MixSoundSixSpeakerMono
17686 ============
17687 */
void VPCALL idSIMD_SSE::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
#if 1

	// SSE path.  Mixes a mono sample stream into a 6-channel (5.1) interleaved
	// mix buffer while linearly ramping the six speaker volumes from lastV to
	// currentV over the buffer (see the C fallback below for reference semantics).
	ALIGN16( float incs[6] );

	assert( numSamples == MIXBUFFER_SAMPLES );

	// per-sample volume increments for the linear ramp
	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov			eax, MIXBUFFER_SAMPLES
		mov			edi, mixBuffer
		mov			esi, samples
		shl			eax, 2
		add			esi, eax
		neg			eax								// walk source from -byteCount up to 0

		// spread the 6 speaker volumes of two consecutive frames (12 floats)
		// across xmm2..xmm4, matching the 3 output stores per 2 frames below
		mov			ecx, lastV
		movlps		xmm2, [ecx+ 0]
		movhps		xmm2, [ecx+ 8]
		movlps		xmm3, [ecx+16]
		movaps		xmm4, xmm2
		shufps		xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
		shufps		xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )

		// build the matching per-two-frame increment vectors in xmm5..xmm7
		xorps		xmm5, xmm5
		movhps		xmm5, incs
		movlps		xmm7, incs+8
		movhps		xmm7, incs+16
		addps		xmm3, xmm5
		addps		xmm4, xmm7
		shufps		xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
		movaps		xmm6, xmm7
		shufps		xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
		addps		xmm5, xmm5
		addps		xmm6, xmm6
		addps		xmm7, xmm7

		// 4 mono samples -> 24 output floats (4 six-speaker frames) per iteration
	loop24:
		add			edi, 6*16

		movaps		xmm0, [esi+eax]

		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )	// sample 0 -> speakers 0-3
		mulps		xmm1, xmm2
		addps		xmm1, [edi-6*16]
		addps		xmm2, xmm5						// advance ramp
		movaps		[edi-6*16], xmm1

		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )	// sample 0 -> speakers 4-5, sample 1 -> 0-1
		mulps		xmm1, xmm3
		addps		xmm1, [edi-5*16]
		addps		xmm3, xmm6
		movaps		[edi-5*16], xmm1

		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )	// sample 1 -> speakers 2-5
		mulps		xmm1, xmm4
		addps		xmm1, [edi-4*16]
		addps		xmm4, xmm7
		movaps		[edi-4*16], xmm1

		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 2, 2, 2 )	// sample 2 -> speakers 0-3
		mulps		xmm1, xmm2
		addps		xmm1, [edi-3*16]
		addps		xmm2, xmm5
		movaps		[edi-3*16], xmm1

		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )	// sample 2 -> 4-5, sample 3 -> 0-1
		mulps		xmm1, xmm3
		addps		xmm1, [edi-2*16]
		addps		xmm3, xmm6
		movaps		[edi-2*16], xmm1

		shufps		xmm0, xmm0, R_SHUFFLEPS( 3, 3, 3, 3 )	// sample 3 -> speakers 2-5
		mulps		xmm0, xmm4
		addps		xmm0, [edi-1*16]
		addps		xmm4, xmm7
		movaps		[edi-1*16], xmm0

		add			eax, 4*4

		jl			loop24
	}

#else

	int i;
	float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
	float incL0, incL1, incL2, incL3, incL4, incL5;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sL1 = lastV[1];
	sL2 = lastV[2];
	sL3 = lastV[3];
	sL4 = lastV[4];
	sL5 = lastV[5];

	sL6 = lastV[0] + incL0;
	sL7 = lastV[1] + incL1;
	sL8 = lastV[2] + incL2;
	sL9 = lastV[3] + incL3;
	sL10 = lastV[4] + incL4;
	sL11 = lastV[5] + incL5;

	incL0 *= 2;
	incL1 *= 2;
	incL2 *= 2;
	incL3 *= 2;
	incL4 *= 2;
	incL5 *= 2;

	for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
		mixBuffer[i*6+ 0] += samples[i+0] * sL0;
		mixBuffer[i*6+ 1] += samples[i+0] * sL1;
		mixBuffer[i*6+ 2] += samples[i+0] * sL2;
		mixBuffer[i*6+ 3] += samples[i+0] * sL3;

		mixBuffer[i*6+ 4] += samples[i+0] * sL4;
		mixBuffer[i*6+ 5] += samples[i+0] * sL5;
		mixBuffer[i*6+ 6] += samples[i+1] * sL6;
		mixBuffer[i*6+ 7] += samples[i+1] * sL7;

		mixBuffer[i*6+ 8] += samples[i+1] * sL8;
		mixBuffer[i*6+ 9] += samples[i+1] * sL9;
		mixBuffer[i*6+10] += samples[i+1] * sL10;
		mixBuffer[i*6+11] += samples[i+1] * sL11;

		sL0 += incL0;
		sL1 += incL1;
		sL2 += incL2;
		sL3 += incL3;

		sL4 += incL4;
		sL5 += incL5;
		sL6 += incL0;
		sL7 += incL1;

		sL8 += incL2;
		sL9 += incL3;
		sL10 += incL4;
		sL11 += incL5;
	}

#endif
}
17852 
17853 /*
17854 ============
17855 idSIMD_SSE::MixSoundSixSpeakerStereo
17856 ============
17857 */
void VPCALL idSIMD_SSE::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
#if 1

	// SSE path.  Mixes an interleaved-stereo sample stream into a 6-channel (5.1)
	// interleaved mix buffer with linearly ramped volumes.  The left sample feeds
	// speakers 0, 2, 3, 4 and the right sample speakers 1 and 5 — hence the
	// SPEAKER_RIGHT/SPEAKER_BACKRIGHT layout asserts (see the C fallback below).
	ALIGN16( float incs[6] );

	assert( numSamples == MIXBUFFER_SAMPLES );
	assert( SPEAKER_RIGHT == 1 );
	assert( SPEAKER_BACKRIGHT == 5 );

	// per-sample volume increments for the linear ramp
	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov			eax, MIXBUFFER_SAMPLES
		mov			edi, mixBuffer
		mov			esi, samples
		shl			eax, 3							// 8 bytes per stereo input frame
		add			esi, eax
		neg			eax

		// spread the 6 speaker volumes of two consecutive frames (12 floats)
		// across xmm2..xmm4, matching the 3 output stores per 2 frames below
		mov			ecx, lastV
		movlps		xmm2, [ecx+ 0]
		movhps		xmm2, [ecx+ 8]
		movlps		xmm3, [ecx+16]
		movaps		xmm4, xmm2
		shufps		xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
		shufps		xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )

		// build the matching per-two-frame increment vectors in xmm5..xmm7
		xorps		xmm5, xmm5
		movhps		xmm5, incs
		movlps		xmm7, incs+ 8
		movhps		xmm7, incs+16
		addps		xmm3, xmm5
		addps		xmm4, xmm7
		shufps		xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
		movaps		xmm6, xmm7
		shufps		xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
		addps		xmm5, xmm5
		addps		xmm6, xmm6
		addps		xmm7, xmm7

		// 2 stereo frames (4 input floats) -> 12 output floats per iteration
	loop12:
		add			edi, 3*16

		movaps		xmm0, [esi+eax+0]				// ( L0, R0, L1, R1 )

		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 0 )	// ( L0, R0, L0, L0 ) -> frame0 speakers 0-3
		mulps		xmm1, xmm2
		addps		xmm1, [edi-3*16]
		addps		xmm2, xmm5						// advance ramp
		movaps		[edi-3*16], xmm1

		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 1, 2, 3 )	// frame0 speakers 4-5, frame1 speakers 0-1
		mulps		xmm1, xmm3
		addps		xmm1, [edi-2*16]
		addps		xmm3, xmm6
		movaps		[edi-2*16], xmm1

		add			eax, 4*4

		shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 2, 2, 3 )	// ( L1, L1, L1, R1 ) -> frame1 speakers 2-5
		mulps		xmm0, xmm4
		addps		xmm0, [edi-1*16]
		addps		xmm4, xmm7
		movaps		[edi-1*16], xmm0

		jl			loop12

		emms									// stray: no MMX registers are used here; harmless
	}

#else

	int i;
	float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
	float incL0, incL1, incL2, incL3, incL4, incL5;

	assert( numSamples == MIXBUFFER_SAMPLES );
	assert( SPEAKER_RIGHT == 1 );
	assert( SPEAKER_BACKRIGHT == 5 );

	incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sL1 = lastV[1];
	sL2 = lastV[2];
	sL3 = lastV[3];
	sL4 = lastV[4];
	sL5 = lastV[5];

	sL6 = lastV[0] + incL0;
	sL7 = lastV[1] + incL1;
	sL8 = lastV[2] + incL2;
	sL9 = lastV[3] + incL3;
	sL10 = lastV[4] + incL4;
	sL11 = lastV[5] + incL5;

	incL0 *= 2;
	incL1 *= 2;
	incL2 *= 2;
	incL3 *= 2;
	incL4 *= 2;
	incL5 *= 2;

	for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
		mixBuffer[i*6+ 0] += samples[i*2+0+0] * sL0;
		mixBuffer[i*6+ 1] += samples[i*2+0+1] * sL1;
		mixBuffer[i*6+ 2] += samples[i*2+0+0] * sL2;
		mixBuffer[i*6+ 3] += samples[i*2+0+0] * sL3;

		mixBuffer[i*6+ 4] += samples[i*2+0+0] * sL4;
		mixBuffer[i*6+ 5] += samples[i*2+0+1] * sL5;
		mixBuffer[i*6+ 6] += samples[i*2+2+0] * sL6;
		mixBuffer[i*6+ 7] += samples[i*2+2+1] * sL7;

		mixBuffer[i*6+ 8] += samples[i*2+2+0] * sL8;
		mixBuffer[i*6+ 9] += samples[i*2+2+0] * sL9;
		mixBuffer[i*6+10] += samples[i*2+2+0] * sL10;
		mixBuffer[i*6+11] += samples[i*2+2+1] * sL11;

		sL0 += incL0;
		sL1 += incL1;
		sL2 += incL2;
		sL3 += incL3;

		sL4 += incL4;
		sL5 += incL5;
		sL6 += incL0;
		sL7 += incL1;

		sL8 += incL2;
		sL9 += incL3;
		sL10 += incL4;
		sL11 += incL5;
	}

#endif
}
18007 
18008 /*
18009 ============
18010 idSIMD_SSE::MixedSoundToSamples
18011 ============
18012 */
void VPCALL idSIMD_SSE::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
#if 1

	// SSE/MMX path.  Converts the float mix buffer to 16-bit samples with
	// saturation: cvtps2pi converts float -> 32-bit int pairs in MMX registers
	// and packssdw packs them to shorts with signed saturation, which implements
	// the [-32768, 32767] clamping of the C fallback below.
	assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );

	__asm {

		mov			eax, numSamples
		mov			edi, mixBuffer
		mov			esi, samples
		shl			eax, 2
		add			edi, eax
		neg			eax								// walk the mix buffer from -byteCount up to 0

		// 16 floats -> 16 shorts per iteration
	loop16:

		movaps		xmm0, [edi+eax+0*16]
		movaps		xmm2, [edi+eax+1*16]
		movaps		xmm4, [edi+eax+2*16]
		movaps		xmm6, [edi+eax+3*16]

		add			esi, 4*4*2

		movhlps		xmm1, xmm0						// split the upper float pairs off for cvtps2pi
		movhlps		xmm3, xmm2
		movhlps		xmm5, xmm4
		movhlps		xmm7, xmm6

		prefetchnta	[edi+eax+64]

		cvtps2pi	mm0, xmm0						// low 2 floats -> 2 ints (MMX)
		cvtps2pi	mm2, xmm2
		cvtps2pi	mm4, xmm4
		cvtps2pi	mm6, xmm6

		prefetchnta	[edi+eax+128]

		cvtps2pi	mm1, xmm1						// high 2 floats -> 2 ints
		cvtps2pi	mm3, xmm3
		cvtps2pi	mm5, xmm5
		cvtps2pi	mm7, xmm7

		add			eax, 4*16

		packssdw	mm0, mm1						// 4 ints -> 4 shorts, signed saturation
		packssdw	mm2, mm3
		packssdw	mm4, mm5
		packssdw	mm6, mm7

		movq		[esi-4*4*2], mm0
		movq		[esi-3*4*2], mm2
		movq		[esi-2*4*2], mm4
		movq		[esi-1*4*2], mm6

		jl			loop16

		emms										// required after MMX register use
	}

#else

	for ( int i = 0; i < numSamples; i++ ) {
		if ( mixBuffer[i] <= -32768.0f ) {
			samples[i] = -32768;
		} else if ( mixBuffer[i] >= 32767.0f ) {
			samples[i] = 32767;
		} else {
			samples[i] = (short) mixBuffer[i];
		}
	}

#endif
}
18086 
18087 #endif /* _WIN32 */
virtual void VPCALL MatX_LowerTriangularSolve(const idMatX &L, float *x, const float *b, const int n, int skip=0)
virtual void VPCALL CreateSpecularTextureCoords(idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes)
GLdouble GLdouble GLdouble GLdouble q
Definition: glext.h:2959
virtual void VPCALL TransformVerts(idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights)
static float ATan16(float a)
Definition: Math.h:583
unsigned int dword
Definition: Lib.h:77
static const float INFINITY
Definition: Math.h:218
#define min(a, b)
virtual void VPCALL AddAssign16(float *dst, const float *src, const int count)
assert(prefInfo.fullscreenBtn)
const GLbyte * weights
Definition: glext.h:3273
const idVec3 & Normal(void) const
Definition: Plane.h:239
const float * ToFloatPtr(void) const
Definition: Quat.h:289
virtual void VPCALL Dot(float *dst, const idVec3 &constant, const idVec3 *src, const int count)
int GetSize(void) const
Definition: Vector.h:1467
virtual void VPCALL BlendJoints(idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints)
const GLdouble * v
Definition: glext.h:2936
GLdouble GLdouble x2
Definition: qgl.h:415
const int MIXBUFFER_SAMPLES
Definition: Simd.h:84
#define const
Definition: getdate.c:251
const float * ToFloatPtr(void) const
Definition: Vector.h:719
float w
Definition: Quat.h:53
idVec3 xyz
Definition: DrawVert.h:42
static const float PI
Definition: Math.h:205
GLenum GLint GLint y
Definition: glext.h:2849
virtual void VPCALL MixedSoundToSamples(short *samples, const float *mixBuffer, const int numSamples)
GLenum GLsizei n
Definition: glext.h:3705
virtual void VPCALL ClampMax(float *dst, const float *src, const float max, const int count)
idVec3 tangents[2]
Definition: DrawVert.h:45
virtual void VPCALL MatX_TransposeMultiplyAddVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
Definition: Vector.h:316
case const float
Definition: Callbacks.cpp:62
virtual void VPCALL TracePointCull(byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts)
virtual void VPCALL Add(float *dst, const float constant, const float *src, const int count)
virtual void VPCALL CmpGE(byte *dst, const float *src0, const float constant, const int count)
virtual void VPCALL MixSoundTwoSpeakerStereo(float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2])
GLdouble s
Definition: glext.h:2935
GLuint src
Definition: glext.h:5390
glIndex_t v3
Definition: Model.h:70
static const float HALF_PI
Definition: Math.h:207
GLfloat v0
Definition: glext.h:3606
virtual void VPCALL MatX_LowerTriangularSolveTranspose(const idMatX &L, float *x, const float *b, const int n)
GLenum GLint x
Definition: glext.h:2849
int i
Definition: process.py:33
int test(char *url)
Definition: lib500.c:3
virtual void VPCALL Sub16(float *dst, const float *src1, const float *src2, const int count)
virtual void VPCALL ClampMin(float *dst, const float *src, const float min, const int count)
float x
Definition: Quat.h:50
list l
Definition: prepare.py:17
static float Sin16(float a)
Definition: Math.h:314
float y
Definition: Quat.h:51
virtual void VPCALL DeriveUnsmoothedTangents(idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts)
virtual void VPCALL MatX_MultiplyVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
virtual void VPCALL MulAssign16(float *dst, const float constant, const int count)
virtual void VPCALL MixSoundTwoSpeakerMono(float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2])
GLfloat GLfloat GLfloat v2
Definition: glext.h:3608
idVec2 st
Definition: DrawVert.h:43
GLuint dst
Definition: glext.h:5285
float normalizationScale[3]
Definition: Model.h:71
GLuint GLuint GLsizei count
Definition: glext.h:2845
int GetNumColumns(void) const
Definition: Matrix.h:1822
Definition: Vector.h:52
#define FLOATSIGNBITSET(f)
Definition: Math.h:68
virtual void VPCALL OverlayPointCull(byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts)
virtual void VPCALL Negate16(float *dst, const int count)
GLuint index
Definition: glext.h:3476
const GLubyte * c
Definition: glext.h:4677
Definition: Vector.h:808
virtual void VPCALL DecalPointCull(byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts)
virtual void VPCALL SubAssign16(float *dst, const float *src, const int count)
#define NSKIP(n, s)
virtual void VPCALL MatX_MultiplyAddVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
virtual void VPCALL MatX_MultiplyMatX(idMatX &dst, const idMatX &m1, const idMatX &m2)
virtual void VPCALL CmpLE(byte *dst, const float *src0, const float constant, const int count)
virtual void VPCALL MatX_TransposeMultiplyMatX(idMatX &dst, const idMatX &m1, const idMatX &m2)
virtual void VPCALL Div(float *dst, const float constant, const float *src, const int count)
#define NULL
Definition: Lib.h:88
const float * ToFloatPtr(void) const
Definition: Vector.h:1910
virtual void VPCALL ConvertJointQuatsToJointMats(idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints)
Definition: Plane.h:71
Definition: eax4.h:1413
int GetNumRows(void) const
Definition: Matrix.h:1821
const float * ToFloatPtr(void) const
virtual void VPCALL DeriveTriPlanes(idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes)
virtual bool VPCALL MatX_LDLTFactor(idMatX &mat, idVecX &invDiag, const int n)
idVec3 normal
Definition: DrawVert.h:44
virtual void VPCALL MulSub(float *dst, const float constant, const float *src, const int count)
static float InvSqrt(float x)
Definition: Math.h:268
GLubyte GLubyte GLubyte a
Definition: glext.h:4662
GLdouble GLdouble GLdouble y2
Definition: qgl.h:415
virtual void VPCALL MatX_TransposeMultiplyVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
virtual void VPCALL MixSoundSixSpeakerStereo(float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6])
GLfloat GLfloat v1
Definition: glext.h:3607
GLubyte GLubyte b
Definition: glext.h:4662
Definition: Quat.h:48
static const float TWO_PI
Definition: Math.h:206
virtual void VPCALL MixSoundSixSpeakerMono(float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6])
float z
Definition: Quat.h:52
virtual void VPCALL CreateTextureSpaceLightVectors(idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes)
#define bits
Definition: Unzip.cpp:3797
glIndex_t v2
Definition: Model.h:70
GLenum GLenum GLvoid * row
Definition: glext.h:2866
virtual void VPCALL UpSamplePCMTo44kHz(float *dest, const short *pcm, const int numSamples, const int kHz, const int numChannels)
tuple f
Definition: idal.py:89
virtual void VPCALL DeriveTangents(idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes)
virtual void VPCALL Mul16(float *dst, const float *src1, const float constant, const int count)
unsigned short word
Definition: Lib.h:76
unsigned char byte
Definition: Lib.h:75
virtual int VPCALL CreateShadowCache(idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts)
GLfloat * st
Definition: qgl.h:89
virtual void VPCALL Zero16(float *dst, const int count)
virtual void VPCALL Mul(float *dst, const float constant, const float *src, const int count)
virtual void VPCALL NormalizeTangents(idDrawVert *verts, const int numVerts)
idVertexCache vertexCache
Definition: VertexCache.cpp:41
virtual void VPCALL Clamp(float *dst, const float *src, const float min, const float max, const int count)
#define VPCALL
Definition: Simd.h:63
virtual void VPCALL UntransformJoints(idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint)
GLint j
Definition: qgl.h:264
float dot(float a[], float b[])
Definition: Model_lwo.cpp:3883
virtual int VPCALL CreateVertexProgramShadowCache(idVec4 *vertexCache, const idDrawVert *verts, const int numVerts)
virtual void VPCALL ConvertJointMatsToJointQuats(idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints)
virtual void VPCALL MulAdd(float *dst, const float constant, const float *src, const int count)
virtual void VPCALL MatX_MultiplySubVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
virtual void VPCALL MatX_TransposeMultiplySubVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
#define max(x, y)
Definition: os.h:70
virtual void VPCALL MinMax(float &min, float &max, const float *src, const int count)
virtual void VPCALL Copy16(float *dst, const float *src, const int count)
virtual void VPCALL Sub(float *dst, const float constant, const float *src, const int count)
const float * ToFloatPtr(void) const
Definition: Matrix.h:2935
break
Definition: Callbacks.cpp:38
virtual void VPCALL TransformJoints(idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint)
virtual void VPCALL CmpLT(byte *dst, const float *src0, const float constant, const int count)
static float RSqrt(float x)
Definition: Math.h:241
virtual void VPCALL Add16(float *dst, const float *src1, const float *src2, const int count)
virtual void VPCALL UpSampleOGGTo44kHz(float *dest, const float *const *ogg, const int numSamples, const int kHz, const int numChannels)
virtual void VPCALL CmpGT(byte *dst, const float *src0, const float constant, const int count)
virtual const char *VPCALL GetName(void) const
void FitThroughPoint(const idVec3 &p)
Definition: Plane.h:297
GLdouble GLdouble t
Definition: glext.h:2943