doom3-gpl
Doom 3 GPL source release
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Simd_Generic.cpp
Go to the documentation of this file.
1 /*
2 ===========================================================================
3 
4 Doom 3 GPL Source Code
5 Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
6 
7 This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
8 
9 Doom 3 Source Code is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or
12 (at your option) any later version.
13 
14 Doom 3 Source Code is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18 
19 You should have received a copy of the GNU General Public License
20 along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
21 
22 In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
23 
24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
25 
26 ===========================================================================
27 */
28 
29 #include "../precompiled.h"
30 #pragma hdrstop
31 
32 #include "Simd_Generic.h"
33 
34 
35 //===============================================================
36 //
37 // Generic implementation of idSIMDProcessor
38 //
39 //===============================================================
40 
// Loop helpers: each expands the one-argument macro (or callable) Y once for every
// index in [0, count), in ascending order, manually unrolled by the factor in the
// macro name. They read an integer named 'count' from the enclosing scope.
// The loop locals deliberately avoid the underscore + capital letter spelling,
// which is reserved for the implementation.
#define UNROLL1(Y) { int unrollIdx; for (unrollIdx=0;unrollIdx<count;unrollIdx++) {Y(unrollIdx);} }
#define UNROLL2(Y) { int unrollIdx, unrollEnd = count&~1; for (unrollIdx=0;unrollIdx<unrollEnd;unrollIdx+=2){Y(unrollIdx+0);Y(unrollIdx+1);} if (unrollIdx < count) {Y(unrollIdx);}}
#define UNROLL4(Y) { int unrollIdx, unrollEnd = count&~3; for (unrollIdx=0;unrollIdx<unrollEnd;unrollIdx+=4){Y(unrollIdx+0);Y(unrollIdx+1);Y(unrollIdx+2);Y(unrollIdx+3);}for(;unrollIdx<count;unrollIdx++){Y(unrollIdx);}}
#define UNROLL8(Y) { int unrollIdx, unrollEnd = count&~7; for (unrollIdx=0;unrollIdx<unrollEnd;unrollIdx+=8){Y(unrollIdx+0);Y(unrollIdx+1);Y(unrollIdx+2);Y(unrollIdx+3);Y(unrollIdx+4);Y(unrollIdx+5);Y(unrollIdx+6);Y(unrollIdx+7);} unrollEnd = count&~1; for(;unrollIdx<unrollEnd;unrollIdx+=2){Y(unrollIdx); Y(unrollIdx+1);} if (unrollIdx < count) {Y(unrollIdx);} }
45 
// NODEFAULT supplies the default label of a switch whose case labels are meant to
// be exhaustive: debug builds assert when it is reached, release builds with the
// Microsoft compiler turn it into an optimizer hint, and every other build emits
// no default label at all.
#if defined( _DEBUG )
#define NODEFAULT	default: assert( 0 )
#elif defined( _MSC_VER )
// __assume is a Microsoft compiler intrinsic, so key on the compiler, not the target OS
#define NODEFAULT	default: __assume( 0 )
#else
#define NODEFAULT
#endif
53 
54 
55 /*
56 ============
57 idSIMD_Generic::GetName
58 ============
59 */
60 const char * idSIMD_Generic::GetName( void ) const {
61  return "generic code";
62 }
63 
64 /*
65 ============
66 idSIMD_Generic::Add
67 
68  dst[i] = constant + src[i];
69 ============
70 */
71 void VPCALL idSIMD_Generic::Add( float *dst, const float constant, const float *src, const int count ) {
72 #define OPER(X) dst[(X)] = src[(X)] + constant;
73  UNROLL4(OPER)
74 #undef OPER
75 }
76 
77 /*
78 ============
79 idSIMD_Generic::Add
80 
81  dst[i] = src0[i] + src1[i];
82 ============
83 */
84 void VPCALL idSIMD_Generic::Add( float *dst, const float *src0, const float *src1, const int count ) {
85 #define OPER(X) dst[(X)] = src0[(X)] + src1[(X)];
86  UNROLL4(OPER)
87 #undef OPER
88 }
89 
90 /*
91 ============
92 idSIMD_Generic::Sub
93 
94  dst[i] = constant - src[i];
95 ============
96 */
97 void VPCALL idSIMD_Generic::Sub( float *dst, const float constant, const float *src, const int count ) {
98  double c = constant;
99 #define OPER(X) dst[(X)] = c - src[(X)];
100  UNROLL4(OPER)
101 #undef OPER
102 }
103 
104 /*
105 ============
106 idSIMD_Generic::Sub
107 
108  dst[i] = src0[i] - src1[i];
109 ============
110 */
111 void VPCALL idSIMD_Generic::Sub( float *dst, const float *src0, const float *src1, const int count ) {
112 #define OPER(X) dst[(X)] = src0[(X)] - src1[(X)];
113  UNROLL4(OPER)
114 #undef OPER
115 }
116 
117 /*
118 ============
119 idSIMD_Generic::Mul
120 
121  dst[i] = constant * src[i];
122 ============
123 */
124 void VPCALL idSIMD_Generic::Mul( float *dst, const float constant, const float *src0, const int count) {
125  double c = constant;
126 #define OPER(X) (dst[(X)] = (c * src0[(X)]))
127  UNROLL4(OPER)
128 #undef OPER
129 }
130 
131 /*
132 ============
133 idSIMD_Generic::Mul
134 
135  dst[i] = src0[i] * src1[i];
136 ============
137 */
138 void VPCALL idSIMD_Generic::Mul( float *dst, const float *src0, const float *src1, const int count ) {
139 #define OPER(X) (dst[(X)] = src0[(X)] * src1[(X)])
140  UNROLL4(OPER)
141 #undef OPER
142 }
143 
144 /*
145 ============
146 idSIMD_Generic::Div
147 
148  dst[i] = constant / divisor[i];
149 ============
150 */
151 void VPCALL idSIMD_Generic::Div( float *dst, const float constant, const float *divisor, const int count ) {
152  double c = constant;
153 #define OPER(X) (dst[(X)] = (c / divisor[(X)]))
154  UNROLL4(OPER)
155 #undef OPER
156 }
157 
158 /*
159 ============
160 idSIMD_Generic::Div
161 
162  dst[i] = src0[i] / src1[i];
163 ============
164 */
165 void VPCALL idSIMD_Generic::Div( float *dst, const float *src0, const float *src1, const int count ) {
166 #define OPER(X) (dst[(X)] = src0[(X)] / src1[(X)])
167  UNROLL4(OPER)
168 #undef OPER
169 }
170 
171 /*
172 ============
173 idSIMD_Generic::MulAdd
174 
175  dst[i] += constant * src[i];
176 ============
177 */
178 void VPCALL idSIMD_Generic::MulAdd( float *dst, const float constant, const float *src, const int count ) {
179  double c = constant;
180 #define OPER(X) (dst[(X)] += c * src[(X)])
181  UNROLL4(OPER)
182 #undef OPER
183 }
184 
185 /*
186 ============
187 idSIMD_Generic::MulAdd
188 
189  dst[i] += src0[i] * src1[i];
190 ============
191 */
192 void VPCALL idSIMD_Generic::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
193 #define OPER(X) (dst[(X)] += src0[(X)] * src1[(X)])
194  UNROLL4(OPER)
195 #undef OPER
196 }
197 
198 /*
199 ============
200 idSIMD_Generic::MulSub
201 
202  dst[i] -= constant * src[i];
203 ============
204 */
205 void VPCALL idSIMD_Generic::MulSub( float *dst, const float constant, const float *src, const int count ) {
206  double c = constant;
207 #define OPER(X) (dst[(X)] -= c * src[(X)])
208  UNROLL4(OPER)
209 #undef OPER
210 }
211 
212 /*
213 ============
214 idSIMD_Generic::MulSub
215 
216  dst[i] -= src0[i] * src1[i];
217 ============
218 */
219 void VPCALL idSIMD_Generic::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
220 #define OPER(X) (dst[(X)] -= src0[(X)] * src1[(X)])
221  UNROLL4(OPER)
222 #undef OPER
223 }
224 
225 /*
226 ============
227 idSIMD_Generic::Dot
228 
229  dst[i] = constant * src[i];
230 ============
231 */
232 void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
233 #define OPER(X) dst[(X)] = constant * src[(X)];
234  UNROLL1(OPER)
235 #undef OPER
236 }
237 
238 /*
239 ============
240 idSIMD_Generic::Dot
241 
242  dst[i] = constant * src[i].Normal() + src[i][3];
243 ============
244 */
245 void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
246 #define OPER(X) dst[(X)] = constant * src[(X)].Normal() + src[(X)][3];
247  UNROLL1(OPER)
248 #undef OPER
249 }
250 
251 /*
252 ============
253 idSIMD_Generic::Dot
254 
255  dst[i] = constant * src[i].xyz;
256 ============
257 */
258 void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
259 #define OPER(X) dst[(X)] = constant * src[(X)].xyz;
260  UNROLL1(OPER)
261 #undef OPER
262 }
263 
264 /*
265 ============
266 idSIMD_Generic::Dot
267 
268  dst[i] = constant.Normal() * src[i] + constant[3];
269 ============
270 */
271 void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
272 #define OPER(X) dst[(X)] = constant.Normal() * src[(X)] + constant[3];
273  UNROLL1(OPER)
274 #undef OPER
275 }
276 
277 /*
278 ============
279 idSIMD_Generic::Dot
280 
281  dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
282 ============
283 */
284 void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
285 #define OPER(X) dst[(X)] = constant.Normal() * src[(X)].Normal() + constant[3] * src[(X)][3];
286  UNROLL1(OPER)
287 #undef OPER
288 }
289 
290 /*
291 ============
292 idSIMD_Generic::Dot
293 
294  dst[i] = constant.Normal() * src[i].xyz + constant[3];
295 ============
296 */
297 void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
298 #define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
299  UNROLL1(OPER)
300 #undef OPER
301 }
302 
303 /*
304 ============
305 idSIMD_Generic::Dot
306 
307  dst[i] = src0[i] * src1[i];
308 ============
309 */
310 void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
311 #define OPER(X) dst[(X)] = src0[(X)] * src1[(X)];
312  UNROLL1(OPER)
313 #undef OPER
314 }
315 
316 /*
317 ============
318 idSIMD_Generic::Dot
319 
320  dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
321 ============
322 */
323 void VPCALL idSIMD_Generic::Dot( float &dot, const float *src1, const float *src2, const int count ) {
324 #if 1
325 
326  switch( count ) {
327  case 0: {
328  dot = 0.0f;
329  return;
330  }
331  case 1: {
332  dot = src1[0] * src2[0];
333  return;
334  }
335  case 2: {
336  dot = src1[0] * src2[0] + src1[1] * src2[1];
337  return;
338  }
339  case 3: {
340  dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2];
341  return;
342  }
343  default: {
344  int i;
345  double s0, s1, s2, s3;
346  s0 = src1[0] * src2[0];
347  s1 = src1[1] * src2[1];
348  s2 = src1[2] * src2[2];
349  s3 = src1[3] * src2[3];
350  for ( i = 4; i < count-7; i += 8 ) {
351  s0 += src1[i+0] * src2[i+0];
352  s1 += src1[i+1] * src2[i+1];
353  s2 += src1[i+2] * src2[i+2];
354  s3 += src1[i+3] * src2[i+3];
355  s0 += src1[i+4] * src2[i+4];
356  s1 += src1[i+5] * src2[i+5];
357  s2 += src1[i+6] * src2[i+6];
358  s3 += src1[i+7] * src2[i+7];
359  }
360  switch( count - i ) {
361  NODEFAULT;
362  case 7: s0 += src1[i+6] * src2[i+6];
363  case 6: s1 += src1[i+5] * src2[i+5];
364  case 5: s2 += src1[i+4] * src2[i+4];
365  case 4: s3 += src1[i+3] * src2[i+3];
366  case 3: s0 += src1[i+2] * src2[i+2];
367  case 2: s1 += src1[i+1] * src2[i+1];
368  case 1: s2 += src1[i+0] * src2[i+0];
369  case 0: break;
370  }
371  double sum;
372  sum = s3;
373  sum += s2;
374  sum += s1;
375  sum += s0;
376  dot = sum;
377  }
378  }
379 
380 #else
381 
382  dot = 0.0f;
383  for ( i = 0; i < count; i++ ) {
384  dot += src1[i] * src2[i];
385  }
386 
387 #endif
388 }
389 
390 /*
391 ============
392 idSIMD_Generic::CmpGT
393 
394  dst[i] = src0[i] > constant;
395 ============
396 */
397 void VPCALL idSIMD_Generic::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
398 #define OPER(X) dst[(X)] = src0[(X)] > constant;
399  UNROLL4(OPER)
400 #undef OPER
401 }
402 
403 /*
404 ============
405 idSIMD_Generic::CmpGT
406 
407  dst[i] |= ( src0[i] > constant ) << bitNum;
408 ============
409 */
410 void VPCALL idSIMD_Generic::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
411 #define OPER(X) dst[(X)] |= ( src0[(X)] > constant ) << bitNum;
412  UNROLL4(OPER)
413 #undef OPER
414 }
415 
416 /*
417 ============
418 idSIMD_Generic::CmpGE
419 
420  dst[i] = src0[i] >= constant;
421 ============
422 */
423 void VPCALL idSIMD_Generic::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
424 #define OPER(X) dst[(X)] = src0[(X)] >= constant;
425  UNROLL4(OPER)
426 #undef OPER
427 }
428 
429 /*
430 ============
431 idSIMD_Generic::CmpGE
432 
433  dst[i] |= ( src0[i] >= constant ) << bitNum;
434 ============
435 */
436 void VPCALL idSIMD_Generic::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
437 #define OPER(X) dst[(X)] |= ( src0[(X)] >= constant ) << bitNum;
438  UNROLL4(OPER)
439 #undef OPER
440 }
441 
442 /*
443 ============
444 idSIMD_Generic::CmpLT
445 
446  dst[i] = src0[i] < constant;
447 ============
448 */
449 void VPCALL idSIMD_Generic::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
450 #define OPER(X) dst[(X)] = src0[(X)] < constant;
451  UNROLL4(OPER)
452 #undef OPER
453 }
454 
455 /*
456 ============
457 idSIMD_Generic::CmpLT
458 
459  dst[i] |= ( src0[i] < constant ) << bitNum;
460 ============
461 */
462 void VPCALL idSIMD_Generic::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
463 #define OPER(X) dst[(X)] |= ( src0[(X)] < constant ) << bitNum;
464  UNROLL4(OPER)
465 #undef OPER
466 }
467 
468 /*
469 ============
470 idSIMD_Generic::CmpLE
471 
472  dst[i] = src0[i] <= constant;
473 ============
474 */
475 void VPCALL idSIMD_Generic::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
476 #define OPER(X) dst[(X)] = src0[(X)] <= constant;
477  UNROLL4(OPER)
478 #undef OPER
479 }
480 
481 /*
482 ============
483 idSIMD_Generic::CmpLE
484 
485  dst[i] |= ( src0[i] <= constant ) << bitNum;
486 ============
487 */
488 void VPCALL idSIMD_Generic::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
489 #define OPER(X) dst[(X)] |= ( src0[(X)] <= constant ) << bitNum;
490  UNROLL4(OPER)
491 #undef OPER
492 }
493 
494 /*
495 ============
496 idSIMD_Generic::MinMax
497 ============
498 */
499 void VPCALL idSIMD_Generic::MinMax( float &min, float &max, const float *src, const int count ) {
500  min = idMath::INFINITY; max = -idMath::INFINITY;
501 #define OPER(X) if ( src[(X)] < min ) {min = src[(X)];} if ( src[(X)] > max ) {max = src[(X)];}
502  UNROLL1(OPER)
503 #undef OPER
504 }
505 
506 /*
507 ============
508 idSIMD_Generic::MinMax
509 ============
510 */
512  min[0] = min[1] = idMath::INFINITY; max[0] = max[1] = -idMath::INFINITY;
513 #define OPER(X) const idVec2 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; }
514  UNROLL1(OPER)
515 #undef OPER
516 }
517 
518 /*
519 ============
520 idSIMD_Generic::MinMax
521 ============
522 */
524  min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
525 #define OPER(X) const idVec3 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
526  UNROLL1(OPER)
527 #undef OPER
528 }
529 
530 /*
531 ============
532 idSIMD_Generic::MinMax
533 ============
534 */
536  min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
537 #define OPER(X) const idVec3 &v = src[(X)].xyz; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
538  UNROLL1(OPER)
539 #undef OPER
540 }
541 
542 /*
543 ============
544 idSIMD_Generic::MinMax
545 ============
546 */
547 void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
548  min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
549 #define OPER(X) const idVec3 &v = src[indexes[(X)]].xyz; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
550  UNROLL1(OPER)
551 #undef OPER
552 }
553 
554 /*
555 ============
556 idSIMD_Generic::Clamp
557 ============
558 */
559 void VPCALL idSIMD_Generic::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
560 #define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)] > max ? max : src[(X)];
561  UNROLL1(OPER)
562 #undef OPER
563 }
564 
565 /*
566 ============
567 idSIMD_Generic::ClampMin
568 ============
569 */
570 void VPCALL idSIMD_Generic::ClampMin( float *dst, const float *src, const float min, const int count ) {
571 #define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)];
572  UNROLL1(OPER)
573 #undef OPER
574 }
575 
576 /*
577 ============
578 idSIMD_Generic::ClampMax
579 ============
580 */
581 void VPCALL idSIMD_Generic::ClampMax( float *dst, const float *src, const float max, const int count ) {
582 #define OPER(X) dst[(X)] = src[(X)] > max ? max : src[(X)];
583  UNROLL1(OPER)
584 #undef OPER
585 }
586 
587 /*
588 ================
589 idSIMD_Generic::Memcpy
590 ================
591 */
592 void VPCALL idSIMD_Generic::Memcpy( void *dst, const void *src, const int count ) {
593  memcpy( dst, src, count );
594 }
595 
596 /*
597 ================
598 idSIMD_Generic::Memset
599 ================
600 */
601 void VPCALL idSIMD_Generic::Memset( void *dst, const int val, const int count ) {
602  memset( dst, val, count );
603 }
604 
605 /*
606 ============
607 idSIMD_Generic::Zero16
608 ============
609 */
610 void VPCALL idSIMD_Generic::Zero16( float *dst, const int count ) {
611  memset( dst, 0, count * sizeof( float ) );
612 }
613 
614 /*
615 ============
616 idSIMD_Generic::Negate16
617 ============
618 */
619 void VPCALL idSIMD_Generic::Negate16( float *dst, const int count ) {
620  unsigned int *ptr = reinterpret_cast<unsigned int *>(dst);
621 #define OPER(X) ptr[(X)] ^= ( 1 << 31 ) // IEEE 32 bits float sign bit
622  UNROLL1(OPER)
623 #undef OPER
624 }
625 
626 /*
627 ============
628 idSIMD_Generic::Copy16
629 ============
630 */
631 void VPCALL idSIMD_Generic::Copy16( float *dst, const float *src, const int count ) {
632 #define OPER(X) dst[(X)] = src[(X)]
633  UNROLL1(OPER)
634 #undef OPER
635 }
636 
637 /*
638 ============
639 idSIMD_Generic::Add16
640 ============
641 */
642 void VPCALL idSIMD_Generic::Add16( float *dst, const float *src1, const float *src2, const int count ) {
643 #define OPER(X) dst[(X)] = src1[(X)] + src2[(X)]
644  UNROLL1(OPER)
645 #undef OPER
646 }
647 
648 /*
649 ============
650 idSIMD_Generic::Sub16
651 ============
652 */
653 void VPCALL idSIMD_Generic::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
654 #define OPER(X) dst[(X)] = src1[(X)] - src2[(X)]
655  UNROLL1(OPER)
656 #undef OPER
657 }
658 
659 /*
660 ============
661 idSIMD_Generic::Mul16
662 ============
663 */
664 void VPCALL idSIMD_Generic::Mul16( float *dst, const float *src1, const float constant, const int count ) {
665 #define OPER(X) dst[(X)] = src1[(X)] * constant
666  UNROLL1(OPER)
667 #undef OPER
668 }
669 
670 /*
671 ============
672 idSIMD_Generic::AddAssign16
673 ============
674 */
675 void VPCALL idSIMD_Generic::AddAssign16( float *dst, const float *src, const int count ) {
676 #define OPER(X) dst[(X)] += src[(X)]
677  UNROLL1(OPER)
678 #undef OPER
679 }
680 
681 /*
682 ============
683 idSIMD_Generic::SubAssign16
684 ============
685 */
686 void VPCALL idSIMD_Generic::SubAssign16( float *dst, const float *src, const int count ) {
687 #define OPER(X) dst[(X)] -= src[(X)]
688  UNROLL1(OPER)
689 #undef OPER
690 }
691 
692 /*
693 ============
694 idSIMD_Generic::MulAssign16
695 ============
696 */
697 void VPCALL idSIMD_Generic::MulAssign16( float *dst, const float constant, const int count ) {
698 #define OPER(X) dst[(X)] *= constant
699  UNROLL1(OPER)
700 #undef OPER
701 }
702 
703 /*
704 ============
705 idSIMD_Generic::MatX_MultiplyVecX
706 ============
707 */
709  int i, j, numRows;
710  const float *mPtr, *vPtr;
711  float *dstPtr;
712 
713  assert( vec.GetSize() >= mat.GetNumColumns() );
714  assert( dst.GetSize() >= mat.GetNumRows() );
715 
716  mPtr = mat.ToFloatPtr();
717  vPtr = vec.ToFloatPtr();
718  dstPtr = dst.ToFloatPtr();
719  numRows = mat.GetNumRows();
720  switch( mat.GetNumColumns() ) {
721  case 1:
722  for ( i = 0; i < numRows; i++ ) {
723  dstPtr[i] = mPtr[0] * vPtr[0];
724  mPtr++;
725  }
726  break;
727  case 2:
728  for ( i = 0; i < numRows; i++ ) {
729  dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
730  mPtr += 2;
731  }
732  break;
733  case 3:
734  for ( i = 0; i < numRows; i++ ) {
735  dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
736  mPtr += 3;
737  }
738  break;
739  case 4:
740  for ( i = 0; i < numRows; i++ ) {
741  dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
742  mPtr[3] * vPtr[3];
743  mPtr += 4;
744  }
745  break;
746  case 5:
747  for ( i = 0; i < numRows; i++ ) {
748  dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
749  mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
750  mPtr += 5;
751  }
752  break;
753  case 6:
754  for ( i = 0; i < numRows; i++ ) {
755  dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
756  mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
757  mPtr += 6;
758  }
759  break;
760  default:
761  int numColumns = mat.GetNumColumns();
762  for ( i = 0; i < numRows; i++ ) {
763  float sum = mPtr[0] * vPtr[0];
764  for ( j = 1; j < numColumns; j++ ) {
765  sum += mPtr[j] * vPtr[j];
766  }
767  dstPtr[i] = sum;
768  mPtr += numColumns;
769  }
770  break;
771  }
772 }
773 
774 /*
775 ============
776 idSIMD_Generic::MatX_MultiplyAddVecX
777 ============
778 */
780  int i, j, numRows;
781  const float *mPtr, *vPtr;
782  float *dstPtr;
783 
784  assert( vec.GetSize() >= mat.GetNumColumns() );
785  assert( dst.GetSize() >= mat.GetNumRows() );
786 
787  mPtr = mat.ToFloatPtr();
788  vPtr = vec.ToFloatPtr();
789  dstPtr = dst.ToFloatPtr();
790  numRows = mat.GetNumRows();
791  switch( mat.GetNumColumns() ) {
792  case 1:
793  for ( i = 0; i < numRows; i++ ) {
794  dstPtr[i] += mPtr[0] * vPtr[0];
795  mPtr++;
796  }
797  break;
798  case 2:
799  for ( i = 0; i < numRows; i++ ) {
800  dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
801  mPtr += 2;
802  }
803  break;
804  case 3:
805  for ( i = 0; i < numRows; i++ ) {
806  dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
807  mPtr += 3;
808  }
809  break;
810  case 4:
811  for ( i = 0; i < numRows; i++ ) {
812  dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
813  mPtr[3] * vPtr[3];
814  mPtr += 4;
815  }
816  break;
817  case 5:
818  for ( i = 0; i < numRows; i++ ) {
819  dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
820  mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
821  mPtr += 5;
822  }
823  break;
824  case 6:
825  for ( i = 0; i < numRows; i++ ) {
826  dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
827  mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
828  mPtr += 6;
829  }
830  break;
831  default:
832  int numColumns = mat.GetNumColumns();
833  for ( i = 0; i < numRows; i++ ) {
834  float sum = mPtr[0] * vPtr[0];
835  for ( j = 1; j < numColumns; j++ ) {
836  sum += mPtr[j] * vPtr[j];
837  }
838  dstPtr[i] += sum;
839  mPtr += numColumns;
840  }
841  break;
842  }
843 }
844 
845 /*
846 ============
847 idSIMD_Generic::MatX_MultiplySubVecX
848 ============
849 */
851  int i, j, numRows;
852  const float *mPtr, *vPtr;
853  float *dstPtr;
854 
855  assert( vec.GetSize() >= mat.GetNumColumns() );
856  assert( dst.GetSize() >= mat.GetNumRows() );
857 
858  mPtr = mat.ToFloatPtr();
859  vPtr = vec.ToFloatPtr();
860  dstPtr = dst.ToFloatPtr();
861  numRows = mat.GetNumRows();
862  switch( mat.GetNumColumns() ) {
863  case 1:
864  for ( i = 0; i < numRows; i++ ) {
865  dstPtr[i] -= mPtr[0] * vPtr[0];
866  mPtr++;
867  }
868  break;
869  case 2:
870  for ( i = 0; i < numRows; i++ ) {
871  dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
872  mPtr += 2;
873  }
874  break;
875  case 3:
876  for ( i = 0; i < numRows; i++ ) {
877  dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
878  mPtr += 3;
879  }
880  break;
881  case 4:
882  for ( i = 0; i < numRows; i++ ) {
883  dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
884  mPtr[3] * vPtr[3];
885  mPtr += 4;
886  }
887  break;
888  case 5:
889  for ( i = 0; i < numRows; i++ ) {
890  dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
891  mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
892  mPtr += 5;
893  }
894  break;
895  case 6:
896  for ( i = 0; i < numRows; i++ ) {
897  dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
898  mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
899  mPtr += 6;
900  }
901  break;
902  default:
903  int numColumns = mat.GetNumColumns();
904  for ( i = 0; i < numRows; i++ ) {
905  float sum = mPtr[0] * vPtr[0];
906  for ( j = 1; j < numColumns; j++ ) {
907  sum += mPtr[j] * vPtr[j];
908  }
909  dstPtr[i] -= sum;
910  mPtr += numColumns;
911  }
912  break;
913  }
914 }
915 
916 /*
917 ============
918 idSIMD_Generic::MatX_TransposeMultiplyVecX
919 ============
920 */
922  int i, j, numColumns;
923  const float *mPtr, *vPtr;
924  float *dstPtr;
925 
926  assert( vec.GetSize() >= mat.GetNumRows() );
927  assert( dst.GetSize() >= mat.GetNumColumns() );
928 
929  mPtr = mat.ToFloatPtr();
930  vPtr = vec.ToFloatPtr();
931  dstPtr = dst.ToFloatPtr();
932  numColumns = mat.GetNumColumns();
933  switch( mat.GetNumRows() ) {
934  case 1:
935  for ( i = 0; i < numColumns; i++ ) {
936  dstPtr[i] = *(mPtr) * vPtr[0];
937  mPtr++;
938  }
939  break;
940  case 2:
941  for ( i = 0; i < numColumns; i++ ) {
942  dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
943  mPtr++;
944  }
945  break;
946  case 3:
947  for ( i = 0; i < numColumns; i++ ) {
948  dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
949  mPtr++;
950  }
951  break;
952  case 4:
953  for ( i = 0; i < numColumns; i++ ) {
954  dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
955  *(mPtr+3*numColumns) * vPtr[3];
956  mPtr++;
957  }
958  break;
959  case 5:
960  for ( i = 0; i < numColumns; i++ ) {
961  dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
962  *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
963  mPtr++;
964  }
965  break;
966  case 6:
967  for ( i = 0; i < numColumns; i++ ) {
968  dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
969  *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
970  mPtr++;
971  }
972  break;
973  default:
974  int numRows = mat.GetNumRows();
975  for ( i = 0; i < numColumns; i++ ) {
976  mPtr = mat.ToFloatPtr() + i;
977  float sum = mPtr[0] * vPtr[0];
978  for ( j = 1; j < numRows; j++ ) {
979  mPtr += numColumns;
980  sum += mPtr[0] * vPtr[j];
981  }
982  dstPtr[i] = sum;
983  }
984  break;
985  }
986 }
987 
988 /*
989 ============
990 idSIMD_Generic::MatX_TransposeMultiplyAddVecX
991 ============
992 */
994  int i, j, numColumns;
995  const float *mPtr, *vPtr;
996  float *dstPtr;
997 
998  assert( vec.GetSize() >= mat.GetNumRows() );
999  assert( dst.GetSize() >= mat.GetNumColumns() );
1000 
1001  mPtr = mat.ToFloatPtr();
1002  vPtr = vec.ToFloatPtr();
1003  dstPtr = dst.ToFloatPtr();
1004  numColumns = mat.GetNumColumns();
1005  switch( mat.GetNumRows() ) {
1006  case 1:
1007  for ( i = 0; i < numColumns; i++ ) {
1008  dstPtr[i] += *(mPtr) * vPtr[0];
1009  mPtr++;
1010  }
1011  break;
1012  case 2:
1013  for ( i = 0; i < numColumns; i++ ) {
1014  dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
1015  mPtr++;
1016  }
1017  break;
1018  case 3:
1019  for ( i = 0; i < numColumns; i++ ) {
1020  dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
1021  mPtr++;
1022  }
1023  break;
1024  case 4:
1025  for ( i = 0; i < numColumns; i++ ) {
1026  dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
1027  *(mPtr+3*numColumns) * vPtr[3];
1028  mPtr++;
1029  }
1030  break;
1031  case 5:
1032  for ( i = 0; i < numColumns; i++ ) {
1033  dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
1034  *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
1035  mPtr++;
1036  }
1037  break;
1038  case 6:
1039  for ( i = 0; i < numColumns; i++ ) {
1040  dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
1041  *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
1042  mPtr++;
1043  }
1044  break;
1045  default:
1046  int numRows = mat.GetNumRows();
1047  for ( i = 0; i < numColumns; i++ ) {
1048  mPtr = mat.ToFloatPtr() + i;
1049  float sum = mPtr[0] * vPtr[0];
1050  for ( j = 1; j < numRows; j++ ) {
1051  mPtr += numColumns;
1052  sum += mPtr[0] * vPtr[j];
1053  }
1054  dstPtr[i] += sum;
1055  }
1056  break;
1057  }
1058 }
1059 
1060 /*
1061 ============
1062 idSIMD_Generic::MatX_TransposeMultiplySubVecX
1063 ============
1064 */
1066  int i, numColumns;
1067  const float *mPtr, *vPtr;
1068  float *dstPtr;
1069 
1070  assert( vec.GetSize() >= mat.GetNumRows() );
1071  assert( dst.GetSize() >= mat.GetNumColumns() );
1072 
1073  mPtr = mat.ToFloatPtr();
1074  vPtr = vec.ToFloatPtr();
1075  dstPtr = dst.ToFloatPtr();
1076  numColumns = mat.GetNumColumns();
1077  switch( mat.GetNumRows() ) {
1078  case 1:
1079  for ( i = 0; i < numColumns; i++ ) {
1080  dstPtr[i] -= *(mPtr) * vPtr[0];
1081  mPtr++;
1082  }
1083  break;
1084  case 2:
1085  for ( i = 0; i < numColumns; i++ ) {
1086  dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
1087  mPtr++;
1088  }
1089  break;
1090  case 3:
1091  for ( i = 0; i < numColumns; i++ ) {
1092  dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
1093  mPtr++;
1094  }
1095  break;
1096  case 4:
1097  for ( i = 0; i < numColumns; i++ ) {
1098  dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
1099  *(mPtr+3*numColumns) * vPtr[3];
1100  mPtr++;
1101  }
1102  break;
1103  case 5:
1104  for ( i = 0; i < numColumns; i++ ) {
1105  dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
1106  *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
1107  mPtr++;
1108  }
1109  break;
1110  case 6:
1111  for ( i = 0; i < numColumns; i++ ) {
1112  dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
1113  *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
1114  mPtr++;
1115  }
1116  break;
1117  default:
1118  int numRows = mat.GetNumRows();
1119  for ( i = 0; i < numColumns; i++ ) {
1120  mPtr = mat.ToFloatPtr() + i;
1121  float sum = mPtr[0] * vPtr[0];
1122  for ( int j = 1; j < numRows; j++ ) {
1123  mPtr += numColumns;
1124  sum += mPtr[0] * vPtr[j];
1125  }
1126  dstPtr[i] -= sum;
1127  }
1128  break;
1129  }
1130 }
1131 
1132 /*
1133 ============
1134 idSIMD_Generic::MatX_MultiplyMatX
1135 
1136  optimizes the following matrix multiplications:
1137 
1138  NxN * Nx6
1139  6xN * Nx6
1140  Nx6 * 6xN
1141  6x6 * 6xN
1142 
1143  with N in the range [1-6].
1144 ============
1145 */
1147  int i, j, k, l, n;
1148  float *dstPtr;
1149  const float *m1Ptr, *m2Ptr;
1150  double sum;
1151 
1152  assert( m1.GetNumColumns() == m2.GetNumRows() );
1153 
1154  dstPtr = dst.ToFloatPtr();
1155  m1Ptr = m1.ToFloatPtr();
1156  m2Ptr = m2.ToFloatPtr();
1157  k = m1.GetNumRows();
1158  l = m2.GetNumColumns();
1159 
1160  switch( m1.GetNumColumns() ) {
1161  case 1: {
1162  if ( l == 6 ) {
1163  for ( i = 0; i < k; i++ ) { // Nx1 * 1x6
1164  *dstPtr++ = m1Ptr[i] * m2Ptr[0];
1165  *dstPtr++ = m1Ptr[i] * m2Ptr[1];
1166  *dstPtr++ = m1Ptr[i] * m2Ptr[2];
1167  *dstPtr++ = m1Ptr[i] * m2Ptr[3];
1168  *dstPtr++ = m1Ptr[i] * m2Ptr[4];
1169  *dstPtr++ = m1Ptr[i] * m2Ptr[5];
1170  }
1171  return;
1172  }
1173  for ( i = 0; i < k; i++ ) {
1174  m2Ptr = m2.ToFloatPtr();
1175  for ( j = 0; j < l; j++ ) {
1176  *dstPtr++ = m1Ptr[0] * m2Ptr[0];
1177  m2Ptr++;
1178  }
1179  m1Ptr++;
1180  }
1181  break;
1182  }
1183  case 2: {
1184  if ( l == 6 ) {
1185  for ( i = 0; i < k; i++ ) { // Nx2 * 2x6
1186  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6];
1187  *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7];
1188  *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8];
1189  *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9];
1190  *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10];
1191  *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11];
1192  m1Ptr += 2;
1193  }
1194  return;
1195  }
1196  for ( i = 0; i < k; i++ ) {
1197  m2Ptr = m2.ToFloatPtr();
1198  for ( j = 0; j < l; j++ ) {
1199  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l];
1200  m2Ptr++;
1201  }
1202  m1Ptr += 2;
1203  }
1204  break;
1205  }
1206  case 3: {
1207  if ( l == 6 ) {
1208  for ( i = 0; i < k; i++ ) { // Nx3 * 3x6
1209  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12];
1210  *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13];
1211  *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14];
1212  *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15];
1213  *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16];
1214  *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17];
1215  m1Ptr += 3;
1216  }
1217  return;
1218  }
1219  for ( i = 0; i < k; i++ ) {
1220  m2Ptr = m2.ToFloatPtr();
1221  for ( j = 0; j < l; j++ ) {
1222  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l];
1223  m2Ptr++;
1224  }
1225  m1Ptr += 3;
1226  }
1227  break;
1228  }
1229  case 4: {
1230  if ( l == 6 ) {
1231  for ( i = 0; i < k; i++ ) { // Nx4 * 4x6
1232  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12] + m1Ptr[3] * m2Ptr[18];
1233  *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13] + m1Ptr[3] * m2Ptr[19];
1234  *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14] + m1Ptr[3] * m2Ptr[20];
1235  *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15] + m1Ptr[3] * m2Ptr[21];
1236  *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16] + m1Ptr[3] * m2Ptr[22];
1237  *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17] + m1Ptr[3] * m2Ptr[23];
1238  m1Ptr += 4;
1239  }
1240  return;
1241  }
1242  for ( i = 0; i < k; i++ ) {
1243  m2Ptr = m2.ToFloatPtr();
1244  for ( j = 0; j < l; j++ ) {
1245  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
1246  m1Ptr[3] * m2Ptr[3*l];
1247  m2Ptr++;
1248  }
1249  m1Ptr += 4;
1250  }
1251  break;
1252  }
1253  case 5: {
1254  if ( l == 6 ) {
1255  for ( i = 0; i < k; i++ ) { // Nx5 * 5x6
1256  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12] + m1Ptr[3] * m2Ptr[18] + m1Ptr[4] * m2Ptr[24];
1257  *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13] + m1Ptr[3] * m2Ptr[19] + m1Ptr[4] * m2Ptr[25];
1258  *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14] + m1Ptr[3] * m2Ptr[20] + m1Ptr[4] * m2Ptr[26];
1259  *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15] + m1Ptr[3] * m2Ptr[21] + m1Ptr[4] * m2Ptr[27];
1260  *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16] + m1Ptr[3] * m2Ptr[22] + m1Ptr[4] * m2Ptr[28];
1261  *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17] + m1Ptr[3] * m2Ptr[23] + m1Ptr[4] * m2Ptr[29];
1262  m1Ptr += 5;
1263  }
1264  return;
1265  }
1266  for ( i = 0; i < k; i++ ) {
1267  m2Ptr = m2.ToFloatPtr();
1268  for ( j = 0; j < l; j++ ) {
1269  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
1270  m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l];
1271  m2Ptr++;
1272  }
1273  m1Ptr += 5;
1274  }
1275  break;
1276  }
1277  case 6: {
1278  switch( k ) {
1279  case 1: {
1280  if ( l == 1 ) { // 1x6 * 6x1
1281  dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] +
1282  m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5];
1283  return;
1284  }
1285  break;
1286  }
1287  case 2: {
1288  if ( l == 2 ) { // 2x6 * 6x2
1289  for ( i = 0; i < 2; i++ ) {
1290  for ( j = 0; j < 2; j++ ) {
1291  *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 2 + j ]
1292  + m1Ptr[1] * m2Ptr[ 1 * 2 + j ]
1293  + m1Ptr[2] * m2Ptr[ 2 * 2 + j ]
1294  + m1Ptr[3] * m2Ptr[ 3 * 2 + j ]
1295  + m1Ptr[4] * m2Ptr[ 4 * 2 + j ]
1296  + m1Ptr[5] * m2Ptr[ 5 * 2 + j ];
1297  dstPtr++;
1298  }
1299  m1Ptr += 6;
1300  }
1301  return;
1302  }
1303  break;
1304  }
1305  case 3: {
1306  if ( l == 3 ) { // 3x6 * 6x3
1307  for ( i = 0; i < 3; i++ ) {
1308  for ( j = 0; j < 3; j++ ) {
1309  *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 3 + j ]
1310  + m1Ptr[1] * m2Ptr[ 1 * 3 + j ]
1311  + m1Ptr[2] * m2Ptr[ 2 * 3 + j ]
1312  + m1Ptr[3] * m2Ptr[ 3 * 3 + j ]
1313  + m1Ptr[4] * m2Ptr[ 4 * 3 + j ]
1314  + m1Ptr[5] * m2Ptr[ 5 * 3 + j ];
1315  dstPtr++;
1316  }
1317  m1Ptr += 6;
1318  }
1319  return;
1320  }
1321  break;
1322  }
1323  case 4: {
1324  if ( l == 4 ) { // 4x6 * 6x4
1325  for ( i = 0; i < 4; i++ ) {
1326  for ( j = 0; j < 4; j++ ) {
1327  *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 4 + j ]
1328  + m1Ptr[1] * m2Ptr[ 1 * 4 + j ]
1329  + m1Ptr[2] * m2Ptr[ 2 * 4 + j ]
1330  + m1Ptr[3] * m2Ptr[ 3 * 4 + j ]
1331  + m1Ptr[4] * m2Ptr[ 4 * 4 + j ]
1332  + m1Ptr[5] * m2Ptr[ 5 * 4 + j ];
1333  dstPtr++;
1334  }
1335  m1Ptr += 6;
1336  }
1337  return;
1338  }
1339  }
1340  case 5: {
1341  if ( l == 5 ) { // 5x6 * 6x5
1342  for ( i = 0; i < 5; i++ ) {
1343  for ( j = 0; j < 5; j++ ) {
1344  *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 5 + j ]
1345  + m1Ptr[1] * m2Ptr[ 1 * 5 + j ]
1346  + m1Ptr[2] * m2Ptr[ 2 * 5 + j ]
1347  + m1Ptr[3] * m2Ptr[ 3 * 5 + j ]
1348  + m1Ptr[4] * m2Ptr[ 4 * 5 + j ]
1349  + m1Ptr[5] * m2Ptr[ 5 * 5 + j ];
1350  dstPtr++;
1351  }
1352  m1Ptr += 6;
1353  }
1354  return;
1355  }
1356  }
1357  case 6: {
1358  switch( l ) {
1359  case 1: { // 6x6 * 6x1
1360  for ( i = 0; i < 6; i++ ) {
1361  *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 1 ]
1362  + m1Ptr[1] * m2Ptr[ 1 * 1 ]
1363  + m1Ptr[2] * m2Ptr[ 2 * 1 ]
1364  + m1Ptr[3] * m2Ptr[ 3 * 1 ]
1365  + m1Ptr[4] * m2Ptr[ 4 * 1 ]
1366  + m1Ptr[5] * m2Ptr[ 5 * 1 ];
1367  dstPtr++;
1368  m1Ptr += 6;
1369  }
1370  return;
1371  }
1372  case 2: { // 6x6 * 6x2
1373  for ( i = 0; i < 6; i++ ) {
1374  for ( j = 0; j < 2; j++ ) {
1375  *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 2 + j ]
1376  + m1Ptr[1] * m2Ptr[ 1 * 2 + j ]
1377  + m1Ptr[2] * m2Ptr[ 2 * 2 + j ]
1378  + m1Ptr[3] * m2Ptr[ 3 * 2 + j ]
1379  + m1Ptr[4] * m2Ptr[ 4 * 2 + j ]
1380  + m1Ptr[5] * m2Ptr[ 5 * 2 + j ];
1381  dstPtr++;
1382  }
1383  m1Ptr += 6;
1384  }
1385  return;
1386  }
1387  case 3: { // 6x6 * 6x3
1388  for ( i = 0; i < 6; i++ ) {
1389  for ( j = 0; j < 3; j++ ) {
1390  *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 3 + j ]
1391  + m1Ptr[1] * m2Ptr[ 1 * 3 + j ]
1392  + m1Ptr[2] * m2Ptr[ 2 * 3 + j ]
1393  + m1Ptr[3] * m2Ptr[ 3 * 3 + j ]
1394  + m1Ptr[4] * m2Ptr[ 4 * 3 + j ]
1395  + m1Ptr[5] * m2Ptr[ 5 * 3 + j ];
1396  dstPtr++;
1397  }
1398  m1Ptr += 6;
1399  }
1400  return;
1401  }
1402  case 4: { // 6x6 * 6x4
1403  for ( i = 0; i < 6; i++ ) {
1404  for ( j = 0; j < 4; j++ ) {
1405  *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 4 + j ]
1406  + m1Ptr[1] * m2Ptr[ 1 * 4 + j ]
1407  + m1Ptr[2] * m2Ptr[ 2 * 4 + j ]
1408  + m1Ptr[3] * m2Ptr[ 3 * 4 + j ]
1409  + m1Ptr[4] * m2Ptr[ 4 * 4 + j ]
1410  + m1Ptr[5] * m2Ptr[ 5 * 4 + j ];
1411  dstPtr++;
1412  }
1413  m1Ptr += 6;
1414  }
1415  return;
1416  }
1417  case 5: { // 6x6 * 6x5
1418  for ( i = 0; i < 6; i++ ) {
1419  for ( j = 0; j < 5; j++ ) {
1420  *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 5 + j ]
1421  + m1Ptr[1] * m2Ptr[ 1 * 5 + j ]
1422  + m1Ptr[2] * m2Ptr[ 2 * 5 + j ]
1423  + m1Ptr[3] * m2Ptr[ 3 * 5 + j ]
1424  + m1Ptr[4] * m2Ptr[ 4 * 5 + j ]
1425  + m1Ptr[5] * m2Ptr[ 5 * 5 + j ];
1426  dstPtr++;
1427  }
1428  m1Ptr += 6;
1429  }
1430  return;
1431  }
1432  case 6: { // 6x6 * 6x6
1433  for ( i = 0; i < 6; i++ ) {
1434  for ( j = 0; j < 6; j++ ) {
1435  *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 6 + j ]
1436  + m1Ptr[1] * m2Ptr[ 1 * 6 + j ]
1437  + m1Ptr[2] * m2Ptr[ 2 * 6 + j ]
1438  + m1Ptr[3] * m2Ptr[ 3 * 6 + j ]
1439  + m1Ptr[4] * m2Ptr[ 4 * 6 + j ]
1440  + m1Ptr[5] * m2Ptr[ 5 * 6 + j ];
1441  dstPtr++;
1442  }
1443  m1Ptr += 6;
1444  }
1445  return;
1446  }
1447  }
1448  }
1449  }
1450  for ( i = 0; i < k; i++ ) {
1451  m2Ptr = m2.ToFloatPtr();
1452  for ( j = 0; j < l; j++ ) {
1453  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
1454  m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l];
1455  m2Ptr++;
1456  }
1457  m1Ptr += 6;
1458  }
1459  break;
1460  }
1461  default: {
1462  for ( i = 0; i < k; i++ ) {
1463  for ( j = 0; j < l; j++ ) {
1464  m2Ptr = m2.ToFloatPtr() + j;
1465  sum = m1Ptr[0] * m2Ptr[0];
1466  for ( n = 1; n < m1.GetNumColumns(); n++ ) {
1467  m2Ptr += l;
1468  sum += m1Ptr[n] * m2Ptr[0];
1469  }
1470  *dstPtr++ = sum;
1471  }
1472  m1Ptr += m1.GetNumColumns();
1473  }
1474  break;
1475  }
1476  }
1477 }
1478 
1479 /*
1480 ============
1481 idSIMD_Generic::MatX_TransposeMultiplyMatX
1482 
1483  optimizes the following tranpose matrix multiplications:
1484 
1485  Nx6 * NxN
1486  6xN * 6x6
1487 
1488  with N in the range [1-6].
1489 ============
1490 */
1492  int i, j, k, l, n;
1493  float *dstPtr;
1494  const float *m1Ptr, *m2Ptr;
1495  double sum;
1496 
1497  assert( m1.GetNumRows() == m2.GetNumRows() );
1498 
1499  m1Ptr = m1.ToFloatPtr();
1500  m2Ptr = m2.ToFloatPtr();
1501  dstPtr = dst.ToFloatPtr();
1502  k = m1.GetNumColumns();
1503  l = m2.GetNumColumns();
1504 
1505  switch( m1.GetNumRows() ) {
1506  case 1:
1507  if ( k == 6 && l == 1 ) { // 1x6 * 1x1
1508  for ( i = 0; i < 6; i++ ) {
1509  *dstPtr++ = m1Ptr[0] * m2Ptr[0];
1510  m1Ptr++;
1511  }
1512  return;
1513  }
1514  for ( i = 0; i < k; i++ ) {
1515  m2Ptr = m2.ToFloatPtr();
1516  for ( j = 0; j < l; j++ ) {
1517  *dstPtr++ = m1Ptr[0] * m2Ptr[0];
1518  m2Ptr++;
1519  }
1520  m1Ptr++;
1521  }
1522  break;
1523  case 2:
1524  if ( k == 6 && l == 2 ) { // 2x6 * 2x2
1525  for ( i = 0; i < 6; i++ ) {
1526  *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*2+0] + m1Ptr[1*6] * m2Ptr[1*2+0];
1527  *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*2+1] + m1Ptr[1*6] * m2Ptr[1*2+1];
1528  m1Ptr++;
1529  }
1530  return;
1531  }
1532  for ( i = 0; i < k; i++ ) {
1533  m2Ptr = m2.ToFloatPtr();
1534  for ( j = 0; j < l; j++ ) {
1535  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l];
1536  m2Ptr++;
1537  }
1538  m1Ptr++;
1539  }
1540  break;
1541  case 3:
1542  if ( k == 6 && l == 3 ) { // 3x6 * 3x3
1543  for ( i = 0; i < 6; i++ ) {
1544  *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*3+0] + m1Ptr[1*6] * m2Ptr[1*3+0] + m1Ptr[2*6] * m2Ptr[2*3+0];
1545  *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*3+1] + m1Ptr[1*6] * m2Ptr[1*3+1] + m1Ptr[2*6] * m2Ptr[2*3+1];
1546  *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*3+2] + m1Ptr[1*6] * m2Ptr[1*3+2] + m1Ptr[2*6] * m2Ptr[2*3+2];
1547  m1Ptr++;
1548  }
1549  return;
1550  }
1551  for ( i = 0; i < k; i++ ) {
1552  m2Ptr = m2.ToFloatPtr();
1553  for ( j = 0; j < l; j++ ) {
1554  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l];
1555  m2Ptr++;
1556  }
1557  m1Ptr++;
1558  }
1559  break;
1560  case 4:
1561  if ( k == 6 && l == 4 ) { // 4x6 * 4x4
1562  for ( i = 0; i < 6; i++ ) {
1563  *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+0] + m1Ptr[1*6] * m2Ptr[1*4+0] + m1Ptr[2*6] * m2Ptr[2*4+0] + m1Ptr[3*6] * m2Ptr[3*4+0];
1564  *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+1] + m1Ptr[1*6] * m2Ptr[1*4+1] + m1Ptr[2*6] * m2Ptr[2*4+1] + m1Ptr[3*6] * m2Ptr[3*4+1];
1565  *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+2] + m1Ptr[1*6] * m2Ptr[1*4+2] + m1Ptr[2*6] * m2Ptr[2*4+2] + m1Ptr[3*6] * m2Ptr[3*4+2];
1566  *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+3] + m1Ptr[1*6] * m2Ptr[1*4+3] + m1Ptr[2*6] * m2Ptr[2*4+3] + m1Ptr[3*6] * m2Ptr[3*4+3];
1567  m1Ptr++;
1568  }
1569  return;
1570  }
1571  for ( i = 0; i < k; i++ ) {
1572  m2Ptr = m2.ToFloatPtr();
1573  for ( j = 0; j < l; j++ ) {
1574  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
1575  m1Ptr[3*k] * m2Ptr[3*l];
1576  m2Ptr++;
1577  }
1578  m1Ptr++;
1579  }
1580  break;
1581  case 5:
1582  if ( k == 6 && l == 5 ) { // 5x6 * 5x5
1583  for ( i = 0; i < 6; i++ ) {
1584  *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+0] + m1Ptr[1*6] * m2Ptr[1*5+0] + m1Ptr[2*6] * m2Ptr[2*5+0] + m1Ptr[3*6] * m2Ptr[3*5+0] + m1Ptr[4*6] * m2Ptr[4*5+0];
1585  *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+1] + m1Ptr[1*6] * m2Ptr[1*5+1] + m1Ptr[2*6] * m2Ptr[2*5+1] + m1Ptr[3*6] * m2Ptr[3*5+1] + m1Ptr[4*6] * m2Ptr[4*5+1];
1586  *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+2] + m1Ptr[1*6] * m2Ptr[1*5+2] + m1Ptr[2*6] * m2Ptr[2*5+2] + m1Ptr[3*6] * m2Ptr[3*5+2] + m1Ptr[4*6] * m2Ptr[4*5+2];
1587  *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+3] + m1Ptr[1*6] * m2Ptr[1*5+3] + m1Ptr[2*6] * m2Ptr[2*5+3] + m1Ptr[3*6] * m2Ptr[3*5+3] + m1Ptr[4*6] * m2Ptr[4*5+3];
1588  *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+4] + m1Ptr[1*6] * m2Ptr[1*5+4] + m1Ptr[2*6] * m2Ptr[2*5+4] + m1Ptr[3*6] * m2Ptr[3*5+4] + m1Ptr[4*6] * m2Ptr[4*5+4];
1589  m1Ptr++;
1590  }
1591  return;
1592  }
1593  for ( i = 0; i < k; i++ ) {
1594  m2Ptr = m2.ToFloatPtr();
1595  for ( j = 0; j < l; j++ ) {
1596  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
1597  m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l];
1598  m2Ptr++;
1599  }
1600  m1Ptr++;
1601  }
1602  break;
1603  case 6:
1604  if ( l == 6 ) {
1605  switch( k ) {
1606  case 1: // 6x1 * 6x6
1607  m2Ptr = m2.ToFloatPtr();
1608  for ( j = 0; j < 6; j++ ) {
1609  *dstPtr++ = m1Ptr[0*1] * m2Ptr[0*6] +
1610  m1Ptr[1*1] * m2Ptr[1*6] +
1611  m1Ptr[2*1] * m2Ptr[2*6] +
1612  m1Ptr[3*1] * m2Ptr[3*6] +
1613  m1Ptr[4*1] * m2Ptr[4*6] +
1614  m1Ptr[5*1] * m2Ptr[5*6];
1615  m2Ptr++;
1616  }
1617  return;
1618  case 2: // 6x2 * 6x6
1619  for ( i = 0; i < 2; i++ ) {
1620  m2Ptr = m2.ToFloatPtr();
1621  for ( j = 0; j < 6; j++ ) {
1622  *dstPtr++ = m1Ptr[0*2] * m2Ptr[0*6] +
1623  m1Ptr[1*2] * m2Ptr[1*6] +
1624  m1Ptr[2*2] * m2Ptr[2*6] +
1625  m1Ptr[3*2] * m2Ptr[3*6] +
1626  m1Ptr[4*2] * m2Ptr[4*6] +
1627  m1Ptr[5*2] * m2Ptr[5*6];
1628  m2Ptr++;
1629  }
1630  m1Ptr++;
1631  }
1632  return;
1633  case 3: // 6x3 * 6x6
1634  for ( i = 0; i < 3; i++ ) {
1635  m2Ptr = m2.ToFloatPtr();
1636  for ( j = 0; j < 6; j++ ) {
1637  *dstPtr++ = m1Ptr[0*3] * m2Ptr[0*6] +
1638  m1Ptr[1*3] * m2Ptr[1*6] +
1639  m1Ptr[2*3] * m2Ptr[2*6] +
1640  m1Ptr[3*3] * m2Ptr[3*6] +
1641  m1Ptr[4*3] * m2Ptr[4*6] +
1642  m1Ptr[5*3] * m2Ptr[5*6];
1643  m2Ptr++;
1644  }
1645  m1Ptr++;
1646  }
1647  return;
1648  case 4: // 6x4 * 6x6
1649  for ( i = 0; i < 4; i++ ) {
1650  m2Ptr = m2.ToFloatPtr();
1651  for ( j = 0; j < 6; j++ ) {
1652  *dstPtr++ = m1Ptr[0*4] * m2Ptr[0*6] +
1653  m1Ptr[1*4] * m2Ptr[1*6] +
1654  m1Ptr[2*4] * m2Ptr[2*6] +
1655  m1Ptr[3*4] * m2Ptr[3*6] +
1656  m1Ptr[4*4] * m2Ptr[4*6] +
1657  m1Ptr[5*4] * m2Ptr[5*6];
1658  m2Ptr++;
1659  }
1660  m1Ptr++;
1661  }
1662  return;
1663  case 5: // 6x5 * 6x6
1664  for ( i = 0; i < 5; i++ ) {
1665  m2Ptr = m2.ToFloatPtr();
1666  for ( j = 0; j < 6; j++ ) {
1667  *dstPtr++ = m1Ptr[0*5] * m2Ptr[0*6] +
1668  m1Ptr[1*5] * m2Ptr[1*6] +
1669  m1Ptr[2*5] * m2Ptr[2*6] +
1670  m1Ptr[3*5] * m2Ptr[3*6] +
1671  m1Ptr[4*5] * m2Ptr[4*6] +
1672  m1Ptr[5*5] * m2Ptr[5*6];
1673  m2Ptr++;
1674  }
1675  m1Ptr++;
1676  }
1677  return;
1678  case 6: // 6x6 * 6x6
1679  for ( i = 0; i < 6; i++ ) {
1680  m2Ptr = m2.ToFloatPtr();
1681  for ( j = 0; j < 6; j++ ) {
1682  *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*6] +
1683  m1Ptr[1*6] * m2Ptr[1*6] +
1684  m1Ptr[2*6] * m2Ptr[2*6] +
1685  m1Ptr[3*6] * m2Ptr[3*6] +
1686  m1Ptr[4*6] * m2Ptr[4*6] +
1687  m1Ptr[5*6] * m2Ptr[5*6];
1688  m2Ptr++;
1689  }
1690  m1Ptr++;
1691  }
1692  return;
1693  }
1694  }
1695  for ( i = 0; i < k; i++ ) {
1696  m2Ptr = m2.ToFloatPtr();
1697  for ( j = 0; j < l; j++ ) {
1698  *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
1699  m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l];
1700  m2Ptr++;
1701  }
1702  m1Ptr++;
1703  }
1704  break;
1705  default:
1706  for ( i = 0; i < k; i++ ) {
1707  for ( j = 0; j < l; j++ ) {
1708  m1Ptr = m1.ToFloatPtr() + i;
1709  m2Ptr = m2.ToFloatPtr() + j;
1710  sum = m1Ptr[0] * m2Ptr[0];
1711  for ( n = 1; n < m1.GetNumRows(); n++ ) {
1712  m1Ptr += k;
1713  m2Ptr += l;
1714  sum += m1Ptr[0] * m2Ptr[0];
1715  }
1716  *dstPtr++ = sum;
1717  }
1718  }
1719  break;
1720  }
1721 }
1722 
1723 /*
1724 ============
1725 idSIMD_Generic::MatX_LowerTriangularSolve
1726 
1727  solves x in Lx = b for the n * n sub-matrix of L
1728  if skip > 0 the first skip elements of x are assumed to be valid already
1729  L has to be a lower triangular matrix with (implicit) ones on the diagonal
1730  x == b is allowed
1731 ============
1732 */
1733 void VPCALL idSIMD_Generic::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
1734 #if 1
1735 
1736  int nc;
1737  const float *lptr;
1738 
1739  if ( skip >= n ) {
1740  return;
1741  }
1742 
1743  lptr = L.ToFloatPtr();
1744  nc = L.GetNumColumns();
1745 
1746  // unrolled cases for n < 8
1747  if ( n < 8 ) {
1748  #define NSKIP( n, s ) ((n<<3)|(s&7))
1749  switch( NSKIP( n, skip ) ) {
1750  case NSKIP( 1, 0 ): x[0] = b[0];
1751  return;
1752  case NSKIP( 2, 0 ): x[0] = b[0];
1753  case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
1754  return;
1755  case NSKIP( 3, 0 ): x[0] = b[0];
1756  case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
1757  case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
1758  return;
1759  case NSKIP( 4, 0 ): x[0] = b[0];
1760  case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
1761  case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
1762  case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
1763  return;
1764  case NSKIP( 5, 0 ): x[0] = b[0];
1765  case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
1766  case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
1767  case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
1768  case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
1769  return;
1770  case NSKIP( 6, 0 ): x[0] = b[0];
1771  case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
1772  case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
1773  case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
1774  case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
1775  case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
1776  return;
1777  case NSKIP( 7, 0 ): x[0] = b[0];
1778  case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
1779  case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
1780  case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
1781  case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
1782  case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
1783  case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
1784  return;
1785  }
1786  return;
1787  }
1788 
1789  // process first 4 rows
1790  switch( skip ) {
1791  case 0: x[0] = b[0];
1792  case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
1793  case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
1794  case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
1795  skip = 4;
1796  }
1797 
1798  lptr = L[skip];
1799 
1800  int i, j;
1801  register double s0, s1, s2, s3;
1802 
1803  for ( i = skip; i < n; i++ ) {
1804  s0 = lptr[0] * x[0];
1805  s1 = lptr[1] * x[1];
1806  s2 = lptr[2] * x[2];
1807  s3 = lptr[3] * x[3];
1808  for ( j = 4; j < i-7; j += 8 ) {
1809  s0 += lptr[j+0] * x[j+0];
1810  s1 += lptr[j+1] * x[j+1];
1811  s2 += lptr[j+2] * x[j+2];
1812  s3 += lptr[j+3] * x[j+3];
1813  s0 += lptr[j+4] * x[j+4];
1814  s1 += lptr[j+5] * x[j+5];
1815  s2 += lptr[j+6] * x[j+6];
1816  s3 += lptr[j+7] * x[j+7];
1817  }
1818  switch( i - j ) {
1819  NODEFAULT;
1820  case 7: s0 += lptr[j+6] * x[j+6];
1821  case 6: s1 += lptr[j+5] * x[j+5];
1822  case 5: s2 += lptr[j+4] * x[j+4];
1823  case 4: s3 += lptr[j+3] * x[j+3];
1824  case 3: s0 += lptr[j+2] * x[j+2];
1825  case 2: s1 += lptr[j+1] * x[j+1];
1826  case 1: s2 += lptr[j+0] * x[j+0];
1827  case 0: break;
1828  }
1829  double sum;
1830  sum = s3;
1831  sum += s2;
1832  sum += s1;
1833  sum += s0;
1834  sum -= b[i];
1835  x[i] = -sum;
1836  lptr += nc;
1837  }
1838 
1839 #else
1840 
1841  int i, j;
1842  const float *lptr;
1843  double sum;
1844 
1845  for ( i = skip; i < n; i++ ) {
1846  sum = b[i];
1847  lptr = L[i];
1848  for ( j = 0; j < i; j++ ) {
1849  sum -= lptr[j] * x[j];
1850  }
1851  x[i] = sum;
1852  }
1853 
1854 #endif
1855 }
1856 
1857 /*
1858 ============
1859 idSIMD_Generic::MatX_LowerTriangularSolveTranspose
1860 
1861  solves x in L'x = b for the n * n sub-matrix of L
1862  L has to be a lower triangular matrix with (implicit) ones on the diagonal
1863  x == b is allowed
1864 ============
1865 */
1866 void VPCALL idSIMD_Generic::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
1867 #if 1
1868 
1869  int nc;
1870  const float *lptr;
1871 
1872  lptr = L.ToFloatPtr();
1873  nc = L.GetNumColumns();
1874 
1875  // unrolled cases for n < 8
1876  if ( n < 8 ) {
1877  switch( n ) {
1878  case 0:
1879  return;
1880  case 1:
1881  x[0] = b[0];
1882  return;
1883  case 2:
1884  x[1] = b[1];
1885  x[0] = b[0] - lptr[1*nc+0] * x[1];
1886  return;
1887  case 3:
1888  x[2] = b[2];
1889  x[1] = b[1] - lptr[2*nc+1] * x[2];
1890  x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
1891  return;
1892  case 4:
1893  x[3] = b[3];
1894  x[2] = b[2] - lptr[3*nc+2] * x[3];
1895  x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
1896  x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
1897  return;
1898  case 5:
1899  x[4] = b[4];
1900  x[3] = b[3] - lptr[4*nc+3] * x[4];
1901  x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
1902  x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
1903  x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
1904  return;
1905  case 6:
1906  x[5] = b[5];
1907  x[4] = b[4] - lptr[5*nc+4] * x[5];
1908  x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
1909  x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
1910  x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
1911  x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
1912  return;
1913  case 7:
1914  x[6] = b[6];
1915  x[5] = b[5] - lptr[6*nc+5] * x[6];
1916  x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
1917  x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
1918  x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
1919  x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
1920  x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
1921  return;
1922  }
1923  return;
1924  }
1925 
1926  int i, j;
1927  register double s0, s1, s2, s3;
1928  float *xptr;
1929 
1930  lptr = L.ToFloatPtr() + n * nc + n - 4;
1931  xptr = x + n;
1932 
1933  // process 4 rows at a time
1934  for ( i = n; i >= 4; i -= 4 ) {
1935  s0 = b[i-4];
1936  s1 = b[i-3];
1937  s2 = b[i-2];
1938  s3 = b[i-1];
1939  // process 4x4 blocks
1940  for ( j = 0; j < n-i; j += 4 ) {
1941  s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
1942  s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
1943  s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
1944  s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
1945  s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
1946  s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
1947  s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
1948  s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
1949  s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
1950  s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
1951  s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
1952  s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
1953  s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
1954  s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
1955  s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
1956  s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
1957  }
1958  // process left over of the 4 rows
1959  s0 -= lptr[0-1*nc] * s3;
1960  s1 -= lptr[1-1*nc] * s3;
1961  s2 -= lptr[2-1*nc] * s3;
1962  s0 -= lptr[0-2*nc] * s2;
1963  s1 -= lptr[1-2*nc] * s2;
1964  s0 -= lptr[0-3*nc] * s1;
1965  // store result
1966  xptr[-4] = s0;
1967  xptr[-3] = s1;
1968  xptr[-2] = s2;
1969  xptr[-1] = s3;
1970  // update pointers for next four rows
1971  lptr -= 4 + 4 * nc;
1972  xptr -= 4;
1973  }
1974  // process left over rows
1975  for ( i--; i >= 0; i-- ) {
1976  s0 = b[i];
1977  lptr = L[0] + i;
1978  for ( j = i + 1; j < n; j++ ) {
1979  s0 -= lptr[j*nc] * x[j];
1980  }
1981  x[i] = s0;
1982  }
1983 
1984 #else
1985 
1986  int i, j, nc;
1987  const float *ptr;
1988  double sum;
1989 
1990  nc = L.GetNumColumns();
1991  for ( i = n - 1; i >= 0; i-- ) {
1992  sum = b[i];
1993  ptr = L[0] + i;
1994  for ( j = i + 1; j < n; j++ ) {
1995  sum -= ptr[j*nc] * x[j];
1996  }
1997  x[i] = sum;
1998  }
1999 
2000 #endif
2001 }
2002 
2003 /*
2004 ============
2005 idSIMD_Generic::MatX_LDLTFactor
2006 
2007  in-place factorization LDL' of the n * n sub-matrix of mat
2008  the reciprocal of the diagonal elements are stored in invDiag
2009 ============
2010 */
2011 bool VPCALL idSIMD_Generic::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
2012 #if 1
2013 
2014  int i, j, k, nc;
2015  float *v, *diag, *mptr;
2016  double s0, s1, s2, s3, sum, d;
2017 
2018  v = (float *) _alloca16( n * sizeof( float ) );
2019  diag = (float *) _alloca16( n * sizeof( float ) );
2020 
2021  nc = mat.GetNumColumns();
2022 
2023  if ( n <= 0 ) {
2024  return true;
2025  }
2026 
2027  mptr = mat[0];
2028 
2029  sum = mptr[0];
2030 
2031  if ( sum == 0.0f ) {
2032  return false;
2033  }
2034 
2035  diag[0] = sum;
2036  invDiag[0] = d = 1.0f / sum;
2037 
2038  if ( n <= 1 ) {
2039  return true;
2040  }
2041 
2042  mptr = mat[0];
2043  for ( j = 1; j < n; j++ ) {
2044  mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
2045  }
2046 
2047  mptr = mat[1];
2048 
2049  v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
2050  sum = mptr[1] - s0;
2051 
2052  if ( sum == 0.0f ) {
2053  return false;
2054  }
2055 
2056  mat[1][1] = sum;
2057  diag[1] = sum;
2058  invDiag[1] = d = 1.0f / sum;
2059 
2060  if ( n <= 2 ) {
2061  return true;
2062  }
2063 
2064  mptr = mat[0];
2065  for ( j = 2; j < n; j++ ) {
2066  mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
2067  }
2068 
2069  mptr = mat[2];
2070 
2071  v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
2072  v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
2073  sum = mptr[2] - s0 - s1;
2074 
2075  if ( sum == 0.0f ) {
2076  return false;
2077  }
2078 
2079  mat[2][2] = sum;
2080  diag[2] = sum;
2081  invDiag[2] = d = 1.0f / sum;
2082 
2083  if ( n <= 3 ) {
2084  return true;
2085  }
2086 
2087  mptr = mat[0];
2088  for ( j = 3; j < n; j++ ) {
2089  mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
2090  }
2091 
2092  mptr = mat[3];
2093 
2094  v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
2095  v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
2096  v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
2097  sum = mptr[3] - s0 - s1 - s2;
2098 
2099  if ( sum == 0.0f ) {
2100  return false;
2101  }
2102 
2103  mat[3][3] = sum;
2104  diag[3] = sum;
2105  invDiag[3] = d = 1.0f / sum;
2106 
2107  if ( n <= 4 ) {
2108  return true;
2109  }
2110 
2111  mptr = mat[0];
2112  for ( j = 4; j < n; j++ ) {
2113  mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
2114  }
2115 
2116  for ( i = 4; i < n; i++ ) {
2117 
2118  mptr = mat[i];
2119 
2120  v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
2121  v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
2122  v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
2123  v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
2124  for ( k = 4; k < i-3; k += 4 ) {
2125  v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
2126  v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
2127  v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
2128  v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
2129  }
2130  switch( i - k ) {
2131  NODEFAULT;
2132  case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
2133  case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
2134  case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
2135  case 0: break;
2136  }
2137  sum = s3;
2138  sum += s2;
2139  sum += s1;
2140  sum += s0;
2141  sum = mptr[i] - sum;
2142 
2143  if ( sum == 0.0f ) {
2144  return false;
2145  }
2146 
2147  mat[i][i] = sum;
2148  diag[i] = sum;
2149  invDiag[i] = d = 1.0f / sum;
2150 
2151  if ( i + 1 >= n ) {
2152  return true;
2153  }
2154 
2155  mptr = mat[i+1];
2156  for ( j = i+1; j < n; j++ ) {
2157  s0 = mptr[0] * v[0];
2158  s1 = mptr[1] * v[1];
2159  s2 = mptr[2] * v[2];
2160  s3 = mptr[3] * v[3];
2161  for ( k = 4; k < i-7; k += 8 ) {
2162  s0 += mptr[k+0] * v[k+0];
2163  s1 += mptr[k+1] * v[k+1];
2164  s2 += mptr[k+2] * v[k+2];
2165  s3 += mptr[k+3] * v[k+3];
2166  s0 += mptr[k+4] * v[k+4];
2167  s1 += mptr[k+5] * v[k+5];
2168  s2 += mptr[k+6] * v[k+6];
2169  s3 += mptr[k+7] * v[k+7];
2170  }
2171  switch( i - k ) {
2172  NODEFAULT;
2173  case 7: s0 += mptr[k+6] * v[k+6];
2174  case 6: s1 += mptr[k+5] * v[k+5];
2175  case 5: s2 += mptr[k+4] * v[k+4];
2176  case 4: s3 += mptr[k+3] * v[k+3];
2177  case 3: s0 += mptr[k+2] * v[k+2];
2178  case 2: s1 += mptr[k+1] * v[k+1];
2179  case 1: s2 += mptr[k+0] * v[k+0];
2180  case 0: break;
2181  }
2182  sum = s3;
2183  sum += s2;
2184  sum += s1;
2185  sum += s0;
2186  mptr[i] = ( mptr[i] - sum ) * d;
2187  mptr += nc;
2188  }
2189  }
2190 
2191  return true;
2192 
2193 #else
2194 
2195  int i, j, k, nc;
2196  float *v, *ptr, *diagPtr;
2197  double d, sum;
2198 
2199  v = (float *) _alloca16( n * sizeof( float ) );
2200  nc = mat.GetNumColumns();
2201 
2202  for ( i = 0; i < n; i++ ) {
2203 
2204  ptr = mat[i];
2205  diagPtr = mat[0];
2206  sum = ptr[i];
2207  for ( j = 0; j < i; j++ ) {
2208  d = ptr[j];
2209  v[j] = diagPtr[0] * d;
2210  sum -= v[j] * d;
2211  diagPtr += nc + 1;
2212  }
2213 
2214  if ( sum == 0.0f ) {
2215  return false;
2216  }
2217 
2218  diagPtr[0] = sum;
2219  invDiag[i] = d = 1.0f / sum;
2220 
2221  if ( i + 1 >= n ) {
2222  continue;
2223  }
2224 
2225  ptr = mat[i+1];
2226  for ( j = i + 1; j < n; j++ ) {
2227  sum = ptr[i];
2228  for ( k = 0; k < i; k++ ) {
2229  sum -= ptr[k] * v[k];
2230  }
2231  ptr[i] = sum * d;
2232  ptr += nc;
2233  }
2234  }
2235 
2236  return true;
2237 
2238 #endif
2239 }
2240 
2241 /*
2242 ============
2243 idSIMD_Generic::BlendJoints
2244 ============
2245 */
2246 void VPCALL idSIMD_Generic::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
2247  int i;
2248 
2249  for ( i = 0; i < numJoints; i++ ) {
2250  int j = index[i];
2251  joints[j].q.Slerp( joints[j].q, blendJoints[j].q, lerp );
2252  joints[j].t.Lerp( joints[j].t, blendJoints[j].t, lerp );
2253  }
2254 }
2255 
2256 /*
2257 ============
2258 idSIMD_Generic::ConvertJointQuatsToJointMats
2259 ============
2260 */
2261 void VPCALL idSIMD_Generic::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
2262  int i;
2263 
2264  for ( i = 0; i < numJoints; i++ ) {
2265  jointMats[i].SetRotation( jointQuats[i].q.ToMat3() );
2266  jointMats[i].SetTranslation( jointQuats[i].t );
2267  }
2268 }
2269 
2270 /*
2271 ============
2272 idSIMD_Generic::ConvertJointMatsToJointQuats
2273 ============
2274 */
2275 void VPCALL idSIMD_Generic::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
2276  int i;
2277 
2278  for ( i = 0; i < numJoints; i++ ) {
2279  jointQuats[i] = jointMats[i].ToJointQuat();
2280  }
2281 }
2282 
2283 /*
2284 ============
2285 idSIMD_Generic::TransformJoints
2286 ============
2287 */
2288 void VPCALL idSIMD_Generic::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
2289  int i;
2290 
2291  for( i = firstJoint; i <= lastJoint; i++ ) {
2292  assert( parents[i] < i );
2293  jointMats[i] *= jointMats[parents[i]];
2294  }
2295 }
2296 
2297 /*
2298 ============
2299 idSIMD_Generic::UntransformJoints
2300 ============
2301 */
2302 void VPCALL idSIMD_Generic::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
2303  int i;
2304 
2305  for( i = lastJoint; i >= firstJoint; i-- ) {
2306  assert( parents[i] < i );
2307  jointMats[i] /= jointMats[parents[i]];
2308  }
2309 }
2310 
2311 /*
2312 ============
2313 idSIMD_Generic::TransformVerts
2314 ============
2315 */
2316 void VPCALL idSIMD_Generic::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, int numWeights ) {
2317  int i, j;
2318  const byte *jointsPtr = (byte *)joints;
2319 
2320  for( j = i = 0; i < numVerts; i++ ) {
2321  idVec3 v;
2322 
2323  v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
2324  while( index[j*2+1] == 0 ) {
2325  j++;
2326  v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
2327  }
2328  j++;
2329 
2330  verts[i].xyz = v;
2331  }
2332 }
2333 
2334 /*
2335 ============
2336 idSIMD_Generic::TracePointCull
2337 ============
2338 */
2339 void VPCALL idSIMD_Generic::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
2340  int i;
2341  byte tOr;
2342 
2343  tOr = 0;
2344 
2345  for ( i = 0; i < numVerts; i++ ) {
2346  byte bits;
2347  float d0, d1, d2, d3, t;
2348  const idVec3 &v = verts[i].xyz;
2349 
2350  d0 = planes[0].Distance( v );
2351  d1 = planes[1].Distance( v );
2352  d2 = planes[2].Distance( v );
2353  d3 = planes[3].Distance( v );
2354 
2355  t = d0 + radius;
2356  bits = FLOATSIGNBITSET( t ) << 0;
2357  t = d1 + radius;
2358  bits |= FLOATSIGNBITSET( t ) << 1;
2359  t = d2 + radius;
2360  bits |= FLOATSIGNBITSET( t ) << 2;
2361  t = d3 + radius;
2362  bits |= FLOATSIGNBITSET( t ) << 3;
2363 
2364  t = d0 - radius;
2365  bits |= FLOATSIGNBITSET( t ) << 4;
2366  t = d1 - radius;
2367  bits |= FLOATSIGNBITSET( t ) << 5;
2368  t = d2 - radius;
2369  bits |= FLOATSIGNBITSET( t ) << 6;
2370  t = d3 - radius;
2371  bits |= FLOATSIGNBITSET( t ) << 7;
2372 
2373  bits ^= 0x0F; // flip lower four bits
2374 
2375  tOr |= bits;
2376  cullBits[i] = bits;
2377  }
2378 
2379  totalOr = tOr;
2380 }
2381 
2382 /*
2383 ============
2384 idSIMD_Generic::DecalPointCull
2385 ============
2386 */
2387 void VPCALL idSIMD_Generic::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
2388  int i;
2389 
2390  for ( i = 0; i < numVerts; i++ ) {
2391  byte bits;
2392  float d0, d1, d2, d3, d4, d5;
2393  const idVec3 &v = verts[i].xyz;
2394 
2395  d0 = planes[0].Distance( v );
2396  d1 = planes[1].Distance( v );
2397  d2 = planes[2].Distance( v );
2398  d3 = planes[3].Distance( v );
2399  d4 = planes[4].Distance( v );
2400  d5 = planes[5].Distance( v );
2401 
2402  bits = FLOATSIGNBITSET( d0 ) << 0;
2403  bits |= FLOATSIGNBITSET( d1 ) << 1;
2404  bits |= FLOATSIGNBITSET( d2 ) << 2;
2405  bits |= FLOATSIGNBITSET( d3 ) << 3;
2406  bits |= FLOATSIGNBITSET( d4 ) << 4;
2407  bits |= FLOATSIGNBITSET( d5 ) << 5;
2408 
2409  cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
2410  }
2411 }
2412 
2413 /*
2414 ============
2415 idSIMD_Generic::OverlayPointCull
2416 ============
2417 */
2418 void VPCALL idSIMD_Generic::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
2419  int i;
2420 
2421  for ( i = 0; i < numVerts; i++ ) {
2422  byte bits;
2423  float d0, d1;
2424  const idVec3 &v = verts[i].xyz;
2425 
2426  texCoords[i][0] = d0 = planes[0].Distance( v );
2427  texCoords[i][1] = d1 = planes[1].Distance( v );
2428 
2429  bits = FLOATSIGNBITSET( d0 ) << 0;
2430  d0 = 1.0f - d0;
2431  bits |= FLOATSIGNBITSET( d1 ) << 1;
2432  d1 = 1.0f - d1;
2433  bits |= FLOATSIGNBITSET( d0 ) << 2;
2434  bits |= FLOATSIGNBITSET( d1 ) << 3;
2435 
2436  cullBits[i] = bits;
2437  }
2438 }
2439 
2440 /*
2441 ============
2442 idSIMD_Generic::DeriveTriPlanes
2443 
2444  Derives a plane equation for each triangle.
2445 ============
2446 */
2447 void VPCALL idSIMD_Generic::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
2448  int i;
2449 
2450  for ( i = 0; i < numIndexes; i += 3 ) {
2451  const idDrawVert *a, *b, *c;
2452  float d0[3], d1[3], f;
2453  idVec3 n;
2454 
2455  a = verts + indexes[i + 0];
2456  b = verts + indexes[i + 1];
2457  c = verts + indexes[i + 2];
2458 
2459  d0[0] = b->xyz[0] - a->xyz[0];
2460  d0[1] = b->xyz[1] - a->xyz[1];
2461  d0[2] = b->xyz[2] - a->xyz[2];
2462 
2463  d1[0] = c->xyz[0] - a->xyz[0];
2464  d1[1] = c->xyz[1] - a->xyz[1];
2465  d1[2] = c->xyz[2] - a->xyz[2];
2466 
2467  n[0] = d1[1] * d0[2] - d1[2] * d0[1];
2468  n[1] = d1[2] * d0[0] - d1[0] * d0[2];
2469  n[2] = d1[0] * d0[1] - d1[1] * d0[0];
2470 
2471  f = idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
2472 
2473  n.x *= f;
2474  n.y *= f;
2475  n.z *= f;
2476 
2477  planes->SetNormal( n );
2478  planes->FitThroughPoint( a->xyz );
2479  planes++;
2480  }
2481 }
2482 
2483 /*
2484 ============
2485 idSIMD_Generic::DeriveTangents
2486 
2487  Derives the normal and orthogonal tangent vectors for the triangle vertices.
2488  For each vertex the normal and tangent vectors are derived from all triangles
2489  using the vertex which results in smooth tangents across the mesh.
2490  In the process the triangle planes are calculated as well.
2491 ============
2492 */
2493 void VPCALL idSIMD_Generic::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
2494  int i;
2495 
2496  bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
2497  memset( used, 0, numVerts * sizeof( used[0] ) );
2498 
2499  idPlane *planesPtr = planes;
2500  for ( i = 0; i < numIndexes; i += 3 ) {
2501  idDrawVert *a, *b, *c;
2502  unsigned long signBit;
2503  float d0[5], d1[5], f, area;
2504  idVec3 n, t0, t1;
2505 
2506  int v0 = indexes[i + 0];
2507  int v1 = indexes[i + 1];
2508  int v2 = indexes[i + 2];
2509 
2510  a = verts + v0;
2511  b = verts + v1;
2512  c = verts + v2;
2513 
2514  d0[0] = b->xyz[0] - a->xyz[0];
2515  d0[1] = b->xyz[1] - a->xyz[1];
2516  d0[2] = b->xyz[2] - a->xyz[2];
2517  d0[3] = b->st[0] - a->st[0];
2518  d0[4] = b->st[1] - a->st[1];
2519 
2520  d1[0] = c->xyz[0] - a->xyz[0];
2521  d1[1] = c->xyz[1] - a->xyz[1];
2522  d1[2] = c->xyz[2] - a->xyz[2];
2523  d1[3] = c->st[0] - a->st[0];
2524  d1[4] = c->st[1] - a->st[1];
2525 
2526  // normal
2527  n[0] = d1[1] * d0[2] - d1[2] * d0[1];
2528  n[1] = d1[2] * d0[0] - d1[0] * d0[2];
2529  n[2] = d1[0] * d0[1] - d1[1] * d0[0];
2530 
2531  f = idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
2532 
2533  n.x *= f;
2534  n.y *= f;
2535  n.z *= f;
2536 
2537  planesPtr->SetNormal( n );
2538  planesPtr->FitThroughPoint( a->xyz );
2539  planesPtr++;
2540 
2541  // area sign bit
2542  area = d0[3] * d1[4] - d0[4] * d1[3];
2543  signBit = ( *(unsigned long *)&area ) & ( 1 << 31 );
2544 
2545  // first tangent
2546  t0[0] = d0[0] * d1[4] - d0[4] * d1[0];
2547  t0[1] = d0[1] * d1[4] - d0[4] * d1[1];
2548  t0[2] = d0[2] * d1[4] - d0[4] * d1[2];
2549 
2550  f = idMath::RSqrt( t0.x * t0.x + t0.y * t0.y + t0.z * t0.z );
2551  *(unsigned long *)&f ^= signBit;
2552 
2553  t0.x *= f;
2554  t0.y *= f;
2555  t0.z *= f;
2556 
2557  // second tangent
2558  t1[0] = d0[3] * d1[0] - d0[0] * d1[3];
2559  t1[1] = d0[3] * d1[1] - d0[1] * d1[3];
2560  t1[2] = d0[3] * d1[2] - d0[2] * d1[3];
2561 
2562  f = idMath::RSqrt( t1.x * t1.x + t1.y * t1.y + t1.z * t1.z );
2563  *(unsigned long *)&f ^= signBit;
2564 
2565  t1.x *= f;
2566  t1.y *= f;
2567  t1.z *= f;
2568 
2569  if ( used[v0] ) {
2570  a->normal += n;
2571  a->tangents[0] += t0;
2572  a->tangents[1] += t1;
2573  } else {
2574  a->normal = n;
2575  a->tangents[0] = t0;
2576  a->tangents[1] = t1;
2577  used[v0] = true;
2578  }
2579 
2580  if ( used[v1] ) {
2581  b->normal += n;
2582  b->tangents[0] += t0;
2583  b->tangents[1] += t1;
2584  } else {
2585  b->normal = n;
2586  b->tangents[0] = t0;
2587  b->tangents[1] = t1;
2588  used[v1] = true;
2589  }
2590 
2591  if ( used[v2] ) {
2592  c->normal += n;
2593  c->tangents[0] += t0;
2594  c->tangents[1] += t1;
2595  } else {
2596  c->normal = n;
2597  c->tangents[0] = t0;
2598  c->tangents[1] = t1;
2599  used[v2] = true;
2600  }
2601  }
2602 }
2603 
2604 /*
2605 ============
2606 idSIMD_Generic::DeriveUnsmoothedTangents
2607 
2608  Derives the normal and orthogonal tangent vectors for the triangle vertices.
2609  For each vertex the normal and tangent vectors are derived from a single dominant triangle.
2610 ============
2611 */
2612 #define DERIVE_UNSMOOTHED_BITANGENT
2613 
2614 void VPCALL idSIMD_Generic::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
2615  int i;
2616 
2617  for ( i = 0; i < numVerts; i++ ) {
2618  idDrawVert *a, *b, *c;
2619  float d0, d1, d2, d3, d4;
2620  float d5, d6, d7, d8, d9;
2621  float s0, s1, s2;
2622  float n0, n1, n2;
2623  float t0, t1, t2;
2624  float t3, t4, t5;
2625 
2626  const dominantTri_s &dt = dominantTris[i];
2627 
2628  a = verts + i;
2629  b = verts + dt.v2;
2630  c = verts + dt.v3;
2631 
2632  d0 = b->xyz[0] - a->xyz[0];
2633  d1 = b->xyz[1] - a->xyz[1];
2634  d2 = b->xyz[2] - a->xyz[2];
2635  d3 = b->st[0] - a->st[0];
2636  d4 = b->st[1] - a->st[1];
2637 
2638  d5 = c->xyz[0] - a->xyz[0];
2639  d6 = c->xyz[1] - a->xyz[1];
2640  d7 = c->xyz[2] - a->xyz[2];
2641  d8 = c->st[0] - a->st[0];
2642  d9 = c->st[1] - a->st[1];
2643 
2644  s0 = dt.normalizationScale[0];
2645  s1 = dt.normalizationScale[1];
2646  s2 = dt.normalizationScale[2];
2647 
2648  n0 = s2 * ( d6 * d2 - d7 * d1 );
2649  n1 = s2 * ( d7 * d0 - d5 * d2 );
2650  n2 = s2 * ( d5 * d1 - d6 * d0 );
2651 
2652  t0 = s0 * ( d0 * d9 - d4 * d5 );
2653  t1 = s0 * ( d1 * d9 - d4 * d6 );
2654  t2 = s0 * ( d2 * d9 - d4 * d7 );
2655 
2656 #ifndef DERIVE_UNSMOOTHED_BITANGENT
2657  t3 = s1 * ( d3 * d5 - d0 * d8 );
2658  t4 = s1 * ( d3 * d6 - d1 * d8 );
2659  t5 = s1 * ( d3 * d7 - d2 * d8 );
2660 #else
2661  t3 = s1 * ( n2 * t1 - n1 * t2 );
2662  t4 = s1 * ( n0 * t2 - n2 * t0 );
2663  t5 = s1 * ( n1 * t0 - n0 * t1 );
2664 #endif
2665 
2666  a->normal[0] = n0;
2667  a->normal[1] = n1;
2668  a->normal[2] = n2;
2669 
2670  a->tangents[0][0] = t0;
2671  a->tangents[0][1] = t1;
2672  a->tangents[0][2] = t2;
2673 
2674  a->tangents[1][0] = t3;
2675  a->tangents[1][1] = t4;
2676  a->tangents[1][2] = t5;
2677  }
2678 }
2679 
2680 /*
2681 ============
2682 idSIMD_Generic::NormalizeTangents
2683 
2684  Normalizes each vertex normal and projects and normalizes the
2685  tangent vectors onto the plane orthogonal to the vertex normal.
2686 ============
2687 */
2688 void VPCALL idSIMD_Generic::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
2689 
2690  for ( int i = 0; i < numVerts; i++ ) {
2691  idVec3 &v = verts[i].normal;
2692  float f;
2693 
2694  f = idMath::RSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
2695  v.x *= f; v.y *= f; v.z *= f;
2696 
2697  for ( int j = 0; j < 2; j++ ) {
2698  idVec3 &t = verts[i].tangents[j];
2699 
2700  t -= ( t * v ) * v;
2701  f = idMath::RSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
2702  t.x *= f; t.y *= f; t.z *= f;
2703  }
2704  }
2705 }
2706 
2707 /*
2708 ============
2709 idSIMD_Generic::CreateTextureSpaceLightVectors
2710 
2711  Calculates light vectors in texture space for the given triangle vertices.
2712  For each vertex the direction towards the light origin is projected onto texture space.
2713  The light vectors are only calculated for the vertices referenced by the indexes.
2714 ============
2715 */
2716 void VPCALL idSIMD_Generic::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
2717 
2718  bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
2719  memset( used, 0, numVerts * sizeof( used[0] ) );
2720 
2721  for ( int i = numIndexes - 1; i >= 0; i-- ) {
2722  used[indexes[i]] = true;
2723  }
2724 
2725  for ( int i = 0; i < numVerts; i++ ) {
2726  if ( !used[i] ) {
2727  continue;
2728  }
2729 
2730  const idDrawVert *v = &verts[i];
2731 
2732  idVec3 lightDir = lightOrigin - v->xyz;
2733 
2734  lightVectors[i][0] = lightDir * v->tangents[0];
2735  lightVectors[i][1] = lightDir * v->tangents[1];
2736  lightVectors[i][2] = lightDir * v->normal;
2737  }
2738 }
2739 
2740 /*
2741 ============
2742 idSIMD_Generic::CreateSpecularTextureCoords
2743 
2744  Calculates specular texture coordinates for the given triangle vertices.
2745  For each vertex the normalized direction towards the light origin is added to the
2746  normalized direction towards the view origin and the result is projected onto texture space.
2747  The texture coordinates are only calculated for the vertices referenced by the indexes.
2748 ============
2749 */
2750 void VPCALL idSIMD_Generic::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
2751 
2752  bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
2753  memset( used, 0, numVerts * sizeof( used[0] ) );
2754 
2755  for ( int i = numIndexes - 1; i >= 0; i-- ) {
2756  used[indexes[i]] = true;
2757  }
2758 
2759  for ( int i = 0; i < numVerts; i++ ) {
2760  if ( !used[i] ) {
2761  continue;
2762  }
2763 
2764  const idDrawVert *v = &verts[i];
2765 
2766  idVec3 lightDir = lightOrigin - v->xyz;
2767  idVec3 viewDir = viewOrigin - v->xyz;
2768 
2769  float ilength;
2770 
2771  ilength = idMath::RSqrt( lightDir * lightDir );
2772  lightDir[0] *= ilength;
2773  lightDir[1] *= ilength;
2774  lightDir[2] *= ilength;
2775 
2776  ilength = idMath::RSqrt( viewDir * viewDir );
2777  viewDir[0] *= ilength;
2778  viewDir[1] *= ilength;
2779  viewDir[2] *= ilength;
2780 
2781  lightDir += viewDir;
2782 
2783  texCoords[i][0] = lightDir * v->tangents[0];
2784  texCoords[i][1] = lightDir * v->tangents[1];
2785  texCoords[i][2] = lightDir * v->normal;
2786  texCoords[i][3] = 1.0f;
2787  }
2788 }
2789 
2790 /*
2791 ============
2792 idSIMD_Generic::CreateShadowCache
2793 ============
2794 */
2795 int VPCALL idSIMD_Generic::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
2796  int outVerts = 0;
2797 
2798  for ( int i = 0; i < numVerts; i++ ) {
2799  if ( vertRemap[i] ) {
2800  continue;
2801  }
2802  const float *v = verts[i].xyz.ToFloatPtr();
2803  vertexCache[outVerts+0][0] = v[0];
2804  vertexCache[outVerts+0][1] = v[1];
2805  vertexCache[outVerts+0][2] = v[2];
2806  vertexCache[outVerts+0][3] = 1.0f;
2807 
2808  // R_SetupProjection() builds the projection matrix with a slight crunch
2809  // for depth, which keeps this w=0 division from rasterizing right at the
2810  // wrap around point and causing depth fighting with the rear caps
2811  vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
2812  vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
2813  vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
2814  vertexCache[outVerts+1][3] = 0.0f;
2815  vertRemap[i] = outVerts;
2816  outVerts += 2;
2817  }
2818  return outVerts;
2819 }
2820 
2821 /*
2822 ============
2823 idSIMD_Generic::CreateVertexProgramShadowCache
2824 ============
2825 */
2827  for ( int i = 0; i < numVerts; i++ ) {
2828  const float *v = verts[i].xyz.ToFloatPtr();
2829  vertexCache[i*2+0][0] = v[0];
2830  vertexCache[i*2+1][0] = v[0];
2831  vertexCache[i*2+0][1] = v[1];
2832  vertexCache[i*2+1][1] = v[1];
2833  vertexCache[i*2+0][2] = v[2];
2834  vertexCache[i*2+1][2] = v[2];
2835  vertexCache[i*2+0][3] = 1.0f;
2836  vertexCache[i*2+1][3] = 0.0f;
2837  }
2838  return numVerts * 2;
2839 }
2840 
2841 /*
2842 ============
2843 idSIMD_Generic::UpSamplePCMTo44kHz
2844 
2845  Duplicate samples for 44kHz output.
2846 ============
2847 */
2848 void idSIMD_Generic::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
2849  if ( kHz == 11025 ) {
2850  if ( numChannels == 1 ) {
2851  for ( int i = 0; i < numSamples; i++ ) {
2852  dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
2853  }
2854  } else {
2855  for ( int i = 0; i < numSamples; i += 2 ) {
2856  dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
2857  dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
2858  }
2859  }
2860  } else if ( kHz == 22050 ) {
2861  if ( numChannels == 1 ) {
2862  for ( int i = 0; i < numSamples; i++ ) {
2863  dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
2864  }
2865  } else {
2866  for ( int i = 0; i < numSamples; i += 2 ) {
2867  dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
2868  dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
2869  }
2870  }
2871  } else if ( kHz == 44100 ) {
2872  for ( int i = 0; i < numSamples; i++ ) {
2873  dest[i] = (float) src[i];
2874  }
2875  } else {
2876  assert( 0 );
2877  }
2878 }
2879 
2880 /*
2881 ============
2882 idSIMD_Generic::UpSampleOGGTo44kHz
2883 
2884  Duplicate samples for 44kHz output.
2885 ============
2886 */
2887 void idSIMD_Generic::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
2888  if ( kHz == 11025 ) {
2889  if ( numChannels == 1 ) {
2890  for ( int i = 0; i < numSamples; i++ ) {
2891  dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
2892  }
2893  } else {
2894  for ( int i = 0; i < numSamples >> 1; i++ ) {
2895  dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
2896  dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
2897  }
2898  }
2899  } else if ( kHz == 22050 ) {
2900  if ( numChannels == 1 ) {
2901  for ( int i = 0; i < numSamples; i++ ) {
2902  dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
2903  }
2904  } else {
2905  for ( int i = 0; i < numSamples >> 1; i++ ) {
2906  dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
2907  dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
2908  }
2909  }
2910  } else if ( kHz == 44100 ) {
2911  if ( numChannels == 1 ) {
2912  for ( int i = 0; i < numSamples; i++ ) {
2913  dest[i*1+0] = ogg[0][i] * 32768.0f;
2914  }
2915  } else {
2916  for ( int i = 0; i < numSamples >> 1; i++ ) {
2917  dest[i*2+0] = ogg[0][i] * 32768.0f;
2918  dest[i*2+1] = ogg[1][i] * 32768.0f;
2919  }
2920  }
2921  } else {
2922  assert( 0 );
2923  }
2924 }
2925 
2926 /*
2927 ============
2928 idSIMD_Generic::MixSoundTwoSpeakerMono
2929 ============
2930 */
2931 void VPCALL idSIMD_Generic::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
2932  float sL = lastV[0];
2933  float sR = lastV[1];
2934  float incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
2935  float incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
2936 
2937  assert( numSamples == MIXBUFFER_SAMPLES );
2938 
2939  for( int j = 0; j < MIXBUFFER_SAMPLES; j++ ) {
2940  mixBuffer[j*2+0] += samples[j] * sL;
2941  mixBuffer[j*2+1] += samples[j] * sR;
2942  sL += incL;
2943  sR += incR;
2944  }
2945 }
2946 
2947 /*
2948 ============
2949 idSIMD_Generic::MixSoundTwoSpeakerStereo
2950 ============
2951 */
2952 void VPCALL idSIMD_Generic::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
2953  float sL = lastV[0];
2954  float sR = lastV[1];
2955  float incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
2956  float incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
2957 
2958  assert( numSamples == MIXBUFFER_SAMPLES );
2959 
2960  for( int j = 0; j < MIXBUFFER_SAMPLES; j++ ) {
2961  mixBuffer[j*2+0] += samples[j*2+0] * sL;
2962  mixBuffer[j*2+1] += samples[j*2+1] * sR;
2963  sL += incL;
2964  sR += incR;
2965  }
2966 }
2967 
2968 /*
2969 ============
2970 idSIMD_Generic::MixSoundSixSpeakerMono
2971 ============
2972 */
2973 void VPCALL idSIMD_Generic::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
2974  float sL0 = lastV[0];
2975  float sL1 = lastV[1];
2976  float sL2 = lastV[2];
2977  float sL3 = lastV[3];
2978  float sL4 = lastV[4];
2979  float sL5 = lastV[5];
2980 
2981  float incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
2982  float incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
2983  float incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
2984  float incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
2985  float incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
2986  float incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
2987 
2988  assert( numSamples == MIXBUFFER_SAMPLES );
2989 
2990  for( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) {
2991  mixBuffer[i*6+0] += samples[i] * sL0;
2992  mixBuffer[i*6+1] += samples[i] * sL1;
2993  mixBuffer[i*6+2] += samples[i] * sL2;
2994  mixBuffer[i*6+3] += samples[i] * sL3;
2995  mixBuffer[i*6+4] += samples[i] * sL4;
2996  mixBuffer[i*6+5] += samples[i] * sL5;
2997  sL0 += incL0;
2998  sL1 += incL1;
2999  sL2 += incL2;
3000  sL3 += incL3;
3001  sL4 += incL4;
3002  sL5 += incL5;
3003  }
3004 }
3005 
3006 /*
3007 ============
3008 idSIMD_Generic::MixSoundSixSpeakerStereo
3009 ============
3010 */
3011 void VPCALL idSIMD_Generic::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
3012  float sL0 = lastV[0];
3013  float sL1 = lastV[1];
3014  float sL2 = lastV[2];
3015  float sL3 = lastV[3];
3016  float sL4 = lastV[4];
3017  float sL5 = lastV[5];
3018 
3019  float incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
3020  float incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
3021  float incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
3022  float incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
3023  float incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
3024  float incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
3025 
3026  assert( numSamples == MIXBUFFER_SAMPLES );
3027 
3028  for( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) {
3029  mixBuffer[i*6+0] += samples[i*2+0] * sL0;
3030  mixBuffer[i*6+1] += samples[i*2+1] * sL1;
3031  mixBuffer[i*6+2] += samples[i*2+0] * sL2;
3032  mixBuffer[i*6+3] += samples[i*2+0] * sL3;
3033  mixBuffer[i*6+4] += samples[i*2+0] * sL4;
3034  mixBuffer[i*6+5] += samples[i*2+1] * sL5;
3035  sL0 += incL0;
3036  sL1 += incL1;
3037  sL2 += incL2;
3038  sL3 += incL3;
3039  sL4 += incL4;
3040  sL5 += incL5;
3041  }
3042 }
3043 
3044 /*
3045 ============
3046 idSIMD_Generic::MixedSoundToSamples
3047 ============
3048 */
3049 void VPCALL idSIMD_Generic::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
3050 
3051  for ( int i = 0; i < numSamples; i++ ) {
3052  if ( mixBuffer[i] <= -32768.0f ) {
3053  samples[i] = -32768;
3054  } else if ( mixBuffer[i] >= 32767.0f ) {
3055  samples[i] = 32767;
3056  } else {
3057  samples[i] = (short) mixBuffer[i];
3058  }
3059  }
3060 }
virtual void VPCALL MatX_LowerTriangularSolve(const idMatX &L, float *x, const float *b, const int n, int skip=0)
virtual void VPCALL CreateSpecularTextureCoords(idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes)
GLdouble GLdouble GLdouble GLdouble q
Definition: glext.h:2959
virtual void VPCALL TransformVerts(idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights)
static const float INFINITY
Definition: Math.h:218
#define min(a, b)
virtual void VPCALL AddAssign16(float *dst, const float *src, const int count)
assert(prefInfo.fullscreenBtn)
const GLbyte * weights
Definition: glext.h:3273
virtual void VPCALL Dot(float *dst, const idVec3 &constant, const idVec3 *src, const int count)
int GetSize(void) const
Definition: Vector.h:1467
virtual void VPCALL BlendJoints(idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints)
virtual void VPCALL Memset(void *dst, const int val, const int count)
const GLdouble * v
Definition: glext.h:2936
const int MIXBUFFER_SAMPLES
Definition: Simd.h:84
const float * ToFloatPtr(void) const
Definition: Vector.h:719
float Distance(const idVec3 &v) const
Definition: Plane.h:324
idVec3 xyz
Definition: DrawVert.h:42
virtual void VPCALL MixedSoundToSamples(short *samples, const float *mixBuffer, const int numSamples)
GLenum GLsizei n
Definition: glext.h:3705
virtual void VPCALL ClampMax(float *dst, const float *src, const float max, const int count)
float z
Definition: Vector.h:320
idVec3 tangents[2]
Definition: DrawVert.h:45
virtual void VPCALL Memcpy(void *dst, const void *src, const int count)
virtual void VPCALL MatX_TransposeMultiplyAddVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
Definition: Vector.h:316
case const float
Definition: Callbacks.cpp:62
virtual void VPCALL TracePointCull(byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts)
virtual void VPCALL Add(float *dst, const float constant, const float *src, const int count)
virtual void VPCALL CmpGE(byte *dst, const float *src0, const float constant, const int count)
virtual void VPCALL MixSoundTwoSpeakerStereo(float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2])
GLuint src
Definition: glext.h:5390
glIndex_t v3
Definition: Model.h:70
void SetNormal(const idVec3 &normal)
Definition: Plane.h:233
GLfloat v0
Definition: glext.h:3606
float x
Definition: Vector.h:318
virtual void VPCALL MatX_LowerTriangularSolveTranspose(const idMatX &L, float *x, const float *b, const int n)
idQuat & Slerp(const idQuat &from, const idQuat &to, float t)
Definition: Quat.cpp:160
GLenum GLint x
Definition: glext.h:2849
int i
Definition: process.py:33
void SetTranslation(const idVec3 &t)
#define UNROLL1(Y)
virtual void VPCALL Sub16(float *dst, const float *src1, const float *src2, const int count)
virtual void VPCALL ClampMin(float *dst, const float *src, const float min, const int count)
void SetRotation(const idMat3 &m)
list l
Definition: prepare.py:17
virtual void VPCALL DeriveUnsmoothedTangents(idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts)
virtual void VPCALL MatX_MultiplyVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
virtual void VPCALL MulAssign16(float *dst, const float constant, const int count)
virtual void VPCALL MixSoundTwoSpeakerMono(float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2])
idVec2 st
Definition: DrawVert.h:43
GLfloat GLfloat GLfloat v2
Definition: glext.h:3608
GLuint dst
Definition: glext.h:5285
float normalizationScale[3]
Definition: Model.h:71
GLuint GLuint GLsizei count
Definition: glext.h:2845
int GetNumColumns(void) const
Definition: Matrix.h:1822
Definition: Vector.h:52
#define FLOATSIGNBITSET(f)
Definition: Math.h:68
virtual void VPCALL OverlayPointCull(byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts)
virtual void VPCALL Negate16(float *dst, const int count)
GLuint index
Definition: glext.h:3476
const GLubyte * c
Definition: glext.h:4677
Definition: Vector.h:808
virtual void VPCALL DecalPointCull(byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts)
virtual void VPCALL SubAssign16(float *dst, const float *src, const int count)
#define NSKIP(n, s)
virtual void VPCALL MatX_MultiplyAddVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
virtual void VPCALL MatX_MultiplyMatX(idMatX &dst, const idMatX &m1, const idMatX &m2)
virtual void VPCALL CmpLE(byte *dst, const float *src0, const float constant, const int count)
virtual void VPCALL MatX_TransposeMultiplyMatX(idMatX &dst, const idMatX &m1, const idMatX &m2)
virtual void VPCALL Div(float *dst, const float constant, const float *src, const int count)
const float * ToFloatPtr(void) const
Definition: Vector.h:1910
float y
Definition: Vector.h:319
virtual void VPCALL ConvertJointQuatsToJointMats(idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints)
Definition: Plane.h:71
Definition: eax4.h:1413
int GetNumRows(void) const
Definition: Matrix.h:1821
virtual void VPCALL DeriveTriPlanes(idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes)
INT64 INT64 divisor
Definition: wglext.h:504
idJointQuat ToJointQuat(void) const
virtual bool VPCALL MatX_LDLTFactor(idMatX &mat, idVecX &invDiag, const int n)
idVec3 normal
Definition: DrawVert.h:44
virtual void VPCALL MulSub(float *dst, const float constant, const float *src, const int count)
GLubyte GLubyte GLubyte a
Definition: glext.h:4662
virtual void VPCALL MatX_TransposeMultiplyVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
virtual void VPCALL MixSoundSixSpeakerStereo(float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6])
GLfloat GLfloat v1
Definition: glext.h:3607
GLubyte GLubyte b
Definition: glext.h:4662
virtual void VPCALL MixSoundSixSpeakerMono(float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6])
virtual void VPCALL CreateTextureSpaceLightVectors(idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes)
#define bits
Definition: Unzip.cpp:3797
glIndex_t v2
Definition: Model.h:70
virtual void VPCALL UpSamplePCMTo44kHz(float *dest, const short *pcm, const int numSamples, const int kHz, const int numChannels)
tuple f
Definition: idal.py:89
virtual void VPCALL DeriveTangents(idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes)
virtual void VPCALL Mul16(float *dst, const float *src1, const float constant, const int count)
unsigned char byte
Definition: Lib.h:75
virtual int VPCALL CreateShadowCache(idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts)
#define NODEFAULT
virtual void VPCALL Zero16(float *dst, const int count)
virtual void VPCALL Mul(float *dst, const float constant, const float *src, const int count)
virtual void VPCALL NormalizeTangents(idDrawVert *verts, const int numVerts)
idVertexCache vertexCache
Definition: VertexCache.cpp:41
virtual void VPCALL Clamp(float *dst, const float *src, const float min, const float max, const int count)
#define VPCALL
Definition: Simd.h:63
#define UNROLL4(Y)
virtual void VPCALL UntransformJoints(idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint)
GLint j
Definition: qgl.h:264
void Lerp(const idVec3 &v1, const idVec3 &v2, const float l)
Definition: Vector.cpp:232
float dot(float a[], float b[])
Definition: Model_lwo.cpp:3883
virtual int VPCALL CreateVertexProgramShadowCache(idVec4 *vertexCache, const idDrawVert *verts, const int numVerts)
virtual void VPCALL ConvertJointMatsToJointQuats(idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints)
virtual void VPCALL MulAdd(float *dst, const float constant, const float *src, const int count)
virtual void VPCALL MatX_MultiplySubVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
virtual void VPCALL MatX_TransposeMultiplySubVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
#define max(x, y)
Definition: os.h:70
virtual void VPCALL MinMax(float &min, float &max, const float *src, const int count)
virtual void VPCALL Copy16(float *dst, const float *src, const int count)
virtual void VPCALL Sub(float *dst, const float constant, const float *src, const int count)
#define OPER(X)
const float * ToFloatPtr(void) const
Definition: Matrix.h:2935
virtual void VPCALL TransformJoints(idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint)
virtual void VPCALL CmpLT(byte *dst, const float *src0, const float constant, const int count)
static float RSqrt(float x)
Definition: Math.h:241
virtual void VPCALL Add16(float *dst, const float *src1, const float *src2, const int count)
virtual void VPCALL UpSampleOGGTo44kHz(float *dest, const float *const *ogg, const int numSamples, const int kHz, const int numChannels)
virtual void VPCALL CmpGT(byte *dst, const float *src0, const float constant, const int count)
virtual const char *VPCALL GetName(void) const
void FitThroughPoint(const idVec3 &p)
Definition: Plane.h:297
GLdouble GLdouble t
Definition: glext.h:2943