// Simd_AltiVec.cpp (Doom 3 GPL source release)
/*
===========================================================================

Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").

Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/


#include "../precompiled.h"
#pragma hdrstop

#include "Simd_Generic.h"
#include "Simd_AltiVec.h"
#include <math.h>
#include <float.h>

#ifdef PPC_INTRINSICS
    #include <ppc_intrinsics.h>
#endif

// Doom3 SIMD Library version 0.5
// Patrick Flanagan (pflanagan@apple.com)
// Sanjay Patel (spatel@apple.com)
// Architecture & Performance Group, Apple Computer


//===============================================================
//
// AltiVec implementation of idSIMDProcessor
//
//===============================================================

#if defined(MACOS_X) && defined(__ppc__)

// Data struct sizes

#ifndef DRAWVERT_PADDED
    // 60 bytes, 15 floats at 4 bytes each
    #define DRAWVERT_OFFSET 15
#else
    // 64 bytes, 16 floats
    #define DRAWVERT_OFFSET 16
#endif
// 16 bytes each, 4 floats
#define PLANE_OFFSET 4
// 16 bytes each, 4 floats
#define IDVEC4_OFFSET 4

// Alignment tests
#define IS_16BYTE_ALIGNED( x ) ( ( (unsigned long)&x & 0x0F ) == 0 )
#define NOT_16BYTE_ALIGNED( x ) ( ( (unsigned long)&x & 0x0F ) != 0 )
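
// These macros test the address of their argument (note the &x), so they are
// used as NOT_16BYTE_ALIGNED( dst[i] ) to test &dst[i]. Every kernel below
// runs a scalar head loop until dst reaches a 16-byte boundary, then switches
// to the vector path.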

// Aligned storing floats
#define ALIGNED_STORE2( ADDR, V0, V1 ) \
    vec_st( V0, 0, ADDR ); \
    vec_st( V1, 16, ADDR )

#define ALIGNED_STORE3( ADDR, V0, V1, V2 ) \
    vec_st( V0, 0, ADDR ); \
    vec_st( V1, 16, ADDR ); \
    vec_st( V2, 32, ADDR )

#define ALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) \
    vec_st( V0, 0, ADDR ); \
    vec_st( V1, 16, ADDR ); \
    vec_st( V2, 32, ADDR ); \
    vec_st( V3, 48, ADDR )

#define ALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) \
    vec_st( V0, 0, ADDR ); \
    vec_st( V1, 16, ADDR ); \
    vec_st( V2, 32, ADDR ); \
    vec_st( V3, 48, ADDR ); \
    vec_st( V4, 64, ADDR ); \
    vec_st( V5, 80, ADDR )

#define ALIGNED_STORE8( ADDR, V0, V1, V2, V3, V4, V5, V6, V7 ) \
    vec_st( V0, 0, ADDR ); \
    vec_st( V1, 16, ADDR ); \
    vec_st( V2, 32, ADDR ); \
    vec_st( V3, 48, ADDR ); \
    vec_st( V4, 64, ADDR ); \
    vec_st( V5, 80, ADDR ); \
    vec_st( V6, 96, ADDR ); \
    vec_st( V7, 112, ADDR )

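// vec_st offsets are in bytes, and the hardware truncates the effective
// address to a 16-byte boundary, so these macros require ADDR to be 16-byte
// aligned; consecutive vectors land 16 bytes (4 floats) apart.
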
// Unaligned storing floats. These assume that we can trash the input
#define UNALIGNED_STORE1( ADDR, V0 ) { \
    /* use store element */ \
    vector unsigned char ULStoreMacroPerm = vec_lvsr( 0, ADDR ); \
    V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
    vec_ste( V0, 0, ADDR ); \
    vec_ste( V0, 4, ADDR ); \
    vec_ste( V0, 8, ADDR ); \
    vec_ste( V0, 12, ADDR ); \
    }
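
// How UNALIGNED_STORE1 works: vec_lvsr yields a permute map that rotates V0
// right by (ADDR & 15) bytes, so each float ends up in the 4-byte slot that
// matches its target address. vec_ste then stores one element per call at
// byte offsets 0, 4, 8 and 12, which together cover all four floats without
// touching the surrounding memory.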

#define UNALIGNED_STORE2( ADDR, V0, V1 ) { \
    /* load up the values that are there now */ \
    vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
    vector float ULStoreMacro2 = vec_ld( 31, ADDR ); \
    /* generate permute vector and mask */ \
    vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
    vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
    /* right rotate input data */ \
    V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
    V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
    /* setup the output vectors */ \
    vector float ULStoreVal1, ULStoreVal2, ULStoreVal3; \
    ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
    ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
    ULStoreVal3 = vec_sel( V1, ULStoreMacro2, ULStoreMacroMask ); \
    /* store results */ \
    vec_st( ULStoreVal1, 0, ADDR ); \
    vec_st( ULStoreVal2, 15, ADDR ); \
    vec_st( ULStoreVal3, 31, ADDR ); }

#define UNALIGNED_STORE3( ADDR, V0, V1, V2 ) { \
    /* load up the values that are there now */ \
    vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
    vector float ULStoreMacro2 = vec_ld( 47, ADDR ); \
    /* generate permute vector and mask */ \
    vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
    vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
    /* right rotate input data */ \
    V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
    V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
    V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
    /* setup the output vectors */ \
    vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4; \
    ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
    ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
    ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
    ULStoreVal4 = vec_sel( V2, ULStoreMacro2, ULStoreMacroMask ); \
    /* store results */ \
    vec_st( ULStoreVal1, 0, ADDR ); \
    vec_st( ULStoreVal2, 15, ADDR ); \
    vec_st( ULStoreVal3, 31, ADDR ); \
    vec_st( ULStoreVal4, 47, ADDR ); }

#define UNALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) { \
    /* load up the values that are there now */ \
    vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
    vector float ULStoreMacro2 = vec_ld( 63, ADDR ); \
    /* generate permute vector and mask */ \
    vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
    vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
    /* right rotate input data */ \
    V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
    V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
    V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
    V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
    /* setup the output vectors */ \
    vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5; \
    ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
    ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
    ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
    ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
    ULStoreVal5 = vec_sel( V3, ULStoreMacro2, ULStoreMacroMask ); \
    /* store results */ \
    vec_st( ULStoreVal1, 0, ADDR ); \
    vec_st( ULStoreVal2, 15, ADDR ); \
    vec_st( ULStoreVal3, 31, ADDR ); \
    vec_st( ULStoreVal4, 47, ADDR ); \
    vec_st( ULStoreVal5, 63, ADDR ); }

#define UNALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) { \
    /* load up the values that are there now */ \
    vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
    vector float ULStoreMacro2 = vec_ld( 95, ADDR ); \
    /* generate permute vector and mask */ \
    vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
    vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
    /* right rotate input data */ \
    V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
    V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
    V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
    V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
    V4 = vec_perm( V4, V4, ULStoreMacroPerm ); \
    V5 = vec_perm( V5, V5, ULStoreMacroPerm ); \
    /* setup the output vectors */ \
    vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5, ULStoreVal6, ULStoreVal7; \
    ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
    ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
    ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
    ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
    ULStoreVal5 = vec_sel( V3, V4, ULStoreMacroMask ); \
    ULStoreVal6 = vec_sel( V4, V5, ULStoreMacroMask ); \
    ULStoreVal7 = vec_sel( V5, ULStoreMacro2, ULStoreMacroMask ); \
    /* store results */ \
    vec_st( ULStoreVal1, 0, ADDR ); \
    vec_st( ULStoreVal2, 15, ADDR ); \
    vec_st( ULStoreVal3, 31, ADDR ); \
    vec_st( ULStoreVal4, 47, ADDR ); \
    vec_st( ULStoreVal5, 63, ADDR ); \
    vec_st( ULStoreVal6, 79, ADDR ); \
    vec_st( ULStoreVal7, 95, ADDR ); }

#define UNALIGNED_STORE9( ADDR, V0, V1, V2, V3, V4, V5, V6, V7, V8 ) { \
    /* load up the values that are there now */ \
    vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
    vector float ULStoreMacro2 = vec_ld( 143, ADDR ); \
    /* generate permute vector and mask */ \
    vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
    vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
    /* right rotate input data */ \
    V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
    V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
    V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
    V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
    V4 = vec_perm( V4, V4, ULStoreMacroPerm ); \
    V5 = vec_perm( V5, V5, ULStoreMacroPerm ); \
    V6 = vec_perm( V6, V6, ULStoreMacroPerm ); \
    V7 = vec_perm( V7, V7, ULStoreMacroPerm ); \
    V8 = vec_perm( V8, V8, ULStoreMacroPerm ); \
    /* setup the output vectors */ \
    vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5, ULStoreVal6, ULStoreVal7; \
    vector float ULStoreVal8, ULStoreVal9, ULStoreVal10; \
    ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
    ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
    ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
    ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
    ULStoreVal5 = vec_sel( V3, V4, ULStoreMacroMask ); \
    ULStoreVal6 = vec_sel( V4, V5, ULStoreMacroMask ); \
    ULStoreVal7 = vec_sel( V5, V6, ULStoreMacroMask ); \
    ULStoreVal8 = vec_sel( V6, V7, ULStoreMacroMask ); \
    ULStoreVal9 = vec_sel( V7, V8, ULStoreMacroMask ); \
    ULStoreVal10 = vec_sel( V8, ULStoreMacro2, ULStoreMacroMask ); \
    /* store results */ \
    vec_st( ULStoreVal1, 0, ADDR ); \
    vec_st( ULStoreVal2, 15, ADDR ); \
    vec_st( ULStoreVal3, 31, ADDR ); \
    vec_st( ULStoreVal4, 47, ADDR ); \
    vec_st( ULStoreVal5, 63, ADDR ); \
    vec_st( ULStoreVal6, 79, ADDR ); \
    vec_st( ULStoreVal7, 95, ADDR ); \
    vec_st( ULStoreVal8, 111, ADDR ); \
    vec_st( ULStoreVal9, 127, ADDR ); \
    vec_st( ULStoreVal10, 143, ADDR ); }
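
// The wider unaligned stores all use the same pattern: load the first and
// last 16-byte blocks that overlap the destination, rotate every input vector
// right by (ADDR & 15) bytes, build a byte mask from vec_lvsr, and vec_sel
// each pair of neighbors so the two edge blocks keep their original bytes.
// A minimal usage sketch (hypothetical buffer, for illustration only):
#if 0
    float out[12];                      // not necessarily 16-byte aligned
    vector float a = (vector float)(1.0);
    vector float b = (vector float)(2.0);
    UNALIGNED_STORE2( &out[1], a, b );  // writes out[1]..out[8]; trashes a and b
#endif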

/*
============
idSIMD_AltiVec::GetName
============
*/
const char *idSIMD_AltiVec::GetName( void ) const {
    return "AltiVec";
}

/*
    Helper Functions
*/
#if 0
// Prints the values of a vector, useful for debugging but
// should never be called in real code
inline void debugPrintVector( vector float v, char *msg ) {
    printf("%s -- %vf\n", msg, v );
}

inline void debugPrintVector( vector unsigned int v, char *msg ) {
    printf("%s -- %vd\n", msg, v );
}

inline void debugPrintVector( vector bool int v, char *msg ) {
    printf("%s -- %vi\n", msg, v );
}

inline void debugPrintVector( vector unsigned char v, char *msg ) {
    printf("%s -- %vuc\n", msg, v );
}

inline void debugPrintVector( vector unsigned short v, char *msg ) {
    printf("%s -- %vs\n", msg, v );
}
#endif
/*
===============
    Reciprocal

    For each element in vector:
    n = 1 / n
===============
*/

// Use Newton-Raphson to calculate reciprocal of a vector
inline vector float Reciprocal( vector float v ) {
    //Get the reciprocal estimate
    vector float estimate = vec_re( v );
    //One round of Newton-Raphson refinement
    return vec_madd( vec_nmsub( estimate, v, (vector float)(1.0) ), estimate, estimate );
}
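
// The refinement step computes e1 = e0 + e0 * ( 1 - v * e0 ), one
// Newton-Raphson round for f(x) = 1/x - v. vec_re guarantees a relative error
// of at most 1/4096, so a single round gets close to full single precision.
// A scalar sketch of the same step (hypothetical helper, for reference only):
#if 0
inline float ReciprocalScalar( float v ) {
    float e = HardwareReciprocalEstimate( v ); // stand-in for vec_re's estimate
    return e + e * ( 1.0f - v * e );           // one Newton-Raphson round
}
#endif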

/*
===============
    ReciprocalSquareRoot

    For each element in vector:
    n = 1 / sqrt(n)
===============
*/
// Reciprocal square root estimate of a vector
inline vector float ReciprocalSquareRoot( vector float v ) {
    //Get the square root reciprocal estimate
    vector float zero = (vector float)(0);
    vector float oneHalf = (vector float)(0.5);
    vector float one = (vector float)(1.0);
    vector float estimate = vec_rsqrte( vec_max( v, (vector float)(FLT_MIN) ) );

    //One round of Newton-Raphson refinement
    vector float estimateSquared = vec_madd( estimate, estimate, zero );
    vector float halfEstimate = vec_madd( estimate, oneHalf, zero );
    return vec_madd( vec_nmsub( v, estimateSquared, one ), halfEstimate, estimate );
}
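
// Here the refinement is e1 = e0 + 0.5 * e0 * ( 1 - v * e0 * e0 ), the
// Newton-Raphson round for f(x) = 1/x^2 - v. Clamping the input to FLT_MIN
// keeps vec_rsqrte from returning an infinity for zero inputs.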


/*
===============
    Divide

    For each element in vectors:
    n = a / b
===============
*/
// Use reciprocal estimate and multiply to divide a vector
inline vector float Divide( vector float a, vector float b ) {
    return vec_madd( a, Reciprocal( b ), (vector float)(0) );
}

/*
===============
    loadSplatUnalignedScalar

    For each element in vector:
    n = s
===============
*/
inline vector float loadSplatUnalignedScalar( const float *s ) {
    vector unsigned char splatMap = vec_lvsl( 0, s );
    vector float v = vec_ld( 0, s );
    splatMap = (vector unsigned char) vec_splat( (vector float) splatMap, 0 );
    return vec_perm( v, v, splatMap );
}
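
// vec_lvsl( 0, s ) yields the byte sequence { o, o+1, ..., o+15 } where
// o = (address of s) & 15. Splatting its first 4-byte element repeats
// { o, o+1, o+2, o+3 }, so the final vec_perm copies the float at s into
// every lane. This relies on the float being naturally aligned, so it never
// straddles a 16-byte boundary.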

/*
===============
    VectorATan16

    For each element in vector:
    n = idMath::ATan16( x, y )
===============
*/
// calculates arc tangent of a vector with 16 bits of precision, based on atan16 in idMath
inline vector float VectorATan16( vector float x, vector float y ) {

    vector float xDivY = Divide( x, y );
    vector float yDivX = Divide( y, x );
    vector float zeroVector = (vector float)(0);

    vector bool int vecCmp = vec_cmpgt( vec_abs( y ), vec_abs( x ) );
    vector float vecA = vec_sel( yDivX, xDivY, vecCmp );
    vector bool int vecCmp2 = vec_cmplt( vecA, zeroVector );
    vector float vecS = vec_madd( vecA, vecA, (vector float)(0) );

    // do calculation for S
    vector float vecWork1 = vec_madd( (vector float)(0.0028662257f), vecS, (vector float)(-0.0161657367f) );
    vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.0429096138f) );
    vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.0752896400f) );
    vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1065626393f) );
    vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.1420889944f) );
    vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1999355085f) );
    vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.3333314528f) );
    vecWork1 = vec_madd( vecWork1, vecS, (vector float)(1) );

    // get the regular S value
    vecS = vec_madd( vecWork1, vecA, (vector float)(0) );

    // calculate what to return if y > x
    vector float negSPlusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(0.5f * 3.14159265358979323846f) );
    vector float negSMinusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(-0.5f * 3.14159265358979323846f) );
    vector float modRet = vec_sel( negSPlusHalfPI, negSMinusHalfPI, vecCmp2 );

    return vec_sel( modRet, vecS, vecCmp );
}
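
// This evaluates the same odd polynomial as idMath::ATan16, via Horner's rule
// in s = a*a. Where |y| > |x| the ratio is folded to x/y and the result is
// corrected by +/- PI/2 (sign chosen by vecCmp2); the final select picks the
// folded or unfolded result per lane.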

/*
===============
    VectorSin16

    For each element in vector:
    n = idMath::Sin16( v )
===============
*/
inline vector float VectorSin16( vector float v ) {
    vector float zero = (vector float)(0);

#if 0
    // load up half PI and use it to calculate the rest of the values. This is
    // sometimes cheaper than loading them from memory

    vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
    vector float PI = vec_add( halfPI, halfPI );
    vector float oneandhalfPI = vec_add( PI, halfPI );
    vector float twoPI = vec_add( oneandhalfPI, halfPI );
#else
    vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
    vector float PI = (vector float)(3.14159265358979323846f);
    vector float oneandhalfPI = (vector float)(3.14159265358979323846f + ( 0.5f * 3.14159265358979323846f ) );
    vector float twoPI = (vector float)( 2.0f * 3.14159265358979323846f );
#endif

    vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4;

    vector float vecMod;
    vector float vecResult;

    // fix the range if need be
    vecMod = vec_floor( Divide( v, twoPI ) );
    vecResult = vec_nmsub( vecMod, twoPI, v );

    vector float vecPIminusA = vec_sub( PI, vecResult );
    vector float vecAminus2PI = vec_sub( vecResult, twoPI );

    vecCmp1 = vec_cmplt( vecResult, PI );
    vecCmp2 = vec_cmpgt( vecResult, halfPI );

    // these are the ones where a > PI + HALF_PI so set a = a - TWO_PI
    vecCmp3 = vec_cmpgt( vecResult, oneandhalfPI );

    // we also want to set a = PI - a everywhere that !(a < PI) and !(a > PI + HALF_PI)
    vecCmp4 = vec_and( vec_xor( vecCmp3, (vector bool int)(1) ), vec_xor( vecCmp1, (vector bool int)(1) ) ); // everywhere that both of those are false

    // these are ones where a < PI and a > HALF_PI so we set a = PI - a
    vecCmp1 = vec_and( vecCmp1, vecCmp2 );
    vecCmp1 = vec_or( vecCmp1, vecCmp4 );

    // put the correct values into place
    vecResult = vec_sel( vecResult, vecPIminusA, vecCmp1 );
    vecResult = vec_sel( vecResult, vecAminus2PI, vecCmp3 );

    // calculate answer
    vector float vecASquared = vec_madd( vecResult, vecResult, zero );
    vector float vecEst = vec_madd( (vector float)(-2.39e-08f), vecASquared, (vector float)(2.7526e-06f) );
    vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.98409e-04f) );
    vecEst = vec_madd( vecEst, vecASquared, (vector float)(8.3333315e-03f) );
    vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.666666664e-01f) );
    vecEst = vec_madd( vecEst, vecASquared, (vector float)(1.0f) );
    return vec_madd( vecResult, vecEst, zero );
}
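
// Range reduction first maps the angle into [0, 2*PI), then folds it toward
// [-PI/2, PI/2] (a = PI - a in the middle quadrants, a = a - 2*PI at the top)
// before evaluating the Sin16 polynomial in a*a with Horner's rule.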

/*
===============
    vecSplatWithRunTime

    For each element in vector:
    n = v(i)
===============
*/
// splats an element across a vector using a runtime variable
inline vector float vecSplatWithRunTime( vector float v, int i ) {
    vector unsigned char rotate = vec_lvsl( i * sizeof( float ), (int*) 0L );
    v = vec_perm( v, v, rotate );
    return vec_splat( v, 0 );
}
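
// vec_splat needs a compile-time constant lane index, so for a runtime index
// the code builds a rotate map instead: vec_lvsl with byte offset i * 4 from
// a null pointer yields { 4i, 4i+1, ... }, the perm rotates element i into
// lane 0, and a constant vec_splat( v, 0 ) broadcasts it.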


/*
===============
    FastScalarInvSqrt

    n = 1 / sqrt( f )
===============
*/
inline float FastScalarInvSqrt( float f ) {
#ifdef PPC_INTRINSICS
    float estimate;
    const float kSmallestFloat = FLT_MIN;

    //Calculate a 5 bit starting estimate for the reciprocal sqrt
    estimate = __frsqrte ( f + kSmallestFloat );

    //if you require less precision, you may reduce the number of refinement rounds.
    // This will do 2 rounds of NR
    estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
    estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
    return estimate;
#else
    return idMath::InvSqrt( f );
#endif
}
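
// Newton-Raphson converges quadratically, roughly doubling the correct bits
// per round: the ~5-bit __frsqrte estimate becomes ~10 bits after one round
// and nearly full single precision after two.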

/*
===============
    FastScalarInvSqrt_x3

    arg1 = 1 / sqrt( arg1 )
    arg2 = 1 / sqrt( arg2 )
    arg3 = 1 / sqrt( arg3 )
===============
*/
inline void FastScalarInvSqrt_x3( float *arg1, float *arg2, float *arg3 ) {
#ifdef PPC_INTRINSICS
    register float estimate1, estimate2, estimate3;
    const float kSmallestFloat = FLT_MIN;

    //Calculate a 5 bit starting estimate for the reciprocal sqrt of each
    estimate1 = __frsqrte ( *arg1 + kSmallestFloat );
    estimate2 = __frsqrte ( *arg2 + kSmallestFloat );
    estimate3 = __frsqrte ( *arg3 + kSmallestFloat );

    // two rounds newton-raphson
    estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
    estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
    estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
    estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
    estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
    estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );

    *arg1 = estimate1;
    *arg2 = estimate2;
    *arg3 = estimate3;
#else
    *arg1 = idMath::InvSqrt( *arg1 );
    *arg2 = idMath::InvSqrt( *arg2 );
    *arg3 = idMath::InvSqrt( *arg3 );
#endif
}

/*
===============
    FastScalarInvSqrt_x6

    arg1 = 1 / sqrt( arg1 )
    arg2 = 1 / sqrt( arg2 )
    arg3 = 1 / sqrt( arg3 )
    arg4 = 1 / sqrt( arg4 )
    arg5 = 1 / sqrt( arg5 )
    arg6 = 1 / sqrt( arg6 )

    On a G5, you've got 2 pipeline stages to fill. (2 FPU's with 6 stages each)
===============
*/
inline void FastScalarInvSqrt_x6( float *arg1, float *arg2, float *arg3, float *arg4, float *arg5, float *arg6 ) {
#ifdef PPC_INTRINSICS
    register float estimate1, estimate2, estimate3, estimate4, estimate5, estimate6;
    const float kSmallestFloat = FLT_MIN;

    //Calculate a 5 bit starting estimate for the reciprocal sqrt of each
    estimate1 = __frsqrte ( *arg1 + kSmallestFloat );
    estimate2 = __frsqrte ( *arg2 + kSmallestFloat );
    estimate3 = __frsqrte ( *arg3 + kSmallestFloat );
    estimate4 = __frsqrte ( *arg4 + kSmallestFloat );
    estimate5 = __frsqrte ( *arg5 + kSmallestFloat );
    estimate6 = __frsqrte ( *arg6 + kSmallestFloat );

    // two rounds newton-raphson
    estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
    estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
    estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
    estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
    estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
    estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );

    estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
    estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
    estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
    estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
    estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
    estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );

    *arg1 = estimate1;
    *arg2 = estimate2;
    *arg3 = estimate3;
    *arg4 = estimate4;
    *arg5 = estimate5;
    *arg6 = estimate6;
#else
    *arg1 = idMath::InvSqrt( *arg1 );
    *arg2 = idMath::InvSqrt( *arg2 );
    *arg3 = idMath::InvSqrt( *arg3 );
    *arg4 = idMath::InvSqrt( *arg4 );
    *arg5 = idMath::InvSqrt( *arg5 );
    *arg6 = idMath::InvSqrt( *arg6 );
#endif
}
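
// The _x3 and _x6 variants interleave independent estimate chains so the
// floating-point pipelines stay busy instead of stalling on one dependency
// chain, which should make the per-result cost lower than calling
// FastScalarInvSqrt repeatedly.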


// End Helper Functions

#ifdef ENABLE_SIMPLE_MATH

/*
============
idSIMD_AltiVec::Add

    dst[i] = constant + src[i];
============
*/
void VPCALL idSIMD_AltiVec::Add( float *dst, const float constant, const float *src, const int count ) {
    vector float v0, v1, v2, v3;
    vector float v0_low, v0_hi, v1_hi;
    vector unsigned char permVec;
    vector float constVec;
    int i;

    // handle unaligned cases at beginning
    for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant + src[i];
    }

    //splat constant into a vector
    constVec = loadSplatUnalignedScalar( &constant );

    //calculate permute and do first load
    permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), (vector unsigned char)(1) );
    v1_hi = vec_ld( 0, &src[i] );

    //vectorize!
    for ( ; i+7 < count; i += 8 ) {
        //load source
        v0_low = v1_hi;
        v0_hi = vec_ld( 15, &src[i] );
        v1_hi = vec_ld( 31, &src[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec );
        v1 = vec_perm( v0_hi, v1_hi, permVec );

        v2 = vec_add( v0, constVec );
        v3 = vec_add( v1, constVec );

        // store results
        ALIGNED_STORE2( &dst[i], v2, v3 );
    }

    //handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] = constant + src[i];
    }
}
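
// All of the kernels above and below share one load strategy: vec_ld ignores
// the low 4 address bits, so the code loads the 16-byte blocks at offsets 0,
// 15 and 31 from &src[i] and uses vec_perm with "vec_lvsl( -1, ptr ) + 1" to
// splice the misaligned floats out of adjacent blocks. Offsets 15 and 31
// (rather than 16 and 32) guarantee the final iteration never touches a block
// past the last source element, and each iteration reuses the previous
// trailing block, so eight floats cost two fresh loads instead of three.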

/*
============
idSIMD_AltiVec::Add

    dst[i] = src0[i] + src1[i];
============
*/
void VPCALL idSIMD_AltiVec::Add( float *dst, const float *src0, const float *src1, const int count ) {

    register vector float v0, v1, v2, v3, v4, v5;
    //src0
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    //src1
    register vector float v1_low, v1_hi, v3_low, v3_hi;
    //permute vectors
    register vector unsigned char permVec1, permVec2;
    vector unsigned char oneCharVector = (vector unsigned char)(1);

    int i;

    //unaligned at start
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = src0[i] + src1[i];
    }

    //calculate permute and do loads
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
    permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src0[i] );
    v3_hi = vec_ld( 0, &src1[i] );

    //vectorize!
    for ( ; i+7 < count; i += 8 ) {
        //load source
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src0[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src0[i] );

        v1_low = v3_hi;
        v1_hi = vec_ld( 15, &src1[i] );
        v3_low = v1_hi;
        v3_hi = vec_ld( 31, &src1[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v1 = vec_perm( v1_low, v1_hi, permVec2 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );
        v3 = vec_perm( v3_low, v3_hi, permVec2 );

        v4 = vec_add( v0, v1 );
        v5 = vec_add( v2, v3 );

        ALIGNED_STORE2( &dst[i], v4, v5 );

    }

    //handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] = src0[i] + src1[i];
    }
}

/*
============
idSIMD_AltiVec::Sub

    dst[i] = constant - src[i];
============
*/
void VPCALL idSIMD_AltiVec::Sub( float *dst, const float constant, const float *src, const int count ) {

    register vector float v0, v1, v2, v3;
    register vector float v0_low, v0_hi, v1_low, v1_hi;
    register vector unsigned char permVec;
    register vector float constVec;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    //handle unaligned at start
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant - src[i];
    }

    //splat constant into a vector
    constVec = loadSplatUnalignedScalar( &constant );

    //calculate permute vector and do first load
    permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
    v1_hi = vec_ld( 0, &src[i] );

    //vectorize!
    for ( ; i+7 < count; i += 8 ) {
        //load source
        v0_low = v1_hi;
        v0_hi = vec_ld( 15, &src[i] );
        v1_low = v0_hi;
        v1_hi = vec_ld( 31, &src[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec );
        v1 = vec_perm( v1_low, v1_hi, permVec );

        v2 = vec_sub( constVec, v0 );
        v3 = vec_sub( constVec, v1 );

        ALIGNED_STORE2( &dst[i], v2, v3 );
    }

    //handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] = constant - src[i];
    }
}

/*
============
idSIMD_AltiVec::Sub

    dst[i] = src0[i] - src1[i];
============
*/
void VPCALL idSIMD_AltiVec::Sub( float *dst, const float *src0, const float *src1, const int count ) {
    register vector float v0, v1, v2, v3, v4, v5;
    //src0
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    //src1
    register vector float v1_low, v1_hi, v3_low, v3_hi;
    register vector unsigned char permVec1, permVec2;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    //handle unaligned at start
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = src0[i] - src1[i];
    }

    //calculate permute and do first loads
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
    permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src0[i] );
    v3_hi = vec_ld( 0, &src1[i] );

    //vectorize!
    for ( ; i+7 < count; i += 8 ) {
        //load source
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src0[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src0[i] );

        v1_low = v3_hi;
        v1_hi = vec_ld( 15, &src1[i] );
        v3_low = v1_hi;
        v3_hi = vec_ld( 31, &src1[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v1 = vec_perm( v1_low, v1_hi, permVec2 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );
        v3 = vec_perm( v3_low, v3_hi, permVec2 );

        v4 = vec_sub( v0, v1 );
        v5 = vec_sub( v2, v3 );

        ALIGNED_STORE2( &dst[i], v4, v5 );
    }

    //handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] = src0[i] - src1[i];
    }
}

/*
============
idSIMD_AltiVec::Mul

    dst[i] = constant * src[i];
============
*/
void VPCALL idSIMD_AltiVec::Mul( float *dst, const float constant, const float *src, const int count) {
    register vector float v0, v0_low, v0_hi, v1_low, v1_hi, v1, v2, v3;
    register vector float constVec;
    register vector unsigned char permVec;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    register vector float zeroVector = (vector float)(0.0);
    int i;

    // handle unaligned data at start
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant * src[i];
    }

    //splat constant into a vector
    constVec = loadSplatUnalignedScalar( &constant );

    permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
    v1_hi = vec_ld( 0, &src[i] );

    //vectorize!
    for ( ; i+7 < count; i += 8 ) {
        //load source
        v0_low = v1_hi;
        v0_hi = vec_ld( 15, &src[i] );
        v1_low = v0_hi;
        v1_hi = vec_ld( 31, &src[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec );
        v1 = vec_perm( v1_low, v1_hi, permVec );

        v2 = vec_madd( constVec, v0, zeroVector );
        v3 = vec_madd( constVec, v1, zeroVector );

        ALIGNED_STORE2( &dst[i], v2, v3 );
    }

    //handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] = constant * src[i];
    }
}

/*
============
idSIMD_AltiVec::Mul

    dst[i] = src0[i] * src1[i];
============
*/
void VPCALL idSIMD_AltiVec::Mul( float *dst, const float *src0, const float *src1, const int count ) {
    register vector float v0, v1, v2, v3, v4, v5;
    //src0
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    //src1
    register vector float v1_low, v1_hi, v3_low, v3_hi;
    //permute vectors
    register vector unsigned char permVec1, permVec2;
    register vector float constVec = (vector float)(0.0);
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    //handle unaligned at start
    for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = src0[i] * src1[i];
    }

    //calculate permute and do loads
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
    permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src0[i] );
    v3_hi = vec_ld( 0, &src1[i] );

    //vectorize!
    for ( ; i+7 < count; i += 8 ) {
        //load source
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src0[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src0[i] );

        v1_low = v3_hi;
        v1_hi = vec_ld( 15, &src1[i] );
        v3_low = v1_hi;
        v3_hi = vec_ld( 31, &src1[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v1 = vec_perm( v1_low, v1_hi, permVec2 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );
        v3 = vec_perm( v3_low, v3_hi, permVec2 );

        //no such thing as a plain vector multiply, so we
        //multiply then add zero
        v4 = vec_madd( v0, v1, constVec );
        v5 = vec_madd( v2, v3, constVec );

        ALIGNED_STORE2( &dst[i], v4, v5 );
    }

    //handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] = src0[i] * src1[i];
    }
}

/*
============
idSIMD_AltiVec::Div

    dst[i] = constant / divisor[i];
============
*/
void VPCALL idSIMD_AltiVec::Div( float *dst, const float constant, const float *divisor, const int count ) {
    register vector float v0, v1, v2, v3;
    register vector float v0_low, v0_hi, v1_low, v1_hi;
    register vector unsigned char permVec;
    register vector float constVec;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    //handle unaligned at start
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant / divisor[i];
    }

    //splat constant into a vector
    constVec = loadSplatUnalignedScalar( &constant );

    //calculate permute and do first loads
    permVec = vec_add( vec_lvsl( -1, (int*) &divisor[i] ), oneCharVector );
    v1_hi = vec_ld( 0, &divisor[i] );

    //vectorize!
    for ( ; i+7 < count; i += 8 ) {
        //load source
        v0_low = v1_hi;
        v0_hi = vec_ld( 15, &divisor[i] );
        v1_low = v0_hi;
        v1_hi = vec_ld( 31, &divisor[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec );
        v1 = vec_perm( v1_low, v1_hi, permVec );

        v2 = Divide( constVec, v0 );
        v3 = Divide( constVec, v1 );

        ALIGNED_STORE2( &dst[i], v2, v3 );
    }

    //handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] = constant / divisor[i];
    }
}

/*
============
idSIMD_AltiVec::Div

    dst[i] = src0[i] / src1[i];
============
*/
void VPCALL idSIMD_AltiVec::Div( float *dst, const float *src0, const float *src1, const int count ) {
    register vector float v0, v1, v2, v3, v4, v5;
    //src0
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    //src1
    register vector float v1_low, v1_hi, v3_low, v3_hi;
    //permute vectors
    register vector unsigned char permVec1, permVec2;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    //handle unaligned at start
    for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = src0[i] / src1[i];
    }

    //calculate permute and do loads
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
    permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src0[i] );
    v3_hi = vec_ld( 0, &src1[i] );

    //vectorize!
    for ( ; i+7 < count; i += 8 ) {
        //load source
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src0[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src0[i] );

        v1_low = v3_hi;
        v1_hi = vec_ld( 15, &src1[i] );
        v3_low = v1_hi;
        v3_hi = vec_ld( 31, &src1[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v1 = vec_perm( v1_low, v1_hi, permVec2 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );
        v3 = vec_perm( v3_low, v3_hi, permVec2 );

        v4 = Divide( v0, v1 );
        v5 = Divide( v2, v3 );

        ALIGNED_STORE2( &dst[i], v4, v5 );
    }

    //handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] = src0[i] / src1[i];
    }
}
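
// Note: Divide() multiplies by a Newton-Raphson refined reciprocal estimate,
// so the vector path of these Div kernels may differ from exact IEEE division
// in the last bit; the scalar head and tail loops use true division.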

/*
============
idSIMD_AltiVec::MulAdd

    dst[i] += constant * src[i];
============
*/
void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float constant, const float *src, const int count ) {

    register vector float v0, v1, v2, v3, v4, v5;
    register vector float constVec;
    //src
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    //permute vectors
    register vector unsigned char permVec1;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    //handle unaligned at start
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] += constant * src[i];
    }

    //splat constant into a vector
    constVec = loadSplatUnalignedScalar( &constant );

    //calculate permute and do loads
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src[i] );

    //vectorize!
    for ( ; i+7 < count; i += 8 ) {
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );

        // at this point, dst is known to be aligned
        v1 = vec_ld( 0, &dst[i] );
        v3 = vec_ld( 16, &dst[i] );

        v4 = vec_madd( constVec, v0, v1 );
        v5 = vec_madd( constVec, v2, v3 );

        ALIGNED_STORE2( &dst[i], v4, v5 );
    }

    //handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] += constant * src[i];
    }
}

/*
============
idSIMD_AltiVec::MulAdd

    dst[i] += src0[i] * src1[i];
============
*/
void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
    register vector float v0, v1, v2, v3, v4, v5, v6, v7;
    //src0
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    //src1
    register vector float v1_low, v1_hi, v3_low, v3_hi;
    //permute vectors
    register vector unsigned char permVec1, permVec2;
    vector unsigned char oneCharVector = (vector unsigned char)(1);

    int i;

    //unaligned at start
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] += src0[i] * src1[i];
    }

    //calculate permute and do loads
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
    permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src0[i] );
    v3_hi = vec_ld( 0, &src1[i] );

    //vectorize!
    for ( ; i+7 < count; i += 8 ) {
        // load sources
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src0[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src0[i] );

        v1_low = v3_hi;
        v1_hi = vec_ld( 15, &src1[i] );
        v3_low = v1_hi;
        v3_hi = vec_ld( 31, &src1[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v1 = vec_perm( v1_low, v1_hi, permVec2 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );
        v3 = vec_perm( v3_low, v3_hi, permVec2 );

        //we know dst is aligned because we handled unaligned cases
        //up front
        v4 = vec_ld( 0, &dst[i] );
        v5 = vec_ld( 16, &dst[i] );

        v6 = vec_madd( v0, v1, v4 );
        v7 = vec_madd( v2, v3, v5 );

        ALIGNED_STORE2( &dst[i], v6, v7 );
    }

    //handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] += src0[i] * src1[i];
    }
}

/*
============
idSIMD_AltiVec::MulSub

    dst[i] -= constant * src[i];
============
*/
void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float constant, const float *src, const int count ) {
    register vector float v0, v1, v2, v3, v4, v5;
    register vector float constVec;
    //src
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    //permute vectors
    register vector unsigned char permVec1;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    //handle unaligned at start
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] -= constant * src[i];
    }

    //splat constant into a vector
    constVec = loadSplatUnalignedScalar( &constant );

    //calculate permute and do loads
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src[i] );

    //vectorize!
    for ( ; i+7 < count; i += 8 ) {
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );

        //we know dst will be aligned here because we already handled the preceding
        //unaligned cases
        v1 = vec_ld( 0, &dst[i] );
        v3 = vec_ld( 16, &dst[i] );

        v4 = vec_nmsub( v0, constVec, v1 );
        v5 = vec_nmsub( v2, constVec, v3 );

        ALIGNED_STORE2( &dst[i], v4, v5 );
    }

    //handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] -= constant * src[i];
    }
}

/*
============
idSIMD_AltiVec::MulSub

    dst[i] -= src0[i] * src1[i];
============
*/
void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
    register vector float v0, v1, v2, v3, v4, v5, v6, v7;
    //src0
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    //src1
    register vector float v1_low, v1_hi, v3_low, v3_hi;
    //permute vectors
    register vector unsigned char permVec1, permVec2;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    //unaligned at start
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] -= src0[i] * src1[i];
    }

    //calculate permute and do loads
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
    permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src0[i] );
    v3_hi = vec_ld( 0, &src1[i] );

    //vectorize!
    for ( ; i+7 < count; i += 8 ) {
        // load sources
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src0[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src0[i] );

        v1_low = v3_hi;
        v1_hi = vec_ld( 15, &src1[i] );
        v3_low = v1_hi;
        v3_hi = vec_ld( 31, &src1[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v1 = vec_perm( v1_low, v1_hi, permVec2 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );
        v3 = vec_perm( v3_low, v3_hi, permVec2 );

        //we know dst is aligned because we handled unaligned cases
        //up front
        v4 = vec_ld( 0, &dst[i] );
        v5 = vec_ld( 16, &dst[i] );

        v6 = vec_nmsub( v0, v1, v4 );
        v7 = vec_nmsub( v2, v3, v5 );

        ALIGNED_STORE2( &dst[i], v6, v7 );
    }

    //handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] -= src0[i] * src1[i];
    }
}

#endif /* ENABLE_SIMPLE_MATH */
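
// A minimal usage sketch for the simple-math kernels (hypothetical buffers,
// for illustration only):
#if 0
    float dst[1024], src0[1024], src1[1024];
    idSIMD_AltiVec simd;
    simd.Mul( dst, 2.5f, src0, 1024 );     // dst[i] = 2.5f * src0[i]
    simd.MulAdd( dst, src0, src1, 1024 );  // dst[i] += src0[i] * src1[i]
#endif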

#ifdef ENABLE_DOT
/*
============
idSIMD_AltiVec::Dot

    dst[i] = constant * src[i];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {

    register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
    register vector float vecX, vecY, vecZ;
    vector float vecX2, vecY2, vecZ2;
    const float *addr = src[0].ToFloatPtr();
    float tempVal[4];
    float constVal[4];
    register vector float zeroVector = (vector float)(0.0);
    register vector float vecConstX, vecConstY, vecConstZ;

    // permute vectors
    register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
    register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);

    register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
    register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);

    register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
    register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);

    int i;

    // for scalar cleanup, if necessary
    constVal[0] = constant[0];
    constVal[1] = constant[1];
    constVal[2] = constant[2];
    constVal[3] = 0;

    vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
    vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
    vecLd2 = vec_ld( 11, constant.ToFloatPtr() );
    vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );


    // populate const vectors
    vecConstX = vec_splat( vecLd1, 0 );
    vecConstY = vec_splat( vecLd1, 1 );
    vecConstZ = vec_splat( vecLd1, 2 );

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant * src[i];
    }

    // calculate the permute vector and prime the first load from the actual
    // loop start; i may be nonzero after the scalar head loop above
    vector unsigned char permVec = vec_add( vec_lvsl( -1, addr + (i*3) ), (vector unsigned char)(1) );
    vector float vecOld = vec_ld( 0, addr + (i*3) );

    for ( ; i + 7 < count; i += 8 ) {
        float *vecPtr = (float*)( addr + (i*3) );
        vector float v0, v1, v2, v3, v4, v5;

        v0 = vecOld; //vec_ld( 0, vecPtr );
        v1 = vec_ld( 15, vecPtr );
        v2 = vec_ld( 31, vecPtr );
        v3 = vec_ld( 47, vecPtr );
        v4 = vec_ld( 63, vecPtr );
        v5 = vec_ld( 79, vecPtr );
        vecOld = vec_ld( 95, vecPtr );

        vecLd1 = vec_perm( v0, v1, permVec );
        vecLd2 = vec_perm( v1, v2, permVec );
        vecLd3 = vec_perm( v2, v3, permVec );

        vecLd4 = vec_perm( v3, v4, permVec );
        vecLd5 = vec_perm( v4, v5, permVec );
        vecLd6 = vec_perm( v5, vecOld, permVec );

        // permute into X Y Z vectors
        vecX = vec_perm( vecLd1, vecLd2, permX1 );
        vecY = vec_perm( vecLd1, vecLd2, permY1 );
        vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
        vecX = vec_perm( vecX, vecLd3, permX2 );
        vecY = vec_perm( vecY, vecLd3, permY2 );
        vecZ = vec_perm( vecZ, vecLd3, permZ2 );

        vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
        vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
        vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
        vecX2 = vec_perm( vecX2, vecLd6, permX2 );
        vecY2 = vec_perm( vecY2, vecLd6, permY2 );
        vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );

        // do multiply
        vecX = vec_madd( vecX, vecConstX, zeroVector );
        vecY = vec_madd( vecY, vecConstY, vecX );
        vecZ = vec_madd( vecZ, vecConstZ, vecY );

        vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
        vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
        vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );

        // store out results
        ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
    }

    //cleanup
    for ( ; i < count; i++ ) {
        // look up what's at the address we want, cast it as a float pointer, then
        // dereference that pointer
        tempVal[0] = *( addr + (i*3) + 0 );
        tempVal[1] = *( addr + (i*3) + 1 );
        tempVal[2] = *( addr + (i*3) + 2 );
        dst[i] = constVal[0] * tempVal[0] + constVal[1] * tempVal[1] + constVal[2] * tempVal[2];
    }
}
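
// The six spliced loads above cover eight idVec3s (96 bytes) of packed
// xyzxyz... data; the permX/permY/permZ tables then deinterleave them into
// xxxx, yyyy and zzzz vectors (AOS to SOA), so one madd chain per group of
// four computes four dot products at once.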


/*
============
idSIMD_AltiVec::Dot

    dst[i] = constant * src[i].Normal() + src[i][3];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
//#define OPER(X) dst[(X)] = constant * src[(X)].Normal() + src[(X)][3];

    assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );

    int i;
    float constVal[4];
    float srcVal[3];
    float srcI3;
    float tempVal;

    vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
    vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
    vector float vecX, vecY, vecZ, vecI3;
    vector float vecX2, vecY2, vecZ2, vecI32;
    vector float vecConstX, vecConstY, vecConstZ;

    constVal[0] = constant[0];
    constVal[1] = constant[1];
    constVal[2] = constant[2];
    constVal[3] = 1;

    vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
    vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
    vector float v1 = vec_ld( 11, constant.ToFloatPtr() );
    vector float vecConst = vec_perm( v0, v1, constPerm );

    vecConstX = vec_splat( vecConst, 0 );
    vecConstY = vec_splat( vecConst, 1 );
    vecConstZ = vec_splat( vecConst, 2 );

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant * src[i].Normal() + src[i][3];
    }

    // base of the plane array; the loops below index it with i, which may be
    // nonzero after the scalar head loop above
    const float *addr = src[0].ToFloatPtr();
    vector unsigned char permVec = vec_add( vec_lvsl( -1, addr + (i*PLANE_OFFSET) ), (vector unsigned char)(1) );
    vector float vecOld = vec_ld( 0, addr + (i*PLANE_OFFSET) );

    for ( ; i + 7 < count; i += 8 ) {
        float *planePtr = (float*)( addr + (i*PLANE_OFFSET) );
        vector float v0, v1, v2, v3, v4, v5, v6, v7;

        v0 = vecOld; //vec_ld( 0, planePtr );
        v1 = vec_ld( 15, planePtr );
        v2 = vec_ld( 31, planePtr );
        v3 = vec_ld( 47, planePtr );
        v4 = vec_ld( 63, planePtr );
        v5 = vec_ld( 79, planePtr );
        v6 = vec_ld( 95, planePtr );
        v7 = vec_ld( 111, planePtr );
        vecOld = vec_ld( 127, planePtr );

        vecPlaneLd1 = vec_perm( v0, v1, permVec );
        vecPlaneLd2 = vec_perm( v1, v2, permVec );
        vecPlaneLd3 = vec_perm( v2, v3, permVec );
        vecPlaneLd4 = vec_perm( v3, v4, permVec );

        vecPlaneLd5 = vec_perm( v4, v5, permVec );
        vecPlaneLd6 = vec_perm( v5, v6, permVec );
        vecPlaneLd7 = vec_perm( v6, v7, permVec );
        vecPlaneLd8 = vec_perm( v7, vecOld, permVec );

        // permute into X Y Z vectors; since this is square it's basically
        // a matrix transpose
        v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
        v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
        v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
        v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );

        vecX = vec_mergeh( v0, v1 );
        vecY = vec_mergel( v0, v1 );
        vecZ = vec_mergeh( v2, v3 );
        vecI3 = vec_mergel( v2, v3 );

        v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
        v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
        v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
        v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );

        vecX2 = vec_mergeh( v4, v5 );
        vecY2 = vec_mergel( v4, v5 );
        vecZ2 = vec_mergeh( v6, v7 );
        vecI32 = vec_mergel( v6, v7 );

        // do calculation
        v6 = vec_madd( vecZ, vecConstZ, vecI3 );
        v5 = vec_madd( vecY, vecConstY, v6 );
        v4 = vec_madd( vecX, vecConstX, v5 );

        v0 = vec_madd( vecZ2, vecConstZ, vecI32 );
        v1 = vec_madd( vecY2, vecConstY, v0 );
        v2 = vec_madd( vecX2, vecConstX, v1 );

        // store results
        ALIGNED_STORE2( &dst[i], v4, v2 );
    }

    // cleanup
    for ( ; i < count; i++ ) {
        // populate srcVal with src X Y Z
        srcVal[0] = *(addr + (i*PLANE_OFFSET) + 0 );
        srcVal[1] = *(addr + (i*PLANE_OFFSET) + 1 );
        srcVal[2] = *(addr + (i*PLANE_OFFSET) + 2 );

        // put src[i][3] into srcI3
        srcI3 = *(addr + (i*PLANE_OFFSET) + 3 );

        tempVal = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
        dst[i] = tempVal + srcI3;
    }
}
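
// The merge sequence above is the standard 4x4 transpose. With input rows
// r0 = { x0 y0 z0 d0 } ... r3 = { x3 y3 z3 d3 }:
//   mergeh( r0, r2 ) = { x0 x2 y0 y2 }, mergeh( r1, r3 ) = { x1 x3 y1 y3 },
//   and mergeh of those two = { x0 x1 x2 x3 } = vecX; likewise for Y, Z and
//   the plane distances (vecI3), which conveniently seed the madd chain.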

#ifndef DRAWVERT_PADDED
/*
============
idSIMD_AltiVec::Dot

    dst[i] = constant * src[i].xyz;
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
//#define OPER(X) dst[(X)] = constant * src[(X)].xyz;

    // idDrawVert size is 60 bytes
    assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );

    register vector float v0, v1, v2, v3, v4, v5, v6, v7;
    int i;
    register vector float vecConstX, vecConstY, vecConstZ;
    register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
    register vector float zeroVector = (vector float)(0.0);
    vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;

    vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
    v0 = vec_ld( 0, constant.ToFloatPtr() );
    v1 = vec_ld( 11, constant.ToFloatPtr() );
    v0 = vec_perm( v0, v1, constPerm );

    // permute into constant vectors
    vecConstX = vec_splat( v0, 0 );
    vecConstY = vec_splat( v0, 1 );
    vecConstZ = vec_splat( v0, 2 );

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant * src[i].xyz;
    }

    // every fourth one will have the same alignment. Make sure we've got enough here
    if ( i+3 < count ) {
        vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
        vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
        vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
        vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
    }

    for ( ; i+3 < count; i += 4 ) {
        const float *vertPtr = src[i].xyz.ToFloatPtr();
        const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
        const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
        const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();

        v0 = vec_ld( 0, vertPtr );
        v1 = vec_ld( 11, vertPtr );
        v2 = vec_ld( 0, vertPtr2 );
        v3 = vec_ld( 11, vertPtr2 );
        v4 = vec_ld( 0, vertPtr3 );
        v5 = vec_ld( 11, vertPtr3 );
        v6 = vec_ld( 0, vertPtr4 );
        v7 = vec_ld( 11, vertPtr4 );

        v0 = vec_perm( v0, v1, vertPerm1 );
        v2 = vec_perm( v2, v3, vertPerm2 );
        v4 = vec_perm( v4, v5, vertPerm3 );
        v6 = vec_perm( v6, v7, vertPerm4 );

        // transpose into X Y Z vectors
        v1 = vec_mergeh( v0, v4 );
        v3 = vec_mergeh( v2, v6 );
        v5 = vec_mergel( v0, v4 );
        v7 = vec_mergel( v2, v6 );

        vecSrcX1 = vec_mergeh( v1, v3 );
        vecSrcY1 = vec_mergel( v1, v3 );
        vecSrcZ1 = vec_mergeh( v5, v7 );

        // now calculate dot product
        vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
        vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
        vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );

        // store results
        vec_st( vecSrcZ1, 0, &dst[i] );
    }

    for ( ; i < count; i++ ) {
        dst[i] = constant * src[i].xyz;
    }
}
1620 #else
1621 /*
1622 ============
1623 idSIMD_AltiVec::Dot
1624 
1625  dst[i] = constant * src[i].xyz;
1626 ============
1627 */
1628 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
1629 //#define OPER(X) dst[(X)] = constant * src[(X)].xyz;
1630 
1631  // idDrawVert size is 64 bytes
1632  assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
1633 
1634  register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1635  int i;
1636  register vector float vecConstX, vecConstY, vecConstZ;
1637  register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
1638  register vector float zeroVector = (vector float)(0.0);
1639  vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
1640 
1641  vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1642  v0 = vec_ld( 0, constant.ToFloatPtr() );
1643  v1 = vec_ld( 11, constant.ToFloatPtr() );
1644  v0 = vec_perm( v0, v1, constPerm );
1645 
1646  // permute into constant vectors
1647  vecConstX = vec_splat( v0, 0 );
1648  vecConstY = vec_splat( v0, 1 );
1649  vecConstZ = vec_splat( v0, 2 );
1650 
1651  // handle unaligned case at beginning
1652  for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1653  dst[i] = constant * src[i].xyz;
1654  }
1655 
1656  for ( ; i+3 < count; i += 4 ) {
1657  const float *vertPtr = src[i].xyz.ToFloatPtr();
1658  const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
1659  const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
1660  const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
1661 
1662  v0 = vec_ld( 0, vertPtr );
1663  v2 = vec_ld( 0, vertPtr2 );
1664  v4 = vec_ld( 0, vertPtr3 );
1665  v6 = vec_ld( 0, vertPtr4 );
1666 
1667  // transpose into X Y Z vectors
1668  v1 = vec_mergeh( v0, v4 );
1669  v3 = vec_mergeh( v2, v6 );
1670  v5 = vec_mergel( v0, v4 );
1671  v7 = vec_mergel( v2, v6 );
1672 
1673  vecSrcX1 = vec_mergeh( v1, v3 );
1674  vecSrcY1 = vec_mergel( v1, v3 );
1675  vecSrcZ1 = vec_mergeh( v5, v7 );
1676 
1677  // now calculate dot product
1678  vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
1679  vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
1680  vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
1681 
1682  // store results
1683  vec_st( vecSrcZ1, 0, &dst[i] );
1684  }
1685 
1686  for ( ; i < count; i++ ) {
1687  dst[i] = constant * src[i].xyz;
1688  }
1689 }
1690 
1691 #endif /* DRAWVERT_PADDED */
1692 
1693 /*
1694 ============
1695 idSIMD_AltiVec::Dot
1696 
1697  dst[i] = constant.Normal() * src[i] + constant[3];
1698 ============
1699 */
1700 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
1701 //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)] + constant[3];
1702 
1703  register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
1704  register vector float vecX, vecY, vecZ, vecX2, vecY2, vecZ2;
1705  register vector float zeroVector = (vector float)(0.0);
1706  register vector float vecConstX, vecConstY, vecConstZ;
1707  register vector float vecConst3;
1708 
1709  idVec3 constNormal = constant.Normal();
1710  float const3 = constant[3];
1711 
1712  // permute vectors
1713  register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
1714  register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
1715 
1716  register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
1717  register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
1718 
1719  register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
1720  register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
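 // added note: eight idVec3s are 24 floats, six vectors after the unaligned
 // permute. With vecLd1 = { x0 y0 z0 x1 }, vecLd2 = { y1 z1 x2 y2 } and
 // vecLd3 = { z2 x3 y3 z3 }, permX1 gathers { x0 x1 x2 junk } from the first
 // two vectors and permX2 then pulls x3 out of vecLd3, giving { x0 x1 x2 x3 };
 // the permY* and permZ* pairs build the Y and Z lane vectors the same way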
1721 
1722  int i;
1723 
1724  vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1725  vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
1726  vecLd2 = vec_ld( 15, constant.ToFloatPtr() );
1727  vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );
1728 
1729  // populate const vec
1730  vecConstX = vec_splat( vecLd1, 0 );
1731  vecConstY = vec_splat( vecLd1, 1 );
1732  vecConstZ = vec_splat( vecLd1, 2 );
1733 
1734  // put constant to add in vector
1735  vecConst3 = loadSplatUnalignedScalar( &const3 );
1736 
1737  // handle unaligned case at beginning
1738  for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1739  dst[i] = constant.Normal() * src[i] + constant[3];
1740  }
1741 
1742  const float *addr = src[0].ToFloatPtr();	// element 0, not src[i]: the loop below indexes with (i*3), so basing at src[i] would double-count i whenever the alignment loop advanced it
1743  vector unsigned char permVec = vec_add( vec_lvsl( -1, addr + (i*3) ), (vector unsigned char)(1) );
1744  vector float vecOld = vec_ld( 0, addr + (i*3) );
1745 
1746  for ( ; i+7 < count; i += 8 ) {
1747  float *vecPtr = (float*)( addr + (i*3) );
1748  vector float v0, v1, v2, v3, v4, v5;
1749 
1750  v0 = vecOld; //vec_ld( 0, vecPtr );
1751  v1 = vec_ld( 15, vecPtr );
1752  v2 = vec_ld( 31, vecPtr );
1753  v3 = vec_ld( 47, vecPtr );
1754  v4 = vec_ld( 63, vecPtr );
1755  v5 = vec_ld( 79, vecPtr );
1756  vecOld = vec_ld( 95, vecPtr );
1757 
1758  vecLd1 = vec_perm( v0, v1, permVec );
1759  vecLd2 = vec_perm( v1, v2, permVec );
1760  vecLd3 = vec_perm( v2, v3, permVec );
1761 
1762  vecLd4 = vec_perm( v3, v4, permVec );
1763  vecLd5 = vec_perm( v4, v5, permVec );
1764  vecLd6 = vec_perm( v5, vecOld, permVec );
1765 
1766  // permute into X Y Z vectors
1767  vecX = vec_perm( vecLd1, vecLd2, permX1 );
1768  vecY = vec_perm( vecLd1, vecLd2, permY1 );
1769  vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
1770  vecX = vec_perm( vecX, vecLd3, permX2 );
1771  vecY = vec_perm( vecY, vecLd3, permY2 );
1772  vecZ = vec_perm( vecZ, vecLd3, permZ2 );
1773 
1774  vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
1775  vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
1776  vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
1777  vecX2 = vec_perm( vecX2, vecLd6, permX2 );
1778  vecY2 = vec_perm( vecY2, vecLd6, permY2 );
1779  vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );
1780 
1781  // calculate dot product
1782  vecX = vec_madd( vecX, vecConstX, zeroVector );
1783  vecY = vec_madd( vecY, vecConstY, vecX );
1784  vecZ = vec_madd( vecZ, vecConstZ, vecY );
1785 
1786  vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
1787  vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
1788  vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );
1789 
1790  // add in constant[3]
1791  vecZ = vec_add( vecZ, vecConst3 );
1792  vecZ2 = vec_add( vecZ2, vecConst3 );
1793 
1794  // store out results
1795  ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
1796  }
1797 
1798  //cleanup
1799  for ( ; i < count; i++ ) {
1800  dst[i] = constNormal * src[i] + const3;
1801  }
1802 }
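
// The load idiom used above recurs throughout this file, so here it is in
// isolation (a minimal sketch, not part of the original library).
// vec_lvsl( -1, p ) plus an all-ones vector builds a permute mask that also
// works when p happens to be 16-byte aligned: the mask then selects the second
// operand entirely, and the vec_ld at offset 15 hits the same block as offset 0.
// Each aligned load is reused as the low half of the next permute, so every
// 16-byte block is loaded exactly once. Assumes out is 16-byte aligned and
// that numVecs * 4 floats are readable from p.
static void SketchLoadUnaligned( float *out, const float *p, const int numVecs ) {
	vector unsigned char perm = vec_add( vec_lvsl( -1, p ), (vector unsigned char)(1) );
	vector float lo = vec_ld( 0, p );
	for ( int v = 0; v < numVecs; v++ ) {
		vector float hi = vec_ld( 15 + ( v * 16 ), p );
		vec_st( vec_perm( lo, hi, perm ), v * 16, out );
		lo = hi;
	}
}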
1803 
1804 /*
1805 ============
1806 idSIMD_AltiVec::Dot
1807 
1808  dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
1809 ============
1810 */
1811 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
1812 //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].Normal() + constant[3] * src[(X)][3];
1813 
1814  // check plane size
1815  assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
1816 
1817  float constVal[4];
1818  float srcVal[4];
1819 
1820  int i;
1821  const float *constPtr = constant.ToFloatPtr();
1822 
1823  register vector float vecX, vecY, vecZ, vecI3;
1824  register vector float vecX2, vecY2, vecZ2, vecI32;
1825 
1826  vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
1827  vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
1828  register vector float zeroVector = (vector float)(0.0);
1829  register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
1830 
1831  constVal[0] = *(constPtr);
1832  constVal[1] = *(constPtr+1);
1833  constVal[2] = *(constPtr+2);
1834  constVal[3] = *(constPtr+3);
1835 
1836  // populate const vector
1837  vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1838  vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
1839  vector float v1 = vec_ld( 15, constant.ToFloatPtr() );
1840  vector float vecConst = vec_perm( v0, v1, constPerm );
1841 
1842  vecConstX = vec_splat( vecConst, 0 );
1843  vecConstY = vec_splat( vecConst, 1 );
1844  vecConstZ = vec_splat( vecConst, 2 );
1845  vecConstI3 = vec_splat( vecConst, 3 );
1846 
1847  // handle unaligned case at beginning
1848  for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1849  dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
1850  }
1851 
1852  const float *srcPtr = src[0].ToFloatPtr();	// element 0, not src[i]: both loops below index with (i*PLANE_OFFSET)
1853  vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr + (i*PLANE_OFFSET) ), (vector unsigned char)(1) );
1854  vector float vecOld = vec_ld( 0, srcPtr + (i*PLANE_OFFSET) );
1855 
1856  for ( ; i+7 < count; i += 8 ) {
1857  float *planePtr = (float*)( srcPtr + (i*PLANE_OFFSET) );
1858  vector float v0, v1, v2, v3, v4, v5, v6, v7;
1859 
1860  v0 = vecOld; // vec_ld( 0, planePtr );
1861  v1 = vec_ld( 15, planePtr );
1862  v2 = vec_ld( 31, planePtr );
1863  v3 = vec_ld( 47, planePtr );
1864  v4 = vec_ld( 63, planePtr );
1865  v5 = vec_ld( 79, planePtr );
1866  v6 = vec_ld( 95, planePtr );
1867  v7 = vec_ld( 111, planePtr );
1868  vecOld = vec_ld( 127, planePtr );
1869 
1870  vecPlaneLd1 = vec_perm( v0, v1, permVec );
1871  vecPlaneLd2 = vec_perm( v1, v2, permVec );
1872  vecPlaneLd3 = vec_perm( v2, v3, permVec );
1873  vecPlaneLd4 = vec_perm( v3, v4, permVec );
1874 
1875  vecPlaneLd5 = vec_perm( v4, v5, permVec );
1876  vecPlaneLd6 = vec_perm( v5, v6, permVec );
1877  vecPlaneLd7 = vec_perm( v6, v7, permVec );
1878  vecPlaneLd8 = vec_perm( v7, vecOld, permVec );
1879 
1880  // permute into X Y Z (and dist) vectors; since this is square it's basically
1881  // a 4x4 matrix transpose
1882  v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
1883  v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
1884  v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
1885  v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );
1886 
1887  vecX = vec_mergeh( v0, v1 );
1888  vecY = vec_mergel( v0, v1 );
1889  vecZ = vec_mergeh( v2, v3 );
1890  vecI3 = vec_mergel( v2, v3 );
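 // added note: with vecPlaneLdN = { xN yN zN dN }, the two merge stages
 // transpose the 4x4 block, so vecX = { x0 x1 x2 x3 }, vecY and vecZ likewise,
 // and vecI3 = { d0 d1 d2 d3 } carries the four distance terms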
1891 
1892  v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
1893  v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
1894  v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
1895  v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );
1896 
1897  vecX2 = vec_mergeh( v4, v5 );
1898  vecY2 = vec_mergel( v4, v5 );
1899  vecZ2 = vec_mergeh( v6, v7 );
1900  vecI32 = vec_mergel( v6, v7 );
1901 
1902  // do calculation
1903  v4 = vec_madd( vecConstX, vecX, zeroVector );
1904  v5 = vec_madd( vecConstY, vecY, v4 );
1905  v6 = vec_madd( vecConstZ, vecZ, v5 );
1906  v7 = vec_madd( vecConstI3, vecI3, v6 );
1907 
1908  v0 = vec_madd( vecConstX, vecX2, zeroVector );
1909  v1 = vec_madd( vecConstY, vecY2, v0 );
1910  v2 = vec_madd( vecConstZ, vecZ2, v1 );
1911  v3 = vec_madd( vecConstI3, vecI32, v2 );
1912 
1913  //store result
1914  ALIGNED_STORE2( &dst[i], v7, v3 );
1915  }
1916 
1917  // cleanup
1918  for ( ; i < count; i++ ) {
1919  //dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
1920  srcVal[0] = *(srcPtr + (i*PLANE_OFFSET) + 0 );
1921  srcVal[1] = *(srcPtr + (i*PLANE_OFFSET) + 1 );
1922  srcVal[2] = *(srcPtr + (i*PLANE_OFFSET) + 2 );
1923  srcVal[3] = *(srcPtr + (i*PLANE_OFFSET) + 3 );
1924  dst[i] = srcVal[0] * constVal[0] + srcVal[1] * constVal[1] + srcVal[2] * constVal[2] + constVal[3] * srcVal[3];
1925  }
1926 }
1927 
1928 
1929 #ifndef DRAWVERT_PADDED
1930 /*
1931 ============
1932 idSIMD_AltiVec::Dot
1933 
1934  dst[i] = constant.Normal() * src[i].xyz + constant[3];
1935 ============
1936 */
1937 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
1938 //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
1939 
1940  // idDrawVert size is 60 bytes
1941  assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
1942 
1943  int i;
1944  const float *constPtr = constant.ToFloatPtr();
1945  const float *srcPtr = src[0].xyz.ToFloatPtr();
1946 
1947  register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1948  register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
1949  register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
1950  register vector float vecDest1;
1951  register vector float zeroVector = (vector float)(0.0);
1952  vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
1953 
1954  float constVal[4];
1955  float srcVal[3];
1956 
1957  constVal[0] = *(constPtr+0);
1958  constVal[1] = *(constPtr+1);
1959  constVal[2] = *(constPtr+2);
1960  constVal[3] = *(constPtr+3);
1961 
1962  // populate const vec
1963  vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1964  v0 = vec_ld( 0, constant.ToFloatPtr() );
1965  v1 = vec_ld( 15, constant.ToFloatPtr() );
1966  v0 = vec_perm( v0, v1, constPerm );
1967 
1968  vecConstX = vec_splat( v0, 0 );
1969  vecConstY = vec_splat( v0, 1 );
1970  vecConstZ = vec_splat( v0, 2 );
1971  vecConstI3 = vec_splat( v0, 3 );
1972 
1973  // handle unaligned case at beginning
1974  for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1975  dst[i] = constant.Normal() * src[i].xyz + constant[3];
1976  }
1977 
1978  // every fourth vertex will have the same alignment, so we can compute these permute
1979  // vectors once. Make sure we have enough elements that we don't run off the end of the array
1980  if ( i+3 < count ) {
1981  vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1982  vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1983  vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1984  vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1985  }
1986 
1987  for ( ; i+3 < count; i+=4 ) {
1988  const float *vertPtr = src[i].xyz.ToFloatPtr();
1989  const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
1990  const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
1991  const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
1992 
1993  v0 = vec_ld( 0, vertPtr );
1994  v1 = vec_ld( 11, vertPtr );
1995  v2 = vec_ld( 0, vertPtr2 );
1996  v3 = vec_ld( 11, vertPtr2 );
1997  v4 = vec_ld( 0, vertPtr3 );
1998  v5 = vec_ld( 11, vertPtr3 );
1999  v6 = vec_ld( 0, vertPtr4 );
2000  v7 = vec_ld( 11, vertPtr4 );
2001 
2002  v0 = vec_perm( v0, v1, vertPerm1 );
2003  v2 = vec_perm( v2, v3, vertPerm2 );
2004  v4 = vec_perm( v4, v5, vertPerm3 );
2005  v6 = vec_perm( v6, v7, vertPerm4 );
2006 
2007  // transpose into X Y Z vectors
2008  v1 = vec_mergeh( v0, v4 );
2009  v3 = vec_mergeh( v2, v6 );
2010  v5 = vec_mergel( v0, v4 );
2011  v7 = vec_mergel( v2, v6 );
2012 
2013  vecSrcX1 = vec_mergeh( v1, v3 );
2014  vecSrcY1 = vec_mergel( v1, v3 );
2015  vecSrcZ1 = vec_mergeh( v5, v7 );
2016 
2017  // now calculate dot product
2018  vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
2019  vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
2020  vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
2021  vecDest1 = vec_add( vecSrcZ1, vecConstI3 );
2022 
2023  // store results
2024  vec_st( vecDest1, 0, &dst[i] );
2025  }
2026 
2027  // cleanup
2028  for ( ; i < count; i++ ) {
2029  srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
2030  srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
2031  srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
2032  // dst[i] = constant.Normal() * src[i].xyz + constant[3];
2033 
2034  dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
2035  dst[i] += constVal[3];
2036  }
2037 }
2038 #else
2039 /*
2040 ============
2041 idSIMD_AltiVec::Dot
2042 
2043  dst[i] = constant.Normal() * src[i].xyz + constant[3];
2044 ============
2045 */
2046 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
2047 //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
2048 
2049  // idDrawVert size is 64 bytes when padded
2050  assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
2051 
2052  int i;
2053  const float *constPtr = constant.ToFloatPtr();
2054  const float *srcPtr = src[0].xyz.ToFloatPtr();
2055 
2056  register vector float v0, v1, v2, v3, v4, v5, v6, v7;
2057  register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
2058  register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
2059  register vector float vecDest1;
2060  register vector float zeroVector = (vector float)(0.0);
2061  vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
2062 
2063  float constVal[4];
2064  float srcVal[3];
2065 
2066  constVal[0] = *(constPtr+0);
2067  constVal[1] = *(constPtr+1);
2068  constVal[2] = *(constPtr+2);
2069  constVal[3] = *(constPtr+3);
2070 
2071  // populate const vec
2072  vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
2073  v0 = vec_ld( 0, constant.ToFloatPtr() );
2074  v1 = vec_ld( 15, constant.ToFloatPtr() );
2075  v0 = vec_perm( v0, v1, constPerm );
2076 
2077  vecConstX = vec_splat( v0, 0 );
2078  vecConstY = vec_splat( v0, 1 );
2079  vecConstZ = vec_splat( v0, 2 );
2080  vecConstI3 = vec_splat( v0, 3 );
2081 
2082  // handle unaligned case at beginning
2083  for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
2084  dst[i] = constant.Normal() * src[i].xyz + constant[3];
2085  }
2086 
2087  for ( ; i+3 < count; i+=4 ) {
2088  const float *vertPtr = src[i].xyz.ToFloatPtr();
2089  const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
2090  const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
2091  const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
2092 
2093  v0 = vec_ld( 0, vertPtr );
2094  v2 = vec_ld( 0, vertPtr2 );
2095  v4 = vec_ld( 0, vertPtr3 );
2096  v6 = vec_ld( 0, vertPtr4 );
2097 
2098  // transpose into X Y Z vectors
2099  v1 = vec_mergeh( v0, v4 );
2100  v3 = vec_mergeh( v2, v6 );
2101  v5 = vec_mergel( v0, v4 );
2102  v7 = vec_mergel( v2, v6 );
2103 
2104  vecSrcX1 = vec_mergeh( v1, v3 );
2105  vecSrcY1 = vec_mergel( v1, v3 );
2106  vecSrcZ1 = vec_mergeh( v5, v7 );
2107 
2108  // now calculate dot product
2109  vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
2110  vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
2111  vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
2112  vecDest1 = vec_add( vecSrcZ1, vecConstI3 );
2113 
2114  // store results
2115  vec_st( vecDest1, 0, &dst[i] );
2116  }
2117 
2118  // cleanup
2119  for ( ; i < count; i++ ) {
2120  srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
2121  srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
2122  srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
2123  // dst[i] = constant.Normal() * src[i].xyz + constant[3];
2124 
2125  dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
2126  dst[i] += constVal[3];
2127  }
2128 }
2129 
2130 #endif /* DRAWVERT_PADDED */
2131 
2132 /*
2133 ============
2134 idSIMD_AltiVec::Dot
2135 
2136  dst[i] = src0[i] * src1[i];
2137 ============
2138 */
2139 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
2140 //#define OPER(X) dst[(X)] = src0[(X)] * src1[(X)];
2141 
2142  int i;
2143  float src0Val[3];
2144  float src1Val[3];
2145 
2146  register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
2147  vector float vecLd7, vecLd8, vecLd9, vecLd10, vecLd11, vecLd12;
2148  register vector float vecX0, vecY0, vecZ0, vecX1, vecY1, vecZ1;
2149  register vector float vecX02, vecY02, vecZ02, vecX12, vecY12, vecZ12;
2150  register vector float zeroVector = (vector float)(0.0);
2151  // permute vectors
2152  register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
2153  register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
2154  register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
2155  register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
2156  register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
2157  register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
2158 
2159  // handle unaligned case at beginning
2160  for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
2161  dst[i] = src0[i] * src1[i];
2162  }
2163 
2164  const float *src0Ptr = src0[0].ToFloatPtr();	// element 0, not src0[i]: the loops below index with (i*3)
2165  const float *src1Ptr = src1[0].ToFloatPtr();
2166  vector unsigned char permVec1 = vec_add( vec_lvsl( -1, src0Ptr + (i*3) ), (vector unsigned char)(1) );
2167  vector unsigned char permVec2 = vec_add( vec_lvsl( -1, src1Ptr + (i*3) ), (vector unsigned char)(1) );
2168  vector float vecOld0 = vec_ld( 0, src0Ptr + (i*3) );
2169  vector float vecOld1 = vec_ld( 0, src1Ptr + (i*3) );
2170 
2171  for ( ; i+7 < count; i += 8 ) {	// continue from i: resetting it to 0 here would redo elements and break the dst alignment established above
2172  float *s0Ptr = (float*)( src0Ptr + (i*3) );
2173  float *s1Ptr = (float*)( src1Ptr + (i*3) );
2174 
2175  vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
2176  v0 = vecOld0;
2177  v1 = vec_ld( 15, s0Ptr );
2178  v2 = vec_ld( 31, s0Ptr );
2179  v3 = vec_ld( 47, s0Ptr );
2180  v4 = vec_ld( 63, s0Ptr );
2181  v5 = vec_ld( 79, s0Ptr );
2182  vecOld0 = vec_ld( 95, s0Ptr );
2183 
2184  v6 = vecOld1;
2185  v7 = vec_ld( 15, s1Ptr );
2186  v8 = vec_ld( 31, s1Ptr );
2187  v9 = vec_ld( 47, s1Ptr );
2188  v10 = vec_ld( 63, s1Ptr );
2189  v11 = vec_ld( 79, s1Ptr );
2190  vecOld1 = vec_ld( 95, s1Ptr );
2191 
2192  vecLd1 = vec_perm( v0, v1, permVec1 );
2193  vecLd2 = vec_perm( v1, v2, permVec1 );
2194  vecLd3 = vec_perm( v2, v3, permVec1 );
2195  vecLd4 = vec_perm( v3, v4, permVec1 );
2196  vecLd5 = vec_perm( v4, v5, permVec1 );
2197  vecLd6 = vec_perm( v5, vecOld0, permVec1 );
2198 
2199  vecLd7 = vec_perm( v6, v7, permVec2 );
2200  vecLd8 = vec_perm( v7, v8, permVec2 );
2201  vecLd9 = vec_perm( v8, v9, permVec2 );
2202  vecLd10 = vec_perm( v9, v10, permVec2 );
2203  vecLd11 = vec_perm( v10, v11, permVec2 );
2204  vecLd12 = vec_perm( v11, vecOld1, permVec2 );
2205 
2206  // permute into X Y Z vectors
2207  vecX0 = vec_perm( vecLd1, vecLd2, permX1 );
2208  vecY0 = vec_perm( vecLd1, vecLd2, permY1 );
2209  vecZ0 = vec_perm( vecLd1, vecLd2, permZ1 );
2210  vecX0 = vec_perm( vecX0, vecLd3, permX2 );
2211  vecY0 = vec_perm( vecY0, vecLd3, permY2 );
2212  vecZ0 = vec_perm( vecZ0, vecLd3, permZ2 );
2213 
2214  vecX02 = vec_perm( vecLd4, vecLd5, permX1 );
2215  vecY02 = vec_perm( vecLd4, vecLd5, permY1 );
2216  vecZ02 = vec_perm( vecLd4, vecLd5, permZ1 );
2217  vecX02 = vec_perm( vecX02, vecLd6, permX2 );
2218  vecY02 = vec_perm( vecY02, vecLd6, permY2 );
2219  vecZ02 = vec_perm( vecZ02, vecLd6, permZ2 );
2220 
2221  vecX1 = vec_perm( vecLd7, vecLd8, permX1 );
2222  vecY1 = vec_perm( vecLd7, vecLd8, permY1 );
2223  vecZ1 = vec_perm( vecLd7, vecLd8, permZ1 );
2224  vecX1 = vec_perm( vecX1, vecLd9, permX2 );
2225  vecY1 = vec_perm( vecY1, vecLd9, permY2 );
2226  vecZ1 = vec_perm( vecZ1, vecLd9, permZ2 );
2227 
2228  vecX12 = vec_perm( vecLd10, vecLd11, permX1 );
2229  vecY12 = vec_perm( vecLd10, vecLd11, permY1 );
2230  vecZ12 = vec_perm( vecLd10, vecLd11, permZ1 );
2231  vecX12 = vec_perm( vecX12, vecLd12, permX2 );
2232  vecY12 = vec_perm( vecY12, vecLd12, permY2 );
2233  vecZ12 = vec_perm( vecZ12, vecLd12, permZ2 );
2234 
2235  // do multiply
2236  vecX0 = vec_madd( vecX0, vecX1, zeroVector );
2237  vecY0 = vec_madd( vecY0, vecY1, vecX0 );
2238  vecZ0 = vec_madd( vecZ0, vecZ1, vecY0 );
2239  vecX02 = vec_madd( vecX02, vecX12, zeroVector );
2240  vecY02 = vec_madd( vecY02, vecY12, vecX02 );
2241  vecZ02 = vec_madd( vecZ02, vecZ12, vecY02 );
2242 
2243  // store out results
2244  ALIGNED_STORE2( &dst[i], vecZ0, vecZ02 );
2245  }
2246 
2247  // cleanup
2248  for ( ; i < count; i++ ) {
2249  // dst[i] = src0[i] * src1[i];
2250  src0Val[0] = *( src0Ptr + (i*3) + 0 );
2251  src0Val[1] = *( src0Ptr + (i*3) + 1 );
2252  src0Val[2] = *( src0Ptr + (i*3) + 2 );
2253 
2254  src1Val[0] = *( src1Ptr + (i*3) + 0 );
2255  src1Val[1] = *( src1Ptr + (i*3) + 1 );
2256  src1Val[2] = *( src1Ptr + (i*3) + 2 );
2257 
2258  dst[i] = src0Val[0] * src1Val[0] + src0Val[1] * src1Val[1] + src0Val[2] * src1Val[2];
2259  }
2260 }
2261 
2262 /*
2263 ============
2264 idSIMD_AltiVec::Dot
2265 
2266  dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
2267 ============
2268 */
2269 void VPCALL idSIMD_AltiVec::Dot( float &dot, const float *src1, const float *src2, const int count ) {
2270  dot = 0.0f;
2271 
2272  register vector float v0, v1, v2, v3;
2273  register vector float zeroVector;
2274  register vector float runningTotal1, runningTotal2;
2275  //src1
2276  register vector float v0_low, v0_hi, v2_low, v2_hi;
2277  //src2
2278  register vector float v1_low, v1_hi, v3_low, v3_hi;
2279  //permute vectors
2280  register vector unsigned char permVec1, permVec2;
2281  vector unsigned char oneCharVector = (vector unsigned char)(1);
2282 
2283  int i = 0;
2284 
2285  runningTotal1 = (vector float)(0.0);
2286  runningTotal2 = (vector float)(0.0);
2287  zeroVector = (vector float)(0.0);
2288 
2289  if ( count >= 8 ) {
2290  //calculate permute and do loads
2291  permVec1 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
2292  permVec2 = vec_add( vec_lvsl( -1, (int*) &src2[i] ), oneCharVector );
2293  v2_hi = vec_ld( 0, &src1[i] );
2294  v3_hi = vec_ld( 0, &src2[i] );
2295 
2296  //vectorize!
2297  for ( ; i+7 < count; i += 8 ) {
2298  //load sources
2299  v0_low = v2_hi;
2300  v0_hi = vec_ld( 15, &src1[i] );
2301  v2_low = v0_hi;
2302  v2_hi = vec_ld( 31, &src1[i] );
2303 
2304  v1_low = v3_hi;
2305  v1_hi = vec_ld( 15, &src2[i] );
2306  v3_low = v1_hi;
2307  v3_hi = vec_ld( 31, &src2[i] );
2308 
2309  v0 = vec_perm( v0_low, v0_hi, permVec1 );
2310  v1 = vec_perm( v1_low, v1_hi, permVec2 );
2311  v2 = vec_perm( v2_low, v2_hi, permVec1 );
2312  v3 = vec_perm( v3_low, v3_hi, permVec2 );
2313 
2314  //multiply together and keep running sum
2315  runningTotal1 = vec_madd( v0, v1, runningTotal1 );
2316  runningTotal2 = vec_madd( v2, v3, runningTotal2 );
2317  }
2318 
2319  runningTotal1 = vec_add( runningTotal1, runningTotal2 );
2320 
2321  // sum across vector: vec_sld rotates by 8 and then 4 bytes, so the two adds fold all four lanes into one total
2322  v0 = vec_add( runningTotal1, vec_sld( runningTotal1, runningTotal1, 8 ) );
2323  v1 = vec_add( v0, vec_sld( v0, v0, 4 ) );
2324  runningTotal1 = vec_splat( v1, 0 );
2325  vec_ste( runningTotal1, 0, &dot );
2326  }
2327 
2328  //handle cleanup. When profiling the game, we found that most of the calls to this function use small
2329  // counts, so it spends a lot of time in this scalar code. The scalar code is already very fast (eg one
2330  // timebase tick) for counts less than 50, so there is not much point in trying to vectorize it further.
2331  for ( ; i < count ; i++ ) {
2332  dot += src1[i] * src2[i];
2333  }
2334 
2335 }
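
// A quick self-check for the routine above (a sketch, not in the original
// source); the inputs are small integers so the result is exact in float.
// Assumes idSIMD_AltiVec can simply be instantiated here.
static bool SketchCheckDot( void ) {
	float a[100], b[100], dot = 0.0f;
	for ( int k = 0; k < 100; k++ ) {
		a[k] = (float) k;
		b[k] = 2.0f;
	}
	idSIMD_AltiVec simd;
	simd.Dot( dot, a, b, 100 );
	return dot == 2.0f * 4950.0f;	// 2 * ( 0 + 1 + ... + 99 )
}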
2336 #endif /* ENABLE_DOT */
2337 
2338 #ifdef ENABLE_COMPARES
2339 
2340 /*
2341 ============
2342 idSIMD_AltiVec::CmpGT
2343 
2344  dst[i] = src0[i] > constant;
2345 ============
2346 */
2347 
2348 void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
2349 //#define OPER(X) dst[(X)] = src0[(X)] > constant;
2350 
2351  register vector float v0, v1, v2, v3;
2352  register vector bool int vr1, vr2, vr3, vr4;
2353  register vector bool short vs1, vs2;
2354  register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
2355  register vector unsigned char vc1;
2356  register vector bool char vbc1;
2357  register vector float constVec;
2358  register vector unsigned char oneVector = (vector unsigned char)(1);
2359  register vector unsigned char permVec;
2360  int i;
2361 
2362  //handle unaligned at start
2363  for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2364  dst[i] = src0[i] > constant;
2365  }
2366 
2367  //splat constant into a vector
2368  constVec = loadSplatUnalignedScalar( &constant );
2369 
2370  //calculate permute and do loads
2371  permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2372  v3_hi = vec_ld( 0, &src0[i] );
2373 
2374  //vectorize!
2375  for ( ; i+15 < count; i += 16 ) {
2376  // load values
2377  v0_low = v3_hi;
2378  v0_hi = vec_ld( 15, &src0[i] );
2379  v1_low = v0_hi;
2380  v1_hi = vec_ld( 31, &src0[i] );
2381  v2_low = v1_hi;
2382  v2_hi = vec_ld( 47, &src0[i] );
2383  v3_low = v2_hi;
2384  v3_hi = vec_ld( 63, &src0[i] );
2385 
2386  //permute into the vectors we want
2387  v0 = vec_perm( v0_low, v0_hi, permVec );
2388  v1 = vec_perm( v1_low, v1_hi, permVec );
2389  v2 = vec_perm( v2_low, v2_hi, permVec );
2390  v3 = vec_perm( v3_low, v3_hi, permVec );
2391 
2392  //do comparison
2393  vr1 = vec_cmpgt( v0, constVec );
2394  vr2 = vec_cmpgt( v1, constVec );
2395  vr3 = vec_cmpgt( v2, constVec );
2396  vr4 = vec_cmpgt( v3, constVec );
2397 
2398  // pack results into shorts
2399  vs1 = vec_pack(vr1, vr2);
2400  vs2 = vec_pack(vr3, vr4);
2401 
2402  // pack results into byte
2403  vbc1 = vec_pack(vs1, vs2);
2404 
2405  //AND with 1 to get true=1 not true=255
2406  vc1 = vec_and( vbc1, oneVector );
2407 
2408  //store results
2409  vec_st( vc1, 0, &dst[i] );
2410  }
2411 
2412  //handle cleanup
2413  for ( ; i < count ; i++ ) {
2414  dst[i] = src0[i] > constant;
2415  }
2416 }
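
// The narrowing above in scalar form (a sketch, not original code). Each
// vec_cmpgt lane is 0x00000000 or 0xFFFFFFFF; two vec_pack steps narrow
// four int vectors to two short vectors and then one char vector, and the
// final AND maps 0xFF to the byte value 1 that this scalar version produces.
static void SketchCmpGTScalar( byte *dst, const float *src0, const float constant, const int count ) {
	for ( int k = 0; k < count; k++ ) {
		dst[k] = ( src0[k] > constant ) ? 1 : 0;
	}
}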
2417 
2418 
2419 /*
2420 ============
2421 idSIMD_AltiVec::CmpGT
2422 
2423  dst[i] |= ( src0[i] > constant ) << bitNum;
2424 ============
2425 */
2426 void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
2427 //#define OPER(X) dst[(X)] |= ( src0[(X)] > constant ) << bitNum;
2428 
2429  // Temp vector registers
2430  register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
2431  register vector bool short vtbs0, vtbs1;
2432  register vector bool char vtbc0;
2433  register vector unsigned char vtuc0;
2434  register vector unsigned char permVec, permVec2;
2435 
2436  // dest vectors
2437  register vector unsigned char vd;
2438  // bitNum vectors
2439  register vector unsigned char bitNumVec;
2440  // src0 vectors
2441  register vector float vs0, vs1, vs2, vs3;
2442  register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
2443  // constant vector
2444  register vector float constVec;
2445  // all ones
2446  register vector unsigned char oneVector = (vector unsigned char)(1);
2447  int i = 0;
2448 
2449  //handle unaligned at start
2450  for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2451  dst[i] |= ( src0[i] > constant ) << bitNum;
2452  }
2453 
2454  //splat constant into a vector
2455  constVec = loadSplatUnalignedScalar( &constant );
2456 
2457  // bitNum is a byte at an arbitrary (unaligned) address: load its 16-byte block, move it to element 0, and splat it to use as a vector shift count
2458  permVec2 = vec_lvsl( 0, &bitNum );
2459  vtuc0 = vec_ld( 0, &bitNum );
2460  bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
2461  bitNumVec = vec_splat( bitNumVec, 0 );
2462 
2463  //calculate permute and do loads
2464  permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2465  vs3_hi = vec_ld( 0, &src0[i] );
2466 
2467  //vectorize!
2468  for ( ; i+15 < count; i += 16 ) {
2469  //load sources (floats)
2470  vs0_low = vs3_hi;
2471  vs0_hi = vec_ld( 15, &src0[i] );
2472  vs1_low = vs0_hi;
2473  vs1_hi = vec_ld( 31, &src0[i] );
2474  vs2_low = vs1_hi;
2475  vs2_hi = vec_ld( 47, &src0[i] );
2476  vs3_low = vs2_hi;
2477  vs3_hi = vec_ld( 63, &src0[i] );
2478 
2479  //permute into the vectors we want
2480  vs0 = vec_perm( vs0_low, vs0_hi, permVec );
2481  vs1 = vec_perm( vs1_low, vs1_hi, permVec );
2482  vs2 = vec_perm( vs2_low, vs2_hi, permVec );
2483  vs3 = vec_perm( vs3_low, vs3_hi, permVec );
2484 
2485  //load dest (bytes) as unsigned char
2486  vd = vec_ld( 0, &dst[i] );
2487 
2488  // do comparison and get bool int result
2489  vtbi0 = vec_cmpgt( vs0, constVec );
2490  vtbi1 = vec_cmpgt( vs1, constVec );
2491  vtbi2 = vec_cmpgt( vs2, constVec );
2492  vtbi3 = vec_cmpgt( vs3, constVec );
2493 
2494  // pack results into shorts
2495  vtbs0 = vec_pack(vtbi0, vtbi1);
2496  vtbs1 = vec_pack(vtbi2, vtbi3);
2497 
2498  // pack results into byte
2499  vtbc0 = vec_pack(vtbs0, vtbs1);
2500 
2501  //and with 1 to get true=1 instead of true=255
2502  vtuc0 = vec_and(vtbc0, oneVector);
2503  vtuc0 = vec_sl(vtuc0, bitNumVec );
2504 
2505  //or with original
2506  vd = vec_or( vd, vtuc0 );
2507 
2508  vec_st( vd, 0, &dst[i] );
2509  }
2510 
2511  //handle cleanup
2512  for ( ; i < count ; i++ ) {
2513  dst[i] |= ( src0[i] > constant ) << bitNum;
2514  }
2515 }
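
// Scalar equivalent of the masked form above (a sketch, not original code):
// each compare result is shifted to bit position bitNum and OR'd into the
// destination, so several compares can accumulate into one bitmask byte.
static void SketchCmpGTBitScalar( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	for ( int k = 0; k < count; k++ ) {
		dst[k] |= ( byte )( ( src0[k] > constant ) << bitNum );
	}
}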
2516 
2517 /*
2518 ============
2519 idSIMD_AltiVec::CmpGE
2520 
2521  dst[i] = src0[i] >= constant;
2522 ============
2523 */
2524 void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
2525 
2526  register vector float v0, v1, v2, v3;
2527  register vector bool int vr1, vr2, vr3, vr4;
2528  register vector bool short vs1, vs2;
2529  register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
2530  register vector unsigned char vc1;
2531  register vector bool char vbc1;
2532  register vector float constVec;
2533  register vector unsigned char oneVector = (vector unsigned char)(1);
2534  register vector unsigned char permVec;
2535  int i = 0;
2536 
2537  //handle unaligned at start
2538  for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2539  dst[i] = src0[i] >= constant;
2540  }
2541 
2542  //splat constant into a vector
2543  constVec = loadSplatUnalignedScalar( &constant );
2544 
2545  //calculate permute and do loads
2546  permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2547  v3_hi = vec_ld( 0, &src0[i] );
2548 
2549  //vectorize!
2550  for ( ; i+15 < count; i += 16 ) {
2551  // load values
2552  v0_low = v3_hi;
2553  v0_hi = vec_ld( 15, &src0[i] );
2554  v1_low = v0_hi;
2555  v1_hi = vec_ld( 31, &src0[i] );
2556  v2_low = v1_hi;
2557  v2_hi = vec_ld( 47, &src0[i] );
2558  v3_low = v2_hi;
2559  v3_hi = vec_ld( 63, &src0[i] );
2560 
2561  //permute into the vectors we want
2562  v0 = vec_perm( v0_low, v0_hi, permVec );
2563  v1 = vec_perm( v1_low, v1_hi, permVec );
2564  v2 = vec_perm( v2_low, v2_hi, permVec );
2565  v3 = vec_perm( v3_low, v3_hi, permVec );
2566 
2567  //do comparison
2568  vr1 = vec_cmpge( v0, constVec );
2569  vr2 = vec_cmpge( v1, constVec );
2570  vr3 = vec_cmpge( v2, constVec );
2571  vr4 = vec_cmpge( v3, constVec );
2572 
2573  // pack results into shorts
2574  vs1 = vec_pack(vr1, vr2);
2575  vs2 = vec_pack(vr3, vr4);
2576 
2577  // pack results into byte
2578  vbc1 = vec_pack(vs1, vs2);
2579 
2580  //AND with 1 to get true=1 not true=255
2581  vc1 = vec_and( vbc1, oneVector );
2582 
2583  //store results
2584  vec_st( vc1, 0, &dst[i] );
2585  }
2586 
2587  //handle cleanup
2588  for ( ; i < count ; i++ ) {
2589  dst[i] = src0[i] >= constant;
2590  }
2591 }
2592 
2593 /*
2594 ============
2595 idSIMD_AltiVec::CmpGE
2596 
2597  dst[i] |= ( src0[i] >= constant ) << bitNum;
2598 ============
2599 */
2600 void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
2601  register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
2602  register vector bool short vtbs0, vtbs1;
2603  register vector bool char vtbc0;
2604  register vector unsigned char vtuc0;
2605  register vector unsigned char permVec, permVec2;
2606 
2607  // dest vectors
2608  register vector unsigned char vd;
2609  // bitNum vectors
2610  register vector unsigned char bitNumVec;
2611  // src0 vectors
2612  register vector float vs0, vs1, vs2, vs3;
2613  register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
2614  // constant vector
2615  register vector float constVec;
2616  // all ones
2617  register vector unsigned char oneVector = (vector unsigned char)(1);
2618  int i = 0;
2619 
2620  //handle unaligned at start
2621  for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2622  dst[i] |= ( src0[i] >= constant ) << bitNum;
2623  }
2624 
2625  //splat constant into a vector
2626  constVec = loadSplatUnalignedScalar( &constant );
2627 
2628  //bitNum is unaligned.
2629  permVec2 = vec_lvsl( 0, &bitNum );
2630  vtuc0 = vec_ld( 0, &bitNum );
2631  bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
2632  bitNumVec = vec_splat( bitNumVec, 0 );
2633 
2634  //calculate permute and do loads
2635  permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2636  vs3_hi = vec_ld( 0, &src0[i] );
2637 
2638  //vectorize!
2639  for ( ; i+15 < count; i += 16 ) {
2640  //load sources (floats)
2641  vs0_low = vs3_hi;
2642  vs0_hi = vec_ld( 15, &src0[i] );
2643  vs1_low = vs0_hi;
2644  vs1_hi = vec_ld( 31, &src0[i] );
2645  vs2_low = vs1_hi;
2646  vs2_hi = vec_ld( 47, &src0[i] );
2647  vs3_low = vs2_hi;
2648  vs3_hi = vec_ld( 63, &src0[i] );
2649 
2650  //permute into the vectors we want
2651  vs0 = vec_perm( vs0_low, vs0_hi, permVec );
2652  vs1 = vec_perm( vs1_low, vs1_hi, permVec );
2653  vs2 = vec_perm( vs2_low, vs2_hi, permVec );
2654  vs3 = vec_perm( vs3_low, vs3_hi, permVec );
2655 
2656  //load dest (bytes) as unsigned char
2657  vd = vec_ld( 0, &dst[i] );
2658 
2659  // do comparison and get bool int result
2660  vtbi0 = vec_cmpge( vs0, constVec );
2661  vtbi1 = vec_cmpge( vs1, constVec );
2662  vtbi2 = vec_cmpge( vs2, constVec );
2663  vtbi3 = vec_cmpge( vs3, constVec );
2664 
2665  // pack results into shorts
2666  vtbs0 = vec_pack(vtbi0, vtbi1);
2667  vtbs1 = vec_pack(vtbi2, vtbi3);
2668 
2669  // pack results into byte
2670  vtbc0 = vec_pack(vtbs0, vtbs1);
2671 
2672  //and with 1 to get true=1 instead of true=255
2673  vtuc0 = vec_and(vtbc0, oneVector);
2674  vtuc0 = vec_sl(vtuc0, bitNumVec );
2675 
2676  //or with original
2677  vd = vec_or( vd, vtuc0 );
2678 
2679  vec_st( vd, 0, &dst[i] );
2680  }
2681 
2682  //handle cleanup
2683  for ( ; i < count ; i++ ) {
2684  dst[i] |= ( src0[i] >= constant ) << bitNum;
2685  }
2686 }
2687 
2688 
2689 /*
2690 ============
2691 idSIMD_AltiVec::CmpLT
2692 
2693  dst[i] = src0[i] < constant;
2694 ============
2695 */
2696 void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
2697 //#define OPER(X) dst[(X)] = src0[(X)] < constant;
2698  register vector float v0, v1, v2, v3;
2699  register vector bool int vr1, vr2, vr3, vr4;
2700  register vector bool short vs1, vs2;
2701  register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
2702  register vector unsigned char vc1;
2703  register vector bool char vbc1;
2704  register vector float constVec;
2705  register vector unsigned char oneVector = (vector unsigned char)(1);
2706  register vector unsigned char permVec;
2707  int i = 0;
2708 
2709  //handle unaligned at start
2710  for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2711  dst[i] = src0[i] < constant;
2712  }
2713 
2714  //splat constant into a vector
2715  constVec = loadSplatUnalignedScalar( &constant );
2716 
2717  //calculate permute and do loads
2718  permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2719  v3_hi = vec_ld( 0, &src0[i] );
2720 
2721  //vectorize!
2722  for ( ; i+15 < count; i += 16 ) {
2723  // load values
2724  v0_low = v3_hi;
2725  v0_hi = vec_ld( 15, &src0[i] );
2726  v1_low = v0_hi;
2727  v1_hi = vec_ld( 31, &src0[i] );
2728  v2_low = v1_hi;
2729  v2_hi = vec_ld( 47, &src0[i] );
2730  v3_low = v2_hi;
2731  v3_hi = vec_ld( 63, &src0[i] );
2732 
2733  //permute into the vectors we want
2734  v0 = vec_perm( v0_low, v0_hi, permVec );
2735  v1 = vec_perm( v1_low, v1_hi, permVec );
2736  v2 = vec_perm( v2_low, v2_hi, permVec );
2737  v3 = vec_perm( v3_low, v3_hi, permVec );
2738 
2739  //do comparison
2740  vr1 = vec_cmplt( v0, constVec );
2741  vr2 = vec_cmplt( v1, constVec );
2742  vr3 = vec_cmplt( v2, constVec );
2743  vr4 = vec_cmplt( v3, constVec );
2744 
2745  // pack results into shorts
2746  vs1 = vec_pack(vr1, vr2);
2747  vs2 = vec_pack(vr3, vr4);
2748 
2749  // pack results into byte
2750  vbc1 = vec_pack(vs1, vs2);
2751 
2752  //AND with 1 to get true=1 not true=255
2753  vc1 = vec_and( vbc1, oneVector );
2754 
2755  //store results
2756  vec_st( vc1, 0, &dst[i] );
2757  }
2758 
2759  //handle cleanup
2760  for ( ; i < count ; i++ ) {
2761  dst[i] = src0[i] < constant;
2762  }
2763 }
2764 
2765 /*
2766 ============
2767 idSIMD_AltiVec::CmpLT
2768 
2769  dst[i] |= ( src0[i] < constant ) << bitNum;
2770 ============
2771 */
2772 void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
2773 //#define OPER(X) dst[(X)] |= ( src0[(X)] < constant ) << bitNum;
2774  register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
2775  register vector bool short vtbs0, vtbs1;
2776  register vector bool char vtbc0;
2777  register vector unsigned char vtuc0;
2778  register vector unsigned char permVec, permVec2;
2779 
2780  // dest vectors
2781  register vector unsigned char vd;
2782  // bitNum vectors
2783  register vector unsigned char bitNumVec;
2784  // src0 vectors
2785  register vector float vs0, vs1, vs2, vs3;
2786  register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
2787  // constant vector
2788  register vector float constVec;
2789  // all ones
2790  register vector unsigned char oneVector = (vector unsigned char)(1);
2791  int i = 0;
2792 
2793  //handle unaligned at start
2794  for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2795  dst[i] |= ( src0[i] < constant ) << bitNum;
2796  }
2797 
2798  //splat constant into a vector
2799  constVec = loadSplatUnalignedScalar( &constant );
2800 
2801  //bitNum is unaligned.
2802  permVec2 = vec_lvsl( 0, &bitNum );
2803  vtuc0 = vec_ld( 0, &bitNum );
2804  bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
2805  bitNumVec = vec_splat( bitNumVec, 0 );
2806 
2807  //calculate permute and do loads
2808  permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2809  vs3_hi = vec_ld( 0, &src0[i] );
2810 
2811  //vectorize!
2812  for ( ; i+15 < count; i += 16 ) {
2813  //load sources (floats)
2814  vs0_low = vs3_hi;
2815  vs0_hi = vec_ld( 15, &src0[i] );
2816  vs1_low = vs0_hi;
2817  vs1_hi = vec_ld( 31, &src0[i] );
2818  vs2_low = vs1_hi;
2819  vs2_hi = vec_ld( 47, &src0[i] );
2820  vs3_low = vs2_hi;
2821  vs3_hi = vec_ld( 63, &src0[i] );
2822 
2823  //permute into the vectors we want
2824  vs0 = vec_perm( vs0_low, vs0_hi, permVec );
2825  vs1 = vec_perm( vs1_low, vs1_hi, permVec );
2826  vs2 = vec_perm( vs2_low, vs2_hi, permVec );
2827  vs3 = vec_perm( vs3_low, vs3_hi, permVec );
2828 
2829  //load dest (bytes) as unsigned char
2830  vd = vec_ld( 0, &dst[i] );
2831 
2832  // do comparison and get bool int result
2833  vtbi0 = vec_cmplt( vs0, constVec );
2834  vtbi1 = vec_cmplt( vs1, constVec );
2835  vtbi2 = vec_cmplt( vs2, constVec );
2836  vtbi3 = vec_cmplt( vs3, constVec );
2837 
2838  // pack results into shorts
2839  vtbs0 = vec_pack(vtbi0, vtbi1);
2840  vtbs1 = vec_pack(vtbi2, vtbi3);
2841 
2842  // pack results into byte
2843  vtbc0 = vec_pack(vtbs0, vtbs1);
2844 
2845  //and with 1 to get true=1 instead of true=255
2846  vtuc0 = vec_and(vtbc0, oneVector);
2847  vtuc0 = vec_sl(vtuc0, bitNumVec );
2848 
2849  //or with original
2850  vd = vec_or( vd, vtuc0 );
2851 
2852  vec_st( vd, 0, &dst[i] );
2853  }
2854 
2855  //handle cleanup
2856  for ( ; i < count ; i++ ) {
2857  dst[i] |= ( src0[i] < constant ) << bitNum;
2858  }
2859 
2860 }
2862 
2863 /*
2864 ============
2865 idSIMD_AltiVec::CmpLE
2866 
2867  dst[i] = src0[i] <= constant;
2868 ============
2869 */
2870 void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
2871 //#define OPER(X) dst[(X)] = src0[(X)] <= constant;
2872  register vector float v0, v1, v2, v3;
2873  register vector bool int vr1, vr2, vr3, vr4;
2874  register vector bool short vs1, vs2;
2875  register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
2876  register vector unsigned char vc1;
2877  register vector bool char vbc1;
2878  register vector float constVec;
2879  register vector unsigned char oneVector = (vector unsigned char)(1);
2880  register vector unsigned char permVec;
2881  int i = 0;
2882 
2883  //handle unaligned at start
2884  for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2885  dst[i] = src0[i] <= constant;
2886  }
2887 
2888  //splat constant into a vector
2889  constVec = loadSplatUnalignedScalar( &constant );
2890 
2891  //calculate permute and do loads
2892  permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2893  v3_hi = vec_ld( 0, &src0[i] );
2894 
2895  //vectorize!
2896  for ( ; i+15 < count; i += 16 ) {
2897  // load values
2898  v0_low = v3_hi;
2899  v0_hi = vec_ld( 15, &src0[i] );
2900  v1_low = v0_hi;
2901  v1_hi = vec_ld( 31, &src0[i] );
2902  v2_low = v1_hi;
2903  v2_hi = vec_ld( 47, &src0[i] );
2904  v3_low = v2_hi;
2905  v3_hi = vec_ld( 63, &src0[i] );
2906 
2907  //permute into the vectors we want
2908  v0 = vec_perm( v0_low, v0_hi, permVec );
2909  v1 = vec_perm( v1_low, v1_hi, permVec );
2910  v2 = vec_perm( v2_low, v2_hi, permVec );
2911  v3 = vec_perm( v3_low, v3_hi, permVec );
2912 
2913  //do comparison
2914  vr1 = vec_cmple( v0, constVec );
2915  vr2 = vec_cmple( v1, constVec );
2916  vr3 = vec_cmple( v2, constVec );
2917  vr4 = vec_cmple( v3, constVec );
2918 
2919  // pack results into shorts
2920  vs1 = vec_pack(vr1, vr2);
2921  vs2 = vec_pack(vr3, vr4);
2922 
2923  // pack results into byte
2924  vbc1 = vec_pack(vs1, vs2);
2925 
2926  //AND with 1 to get true=1 not true=255
2927  vc1 = vec_and( vbc1, oneVector );
2928 
2929  //store results
2930  vec_st( vc1, 0, &dst[i] );
2931  }
2932 
2933  //handle cleanup
2934  for ( ; i < count ; i++ ) {
2935  dst[i] = src0[i] <= constant;
2936  }
2937 }
2938 
2939 /*
2940 ============
2941 idSIMD_AltiVec::CmpLE
2942 
2943  dst[i] |= ( src0[i] <= constant ) << bitNum;
2944 ============
2945 */
2946 void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
2947 //#define OPER(X) dst[(X)] |= ( src0[(X)] <= constant ) << bitNum;
2948  register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
2949  register vector bool short vtbs0, vtbs1;
2950  register vector bool char vtbc0;
2951  register vector unsigned char vtuc0;
2952  register vector unsigned char permVec, permVec2;
2953 
2954  // dest vectors
2955  register vector unsigned char vd;
2956  // bitNum vectors
2957  register vector unsigned char bitNumVec;
2958  // src0 vectors
2959  register vector float vs0, vs1, vs2, vs3;
2960  register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
2961  // constant vector
2962  register vector float constVec;
2963  // all ones
2964  register vector unsigned char oneVector = (vector unsigned char)(1);
2965  int i = 0;
2966 
2967  //handle unaligned at start
2968  for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2969  dst[i] |= ( src0[i] <= constant ) << bitNum;
2970  }
2971 
2972  //splat constant into a vector
2973  constVec = loadSplatUnalignedScalar( &constant );
2974 
2975  //bitNum is unaligned.
2976  permVec2 = vec_lvsl( 0, &bitNum );
2977  vtuc0 = vec_ld( 0, &bitNum );
2978  bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
2979  bitNumVec = vec_splat( bitNumVec, 0 );
2980 
2981  //calculate permute and do loads
2982  permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2983  vs3_hi = vec_ld( 0, &src0[i] );
2984 
2985  //vectorize!
2986  for ( ; i+15 < count; i += 16 ) {
2987  //load sources (floats)
2988  vs0_low = vs3_hi;
2989  vs0_hi = vec_ld( 15, &src0[i] );
2990  vs1_low = vs0_hi;
2991  vs1_hi = vec_ld( 31, &src0[i] );
2992  vs2_low = vs1_hi;
2993  vs2_hi = vec_ld( 47, &src0[i] );
2994  vs3_low = vs2_hi;
2995  vs3_hi = vec_ld( 63, &src0[i] );
2996 
2997  //permute into the vectors we want
2998  vs0 = vec_perm( vs0_low, vs0_hi, permVec );
2999  vs1 = vec_perm( vs1_low, vs1_hi, permVec );
3000  vs2 = vec_perm( vs2_low, vs2_hi, permVec );
3001  vs3 = vec_perm( vs3_low, vs3_hi, permVec );
3002 
3003  //load dest (bytes) as unsigned char
3004  vd = vec_ld( 0, &dst[i] );
3005 
3006  // do comparison and get bool int result
3007  vtbi0 = vec_cmple( vs0, constVec );
3008  vtbi1 = vec_cmple( vs1, constVec );
3009  vtbi2 = vec_cmple( vs2, constVec );
3010  vtbi3 = vec_cmple( vs3, constVec );
3011 
3012  // pack results into shorts
3013  vtbs0 = vec_pack(vtbi0, vtbi1);
3014  vtbs1 = vec_pack(vtbi2, vtbi3);
3015 
3016  // pack results into byte
3017  vtbc0 = vec_pack(vtbs0, vtbs1);
3018 
3019  //and with 1 to get true=1 instead of true=255
3020  vtuc0 = vec_and(vtbc0, oneVector);
3021  vtuc0 = vec_sl(vtuc0, bitNumVec );
3022 
3023  //or with original
3024  vd = vec_or( vd, vtuc0 );
3025 
3026  vec_st( vd, 0, &dst[i] );
3027  }
3028 
3029  //handle cleanup
3030  for ( ; i < count ; i++ ) {
3031  dst[i] |= ( src0[i] <= constant ) << bitNum;
3032  }
3033 }
3034 #endif /* ENABLE_COMPARES */
3035 
3036 #ifdef ENABLE_MINMAX
3037 
3038 /*
3039 ============
3040 idSIMD_AltiVec::MinMax
3041 ============
3042 */
3043 void VPCALL idSIMD_AltiVec::MinMax( float &min, float &max, const float *src, const int count ) {
3044  min = idMath::INFINITY; max = -idMath::INFINITY;
3045 //#define OPER(X) if ( src[(X)] < min ) {min = src[(X)];} if ( src[(X)] > max ) {max = src[(X)];}
3046 
3047  register vector float v0, v1, v2, v3;
3048  register vector float maxVec, minVec, tempMin, tempMax;
3049  register vector unsigned char permVec;
3050  register vector float v0_low, v0_hi, v1_low, v1_hi;
3051  vector unsigned char oneCharVector = (vector unsigned char)(1);
3052  int i = 0;
3053 
3054  if ( count >= 4 ) {
3055 
3056  //calculate permute and do first load to
3057  //get a starting point for min and max
3058  permVec = vec_add( vec_lvsl( -1, (int*) &src[0] ), oneCharVector );
3059  v1_hi = vec_ld( 0, &src[0] );
3060 
3061  maxVec = loadSplatUnalignedScalar( &max );
3062  minVec = loadSplatUnalignedScalar( &min );
3063 
3064  //vectorize!
3065  for ( ; i+7 < count; i += 8 ) {
3066  //load sources
3067  v0_low = v1_hi;
3068  v0_hi = vec_ld( 15, &src[i] );
3069  v1_low = v0_hi;
3070  v1_hi = vec_ld( 31, &src[i] );
3071  v0 = vec_perm( v0_low, v0_hi, permVec );
3072  v1 = vec_perm( v1_low, v1_hi, permVec );
3073 
3074  // minimum
3075  v2 = vec_min( v0, v1 );
3076  minVec = vec_min( minVec, v2 );
3077  // maximum
3078  v3 = vec_max( v0, v1 );
3079  maxVec = vec_max( maxVec, v3 );
3080  }
3081 
3082  //minVec and maxVec each hold four candidate values; now reduce across
3083  //the vector lanes to get the single min and max
3084 
3085  tempMin = minVec;
3086  tempMax = maxVec;
3087 
3088  // rotate vector around and compare to itself to find the real min/max
3089  tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 8 ) );
3090  tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 8 ) );
3091  tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 4 ) );
3092  tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 4 ) );
3093  minVec = vec_splat( tempMin, 0 );
3094  maxVec = vec_splat( tempMax, 0 );
3095  vec_ste( minVec, 0, &min );
3096  vec_ste( maxVec, 0, &max );
3097  }
3098 
3099  //cleanup
3100  for ( ; i < count; i++ ) {
3101  if ( src[i] < min ) {
3102  min = src[i];
3103  }
3104  if ( src[i] > max ) {
3105  max = src[i];
3106  }
3107  }
3108 }
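
// The two-step fold above in isolation (a sketch, not part of the original
// library). vec_sld rotates the vector by 8 and then 4 bytes, so each vec_min
// compares lanes pairwise and two steps leave the overall minimum in every lane.
static float SketchHorizontalMin( vector float v ) {
	float out;
	v = vec_min( v, vec_sld( v, v, 8 ) );	// { min(a,c), min(b,d), min(c,a), min(d,b) }
	v = vec_min( v, vec_sld( v, v, 4 ) );	// every lane now holds the overall minimum
	vec_ste( vec_splat( v, 0 ), 0, &out );
	return out;
}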
3109 
3110 /*
3111 ============
3112 idSIMD_AltiVec::MinMax
3113 ============
3114 */
3115 void VPCALL idSIMD_AltiVec::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
3116  min[0] = min[1] = idMath::INFINITY; max[0] = max[1] = -idMath::INFINITY;
3117 //#define OPER(X) const idVec2 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; }
3118 
3119  idVec2 v;
3120  int i = 0;
3121  int j;
3122 
3123  const float *srcPtr = src[0].ToFloatPtr();
3124  register vector float vecLd1, vecLd2, vecLd3, vecLd4;
3125  register vector float vecMin, vecMax;
3126 
3127  register vector float v0, v1, v2, v3;
3128 
3129  if ( count > 4 ) {
3130 
3131  vecMin = (vector float)(FLT_MAX);
3132  vecMax = (vector float)(-FLT_MAX);	// not FLT_MIN: FLT_MIN is the smallest positive float, which would give a wrong max when every input is negative
3133 
3134  vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
3135  vector float vecOld = vec_ld( 0, srcPtr );
3136 
3137  for ( i = 0, j = 0; i+7 < count; i += 8, j += 4) {
3138  // load data
3139  float *vecPtr = (float*)( srcPtr + (j*4) );
3140  vector float v0, v1, v2, v3;
3141 
3142  v0 = vecOld;
3143  v1 = vec_ld( 15, vecPtr );
3144  v2 = vec_ld( 31, vecPtr );
3145  v3 = vec_ld( 47, vecPtr );
3146  vecOld = vec_ld( 63, vecPtr );
3147 
3148  vecLd1 = vec_perm( v0, v1, permVec );
3149  vecLd2 = vec_perm( v1, v2, permVec );
3150  vecLd3 = vec_perm( v2, v3, permVec );
3151  vecLd4 = vec_perm( v3, vecOld, permVec );
3152 
3153  // each of these vectors contains 2 elements
3154  // looks like | X Y X Y | X Y X Y
3155  v0 = vec_min( vecLd1, vecLd2 );
3156  v1 = vec_min( vecLd3, vecLd4 );
3157  v0 = vec_min( v0, v1 );
3158 
3159  v2 = vec_max( vecLd1, vecLd2 );
3160  v3 = vec_max( vecLd3, vecLd4 );
3161  v2 = vec_max( v2, v3 );
3162 
3163  // since it's always X Y X Y we don't have to re-merge each time; we can wait
3164  // until the end
3165  vecMin = vec_min( v0, vecMin );
3166  vecMax = vec_max( v2, vecMax );
3167  }
3168 
3169  vecMin = vec_min( vecMin, vec_sld( vecMin, vecMin, 8 ) );
3170  vecMax = vec_max( vecMax, vec_sld( vecMax, vecMax, 8 ) );
3171  v0 = vec_splat( vecMin, 0 );
3172  v1 = vec_splat( vecMin, 1 );
3173  v2 = vec_splat( vecMax, 0 );
3174  v3 = vec_splat( vecMax, 1 );
3175 
3176  vec_ste( v0, 0, &min[0] );
3177  vec_ste( v1, 0, &min[1] );
3178  vec_ste( v2, 0, &max[0] );
3179  vec_ste( v3, 0, &max[1] );
3180  }
3181 
3182  // cleanup
3183  for ( ; i < count; i++ ) {
3184  v = src[i];
3185 
3186  if ( v[0] < min[0] ) {
3187  min[0] = v[0];
3188  }
3189  if ( v[0] > max[0] ) {
3190  max[0] = v[0];
3191  }
3192 
3193  if ( v[1] < min[1] ) {
3194  min[1] = v[1];
3195  }
3196  if ( v[1] > max[1] ) {
3197  max[1] = v[1];
3198  }
3199  }
3200 }
3201 
3202 /*
3203 ============
3204 idSIMD_AltiVec::MinMax
3205 ============
3206 */
3207 void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
3208  min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3209 //#define OPER(X) const idVec3 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
3210 
3211  int i = 0;
3212  const float *srcPtr = src[0].ToFloatPtr();
3213  idVec3 v;
3214 
3215  register vector float vecLd1, vecLd2, vecLd3;
3216  register vector float vecMin, vecMax;
3217  register vector float vecSrc1, vecSrc2, vecSrc3, vecSrc4;
3218  register vector float vecMin1, vecMin2, vecMax1, vecMax2;
3219 
3220  if ( count >= 4 ) {
3221 
3222  vecMin = (vector float)(FLT_MAX);
3223  vecMax = (vector float)(-FLT_MAX);	// -FLT_MAX, not FLT_MIN (see the note in the idVec2 version above)
3224 
3225  vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr), (vector unsigned char)(1) );
3226  vector float vecOld = vec_ld( 0, srcPtr );
3227 
3228  // 4 elements at a time
3229  for ( ; i+3 < count; i += 4 ) {
3230  float *vecPtr = (float*)( srcPtr + (i*3) );
3231  vector float v0, v1, v2;
3232 
3233  v0 = vecOld;
3234  v1 = vec_ld( 15, vecPtr );
3235  v2 = vec_ld( 31, vecPtr );
3236  vecOld = vec_ld( 47, vecPtr );
3237 
3238  vecLd1 = vec_perm( v0, v1, permVec );
3239  vecLd2 = vec_perm( v1, v2, permVec );
3240  vecLd3 = vec_perm( v2, vecOld, permVec );
3241 
3242  // put each idVec3 into its own vector as X Y Z plus one duplicated (junk) lane
3243  vecSrc1 = vecLd1;
3244  vecSrc2 = vec_sld( vecLd1, vecLd2, 12 );
3245  vecSrc3 = vec_sld( vecLd2, vecLd3, 8 );
3246  vecSrc4 = vec_sld( vecLd3, vecLd3, 4 );
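 // added note: with vecLd1 = { x0 y0 z0 x1 }, vecLd2 = { y1 z1 x2 y2 } and
 // vecLd3 = { z2 x3 y3 z3 }, the shifts give vecSrc2 = { x1 y1 z1 x2 },
 // vecSrc3 = { x2 y2 z2 x3 } and vecSrc4 = { x3 y3 z3 z2 }; the fourth lane
 // of each vector duplicates a real source value, so it cannot corrupt min/max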
3247 
3248  // do min and max
3249  vecMin1 = vec_min( vecSrc1, vecSrc2 );
3250  vecMin2 = vec_min( vecSrc3, vecSrc4 );
3251  vecMin1 = vec_min( vecMin1, vecMin2 );
3252  vecMin = vec_min( vecMin, vecMin1 );
3253 
3254  vecMax1 = vec_max( vecSrc1, vecSrc2 );
3255  vecMax2 = vec_max( vecSrc3, vecSrc4 );
3256  vecMax1 = vec_max( vecMax1, vecMax2 );
3257  vecMax = vec_max( vecMax1, vecMax );
3258  }
3259 
3260  // store results
3261  vector float v0, v1, v2, v3, v4, v5;
3262  v0 = vec_splat( vecMin, 0 );
3263  v1 = vec_splat( vecMin, 1 );
3264  v2 = vec_splat( vecMin, 2 );
3265  v3 = vec_splat( vecMax, 0 );
3266  v4 = vec_splat( vecMax, 1 );
3267  v5 = vec_splat( vecMax, 2 );
3268 
3269  vec_ste( v0, 0, &min[0] );
3270  vec_ste( v1, 0, &min[1] );
3271  vec_ste( v2, 0, &min[2] );
3272  vec_ste( v3, 0, &max[0] );
3273  vec_ste( v4, 0, &max[1] );
3274  vec_ste( v5, 0, &max[2] );
3275  }
3276 
3277  // cleanup
3278  for ( ; i < count; i ++ ) {
3279  v = src[i];
3280 
3281  if ( v[0] < min[0] ) {
3282  min[0] = v[0];
3283  }
3284  if ( v[0] > max[0] ) {
3285  max[0] = v[0];
3286  }
3287  if ( v[1] < min[1] ) {
3288  min[1] = v[1];
3289  }
3290  if ( v[1] > max[1] ) {
3291  max[1] = v[1];
3292  }
3293  if ( v[2] < min[2] ) {
3294  min[2] = v[2];
3295  }
3296  if ( v[2] > max[2] ) {
3297  max[2] = v[2];
3298  }
3299  }
3300 }
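
// Every misaligned load in this file uses the same idiom: a permute vector
// built from vec_lvsl( -1, ptr ) plus one, combined with paired vec_ld calls
// at offsets 0 and 15. Loading the second quadword at offset 15 rather than
// 16 keeps an already-aligned pointer from touching the quadword past the
// end of the array, and the -1/+1 adjustment makes the permute select the
// right bytes in both the aligned and unaligned cases. A standalone sketch
// of the idiom (hypothetical helper; assumes the 16 bytes at ptr are readable):
static inline vector float LoadUnalignedFloat4( const float *ptr ) {
    vector unsigned char perm = vec_add( vec_lvsl( -1, ptr ), (vector unsigned char)(1) );
    vector float lo = vec_ld( 0, ptr );   // quadword containing ptr[0]
    vector float hi = vec_ld( 15, ptr );  // same quadword again when ptr is aligned
    return vec_perm( lo, hi, perm );      // shift the 4 floats into place
}
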
3301 
3302 #ifndef DRAWVERT_PADDED
3303 /*
3304 ============
3305 idSIMD_AltiVec::MinMax
3306 ============
3307 */
3308 void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
3309 
3310  min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3311  idVec3 v;
3312  int i = 0;
3313  register vector float vecMin, vecMax;
3314 
3315  register vector float v0, v1, v2, v3, v4, v5, v6, v7;
3316  register vector float vecMin1, vecMin2, vecMax1, vecMax2;
3317 
3318  if ( count >= 4 ) {
3319  vecMin = (vector float)(FLT_MAX);
3320  vecMax = (vector float)(-FLT_MAX); // -FLT_MAX, since FLT_MIN is the smallest positive float
3321 
3322  vector unsigned char vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
3323  vector unsigned char vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
3324  vector unsigned char vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
3325  vector unsigned char vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
3326 
3327  for ( ; i+3 < count; i += 4) {
3328  const float *vertPtr = src[i].xyz.ToFloatPtr();
3329  const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
3330  const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
3331  const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
3332 
3333  v0 = vec_ld( 0, vertPtr );
3334  v1 = vec_ld( 11, vertPtr );
3335  v2 = vec_ld( 0, vertPtr2 );
3336  v3 = vec_ld( 11, vertPtr2 );
3337  v4 = vec_ld( 0, vertPtr3 );
3338  v5 = vec_ld( 11, vertPtr3 );
3339  v6 = vec_ld( 0, vertPtr4 );
3340  v7 = vec_ld( 11, vertPtr4 );
3341 
3342  v0 = vec_perm( v0, v1, vertPerm1 );
3343  v2 = vec_perm( v2, v3, vertPerm2 );
3344  v4 = vec_perm( v4, v5, vertPerm3 );
3345  v6 = vec_perm( v6, v7, vertPerm4 );
3346 
3347  vecMin1 = vec_min( v0, v2 );
3348  vecMin2 = vec_min( v4, v6 );
3349  vecMin1 = vec_min( vecMin1, vecMin2 );
3350  vecMin = vec_min( vecMin, vecMin1 );
3351 
3352  vecMax1 = vec_max( v0, v2 );
3353  vecMax2 = vec_max( v4, v6 );
3354  vecMax1 = vec_max( vecMax1, vecMax2 );
3355  vecMax = vec_max( vecMax, vecMax1 );
3356  }
3357 
3358  // now we have min/max vectors in X Y Z form, store out
3359  v0 = vec_splat( vecMin, 0 );
3360  v1 = vec_splat( vecMin, 1 );
3361  v2 = vec_splat( vecMin, 2 );
3362  v3 = vec_splat( vecMax, 0 );
3363  v4 = vec_splat( vecMax, 1 );
3364  v5 = vec_splat( vecMax, 2 );
3365 
3366  vec_ste( v0, 0, &min[0] );
3367  vec_ste( v1, 0, &min[1] );
3368  vec_ste( v2, 0, &min[2] );
3369  vec_ste( v3, 0, &max[0] );
3370  vec_ste( v4, 0, &max[1] );
3371  vec_ste( v5, 0, &max[2] );
3372  }
3373 
3374  // cleanup
3375  for ( ; i < count; i++ ) {
3376  v = src[i].xyz;
3377 
3378  if ( v[0] < min[0] ) {
3379  min[0] = v[0];
3380  }
3381  if ( v[0] > max[0] ) {
3382  max[0] = v[0];
3383  }
3384 
3385  if ( v[1] < min[1] ) {
3386  min[1] = v[1];
3387  }
3388  if ( v[1] > max[1] ) {
3389  max[1] = v[1];
3390  }
3391 
3392  if ( v[2] > max[2] ) {
3393  max[2] = v[2];
3394  }
3395 
3396  if ( v[2] < min[2] ) {
3397  min[2] = v[2];
3398  }
3399  }
3400 }
3401 #else
3402 /*
3403 ============
3404 idSIMD_AltiVec::MinMax
3405 ============
3406 */
3407 void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
3408 
3409  min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3410  idVec3 v;
3411  int i = 0;
3412  register vector float vecMin, vecMax;
3413 
3414  register vector float v0, v1, v2, v3, v4, v5, v6, v7;
3415  register vector float vecMin1, vecMin2, vecMax1, vecMax2;
3416 
3417  if ( count >= 4 ) {
3418  vecMin = (vector float)(FLT_MAX);
3419  vecMax = (vector float)(-FLT_MAX); // -FLT_MAX, since FLT_MIN is the smallest positive float
3420 
3421  for ( ; i+3 < count; i += 4) {
3422  const float *vertPtr = src[i].xyz.ToFloatPtr();
3423  const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
3424  const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
3425  const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
3426 
3427  v0 = vec_ld( 0, vertPtr );
3428  v2 = vec_ld( 0, vertPtr2 );
3429  v4 = vec_ld( 0, vertPtr3 );
3430  v6 = vec_ld( 0, vertPtr4 );
3431 
3432  vecMin1 = vec_min( v0, v2 );
3433  vecMin2 = vec_min( v4, v6 );
3434  vecMin1 = vec_min( vecMin1, vecMin2 );
3435  vecMin = vec_min( vecMin, vecMin1 );
3436 
3437  vecMax1 = vec_max( v0, v2 );
3438  vecMax2 = vec_max( v4, v6 );
3439  vecMax1 = vec_max( vecMax1, vecMax2 );
3440  vecMax = vec_max( vecMax, vecMax1 );
3441  }
3442 
3443  // now we have min/max vectors in X Y Z form, store out
3444  v0 = vec_splat( vecMin, 0 );
3445  v1 = vec_splat( vecMin, 1 );
3446  v2 = vec_splat( vecMin, 2 );
3447  v3 = vec_splat( vecMax, 0 );
3448  v4 = vec_splat( vecMax, 1 );
3449  v5 = vec_splat( vecMax, 2 );
3450 
3451  vec_ste( v0, 0, &min[0] );
3452  vec_ste( v1, 0, &min[1] );
3453  vec_ste( v2, 0, &min[2] );
3454  vec_ste( v3, 0, &max[0] );
3455  vec_ste( v4, 0, &max[1] );
3456  vec_ste( v5, 0, &max[2] );
3457  }
3458 
3459  // cleanup
3460  for ( ; i < count; i++ ) {
3461  v = src[i].xyz;
3462 
3463  if ( v[0] < min[0] ) {
3464  min[0] = v[0];
3465  }
3466  if ( v[0] > max[0] ) {
3467  max[0] = v[0];
3468  }
3469 
3470  if ( v[1] < min[1] ) {
3471  min[1] = v[1];
3472  }
3473  if ( v[1] > max[1] ) {
3474  max[1] = v[1];
3475  }
3476 
3477  if ( v[2] > max[2] ) {
3478  max[2] = v[2];
3479  }
3480 
3481  if ( v[2] < min[2] ) {
3482  min[2] = v[2];
3483  }
3484  }
3485 }
3486 
3487 #endif /* DRAWVERT_PADDED */
3488 
3489 #ifndef DRAWVERT_PADDED
3490 /*
3491 ============
3492 idSIMD_AltiVec::MinMax
3493 ============
3494 */
3495 void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
3496  min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3497 
3498  idVec3 v;
3499  int i = 0;
3500 
3501  register vector float vecMin, vecMax;
3502 
3503  register vector float v0, v1, v2, v3, v4, v5, v6, v7;
3504  register vector float vecMin1, vecMin2, vecMax1, vecMax2;
3505 
3506  if ( count >= 4 ) {
3507 
3508  vecMin = (vector float)(FLT_MAX);
3509  vecMax = (vector float)(-FLT_MAX); // -FLT_MAX, since FLT_MIN is the smallest positive float
3510 
3511  vector unsigned char vertPerm1;
3512  vector unsigned char vertPerm2;
3513  vector unsigned char vertPerm3;
3514  vector unsigned char vertPerm4;
3515 
3516  for ( ; i+3 < count; i += 4) {
3517  const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
3518  const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
3519  const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
3520  const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();
3521 
3522  vertPerm1 = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
3523  vertPerm2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
3524  vertPerm3 = vec_add( vec_lvsl( -1, vertPtr3 ), (vector unsigned char)(1) );
3525  vertPerm4 = vec_add( vec_lvsl( -1, vertPtr4 ), (vector unsigned char)(1) );
3526 
3527  v0 = vec_ld( 0, vertPtr );
3528  v1 = vec_ld( 15, vertPtr );
3529  v2 = vec_ld( 0, vertPtr2 );
3530  v3 = vec_ld( 15, vertPtr2 );
3531  v4 = vec_ld( 0, vertPtr3 );
3532  v5 = vec_ld( 15, vertPtr3 );
3533  v6 = vec_ld( 0, vertPtr4 );
3534  v7 = vec_ld( 15, vertPtr4 );
3535 
3536  v0 = vec_perm( v0, v1, vertPerm1 );
3537  v2 = vec_perm( v2, v3, vertPerm2 );
3538  v4 = vec_perm( v4, v5, vertPerm3 );
3539  v6 = vec_perm( v6, v7, vertPerm4 );
3540 
3541  vecMin1 = vec_min( v0, v2 );
3542  vecMin2 = vec_min( v4, v6 );
3543  vecMin1 = vec_min( vecMin1, vecMin2 );
3544  vecMin = vec_min( vecMin, vecMin1 );
3545 
3546  vecMax1 = vec_max( v0, v2 );
3547  vecMax2 = vec_max( v4, v6 );
3548  vecMax1 = vec_max( vecMax1, vecMax2 );
3549  vecMax = vec_max( vecMax, vecMax1 );
3550  }
3551 
3552  // now we have min/max vectors in X Y Z form, store out
3553  v0 = vec_splat( vecMin, 0 );
3554  v1 = vec_splat( vecMin, 1 );
3555  v2 = vec_splat( vecMin, 2 );
3556  v3 = vec_splat( vecMax, 0 );
3557  v4 = vec_splat( vecMax, 1 );
3558  v5 = vec_splat( vecMax, 2 );
3559 
3560  vec_ste( v0, 0, &min[0] );
3561  vec_ste( v1, 0, &min[1] );
3562  vec_ste( v2, 0, &min[2] );
3563  vec_ste( v3, 0, &max[0] );
3564  vec_ste( v4, 0, &max[1] );
3565  vec_ste( v5, 0, &max[2] );
3566  }
3567 
3568  // cleanup
3569  for ( ; i < count; i++ ) {
3570  v = src[indexes[i]].xyz;
3571 
3572  if ( v[0] < min[0] ) {
3573  min[0] = v[0];
3574  }
3575  if ( v[0] > max[0] ) {
3576  max[0] = v[0];
3577  }
3578 
3579  if ( v[1] < min[1] ) {
3580  min[1] = v[1];
3581  }
3582  if ( v[1] > max[1] ) {
3583  max[1] = v[1];
3584  }
3585 
3586  if ( v[2] > max[2] ) {
3587  max[2] = v[2];
3588  }
3589 
3590  if ( v[2] < min[2] ) {
3591  min[2] = v[2];
3592  }
3593  }
3594 }
3595 #else
3596 /*
3597 ============
3598 idSIMD_AltiVec::MinMax
3599 ============
3600 */
3601 void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
3602  min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3603 
3604  idVec3 v;
3605  int i = 0;
3606 
3607  register vector float vecMin, vecMax;
3608 
3609  register vector float v0, v1, v2, v3, v4, v5, v6, v7;
3610  register vector float vecMin1, vecMin2, vecMax1, vecMax2;
3611 
3612  if ( count >= 4 ) {
3613 
3614  vecMin = (vector float)(FLT_MAX);
3615  vecMax = (vector float)(-FLT_MAX); // -FLT_MAX, since FLT_MIN is the smallest positive float
3616 
3617  vector unsigned char vertPerm1;
3618  vector unsigned char vertPerm2;
3619  vector unsigned char vertPerm3;
3620  vector unsigned char vertPerm4;
3621 
3622  for ( ; i+3 < count; i += 4) {
3623  const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
3624  const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
3625  const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
3626  const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();
3627 
3628  v0 = vec_ld( 0, vertPtr );
3629  v2 = vec_ld( 0, vertPtr2 );
3630  v4 = vec_ld( 0, vertPtr3 );
3631  v6 = vec_ld( 0, vertPtr4 );
3632 
3633  vecMin1 = vec_min( v0, v2 );
3634  vecMin2 = vec_min( v4, v6 );
3635  vecMin1 = vec_min( vecMin1, vecMin2 );
3636  vecMin = vec_min( vecMin, vecMin1 );
3637 
3638  vecMax1 = vec_max( v0, v2 );
3639  vecMax2 = vec_max( v4, v6 );
3640  vecMax1 = vec_max( vecMax1, vecMax2 );
3641  vecMax = vec_max( vecMax, vecMax1 );
3642  }
3643 
3644  // now we have min/max vectors in X Y Z form, store out
3645  v0 = vec_splat( vecMin, 0 );
3646  v1 = vec_splat( vecMin, 1 );
3647  v2 = vec_splat( vecMin, 2 );
3648  v3 = vec_splat( vecMax, 0 );
3649  v4 = vec_splat( vecMax, 1 );
3650  v5 = vec_splat( vecMax, 2 );
3651 
3652  vec_ste( v0, 0, &min[0] );
3653  vec_ste( v1, 0, &min[1] );
3654  vec_ste( v2, 0, &min[2] );
3655  vec_ste( v3, 0, &max[0] );
3656  vec_ste( v4, 0, &max[1] );
3657  vec_ste( v5, 0, &max[2] );
3658  }
3659 
3660  // cleanup
3661  for ( ; i < count; i++ ) {
3662  v = src[indexes[i]].xyz;
3663 
3664  if ( v[0] < min[0] ) {
3665  min[0] = v[0];
3666  }
3667  if ( v[0] > max[0] ) {
3668  max[0] = v[0];
3669  }
3670 
3671  if ( v[1] < min[1] ) {
3672  min[1] = v[1];
3673  }
3674  if ( v[1] > max[1] ) {
3675  max[1] = v[1];
3676  }
3677 
3678  if ( v[2] > max[2] ) {
3679  max[2] = v[2];
3680  }
3681 
3682  if ( v[2] < min[2] ) {
3683  min[2] = v[2];
3684  }
3685  }
3686 }
3687 
3688 
3689 #endif /* DRAWVERT_PADDED */
3690 
3691 #endif /* ENABLE_MINMAX */
3692 
3693 #ifdef ENABLE_CLAMP
3694 
3695 /*
3696 ============
3697 idSIMD_AltiVec::Clamp
3698 ============
3699 */
3700 void VPCALL idSIMD_AltiVec::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
3701 //#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)] > max ? max : src[(X)];
3702  register vector float v0, v1, v2, v3, v4, v5;
3703  register vector unsigned char permVec;
3704  register vector float v0_low, v0_hi, v1_low, v1_hi;
3705  vector unsigned char oneVector = (vector unsigned char)(1);
3706  register vector float minVec, maxVec;
3707  int i = 0;
3708 
3709  //handle unaligned at start
3710  for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
3711  dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
3712  }
3713 
3714  //splat min/max into a vector
3715  minVec = loadSplatUnalignedScalar( &min );
3716  maxVec = loadSplatUnalignedScalar( &max );
3717 
3718  //calculate permute and do first load
3719  permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
3720  v1_hi = vec_ld( 0, &src[i] );
3721 
3722 
3723  //vectorize!
3724  for ( ; i+7 < count; i += 8 ) {
3725  //load source
3726  v0_low = v1_hi;
3727  v0_hi = vec_ld( 15, &src[i] );
3728  v1_low = v0_hi;
3729  v1_hi = vec_ld( 31, &src[i] );
3730 
3731  v0 = vec_perm( v0_low, v0_hi, permVec );
3732  v1 = vec_perm( v1_low, v1_hi, permVec );
3733 
3734  //apply minimum
3735  v2 = vec_max( v0, minVec );
3736  v3 = vec_max( v1, minVec );
3737 
3738  //apply maximum
3739  v4 = vec_min( v2, maxVec );
3740  v5 = vec_min( v3, maxVec );
3741 
3742  ALIGNED_STORE2( &dst[i], v4, v5 );
3743  }
3744 
3745  //handle cleanup
3746  for ( ; i < count ; i++ ) {
3747  dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
3748  }
3749 }
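
// Note the inversion in the vector body above: the lower bound is applied
// with vec_max and the upper bound with vec_min. The scalar equivalent of
// one lane, for reference (hypothetical helper):
static inline float ClampScalar( float v, float min, float max ) {
    float t = ( v > min ) ? v : min;   // vec_max( v, minVec )
    return ( t < max ) ? t : max;      // vec_min( t, maxVec )
}
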
3750 
3751 /*
3752 ============
3753 idSIMD_AltiVec::ClampMin
3754 ============
3755 */
3756 void VPCALL idSIMD_AltiVec::ClampMin( float *dst, const float *src, const float min, const int count ) {
3757 //#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)];
3758  register vector float v0, v1, v2, v3;
3759  register vector unsigned char permVec;
3760  register vector float v0_low, v0_hi, v1_low, v1_hi;
3761  register vector float constVec;
3762  vector unsigned char oneVector = (vector unsigned char)(1);
3763  int i = 0;
3764 
3765  //handle unaligned at start
3766  for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
3767  dst[i] = src[i] < min ? min : src[i];
3768  }
3769 
3770  //splat constant into a vector
3771  constVec = loadSplatUnalignedScalar( &min );
3772 
3773  //calculate permute and do first load
3774  permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
3775  v1_hi = vec_ld( 0, &src[i] );
3776 
3777  //vectorize!
3778  for ( ; i+7 < count; i += 8 ) {
3779  //load source
3780  v0_low = v1_hi;
3781  v0_hi = vec_ld( 15, &src[i] );
3782  v1_low = v0_hi;
3783  v1_hi = vec_ld( 31, &src[i] );
3784 
3785  v0 = vec_perm( v0_low, v0_hi, permVec );
3786  v1 = vec_perm( v1_low, v1_hi, permVec );
3787 
3788  v2 = vec_max( v0, constVec );
3789  v3 = vec_max( v1, constVec );
3790 
3791  ALIGNED_STORE2( &dst[i], v2, v3 );
3792  }
3793 
3794  //handle cleanup
3795  for ( ; i < count ; i++ ) {
3796  dst[i] = src[i] < min ? min : src[i];
3797  }
3798 }
3799 
3800 /*
3801 ============
3802 idSIMD_AltiVec::ClampMax
3803 ============
3804 */
3805 void VPCALL idSIMD_AltiVec::ClampMax( float *dst, const float *src, const float max, const int count ) {
3806 //#define OPER(X) dst[(X)] = src[(X)] > max ? max : src[(X)];
3807  register vector float v0, v1, v2, v3;
3808  register vector unsigned char permVec;
3809  register vector float constVec;
3810  register vector float v0_low, v0_hi, v1_low, v1_hi;
3811  vector unsigned char oneVector = (vector unsigned char)(1);
3812  int i = 0;
3813 
3814  //handle unaligned at start
3815  for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
3816  dst[i] = src[i] > max ? max : src[i];
3817  }
3818 
3819  //splat constant into a vector
3820  constVec = loadSplatUnalignedScalar( &max );
3821 
3822  //calculate permute and do first load
3823  permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
3824  v1_hi = vec_ld( 0, &src[i] );
3825 
3826  //vectorize!
3827  for ( ; i+7 < count; i += 8 ) {
3828  //load source
3829  v0_low = v1_hi;
3830  v0_hi = vec_ld( 15, &src[i] );
3831  v1_low = v0_hi;
3832  v1_hi = vec_ld( 31, &src[i] );
3833 
3834  v0 = vec_perm( v0_low, v0_hi, permVec );
3835  v1 = vec_perm( v1_low, v1_hi, permVec );
3836  v2 = vec_min( v0, constVec );
3837  v3 = vec_min( v1, constVec );
3838 
3839  ALIGNED_STORE2( &dst[i], v2, v3 );
3840  }
3841 
3842  //handle cleanup
3843  for ( ; i < count ; i++ ) {
3844  dst[i] = src[i] > max ? max : src[i];
3845  }
3846 }
3847 
3848 #endif /* ENABLE_CLAMP */
3849 
3850 #ifdef ENABLE_16ROUTINES
3851 
3852 /*
3853 ============
3854 idSIMD_AltiVec::Zero16
3855 ============
3856 */
3857 void VPCALL idSIMD_AltiVec::Zero16( float *dst, const int count ) {
3858  memset( dst, 0, count * sizeof( float ) );
3859 }
3860 
3861 /*
3862 ============
3863 idSIMD_AltiVec::Negate16
3864 
3865  Assumptions:
3866  dst is aligned
3867 ============
3868 */
3869 void VPCALL idSIMD_AltiVec::Negate16( float *dst, const int count ) {
3870 //#define OPER(X) ptr[(X)] ^= ( 1 << 31 ) // IEEE 32 bits float sign bit
3871 
3872  // dst is aligned
3873  assert( IS_16BYTE_ALIGNED( dst[0] ) );
3874 
3875  // round count up to the next multiple of 4 if need be
3876  int count2 = ( count + 3 ) & ~3;
3877 
3878  int i = 0;
3879  vector float v0, v1, v2, v3;
3880 
3881  //know it's 16-byte aligned
3882  for ( ; i + 7 < count2; i += 8 ) {
3883  v0 = vec_ld( 0, &dst[i] );
3884  v1 = vec_ld( 16, &dst[i] );
3885 
3886  v2 = vec_sub( (vector float)(0), v0 );
3887  v3 = vec_sub( (vector float)(0), v1 );
3888 
3889  ALIGNED_STORE2( &dst[i], v2, v3 );
3890  }
3891 
3892  for ( ; i < count2; i += 4 ) {
3893  v0 = vec_ld( 0, &dst[i] );
3894  v1 = vec_sub( (vector float)(0), v0 );
3895  vec_st( v1, 0, &dst[i] );
3896  }
3897 }
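
// The reference OPER above describes flipping the IEEE sign bit, while the
// vector code negates by subtracting from zero; the two agree for ordinary
// values and differ only for zeros and NaN payloads. A sign-bit variant
// would look like this sketch (an assumption, not the shipped code):
static inline vector float NegateSignBit( vector float v ) {
    vector unsigned int signMask = (vector unsigned int)(0x80000000);
    return (vector float) vec_xor( (vector unsigned int) v, signMask );
}
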
3898 
3899 /*
3900 ============
3901 idSIMD_AltiVec::Copy16
3902 ============
3903 */
3904 void VPCALL idSIMD_AltiVec::Copy16( float *dst, const float *src, const int count ) {
3905 //#define OPER(X) dst[(X)] = src[(X)]
3906  memcpy( dst, src, sizeof(float) * count );
3907 }
3908 
3909 /*
3910 ============
3911 idSIMD_AltiVec::Add16
3912 
3913  Assumptions:
3914  Assumes dst, src1, src2 all start at aligned address
3915 ============
3916 */
3917 void VPCALL idSIMD_AltiVec::Add16( float *dst, const float *src1, const float *src2, const int count ) {
3918 //#define OPER(X) dst[(X)] = src1[(X)] + src2[(X)]
3919 
3920  // dst is aligned
3921  assert( IS_16BYTE_ALIGNED( dst[0] ) );
3922  // src1 is aligned
3923  assert( IS_16BYTE_ALIGNED( src1[0] ) );
3924  // src2 is aligned
3925  assert( IS_16BYTE_ALIGNED( src2[0] ) );
3926 
3927  // round count up to the next multiple of 4 if need be
3928  int count2 = ( count + 3 ) & ~3;
3929 
3930  register vector float v0, v1, v2, v3, v4, v5;
3931  int i = 0;
3932 
3933  //know all data is 16-byte aligned, so vectorize!
3934  for ( ; i+7 < count2; i += 8 ) {
3935  //load sources
3936  v0 = vec_ld( 0, &src1[i] );
3937  v1 = vec_ld( 16, &src1[i] );
3938  v2 = vec_ld( 0, &src2[i] );
3939  v3 = vec_ld( 16, &src2[i] );
3940  v4 = vec_add( v0, v2 );
3941  v5 = vec_add( v1, v3 );
3942 
3943  ALIGNED_STORE2( &dst[i], v4, v5 );
3944  }
3945 
3946  for ( ; i < count2; i += 4 ) {
3947  v0 = vec_ld( 0, &src1[i] );
3948  v1 = vec_ld( 0, &src2[i] );
3949  v2 = vec_add( v0, v1 );
3950  vec_st( v2, 0, &dst[i] );
3951  }
3952 }
3953 
3954 /*
3955 ============
3956 idSIMD_AltiVec::Sub16
3957 
3958  Assumptions:
3959  Assumes that dst, src1, and src2 all start at aligned address
3960 ============
3961 */
3962 void VPCALL idSIMD_AltiVec::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
3963 //#define OPER(X) dst[(X)] = src1[(X)] - src2[(X)]
3964  // dst is aligned
3965  assert( IS_16BYTE_ALIGNED( dst[0] ) );
3966  // src1 is aligned
3967  assert( IS_16BYTE_ALIGNED( src1[0] ) );
3968  // src2 is aligned
3969  assert( IS_16BYTE_ALIGNED( src2[0] ) );
3970 
3971  // round count up to the next multiple of 4 if need be
3972  int count2 = ( count + 3 ) & ~3;
3973 
3974  register vector float v0, v1, v2, v3, v4, v5;
3975  int i = 0;
3976 
3977  //know data is aligned, so vectorize!
3978  for ( ; i+7 < count2; i += 8 ) {
3979  //load sources
3980  v0 = vec_ld( 0, &src1[i] );
3981  v1 = vec_ld( 16, &src1[i] );
3982  v2 = vec_ld( 0, &src2[i] );
3983  v3 = vec_ld( 16, &src2[i] );
3984  v4 = vec_sub( v0, v2 );
3985  v5 = vec_sub( v1, v3 );
3986 
3987  ALIGNED_STORE2( &dst[i], v4, v5 );
3988  }
3989 
3990  for ( ; i < count2; i += 4 ) {
3991  v0 = vec_ld( 0, &src1[i] );
3992  v1 = vec_ld( 0, &src2[i] );
3993  v2 = vec_sub( v0, v1 );
3994  vec_st( v2, 0, &dst[i] );
3995  }
3996 }
3997 
3998 /*
3999 ============
4000 idSIMD_AltiVec::Mul16
4001 
4002  Assumptions:
4003  Assumes that dst and src1 start at aligned address
4004 ============
4005 */
4006 void VPCALL idSIMD_AltiVec::Mul16( float *dst, const float *src1, const float constant, const int count ) {
4007 //#define OPER(X) dst[(X)] = src1[(X)] * constant
4008 
4009  // dst is aligned
4010  assert( IS_16BYTE_ALIGNED( dst[0] ) );
4011  // src1 is aligned
4012  assert( IS_16BYTE_ALIGNED( src1[0] ) );
4013 
4014  // round count up to the next multiple of 4 if need be
4015  int count2 = ( count + 3 ) & ~3;
4016 
4017  register vector float v0, v1, v2, v3;
4018  register vector float constVec;
4019  register vector float zeroVector = (vector float)(0.0);
4020  int i = 0;
4021 
4022  //splat constant into a vector
4023  constVec = loadSplatUnalignedScalar( &constant );
4024 
4025  //know data is aligned, so vectorize!
4026  for ( ; i+7 < count2; i += 8 ) {
4027  //load source
4028  v0 = vec_ld( 0, &src1[i] );
4029  v1 = vec_ld( 16, &src1[i] );
4030  v2 = vec_madd( constVec, v0, zeroVector );
4031  v3 = vec_madd( constVec, v1, zeroVector );
4032  ALIGNED_STORE2( &dst[i], v2, v3 );
4033  }
4034 
4035  for ( ; i < count2; i += 4 ) {
4036  v0 = vec_ld( 0, &src1[i] );
4037  v1 = vec_madd( constVec, v0, zeroVector );
4038  vec_st( v1, 0, &dst[i] );
4039  }
4040 }
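
// AltiVec has no standalone floating-point multiply; Mul16 above synthesizes
// one as a fused multiply-add with a zero addend. The pattern, isolated as a
// hypothetical helper:
static inline vector float VecMulFloat( vector float a, vector float b ) {
    return vec_madd( a, b, (vector float)(0.0) );   // a * b + 0
}
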
4041 
4042 /*
4043 ============
4044 idSIMD_AltiVec::AddAssign16
4045 
4046  Assumptions:
4047  Assumes that dst and src start at aligned address
4048 ============
4049 */
4050 void VPCALL idSIMD_AltiVec::AddAssign16( float *dst, const float *src, const int count ) {
4051 //#define OPER(X) dst[(X)] += src[(X)]
4052 
4053  // dst is aligned
4054  assert( IS_16BYTE_ALIGNED( dst[0] ) );
4055  // src is aligned
4056  assert( IS_16BYTE_ALIGNED( src[0] ) );
4057 
4058  // round count up to the next multiple of 4 if need be
4059  int count2 = ( count + 3 ) & ~3;
4060 
4061  register vector float v0, v1, v2, v3, v4, v5;
4062  int i = 0;
4063 
4064  //vectorize!
4065  for ( ; i+7 < count2; i += 8 ) {
4066  v0 = vec_ld( 0, &src[i] );
4067  v1 = vec_ld( 16, &src[i] );
4068  v2 = vec_ld( 0, &dst[i] );
4069  v3 = vec_ld( 16, &dst[i] );
4070  v4 = vec_add( v0, v2 );
4071  v5 = vec_add( v1, v3 );
4072  ALIGNED_STORE2( &dst[i], v4, v5 );
4073  }
4074 
4075  for ( ; i < count2; i += 4 ) {
4076  v0 = vec_ld( 0, &src[i] );
4077  v1 = vec_ld( 0, &dst[i] );
4078  v2 = vec_add( v0, v1 );
4079  vec_st( v2, 0, &dst[i] );
4080  }
4081 }
4082 
4083 /*
4084 ============
4085 idSIMD_AltiVec::SubAssign16
4086 
4087  Assumptions:
4088  Assumes that dst and src start at aligned address
4089 ============
4090 */
4091 void VPCALL idSIMD_AltiVec::SubAssign16( float *dst, const float *src, const int count ) {
4092 //#define OPER(X) dst[(X)] -= src[(X)]
4093  register vector float v0, v1, v2, v3, v4, v5;
4094  int i=0;
4095 
4096  // dst is aligned
4097  assert( IS_16BYTE_ALIGNED( dst[0] ) );
4098  // src is aligned
4099  assert( IS_16BYTE_ALIGNED( src[0] ) );
4100  // round count up to the next multiple of 4 if need be
4101  int count2 = ( count + 3 ) & ~3;
4102 
4103  //vectorize!
4104  for ( ; i+7 < count2; i += 8 ) {
4105  v0 = vec_ld( 0, &src[i] );
4106  v1 = vec_ld( 16, &src[i] );
4107  v2 = vec_ld( 0, &dst[i] );
4108  v3 = vec_ld( 16, &dst[i] );
4109  v4 = vec_sub( v2, v0 );
4110  v5 = vec_sub( v3, v1 );
4111  ALIGNED_STORE2( &dst[i], v4, v5 );
4112  }
4113 
4114  for ( ; i < count2; i += 4 ) {
4115  v0 = vec_ld( 0, &src[i] );
4116  v1 = vec_ld( 0, &dst[i] );
4117  v2 = vec_sub( v1, v0 );
4118  vec_st( v2, 0, &dst[i] );
4119  }
4120 }
4121 
4122 /*
4123 ============
4124 idSIMD_AltiVec::MulAssign16
4125 
4126  Assumptions:
4127  Assumes that dst starts at aligned address and count is multiple of 4
4128 ============
4129 */
4130 void VPCALL idSIMD_AltiVec::MulAssign16( float *dst, const float constant, const int count ) {
4131 //#define OPER(X) dst[(X)] *= constant
4132 
4133  // dst is aligned
4134  assert( IS_16BYTE_ALIGNED( dst[0] ) );
4135  // round count up to the next multiple of 4 if need be
4136  int count2 = ( count + 3 ) & ~3;
4137 
4138  register vector float v0, v1, v2, v3;
4139  register vector float constVec;
4140  int i = 0;
4141  register vector float zeroVector = (vector float)(0.0);
4142 
4143  //splat constant into a vector
4144  constVec = loadSplatUnalignedScalar( &constant );
4145 
4146  //vectorize!
4147  for ( ; i+7 < count2; i += 8 ) {
4148  v0 = vec_ld( 0, &dst[i] );
4149  v1 = vec_ld( 16, &dst[i] );
4150  v2 = vec_madd( v0, constVec, zeroVector );
4151  v3 = vec_madd( v1, constVec, zeroVector );
4152  ALIGNED_STORE2( &dst[i], v2, v3 );
4153  }
4154 
4155  for ( ; i < count2; i += 4 ) {
4156  v0 = vec_ld( 0, &dst[i] );
4157  v1 = vec_madd( v0, constVec, zeroVector );
4158  vec_st( v1, 0, &dst[i] );
4159  }
4160 }
4161 
4162 #endif /* ENABLE_16ROUTINES */
4163 
4164 #ifdef ENABLE_LOWER_TRIANGULAR
4165 
4166 /*
4167 ============
4168 idSIMD_AltiVec::MatX_LowerTriangularSolve
4169 
4170  solves x in L * x = b for the first n rows of L
4171  if skip > 0 the first skip elements of x are assumed to be valid already
4172  L has to be a lower triangular matrix with (implicit) ones on the diagonal
4173  x == b is allowed
4174 ============
4175 */
4176 
4177 void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
4178 
4179  int i, j;
4180  const float *lptr;
4181  const float *lptr2;
4182  const float *lptr3;
4183  const float *lptr4;
4184  float sum;
4185  float sum2;
4186  float sum3;
4187  float sum4;
4188  float tempSum;
4189  float tempSum2;
4190  float tempSum3;
4191  float tempSum4;
4192  vector float vecSum1 = (vector float)(0.0);
4193  vector float vecSum2 = (vector float)(0.0);
4194  vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
4195  vector float zeroVector = (vector float)(0.0);
4196  vector float vecSum3, vecSum4, vecSum5, vecSum6, vecSum7, vecSum8;
4197 
4198  vector unsigned char vecPermX = vec_add( vec_lvsl( -1, &x[0] ), (vector unsigned char)(1) );
4199 
4200  // unrolled this loop a bit
4201  for ( i = skip; i+3 < n; i+=4 ) {
4202  sum = b[i];
4203  sum2 = b[i+1];
4204  sum3 = b[i+2];
4205  sum4 = b[i+3];
4206 
4207  vecSum1 = zeroVector;
4208  vecSum2 = zeroVector;
4209  vecSum3 = vecSum4 = vecSum5 = vecSum6 = vecSum7 = vecSum8 = zeroVector;
4210  lptr = L[i];
4211  lptr2 = L[i+1];
4212  lptr3 = L[i+2];
4213  lptr4 = L[i+3];
4214 
4215  vector unsigned char vecPermLptr1 = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
4216  vector unsigned char vecPermLptr2 = vec_add( vec_lvsl( -1, lptr2 ), (vector unsigned char)(1) );
4217  vector unsigned char vecPermLptr3 = vec_add( vec_lvsl( -1, lptr3 ), (vector unsigned char)(1) );
4218  vector unsigned char vecPermLptr4 = vec_add( vec_lvsl( -1, lptr4 ), (vector unsigned char)(1) );
4219 
4220  for ( j = 0 ; j+7 < i; j+=8 ) {
4221 
4222  v0 = vec_ld( 0, &x[j] );
4223  v1 = vec_ld( 15, &x[j] );
4224  vector float vecExtraX = vec_ld( 31, &x[j] );
4225  v0 = vec_perm( v0, v1, vecPermX );
4226  v1 = vec_perm( v1, vecExtraX, vecPermX );
4227 
4228  v2 = vec_ld( 0, lptr + j );
4229  v3 = vec_ld( 15, lptr + j );
4230  vector float vecExtra1 = vec_ld( 31, lptr + j );
4231  v2 = vec_perm( v2, v3, vecPermLptr1 );
4232  v3 = vec_perm( v3, vecExtra1, vecPermLptr1 );
4233 
4234  v4 = vec_ld( 0, lptr2 + j );
4235  v5 = vec_ld( 15, lptr2 + j );
4236  vector float vecExtra2 = vec_ld( 31, lptr2 + j );
4237  v4 = vec_perm( v4, v5, vecPermLptr2 );
4238  v5 = vec_perm( v5, vecExtra2, vecPermLptr2 );
4239 
4240  v6 = vec_ld( 0, lptr3 + j );
4241  v7 = vec_ld( 15, lptr3 + j );
4242  vector float vecExtra3 = vec_ld( 31, lptr3 + j );
4243  v6 = vec_perm( v6, v7, vecPermLptr3 );
4244  v7 = vec_perm( v7, vecExtra3, vecPermLptr3 );
4245 
4246  v8 = vec_ld( 0, lptr4 + j );
4247  v9 = vec_ld( 15, lptr4 + j );
4248  vector float vecExtra4 = vec_ld( 31, lptr4 + j );
4249  v8 = vec_perm( v8, v9, vecPermLptr4 );
4250  v9 = vec_perm( v9, vecExtra4, vecPermLptr4 );
4251 
4252  vecSum1 = vec_madd( v2, v0, vecSum1 );
4253  vecSum2 = vec_madd( v3, v1, vecSum2 );
4254 
4255  vecSum3 = vec_madd( v4, v0, vecSum3 );
4256  vecSum4 = vec_madd( v5, v1, vecSum4 );
4257 
4258  vecSum5 = vec_madd( v6, v0, vecSum5 );
4259  vecSum6 = vec_madd( v7, v1, vecSum6 );
4260 
4261  vecSum7 = vec_madd( v8, v0, vecSum7 );
4262  vecSum8 = vec_madd( v9, v1, vecSum8 );
4263  }
4264 
4265  // if we ran the unrolled code, we need to sum across the vectors
4266  // to find out how much to subtract from sum
4267  if ( j > 0 ) {
4268  vecSum1 = vec_add( vecSum1, vecSum2 );
4269  vecSum3 = vec_add( vecSum3, vecSum4 );
4270  vecSum5 = vec_add( vecSum5, vecSum6 );
4271  vecSum7 = vec_add( vecSum7, vecSum8 );
4272  //sum across the vectors
4273  vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
4274  vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
4275 
4276  vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 8 ) );
4277  vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 4 ) );
4278 
4279  vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 8 ) );
4280  vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 4 ) );
4281 
4282  vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 8 ) );
4283  vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 4 ) );
4284 
4285  //move the result to the FPU
4286  vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
4287  vec_ste( vec_splat( vecSum3, 0 ), 0, &tempSum2 );
4288  vec_ste( vec_splat( vecSum5, 0 ), 0, &tempSum3 );
4289  vec_ste( vec_splat( vecSum7, 0 ), 0, &tempSum4 );
4290 
4291  sum -= tempSum;
4292  sum2 -= tempSum2;
4293  sum3 -= tempSum3;
4294  sum4 -= tempSum4;
4295  }
4296 
4297  //cleanup
4298  for ( ; j < i; j++ ) {
4299  sum -= lptr[j] * x[j];
4300  sum2 -= lptr2[j] * x[j];
4301  sum3 -= lptr3[j] * x[j];
4302  sum4 -= lptr4[j] * x[j];
4303  }
4304 
4305  // resolve the dependencies between the 4 sums, then store the 4 results at once
4306  sum2 -= ( lptr2[i] * sum );
4307  sum3 = sum3 - ( lptr3[i+1] * sum2 ) - ( lptr3[i] * sum );
4308  sum4 = sum4 - ( lptr4[i+2] * sum3 ) - ( lptr4[i+1] * sum2 ) - ( lptr4[i] * sum );
4309 
4310  x[i] = sum;
4311  x[i+1] = sum2;
4312  x[i+2] = sum3;
4313  x[i+3] = sum4;
4314  }
4315 
4316  // cleanup
4317  for ( ; i < n; i++ ) {
4318  sum = b[i];
4319  vecSum1 = zeroVector;
4320  vecSum2 = zeroVector;
4321  lptr = L[i];
4322  vector unsigned char vecPermLptr = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
4323 
4324  for ( j = 0 ; j+7 < i; j+=8 ) {
4325 
4326  v0 = vec_ld( 0, &x[j] );
4327  v2 = vec_ld( 15, &x[j] );
4328  vector float vecExtraX = vec_ld( 31, &x[j] );
4329  v0 = vec_perm( v0, v2, vecPermX );
4330  v2 = vec_perm( v2, vecExtraX, vecPermX );
4331 
4332  v1 = vec_ld( 0, lptr + j );
4333  v3 = vec_ld( 15, lptr + j );
4334  vector float vecExtra = vec_ld( 31, lptr + j );
4335  v1 = vec_perm( v1, v3, vecPermLptr );
4336  v3 = vec_perm( v3, vecExtra, vecPermLptr );
4337 
4338  vecSum1 = vec_madd( v1, v0, vecSum1 );
4339  vecSum2 = vec_madd( v3, v2, vecSum2 );
4340  }
4341 
4342  // if we ran the unrolled code, we need to sum across the vectors
4343  // to find out how much to subtract from sum
4344  if ( j > 0 ) {
4345  //sum across the vectors
4346  vecSum1 = vec_add( vecSum1, vecSum2 );
4347  vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
4348  vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
4349 
4350  //move the result to the FPU
4351  vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
4352  sum -= tempSum;
4353  }
4354 
4355  //cleanup
4356  for ( ; j < i; j++ ) {
4357  sum -= lptr[j] * x[j];
4358  }
4359  x[i] = sum;
4360  }
4361 }
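
// For reference, the scalar algorithm the vector code above implements:
// forward substitution with an implicit unit diagonal, matching the
// function's own cleanup loop (a sketch, not the shipped implementation):
void MatX_LowerTriangularSolveScalarSketch( const idMatX &L, float *x, const float *b, const int n, int skip ) {
    for ( int i = skip; i < n; i++ ) {
        float sum = b[i];
        const float *lptr = L[i];
        for ( int j = 0; j < i; j++ ) {
            sum -= lptr[j] * x[j];
        }
        x[i] = sum;   // the diagonal is implicitly 1, so no divide
    }
}
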
4362 
4363 /*
4364 ============
4365 idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose
4366 
4367  solves x in L.Transpose() * x = b for the first n rows of L
4368  L has to be a lower triangular matrix with (implicit) ones on the diagonal
4369  x == b is allowed
4370 ============
4371 */
4372 void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
4373 
4374  int nc;
4375  const float *lptr;
4376 
4377  lptr = L.ToFloatPtr();
4378  nc = L.GetNumColumns();
4379 
4380  float x0, x1, x2, x3, x4, x5, x6;
4381  // unrolled cases for n < 8
4382  if ( n < 8 ) {
4383  switch( n ) {
4384  // using local variables to avoid aliasing issues
4385  case 0:
4386  return;
4387  case 1:
4388  x[0] = b[0];
4389  return;
4390  case 2:
4391  x1 = b[1];
4392  x0 = b[0] - lptr[1*nc+0] * x1;
4393 
4394  x[1] = x1;
4395  x[0] = x0;
4396  return;
4397  case 3:
4398  x2 = b[2];
4399  x1 = b[1] - lptr[2*nc+1] * x2;
4400  x0 = b[0] - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4401 
4402  x[2] = x2;
4403  x[1] = x1;
4404  x[0] = x0;
4405  return;
4406  case 4:
4407  x3 = b[3];
4408  x2 = b[2] - lptr[3*nc+2] * x3;
4409  x1 = b[1] - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
4410  x0 = b[0] - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4411 
4412  x[3] = x3;
4413  x[2] = x2;
4414  x[1] = x1;
4415  x[0] = x0;
4416 
4417  return;
4418  case 5:
4419  x4 = b[4];
4420  x3 = b[3] - lptr[4*nc+3] * x4;
4421  x2 = b[2] - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
4422  x1 = b[1] - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
4423  x0 = b[0] - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4424 
4425  x[4] = x4;
4426  x[3] = x3;
4427  x[2] = x2;
4428  x[1] = x1;
4429  x[0] = x0;
4430  return;
4431  case 6:
4432  x5 = b[5];
4433  x4 = b[4] - lptr[5*nc+4] * x5;
4434  x3 = b[3] - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
4435  x2 = b[2] - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
4436  x1 = b[1] - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
4437  x0 = b[0] - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4438 
4439  x[5] = x5;
4440  x[4] = x4;
4441  x[3] = x3;
4442  x[2] = x2;
4443  x[1] = x1;
4444  x[0] = x0;
4445 
4446  return;
4447  case 7:
4448  x6 = b[6];
4449  x5 = b[5] - lptr[6*nc+5] * x6;
4450  x4 = b[4] - lptr[6*nc+4] * x6 - lptr[5*nc+4] * x5;
4451  x3 = b[3] - lptr[6*nc+3] * x6 - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
4452  x2 = b[2] - lptr[6*nc+2] * x6 - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
4453  x1 = b[1] - lptr[6*nc+1] * x6 - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
4454  x0 = b[0] - lptr[6*nc+0] * x6 - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4455 
4456  x[6] = x6;
4457  x[5] = x5;
4458  x[4] = x4;
4459  x[3] = x3;
4460  x[2] = x2;
4461  x[1] = x1;
4462  x[0] = x0;
4463  return;
4464  }
4465  return;
4466  }
4467 
4468  int i, j;
4469  register float s0, s1, s2, s3;
4470  float *xptr;
4471 
4472  lptr = L.ToFloatPtr() + n * nc + n - 4;
4473  xptr = x + n;
4474 
4475  // process 4 rows at a time
4476  for ( i = n; i >= 4; i -= 4 ) {
4477  s0 = b[i-4];
4478  s1 = b[i-3];
4479  s2 = b[i-2];
4480  s3 = b[i-1];
4481  // process 4x4 blocks
4482  for ( j = 0; j < n-i; j += 4 ) {
4483  s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
4484  s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
4485  s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
4486  s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
4487  s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
4488  s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
4489  s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
4490  s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
4491  s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
4492  s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
4493  s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
4494  s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
4495  s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
4496  s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
4497  s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
4498  s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
4499  }
4500  // process left over of the 4 rows
4501  s0 -= lptr[0-1*nc] * s3;
4502  s1 -= lptr[1-1*nc] * s3;
4503  s2 -= lptr[2-1*nc] * s3;
4504  s0 -= lptr[0-2*nc] * s2;
4505  s1 -= lptr[1-2*nc] * s2;
4506  s0 -= lptr[0-3*nc] * s1;
4507  // store result
4508  xptr[-4] = s0;
4509  xptr[-3] = s1;
4510  xptr[-2] = s2;
4511  xptr[-1] = s3;
4512  // update pointers for next four rows
4513  lptr -= 4 + 4 * nc;
4514  xptr -= 4;
4515  }
4516  // process left over rows
4517  for ( i--; i >= 0; i-- ) {
4518  s0 = b[i];
4519  lptr = L[0] + i;
4520  for ( j = i + 1; j < n; j++ ) {
4521  s0 -= lptr[j*nc] * x[j];
4522  }
4523  x[i] = s0;
4524  }
4525 }
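
// Scalar reference for the transposed solve: backward substitution that
// walks column i of L (stride nc) as if it were row i of L's transpose,
// exactly as the leftover-rows loop above does (a sketch under the same
// assumptions, not the shipped implementation):
void MatX_LowerTriangularSolveTransposeScalarSketch( const idMatX &L, float *x, const float *b, const int n ) {
    int nc = L.GetNumColumns();
    for ( int i = n - 1; i >= 0; i-- ) {
        float sum = b[i];
        const float *lptr = L[0] + i;   // column i, walked with stride nc
        for ( int j = i + 1; j < n; j++ ) {
            sum -= lptr[j*nc] * x[j];
        }
        x[i] = sum;
    }
}
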
4526 
4527 /*
4528 ============
4529 idSIMD_AltiVec::MatX_LDLTFactor
4530 ============
4531 */
4532 bool VPCALL idSIMD_AltiVec::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
4533  int i, j, k, nc;
4534  float *v, *diag, *mptr;
4535  float s0, s1, s2, s3, sum, d;
4536  float s0_2, s1_2, s2_2, s3_2, sum_2;
4537  float *mptr2;
4538 
4539  v = (float *) _alloca16( n * sizeof( float ) );
4540  diag = (float *) _alloca16( n * sizeof( float ) );
4541 
4542  nc = mat.GetNumColumns();
4543 
4544  if ( n <= 0 ) {
4545  return true;
4546  }
4547 
4548  mptr = mat[0];
4549 
4550  sum = mptr[0];
4551 
4552  if ( sum == 0.0f ) {
4553  return false;
4554  }
4555 
4556  diag[0] = sum;
4557  invDiag[0] = d = 1.0f / sum;
4558 
4559  if ( n <= 1 ) {
4560  return true;
4561  }
4562 
4563  mptr = mat[0];
4564  for ( j = 1; j < n; j++ ) {
4565  mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
4566  }
4567 
4568  mptr = mat[1];
4569 
4570  v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
4571  sum = mptr[1] - s0;
4572 
4573  if ( sum == 0.0f ) {
4574  return false;
4575  }
4576 
4577  mat[1][1] = sum;
4578  diag[1] = sum;
4579  invDiag[1] = d = 1.0f / sum;
4580 
4581  if ( n <= 2 ) {
4582  return true;
4583  }
4584 
4585  mptr = mat[0];
4586  for ( j = 2; j < n; j++ ) {
4587  mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
4588  }
4589 
4590  mptr = mat[2];
4591 
4592  v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
4593  v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
4594  sum = mptr[2] - s0 - s1;
4595 
4596  if ( sum == 0.0f ) {
4597  return false;
4598  }
4599 
4600  mat[2][2] = sum;
4601  diag[2] = sum;
4602  invDiag[2] = d = 1.0f / sum;
4603 
4604  if ( n <= 3 ) {
4605  return true;
4606  }
4607 
4608  mptr = mat[0];
4609  for ( j = 3; j < n; j++ ) {
4610  mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
4611  }
4612 
4613  mptr = mat[3];
4614 
4615  v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
4616  v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
4617  v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
4618  sum = mptr[3] - s0 - s1 - s2;
4619 
4620  if ( sum == 0.0f ) {
4621  return false;
4622  }
4623 
4624  mat[3][3] = sum;
4625  diag[3] = sum;
4626  invDiag[3] = d = 1.0f / sum;
4627 
4628  if ( n <= 4 ) {
4629  return true;
4630  }
4631 
4632  mptr = mat[0];
4633  for ( j = 4; j < n; j++ ) {
4634  mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
4635  }
4636 
4637  for ( i = 4; i < n; i++ ) {
4638 
4639  mptr = mat[i];
4640 
4641  v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
4642  v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
4643  v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
4644  v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
4645  for ( k = 4; k < i-3; k += 4 ) {
4646  v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
4647  v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
4648  v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
4649  v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
4650  }
4651  switch( i - k ) {
4652  case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
4653  case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
4654  case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
4655  }
4656  sum = s3;
4657  sum += s2;
4658  sum += s1;
4659  sum += s0;
4660  sum = mptr[i] - sum;
4661 
4662  if ( sum == 0.0f ) {
4663  return false;
4664  }
4665 
4666  mat[i][i] = sum;
4667  diag[i] = sum;
4668  invDiag[i] = d = 1.0f / sum;
4669 
4670  if ( i + 1 >= n ) {
4671  return true;
4672  }
4673 
4674  // unrolling madness!
4675  mptr = mat[i+1];
4676  mptr2 = mat[i+1] + nc;
4677 
4678  for ( j = i+1; j+1 < n; j+=2 ) {
4679  s0 = mptr[0] * v[0];
4680  s1 = mptr[1] * v[1];
4681  s2 = mptr[2] * v[2];
4682  s3 = mptr[3] * v[3];
4683 
4684  s0_2 = mptr2[0] * v[0];
4685  s1_2 = mptr2[1] * v[1];
4686  s2_2 = mptr2[2] * v[2];
4687  s3_2 = mptr2[3] * v[3];
4688 
4689  for ( k = 4; k < i-7; k += 8 ) {
4690  s0 += mptr[k+0] * v[k+0];
4691  s1 += mptr[k+1] * v[k+1];
4692  s2 += mptr[k+2] * v[k+2];
4693  s3 += mptr[k+3] * v[k+3];
4694  s0 += mptr[k+4] * v[k+4];
4695  s1 += mptr[k+5] * v[k+5];
4696  s2 += mptr[k+6] * v[k+6];
4697  s3 += mptr[k+7] * v[k+7];
4698 
4699  s0_2 += mptr2[k+0] * v[k+0];
4700  s1_2 += mptr2[k+1] * v[k+1];
4701  s2_2 += mptr2[k+2] * v[k+2];
4702  s3_2 += mptr2[k+3] * v[k+3];
4703  s0_2 += mptr2[k+4] * v[k+4];
4704  s1_2 += mptr2[k+5] * v[k+5];
4705  s2_2 += mptr2[k+6] * v[k+6];
4706  s3_2 += mptr2[k+7] * v[k+7];
4707  }
4708 
4709  switch( i - k ) {
4710  case 7: s0 += mptr[k+6] * v[k+6]; s0_2 += mptr2[k+6] * v[k+6];
4711  case 6: s1 += mptr[k+5] * v[k+5]; s1_2 += mptr2[k+5] * v[k+5];
4712  case 5: s2 += mptr[k+4] * v[k+4]; s2_2 += mptr2[k+4] * v[k+4];
4713  case 4: s3 += mptr[k+3] * v[k+3]; s3_2 += mptr2[k+3] * v[k+3];
4714  case 3: s0 += mptr[k+2] * v[k+2]; s0_2 += mptr2[k+2] * v[k+2];
4715  case 2: s1 += mptr[k+1] * v[k+1]; s1_2 += mptr2[k+1] * v[k+1];
4716  case 1: s2 += mptr[k+0] * v[k+0]; s2_2 += mptr2[k+0] * v[k+0];
4717  }
4718  // disassociate these adds
4719  s3 += s2;
4720  s1 += s0;
4721  sum = s1 + s3;
4722 
4723  s3_2 += s2_2;
4724  s1_2 += s0_2;
4725  sum_2 = s1_2 + s3_2;
4726 
4727  mptr[i] = ( mptr[i] - sum ) * d;
4728  mptr2[i] = ( mptr2[i] - sum_2 ) * d;
4729 
4730  mptr += nc*2;
4731  mptr2 += nc*2;
4732  }
4733 
4734  // cleanup
4735  for ( ; j < n; j++ ) {
4736  s0 = mptr[0] * v[0];
4737  s1 = mptr[1] * v[1];
4738  s2 = mptr[2] * v[2];
4739  s3 = mptr[3] * v[3];
4740  for ( k = 4; k < i-7; k += 8 ) {
4741  s0 += mptr[k+0] * v[k+0];
4742  s1 += mptr[k+1] * v[k+1];
4743  s2 += mptr[k+2] * v[k+2];
4744  s3 += mptr[k+3] * v[k+3];
4745  s0 += mptr[k+4] * v[k+4];
4746  s1 += mptr[k+5] * v[k+5];
4747  s2 += mptr[k+6] * v[k+6];
4748  s3 += mptr[k+7] * v[k+7];
4749  }
4750  switch( i - k ) {
4751  case 7: s0 += mptr[k+6] * v[k+6];
4752  case 6: s1 += mptr[k+5] * v[k+5];
4753  case 5: s2 += mptr[k+4] * v[k+4];
4754  case 4: s3 += mptr[k+3] * v[k+3];
4755  case 3: s0 += mptr[k+2] * v[k+2];
4756  case 2: s1 += mptr[k+1] * v[k+1];
4757  case 1: s2 += mptr[k+0] * v[k+0];
4758  }
4759  // disassociate these adds
4760  s3 += s2;
4761  s1 += s0;
4762  sum = s1 + s3;
4763  mptr[i] = ( mptr[i] - sum ) * d;
4764  mptr += nc;
4765  }
4766  }
4767  return true;
4768 }
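
// The factorization above computes A = L * D * L^T in place: the entries of
// L (unit diagonal implicit) land below the diagonal, D lands on the
// diagonal, and invDiag receives 1/D. A compact scalar sketch of the same
// recurrence (illustrative only; the shipped version unrolls and pairs rows):
bool MatX_LDLTFactorScalarSketch( idMatX &mat, idVecX &invDiag, const int n ) {
    float *v = (float *) _alloca16( n * sizeof( float ) );
    for ( int i = 0; i < n; i++ ) {
        float *mptr = mat[i];
        // D[i] = A[i][i] - sum over k < i of L[i][k]^2 * D[k]
        float sum = mptr[i];
        for ( int k = 0; k < i; k++ ) {
            v[k] = mat[k][k] * mptr[k];   // D[k] * L[i][k]
            sum -= v[k] * mptr[k];
        }
        if ( sum == 0.0f ) {
            return false;                 // factorization fails on a zero pivot
        }
        mat[i][i] = sum;                  // store D[i]
        invDiag[i] = 1.0f / sum;
        // L[j][i] = ( A[j][i] - sum over k < i of L[j][k] * v[k] ) / D[i]
        for ( int j = i + 1; j < n; j++ ) {
            float *row = mat[j];
            float s = row[i];
            for ( int k = 0; k < i; k++ ) {
                s -= row[k] * v[k];
            }
            row[i] = s * invDiag[i];
        }
    }
    return true;
}
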
4769 #endif /* ENABLE_LOWER_TRIANGULAR */
4770 
4771 
4772 #ifdef LIVE_VICARIOUSLY
4773 /*
4774 ============
4775 idSIMD_AltiVec::BlendJoints
4776 ============
4777 */
4778 void VPCALL idSIMD_AltiVec::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
4779  int i;
4780 
4781  // since lerp is a constant, we can special-case the two trivial values
4782  if ( lerp <= 0.0f ) {
4783  // this leaves joints unchanged, so there's no work to do; just return
4784  return;
4785  }
4786 
4787  if ( lerp >= 1.0f ) {
4788  // this makes joints an exact copy of blendJoints (every q and every t), so one memcpy suffices
4789  memcpy( joints[0].q.ToFloatPtr(), blendJoints[0].q.ToFloatPtr(), sizeof(idJointQuat) * numJoints );
4790  return;
4791  }
4792 
4793  vector float vecLerp = loadSplatUnalignedScalar( &lerp );
4794  vector float zeroVector = (vector float)(0);
4795 
4796  for ( i = 0; i+3 < numJoints; i+=4 ) {
4797  int j = index[i];
4798  int j2 = index[i+1];
4799  int j3 = index[i+2];
4800  int j4 = index[i+3];
4801 
4802  // slerp
4803  const float *jointPtr = joints[j].q.ToFloatPtr();
4804  const float *blendPtr = blendJoints[j].q.ToFloatPtr();
4805  const float *jointPtr2 = joints[j2].q.ToFloatPtr();
4806  const float *blendPtr2 = blendJoints[j2].q.ToFloatPtr();
4807  const float *jointPtr3 = joints[j3].q.ToFloatPtr();
4808  const float *blendPtr3 = blendJoints[j3].q.ToFloatPtr();
4809  const float *jointPtr4 = joints[j4].q.ToFloatPtr();
4810  const float *blendPtr4 = blendJoints[j4].q.ToFloatPtr();
4811 
4812  vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
4813  vector unsigned char permVec2 = vec_add( vec_lvsl( -1, jointPtr2 ), (vector unsigned char)(1) );
4814  vector unsigned char permVec3 = vec_add( vec_lvsl( -1, jointPtr3 ), (vector unsigned char)(1) );
4815  vector unsigned char permVec4 = vec_add( vec_lvsl( -1, jointPtr4 ), (vector unsigned char)(1) );
4816 
4817  vector unsigned char permVec5 = vec_add( vec_lvsl( -1, blendPtr ), (vector unsigned char)(1) );
4818  vector unsigned char permVec6 = vec_add( vec_lvsl( -1, blendPtr2 ), (vector unsigned char)(1) );
4819  vector unsigned char permVec7 = vec_add( vec_lvsl( -1, blendPtr3 ), (vector unsigned char)(1) );
4820  vector unsigned char permVec8 = vec_add( vec_lvsl( -1, blendPtr4 ), (vector unsigned char)(1) );
4821 
4822  vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
4823  vector float v12, v13, v14, v15, v16;
4824  vector float vecFromX, vecFromY, vecFromZ, vecFromW;
4825  vector float vecToX, vecToY, vecToZ, vecToW;
4826 
4827  // load up the idJointQuats from joints
4828  v0 = vec_ld( 0, jointPtr );
4829  v1 = vec_ld( 15, jointPtr );
4830  v2 = vec_perm( v0, v1, permVec );
4831 
4832  v3 = vec_ld( 0, jointPtr2 );
4833  v4 = vec_ld( 15, jointPtr2 );
4834  v5 = vec_perm( v3, v4, permVec2 );
4835 
4836  v6 = vec_ld( 0, jointPtr3 );
4837  v7 = vec_ld( 15, jointPtr3 );
4838  v8 = vec_perm( v6, v7, permVec3 );
4839 
4840  v9 = vec_ld( 0, jointPtr4 );
4841  v10 = vec_ld( 15, jointPtr4 );
4842  v11 = vec_perm( v9, v10, permVec4 );
4843 
4844  // planarizing, so put each x y z w into its own vector
4845  v0 = vec_mergeh( v2, v8 );
4846  v1 = vec_mergeh( v5, v11 );
4847  v3 = vec_mergel( v2, v8 );
4848  v4 = vec_mergel( v5, v11 );
4849 
4850  vecFromX = vec_mergeh( v0, v1 );
4851  vecFromY = vec_mergel( v0, v1 );
4852  vecFromZ = vec_mergeh( v3, v4 );
4853  vecFromW = vec_mergel( v3, v4 );
4854 
4855  // load up idJointQuats from blendJoints
4856  v5 = vec_ld( 0, blendPtr );
4857  v6 = vec_ld( 15, blendPtr );
4858  v7 = vec_perm( v5, v6, permVec5 );
4859 
4860  v8 = vec_ld( 0, blendPtr2 );
4861  v9 = vec_ld( 15, blendPtr2 );
4862  v10 = vec_perm( v8, v9, permVec6 );
4863 
4864  v11 = vec_ld( 0, blendPtr3 );
4865  v12 = vec_ld( 15, blendPtr3 );
4866  v13 = vec_perm( v11, v12, permVec7 );
4867 
4868  v14 = vec_ld( 0, blendPtr4 );
4869  v15 = vec_ld( 15, blendPtr4 );
4870  v16 = vec_perm( v14, v15, permVec8 );
4871 
4872  // put these into their own vectors too
4873  v5 = vec_mergeh( v7, v13 );
4874  v6 = vec_mergeh( v10, v16 );
4875  v8 = vec_mergel( v7, v13 );
4876  v9 = vec_mergel( v10, v16 );
4877 
4878  vecToX = vec_mergeh( v5, v6 );
4879  vecToY = vec_mergel( v5, v6 );
4880  vecToZ = vec_mergeh( v8, v9 );
4881  vecToW = vec_mergel( v8, v9 );
4882 
4883  // calculate cosom
4884  vector float vecCosom = vec_madd( vecFromX, vecToX, (vector float)(0) );
4885  vecCosom = vec_madd( vecFromY, vecToY, vecCosom );
4886  vecCosom = vec_madd( vecFromZ, vecToZ, vecCosom );
4887  vecCosom = vec_madd( vecFromW, vecToW, vecCosom );
4888 
4889  // if cosom < 0, negate it and set temp to the negated elements of 'to';
4890  // otherwise set temp to 'to' as-is
4891  vector bool int vecCmp, vecCmp2;
4892  vecCmp = vec_cmplt( vecCosom, zeroVector );
4893 
4894  // negate if needed
4895  vecToX = vec_sel( vecToX, vec_madd( vecToX, (vector float)(-1), zeroVector ), vecCmp );
4896  vecToY = vec_sel( vecToY, vec_madd( vecToY, (vector float)(-1), zeroVector ), vecCmp );
4897  vecToZ = vec_sel( vecToZ, vec_madd( vecToZ, (vector float)(-1), zeroVector ), vecCmp );
4898  vecToW = vec_sel( vecToW, vec_madd( vecToW, (vector float)(-1), zeroVector ), vecCmp );
4899  vecCosom = vec_sel( vecCosom, vec_madd( vecCosom, (vector float)(-1), zeroVector ), vecCmp );
4900 
4901  // check if we need to calculate scale
4902  vecCmp2 = vec_cmpgt( vec_sub( (vector float)(1), vecCosom ), (vector float)(1e-6f) );
4903  vector float vecScale0 = vec_sub( (vector float)(1), vecLerp );
4904  vector float vecScale1 = vec_splat( vecLerp, 0 );
4905 
4906  vector float vecWork1 = vec_sub( (vector float)(1), vec_madd( vecCosom, vecCosom, zeroVector ) );
4907  vector float vecWork2 = ReciprocalSquareRoot( vecWork1 );
4908  vector float vecWork3 = VectorATan16( vec_madd( vecWork1, vecWork2, zeroVector ), vecCosom );
4909 
4910  vecWork1 = vec_madd( VectorSin16( vec_madd( vecScale0, vecWork3, zeroVector ) ), vecWork2, zeroVector );
4911  vecWork2 = vec_madd( VectorSin16( vec_madd( vecLerp, vecWork3, zeroVector ) ), vecWork2, zeroVector );
4912 
4913  // see which ones we have to insert into our scale0 and scale1 vectors
4914  vecScale0 = vec_sel( vecScale0, vecWork1, vecCmp2 );
4915  vecScale1 = vec_sel( vecScale1, vecWork2, vecCmp2 );
4916 
4917  // multiply each element by the scale
4918  vecFromX = vec_madd( vecFromX, vecScale0, zeroVector );
4919  vecFromY = vec_madd( vecFromY, vecScale0, zeroVector );
4920  vecFromZ = vec_madd( vecFromZ, vecScale0, zeroVector );
4921  vecFromW = vec_madd( vecFromW, vecScale0, zeroVector );
4922 
4923  // multiply temp by scale and add to result
4924  vecFromX = vec_madd( vecToX, vecScale1, vecFromX );
4925  vecFromY = vec_madd( vecToY, vecScale1, vecFromY );
4926  vecFromZ = vec_madd( vecToZ, vecScale1, vecFromZ );
4927  vecFromW = vec_madd( vecToW, vecScale1, vecFromW );
4928 
4929  // do a transform again to get the results back to vectors we can store out
4930  v5 = vec_mergeh( vecFromX, vecFromZ );
4931  v6 = vec_mergeh( vecFromY, vecFromW );
4932  v8 = vec_mergel( vecFromX, vecFromZ );
4933  v9 = vec_mergel( vecFromY, vecFromW );
4934 
4935  vecToX = vec_mergeh( v5, v6 );
4936  vecToY = vec_mergel( v5, v6 );
4937  vecToZ = vec_mergeh( v8, v9 );
4938  vecToW = vec_mergel( v8, v9 );
4939 
4940  vector unsigned char storePerm1 = vec_lvsr( 0, jointPtr );
4941  vector unsigned char storePerm2 = vec_lvsr( 0, jointPtr2 );
4942  vector unsigned char storePerm3 = vec_lvsr( 0, jointPtr3 );
4943  vector unsigned char storePerm4 = vec_lvsr( 0, jointPtr4 );
4944 
4945  // right rotate the input data
4946  vecToX = vec_perm( vecToX, vecToX, storePerm1 );
4947  vecToY = vec_perm( vecToY, vecToY, storePerm2 );
4948  vecToZ = vec_perm( vecToZ, vecToZ, storePerm3 );
4949  vecToW = vec_perm( vecToW, vecToW, storePerm4 );
4950 
4951  vec_ste( vecToX, 0, (float*) jointPtr );
4952  vec_ste( vecToX, 4, (float*) jointPtr );
4953  vec_ste( vecToX, 8, (float*) jointPtr );
4954  vec_ste( vecToX, 12, (float*) jointPtr );
4955 
4956  vec_ste( vecToY, 0, (float*) jointPtr2 );
4957  vec_ste( vecToY, 4, (float*) jointPtr2 );
4958  vec_ste( vecToY, 8, (float*) jointPtr2 );
4959  vec_ste( vecToY, 12, (float*) jointPtr2 );
4960 
4961  vec_ste( vecToZ, 0, (float*) jointPtr3 );
4962  vec_ste( vecToZ, 4, (float*) jointPtr3 );
4963  vec_ste( vecToZ, 8, (float*) jointPtr3 );
4964  vec_ste( vecToZ, 12, (float*) jointPtr3 );
4965 
4966  vec_ste( vecToW, 0, (float*) jointPtr4 );
4967  vec_ste( vecToW, 4, (float*) jointPtr4 );
4968  vec_ste( vecToW, 8, (float*) jointPtr4 );
4969  vec_ste( vecToW, 12, (float*) jointPtr4 );
4970 
4971  // lerp is v1 + l * ( v2 - v1 );
4972  // the idVec3 t starts 16 bytes (4 floats) past the start of q, so we can reach it as jointPtr + 4 without calling ToFloatPtr() again
4973  float *jointVecPtr = (float*)( jointPtr + 4 );
4974  float *jointVecPtr2 = (float*)( jointPtr2 + 4 );
4975  float *jointVecPtr3 = (float*)( jointPtr3 + 4 );
4976  float *jointVecPtr4 = (float*)( jointPtr4 + 4 );
4977 
4978  v0 = vec_ld( 0, jointVecPtr );
4979  v1 = vec_ld( 11, jointVecPtr );
4980  vector float vecLd1 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, jointVecPtr ), (vector unsigned char)(1) ) );
4981 
4982  v2 = vec_ld( 0, jointVecPtr2 );
4983  v3 = vec_ld( 11, jointVecPtr2 );
4984  vector float vecLd2 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, jointVecPtr2 ), (vector unsigned char)(1) ) );
4985 
4986  v4 = vec_ld( 0, jointVecPtr3 );
4987  v5 = vec_ld( 11, jointVecPtr3 );
4988  vector float vecLd3 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, jointVecPtr3 ), (vector unsigned char)(1) ) );
4989 
4990  v6 = vec_ld( 0, jointVecPtr4 );
4991  v7 = vec_ld( 11, jointVecPtr4 );
4992  vector float vecLd4 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, jointVecPtr4 ), (vector unsigned char)(1) ) );
4993 
4994  vector float vecVecX, vecVecY, vecVecZ;
4995  vecVecX = vecVecY = vecVecZ = zeroVector;
4996 
4997  // planarize
4998  v0 = vec_mergeh( vecLd1, vecLd3 );
4999  v1 = vec_mergeh( vecLd2, vecLd4 );
5000  v3 = vec_mergel( vecLd1, vecLd3 );
5001  v4 = vec_mergel( vecLd2, vecLd4 );
5002 
5003  vecVecX = vec_mergeh( v0, v1 );
5004  vecVecY = vec_mergel( v0, v1 );
5005  vecVecZ = vec_mergeh( v3, v4 );
5006 
5007  // load the blend joints' idVec3 t values
5008  float *blendVecPtr = (float*)( blendPtr + 4 );
5009  float *blendVecPtr2 =(float*)( blendPtr2 + 4 );
5010  float *blendVecPtr3 = (float*)( blendPtr3 + 4 );
5011  float *blendVecPtr4 = (float*)( blendPtr4 + 4 );
5012 
5013  v0 = vec_ld( 0, blendVecPtr );
5014  v1 = vec_ld( 11, blendVecPtr );
5015  vector float vecLd5 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, blendVecPtr ), (vector unsigned char)(1) ) );
5016 
5017  v2 = vec_ld( 0, blendVecPtr2 );
5018  v3 = vec_ld( 11, blendVecPtr2 );
5019  vector float vecLd6 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, blendVecPtr2 ), (vector unsigned char)(1) ) );
5020 
5021  v4 = vec_ld( 0, blendVecPtr3 );
5022  v5 = vec_ld( 11, blendVecPtr3 );
5023  vector float vecLd7 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, blendVecPtr3 ), (vector unsigned char)(1) ) );
5024 
5025  v6 = vec_ld( 0, blendVecPtr4 );
5026  v7 = vec_ld( 11, blendVecPtr4 );
5027  vector float vecLd8 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, blendVecPtr4 ), (vector unsigned char)(1) ) );
5028 
5029  vector float vecBlendX, vecBlendY, vecBlendZ;
5030  vecBlendX = vecBlendY = vecBlendZ = zeroVector;
5031 
5032  // planarize
5033  v0 = vec_mergeh( vecLd5, vecLd7 );
5034  v1 = vec_mergeh( vecLd6, vecLd8 );
5035  v3 = vec_mergel( vecLd5, vecLd7 );
5036  v4 = vec_mergel( vecLd6, vecLd8 );
5037 
5038  vecBlendX = vec_mergeh( v0, v1 );
5039  vecBlendY = vec_mergel( v0, v1 );
5040  vecBlendZ = vec_mergeh( v3, v4 );
5041 
5042  // do subtraction
5043  vecWork1 = vec_sub( vecBlendX, vecVecX );
5044  vecWork2 = vec_sub( vecBlendY, vecVecY );
5045  vecWork3 = vec_sub( vecBlendZ, vecVecZ );
5046 
5047  // multiply by lerp and add to v1
5048  vecVecX = vec_madd( vecWork1, vecLerp, vecVecX );
5049  vecVecY = vec_madd( vecWork2, vecLerp, vecVecY );
5050  vecVecZ = vec_madd( vecWork3, vecLerp, vecVecZ );
5051 
5052  // put it back in original form
5053  v0 = vec_mergeh( vecVecX, vecVecZ );
5054  v1 = vec_mergeh( vecVecY, zeroVector );
5055  v3 = vec_mergel( vecVecX, vecVecZ );
5056  v4 = vec_mergel( vecVecY, zeroVector );
5057 
5058  // generate vectors to store
5059  vecWork1 = vec_mergeh( v0, v1 );
5060  vecWork2 = vec_mergel( v0, v1 );
5061  vecWork3 = vec_mergeh( v3, v4 );
5062  vector float vecWork4 = vec_mergel( v3, v4 );
5063 
5064  // store the T values
5065  storePerm1 = vec_lvsr( 0, jointVecPtr );
5066  storePerm2 = vec_lvsr( 0, jointVecPtr2 );
5067  storePerm3 = vec_lvsr( 0, jointVecPtr3 );
5068  storePerm4 = vec_lvsr( 0, jointVecPtr4 );
5069 
5070  // right rotate the input data
5071  vecWork1 = vec_perm( vecWork1, vecWork1, storePerm1 );
5072  vecWork2 = vec_perm( vecWork2, vecWork2, storePerm2 );
5073  vecWork3 = vec_perm( vecWork3, vecWork3, storePerm3 );
5074  vecWork4 = vec_perm( vecWork4, vecWork4, storePerm4 );
5075 
5076  vec_ste( vecWork1, 0, (float*) jointVecPtr );
5077  vec_ste( vecWork1, 4, (float*) jointVecPtr );
5078  vec_ste( vecWork1, 8, (float*) jointVecPtr );
5079 
5080  vec_ste( vecWork2, 0, (float*) jointVecPtr2 );
5081  vec_ste( vecWork2, 4, (float*) jointVecPtr2 );
5082  vec_ste( vecWork2, 8, (float*) jointVecPtr2 );
5083 
5084  vec_ste( vecWork3, 0, (float*) jointVecPtr3 );
5085  vec_ste( vecWork3, 4, (float*) jointVecPtr3 );
5086  vec_ste( vecWork3, 8, (float*) jointVecPtr3 );
5087 
5088  vec_ste( vecWork4, 0, (float*) jointVecPtr4 );
5089  vec_ste( vecWork4, 4, (float*) jointVecPtr4 );
5090  vec_ste( vecWork4, 8, (float*) jointVecPtr4 );
5091  }
5092 
5093  // cleanup
5094  for ( ; i < numJoints; i++ ) {
5095  int j = index[i];
5096  joints[j].q.Slerp( joints[j].q, blendJoints[j].q, lerp );
5097  joints[j].t.Lerp( joints[j].t, blendJoints[j].t, lerp );
5098  }
5099 }
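
// A note on the load idiom used above and throughout this file: vec_ld truncates
// the address to a 16-byte boundary, so a potentially misaligned vector is
// assembled from two aligned loads fused by a permute. A minimal sketch of the
// idiom follows (an illustration under that assumption -- not a helper that
// exists in this file):

static inline vector float LoadUnaligned4Floats( const float *p ) {
	vector float lo = vec_ld( 0, p );	// aligned block containing p
	vector float hi = vec_ld( 15, p );	// aligned block containing the last byte needed
	// biasing lvsl by ( -1, +1 ) keeps the idiom correct when p happens to be
	// 16-byte aligned without ever reading past the last byte required, which
	// is also why the 12-byte idVec3 loads above use vec_ld( 11, ... )
	vector unsigned char perm = vec_add( vec_lvsl( -1, p ), (vector unsigned char)(1) );
	return vec_perm( lo, hi, perm );
}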
5100 
5101 /*
5102 ============
5103 idSIMD_AltiVec::ConvertJointQuatsToJointMats
5104 ============
5105 */
5106 
5107 // SSE doesn't vectorize this, and I don't think we should either. It's mainly just copying data; there's very little math involved,
5108 // and it's not easily parallelizable.
5109 void VPCALL idSIMD_AltiVec::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
5110 
5111  for ( int i = 0; i < numJoints; i++ ) {
5112 
5113  const float *q = jointQuats[i].q.ToFloatPtr();
5114  float *m = jointMats[i].ToFloatPtr();
5115 
5116  m[0*4+3] = q[4];
5117  m[1*4+3] = q[5];
5118  m[2*4+3] = q[6];
5119 
5120  float x2 = q[0] + q[0];
5121  float y2 = q[1] + q[1];
5122  float z2 = q[2] + q[2];
5123 
5124  {
5125  float xx = q[0] * x2;
5126  float yy = q[1] * y2;
5127  float zz = q[2] * z2;
5128 
5129  m[0*4+0] = 1.0f - yy - zz;
5130  m[1*4+1] = 1.0f - xx - zz;
5131  m[2*4+2] = 1.0f - xx - yy;
5132  }
5133 
5134  {
5135  float yz = q[1] * z2;
5136  float wx = q[3] * x2;
5137 
5138  m[2*4+1] = yz - wx;
5139  m[1*4+2] = yz + wx;
5140  }
5141 
5142  {
5143  float xy = q[0] * y2;
5144  float wz = q[3] * z2;
5145 
5146  m[1*4+0] = xy - wz;
5147  m[0*4+1] = xy + wz;
5148  }
5149 
5150  {
5151  float xz = q[0] * z2;
5152  float wy = q[3] * y2;
5153 
5154  m[0*4+2] = xz - wy;
5155  m[2*4+0] = xz + wy;
5156  }
5157  }
5158 }
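
// For reference, with a unit quaternion q = ( x, y, z, w ) = ( q[0], q[1], q[2], q[3] ),
// the loop above fills the row-major 3x4 matrix m[r*4+c] with the rotation matrix
//
//	| 1-2(y*y+z*z)   2(xy+wz)       2(xz-wy)     |
//	| 2(xy-wz)       1-2(x*x+z*z)   2(yz+wx)     |
//	| 2(xz+wy)       2(yz-wx)       1-2(x*x+y*y) |
//
// and puts the translation q[4..6] into column 3.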
5159 
5160 /*
5161 ============
5162 idSIMD_AltiVec::ConvertJointMatsToJointQuats
5163 ============
5164 */
5165 void VPCALL idSIMD_AltiVec::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
5166 
5167  int index;
5168 
5169  // Since we use very little of the data we have to pull in, the altivec version ends up with
5170  // a lot of wasted math. Rather than try to force it to use altivec, I wrote an optimized version
5171  // of InvSqrt for the G5 and made this function use that instead. With only this change, we get a
5172  // little better than a 50% speedup, which is not too shabby. We should really replace idMath::InvSqrt
5173  // with my function so everyone can benefit on the G5.
5174 
5175  for ( index = 0; index < numJoints; index++ ) {
5176 
5177  idJointQuat jq;
5178  float trace;
5179  float s;
5180  float t;
5181  int i;
5182  int j;
5183  int k;
5184 
5185  static int next[3] = { 1, 2, 0 };
5186 
5187  float *mat = (float*)( jointMats[index].ToFloatPtr() );
5188  trace = mat[0 * 4 + 0] + mat[1 * 4 + 1] + mat[2 * 4 + 2];
5189 
5190  if ( trace > 0.0f ) {
5191 
5192  t = trace + 1.0f;
5193  //s = idMath::InvSqrt( t ) * 0.5f;
5194  s = FastScalarInvSqrt( t ) * 0.5f;
5195 
5196  jq.q[3] = s * t;
5197  jq.q[0] = ( mat[1 * 4 + 2] - mat[2 * 4 + 1] ) * s;
5198  jq.q[1] = ( mat[2 * 4 + 0] - mat[0 * 4 + 2] ) * s;
5199  jq.q[2] = ( mat[0 * 4 + 1] - mat[1 * 4 + 0] ) * s;
5200 
5201  } else {
5202 
5203  i = 0;
5204  if ( mat[1 * 4 + 1] > mat[0 * 4 + 0] ) {
5205  i = 1;
5206  }
5207  if ( mat[2 * 4 + 2] > mat[i * 4 + i] ) {
5208  i = 2;
5209  }
5210  j = next[i];
5211  k = next[j];
5212 
5213  t = ( mat[i * 4 + i] - ( mat[j * 4 + j] + mat[k * 4 + k] ) ) + 1.0f;
5214  //s = idMath::InvSqrt( t ) * 0.5f;
5215  s = FastScalarInvSqrt( t ) * 0.5f;
5216 
5217  jq.q[i] = s * t;
5218  jq.q[3] = ( mat[j * 4 + k] - mat[k * 4 + j] ) * s;
5219  jq.q[j] = ( mat[i * 4 + j] + mat[j * 4 + i] ) * s;
5220  jq.q[k] = ( mat[i * 4 + k] + mat[k * 4 + i] ) * s;
5221  }
5222 
5223  jq.t[0] = mat[0 * 4 + 3];
5224  jq.t[1] = mat[1 * 4 + 3];
5225  jq.t[2] = mat[2 * 4 + 3];
5226  jointQuats[index] = jq;
5227  }
5228 }
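
// FastScalarInvSqrt is defined earlier in the file (outside this excerpt). The
// usual shape of such a routine on PPC is a hardware reciprocal-square-root
// estimate refined by Newton-Raphson; a generic sketch of one refinement step
// (an illustration, not the actual body of FastScalarInvSqrt):

static float RefineInvSqrt( float t, float estimate ) {
	// one Newton-Raphson step for f(e) = 1/(e*e) - t roughly doubles the
	// number of correct bits: e' = e * ( 1.5 - 0.5 * t * e * e )
	return estimate * ( 1.5f - 0.5f * t * estimate * estimate );
}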
5229 
5230 /*
5231 ============
5232 idSIMD_AltiVec::TransformJoints
5233 ============
5234 */
5235 void VPCALL idSIMD_AltiVec::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
5236  int i;
5237 #if 0
5238  for( i = firstJoint; i <= lastJoint; i++ ) {
5239  assert( parents[i] < i );
5240  jointMats[i] *= jointMats[parents[i]];
5241  }
5242 #else
5243 
5244  // I don't think you can unroll this, since the next iteration of the loop might depend on the previous
5245  // iteration, depending on what the parents array looks like. This is true in the test code.
5246  for ( i = firstJoint; i <= lastJoint; i++ ) {
5247  assert( parents[i] < i );
5248  float *jointPtr = jointMats[i].ToFloatPtr();
5249  float *parentPtr = jointMats[parents[i]].ToFloatPtr();
5250 
5251  vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
5252  vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
5253  vector float v0, v1, v2, v3, v4, v5, v6, v7;
5254 
5255  // we need to load up 12 float elements that make up the Mat
5256  v0 = vec_ld( 0, jointPtr );
5257  v1 = vec_ld( 15, jointPtr );
5258  v2 = vec_ld( 31, jointPtr );
5259  v3 = vec_ld( 47, jointPtr );
5260 
5261  // load parents
5262  v4 = vec_ld( 0, parentPtr );
5263  v5 = vec_ld( 15, parentPtr );
5264  v6 = vec_ld( 31, parentPtr );
5265  v7 = vec_ld( 47, parentPtr );
5266 
5267  // permute into vectors
5268  vector float vecJointMat1 = vec_perm( v0, v1, permVec );
5269  vector float vecJointMat2 = vec_perm( v1, v2, permVec );
5270  vector float vecJointMat3 = vec_perm( v2, v3, permVec );
5271 
5272  vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
5273  vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
5274  vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );
5275 
5276  vector float zero = (vector float)(0);
5277  vector float C1, C2, C3;
5278 
5279  // matrix multiply
5280  C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero ); // m(0 to 3) * a(0)
5281  C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat2, 0 ), zero ); // m(4 to 7) * a(4)
5282  C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat3, 0 ), zero ); // m(8 to 11) * a(8)
5283 
5284  C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat1, 1 ), C1 ); // add in m(4 to 7) * a(1)
5285  C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 ); // add in m(4 to 7) * a(5)
5286  C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat3, 1 ), C3 ); // add in m(4 to 7) * a(9)
5287 
5288  C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat1, 2 ), C1 );
5289  C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat2, 2 ), C2 );
5290  C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
5291 
5292  // do the addition at the end
5293  vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
5294  C1 = vec_add( C1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
5295  C2 = vec_add( C2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
5296  C3 = vec_add( C3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );
5297 
5298  // store results
5299  UNALIGNED_STORE3( (float*) jointPtr, C1, C2, C3 );
5300  }
5301 #endif
5302 }
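
// Written out in scalar form, the splat/madd sequence above concatenates two
// 3x4 affine transforms as joint = parent * joint: the rotation rows multiply,
// and the parent translation enters the last column. A sketch (out must not
// alias c here, unlike the in-place vector version):

static void MulJointMat3x4( float out[12], const float p[12], const float c[12] ) {
	for ( int r = 0; r < 3; r++ ) {
		for ( int col = 0; col < 4; col++ ) {
			out[r*4+col] = p[r*4+0] * c[0*4+col] +
						   p[r*4+1] * c[1*4+col] +
						   p[r*4+2] * c[2*4+col];
		}
		out[r*4+3] += p[r*4+3];		// parent translation
	}
}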
5303 
5304 /*
5305 ============
5306 idSIMD_AltiVec::UntransformJoints
5307 ============
5308 */
5309 void VPCALL idSIMD_AltiVec::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
5310  int i;
5311 #if 0
5312  for( i = lastJoint; i >= firstJoint; i-- ) {
5313  assert( parents[i] < i );
5314  jointMats[i] /= jointMats[parents[i]];
5315  }
5316 #else
5317  // I don't think you can unroll this, since the next iteration of the loop might depend on the previous
5318  // iteration, depending on what the parents array looks like. This is true in the test code.
5319  for ( i = lastJoint; i >= firstJoint; i-- ) {
5320  assert( parents[i] < i );
5321  float *jointPtr = jointMats[i].ToFloatPtr();
5322  float *parentPtr = jointMats[parents[i]].ToFloatPtr();
5323 
5324  vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
5325  vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
5326  vector float v0, v1, v2, v3, v4, v5, v6, v7;
5327 
5328  // we need to load up 12 float elements that make up the Mat
5329  v0 = vec_ld( 0, jointPtr );
5330  v1 = vec_ld( 15, jointPtr );
5331  v2 = vec_ld( 31, jointPtr );
5332  v3 = vec_ld( 47, jointPtr );
5333 
5334  // load parents
5335  v4 = vec_ld( 0, parentPtr );
5336  v5 = vec_ld( 15, parentPtr );
5337  v6 = vec_ld( 31, parentPtr );
5338  v7 = vec_ld( 47, parentPtr );
5339 
5340  // permute into vectors
5341  vector float vecJointMat1 = vec_perm( v0, v1, permVec );
5342  vector float vecJointMat2 = vec_perm( v1, v2, permVec );
5343  vector float vecJointMat3 = vec_perm( v2, v3, permVec );
5344 
5345  vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
5346  vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
5347  vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );
5348 
5349  vector float zero = (vector float)(0);
5350  vector float C1, C2, C3;
5351 
5352  // do subtraction at the beginning
5353  vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
5354  vecJointMat1 = vec_sub( vecJointMat1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
5355  vecJointMat2 = vec_sub( vecJointMat2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
5356  vecJointMat3 = vec_sub( vecJointMat3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );
5357 
5358  // matrix multiply
5359  C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero );
5360  C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 1 ), zero );
5361  C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 2 ), zero );
5362 
5363  C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 0 ), C1 );
5364  C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 );
5365  C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 2 ), C3 );
5366 
5367  C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 0 ), C1 );
5368  C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 1 ), C2 );
5369  C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
5370 
5371  // store results back
5372  vector unsigned char storePerm = vec_lvsr( 0, jointPtr );
5373 
5374  // right rotate the input data
5375  C1 = vec_perm( C1, C1, storePerm );
5376  C2 = vec_perm( C2, C2, storePerm );
5377  C3 = vec_perm( C3, C3, storePerm );
5378 
5379  vec_ste( C1, 0, (float*) jointPtr );
5380  vec_ste( C1, 4, (float*) jointPtr );
5381  vec_ste( C1, 8, (float*) jointPtr );
5382  vec_ste( C1, 12, (float*) jointPtr );
5383 
5384  vec_ste( C2, 16, (float*) jointPtr );
5385  vec_ste( C2, 20, (float*) jointPtr );
5386  vec_ste( C2, 24, (float*) jointPtr );
5387  vec_ste( C2, 28, (float*) jointPtr );
5388 
5389  vec_ste( C3, 32, (float*) jointPtr );
5390  vec_ste( C3, 36, (float*) jointPtr );
5391  vec_ste( C3, 40, (float*) jointPtr );
5392  vec_ste( C3, 44, (float*) jointPtr );
5393  }
5394 
5395 #endif
5396 }
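
// In scalar form, the loop above computes joint = parent^-1 * joint. Because
// the 3x3 block of a joint matrix is orthonormal, the inverse only needs the
// transpose: R' = Rp^T * Rc and t' = Rp^T * ( tc - tp ), which is why the
// subtraction happens first and the parent rows are indexed transposed. A
// sketch under those assumptions:

static void UntransformJointMat3x4( float out[12], const float p[12], const float c[12] ) {
	float tmp[12];
	for ( int k = 0; k < 3; k++ ) {
		tmp[k*4+0] = c[k*4+0];
		tmp[k*4+1] = c[k*4+1];
		tmp[k*4+2] = c[k*4+2];
		tmp[k*4+3] = c[k*4+3] - p[k*4+3];	// subtract parent translation first
	}
	for ( int r = 0; r < 3; r++ ) {
		for ( int col = 0; col < 4; col++ ) {
			// note the transposed indexing of p relative to TransformJoints
			out[r*4+col] = p[0*4+r] * tmp[0*4+col] +
						   p[1*4+r] * tmp[1*4+col] +
						   p[2*4+r] * tmp[2*4+col];
		}
	}
}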
5397 
5398 /*
5399 ============
5400 idSIMD_AltiVec::TransformVerts
5401 ============
5402 */
5403 
5404 // Here we don't have much for the vector unit to do, and the gain we get from doing the math
5405 // in parallel is eaten by doing unaligned stores.
5406 void VPCALL idSIMD_AltiVec::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, int numWeights ) {
5407  int i, j;
5408  const byte *jointsPtr = (byte *)joints;
5409 
5410  for( j = i = 0; i < numVerts; i++ ) {
5411  idVec3 v;
5412 
5413  float *matPtrOrig = ( *(idJointMat *)( jointsPtr + index[j*2] ) ).ToFloatPtr();
5414  float *weightPtr = (float*) weights[j].ToFloatPtr();
5415 
5416  v[0] = matPtrOrig[0] * weightPtr[0];
5417  v[0] += matPtrOrig[1] * weightPtr[1];
5418  v[0] += matPtrOrig[2] * weightPtr[2];
5419  v[0] += matPtrOrig[3] * weightPtr[3];
5420 
5421  v[1] = matPtrOrig[4] * weightPtr[0];
5422  v[1] += matPtrOrig[5] * weightPtr[1];
5423  v[1] += matPtrOrig[6] * weightPtr[2];
5424  v[1] += matPtrOrig[7] * weightPtr[3];
5425 
5426  v[2] = matPtrOrig[8] * weightPtr[0];
5427  v[2] += matPtrOrig[9] * weightPtr[1];
5428  v[2] += matPtrOrig[10] * weightPtr[2];
5429  v[2] += matPtrOrig[11] * weightPtr[3];
5430 
5431  while( index[j*2+1] == 0 ) {
5432  j++;
5433  float *matPtr = ( *(idJointMat *)( jointsPtr + index[j*2] ) ).ToFloatPtr();
5434  weightPtr = (float*) weights[j].ToFloatPtr();
5435 
5436  v[0] += matPtr[0] * weightPtr[0];
5437  v[0] += matPtr[1] * weightPtr[1];
5438  v[0] += matPtr[2] * weightPtr[2];
5439  v[0] += matPtr[3] * weightPtr[3];
5440 
5441  v[1] += matPtr[4] * weightPtr[0];
5442  v[1] += matPtr[5] * weightPtr[1];
5443  v[1] += matPtr[6] * weightPtr[2];
5444  v[1] += matPtr[7] * weightPtr[3];
5445 
5446  v[2] += matPtr[8] * weightPtr[0];
5447  v[2] += matPtr[9] * weightPtr[1];
5448  v[2] += matPtr[10] * weightPtr[2];
5449  v[2] += matPtr[11] * weightPtr[3];
5450  }
5451  j++;
5452 
5453  verts[i].xyz = v;
5454  }
5455 }
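
// As read from the loop above: index[j*2+0] holds a byte offset of a weight's
// joint matrix from the start of the joints array, index[j*2+1] is zero while
// more weights follow for the current vertex, and each position comes out as
// the 3x4 matrix-vector sum v = sum_k( J_k * w_k ), where w_k is an idVec4
// whose fourth component scales the translation column.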
5456 #endif /* LIVE_VICARIOUSLY */
5457 
5458 #ifdef ENABLE_CULL
5459 
5460 #ifndef DRAWVERT_PADDED
5461 /*
5462 ============
5463 idSIMD_AltiVec::TracePointCull
5464 ============
5465 */
5466 void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
5467 
5468  // idDrawVert size
5469  assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
5470 
5471  byte tOr;
5472  tOr = 0;
5473 
5474  // pointers
5475  const float *planePtr = planes[0].ToFloatPtr();
5476 
5477  vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
5478  vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
5479  vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
5480  vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
5481  vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
5482  vector unsigned char vecPerm;
5483  vector float v0, v1, v2, v3, v4, v5, v6, v7;
5484  vector float zeroVector = (vector float)(0);
5485  vector float vecRadius;
5486  vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
5487  vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
5488  vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
5489  vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
5490  vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
5491  vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
5492  vector bool int oneIntVector = (vector bool int)(1);
5493  vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
5494  vector unsigned int vecTotals;
5495  vector unsigned int tempIntSum;
5496  vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
5497 
5498  vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
5499 
5500  // populate planes
5501  v0 = vec_ld( 0, planePtr );
5502  v1 = vec_ld( 15, planePtr );
5503  vecPlane0 = vec_perm( v0, v1, vecPerm );
5504 
5505  v2 = vec_ld( 0, planePtr + 4 );
5506  v3 = vec_ld( 15, planePtr + 4 );
5507  vecPlane1 = vec_perm( v2, v3, vecPerm );
5508 
5509  v0 = vec_ld( 0, planePtr + 8 );
5510  v1 = vec_ld( 15, planePtr + 8 );
5511  vecPlane2 = vec_perm( v0, v1, vecPerm );
5512 
5513  v2 = vec_ld( 0, planePtr + 12 );
5514  v3 = vec_ld( 15, planePtr + 12 );
5515  vecPlane3 = vec_perm( v2, v3, vecPerm );
5516 
5517  // transpose
5518  v0 = vec_mergeh( vecPlane0, vecPlane2 );
5519  v1 = vec_mergeh( vecPlane1, vecPlane3 );
5520  v2 = vec_mergel( vecPlane0, vecPlane2 );
5521  v3 = vec_mergel( vecPlane1, vecPlane3 );
5522 
5523  vecPlane0 = vec_mergeh( v0, v1 );
5524  vecPlane1 = vec_mergel( v0, v1 );
5525  vecPlane2 = vec_mergeh( v2, v3 );
5526  vecPlane3 = vec_mergel( v2, v3 );
5527 
5528  // load constants
5529  vecRadius = loadSplatUnalignedScalar( &radius );
5530 
5531  unsigned int cullBitVal[4];
5532  vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
5533  int i = 0;
5534 
5535  // every fourth vert will have the same alignment, so the four permute vectors can be computed up front. Make sure we've got enough verts here
5536  if ( i+3 < numVerts ) {
5537  vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
5538  vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
5539  vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
5540  vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
5541  }
5542 
5543 
5544  for ( ; i+3 < numVerts; i+=4 ) {
5545  const float *vertPtr = verts[i].xyz.ToFloatPtr();
5546  const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
5547  const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
5548  const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
5549 
5550  v0 = vec_ld( 0, vertPtr );
5551  v1 = vec_ld( 15, vertPtr );
5552  v2 = vec_ld( 0, vertPtr2 );
5553  v3 = vec_ld( 15, vertPtr2 );
5554  v4 = vec_ld( 0, vertPtr3 );
5555  v5 = vec_ld( 15, vertPtr3 );
5556  v6 = vec_ld( 0, vertPtr4 );
5557  v7 = vec_ld( 15, vertPtr4 );
5558 
5559  vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
5560  vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
5561  vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
5562  vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
5563 
5564  vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
5565  vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
5566  vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
5567  vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
5568 
5569  vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
5570  vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
5571  vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
5572  vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );
5573 
5574  vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
5575  vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
5576  vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
5577  vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );
5578 
5579  vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
5580  vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
5581  vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
5582  vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );
5583 
5584  // vec1Sum1 now holds d0, d1, d2, d3. Calculate the
5585  // difference with +radius and -radius
5586  vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
5587  vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
5588  vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
5589  vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
5590  vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
5591  vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
5592  vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
5593  vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );
5594 
5595  // do compare
5596  vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
5597  vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
5598  vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
5599  vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
5600  vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
5601  vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
5602  vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
5603  vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );
5604 
5605  // AND it with 1 so each lane holds 0 or 1 rather than an all-ones mask
5606  vecCmp1 = vec_and( vecCmp1, oneIntVector );
5607  vecCmp2 = vec_and( vecCmp2, oneIntVector );
5608  vecCmp3 = vec_and( vecCmp3, oneIntVector );
5609  vecCmp4 = vec_and( vecCmp4, oneIntVector );
5610  vecCmp5 = vec_and( vecCmp5, oneIntVector );
5611  vecCmp6 = vec_and( vecCmp6, oneIntVector );
5612  vecCmp7 = vec_and( vecCmp7, oneIntVector );
5613  vecCmp8 = vec_and( vecCmp8, oneIntVector );
5614 
5615  vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
5616  vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
5617  vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
5618  vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
5619  vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
5620  vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
5621  vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
5622  vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
5623 
5624  // OR (add) them all together
5625  vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
5626  vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
5627  vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
5628  vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );
5629 
5630  vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
5631  vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
5632  tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
5633  tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5634  vecTotals = vec_mergeh( vecTotals, tempIntSum );
5635  tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
5636  tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5637  vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
5638  tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
5639  tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5640  vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );
5641 
5642  // store out results
5643  vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
5644  tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
5645  vec_ste( tempSt, 0, &cullBitVal[0] );
5646  vec_ste( tempSt, 4, &cullBitVal[0] );
5647  vec_ste( tempSt, 8, &cullBitVal[0] );
5648  vec_ste( tempSt, 12, &cullBitVal[0] );
5649 
5650  tOr |= cullBitVal[0];
5651  tOr |= cullBitVal[1];
5652  tOr |= cullBitVal[2];
5653  tOr |= cullBitVal[3];
5654 
5655  cullBits[i] = cullBitVal[0];
5656  cullBits[i+1] = cullBitVal[1];
5657  cullBits[i+2] = cullBitVal[2];
5658  cullBits[i+3] = cullBitVal[3];
5659  }
5660 
5661  // cleanup
5662  for ( ; i < numVerts; i++ ) {
5663  byte bits;
5664  float d0, d1, d2, d3, t;
5665  const idVec3 &v = verts[i].xyz;
5666 
5667  d0 = planes[0].Distance( v );
5668  d1 = planes[1].Distance( v );
5669  d2 = planes[2].Distance( v );
5670  d3 = planes[3].Distance( v );
5671 
5672  t = d0 + radius;
5673  bits = FLOATSIGNBITSET( t ) << 0;
5674  t = d1 + radius;
5675  bits |= FLOATSIGNBITSET( t ) << 1;
5676  t = d2 + radius;
5677  bits |= FLOATSIGNBITSET( t ) << 2;
5678  t = d3 + radius;
5679  bits |= FLOATSIGNBITSET( t ) << 3;
5680 
5681  t = d0 - radius;
5682  bits |= FLOATSIGNBITSET( t ) << 4;
5683  t = d1 - radius;
5684  bits |= FLOATSIGNBITSET( t ) << 5;
5685  t = d2 - radius;
5686  bits |= FLOATSIGNBITSET( t ) << 6;
5687  t = d3 - radius;
5688  bits |= FLOATSIGNBITSET( t ) << 7;
5689 
5690  bits ^= 0x0F; // flip lower four bits
5691 
5692  tOr |= bits;
5693  cullBits[i] = bits;
5694  }
5695 
5696  totalOr = tOr;
5697 }
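
// The "transpose" step above is the standard AltiVec 4x4 transpose: two rounds
// of mergeh/mergel turn four plane rows ( x, y, z, dist ) into per-component
// vectors, so one vertex can be tested against all four planes with three
// splat/madd ops. As a standalone sketch (the file inlines this rather than
// calling a helper):

static void Transpose4x4( vector float m[4] ) {
	vector float t0 = vec_mergeh( m[0], m[2] );	// a0 c0 a1 c1
	vector float t1 = vec_mergeh( m[1], m[3] );	// b0 d0 b1 d1
	vector float t2 = vec_mergel( m[0], m[2] );	// a2 c2 a3 c3
	vector float t3 = vec_mergel( m[1], m[3] );	// b2 d2 b3 d3
	m[0] = vec_mergeh( t0, t1 );			// a0 b0 c0 d0
	m[1] = vec_mergel( t0, t1 );			// a1 b1 c1 d1
	m[2] = vec_mergeh( t2, t3 );			// a2 b2 c2 d2
	m[3] = vec_mergel( t2, t3 );			// a3 b3 c3 d3
}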
5698 #else
5699 
5700 /*
5701 ============
5702 idSIMD_AltiVec::TracePointCull
5703 ============
5704 */
5705 void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
5706 
5707  // idDrawVert size
5708  assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
5709 
5710  byte tOr;
5711  tOr = 0;
5712 
5713  // pointers
5714  const float *planePtr = planes[0].ToFloatPtr();
5715 
5716  vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
5717  vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
5718  vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
5719  vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
5720  vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
5721  vector unsigned char vecPerm;
5722  vector float v0, v1, v2, v3, v4, v5, v6, v7;
5723  vector float zeroVector = (vector float)(0);
5724  vector float vecRadius;
5725  vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
5726  vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
5727  vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
5728  vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
5729  vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
5730  vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
5731  vector bool int oneIntVector = (vector bool int)(1);
5732  vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
5733  vector unsigned int vecTotals;
5734  vector unsigned int tempIntSum;
5735  vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
5736 
5737  vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
5738 
5739  // populate planes
5740  v0 = vec_ld( 0, planePtr );
5741  v1 = vec_ld( 15, planePtr );
5742  vecPlane0 = vec_perm( v0, v1, vecPerm );
5743 
5744  v2 = vec_ld( 0, planePtr + 4 );
5745  v3 = vec_ld( 15, planePtr + 4 );
5746  vecPlane1 = vec_perm( v2, v3, vecPerm );
5747 
5748  v0 = vec_ld( 0, planePtr + 8 );
5749  v1 = vec_ld( 15, planePtr + 8 );
5750  vecPlane2 = vec_perm( v0, v1, vecPerm );
5751 
5752  v2 = vec_ld( 0, planePtr + 12 );
5753  v3 = vec_ld( 15, planePtr + 12 );
5754  vecPlane3 = vec_perm( v2, v3, vecPerm );
5755 
5756  // transpose
5757  v0 = vec_mergeh( vecPlane0, vecPlane2 );
5758  v1 = vec_mergeh( vecPlane1, vecPlane3 );
5759  v2 = vec_mergel( vecPlane0, vecPlane2 );
5760  v3 = vec_mergel( vecPlane1, vecPlane3 );
5761 
5762  vecPlane0 = vec_mergeh( v0, v1 );
5763  vecPlane1 = vec_mergel( v0, v1 );
5764  vecPlane2 = vec_mergeh( v2, v3 );
5765  vecPlane3 = vec_mergel( v2, v3 );
5766 
5767  // load constants
5768  vecRadius = loadSplatUnalignedScalar( &radius );
5769 
5770  unsigned int cullBitVal[4];
5771  vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
5772  int i = 0;
5773 
5774 
5775  for ( ; i+3 < numVerts; i+=4 ) {
5776  const float *vertPtr = verts[i].xyz.ToFloatPtr();
5777  const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
5778  const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
5779  const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
5780 
5781  vecXYZ1 = vec_ld( 0, vertPtr );
5782  vecXYZ2 = vec_ld( 0, vertPtr2 );
5783  vecXYZ3 = vec_ld( 0, vertPtr3 );
5784  vecXYZ4 = vec_ld( 0, vertPtr4 );
5785 
5786  vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
5787  vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
5788  vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
5789  vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
5790 
5791  vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
5792  vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
5793  vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
5794  vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );
5795 
5796  vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
5797  vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
5798  vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
5799  vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );
5800 
5801  vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
5802  vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
5803  vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
5804  vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );
5805 
5806  // vec1Sum1 now holds d0, d1, d2, d3. Calculate the
5807  // difference with +radius and -radius
5808  vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
5809  vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
5810  vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
5811  vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
5812  vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
5813  vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
5814  vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
5815  vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );
5816 
5817  // do compare
5818  vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
5819  vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
5820  vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
5821  vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
5822  vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
5823  vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
5824  vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
5825  vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );
5826 
5827  // AND it with 1 so each lane holds 0 or 1 rather than an all-ones mask
5828  vecCmp1 = vec_and( vecCmp1, oneIntVector );
5829  vecCmp2 = vec_and( vecCmp2, oneIntVector );
5830  vecCmp3 = vec_and( vecCmp3, oneIntVector );
5831  vecCmp4 = vec_and( vecCmp4, oneIntVector );
5832  vecCmp5 = vec_and( vecCmp5, oneIntVector );
5833  vecCmp6 = vec_and( vecCmp6, oneIntVector );
5834  vecCmp7 = vec_and( vecCmp7, oneIntVector );
5835  vecCmp8 = vec_and( vecCmp8, oneIntVector );
5836 
5837  vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
5838  vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
5839  vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
5840  vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
5841  vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
5842  vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
5843  vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
5844  vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
5845 
5846  // OR (add) them all together
5847  vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
5848  vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
5849  vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
5850  vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );
5851 
5852  vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
5853  vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
5854  tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
5855  tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5856  vecTotals = vec_mergeh( vecTotals, tempIntSum );
5857  tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
5858  tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5859  vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
5860  tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
5861  tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5862  vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );
5863 
5864  // store out results
5865  vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
5866  tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
5867  vec_ste( tempSt, 0, &cullBitVal[0] );
5868  vec_ste( tempSt, 4, &cullBitVal[0] );
5869  vec_ste( tempSt, 8, &cullBitVal[0] );
5870  vec_ste( tempSt, 12, &cullBitVal[0] );
5871 
5872  tOr |= cullBitVal[0];
5873  tOr |= cullBitVal[1];
5874  tOr |= cullBitVal[2];
5875  tOr |= cullBitVal[3];
5876 
5877  cullBits[i] = cullBitVal[0];
5878  cullBits[i+1] = cullBitVal[1];
5879  cullBits[i+2] = cullBitVal[2];
5880  cullBits[i+3] = cullBitVal[3];
5881  }
5882 
5883  // cleanup
5884  for ( ; i < numVerts; i++ ) {
5885  byte bits;
5886  float d0, d1, d2, d3, t;
5887  const idVec3 &v = verts[i].xyz;
5888 
5889  d0 = planes[0].Distance( v );
5890  d1 = planes[1].Distance( v );
5891  d2 = planes[2].Distance( v );
5892  d3 = planes[3].Distance( v );
5893 
5894  t = d0 + radius;
5895  bits = FLOATSIGNBITSET( t ) << 0;
5896  t = d1 + radius;
5897  bits |= FLOATSIGNBITSET( t ) << 1;
5898  t = d2 + radius;
5899  bits |= FLOATSIGNBITSET( t ) << 2;
5900  t = d3 + radius;
5901  bits |= FLOATSIGNBITSET( t ) << 3;
5902 
5903  t = d0 - radius;
5904  bits |= FLOATSIGNBITSET( t ) << 4;
5905  t = d1 - radius;
5906  bits |= FLOATSIGNBITSET( t ) << 5;
5907  t = d2 - radius;
5908  bits |= FLOATSIGNBITSET( t ) << 6;
5909  t = d3 - radius;
5910  bits |= FLOATSIGNBITSET( t ) << 7;
5911 
5912  bits ^= 0x0F; // flip lower four bits
5913 
5914  tOr |= bits;
5915  cullBits[i] = bits;
5916  }
5917 
5918  totalOr = tOr;
5919 }
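
// On the bit packing used by both TracePointCull variants: after the vec_and,
// every lane is 0 or 1, and the vec_sl shifts place each lane at a distinct bit
// position, so the add-based reductions cannot carry and behave exactly like OR,
// e.g. ( 1 << 0 ) + ( 1 << 5 ) == ( 1 << 0 ) | ( 1 << 5 ).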
5920 
5921 #endif /* DRAWVERT_PADDED */
5922 
5923 #ifndef DRAWVERT_PADDED
5924 /*
5925 ============
5926 idSIMD_AltiVec::DecalPointCull
5927 ============
5928 */
5929 void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
5930 
5931  // idDrawVert size
5932  assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
5933 
5934  int i;
5935  const float *planePtr = planes[0].ToFloatPtr();
5936 
5937  vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
5938  vector float zeroVector = (vector float)(0.0);
5939  vector unsigned char vecPerm;
5940  vector float v0, v1, v2, v3, v4, v5, v6, v7;
5941 
5942  vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
5943 
5944  // populate planes
5945  v0 = vec_ld( 0, planePtr );
5946  v1 = vec_ld( 15, planePtr );
5947  vecPlane0 = vec_perm( v0, v1, vecPerm );
5948 
5949  v2 = vec_ld( 0, planePtr + 4 );
5950  v3 = vec_ld( 15, planePtr + 4 );
5951  vecPlane1 = vec_perm( v2, v3, vecPerm );
5952 
5953  v0 = vec_ld( 0, planePtr + 8 );
5954  v1 = vec_ld( 15, planePtr + 8 );
5955  vecPlane2 = vec_perm( v0, v1, vecPerm );
5956 
5957  v2 = vec_ld( 0, planePtr + 12 );
5958  v3 = vec_ld( 15, planePtr + 12 );
5959  vecPlane3 = vec_perm( v2, v3, vecPerm );
5960 
5961  v0 = vec_ld( 0, planePtr + 16 );
5962  v1 = vec_ld( 15, planePtr + 16 );
5963  vecPlane4 = vec_perm( v0, v1, vecPerm );
5964 
5965  v2 = vec_ld( 0, planePtr + 20 );
5966  v3 = vec_ld( 15, planePtr + 20 );
5967  vecPlane5 = vec_perm( v2, v3, vecPerm );
5968 
5969  // transpose
5970  v0 = vec_mergeh( vecPlane0, vecPlane2 );
5971  v1 = vec_mergeh( vecPlane1, vecPlane3 );
5972  v2 = vec_mergel( vecPlane0, vecPlane2 );
5973  v3 = vec_mergel( vecPlane1, vecPlane3 );
5974 
5975  vecPlane0 = vec_mergeh( v0, v1 );
5976  vecPlane1 = vec_mergel( v0, v1 );
5977  vecPlane2 = vec_mergeh( v2, v3 );
5978  vecPlane3 = vec_mergel( v2, v3 );
5979 
5980  v0 = vec_mergeh( vecPlane4, zeroVector );
5981  v1 = vec_mergeh( vecPlane5, zeroVector );
5982  v2 = vec_mergel( vecPlane4, zeroVector );
5983  v3 = vec_mergel( vecPlane5, zeroVector );
5984 
5985  vecPlane4 = vec_mergeh( v0, v1 );
5986  vecPlane5 = vec_mergel( v0, v1 );
5987  vecPlane6 = vec_mergeh( v2, v3 );
5988  vecPlane7 = vec_mergel( v2, v3 );
5989 
5990 
5991  vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
5992  vector bool int oneIntVector = (vector bool int)(1);
5993  vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
5994  vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
5995  vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
5996 
5997  vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
5998  vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
5999  vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
6000  vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
6001  vector unsigned int vecR1, vecR2, vecR3, vecR4;
6002  vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
6003  vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
6004  unsigned int vBits[4];
6005  vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[0] );
6006 
6007  i = 0;
6008  // every fourth vert will have the same alignment, so the four permute vectors can be computed up front. Make sure we've got enough verts here
6009  if ( i+3 < numVerts ) {
6010  vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6011  vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6012  vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6013  vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6014  }
6015 
6016 
6017  for ( ; i+3 < numVerts; i+=4 ) {
6018  const float *vertPtr = verts[i].xyz.ToFloatPtr();
6019  const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
6020  const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
6021  const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
6022 
6023  v0 = vec_ld( 0, vertPtr );
6024  v1 = vec_ld( 15, vertPtr );
6025  v2 = vec_ld( 0, vertPtr2 );
6026  v3 = vec_ld( 15, vertPtr2 );
6027  v4 = vec_ld( 0, vertPtr3 );
6028  v5 = vec_ld( 15, vertPtr3 );
6029  v6 = vec_ld( 0, vertPtr4 );
6030  v7 = vec_ld( 15, vertPtr4 );
6031 
6032  vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
6033  vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
6034  vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
6035  vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
6036 
6037  vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
6038  vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
6039  vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
6040  vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
6041 
6042  vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
6043  vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
6044  vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
6045  vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );
6046 
6047  vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
6048  vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
6049  vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
6050  vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );
6051 
6052  vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
6053  vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
6054  vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
6055  vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );
6056 
6057  vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
6058  vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
6059  vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
6060  vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );
6061 
6062  vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
6063  vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
6064  vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
6065  vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );
6066 
6067  vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
6068  vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
6069  vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
6070  vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );
6071 
6072  vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
6073  vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
6074  vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
6075  vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
6076 
6077  vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
6078  vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
6079  vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
6080  vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
6081  vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
6082  vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
6083  vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
6084  vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );
6085 
6086  // AND it with 1 so each lane holds 0 or 1 rather than an all-ones mask
6087  vecCmp1 = vec_and( vecCmp1, oneIntVector );
6088  vecCmp2 = vec_and( vecCmp2, oneIntVector );
6089  vecCmp3 = vec_and( vecCmp3, oneIntVector );
6090  vecCmp4 = vec_and( vecCmp4, oneIntVector );
6091  vecCmp5 = vec_and( vecCmp5, oneIntVector );
6092  vecCmp6 = vec_and( vecCmp6, oneIntVector );
6093  vecCmp7 = vec_and( vecCmp7, oneIntVector );
6094  vecCmp8 = vec_and( vecCmp8, oneIntVector );
6095 
6096  vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
6097  vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
6098  vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
6099  vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
6100  vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
6101  vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
6102  vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
6103  vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
6104 
6105  // OR them all together (adding is the same as ORing here, since no two operands ever have the same bit set)
6106  vecR1 = (vector unsigned int)(0); //zeroIntVector;
6107  vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
6108  vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
6109  vecR1 = vec_add(vecR1, vecBitShifted2 );
6110  vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
6111 
6112  vecR2 = (vector unsigned int)(0); //zeroIntVector;
6113  vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
6114  vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
6115  vecR2 = vec_add(vecR2, vecBitShifted4 );
6116  vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );
6117 
6118  vecR3 = (vector unsigned int)(0); //zeroIntVector;
6119  vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
6120  vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
6121  vecR3 = vec_add(vecR3, vecBitShifted6 );
6122  vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );
6123 
6124  vecR4 = (vector unsigned int)(0); //zeroIntVector;
6125  vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
6126  vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
6127  vecR4 = vec_add(vecR4, vecBitShifted8 );
6128  vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );
6129 
6130  // take the first element from each vector and put them into vecR1
6131  vecR1 = vec_mergeh( vecR1, vecR2 );
6132  vecR3 = vec_mergeh( vecR3, vecR4 );
6133  vecR1 = vec_perm( vecR1, vecR3, permHalves );
6134 
6135  // XOR with 0x3F to flip lower 6 bits
6136  vecR1 = vec_xor( vecR1, vecFlipBits );
6137 
6138  // store out results. We don't write 16 bytes at a time here, so scatter
6139  // the words with vec_ste to avoid alignment concerns
6140  vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
6141  vec_ste( vecR1, 0, &vBits[0] );
6142  vec_ste( vecR1, 4, &vBits[0] );
6143  vec_ste( vecR1, 8, &vBits[0] );
6144  vec_ste( vecR1, 12, &vBits[0] );
6145 
6146  cullBits[i] = vBits[0];
6147  cullBits[i+1] = vBits[1];
6148  cullBits[i+2] = vBits[2];
6149  cullBits[i+3] = vBits[3];
6150  }
6151 
6152  for ( ; i < numVerts; i++ ) {
6153  byte bits;
6154  float d0, d1, d2, d3, d4, d5;
6155  const idVec3 &v = verts[i].xyz;
6156 
6157  d0 = planes[0].Distance( v );
6158  d1 = planes[1].Distance( v );
6159  d2 = planes[2].Distance( v );
6160  d3 = planes[3].Distance( v );
6161  d4 = planes[4].Distance( v );
6162  d5 = planes[5].Distance( v );
6163 
6164  // FLOATSIGNBITSET checks whether the sign bit is set by reinterpreting the float as a long and shifting right 31 places.
6165  bits = FLOATSIGNBITSET( d0 ) << 0;
6166  bits |= FLOATSIGNBITSET( d1 ) << 1;
6167  bits |= FLOATSIGNBITSET( d2 ) << 2;
6168  bits |= FLOATSIGNBITSET( d3 ) << 3;
6169  bits |= FLOATSIGNBITSET( d4 ) << 4;
6170  bits |= FLOATSIGNBITSET( d5 ) << 5;
6171 
6172  cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
6173  }
6174 }
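
// DecalPointCull handles six planes by zero-padding: planes 4 and 5 are merged
// with zeroVector before the second transpose, so vecPlane4..vecPlane7 hold
// ( p4.x, p5.x, 0, 0 ) through ( p4.dist, p5.dist, 0, 0 ), each vecNSum2 comes
// out as ( d4, d5, 0, 0 ), and vecShift2 = ( 4, 5, 0, 0 ) packs those two
// distances into bits 4 and 5 of the result byte.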
6175 
6176 #else
6177 
6178 /*
6179 ============
6180 idSIMD_AltiVec::DecalPointCull
6181 ============
6182 */
6183 void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
6184 
6185  // idDrawVert size
6186  assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
6187 
6188  int i;
6189  const float *planePtr = planes[0].ToFloatPtr();
6190 
6191  vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
6192  vector float zeroVector = (vector float)(0.0);
6193  vector unsigned char vecPerm;
6194  vector float v0, v1, v2, v3, v4, v5, v6, v7;
6195 
6196  vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
6197 
6198  // populate planes
6199  v0 = vec_ld( 0, planePtr );
6200  v1 = vec_ld( 15, planePtr );
6201  vecPlane0 = vec_perm( v0, v1, vecPerm );
6202 
6203  v2 = vec_ld( 0, planePtr + 4 );
6204  v3 = vec_ld( 15, planePtr + 4 );
6205  vecPlane1 = vec_perm( v2, v3, vecPerm );
6206 
6207  v0 = vec_ld( 0, planePtr + 8 );
6208  v1 = vec_ld( 15, planePtr + 8 );
6209  vecPlane2 = vec_perm( v0, v1, vecPerm );
6210 
6211  v2 = vec_ld( 0, planePtr + 12 );
6212  v3 = vec_ld( 15, planePtr + 12 );
6213  vecPlane3 = vec_perm( v2, v3, vecPerm );
6214 
6215  v0 = vec_ld( 0, planePtr + 16 );
6216  v1 = vec_ld( 15, planePtr + 16 );
6217  vecPlane4 = vec_perm( v0, v1, vecPerm );
6218 
6219  v2 = vec_ld( 0, planePtr + 20 );
6220  v3 = vec_ld( 15, planePtr + 20 );
6221  vecPlane5 = vec_perm( v2, v3, vecPerm );
6222 
6223  // transpose
6224  v0 = vec_mergeh( vecPlane0, vecPlane2 );
6225  v1 = vec_mergeh( vecPlane1, vecPlane3 );
6226  v2 = vec_mergel( vecPlane0, vecPlane2 );
6227  v3 = vec_mergel( vecPlane1, vecPlane3 );
6228 
6229  vecPlane0 = vec_mergeh( v0, v1 );
6230  vecPlane1 = vec_mergel( v0, v1 );
6231  vecPlane2 = vec_mergeh( v2, v3 );
6232  vecPlane3 = vec_mergel( v2, v3 );
6233 
6234  v0 = vec_mergeh( vecPlane4, zeroVector );
6235  v1 = vec_mergeh( vecPlane5, zeroVector );
6236  v2 = vec_mergel( vecPlane4, zeroVector );
6237  v3 = vec_mergel( vecPlane5, zeroVector );
6238 
6239  vecPlane4 = vec_mergeh( v0, v1 );
6240  vecPlane5 = vec_mergel( v0, v1 );
6241  vecPlane6 = vec_mergeh( v2, v3 );
6242  vecPlane7 = vec_mergel( v2, v3 );
6243 
6244 
6245  vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
6246  vector bool int oneIntVector = (vector bool int)(1);
6247  vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
6248  vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
6249  vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
6250 
6251  vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
6252  vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
6253  vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
6254  vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
6255  vector unsigned int vecR1, vecR2, vecR3, vecR4;
6256  vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
6257  vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
6258  unsigned int vBits[4];
6259  vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[0] );
6260 
6261  i = 0;
6262 
6263  for ( ; i+3 < numVerts; i+=4 ) {
6264  const float *vertPtr = verts[i].xyz.ToFloatPtr();
6265  const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
6266  const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
6267  const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
6268 
6269  vecXYZ1 = vec_ld( 0, vertPtr );
6270  vecXYZ2 = vec_ld( 0, vertPtr2 );
6271  vecXYZ3 = vec_ld( 0, vertPtr3 );
6272  vecXYZ4 = vec_ld( 0, vertPtr4 );
6273 
6274  vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
6275  vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
6276  vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
6277  vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
6278 
6279  vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
6280  vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
6281  vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
6282  vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );
6283 
6284  vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
6285  vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
6286  vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
6287  vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );
6288 
6289  vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
6290  vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
6291  vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
6292  vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );
6293 
6294  vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
6295  vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
6296  vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
6297  vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );
6298 
6299  vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
6300  vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
6301  vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
6302  vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );
6303 
6304  vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
6305  vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
6306  vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
6307  vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );
6308 
6309  vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
6310  vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
6311  vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
6312  vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
6313 
6314  vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
6315  vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
6316  vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
6317  vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
6318  vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
6319  vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
6320  vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
6321  vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );
6322 
6323  // AND it with 1 so each lane holds 0 or 1 rather than an all-ones mask
6324  vecCmp1 = vec_and( vecCmp1, oneIntVector );
6325  vecCmp2 = vec_and( vecCmp2, oneIntVector );
6326  vecCmp3 = vec_and( vecCmp3, oneIntVector );
6327  vecCmp4 = vec_and( vecCmp4, oneIntVector );
6328  vecCmp5 = vec_and( vecCmp5, oneIntVector );
6329  vecCmp6 = vec_and( vecCmp6, oneIntVector );
6330  vecCmp7 = vec_and( vecCmp7, oneIntVector );
6331  vecCmp8 = vec_and( vecCmp8, oneIntVector );
6332 
6333  vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
6334  vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
6335  vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
6336  vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
6337  vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
6338  vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
6339  vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
6340  vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
6341 
6342  // OR them all together (adding is the same as ORing here, since no two operands ever have the same bit set)
6343  vecR1 = (vector unsigned int)(0); //zeroIntVector;
6344  vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
6345  vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
6346  vecR1 = vec_add(vecR1, vecBitShifted2 );
6347  vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
6348 
6349  vecR2 = (vector unsigned int)(0); //zeroIntVector;
6350  vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
6351  vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
6352  vecR2 = vec_add(vecR2, vecBitShifted4 );
6353  vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );
6354 
6355  vecR3 = (vector unsigned int)(0); //zeroIntVector;
6356  vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
6357  vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
6358  vecR3 = vec_add(vecR3, vecBitShifted6 );
6359  vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );
6360 
6361  vecR4 = (vector unsigned int)(0); //zeroIntVector;
6362  vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
6363  vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
6364  vecR4 = vec_add(vecR4, vecBitShifted8 );
6365  vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );
6366 
6367  // take the first element from each vector and put them into vecR1
6368  vecR1 = vec_mergeh( vecR1, vecR2 );
6369  vecR3 = vec_mergeh( vecR3, vecR4 );
6370  vecR1 = vec_perm( vecR1, vecR3, permHalves );
6371 
6372  // XOR with 0x3F to flip lower 6 bits
6373  vecR1 = vec_xor( vecR1, vecFlipBits );
6374 
6375  // store out results. We don't write 16 bytes at a time here, so scatter
6376  // the words with vec_ste to avoid alignment concerns
6377  vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
6378  vec_ste( vecR1, 0, &vBits[0] );
6379  vec_ste( vecR1, 4, &vBits[0] );
6380  vec_ste( vecR1, 8, &vBits[0] );
6381  vec_ste( vecR1, 12, &vBits[0] );
6382 
6383  cullBits[i] = vBits[0];
6384  cullBits[i+1] = vBits[1];
6385  cullBits[i+2] = vBits[2];
6386  cullBits[i+3] = vBits[3];
6387  }
6388 
6389  for ( ; i < numVerts; i++ ) {
6390  byte bits;
6391  float d0, d1, d2, d3, d4, d5;
6392  const idVec3 &v = verts[i].xyz;
6393 
6394  d0 = planes[0].Distance( v );
6395  d1 = planes[1].Distance( v );
6396  d2 = planes[2].Distance( v );
6397  d3 = planes[3].Distance( v );
6398  d4 = planes[4].Distance( v );
6399  d5 = planes[5].Distance( v );
6400 
6401  // FLOATSIGNBITSET checks whether the sign bit is set by reinterpreting the float as a long and shifting right 31 places.
6402  bits = FLOATSIGNBITSET( d0 ) << 0;
6403  bits |= FLOATSIGNBITSET( d1 ) << 1;
6404  bits |= FLOATSIGNBITSET( d2 ) << 2;
6405  bits |= FLOATSIGNBITSET( d3 ) << 3;
6406  bits |= FLOATSIGNBITSET( d4 ) << 4;
6407  bits |= FLOATSIGNBITSET( d5 ) << 5;
6408 
6409  cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
6410  }
6411 }
6412 
6413 
6414 #endif /*DRAWVERT_PADDED */
6415 
6416 #ifndef DRAWVERT_PADDED
6417 /*
6418 ============
6419 idSIMD_AltiVec::OverlayPointCull
6420 ============
6421 */
6422 void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
6423 
6424  // idDrawVert size
6425  assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
6426 
6427  int i;
6428 
6429  float p0x, p0y, p0z, p0d;
6430  float p1x, p1y, p1z, p1d;
6431 
6432  const float *planePtr = planes[0].ToFloatPtr();
6433  const float *vertPtr = verts[0].xyz.ToFloatPtr();
6434 
6435  vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
6436  vector float v0, v1, v2, v3, v4, v5, v6, v7;
6437  vector unsigned char vecPerm;
6438  vector float zeroVector = (vector float)(0);
6439 
6440  p0x = *(planePtr + 0);
6441  p0y = *(planePtr + 1);
6442  p0z = *(planePtr + 2);
6443  p0d = *(planePtr + 3);
6444  p1x = *(planePtr + 4);
6445  p1y = *(planePtr + 5);
6446  p1z = *(planePtr + 6);
6447  p1d = *(planePtr + 7);
6448 
6449  // populate the planes
6450  vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
6451  v0 = vec_ld( 0, planePtr );
6452  v1 = vec_ld( 15, planePtr );
6453  vecPlane0 = vec_perm( v0, v1, vecPerm );
6454 
6455  v2 = vec_ld( 31, planePtr );
6456  vecPlane1 = vec_perm( v1, v2, vecPerm );
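/*
   The plane loads above use the classic AltiVec misaligned-load idiom: two
   aligned vec_ld calls at offsets 0 and 15 bracket the two 16-byte blocks any
   16-byte span can straddle (and collapse to the same block when the pointer
   is already aligned), then a vec_lvsl-derived permute splices out the wanted
   bytes. A self-contained sketch (load_unaligned is an illustrative name):

       static vector float load_unaligned( const float *p ) {
           vector unsigned char perm = vec_lvsl( 0, p );  // shift amount from the address
           vector float lo = vec_ld( 0, p );              // block containing p
           vector float hi = vec_ld( 15, p );             // next block, if p straddles one
           return vec_perm( lo, hi, perm );               // splice the 16 wanted bytes
       }

   The vec_add( vec_lvsl( -1, p ), (vector unsigned char)(1) ) form used here
   builds the same permute vector; both spellings appear throughout this file.
*/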
6457 
6458  // transpose
6459  v0 = vec_mergeh( vecPlane0, vecPlane0 );
6460  v1 = vec_mergeh( vecPlane1, vecPlane1 );
6461  v2 = vec_mergel( vecPlane0, vecPlane0 );
6462  v3 = vec_mergel( vecPlane1, vecPlane1);
6463 
6464  vecPlane0 = vec_mergeh( v0, v1 );
6465  vecPlane1 = vec_mergel( v0, v1 );
6466  vecPlane2 = vec_mergeh( v2, v3 );
6467  vecPlane3 = vec_mergel( v2, v3 );
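/*
   The merges above interleave the two planes' coefficients so that vecPlane0
   ends up as (p0x, p1x, p0x, p1x), vecPlane1 as the y's, and so on; one madd
   per axis then evaluates both plane equations for two vertices at once. The
   same mergeh/mergel pattern applied to four distinct rows is the standard
   AltiVec 4x4 transpose, sketched here with illustrative names (row0..row3):

       // round 1: interleave row pairs
       vector float t0 = vec_mergeh( row0, row2 );  // r0x r2x r0y r2y
       vector float t1 = vec_mergeh( row1, row3 );  // r1x r3x r1y r3y
       vector float t2 = vec_mergel( row0, row2 );  // r0z r2z r0w r2w
       vector float t3 = vec_mergel( row1, row3 );  // r1z r3z r1w r3w
       // round 2: interleave again to finish the transpose
       vector float colX = vec_mergeh( t0, t1 );    // r0x r1x r2x r3x
       vector float colY = vec_mergel( t0, t1 );
       vector float colZ = vec_mergeh( t2, t3 );
       vector float colW = vec_mergel( t2, t3 );
*/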
6468 
6469  vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
6470  vector float oneVector = (vector float)(1);
6471 
6472  vector float vecSum1, vecSum2, vecSum1Inv,vecSum2Inv;
6473 
6474  vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
6475  vector float negTwoVector = (vector float)(-2);
6476  vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
6477  vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
6478  vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
6479  vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
6480  vector bool int oneIntVector = (vector bool int)(1);
6481  vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
6482  unsigned int cullBitVal[4];
6483  vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
6484 
6485  i = 0;
6486  // every fourth one will have the same alignment. Make sure we've got enough here
6487  if ( i+3 < numVerts ) {
6488  vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6489  vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6490  vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6491  vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6492  }
6493 
6494 
6495  for ( ; i+3 < numVerts; i+=4 ) {
6496  const float *vertPtr = verts[i].xyz.ToFloatPtr();
6497  const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
6498  const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
6499  const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
6500 
6501  v0 = vec_ld( 0, vertPtr );
6502  v1 = vec_ld( 15, vertPtr );
6503  v2 = vec_ld( 0, vertPtr2 );
6504  v3 = vec_ld( 15, vertPtr2 );
6505  v4 = vec_ld( 0, vertPtr3 );
6506  v5 = vec_ld( 15, vertPtr3 );
6507  v6 = vec_ld( 0, vertPtr4 );
6508  v7 = vec_ld( 15, vertPtr4 );
6509 
6510  vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
6511  vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
6512  vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
6513  vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
6514 
6515  // like a splat, but only doing halves
6516  vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
6517  vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum1 );
6518  vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
6519  vecSum1 = vec_add( vecSum1, vecPlane3 );
6520 
6521  vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
6522  vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum2 );
6523  vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
6524  vecSum2 = vec_add( vecSum2, vecPlane3 );
6525 
6526  // store out results
6527  UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );
6528 
6529  // bit manipulation
6530  vecCmp1 = vec_cmplt( vecSum1, zeroVector );
6531  vecCmp2 = vec_cmplt( vecSum2, zeroVector );
6532 
6533  // AND with 1 so each lane holds 0 or 1 rather than an all-ones mask
6534  vecCmp1 = vec_and( vecCmp1, oneIntVector );
6535  vecCmp2 = vec_and( vecCmp2, oneIntVector );
6536 
6537  // store out and write to cullBits
6538  // finally, a use for algebra! 1-x = x + 1 - 2x
6539  vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
6540  vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
6541  vecSum1Inv = vec_add( vecSum1Inv, oneVector );
6542  vecSum2Inv = vec_add( vecSum2Inv, oneVector );
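/*
   Spelling out the algebra note above: AltiVec has no "reverse subtract", so
   1 - x is built from one vec_madd and one vec_add. Per lane:

       x * (-2) + x = -x       // the vec_madd with negTwoVector
       -x + 1       = 1 - x    // the vec_add with oneVector

   Two instructions per vector, and no constants beyond -2 and 1.
*/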
6543 
6544  // do the same comparisons for the inverted d0/d1
6545  vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
6546  vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );
6547 
6548  // AND with 1 so each lane holds 0 or 1 rather than an all-ones mask
6549  vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
6550  vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );
6551 
6552  // shift them as needed
6553  vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
6554  vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
6555  vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
6556  vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );
6557 
6558  // OR them all together. since only 1 bit is set for each value, that's
6559  // the same as adding them: add up d0 + d1 + d0Inv + d1Inv
6560  vector unsigned int vecResult;
6561  vector unsigned int vecResult2;
6562  vector unsigned int vecResult3;
6563  vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
6564 
6565  vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
6566 
6567  // vecResult now holds the values without the inverses yet, so add those
6568  vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
6569  vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
6570  vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
6571  vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
6572 
6573  vecResult = vec_add( vecResult, vecResult2 );
6574 
6575  //store out results
6576  vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
6577  vec_ste( vecResult, 0, &cullBitVal[0] );
6578  vec_ste( vecResult, 4, &cullBitVal[0] );
6579  vec_ste( vecResult, 8, &cullBitVal[0] );
6580  vec_ste( vecResult, 12, &cullBitVal[0] );
6581 
6582  cullBits[i] = cullBitVal[0];
6583  cullBits[i+1] = cullBitVal[1];
6584  cullBits[i+2] = cullBitVal[2];
6585  cullBits[i+3] = cullBitVal[3];
6586  }
6587 
6588  // cleanup
6589  for ( ; i < numVerts; i++ ) {
6590  byte bits;
6591  float d0, d1;
6592  float vx, vy, vz;
6593 
6594  vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
6595  vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
6596  vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );
6597 
6598  d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
6599  d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
6600  texCoords[i][0] = d0;
6601  texCoords[i][1] = d1;
6602 
6603  bits = ( d0 >= 0 ) ? 0 : 1;
6604  d0 = 1.0f - d0;
6605  bits |= ( d1 >= 0 ) ? 0 : 1*2;
6606  d1 = 1.0f - d1;
6607 
6608  bits |= ( d0 >= 0 ) ? 0: 1*4;
6609  bits |= ( d1 >= 0 ) ? 0: 1*8;
6610 
6611  cullBits[i] = bits;
6612  }
6613 }
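/*
   The overlay cull byte packs four tests per vertex: bit 0 = d0 < 0, bit 1 =
   d1 < 0, bit 2 = (1 - d0) < 0, bit 3 = (1 - d1) < 0; together they report
   whether the projected texture coordinate (d0, d1) lies outside the unit
   square. A sketch of decoding it (PointInsideOverlay is an illustrative
   helper, not engine API):

       static inline bool PointInsideOverlay( byte bits ) {
           return ( bits & 0x0F ) == 0;  // inside only if all four tests passed
       }
*/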
6614 #else
6615 
6616 /*
6617 ============
6618 idSIMD_AltiVec::OverlayPointCull
6619 ============
6620 */
6621 void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
6622 
6623  // idDrawVert size
6624  assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
6625 
6626  int i;
6627 
6628  float p0x, p0y, p0z, p0d;
6629  float p1x, p1y, p1z, p1d;
6630 
6631  const float *planePtr = planes[0].ToFloatPtr();
6632  const float *vertPtr = verts[0].xyz.ToFloatPtr();
6633 
6634  vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
6635  vector float v0, v1, v2, v3, v4, v5, v6, v7;
6636  vector unsigned char vecPerm;
6637  vector float zeroVector = (vector float)(0);
6638 
6639  p0x = *(planePtr + 0);
6640  p0y = *(planePtr + 1);
6641  p0z = *(planePtr + 2);
6642  p0d = *(planePtr + 3);
6643  p1x = *(planePtr + 4);
6644  p1y = *(planePtr + 5);
6645  p1z = *(planePtr + 6);
6646  p1d = *(planePtr + 7);
6647 
6648  // populate the planes
6649  vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
6650  v0 = vec_ld( 0, planePtr );
6651  v1 = vec_ld( 15, planePtr );
6652  vecPlane0 = vec_perm( v0, v1, vecPerm );
6653 
6654  v2 = vec_ld( 31, planePtr );
6655  vecPlane1 = vec_perm( v1, v2, vecPerm );
6656 
6657  // transpose
6658  v0 = vec_mergeh( vecPlane0, vecPlane0 );
6659  v1 = vec_mergeh( vecPlane1, vecPlane1 );
6660  v2 = vec_mergel( vecPlane0, vecPlane0 );
6661  v3 = vec_mergel( vecPlane1, vecPlane1);
6662 
6663  vecPlane0 = vec_mergeh( v0, v1 );
6664  vecPlane1 = vec_mergel( v0, v1 );
6665  vecPlane2 = vec_mergeh( v2, v3 );
6666  vecPlane3 = vec_mergel( v2, v3 );
6667 
6668  vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
6669  vector float oneVector = (vector float)(1);
6670 
6671  vector float vecSum1, vecSum2, vecSum1Inv,vecSum2Inv;
6672 
6673  vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
6674  vector float negTwoVector = (vector float)(-2);
6675  vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
6676  vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
6677  vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
6678  vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
6679  vector bool int oneIntVector = (vector bool int)(1);
6680  vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
6681  unsigned int cullBitVal[4];
6682  vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
6683 
6684  i = 0;
6685 
6686  for ( ; i+3 < numVerts; i+=4 ) {
6687  const float *vertPtr = verts[i].xyz.ToFloatPtr();
6688  const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
6689  const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
6690  const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
6691 
6692  vecXYZ1 = vec_ld( 0, vertPtr );
6693  vecXYZ2 = vec_ld( 0, vertPtr2 );
6694  vecXYZ3 = vec_ld( 0, vertPtr3 );
6695  vecXYZ4 = vec_ld( 0, vertPtr4 );
6696 
6697  // like a splat, but only doing halves
6698  vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
6699  vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum1 );
6700  vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
6701  vecSum1 = vec_add( vecSum1, vecPlane3 );
6702 
6703  vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
6704  vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum2 );
6705  vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
6706  vecSum2 = vec_add( vecSum2, vecPlane3 );
6707 
6708  // store out results
6709  UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );
6710 
6711  // bit manipulation
6712  vecCmp1 = vec_cmplt( vecSum1, zeroVector );
6713  vecCmp2 = vec_cmplt( vecSum2, zeroVector );
6714 
6715  // AND with 1 so each lane holds 0 or 1 rather than an all-ones mask
6716  vecCmp1 = vec_and( vecCmp1, oneIntVector );
6717  vecCmp2 = vec_and( vecCmp2, oneIntVector );
6718 
6719  // store out and write to cullBits
6720  // finally, a use for algebra! 1-x = x + 1 - 2x
6721  vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
6722  vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
6723  vecSum1Inv = vec_add( vecSum1Inv, oneVector );
6724  vecSum2Inv = vec_add( vecSum2Inv, oneVector );
6725 
6726  // do the same comparisons for the inverted d0/d1
6727  vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
6728  vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );
6729 
6730  // AND with 1 so each lane holds 0 or 1 rather than an all-ones mask
6731  vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
6732  vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );
6733 
6734  // shift them as needed
6735  vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
6736  vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
6737  vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
6738  vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );
6739 
6740  // OR them all together. since only 1 bit is set for each value, that's
6741  // the same as adding them: add up d0 + d1 + d0Inv + d1Inv
6742  vector unsigned int vecResult;
6743  vector unsigned int vecResult2;
6744  vector unsigned int vecResult3;
6745  vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
6746 
6747  vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
6748 
6749  // vecResult now holds the values without the inverses yet, so add those
6750  vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
6751  vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
6752  vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
6753  vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
6754 
6755  vecResult = vec_add( vecResult, vecResult2 );
6756 
6757  //store out results
6758  vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
6759  vec_ste( vecResult, 0, &cullBitVal[0] );
6760  vec_ste( vecResult, 4, &cullBitVal[0] );
6761  vec_ste( vecResult, 8, &cullBitVal[0] );
6762  vec_ste( vecResult, 12, &cullBitVal[0] );
6763 
6764  cullBits[i] = cullBitVal[0];
6765  cullBits[i+1] = cullBitVal[1];
6766  cullBits[i+2] = cullBitVal[2];
6767  cullBits[i+3] = cullBitVal[3];
6768  }
6769 
6770  // cleanup
6771  for ( ; i < numVerts; i++ ) {
6772  byte bits;
6773  float d0, d1;
6774  float vx, vy, vz;
6775 
6776  vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
6777  vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
6778  vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );
6779 
6780  d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
6781  d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
6782  texCoords[i][0] = d0;
6783  texCoords[i][1] = d1;
6784 
6785  bits = ( d0 >= 0 ) ? 0 : 1;
6786  d0 = 1.0f - d0;
6787  bits |= ( d1 >= 0 ) ? 0 : 1*2;
6788  d1 = 1.0f - d1;
6789 
6790  bits |= ( d0 >= 0 ) ? 0: 1*4;
6791  bits |= ( d1 >= 0 ) ? 0: 1*8;
6792 
6793  cullBits[i] = bits;
6794  }
6795 }
6796 
6797 
6798 #endif /* DRAWVERT_PADDED */
6799 
6800 #endif /* ENABLE_CULL */
6801 
6802 #ifdef ENABLE_DERIVE
6803 /*
6804 ============
6805 idSIMD_AltiVec::DeriveTriPlanes
6806 
6807  Derives a plane equation for each triangle.
6808 ============
6809 */
6810 void VPCALL idSIMD_AltiVec::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
6811 
6812  // idDrawVert size
6813  assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
6814  // idPlane size
6815  assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
6816  int i;
6817 
6818  vector float vecD0, vecD1, vecD2, vecD3, vecD4, vecD5, vecD6, vecD7;
6819  vector float vecVertA, vecVertB, vecVertC;
6820  vector float vecVertA2, vecVertB2, vecVertC2;
6821  vector float vecVertA3, vecVertB3, vecVertC3;
6822  vector float vecVertA4, vecVertB4, vecVertC4;
6823 
6824  vector float vecN, vecN2, vecN3, vecN4;
6825  vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
6826  vector unsigned char vecPerm1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
6827  vector unsigned char vecPerm2 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
6828  vector float vecF;
6829  vector float vecF1, vecF2, vecF3, vecF4;
6830  vector float zeroVector = (vector float)(0);
6831  vector float vecNegOne = (vector float)(-1);
6832  vector float vecSecondHalf, vecFirstHalf, vecSecondHalf2, vecFirstHalf2, vecSecondHalf3, vecFirstHalf3, vecFirstHalf4, vecSecondHalf4;
6833 
6834  vector unsigned char vecPermA, vecPermA2, vecPermA3, vecPermA4;
6835  vector unsigned char vecPermB, vecPermB2, vecPermB3, vecPermB4;
6836  vector unsigned char vecPermC, vecPermC2, vecPermC3, vecPermC4;
6837 
6838  vector unsigned char oneVector = (vector unsigned char)(1);
6839  vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
6840  vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
6841 
6842  const float *xyzPtr = verts[0].xyz.ToFloatPtr();
6843  float *planePtr = planes[0].ToFloatPtr();
6844 
6845  int j;
6846  for ( j = 0, i = 0; i+11 < numIndexes; i += 12, j += 4 ) {
6847 
6848 #ifndef DRAWVERT_PADDED
6849  // calculate permute vectors to load as needed. these are all
6850  // triangle indexes and are usually pretty close together but
6851  // not guaranteed to be in any particular order
6852  vecPermA = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) ), oneVector );
6853  vecPermB = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) ), oneVector );
6854  vecPermC = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) ), oneVector );
6855  vecPermA2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) ), oneVector );
6856  vecPermB2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) ), oneVector );
6857  vecPermC2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) ), oneVector );
6858  vecPermA3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) ), oneVector );
6859  vecPermB3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) ), oneVector );
6860  vecPermC3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) ), oneVector );
6861  vecPermA4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) ), oneVector );
6862  vecPermB4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) ), oneVector );
6863  vecPermC4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) ), oneVector );
6864 #endif
6865 
6866 #ifndef DRAWVERT_PADDED
6867  // load first A B C
6868  vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
6869  vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
6870  vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
6871  vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
6872  vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
6873  vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
6874 
6875  vecVertA = vec_perm( vecLd1, vecLd2, vecPermA );
6876  vecVertB = vec_perm( vecLd3, vecLd4, vecPermB );
6877  vecVertC = vec_perm( vecLd5, vecLd6, vecPermC );
6878 
6879  // set the last element to 0
6880  vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
6881  vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
6882  vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );
6883 
6884  // load second A B C
6885  vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
6886  vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
6887  vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
6888  vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
6889  vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
6890  vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
6891 
6892  vecVertA2 = vec_perm( vecLd1, vecLd2, vecPermA2 );
6893  vecVertB2 = vec_perm( vecLd3, vecLd4, vecPermB2 );
6894  vecVertC2 = vec_perm( vecLd5, vecLd6, vecPermC2 );
6895 
6896  // set the last element to 0
6897  vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
6898  vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
6899  vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );
6900 
6901  // load third A B C
6902  vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
6903  vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
6904  vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
6905  vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
6906  vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
6907  vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
6908 
6909  vecVertA3 = vec_perm( vecLd1, vecLd2, vecPermA3 );
6910  vecVertB3 = vec_perm( vecLd3, vecLd4, vecPermB3 );
6911  vecVertC3 = vec_perm( vecLd5, vecLd6, vecPermC3 );
6912 
6913  // set the last element to 0
6914  vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
6915  vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
6916  vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );
6917 
6918  // load the fourth A B C
6919  vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
6920  vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
6921  vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
6922  vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
6923  vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
6924  vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
6925 
6926  vecVertA4 = vec_perm( vecLd1, vecLd2, vecPermA4 );
6927  vecVertB4 = vec_perm( vecLd3, vecLd4, vecPermB4 );
6928  vecVertC4 = vec_perm( vecLd5, vecLd6, vecPermC4 );
6929 
6930  // set the last element to 0
6931  vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
6932  vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
6933  vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
6934 #else
6935  // load first A B C
6936  vecVertA = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
6937  vecVertB = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
6938  vecVertC = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
6939 
6940  // set the last element to 0
6941  vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
6942  vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
6943  vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );
6944 
6945  // load second A B C
6946  vecVertA2 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
6947  vecVertB2 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
6948  vecVertC2 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
6949 
6950  // set the last element to 0
6951  vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
6952  vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
6953  vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );
6954 
6955  // load third A B C
6956  vecVertA3 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
6957  vecVertB3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
6958  vecVertC3 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
6959 
6960  // set the last element to 0
6961  vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
6962  vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
6963  vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );
6964 
6965  // load the fourth A B C
6966  vecVertA4 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
6967  vecVertB4 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
6968  vecVertC4 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
6969 
6970  // set the last element to 0
6971  vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
6972  vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
6973  vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
6974 #endif
6975  // calculate d0 and d1 for each
6976  vecD0 = vec_sub( vecVertB, vecVertA );
6977  vecD1 = vec_sub( vecVertC, vecVertA );
6978 
6979  vecD2 = vec_sub( vecVertB2, vecVertA2 );
6980  vecD3 = vec_sub( vecVertC2, vecVertA2 );
6981 
6982  vecD4 = vec_sub( vecVertB3, vecVertA3 );
6983  vecD5 = vec_sub( vecVertC3, vecVertA3 );
6984 
6985  vecD6 = vec_sub( vecVertB4, vecVertA4 );
6986  vecD7 = vec_sub( vecVertC4, vecVertA4 );
6987 
6988  vecWork1 = vec_perm( vecD0, vecD0, vecPerm1 );
6989  vecWork2 = vec_perm( vecD1, vecD1, vecPerm2 );
6990  vecWork3 = vec_perm( vecD2, vecD2, vecPerm1 );
6991  vecWork4 = vec_perm( vecD3, vecD3, vecPerm2 );
6992  vecWork5 = vec_perm( vecD4, vecD4, vecPerm1 );
6993  vecWork6 = vec_perm( vecD5, vecD5, vecPerm2 );
6994  vecWork7 = vec_perm( vecD6, vecD6, vecPerm1 );
6995  vecWork8 = vec_perm( vecD7, vecD7, vecPerm2 );
6996 
6997  vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
6998  vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
6999  vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7000  vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7001 
7002  vecWork1 = vec_perm( vecD1, vecD1, vecPerm1 );
7003  vecWork2 = vec_perm( vecD0, vecD0, vecPerm2 );
7004  vecWork3 = vec_perm( vecD3, vecD3, vecPerm1 );
7005  vecWork4 = vec_perm( vecD2, vecD2, vecPerm2 );
7006  vecWork5 = vec_perm( vecD5, vecD5, vecPerm1 );
7007  vecWork6 = vec_perm( vecD4, vecD4, vecPerm2 );
7008  vecWork7 = vec_perm( vecD7, vecD7, vecPerm1 );
7009  vecWork8 = vec_perm( vecD6, vecD6, vecPerm2 );
7010 
7011  vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7012  vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7013  vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7014  vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7015 
7016  vecN = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
7017  vecN2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
7018  vecN3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
7019  vecN4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );
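/*
   The permute/madd pairs above are a vectorized cross product: vecPerm1
   rotates a vector to (y, z, x, w) and vecPerm2 to (z, x, y, w), so for each
   triangle

       firstHalf  = d1.yzx * d0.zxy
       secondHalf = d0.yzx * d1.zxy
       n          = firstHalf - secondHalf   // the madd with vecNegOne

   which matches the scalar form in the cleanup loop below, e.g.
   n[0] = d1[1] * d0[2] - d1[2] * d0[1]. The zeroed w lanes keep the fourth
   component at 0.
*/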
7020 
7021  // transpose vecNs
7022  vector float v0, v1, v2, v3;
7023  v0 = vec_mergeh( vecN, vecN3 );
7024  v1 = vec_mergeh( vecN2, vecN4 );
7025  v2 = vec_mergel( vecN, vecN3 );
7026  v3 = vec_mergel( vecN2, vecN4 );
7027 
7028  vecN = vec_mergeh( v0, v1 );
7029  vecN2 = vec_mergel( v0, v1 );
7030  vecN3 = vec_mergeh( v2, v3 );
7031  vecN4 = vec_mergel( v2, v3 );
7032 
7033  vecF = vec_madd( vecN, vecN, zeroVector );
7034  vecF = vec_madd( vecN2, vecN2, vecF );
7035  vecF = vec_madd( vecN3, vecN3, vecF );
7036 
7037  vecF = ReciprocalSquareRoot( vecF );
7038 
7039  vecF1 = vec_madd( vecF, vecN, zeroVector );
7040  vecF2 = vec_madd( vecF, vecN2, zeroVector );
7041  vecF3 = vec_madd( vecF, vecN3, zeroVector );
7042  vecF4 = vec_madd( vecF, vecN4, zeroVector );
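/*
   After the transpose, vecN/vecN2/vecN3 hold the x, y and z components of all
   four normals, so the three madds accumulate x*x + y*y + z*z per lane and one
   ReciprocalSquareRoot call yields 1/sqrt for four normals at once; vecF1..3
   are then the normalized components. vecF4 scales the (zeroed) w lane and is
   rewritten below once the plane distances are known.
*/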
7043 
7044  vector float v8, v9, v10, v11;
7045  v8 = vecF1;
7046  v9 = vecF2;
7047  v10 = vecF3;
7048  v11 = vecF4;
7049 
7050  // transpose vecVerts
7051  v0 = vec_mergeh( vecVertA, vecVertA3 );
7052  v1 = vec_mergeh( vecVertA2, vecVertA4 );
7053  v2 = vec_mergel( vecVertA, vecVertA3 );
7054  v3 = vec_mergel( vecVertA2, vecVertA4 );
7055 
7056  vecVertA = vec_mergeh( v0, v1 );
7057  vecVertA2 = vec_mergel( v0, v1 );
7058  vecVertA3 = vec_mergeh( v2, v3 );
7059  vecVertA4 = vec_mergel( v2, v3 );
7060 
7061  vector float vecTotals;
7062  vecTotals = vec_madd( vecVertA, v8, zeroVector );
7063  vecTotals = vec_madd( vecVertA2, v9, vecTotals );
7064  vecTotals = vec_madd( vecVertA3, v10, vecTotals );
7065  vecTotals = vec_madd( vecVertA4, v11, vecTotals );
7066  vecF = vec_madd( vecTotals, vecNegOne, zeroVector );
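/*
   vecTotals is the dot product of each normalized normal with its triangle's
   first vertex, so negating it gives the plane's distance term, matching
   FitThroughPoint in the scalar tail: for a plane with normal n through point
   a, the distance term is d = -( n . a ), since n . a + d must equal 0.
*/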
7067 
7068  // transpose vecFs
7069  v0 = vec_mergeh( vecF1, vecF3 );
7070  v1 = vec_mergeh( vecF2, vecF );
7071  v2 = vec_mergel( vecF1, vecF3 );
7072  v3 = vec_mergel( vecF2, vecF );
7073 
7074  vecF1 = vec_mergeh( v0, v1 );
7075  vecF2 = vec_mergel( v0, v1 );
7076  vecF3 = vec_mergeh( v2, v3 );
7077  vecF4 = vec_mergel( v2, v3 );
7078 
7079  // store results
7080  UNALIGNED_STORE4( planePtr + ( j * PLANE_OFFSET ), vecF1, vecF2, vecF3, vecF4 );
7081  }
7082 
7083  // cleanup
7084  for ( ; i < numIndexes; i += 3, j++ ) {
7085  const idDrawVert *a, *b, *c;
7086  float d0[3], d1[3], f;
7087  idVec3 n;
7088 
7089  a = verts + indexes[i + 0];
7090  b = verts + indexes[i + 1];
7091  c = verts + indexes[i + 2];
7092 
7093  d0[0] = b->xyz[0] - a->xyz[0];
7094  d0[1] = b->xyz[1] - a->xyz[1];
7095  d0[2] = b->xyz[2] - a->xyz[2];
7096 
7097  d1[0] = c->xyz[0] - a->xyz[0];
7098  d1[1] = c->xyz[1] - a->xyz[1];
7099  d1[2] = c->xyz[2] - a->xyz[2];
7100 
7101  n[0] = d1[1] * d0[2] - d1[2] * d0[1];
7102  n[1] = d1[2] * d0[0] - d1[0] * d0[2];
7103  n[2] = d1[0] * d0[1] - d1[1] * d0[0];
7104 
7105  f = FastScalarInvSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
7106  //idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
7107 
7108  n.x *= f;
7109  n.y *= f;
7110  n.z *= f;
7111 
7112  planes[j].SetNormal( n );
7113  planes[j].FitThroughPoint( a->xyz );
7114  }
7115 }
7116 
7117 /*
7118 ============
7119 idSIMD_AltiVec::DeriveTangents
7120 
7121  Derives the normal and orthogonal tangent vectors for the triangle vertices.
7122  For each vertex the normal and tangent vectors are derived from all triangles
7123  using the vertex which results in smooth tangents across the mesh.
7124  In the process the triangle planes are calculated as well.
7125 
7126 ============
7127 */
7128 void VPCALL idSIMD_AltiVec::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
7129  int i;
7130 
7131  bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
7132  memset( used, 0, numVerts * sizeof( used[0] ) );
7133 
7134  idPlane *planesPtr = planes;
7135  for ( i = 0; i < numIndexes; i += 3 ) {
7136  idDrawVert *a, *b, *c;
7137  // unsigned long signBit;
7138  float d0[5], d1[5], area;
7139  idVec3 n, t0, t1;
7140  float f1, f2, f3;
7141 
7142  int v0 = indexes[i + 0];
7143  int v1 = indexes[i + 1];
7144  int v2 = indexes[i + 2];
7145 
7146  a = verts + v0;
7147  b = verts + v1;
7148  c = verts + v2;
7149 
7150  d0[0] = b->xyz[0] - a->xyz[0];
7151  d0[1] = b->xyz[1] - a->xyz[1];
7152  d0[2] = b->xyz[2] - a->xyz[2];
7153  d0[3] = b->st[0] - a->st[0];
7154  d0[4] = b->st[1] - a->st[1];
7155 
7156  d1[0] = c->xyz[0] - a->xyz[0];
7157  d1[1] = c->xyz[1] - a->xyz[1];
7158  d1[2] = c->xyz[2] - a->xyz[2];
7159  d1[3] = c->st[0] - a->st[0];
7160  d1[4] = c->st[1] - a->st[1];
7161 
7162  // normal
7163  n[0] = d1[1] * d0[2] - d1[2] * d0[1];
7164  n[1] = d1[2] * d0[0] - d1[0] * d0[2];
7165  n[2] = d1[0] * d0[1] - d1[1] * d0[0];
7166 
7167  f1 = n.x * n.x + n.y * n.y + n.z * n.z;
7168 
7169  // area sign bit
7170  area = d0[3] * d1[4] - d0[4] * d1[3];
7171 
7172  // first tangent
7173  t0[0] = d0[0] * d1[4] - d0[4] * d1[0];
7174  t0[1] = d0[1] * d1[4] - d0[4] * d1[1];
7175  t0[2] = d0[2] * d1[4] - d0[4] * d1[2];
7176 
7177  f2 = t0.x * t0.x + t0.y * t0.y + t0.z * t0.z;
7178 
7179  // second tangent
7180  t1[0] = d0[3] * d1[0] - d0[0] * d1[3];
7181  t1[1] = d0[3] * d1[1] - d0[1] * d1[3];
7182  t1[2] = d0[3] * d1[2] - d0[2] * d1[3];
7183 
7184  f3 = t1.x * t1.x + t1.y * t1.y + t1.z * t1.z;
7185 
7186  // Behold! The power of the pipeline
7187  FastScalarInvSqrt_x3( &f1, &f2, &f3 );
7188 #ifdef PPC_INTRINSICS
7189  f2 = __fsel( area, f2, -f2 );
7190  f3 = __fsel( area, f3, -f3 );
7191 #else
7192  f2 = ( area < 0.0f ) ? -f2 : f2;
7193  f3 = ( area < 0.0f ) ? -f3 : f3;
7194 #endif
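/*
   __fsel( a, b, c ) evaluates to b when a >= 0.0 and to c otherwise, so the
   intrinsic path applies the sign of the texture-space area to f2 and f3
   without a compare-and-branch; the #else path is the portable equivalent.
   Keeping the loop branch-free helps here because the area sign is
   effectively random across a mesh, making the branch hard to predict.
*/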
7195  t0.x *= f2;
7196  t0.y *= f2;
7197  t0.z *= f2;
7198 
7199  n.x *= f1;
7200  n.y *= f1;
7201  n.z *= f1;
7202 
7203  planesPtr->SetNormal( n );
7204  planesPtr->FitThroughPoint( a->xyz );
7205  planesPtr++;
7206 
7207  t1.x *= f3;
7208  t1.y *= f3;
7209  t1.z *= f3;
7210 
7211  if ( used[v0] ) {
7212  a->normal += n;
7213  a->tangents[0] += t0;
7214  a->tangents[1] += t1;
7215  } else {
7216  a->normal = n;
7217  a->tangents[0] = t0;
7218  a->tangents[1] = t1;
7219  used[v0] = true;
7220  }
7221 
7222  if ( used[v1] ) {
7223  b->normal += n;
7224  b->tangents[0] += t0;
7225  b->tangents[1] += t1;
7226  } else {
7227  b->normal = n;
7228  b->tangents[0] = t0;
7229  b->tangents[1] = t1;
7230  used[v1] = true;
7231  }
7232 
7233  if ( used[v2] ) {
7234  c->normal += n;
7235  c->tangents[0] += t0;
7236  c->tangents[1] += t1;
7237  } else {
7238  c->normal = n;
7239  c->tangents[0] = t0;
7240  c->tangents[1] = t1;
7241  used[v2] = true;
7242  }
7243  }
7244 }
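/*
   The math behind the d0/d1 setup in DeriveTangents, written out: with edge
   vectors e1 = b - a, e2 = c - a and texture deltas (s1, t1) = (d0[3], d0[4]),
   (s2, t2) = (d1[3], d1[4]), the unnormalized tangent T and bitangent B solve

       e1 = s1 * T + t1 * B
       e2 = s2 * T + t2 * B

   giving, up to the determinant factor 1 / (s1*t2 - s2*t1),

       T = t2 * e1 - t1 * e2      // the idVec3 t0 above
       B = s1 * e2 - s2 * e1      // the idVec3 t1 above

   The determinant s1*t2 - s2*t1 is the "area"; only its sign is applied,
   since FastScalarInvSqrt_x3 already supplies the magnitude normalization.
*/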
7245 
7246 
7247 #ifdef DERIVE_UNSMOOTH_DRAWVERT_ALIGNED
7248 
7249 /*
7250 ============
7251 idSIMD_AltiVec::DeriveUnsmoothedTangents
7252 
7253  Derives the normal and orthogonal tangent vectors for the triangle vertices.
7254  For each vertex the normal and tangent vectors are derived from a single dominant triangle.
7255 ============
7256 */
7257 #define DERIVE_UNSMOOTHED_BITANGENT
7258 void VPCALL idSIMD_AltiVec::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
7259 
7260  int i;
7261  // idDrawVert size
7262  assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
7263  // drawverts aligned
7264  assert( IS_16BYTE_ALIGNED( verts[0] ) );
7265 
7266  vector float vecVertA, vecVertB, vecVertC;
7267  vector float vecVertA2, vecVertB2, vecVertC2;
7268  vector float vecVertA3, vecVertB3, vecVertC3;
7269  vector float vecVertA4, vecVertB4, vecVertC4;
7270 
7271  vector float v0, v1, v2, v3, v4, v5, v6, v7, v8;
7272  vector float vecS0, vecS1, vecS2;
7273  vector float vecS0_2, vecS1_2, vecS2_2;
7274  vector float vecS0_3, vecS1_3, vecS2_3;
7275  vector float vecS0_4, vecS1_4, vecS2_4;
7276 
7277  vector float vecD1, vecD2, vecD3, vecD4, vecD5, vecD6;
7278  vector float vecD7, vecD8, vecD9, vecD10, vecD11, vecD12;
7279  vector float vecT1, vecT1_2, vecT1_3, vecT1_4, vecT2, vecT2_2, vecT2_3, vecT2_4;
7280  vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
7281  vector float vecN, vecN2, vecN3, vecN4;
7282 
7283  vector unsigned char vecPermN0 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
7284  vector unsigned char vecPermN1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
7285  vector unsigned char vecPermT0 = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3);
7286  vector unsigned char vecPermT1 = (vector unsigned char)(8,9,10,11,8,9,10,11,8,9,10,11,8,9,10,11);
      // splats element 3 (the st deltas d3/d8); required by the #ifndef DERIVE_UNSMOOTHED_BITANGENT path below
      vector unsigned char vecPermT2 = (vector unsigned char)(12,13,14,15,12,13,14,15,12,13,14,15,12,13,14,15);
7287  vector float zeroVector = (vector float)(0);
7288 
7289  vector float vecNegOne = (vector float)(-1.0);
7290 
7291  vector float vecStore1, vecStore2, vecStore3;
7292  vector unsigned char vecPermFirstThreeLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
7293  vector unsigned char vecPermStoreSecond = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
7294  vector unsigned char vecPermLeadAndThree = (vector unsigned char)(0,1,2,3,16,17,18,19,20,21,22,23,24,25,26,27);
7295  vector unsigned char vecPermStore2 = (vector unsigned char)(4,5,6,7,8,9,10,11,24,25,26,27,28,29,30,31);
7296  vector unsigned char vecPermStore3 = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
7297  vector unsigned char vecPermStore4 = (vector unsigned char)(8,9,10,11,16,17,18,19,20,21,22,23,24,25,26,27);
7298  vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
7299 
7300  vector float vecLd1, vecLd2, vecLd3;
7301  vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3, vecPerm4;
7302 
7303  float *normalPtr = verts[0].normal.ToFloatPtr();
7304  float *xyzPtr = verts[0].xyz.ToFloatPtr();
7305 
7306  vector float vecFirstHalf, vecSecondHalf;
7307  vector float vecFirstHalf2, vecSecondHalf2;
7308  vector float vecFirstHalf3, vecSecondHalf3;
7309  vector float vecFirstHalf4, vecSecondHalf4;
7310 
7311  for ( i = 0; i+3 < numVerts; i+=4 ) {
7312  int bOffset1, bOffset2, bOffset3, bOffset4;
7313  int cOffset1, cOffset2, cOffset3, cOffset4;
7314 
7315  bOffset1 = dominantTris[i].v2;
7316  cOffset1 = dominantTris[i].v3;
7317  bOffset2 = dominantTris[i+1].v2;
7318  cOffset2 = dominantTris[i+1].v3;
7319  bOffset3 = dominantTris[i+2].v2;
7320  cOffset3 = dominantTris[i+2].v3;
7321  bOffset4 = dominantTris[i+3].v2;
7322  cOffset4 = dominantTris[i+3].v3;
7323 
7324  vecPerm0 = vec_lvsl( 0, xyzPtr + ( i * DRAWVERT_OFFSET ) );
7325  v0 = vec_ld( 0, xyzPtr + (i * DRAWVERT_OFFSET ) );
7326  v1 = vec_ld( 16, xyzPtr + (i * DRAWVERT_OFFSET ) );
7327  vecVertA = vec_perm( v0, v1, vecPerm0 );
7328 
7329  vecPerm1 = vec_lvsl( 0, xyzPtr + (bOffset1 * DRAWVERT_OFFSET ) );
7330  v2 = vec_ld( 0, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
7331  v3 = vec_ld( 16, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
7332  vecVertB = vec_perm( v2, v3, vecPerm1 );
7333 
7334  vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
7335  v4 = vec_ld( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
7336  v5 = vec_ld( 16, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
7337  vecVertC = vec_perm( v4, v5, vecPerm2 );
7338 
7339  // put remainder into v2
7340  v1 = vec_perm( v1, v1, vecPerm0 );
7341  v3 = vec_perm( v3, v3, vecPerm1 );
7342  v5 = vec_perm( v5, v5, vecPerm2 );
7343 
7344  v1 = vec_mergeh( v1, v5 );
7345  v2 = vec_mergeh( v3, zeroVector );
7346  v2 = vec_mergeh( v1, v2 );
7347  v2 = vec_perm( v2, v2, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
7348 
7349  // load second one
7350  vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
7351  v0 = vec_ld( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
7352  v1 = vec_ld( 16, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
7353  vecVertA2 = vec_perm( v0, v1, vecPerm0 );
7354 
7355  vecPerm3 = vec_lvsl( 0, xyzPtr + (bOffset2 * DRAWVERT_OFFSET ) );
7356  v3 = vec_ld( 0, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
7357  v4 = vec_ld( 16, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
7358  vecVertB2 = vec_perm( v3, v4, vecPerm3 );
7359 
7360  vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
7361  v5 = vec_ld( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
7362  v6 = vec_ld( 16, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
7363  vecVertC2 = vec_perm( v5, v6, vecPerm4 );
7364 
7365  // put remainder into v3
7366  v1 = vec_perm( v1, v1, vecPerm0 );
7367  v4 = vec_perm( v4, v4, vecPerm3 );
7368  v5 = vec_perm( v6, v6, vecPerm4 );
7369 
7370  v1 = vec_mergeh( v1, v5 );
7371  v3 = vec_mergeh( v4, zeroVector );
7372  v3 = vec_mergeh( v1, v3 );
7373  v3 = vec_perm( v3, v3, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
7374 
7375  // load third one
7376  vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
7377  v0 = vec_ld( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
7378  v1 = vec_ld( 16, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
7379  vecVertA3 = vec_perm( v0, v1, vecPerm0 );
7380 
7381  vecPerm1 = vec_lvsl( 0, xyzPtr + (bOffset3 * DRAWVERT_OFFSET ) );
7382  v4 = vec_ld( 0, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
7383  v5 = vec_ld( 16, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
7384  vecVertB3 = vec_perm( v4, v5, vecPerm1 );
7385 
7386  vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
7387  v6 = vec_ld( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
7388  v7 = vec_ld( 16, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
7389  vecVertC3 = vec_perm( v6, v7, vecPerm2 );
7390 
7391  // put remainder into v4
7392  v1 = vec_perm( v1, v1, vecPerm0 );
7393  v5 = vec_perm( v5, v5, vecPerm1 );
7394  v7 = vec_perm( v7, v7, vecPerm2 );
7395 
7396  v1 = vec_mergeh( v1, v7 );
7397  v4 = vec_mergeh( v5, zeroVector );
7398  v4 = vec_mergeh( v1, v4 );
7399  v4 = vec_perm( v4, v4, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
7400 
7401  // load fourth one
7402  vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
7403  v0 = vec_ld( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
7404  v1 = vec_ld( 16, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
7405  vecVertA4 = vec_perm( v0, v1, vecPerm0 );
7406 
7407  vecPerm3 = vec_lvsl( 0, xyzPtr + (bOffset4 * DRAWVERT_OFFSET ) );
7408  v5 = vec_ld( 0, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
7409  v6 = vec_ld( 16, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
7410  vecVertB4 = vec_perm( v5, v6, vecPerm3 );
7411 
7412  vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
7413  v7 = vec_ld( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
7414  v8 = vec_ld( 16, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
7415  vecVertC4 = vec_perm( v7, v8, vecPerm4 );
7416 
7417  // put remainder into v5
7418  v1 = vec_perm( v1, v1, vecPerm0 );
7419  v6 = vec_perm( v6, v6, vecPerm3 );
7420  v8 = vec_perm( v8, v8, vecPerm4 );
7421 
7422  v1 = vec_mergeh( v1, v8 );
7423  v5 = vec_mergeh( v6, zeroVector );
7424  v5 = vec_mergeh( v1, v5 );
7425  v5 = vec_perm( v5, v5, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
7426 
7427  // remainder vectors look like b->st[1], a->st[1], c->st[1], a->st[1]
7428 
7429  //vecD1 now holds d0, d1, d2, d3
7430  vecD1 = vec_sub( vecVertB, vecVertA );
7431  vecD4 = vec_sub( vecVertB2, vecVertA2 );
7432  vecD7 = vec_sub( vecVertB3, vecVertA3 );
7433  vecD10 = vec_sub( vecVertB4, vecVertA4 );
7434 
7435  // vecD2 now holds d5, d6, d7, d8
7436  vecD2 = vec_sub( vecVertC, vecVertA );
7437  vecD5 = vec_sub( vecVertC2, vecVertA2 );
7438  vecD8 = vec_sub( vecVertC3, vecVertA3 );
7439  vecD11 = vec_sub( vecVertC4, vecVertA4 );
7440 
7441  // vecD3 now holds d4, crap, d9, crap
7442  vecD3 = vec_sub( v2, vec_sld( v2, v2, 4 ) );
7443  vecD6 = vec_sub( v3, vec_sld( v3, v3, 4 ) );
7444  vecD9 = vec_sub( v4, vec_sld( v4, v4, 4 ) );
7445  vecD12 = vec_sub( v5, vec_sld( v5, v5, 4 ) );
7446 
7447  // get permute vectors for loading from dt
7448  vecPerm1 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i].normalizationScale[0] ), (vector unsigned char)(1) );
7449  vecPerm2 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+1].normalizationScale[0] ), (vector unsigned char)(1) );
7450  vecPerm3 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+2].normalizationScale[0] ), (vector unsigned char)(1) );
7451  vecPerm4 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+3].normalizationScale[0] ), (vector unsigned char)(1) );
7452 
7453  // load S values from dominantTris
7454  v0 = vec_ld( 0, &dominantTris[i].normalizationScale[0] );
7455  v1 = vec_ld( 11, &dominantTris[i].normalizationScale[0] );
7456  v2 = vec_ld( 0, &dominantTris[i+1].normalizationScale[0] );
7457  v3 = vec_ld( 11, &dominantTris[i+1].normalizationScale[0] );
7458  v4 = vec_ld( 0, &dominantTris[i+2].normalizationScale[0] );
7459  v5 = vec_ld( 11, &dominantTris[i+2].normalizationScale[0] );
7460  v6 = vec_ld( 0, &dominantTris[i+3].normalizationScale[0] );
7461  v7 = vec_ld( 11, &dominantTris[i+3].normalizationScale[0] );
7462 
7463  v0 = vec_perm( v0, v1, vecPerm1 );
7464  v2 = vec_perm( v2, v3, vecPerm2 );
7465  v4 = vec_perm( v4, v5, vecPerm3 );
7466  v6 = vec_perm( v6, v7, vecPerm4 );
7467 
7468  vecS0 = vec_splat( v0, 0 );
7469  vecS1 = vec_splat( v0, 1 );
7470  vecS2 = vec_splat( v0, 2 );
7471 
7472  vecS0_2 = vec_splat( v2, 0);
7473  vecS1_2 = vec_splat( v2, 1 );
7474  vecS2_2 = vec_splat( v2, 2 );
7475 
7476  vecS0_3 = vec_splat( v4, 0 );
7477  vecS1_3 = vec_splat( v4, 1 );
7478  vecS2_3 = vec_splat( v4, 2 );
7479 
7480  vecS0_4 = vec_splat( v6, 0 );
7481  vecS1_4 = vec_splat( v6, 1 );
7482  vecS2_4 = vec_splat( v6, 2 );
7483 
7484  // do calculation
7485  vecWork1 = vec_perm( vecD2, vecD2, vecPermN1 );
7486  vecWork2 = vec_perm( vecD1, vecD1, vecPermN0 );
7487  vecWork3 = vec_perm( vecD5, vecD5, vecPermN1 );
7488  vecWork4 = vec_perm( vecD4, vecD4, vecPermN0 );
7489  vecWork5 = vec_perm( vecD8, vecD8, vecPermN1 );
7490  vecWork6 = vec_perm( vecD7, vecD7, vecPermN0 );
7491  vecWork7 = vec_perm( vecD11, vecD11, vecPermN1 );
7492  vecWork8 = vec_perm( vecD10, vecD10, vecPermN0 );
7493 
7494  vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7495  vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7496  vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7497  vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7498 
7499  vecWork1 = vec_perm( vecD2, vecD2, vecPermN0 );
7500  vecWork2 = vec_perm( vecD1, vecD1, vecPermN1 );
7501  vecWork3 = vec_perm( vecD5, vecD5, vecPermN0 );
7502  vecWork4 = vec_perm( vecD4, vecD4, vecPermN1 );
7503  vecWork5 = vec_perm( vecD8, vecD8, vecPermN0 );
7504  vecWork6 = vec_perm( vecD7, vecD7, vecPermN1 );
7505  vecWork7 = vec_perm( vecD11, vecD11, vecPermN0 );
7506  vecWork8 = vec_perm( vecD10, vecD10, vecPermN1 );
7507 
7508  vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
7509  vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
7510  vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
7511  vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );
7512 
7513 
7514  // calculate N values
7515  vecN = vec_madd( vecS2, vecSecondHalf, zeroVector );
7516  vecN2 = vec_madd( vecS2_2, vecSecondHalf2, zeroVector );
7517  vecN3 = vec_madd( vecS2_3, vecSecondHalf3, zeroVector );
7518  vecN4 = vec_madd( vecS2_4, vecSecondHalf4, zeroVector );
7519 
7520  // calculate both halves of the calculation for t
7521  vecWork1 = vecD1;
7522  vecWork2 = vec_perm( vecD3, vecD3, vecPermT1 );
7523  vecWork3 = vecD4;
7524  vecWork4 = vec_perm( vecD6, vecD6, vecPermT1 );
7525  vecWork5 = vecD7;
7526  vecWork6 = vec_perm( vecD9, vecD9, vecPermT1 );
7527  vecWork7 = vecD10;
7528  vecWork8 = vec_perm( vecD12, vecD12, vecPermT1 );
7529 
7530  vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7531  vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7532  vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7533  vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7534 
7535  vecWork1 = vecD2;
7536  vecWork2 = vec_perm( vecD3, vecD3, vecPermT0 );
7537  vecWork3 = vecD5;
7538  vecWork4 = vec_perm( vecD6, vecD6, vecPermT0 );
7539  vecWork5 = vecD8;
7540  vecWork6 = vec_perm( vecD9, vecD9, vecPermT0 );
7541  vecWork7 = vecD11;
7542  vecWork8 = vec_perm( vecD12, vecD12, vecPermT0 );
7543 
7544  vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
7545  vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
7546  vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
7547  vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );
7548 
7549  // calculate T values
7550  vecT1 = vec_madd( vecS0, vecSecondHalf, zeroVector );
7551  vecT1_2 = vec_madd( vecS0_2, vecSecondHalf2, zeroVector );
7552  vecT1_3 = vec_madd( vecS0_3, vecSecondHalf3, zeroVector );
7553  vecT1_4 = vec_madd( vecS0_4, vecSecondHalf4, zeroVector );
7554 
7555 #ifndef DERIVE_UNSMOOTHED_BITANGENT
7556  vecWork1 = vecD1;
7557  vecWork2 = vec_perm( vecD2, vecD2, vecPermT2 );
7558  vecWork3 = vecD4;
7559  vecWork4 = vec_perm( vecD5, vecD5, vecPermT2 );
7560  vecWork5 = vecD7;
7561  vecWork6 = vec_perm( vecD8, vecD8, vecPermT2 );
7562  vecWork7 = vecD10;
7563  vecWork8 = vec_perm( vecD11, vecD11, vecPermT2 );
7564 
7565  vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7566  vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7567  vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7568  vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7569 
7570  vecWork1 = vec_perm( vecD1, vecD1, vecPermT2 );
7571  vecWork2 = vecD2;
7572  vecWork3 = vec_perm( vecD4, vecD4, vecPermT2 );
7573  vecWork4 = vecD5;
7574  vecWork5 = vec_perm( vecD7, vecD7, vecPermT2 );
7575  vecWork6 = vecD8;
7576  vecWork7 = vec_perm( vecD10, vecD10, vecPermT2 );
7577  vecWork8 = vecD11;
7578 
7579  vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7580  vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7581  vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7582  vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7583 
7584 #else
7585  vecWork1 = vec_perm( vecN, vecN, vecPermN1 );
7586  vecWork2 = vec_perm( vecT1, vecT1, vecPermN0 );
7587  vecWork3 = vec_perm( vecN2, vecN2, vecPermN1 );
7588  vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN0 );
7589  vecWork5 = vec_perm( vecN3, vecN3, vecPermN1 );
7590  vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN0 );
7591  vecWork7 = vec_perm( vecN4, vecN4, vecPermN1 );
7592  vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN0 );
7593 
7594  vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7595  vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7596  vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7597  vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7598 
7599  vecWork1 = vec_perm( vecN, vecN, vecPermN0 );
7600  vecWork2 = vec_perm( vecT1, vecT1, vecPermN1 );
7601  vecWork3 = vec_perm( vecN2, vecN2, vecPermN0 );
7602  vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN1 );
7603  vecWork5 = vec_perm( vecN3, vecN3, vecPermN0 );
7604  vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN1 );
7605  vecWork7 = vec_perm( vecN4, vecN4, vecPermN0 );
7606  vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN1 );
7607 
7608  vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7609  vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7610  vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7611  vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7612 #endif
7613  // finish the calculation
7614  vecSecondHalf = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
7615  vecSecondHalf2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
7616  vecSecondHalf3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
7617  vecSecondHalf4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );
7618 
7619  vecT2 = vec_madd( vecS1, vecSecondHalf, zeroVector );
7620  vecT2_2 = vec_madd( vecS1_2, vecSecondHalf2, zeroVector );
7621  vecT2_3 = vec_madd( vecS1_3, vecSecondHalf3, zeroVector );
7622  vecT2_4 = vec_madd( vecS1_4, vecSecondHalf4, zeroVector );
7623 
7624  // Store results
7625 
7626  // read values that we need to preserve
7627  vecLd1 = vec_ld( 0, normalPtr + ( i * DRAWVERT_OFFSET ) );
7628  vecLd2 = vec_ld( 32, normalPtr + ( i * DRAWVERT_OFFSET ) );
7629 
7630  //generate vectors to store
7631  vecStore1 = vec_perm( vecLd1, vecN, vecPermLeadAndThree );
7632  vecStore2 = vec_perm( vecT1, vecT2, vecPermFirstThreeLast );
7633  vecStore3 = vec_perm( vecT2, vecLd2, vecPermStore2 );
7634 
7635  // store out results
7636  ALIGNED_STORE3( normalPtr + ( i * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
7637 
7638  // read values that we need to preserve
7639  vecLd3 = vec_ld( 32, normalPtr + ( (i+1) * DRAWVERT_OFFSET ));
7640 
7641  // generate vectors to store
7642  vecStore1 = vec_perm( vecN2, vecT1_2, vecPermFirstThreeLast );
7643  vecStore2 = vec_perm( vecT1_2, vecT2_2, vecPermStoreSecond );
7644  vecStore3 = vec_perm( vecT2_2, vecLd3, (vector unsigned char)(8,9,10,11,20,21,22,23,24,25,26,27,28,29,30,31) );
7645 
7646  // (possible optimization: instead of the permutes, shift into place and use vec_ste)
7647  // store out vectors
7648  ALIGNED_STORE3( normalPtr + ((i+1) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );
7649 
7650  // read values that we need to preserve
7651  vecLd1 = vec_ld( 0, normalPtr + ( (i+2) * DRAWVERT_OFFSET ) );
7652 
7653  // generate vectors to store
7654  vecStore1 = vec_perm( vecLd1, vecN3, vecPermFirstThreeLast );
7655  vecStore2 = vec_perm( vecN3, vecT1_3, vecPermStore3 );
7656  vecStore3 = vec_perm( vecT1_3, vecT2_3, vecPermStore4 );
7657 
7658  // store out vectors
7659  ALIGNED_STORE3( normalPtr + ((i+2) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );
7660 
7661  // read values that we need to preserve
7662  vecLd2 = vec_ld( 0, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
7663  vecLd3 = vec_ld( 32, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
7664 
7665  // generate vectors to store
7666  vecStore1 = vec_perm( vecLd2, vecN4, vecPermHalves );
7667  vecStore2 = vec_perm( vecN4, vecT1_4, vecPermStore4 );
7668  vecStore3 = vec_perm( vecT2_4, vecLd3, vecPermFirstThreeLast );
7669 
7670  // store out vectors
7671  ALIGNED_STORE3( normalPtr + ((i+3) * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
7672  }
7673 
7674  // cleanup
7675  for ( ; i < numVerts; i++ ) {
7676  idDrawVert *a, *b, *c;
7677  float d0, d1, d2, d3, d4;
7678  float d5, d6, d7, d8, d9;
7679  float s0, s1, s2;
7680  float n0, n1, n2;
7681  float t0, t1, t2;
7682  float t3, t4, t5;
7683 
7684  const dominantTri_s &dt = dominantTris[i];
7685 
7686  a = verts + i;
7687  b = verts + dt.v2;
7688  c = verts + dt.v3;
7689 
7690  d0 = b->xyz[0] - a->xyz[0];
7691  d1 = b->xyz[1] - a->xyz[1];
7692  d2 = b->xyz[2] - a->xyz[2];
7693  d3 = b->st[0] - a->st[0];
7694 
7695  d4 = b->st[1] - a->st[1];
7696 
7697  d5 = c->xyz[0] - a->xyz[0];
7698  d6 = c->xyz[1] - a->xyz[1];
7699  d7 = c->xyz[2] - a->xyz[2];
7700  d8 = c->st[0] - a->st[0];
7701 
7702  d9 = c->st[1] - a->st[1];
7703 
7704  s0 = dt.normalizationScale[0];
7705  s1 = dt.normalizationScale[1];
7706  s2 = dt.normalizationScale[2];
7707 
7708  n0 = s2 * ( d6 * d2 - d7 * d1 );
7709  n1 = s2 * ( d7 * d0 - d5 * d2 );
7710  n2 = s2 * ( d5 * d1 - d6 * d0 );
7711 
7712  t0 = s0 * ( d0 * d9 - d4 * d5 );
7713  t1 = s0 * ( d1 * d9 - d4 * d6 );
7714  t2 = s0 * ( d2 * d9 - d4 * d7 );
7715 
7716 #ifndef DERIVE_UNSMOOTHED_BITANGENT
7717  t3 = s1 * ( d3 * d5 - d0 * d8 );
7718  t4 = s1 * ( d3 * d6 - d1 * d8 );
7719  t5 = s1 * ( d3 * d7 - d2 * d8 );
7720 #else
7721  t3 = s1 * ( n2 * t1 - n1 * t2 );
7722  t4 = s1 * ( n0 * t2 - n2 * t0 );
7723  t5 = s1 * ( n1 * t0 - n0 * t1 );
7724 #endif
7725 
7726  a->normal[0] = n0;
7727  a->normal[1] = n1;
7728  a->normal[2] = n2;
7729 
7730  a->tangents[0][0] = t0;
7731  a->tangents[0][1] = t1;
7732  a->tangents[0][2] = t2;
7733 
7734  a->tangents[1][0] = t3;
7735  a->tangents[1][1] = t4;
7736  a->tangents[1][2] = t5;
7737  }
7738 }
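/*
   With DERIVE_UNSMOOTHED_BITANGENT defined (as above), the second tangent is
   not solved from texture space directly but taken as a scaled cross product
   of the first tangent and the normal:

       ( t3, t4, t5 ) = s1 * ( t x n )

   which guarantees it is orthogonal to both, at the cost of the exact
   texture-space solution kept in the #ifndef branch.
*/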
7739 
7740 #else
7741 /*
7742 ============
7743 idSIMD_AltiVec::DeriveUnsmoothedTangents
7744 
7745  Derives the normal and orthogonal tangent vectors for the triangle vertices.
7746  For each vertex the normal and tangent vectors are derived from a single dominant triangle.
7747 ============
7748 */
7749 #define DERIVE_UNSMOOTHED_BITANGENT
7750 
7751 void VPCALL idSIMD_AltiVec::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
7752  int i;
7753 
7754  for ( i = 0; i < numVerts; i++ ) {
7755  idDrawVert *a, *b, *c;
7756  float d0, d1, d2, d3, d4;
7757  float d5, d6, d7, d8, d9;
7758  float s0, s1, s2;
7759  float n0, n1, n2;
7760  float t0, t1, t2;
7761  float t3, t4, t5;
7762 
7763  const dominantTri_s &dt = dominantTris[i];
7764 
7765  a = verts + i;
7766  b = verts + dt.v2;
7767  c = verts + dt.v3;
7768 
7769  d0 = b->xyz[0] - a->xyz[0];
7770  d1 = b->xyz[1] - a->xyz[1];
7771  d2 = b->xyz[2] - a->xyz[2];
7772  d3 = b->st[0] - a->st[0];
7773 
7774  d4 = b->st[1] - a->st[1];
7775 
7776  d5 = c->xyz[0] - a->xyz[0];
7777  d6 = c->xyz[1] - a->xyz[1];
7778  d7 = c->xyz[2] - a->xyz[2];
7779  d8 = c->st[0] - a->st[0];
7780 
7781  d9 = c->st[1] - a->st[1];
7782 
7783  s0 = dt.normalizationScale[0];
7784  s1 = dt.normalizationScale[1];
7785  s2 = dt.normalizationScale[2];
7786 
7787  n0 = s2 * ( d6 * d2 - d7 * d1 );
7788  n1 = s2 * ( d7 * d0 - d5 * d2 );
7789  n2 = s2 * ( d5 * d1 - d6 * d0 );
7790 
7791  t0 = s0 * ( d0 * d9 - d4 * d5 );
7792  t1 = s0 * ( d1 * d9 - d4 * d6 );
7793  t2 = s0 * ( d2 * d9 - d4 * d7 );
7794 
7795 #ifndef DERIVE_UNSMOOTHED_BITANGENT
7796  t3 = s1 * ( d3 * d5 - d0 * d8 );
7797  t4 = s1 * ( d3 * d6 - d1 * d8 );
7798  t5 = s1 * ( d3 * d7 - d2 * d8 );
7799 #else
7800  t3 = s1 * ( n2 * t1 - n1 * t2 );
7801  t4 = s1 * ( n0 * t2 - n2 * t0 );
7802  t5 = s1 * ( n1 * t0 - n0 * t1 );
7803 #endif
7804 
7805  a->normal[0] = n0;
7806  a->normal[1] = n1;
7807  a->normal[2] = n2;
7808 
7809  a->tangents[0][0] = t0;
7810  a->tangents[0][1] = t1;
7811  a->tangents[0][2] = t2;
7812 
7813  a->tangents[1][0] = t3;
7814  a->tangents[1][1] = t4;
7815  a->tangents[1][2] = t5;
7816  }
7817 
7818 }
7819 #endif /* DERIVE_UNSMOOTH_DRAWVERT_ALIGNED */
7820 
7821 /*
7822 ============
7823 idSIMD_AltiVec::NormalizeTangents
7824 
7825  Normalizes each vertex normal and projects and normalizes the
7826  tangent vectors onto the plane orthogonal to the vertex normal.
7827 ============
7828 */
7829 void VPCALL idSIMD_AltiVec::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
7830 
7831  // idDrawVert size
7832  assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
7833 
7834  float *addr = verts[0].normal.ToFloatPtr();
7835  float *tAddr = verts[0].tangents[0].ToFloatPtr();
7836 
7837  // v0 through v3 maintain originally loaded values so we don't take
7838  // as much hit for unaligned stores
7839  vector float v0, v1, v2, v3;
7840  // v5 through v8 are the "working" values of the vectors
7841  vector float v5, v6, v7, v8;
7842  // working values
7843  vector float vec1T0, vec1T1, vec2T0, vec2T1, vec3T0, vec3T1, vec4T0, vec4T1;
7844  vector float vecSum, vecTSum1, vecTSum2, tempSum, tempSum2, tempSum3;
7845  vector float vecF, vecF2;
7846  vector float vecTemp, vecTemp2, vecTemp3, vecTemp4;
7847 
7848  register vector float zeroVector = (vector float)(0.0);
7849 
7850  vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
7851  vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
7852  vector unsigned char vecPermSplatFirstWithZero = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,16,17,18,19);
7853  vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3;
7854  vector unsigned char storePerm0, storePerm1, storePerm2, storePerm3;
7855 
7856  vector float vecTan11, vecTan12, vecTan13, vecTan21, vecTan22, vecTan23;
7857  vector float vecTan31, vecTan32, vecTan33, vecTan41, vecTan42, vecTan43;
7858 
7859  vector unsigned char vec1T0Perm, vec1T1Perm, vec2T0Perm, vec2T1Perm, vec3T0Perm, vec3T1Perm, vec4T0Perm, vec4T1Perm;
7860  vector unsigned char storeT11, storeT12, storeT21, storeT22, storeT31, storeT32;
7861  vector unsigned char storeT41, storeT42;
7862 
7863  int i = 0;
7864 
7865  if ( i+3 < numVerts ) {
7866  // for loading normal from idDrawVert
7867  vecPerm0 = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
7868  vecPerm1 = vec_add( vec_lvsl( -1, addr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7869  vecPerm2 = vec_add( vec_lvsl( -1, addr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7870  vecPerm3 = vec_add( vec_lvsl( -1, addr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
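// The ( vec_lvsl( -1, ptr ) + 1 ) pattern used here is the standard AltiVec
// unaligned-load idiom: two aligned vec_ld loads straddling ptr are blended
// with vec_perm so the result starts exactly at ptr, and it degrades
// gracefully to a plain aligned load when ptr is already 16-byte aligned.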
7871 
7872  // for loading tangents from idDrawVert
7873  vec1T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7874  vec1T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
7875  vec2T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7876  vec2T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
7877  vec3T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7878  vec3T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
7879  vec4T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7880  vec4T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
7881 
7882  // generate permute vectors to store normals
7883  storePerm0 = vec_lvsr( 0, addr );
7884  storePerm1 = vec_lvsr( 0, addr + ( 1 * DRAWVERT_OFFSET ) );
7885  storePerm2 = vec_lvsr( 0, addr + ( 2 * DRAWVERT_OFFSET ) );
7886  storePerm3 = vec_lvsr( 0, addr + ( 3 * DRAWVERT_OFFSET ) );
7887 
7888  // generate permute vectors to store tangents
7889  storeT11 = vec_lvsr( 0, tAddr + ( 0 * DRAWVERT_OFFSET ) );
7890  storeT12 = vec_lvsr( 12, tAddr + ( 0 * DRAWVERT_OFFSET ) );
7891 
7892  storeT21 = vec_lvsr( 0, tAddr + ( 1 * DRAWVERT_OFFSET ) );
7893  storeT22 = vec_lvsr( 12, tAddr + ( 1 * DRAWVERT_OFFSET ) );
7894 
7895  storeT31 = vec_lvsr( 0, tAddr + ( 2 * DRAWVERT_OFFSET ) );
7896  storeT32 = vec_lvsr( 12, tAddr + ( 2 * DRAWVERT_OFFSET ) );
7897 
7898  storeT41 = vec_lvsr( 0, tAddr + ( 3 * DRAWVERT_OFFSET ) );
7899  storeT42 = vec_lvsr( 12, tAddr + ( 3 * DRAWVERT_OFFSET ) );
7900  }
7901 
7902  for ( ; i+3 < numVerts; i+=4 ) {
7903 
7904  // load normals
7905  vector float vecNormal11 = vec_ld( 0, addr + ( i * DRAWVERT_OFFSET ) );
7906  vector float vecNormal12 = vec_ld( 15, addr + ( i * DRAWVERT_OFFSET ) );
7907  v0 = vec_perm( vecNormal11, vecNormal12, vecPerm0 );
7908 
7909  vector float vecNormal21 = vec_ld( 0, addr + ((i+1) * DRAWVERT_OFFSET ) );
7910  vector float vecNormal22 = vec_ld( 15, addr + ((i+1) * DRAWVERT_OFFSET ) );
7911  v1 = vec_perm( vecNormal21, vecNormal22, vecPerm1 );
7912 
7913  vector float vecNormal31 = vec_ld( 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
7914  vector float vecNormal32 = vec_ld( 15, addr + ( (i+2) * DRAWVERT_OFFSET ) );
7915  v2 = vec_perm( vecNormal31, vecNormal32, vecPerm2 );
7916 
7917  vector float vecNormal41 = vec_ld( 0, addr + ((i+3) * DRAWVERT_OFFSET ) );
7918  vector float vecNormal42 = vec_ld( 15, addr + ((i+3) * DRAWVERT_OFFSET ) );
7919  v3 = vec_perm( vecNormal41, vecNormal42, vecPerm3 );
7920 
7921  // zero out the useless last element of each vector
7922  v0 = vec_perm( v0, zeroVector, vecPermLast );
7923  v1 = vec_perm( v1, zeroVector, vecPermLast );
7924  v2 = vec_perm( v2, zeroVector, vecPermLast );
7925  v3 = vec_perm( v3, zeroVector, vecPermLast );
7926 
7927  // got 4 vectors in v0 through v3; sum each one across
7928  // and gather the results into one vector
7929  vecTemp = vec_madd( v0, v0, zeroVector );
7930 
7931  vecSum = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
7932  vecSum = vec_add( vecSum, vec_sld( vecSum, vecSum, 4 ) );
7933  // element 0 of vecSum now has the dot of v0 with itself
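// The vec_sld/vec_add pairs are a horizontal reduction: for v = ( a, b, c, d ),
// v + vec_sld( v, v, 8 ) = ( a+c, b+d, .. ), and adding a further 4-byte
// rotation leaves a+b+c+d in element 0. Since the w lane was zeroed above,
// element 0 holds the 3-component dot product of the normal with itself.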
7934 
7935  vecTemp2 = vec_madd( v1, v1, zeroVector );
7936  tempSum = vec_add( vecTemp2, vec_sld( vecTemp2, vecTemp2, 8 ) );
7937  tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
7938  // put this into vecSum
7939  vecSum = vec_mergeh( vecSum, tempSum );
7940 
7941  vecTemp3 = vec_madd( v2, v2, zeroVector );
7942  tempSum = vec_add( vecTemp3, vec_sld( vecTemp3, vecTemp3, 8 ) );
7943  tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
7944  // put this into vecSum
7945  vecSum = vec_perm( vecSum, tempSum, vecPermHalves );
7946 
7947  vecTemp4 = vec_madd( v3, v3, zeroVector );
7948  tempSum = vec_add( vecTemp4, vec_sld( vecTemp4, vecTemp4, 8 ) );
7949  tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
7950  // put this into vecSum
7951  vecSum = vec_perm( vecSum, tempSum, vecPermLast );
7952 
7953  // take reciprocal square roots of these
7954  vecF = ReciprocalSquareRoot( vecSum );
7955 
7956  // multiply each vector by f
7957  v5 = vec_madd( v0, vec_splat( vecF, 0 ), zeroVector );
7958  v6 = vec_madd( v1, vec_splat( vecF, 1 ), zeroVector );
7959  v7 = vec_madd( v2, vec_splat( vecF, 2 ), zeroVector );
7960  v8 = vec_madd( v3, vec_splat( vecF, 3 ), zeroVector );
7961 
7962  // load tangents as unaligned
7963  vecTan11 = vec_ld( 0, tAddr + ( i * DRAWVERT_OFFSET ) );
7964  vecTan12 = vec_ld( 11, tAddr + ( i * DRAWVERT_OFFSET ) );
7965  vecTan13 = vec_ld( 23, tAddr + ( i * DRAWVERT_OFFSET ) );
7966 
7967  vecTan21 = vec_ld( 0, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
7968  vecTan22 = vec_ld( 11, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
7969  vecTan23 = vec_ld( 23, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
7970 
7971  vecTan31 = vec_ld( 0, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
7972  vecTan32 = vec_ld( 11, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
7973  vecTan33 = vec_ld( 23, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
7974 
7975  vecTan41 = vec_ld( 0, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
7976  vecTan42 = vec_ld( 11, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
7977  vecTan43 = vec_ld( 23, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
7978 
7979  vec1T0 = vec_perm( vecTan11, vecTan12, vec1T0Perm );
7980  vec1T1 = vec_perm( vecTan12, vecTan13, vec1T1Perm );
7981  vec2T0 = vec_perm( vecTan21, vecTan22, vec2T0Perm );
7982  vec2T1 = vec_perm( vecTan22, vecTan23, vec2T1Perm );
7983  vec3T0 = vec_perm( vecTan31, vecTan32, vec3T0Perm );
7984  vec3T1 = vec_perm( vecTan32, vecTan33, vec3T1Perm );
7985  vec4T0 = vec_perm( vecTan41, vecTan42, vec4T0Perm );
7986  vec4T1 = vec_perm( vecTan42, vecTan43, vec4T1Perm );
7987 
7988  //zero out last element of tangents
7989  vec1T0 = vec_perm( vec1T0, zeroVector, vecPermLast );
7990  vec1T1 = vec_perm( vec1T1, zeroVector, vecPermLast );
7991  vec2T0 = vec_perm( vec2T0, zeroVector, vecPermLast );
7992  vec2T1 = vec_perm( vec2T1, zeroVector, vecPermLast );
7993  vec3T0 = vec_perm( vec3T0, zeroVector, vecPermLast );
7994  vec3T1 = vec_perm( vec3T1, zeroVector, vecPermLast );
7995  vec4T0 = vec_perm( vec4T0, zeroVector, vecPermLast );
7996  vec4T1 = vec_perm( vec4T1, zeroVector, vecPermLast );
7997 
7998  // all tangents[0]
7999  tempSum = zeroVector;
8000  tempSum = vec_madd( vec1T0, v5, tempSum );
8001  // sum across tempSum
8002  vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8003  vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8004  // splat that sum across vecTSum1
8005  vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8006  vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
8007 
8008  //vec1T0 now contains what needs to be rsqrt'd and multiplied by f
8009  vec1T0 = vec_sub( vec1T0, vecTSum1 );
8010 
8011  tempSum = zeroVector;
8012  tempSum = vec_madd( vec2T0, v6, tempSum );
8013 
8014  // sum across tempSum
8015  vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8016  vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8017  vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8018  vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
8019  vec2T0 = vec_sub( vec2T0, vecTSum1 );
8020 
8021  tempSum = zeroVector;
8022  tempSum = vec_madd( vec3T0, v7, tempSum );
8023 
8024  // sum across tempSum
8025  vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8026  vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8027  vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8028  vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
8029  vec3T0 = vec_sub( vec3T0, vecTSum1 );
8030 
8031  tempSum = zeroVector;
8032  tempSum = vec_madd( vec4T0, v8, tempSum );
8033 
8034  // sum across tempSum
8035  vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8036  vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8037  vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8038  vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
8039  vec4T0 = vec_sub( vec4T0, vecTSum1 );
8040 
8041  // all tangents[1]
8042  tempSum = zeroVector;
8043  tempSum = vec_madd( vec1T1, v5, tempSum );
8044 
8045  // sum across tempSum
8046  vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8047  vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8048  vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8049  vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
8050 
8051  // vec1T1 now contains what needs to be rsqrt'd and multiplied by f
8052  vec1T1 = vec_sub( vec1T1, vecTSum1 );
8053 
8054  tempSum = zeroVector;
8055  tempSum = vec_madd( vec2T1, v6, tempSum );
8056 
8057  // sum across tempSum
8058  vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8059  vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8060  vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8061  vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
8062  vec2T1 = vec_sub( vec2T1, vecTSum1 );
8063 
8064  tempSum = zeroVector;
8065  tempSum = vec_madd( vec3T1, v7, tempSum );
8066 
8067  // sum across tempSum
8068  vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8069  vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8070  vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8071  vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
8072  vec3T1 = vec_sub( vec3T1, vecTSum1 );
8073 
8074  tempSum = zeroVector;
8075  tempSum = vec_madd( vec4T1, v8, tempSum );
8076 
8077  // sum across tempSum
8078  vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8079  vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8080  vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8081  vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
8082  vec4T1 = vec_sub( vec4T1, vecTSum1 );
8083 
8084 
8085  // sum across vectors and put into one vector
8086  vecTemp = vec_madd( vec1T0, vec1T0, zeroVector );
8087  vecTSum1 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8088  vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8089 
8090  // element 0 of vecTSum1 now has the dot of vec1T0 with itself
8091  vecTemp = vec_madd( vec2T0, vec2T0, zeroVector );
8092  tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8093  tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
8094  // put this into vecTSum1
8095  vecTemp = vec_madd( vec3T0, vec3T0, zeroVector );
8096  vecTSum1 = vec_mergeh( vecTSum1, tempSum2 );
8097  tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8098  tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
8099  // put this into vecTSum1
8100  vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermHalves );
8101  vecTemp = vec_madd( vec4T0, vec4T0, zeroVector );
8102  tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8103  tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
8104  // put this into vecTSum1
8105  vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermLast );
8106 
8107  vecTemp = vec_madd( vec1T1, vec1T1, zeroVector );
8108  vecTSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8109  vecTSum2 = vec_add( vecTSum2, vec_sld( vecTSum2, vecTSum2, 4 ) );
8110  // element 0 of vecTSum2 now has the dot of vec1T1 with itself
8111  vecTemp = vec_madd( vec2T1, vec2T1, zeroVector );
8112  tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8113  tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
8114  // put this into vecTSum2
8115  vecTSum2 = vec_mergeh( vecTSum2, tempSum3 );
8116  vecTemp = vec_madd( vec3T1, vec3T1, zeroVector );
8117  tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8118  tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
8119  // put this into vecTSum2
8120  vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermHalves );
8121  vecTemp = vec_madd( vec4T1, vec4T1, zeroVector );
8122  tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8123  tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
8124  // put this into vecTSum2
8125  vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermLast );
8126 
8127  // tangents[0]
8128  vecF = ReciprocalSquareRoot( vecTSum1 );
8129  // tangents[1]
8130  vecF2 = ReciprocalSquareRoot( vecTSum2 );
8131 
8132  // multiply each tangent vector by f
8133 
8134  vec1T0 = vec_madd( vec1T0, vec_splat( vecF, 0 ), zeroVector );
8135  vec2T0 = vec_madd( vec2T0, vec_splat( vecF, 1 ), zeroVector );
8136  vec3T0 = vec_madd( vec3T0, vec_splat( vecF, 2 ), zeroVector );
8137  vec4T0 = vec_madd( vec4T0, vec_splat( vecF, 3 ), zeroVector );
8138 
8139  vec1T1 = vec_madd( vec1T1, vec_splat( vecF2, 0 ), zeroVector );
8140  vec2T1 = vec_madd( vec2T1, vec_splat( vecF2, 1 ), zeroVector );
8141  vec3T1 = vec_madd( vec3T1, vec_splat( vecF2, 2 ), zeroVector );
8142  vec4T1 = vec_madd( vec4T1, vec_splat( vecF2, 3 ), zeroVector );
8143 
8144  // rotate input data
8145  v5 = vec_perm( v5, v5, storePerm0 );
8146  v6 = vec_perm( v6, v6, storePerm1 );
8147  v7 = vec_perm( v7, v7, storePerm2 );
8148  v8 = vec_perm( v8, v8, storePerm3 );
8149 
8150  vec_ste( v5, 0, addr + ( (i+0) * DRAWVERT_OFFSET ) );
8151  vec_ste( v5, 4, addr + ( (i+0) * DRAWVERT_OFFSET ) );
8152  vec_ste( v5, 8, addr + ( (i+0) * DRAWVERT_OFFSET ) );
8153 
8154  vec_ste( v6, 0, addr + ( (i+1) * DRAWVERT_OFFSET ) );
8155  vec_ste( v6, 4, addr + ( (i+1) * DRAWVERT_OFFSET ) );
8156  vec_ste( v6, 8, addr + ( (i+1) * DRAWVERT_OFFSET ) );
8157 
8158  vec_ste( v7, 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
8159  vec_ste( v7, 4, addr + ( (i+2) * DRAWVERT_OFFSET ) );
8160  vec_ste( v7, 8, addr + ( (i+2) * DRAWVERT_OFFSET ) );
8161 
8162  vec_ste( v8, 0, addr + ( (i+3) * DRAWVERT_OFFSET ) );
8163  vec_ste( v8, 4, addr + ( (i+3) * DRAWVERT_OFFSET ) );
8164  vec_ste( v8, 8, addr + ( (i+3) * DRAWVERT_OFFSET ) );
8165 
8166  // store tangents[0] and tangents[1]
8167  vec1T0 = vec_perm( vec1T0, vec1T0, storeT11 );
8168  vec1T1 = vec_perm( vec1T1, vec1T1, storeT12 );
8169 
8170  vec_ste( vec1T0, 0, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8171  vec_ste( vec1T0, 4, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8172  vec_ste( vec1T0, 8, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8173  vec_ste( vec1T1, 12, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8174  vec_ste( vec1T1, 16, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8175  vec_ste( vec1T1, 20, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8176 
8177  // store second tangents[0] and tangents[1]
8178  vec2T0 = vec_perm( vec2T0, vec2T0, storeT21 );
8179  vec2T1 = vec_perm( vec2T1, vec2T1, storeT22 );
8180 
8181  vec_ste( vec2T0, 0, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8182  vec_ste( vec2T0, 4, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8183  vec_ste( vec2T0, 8, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8184  vec_ste( vec2T1, 12, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8185  vec_ste( vec2T1, 16, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8186  vec_ste( vec2T1, 20, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8187 
8188  // store third tangents[0] and tangents[1]
8189  vec3T0 = vec_perm( vec3T0, vec3T0, storeT31 );
8190  vec3T1 = vec_perm( vec3T1, vec3T1, storeT32 );
8191 
8192  vec_ste( vec3T0, 0, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8193  vec_ste( vec3T0, 4, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8194  vec_ste( vec3T0, 8, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8195  vec_ste( vec3T1, 12, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8196  vec_ste( vec3T1, 16, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8197  vec_ste( vec3T1, 20, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8198 
8199  // store fourth tangents[0] and tangents[1]
8200  vec4T0 = vec_perm( vec4T0, vec4T0, storeT41 );
8201  vec4T1 = vec_perm( vec4T1, vec4T1, storeT42 );
8202 
8203  vec_ste( vec4T0, 0, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8204  vec_ste( vec4T0, 4, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8205  vec_ste( vec4T0, 8, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8206  vec_ste( vec4T1, 12, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8207  vec_ste( vec4T1, 16, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8208  vec_ste( vec4T1, 20, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8209  }
8210 
8211  // cleanup
8212  for ( ; i < numVerts; i++ ) {
8213  idVec3 &v = verts[i].normal;
8214  float f;
8215 
8216  //f = idMath::RSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
8217  f = FastScalarInvSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
8218  v.x *= f; v.y *= f; v.z *= f;
8219 
8220  for ( int j = 0; j < 2; j++ ) {
8221  idVec3 &t = verts[i].tangents[j];
8222 
8223  t -= ( t * v ) * v;
8224  // f = idMath::RSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
8225  f = FastScalarInvSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
8226  t.x *= f; t.y *= f; t.z *= f;
8227  }
8228  }
8229 }
8230 #endif /* ENABLE_DERIVE */
8231 
8232 #ifdef ENABLE_CREATE
8233 
8234 /*
8235 ============
8236 idSIMD_AltiVec::CreateTextureSpaceLightVectors
8237 
8238  Calculates light vectors in texture space for the given triangle vertices.
8239  For each vertex the direction towards the light origin is projected onto texture space.
8240  The light vectors are only calculated for the vertices referenced by the indexes.
8241 ============
8242 */
8243 
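/*
 Per vertex this is a 3x3 change of basis: the unnormalized direction to the
 light is expressed in the vertex's texture space, with the tangents and the
 normal as the rows of the matrix. A scalar sketch, assuming idLib's
 dot-product operator*:

   idVec3 dir = lightOrigin - v->xyz;
   lightVectors[i][0] = dir * v->tangents[0];
   lightVectors[i][1] = dir * v->tangents[1];
   lightVectors[i][2] = dir * v->normal;
*/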
8244 void VPCALL idSIMD_AltiVec::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
8245 
8246  bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
8247  memset( used, 0, numVerts * sizeof( used[0] ) );
8248 
8249  int i;
8250  for ( i = 0; i+7 < numIndexes; i+= 8 ) {
8251  used[indexes[i]] = true;
8252  used[indexes[i+1]] = true;
8253  used[indexes[i+2]] = true;
8254  used[indexes[i+3]] = true;
8255  used[indexes[i+4]] = true;
8256  used[indexes[i+5]] = true;
8257  used[indexes[i+6]] = true;
8258  used[indexes[i+7]] = true;
8259  }
8260 
8261  for ( ; i < numIndexes; i++ ) {
8262  used[indexes[i]] = true;
8263  }
8264 
8265  for ( i = 0; i+1 < numVerts; i+=2 ) {
8266 
8267  const idDrawVert *v = &verts[i];
8268  const idDrawVert *v2 = &verts[i+1];
8269 
8270  float x, y, z;
8271  float x2, y2, z2;
8272  idVec3 lightDir, lightDir2;
8273 
8274  lightDir[0] = lightOrigin[0] - v->xyz[0];
8275  lightDir[1] = lightOrigin[1] - v->xyz[1];
8276  lightDir[2] = lightOrigin[2] - v->xyz[2];
8277 
8278  lightDir2[0] = lightOrigin[0] - v2->xyz[0];
8279  lightDir2[1] = lightOrigin[1] - v2->xyz[1];
8280  lightDir2[2] = lightOrigin[2] - v2->xyz[2];
8281 
8282  x = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
8283  y = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
8284  z = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
8285 
8286  x2 = lightDir2[0] * v2->tangents[0][0] + lightDir2[1] * v2->tangents[0][1] + lightDir2[2] * v2->tangents[0][2];
8287  y2 = lightDir2[0] * v2->tangents[1][0] + lightDir2[1] * v2->tangents[1][1] + lightDir2[2] * v2->tangents[1][2];
8288  z2 = lightDir2[0] * v2->normal[0] + lightDir2[1] * v2->normal[1] + lightDir2[2] * v2->normal[2];
8289 
8290  if ( used[i] ) {
8291  lightVectors[i][0] = x;
8292  lightVectors[i][1] = y;
8293  lightVectors[i][2] = z;
8294  }
8295 
8296  if ( used[i+1] ) {
8297  lightVectors[i+1][0] = x2;
8298  lightVectors[i+1][1] = y2;
8299  lightVectors[i+1][2] = z2;
8300  }
8301  }
8302 
8303  // cleanup
8304  for ( ; i < numVerts; i++ ) {
8305  if ( !used[i] ) {
8306  continue;
8307  }
8308 
8309  const idDrawVert *v = &verts[i];
8310  idVec3 lightDir;
8311 
8312  lightDir[0] = lightOrigin[0] - v->xyz[0];
8313  lightDir[1] = lightOrigin[1] - v->xyz[1];
8314  lightDir[2] = lightOrigin[2] - v->xyz[2];
8315 
8316  lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
8317  lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
8318  lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
8319  }
8320 }
8321 
8322 #if 1
8323 /*
8324 ============
8325 idSIMD_AltiVec::CreateSpecularTextureCoords
8326 
8327  Calculates specular texture coordinates for the given triangle vertices.
8328  For each vertex the normalized direction towards the light origin is added to the
8329  normalized direction towards the view origin and the result is projected onto texture space.
8330  The texture coordinates are only calculated for the vertices referenced by the indexes.
8331 ============
8332 */
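/*
 A scalar sketch of the loop below, assuming idLib's dot-product operator*
 ( h is the unnormalized half-angle vector between light and view ):

   idVec3 L = lightOrigin - v->xyz;   L *= idMath::RSqrt( L * L );
   idVec3 V = viewOrigin  - v->xyz;   V *= idMath::RSqrt( V * V );
   idVec3 h = L + V;
   texCoords[i].Set( h * v->tangents[0], h * v->tangents[1], h * v->normal, 1.0f );
*/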
8333 void VPCALL idSIMD_AltiVec::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
8334 
8335  bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
8336  memset( used, 0, numVerts * sizeof( used[0] ) );
8337 
8338  int i;
8339  for ( i = 0; i+7 < numIndexes; i+= 8 ) {
8340  used[indexes[i]] = true;
8341  used[indexes[i+1]] = true;
8342  used[indexes[i+2]] = true;
8343  used[indexes[i+3]] = true;
8344  used[indexes[i+4]] = true;
8345  used[indexes[i+5]] = true;
8346  used[indexes[i+6]] = true;
8347  used[indexes[i+7]] = true;
8348  }
8349 
8350  for ( ; i < numIndexes; i++ ) {
8351  used[indexes[i]] = true;
8352  }
8353 
8354  // load lightOrigin and viewOrigin into vectors
8355  const float *lightOriginPtr = lightOrigin.ToFloatPtr();
8356  const float *viewOriginPtr = viewOrigin.ToFloatPtr();
8357  vector unsigned char permVec = vec_lvsl( 0, lightOriginPtr );
8358  vector unsigned char permVec2 = vec_lvsl( 0, viewOriginPtr );
8359  vector float v0 = vec_ld( 0, lightOriginPtr );
8360  vector float v1 = vec_ld( 15, lightOriginPtr );
8361  vector float v2 = vec_ld( 0, viewOriginPtr );
8362  vector float v3 = vec_ld( 15, viewOriginPtr );
8363  vector float vecLightOrigin = vec_perm( v0, v1, permVec );
8364  vector float vecViewOrigin = vec_perm( v2, v3, permVec2 );
8365  const vector float zeroVector = (vector float)(0);
8366  int index;
8367 
8368  for ( index = 0; index+1 < numVerts; index+=2 ) {
8369  const float *vertPtr = verts[index].xyz.ToFloatPtr();
8370  const float *vertPtr2 = verts[index+1].xyz.ToFloatPtr();
8371 
8372  permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
8373  permVec2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
8374 
8375  v0 = vec_ld( 0, vertPtr );
8376  v1 = vec_ld( 15, vertPtr );
8377  vector float v2 = vec_ld( 31, vertPtr );
8378  vector float v3 = vec_ld( 47, vertPtr );
8379  vector float v4 = vec_ld( 63, vertPtr );
8380 
8381  vector float v5 = vec_ld( 0, vertPtr2 );
8382  vector float v6 = vec_ld( 15, vertPtr2 );
8383  vector float v7 = vec_ld( 31, vertPtr2 );
8384  vector float v8 = vec_ld( 47, vertPtr2 );
8385  vector float v9 = vec_ld( 63, vertPtr2 );
8386 
8387  // figure out what values go where
8388  vector float vecXYZ = vec_perm( v0, v1, permVec );
8389  vector float vecNormal = vec_perm( v1, v2, permVec );
8390  vecNormal = vec_sld( vecNormal, vecNormal, 4 );
8391  const vector float vecTangent0 = vec_perm( v2, v3, permVec );
8392  permVec = vec_add( permVec, (vector unsigned char)(-4) ); // back the permute up 4 bytes so it lands on tangents[1], 3 floats past tangents[0]
8393  const vector float vecTangent1 = vec_perm( v3, v4, permVec );
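// ( unpadded idDrawVert layout, 60 bytes: xyz at byte 0, st at 12, normal at
// 20, tangents[0] at 32 and tangents[1] at 44. The normal needs the extra
// 4-byte vec_sld and tangents[1] needs the permute backed up 4 bytes because
// neither field starts on a 16-byte boundary. )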
8394 
8395  vector float vecXYZ2 = vec_perm( v5, v6, permVec2 );
8396  vector float vecNormal2 = vec_perm( v6, v7, permVec2 );
8397  vecNormal2 = vec_sld( vecNormal2, vecNormal2, 4 );
8398  const vector float vecTangent02 = vec_perm( v7, v8, permVec2 );
8399  permVec2 = vec_add( permVec2, (vector unsigned char)(-4) );
8400  const vector float vecTangent12 = vec_perm( v8, v9, permVec2 );
8401 
8402  // calculate lightDir
8403  vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
8404  vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );
8405 
8406  vector float vecLightDir2 = vec_sub( vecLightOrigin, vecXYZ2 );
8407  vector float vecViewDir2 = vec_sub( vecViewOrigin, vecXYZ2 );
8408 
8409  // calculate distance
8410  vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
8411  vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );
8412 
8413  vector float vecTempLight2 = vec_madd( vecLightDir2, vecLightDir2, zeroVector );
8414  vector float vecTempView2 = vec_madd( vecViewDir2, vecViewDir2, zeroVector );
8415 
8416  // sum across first 3 elements of vector
8417  vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
8418  vecTempLight = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8419  vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
8420  vecTempView = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 8 ) );
8421 
8422  vector float tempSum4 = vec_add( vecTempLight2, vec_sld( vecTempLight2, vecTempLight2, 4 ) );
8423  vecTempLight2 = vec_add( tempSum4, vec_sld( tempSum4, tempSum4, 8 ) );
8424  vector float tempSum5 = vec_add( vecTempView2, vec_sld( vecTempView2, vecTempView2, 4 ) );
8425  vecTempView2 = vec_add( tempSum5, vec_sld( tempSum5, tempSum5, 8 ) );
8426 
8427  // splat sum across the whole vector
8428  vecTempLight = vec_splat( vecTempLight, 0 );
8429  vecTempView = vec_splat( vecTempView, 0 );
8430 
8431  vecTempLight2 = vec_splat( vecTempLight2, 0 );
8432  vecTempView2 = vec_splat( vecTempView2, 0 );
8433 
8434  vecTempLight = ReciprocalSquareRoot( vecTempLight );
8435  vecTempView = ReciprocalSquareRoot( vecTempView );
8436 
8437  vecTempLight2 = ReciprocalSquareRoot( vecTempLight2 );
8438  vecTempView2 = ReciprocalSquareRoot( vecTempView2 );
8439 
8440  // modify light and view vectors based on ilength
8441  vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
8442  vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );
8443 
8444  vecViewDir2 = vec_madd( vecViewDir2, vecTempView2, zeroVector );
8445  vecLightDir2 = vec_madd( vecLightDir2, vecTempLight2, vecViewDir2 );
8446 
8447  // calculate what to store in each texture coord
8448  vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
8449  vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
8450  vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );
8451 
8452  vector float vecTC3 = vec_madd( vecLightDir2, vecTangent02, zeroVector );
8453  vector float vecTC4 = vec_madd( vecLightDir2, vecTangent12, zeroVector );
8454  vector float vecTC5 = vec_madd( vecLightDir2, vecNormal2, zeroVector );
8455 
8456  // sum across first 3 elements of vector
8457  vector float tempSum3;
8458  tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
8459  vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
8460  tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
8461  vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
8462  tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
8463  vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );
8464 
8465  tempSum4 = vec_add( vecTC3, vec_sld( vecTC3, vecTC3, 4 ) );
8466  vecTC3 = vec_add( tempSum4, vec_sld( vecTC3, vecTC3, 8 ) );
8467  tempSum5 = vec_add( vecTC4, vec_sld( vecTC4, vecTC4, 4 ) );
8468  vecTC4 = vec_add( tempSum5, vec_sld( vecTC4, vecTC4, 8 ) );
8469  vector float tempSum6 = vec_add( vecTC5, vec_sld( vecTC5, vecTC5, 4 ) );
8470  vecTC5 = vec_add( tempSum6, vec_sld( vecTC5, vecTC5, 8 ) );
8471 
8472  vecTC0 = vec_splat( vecTC0, 0 );
8473  vecTC1 = vec_splat( vecTC1, 0 );
8474  vecTC2 = vec_splat( vecTC2, 0 );
8475 
8476  vecTC3 = vec_splat( vecTC3, 0 );
8477  vecTC4 = vec_splat( vecTC4, 0 );
8478  vecTC5 = vec_splat( vecTC5, 0 );
8479 
8480  if ( used[index] ) {
8481  // store out results
8482  vec_ste( vecTC0, 0, &texCoords[index][0] );
8483  vec_ste( vecTC1, 0, &texCoords[index][1] );
8484  vec_ste( vecTC2, 0, &texCoords[index][2] );
8485  vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
8486  }
8487 
8488  if ( used[index+1] ) {
8489  vec_ste( vecTC3, 0, &texCoords[index+1][0] );
8490  vec_ste( vecTC4, 0, &texCoords[index+1][1] );
8491  vec_ste( vecTC5, 0, &texCoords[index+1][2] );
8492  vec_ste( (vector float)(1.0), 0, &texCoords[index+1][3] );
8493  }
8494  }
8495 
8496  // cleanup
8497  for ( ; index < numVerts; index++ ) {
8498  if ( !used[index] ) {
8499  continue;
8500  }
8501 
8502  const float *vertPtr = verts[index].xyz.ToFloatPtr();
8503 
8504  permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
8505 
8506  v0 = vec_ld( 0, vertPtr );
8507  v1 = vec_ld( 15, vertPtr );
8508  vector float v2 = vec_ld( 31, vertPtr );
8509  vector float v3 = vec_ld( 47, vertPtr );
8510  vector float v4 = vec_ld( 63, vertPtr );
8511 
8512  // figure out what values go where
8513  vector float vecXYZ = vec_perm( v0, v1, permVec );
8514  vector float vecNormal = vec_perm( v1, v2, permVec );
8515  vecNormal = vec_sld( vecNormal, vecNormal, 4 );
8516  const vector float vecTangent0 = vec_perm( v2, v3, permVec );
8517  permVec = vec_add( permVec, (vector unsigned char)(-4) ); // back the permute up 4 bytes so it lands on tangents[1], 3 floats past tangents[0]
8518  const vector float vecTangent1 = vec_perm( v3, v4, permVec );
8519 
8520  // calculate lightDir
8521  vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
8522  vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );
8523 
8524  // calculate distance
8525  vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
8526  vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );
8527 
8528  // sum across first 3 elements of vector
8529  vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
8530  vecTempLight = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8531  vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
8532  vecTempView = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 8 ) );
8533 
8534  // splat sum across the whole vector
8535  vecTempLight = vec_splat( vecTempLight, 0 );
8536  vecTempView = vec_splat( vecTempView, 0 );
8537 
8538  vecTempLight = ReciprocalSquareRoot( vecTempLight );
8539  vecTempView = ReciprocalSquareRoot( vecTempView );
8540 
8541  // modify light and view vectors based on ilength
8542  vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
8543  vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );
8544 
8545  // calculate what to store in each texture coord
8546  vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
8547  vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
8548  vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );
8549 
8550  // sum across first 3 elements of vector
8551  vector float tempSum3;
8552  tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
8553  vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
8554  tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
8555  vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
8556  tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
8557  vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );
8558 
8559  vecTC0 = vec_splat( vecTC0, 0 );
8560  vecTC1 = vec_splat( vecTC1, 0 );
8561  vecTC2 = vec_splat( vecTC2, 0 );
8562 
8563  // store out results
8564  vec_ste( vecTC0, 0, &texCoords[index][0] );
8565  vec_ste( vecTC1, 0, &texCoords[index][1] );
8566  vec_ste( vecTC2, 0, &texCoords[index][2] );
8567  vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
8568 
8569  }
8570 }
8571 #endif /* 0 to disable spec coord */
8572 
8573 #if 1
8574 
8575 #ifdef VERTEXCACHE_ALIGNED
8576 /*
8577 ============
8578 idSIMD_AltiVec::CreateShadowCache
8579 ============
8580 */
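// Each input vertex becomes two idVec4s: ( x, y, z, 1 ) for the front cap and
// ( xyz - lightOrigin, 0 ) for the copy projected away from the light, with
// vertRemap[i] recording where the pair landed. The scalar cleanup loop at
// the end of the function shows the same logic one vertex at a time.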
8581 int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
8582  int outVerts = 0;
8583  int i = 0;
8584 
8585  assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
8586 
8587  register vector float v0, v1, v2, v3, v4, v5, v6, v7;
8588  register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
8589  register vector float zeroVector = (vector float)(0.0);
8590  register vector float oneVector = (vector float)(1);
8591  register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
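// vecPermZeroLast keeps the first three floats of its first operand and takes
// the fourth float from its second operand, so permuting against zeroVector
// or oneVector forces the w component to 0.0 or 1.0 respectively.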
8592 
8593  const float *lPtr = lightOrigin.ToFloatPtr();
8594  const float *vPtr;
8595  const float *vPtr2;
8596  const float *vPtr3;
8597  const float *vPtr4;
8598 
8599  // put values into a vector
8600  vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
8601  v0 = vec_ld( 0, lPtr );
8602  v1 = vec_ld( 15, lPtr );
8603  v0 = vec_perm( v0, v1, vecPerm );
8604  v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
8605 
8606  //v0 now contains lightOrigin[0], lightOrigin[1], lightOrigin[2], 0
8607  for ( ; i+3 < numVerts; i+= 4 ) {
8608  if ( ! vertRemap[i] ) {
8609  vPtr = verts[i].xyz.ToFloatPtr();
8610 
8611 #ifndef DRAWVERT_PADDED
8612  vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
8613  v2 = vec_ld( 0, vPtr );
8614  v3 = vec_ld( 15, vPtr );
8615  v7 = vec_perm( v2, v3, vecPerm2 );
8616 #else
8617  v7 = vec_ld( 0, vPtr );
8618 #endif
8619  v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
8620  v3 = vec_perm( v7, oneVector, vecPermZeroLast );
8621  v1 = vec_sub( v2, v0 );
8622 
8623  vec_st( v3, 0, &vertexCache[outVerts][0] );
8624  vec_st( v1, 0, &vertexCache[outVerts+1][0] );
8625 
8626  vertRemap[i] = outVerts;
8627  outVerts += 2;
8628  }
8629 
8630  if ( ! vertRemap[i+1] ) {
8631  vPtr2 = verts[i+1].xyz.ToFloatPtr();
8632 
8633 #ifndef DRAWVERT_PADDED
8634  vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
8635  v4 = vec_ld( 0, vPtr2 );
8636  v5 = vec_ld( 15, vPtr2 );
8637  v6 = vec_perm( v4, v5, vecPerm3 );
8638 #else
8639  v6 = vec_ld( 0, vPtr2 );
8640 #endif
8641  v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
8642  v5 = vec_perm( v6, oneVector, vecPermZeroLast );
8643  v6 = vec_sub( v4, v0 );
8644 
8645  vec_st( v5, 0, &vertexCache[outVerts][0] );
8646  vec_st( v6, 0, &vertexCache[outVerts+1][0] );
8647 
8648  vertRemap[i+1] = outVerts;
8649  outVerts += 2;
8650  }
8651 
8652  if ( ! vertRemap[i+2] ) {
8653  vPtr3 = verts[i+2].xyz.ToFloatPtr();
8654 
8655 #ifndef DRAWVERT_PADDED
8656  vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
8657  v1 = vec_ld( 0, vPtr3 );
8658  v2 = vec_ld( 15, vPtr3 );
8659  v3 = vec_perm( v1, v2, vecPerm4 );
8660 #else
8661  v3 = vec_ld( 0, vPtr3 );
8662 #endif
8663  v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
8664  v2 = vec_perm( v3, oneVector, vecPermZeroLast );
8665  v3 = vec_sub( v1, v0 );
8666 
8667  vec_st( v2, 0, &vertexCache[outVerts][0] );
8668  vec_st( v3, 0, &vertexCache[outVerts+1][0] );
8669 
8670  vertRemap[i+2] = outVerts;
8671  outVerts += 2;
8672  }
8673 
8674  if ( ! vertRemap[i+3] ) {
8675  vPtr4 = verts[i+3].xyz.ToFloatPtr();
8676 #ifndef DRAWVERT_PADDED
8677  vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
8678  v4 = vec_ld( 0, vPtr4 );
8679  v5 = vec_ld( 16, vPtr4 );
8680  v6 = vec_perm( v4, v5, vecPerm5 );
8681 #else
8682  v6 = vec_ld( 0, vPtr4 );
8683 #endif
8684  v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
8685  v5 = vec_perm( v6, oneVector, vecPermZeroLast );
8686  v6 = vec_sub( v4, v0 );
8687 
8688  vec_st( v5, 0, &vertexCache[outVerts][0] );
8689  vec_st( v6, 0, &vertexCache[outVerts+1][0] );
8690 
8691  vertRemap[i+3] = outVerts;
8692  outVerts += 2;
8693  }
8694  }
8695 
8696  // cleanup
8697  for (; i < numVerts; i++ ) {
8698  if ( vertRemap[i] ) {
8699  continue;
8700  }
8701  const float *v = verts[i].xyz.ToFloatPtr();
8702  vertexCache[outVerts+0][0] = v[0];
8703  vertexCache[outVerts+0][1] = v[1];
8704  vertexCache[outVerts+0][2] = v[2];
8705  vertexCache[outVerts+0][3] = 1.0f;
8706 
8707  // R_SetupProjection() builds the projection matrix with a slight crunch
8708  // for depth, which keeps this w=0 division from rasterizing right at the
8709  // wrap around point and causing depth fighting with the rear caps
8710  vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
8711  vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
8712  vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
8713  vertexCache[outVerts+1][3] = 0.0f;
8714  vertRemap[i] = outVerts;
8715  outVerts += 2;
8716  }
8717  return outVerts;
8718 }
8719 
8720 #else
8721 
8722 /*
8723 ============
8724 idSIMD_AltiVec::CreateShadowCache
8725 ============
8726 */
8727 int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
8728  int outVerts = 0;
8729  int i = 0;
8730 
8731  register vector float v0, v1, v2, v3, v4, v5, v6, v7;
8732  register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
8733  register vector float zeroVector = (vector float)(0.0);
8734  register vector float oneVector = (vector float)(1);
8735  register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
8736 
8737  const float *lPtr = lightOrigin.ToFloatPtr();
8738  const float *vPtr;
8739  const float *vPtr2;
8740  const float *vPtr3;
8741  const float *vPtr4;
8742 
8743  // put values into a vector
8744  vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
8745  v0 = vec_ld( 0, lPtr );
8746  v1 = vec_ld( 15, lPtr );
8747  v0 = vec_perm( v0, v1, vecPerm );
8748  v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
8749 
8750  //v0 now contains lightOrigin[0], lightOrigin[1], lightOrigin[2], 0
8751  for ( ; i+3 < numVerts; i+= 4 ) {
8752  if ( ! vertRemap[i] ) {
8753  vPtr = verts[i].xyz.ToFloatPtr();
8754 #ifndef DRAWVERT_PADDED
8755  vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
8756  v2 = vec_ld( 0, vPtr );
8757  v3 = vec_ld( 15, vPtr );
8758  v7 = vec_perm( v2, v3, vecPerm2 );
8759 #else
8760  v7 = vec_ld( 0, vPtr );
8761 #endif
8762  v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
8763  v3 = vec_perm( v7, oneVector, vecPermZeroLast );
8764  v1 = vec_sub( v2, v0 );
8765 
8766  // store results
8767  UNALIGNED_STORE2( &vertexCache[outVerts][0], v3, v1 );
8768 
8769  vertRemap[i] = outVerts;
8770  outVerts += 2;
8771  }
8772 
8773  if ( ! vertRemap[i+1] ) {
8774  vPtr2 = verts[i+1].xyz.ToFloatPtr();
8775 #ifndef DRAWVERT_PADDED
8776  vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
8777  v4 = vec_ld( 0, vPtr2 );
8778  v5 = vec_ld( 15, vPtr2 );
8779  v6 = vec_perm( v4, v5, vecPerm3 );
8780 #else
8781  v6 = vec_ld( 0, vPtr2 );
8782 #endif
8783  v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
8784  v5 = vec_perm( v6, oneVector, vecPermZeroLast );
8785  v6 = vec_sub( v4, v0 );
8786 
8787  // store results
8788  UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );
8789 
8790  vertRemap[i+1] = outVerts;
8791  outVerts += 2;
8792  }
8793 
8794  if ( ! vertRemap[i+2] ) {
8795  vPtr3 = verts[i+2].xyz.ToFloatPtr();
8796 #ifndef DRAWVERT_PADDED
8797  vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
8798  v1 = vec_ld( 0, vPtr3 );
8799  v2 = vec_ld( 15, vPtr3 );
8800  v3 = vec_perm( v1, v2, vecPerm4 );
8801 #else
8802  v3 = vec_ld( 0, vPtr3 );
8803 #endif
8804  v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
8805  v2 = vec_perm( v3, oneVector, vecPermZeroLast );
8806  v3 = vec_sub( v1, v0 );
8807 
8808  // store results
8809  UNALIGNED_STORE2( &vertexCache[outVerts][0], v2, v3 );
8810 
8811  vertRemap[i+2] = outVerts;
8812  outVerts += 2;
8813  }
8814  if ( ! vertRemap[i+3] ) {
8815  vPtr4 = verts[i+3].xyz.ToFloatPtr();
8816 #ifndef DRAWVERT_PADDED
8817  vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
8818  v4 = vec_ld( 0, vPtr4 );
8819  v5 = vec_ld( 16, vPtr4 );
8820  v6 = vec_perm( v4, v5, vecPerm5 );
8821 #else
8822  v6 = vec_ld( 0, vPtr4 );
8823 #endif
8824 
8825  v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
8826  v5 = vec_perm( v6, oneVector, vecPermZeroLast );
8827  v6 = vec_sub( v4, v0 );
8828 
8829  // store results
8830  UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );
8831 
8832 
8833  vertRemap[i+3] = outVerts;
8834  outVerts += 2;
8835  }
8836  }
8837 
8838  // cleanup
8839  for (; i < numVerts; i++ ) {
8840  if ( vertRemap[i] ) {
8841  continue;
8842  }
8843  const float *v = verts[i].xyz.ToFloatPtr();
8844  vertexCache[outVerts+0][0] = v[0];
8845  vertexCache[outVerts+0][1] = v[1];
8846  vertexCache[outVerts+0][2] = v[2];
8847  vertexCache[outVerts+0][3] = 1.0f;
8848 
8849  // R_SetupProjection() builds the projection matrix with a slight crunch
8850  // for depth, which keeps this w=0 division from rasterizing right at the
8851  // wrap around point and causing depth fighting with the rear caps
8852  vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
8853  vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
8854  vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
8855  vertexCache[outVerts+1][3] = 0.0f;
8856  vertRemap[i] = outVerts;
8857  outVerts += 2;
8858  }
8859  return outVerts;
8860 }
8861 #endif /* VERTEXCACHE_ALIGNED */
8862 
8863 #endif /* 0 to disable shadow cache */
8864 
8865 #if 1
8866 
8867 #ifdef VERTEXCACHE_ALIGNED
8868 /*
8869 ============
8870 idSIMD_AltiVec::CreateVertexProgramShadowCache
8871 ============
8872 */
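// Unlike CreateShadowCache above, every vertex is emitted unconditionally as
// a ( xyz, 1 ) / ( xyz, 0 ) pair and the vertex program does the projection,
// so the output size is always numVerts * 2.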
8873 int VPCALL idSIMD_AltiVec::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
8874 
8875  // vertexCache aligned
8876  assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
8877  // idDrawVert size
8878  assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
8879  // idVec4 size
8880  assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );
8881 
8882  register vector float v0, v1, v2, v3, v4, v5, v6, v7;
8883  register vector float zeroVector = (vector float)(0.0);
8884  register vector float oneVector = (vector float)(1);
8885  register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
8886  vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
8887  int i = 0;
8888 
8889 #ifndef DRAWVERT_PADDED
8890  // with the 60-byte vert stride, every fourth vert shares the same alignment; if we have at least four verts, four permute vectors cover all cases
8891  if ( i+3 < numVerts ) {
8892  vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8893  vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8894  vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8895  vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8896  }
8897 #endif
8898 
8899  for ( ; i+3 < numVerts; i+=4 ) {
8900  const float *vertPtr = verts[i].xyz.ToFloatPtr();
8901  const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
8902  const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
8903  const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
8904 
8905 #ifndef DRAWVERT_PADDED
8906  v0 = vec_ld( 0, vertPtr );
8907  v1 = vec_ld( 15, vertPtr );
8908  v2 = vec_ld( 0, vertPtr2 );
8909  v3 = vec_ld( 15, vertPtr2 );
8910  v4 = vec_ld( 0, vertPtr3 );
8911  v5 = vec_ld( 15, vertPtr3 );
8912  v6 = vec_ld( 0, vertPtr4 );
8913  v7 = vec_ld( 15, vertPtr4 );
8914 
8915  v0 = vec_perm( v0, v1, vertPerm1 );
8916  v1 = vec_perm( v2, v3, vertPerm2 );
8917  v2 = vec_perm( v4, v5, vertPerm3 );
8918  v3 = vec_perm( v6, v7, vertPerm4 );
8919 #else
8920  v0 = vec_ld( 0, vertPtr );
8921  v1 = vec_ld( 0, vertPtr2 );
8922  v2 = vec_ld( 0, vertPtr3 );
8923  v3 = vec_ld( 0, vertPtr4 );
8924 #endif
8925 
8926  v0 = vec_perm( v0, oneVector, vecPermThreeOne );
8927  v4 = vec_perm( v0, zeroVector, vecPermThreeOne );
8928 
8929  v1 = vec_perm( v1, oneVector, vecPermThreeOne );
8930  v5 = vec_perm( v1, zeroVector, vecPermThreeOne );
8931 
8932  v2 = vec_perm( v2, oneVector, vecPermThreeOne );
8933  v6 = vec_perm( v2, zeroVector, vecPermThreeOne );
8934 
8935  v3 = vec_perm( v3, oneVector, vecPermThreeOne );
8936  v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
8937 
8938  // store results
8939  ALIGNED_STORE4( &vertexCache[i*2][0], v0, v4, v1, v5 );
8940  ALIGNED_STORE4( &vertexCache[(i+2)*2][0], v2, v6, v3, v7 );
8941 
8942  }
8943 
8944  // cleanup
8945  for ( ; i < numVerts; i++ ) {
8946  const float *v = verts[i].xyz.ToFloatPtr();
8947  vertexCache[i*2+0][0] = v[0];
8948  vertexCache[i*2+1][0] = v[0];
8949  vertexCache[i*2+0][1] = v[1];
8950  vertexCache[i*2+1][1] = v[1];
8951  vertexCache[i*2+0][2] = v[2];
8952  vertexCache[i*2+1][2] = v[2];
8953  vertexCache[i*2+0][3] = 1.0f;
8954  vertexCache[i*2+1][3] = 0.0f;
8955  }
8956  return numVerts * 2;
8957 }
8958 
8959 #else
8960 /*
8961 ============
8962 idSIMD_AltiVec::CreateVertexProgramShadowCache
8963 ============
8964 */
8965 int VPCALL idSIMD_AltiVec::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
8966 
8967  // idDrawVert size
8968  assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
8969  // idVec4 size
8970  assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );
8971 
8972  register vector float v0, v1, v2, v3, v4, v5, v6, v7;
8973  register vector float zeroVector = (vector float)(0.0);
8974  register vector float oneVector = (vector float)(1);
8975  register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
8976  vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
8977  int i = 0;
8978 
8979 #ifndef DRAWVERT_PADDED
8980  // with the 60-byte vert stride, every fourth vert shares the same alignment; if we have at least four verts, four permute vectors cover all cases
8981  if ( i+3 < numVerts ) {
8982  vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8983  vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8984  vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8985  vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8986  }
8987 #endif
8988 
8989  for ( ; i+3 < numVerts; i+=4 ) {
8990  const float *vertPtr = verts[i].xyz.ToFloatPtr();
8991  const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
8992  const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
8993  const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
8994 
8995 #ifndef DRAWVERT_PADDED
8996  v0 = vec_ld( 0, vertPtr );
8997  v1 = vec_ld( 15, vertPtr );
8998  v2 = vec_ld( 0, vertPtr2 );
8999  v3 = vec_ld( 15, vertPtr2 );
9000  v4 = vec_ld( 0, vertPtr3 );
9001  v5 = vec_ld( 15, vertPtr3 );
9002  v6 = vec_ld( 0, vertPtr4 );
9003  v7 = vec_ld( 15, vertPtr4 );
9004 
9005  v0 = vec_perm( v0, v1, vertPerm1 );
9006  v1 = vec_perm( v2, v3, vertPerm2 );
9007  v2 = vec_perm( v4, v5, vertPerm3 );
9008  v3 = vec_perm( v6, v7, vertPerm4 );
9009 #else
9010  v0 = vec_ld( 0, vertPtr );
9011  v1 = vec_ld( 0, vertPtr2 );
9012  v2 = vec_ld( 0, vertPtr3 );
9013  v3 = vec_ld( 0, vertPtr4 );
9014 #endif
9015 
9016  v0 = vec_perm( v0, oneVector, vecPermThreeOne );
9017  v4 = vec_perm( v0, zeroVector, vecPermThreeOne );
9018 
9019  v1 = vec_perm( v1, oneVector, vecPermThreeOne );
9020  v5 = vec_perm( v1, zeroVector, vecPermThreeOne );
9021 
9022  v2 = vec_perm( v2, oneVector, vecPermThreeOne );
9023  v6 = vec_perm( v2, zeroVector, vecPermThreeOne );
9024 
9025  v3 = vec_perm( v3, oneVector, vecPermThreeOne );
9026  v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
9027 
9028  // store results as unaligned
9029  vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &vertexCache[i*2][0] ), (vector unsigned char)(1) );
9030  vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
9031  vector float vc1 = vec_ld( 0, &vertexCache[i*2][0] );
9032  vector float vc2 = vec_ld( 127, &vertexCache[i*2][0] );
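// Unaligned multi-vector store idiom: the data is rotated into store
// alignment with vec_perm, then vec_sel against the edge mask merges the
// first and last partial vectors with the bytes already in memory ( vc1 and
// vc2 ) so that data adjacent to the 128-byte span is not clobbered.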
9033 
9034  // right rotate input data
9035  v0 = vec_perm( v0, v0, storePerm );
9036  v4 = vec_perm( v4, v4, storePerm );
9037  v1 = vec_perm( v1, v1, storePerm );
9038  v5 = vec_perm( v5, v5, storePerm );
9039  v2 = vec_perm( v2, v2, storePerm );
9040  v6 = vec_perm( v6, v6, storePerm );
9041  v3 = vec_perm( v3, v3, storePerm );
9042  v7 = vec_perm( v7, v7, storePerm );
9043 
9044  vec_st( vec_sel( vc1, v0, mask ), 0 , &vertexCache[i*2][0] );
9045  vec_st( vec_sel( v0, v4, mask ), 15 , &vertexCache[i*2][0] );
9046  vec_st( vec_sel( v4, v1, mask ), 31 , &vertexCache[i*2][0] );
9047  vec_st( vec_sel( v1, v5, mask ), 47 , &vertexCache[i*2][0] );
9048  vec_st( vec_sel( v5, v2, mask ), 63 , &vertexCache[i*2][0] );
9049  vec_st( vec_sel( v2, v6, mask ), 79 , &vertexCache[i*2][0] );
9050  vec_st( vec_sel( v6, v3, mask ), 95 , &vertexCache[i*2][0] );
9051  vec_st( vec_sel( v3, v7, mask ), 111 , &vertexCache[i*2][0] );
9052  vec_st( vec_sel( v7, vc2, mask ), 127 , &vertexCache[i*2][0] );
9053  }
9054 
9055  // cleanup
9056  for ( ; i < numVerts; i++ ) {
9057  const float *v = verts[i].xyz.ToFloatPtr();
9058  vertexCache[i*2+0][0] = v[0];
9059  vertexCache[i*2+1][0] = v[0];
9060  vertexCache[i*2+0][1] = v[1];
9061  vertexCache[i*2+1][1] = v[1];
9062  vertexCache[i*2+0][2] = v[2];
9063  vertexCache[i*2+1][2] = v[2];
9064  vertexCache[i*2+0][3] = 1.0f;
9065  vertexCache[i*2+1][3] = 0.0f;
9066  }
9067  return numVerts * 2;
9068 }
9069 
9070 #endif /* VERTEXCACHE_ALIGNED */
9071 
9072 #endif /* 0 to kill VP shader cache */
9073 
9074 #endif /* ENABLE_CREATE */
9075 
9076 #ifdef ENABLE_SOUND_ROUTINES
9077 
9078 #ifdef SOUND_DEST_ALIGNED
9079 /*
9080 ============
9081 idSIMD_AltiVec::UpSamplePCMTo44kHz
9082 
9083  Duplicate samples for 44kHz output.
9084 
9085  Assumptions:
9086  Assumes that dest starts at aligned address
9087 ============
9088 */
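/*
 A scalar picture of the three vector paths below ( mono case; the stereo
 paths duplicate interleaved L/R sample pairs the same way ):

   11025 Hz: dest[i*4+0..3] = (float) src[i];   // each sample written 4x
   22050 Hz: dest[i*2+0..1] = (float) src[i];   // each sample written 2x
   44100 Hz: dest[i]        = (float) src[i];   // straight short->float copy
*/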
9089 void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
9090 
9091  // dest is aligned
9092  assert( IS_16BYTE_ALIGNED( dest[0] ) );
9093 
9094  vector signed short vs0, vs1;
9095  register vector signed int vi0, vi1;
9096  register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
9097  // permute vectors
9098  register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
9099  register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
9100 
9101  register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
9102  register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
9103 
9104  // Assuming at least 12 samples lets us skip a conditional that would
9105  // otherwise be needed to guard the initial vector load before each loop
9106  assert( numSamples >= 12 );
9107 
9108  if ( kHz == 11025 ) {
9109  if ( numChannels == 1 ) {
9110  // 8 at a time
9111  int i = 0;
9112 
9113  vector signed short vsOld = vec_ld( 0, &src[i] );
9114  vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
9115 
9116  for ( ; i+7 < numSamples; i+= 8 ) {
9117  // load src
9118  vs1 = vec_ld( 15, &src[i] );
9119  vs0 = vec_perm( vsOld, vs1, permVec );
9120  vsOld = vs1;
9121 
9122  // unpack shorts to ints
9123  vi0 = vec_unpackh( vs0 );
9124  vi1 = vec_unpackl( vs0 );
9125  // convert ints to floats
9126  v0 = vec_ctf( vi0, 0 );
9127  v1 = vec_ctf( vi1, 0 );
9128  // splat each sample into its own vector, in store order
9129 
9130  v2 = vec_splat( v0, 0 );
9131  v3 = vec_splat( v0, 1 );
9132  v4 = vec_splat( v0, 2 );
9133  v5 = vec_splat( v0, 3 );
9134  v6 = vec_splat( v1, 0 );
9135  v7 = vec_splat( v1, 1 );
9136  v8 = vec_splat( v1, 2 );
9137  v9 = vec_splat( v1, 3 );
9138 
9139  // store results
9140  ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
9141  }
9142  // cleanup
9143  for (; i < numSamples; i++ ) {
9144  dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
9145  }
9146  } else {
9147  int i = 0;
9148 
9149  vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9150  vector signed short vsOld = vec_ld( 0, &src[0] );
9151 
9152  for ( ; i+7 < numSamples; i += 8 ) {
9153  // load src
9154  vs1 = vec_ld( 15, &src[i] );
9155  vs0 = vec_perm( vsOld, vs1, permVec );
9156  vsOld = vs1;
9157 
9158  // unpack shorts to ints
9159  vi0 = vec_unpackh( vs0 );
9160  vi1 = vec_unpackl( vs0 );
9161  // convert ints to floats
9162  v0 = vec_ctf( vi0, 0 );
9163  v1 = vec_ctf( vi1, 0 );
9164  // put into vectors in order to store
9165  v2 = vec_perm( v0, v0, vecFirstHalf );
9166  v3 = v2;
9167  v4 = vec_perm( v0, v0, vecSecondHalf );
9168  v5 = v4;
9169  v6 = vec_perm( v1, v1, vecFirstHalf );
9170  v7 = v6;
9171  v8 = vec_perm( v1, v1, vecSecondHalf );
9172  v9 = v8;
9173 
9174  // store results
9175  ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
9176  }
9177 
9178  for ( ; i < numSamples; i += 2 ) {
9179  dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
9180  dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
9181  }
9182  }
9183  } else if ( kHz == 22050 ) {
9184  if ( numChannels == 1 ) {
9185  int i;
9186  vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9187  vector signed short vsOld = vec_ld( 0, &src[0] );
9188 
9189  for ( i = 0; i+7 < numSamples; i += 8 ) {
9190  // load src
9191  vs1 = vec_ld( 15, &src[i] ); // offset 15 fetches the next aligned block; offset 0 would reload the current block and break the unaligned case
9192  vs0 = vec_perm( vsOld, vs1, permVec );
9193  vsOld = vs1;
9194 
9195  // unpack shorts to ints
9196  vi0 = vec_unpackh( vs0 );
9197  vi1 = vec_unpackl( vs0 );
9198  // convert ints to floats
9199  v0 = vec_ctf( vi0, 0 );
9200  v1 = vec_ctf( vi1, 0 );
9201  // put into vectors in order to store
9202  v2 = vec_perm( v0, v0, vecBottom );
9203  v3 = vec_perm( v0, v0, vecTop );
9204  v4 = vec_perm( v1, v1, vecBottom );
9205  v5 = vec_perm( v1, v1, vecTop );
9206 
9207  // store results
9208  ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
9209  }
9210  // cleanup
9211  for ( ; i < numSamples; i++ ) {
9212  dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
9213  }
9214  } else {
9215  int i;
9216  vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9217  vector signed short vsOld = vec_ld( 0, &src[0] );
9218 
9219  for ( i = 0; i+7 < numSamples; i += 8 ) {
9220  // load src
9221  vs1 = vec_ld( 15, &src[i] );
9222  vs0 = vec_perm( vsOld, vs1, permVec );
9223  vsOld = vs1;
9224 
9225  // unpack shorts to ints
9226  vi0 = vec_unpackh( vs0 );
9227  vi1 = vec_unpackl( vs0 );
9228  // convert ints to floats
9229  v0 = vec_ctf( vi0, 0 );
9230  v1 = vec_ctf( vi1, 0 );
9231  // put into vectors in order to store
9232  v2 = vec_perm( v0, v0, vecFirstHalf );
9233  v3 = vec_perm( v0, v0, vecSecondHalf );
9234  v4 = vec_perm( v1, v1, vecFirstHalf );
9235  v5 = vec_perm( v1, v1, vecSecondHalf );
9236 
9237  // store results
9238  ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
9239  }
9240  // cleanup
9241  for ( ; i < numSamples; i += 2 ) {
9242  dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
9243  dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
9244  }
9245  }
9246  } else if ( kHz == 44100 ) {
9247  int i;
9248  vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9249  vector signed short vsOld = vec_ld( 0, &src[0] );
9250 
9251  for ( i = 0; i+7 < numSamples; i += 8 ) {
9252  vs1 = vec_ld( 15, &src[i] );
9253  vs0 = vec_perm( vsOld, vs1, permVec );
9254  vsOld = vs1;
9255 
9256  //unpack shorts to ints
9257  vi0 = vec_unpackh( vs0 );
9258  vi1 = vec_unpackl( vs0 );
9259 
9260  //convert ints to floats
9261  v0 = vec_ctf( vi0, 0 );
9262  v1 = vec_ctf( vi1, 0 );
9263 
9264  //store results
9265  ALIGNED_STORE2( &dest[i], v0, v1 );
9266  }
9267  // cleanup
9268  for ( ; i < numSamples; i++ ) {
9269  dest[i] = (float) src[i];
9270  }
9271  } else {
9272  assert( 0 );
9273  }
9274 }
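/*
 Illustrative sketch, not part of the original id code: the misaligned-load
 idiom used by every loop above, isolated as a stand-alone helper. vec_ld
 ignores the low four address bits, so the two loads fetch the aligned
 quadwords containing the first and last wanted byte, and vec_lvsl( -1, p )
 plus 1 builds the permute that shifts the 16 wanted bytes into a single
 register. The helper name is our own invention.
*/
static inline vector signed short LoadUnaligned8Shorts( const short *p ) {
	vector signed short lo = vec_ld( 0, p );	// quadword containing p[0]
	vector signed short hi = vec_ld( 15, p );	// quadword containing p[7]
	vector unsigned char perm = vec_add( vec_lvsl( -1, p ), (vector unsigned char)(1) );
	return vec_perm( lo, hi, perm );		// p[0..7] gathered into one register
}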
9275 
9276 #else
9277 
9278 /*
9279 ============
9280 idSIMD_AltiVec::UpSamplePCMTo44kHz
9281 
9282  Duplicate samples for 44kHz output.
9283 
9284  Assumptions:
9285  No assumptions
9286 ============
9287 */
9288 void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
9289 
9290  vector signed short vs0, vs1;
9291  register vector signed int vi0, vi1;
9292  register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
9293  // permute vectors
9294  register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
9295  register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
9296 
9297  register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
9298  register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
9299 
9300  // calculate perm vector and masks for stores
9301  vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
9302  // original values of dest
9303  vector float vecDest = vec_ld( 0, &dest[0] );
9304  vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
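/*
 The three values just computed implement the standard AltiVec idiom for
 storing to a possibly misaligned destination: storePerm rotates each result
 vector so its bytes line up with the aligned quadwords that vec_st writes,
 mask flags which bytes of each output quadword come from new data, and
 vecDest preserves the original bytes in front of dest[0] so the first store
 cannot clobber memory below the buffer. Inside each loop, vec_sel stitches
 adjacent rotated vectors together before the store, and the vecDestEnd load
 preserves the bytes past the last output in the same way.
*/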
9305 
9306  if ( kHz == 11025 ) {
9307  if ( numChannels == 1 ) {
9308  // 8 at a time
9309  int i = 0;
9310 
9311  vector signed short vsOld = vec_ld( 0, &src[i] );
9312  vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
9313 
9314  for ( ; i+7 < numSamples; i += 8 ) {
9315  // load src
9316  vs1 = vec_ld( 15, &src[i] );
9317  vs0 = vec_perm( vsOld, vs1, permVec );
9318  vsOld = vs1;
9319  vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
9320 
9321  // unpack shorts to ints
9322  vi0 = vec_unpackh( vs0 );
9323  vi1 = vec_unpackl( vs0 );
9324  // convert ints to floats
9325  v0 = vec_ctf( vi0, 0 );
9326  v1 = vec_ctf( vi1, 0 );
9327  // permute into vectors in the order to store
9328 
9329  v2 = vec_splat( v0, 0 );
9330  v3 = vec_splat( v0, 1 );
9331  v4 = vec_splat( v0, 2 );
9332  v5 = vec_splat( v0, 3 );
9333  v6 = vec_splat( v1, 0 );
9334  v7 = vec_splat( v1, 1 );
9335  v8 = vec_splat( v1, 2 );
9336  v9 = vec_splat( v1, 3 );
9337 
9338  v2 = vec_perm( v2, v2, storePerm );
9339  v3 = vec_perm( v3, v3, storePerm );
9340  v4 = vec_perm( v4, v4, storePerm );
9341  v5 = vec_perm( v5, v5, storePerm );
9342  v6 = vec_perm( v6, v6, storePerm );
9343  v7 = vec_perm( v7, v7, storePerm );
9344  v8 = vec_perm( v8, v8, storePerm );
9345  v9 = vec_perm( v9, v9, storePerm );
9346 
9347  // store results
9348  vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
9349  vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
9350  vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
9351  vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
9352  vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
9353  vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
9354  vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
9355  vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
9356  vecDest = vec_sel( v9, vecDestEnd, mask );
9357  vec_st( vecDest, 127, &dest[i*4] );
9358  }
9359  // cleanup
9360  for ( ; i < numSamples; i++ ) {
9361  dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
9362  }
9363  } else {
9364  int i = 0;
9365 
9366  vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9367  vector signed short vsOld = vec_ld( 0, &src[0] );
9368 
9369  for ( ; i+7 < numSamples; i += 8 ) {
9370  // load src
9371  vs1 = vec_ld( 15, &src[i] );
9372  vs0 = vec_perm( vsOld, vs1, permVec );
9373  vsOld = vs1;
9374  vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
9375 
9376  // unpack shorts to ints
9377  vi0 = vec_unpackh( vs0 );
9378  vi1 = vec_unpackl( vs0 );
9379  // convert ints to floats
9380  v0 = vec_ctf( vi0, 0 );
9381  v1 = vec_ctf( vi1, 0 );
9382  // put into vectors in order to store
9383  v2 = vec_perm( v0, v0, vecFirstHalf );
9384  v3 = v2;
9385  v4 = vec_perm( v0, v0, vecSecondHalf );
9386  v5 = v4;
9387  v6 = vec_perm( v1, v1, vecFirstHalf );
9388  v7 = v6;
9389  v8 = vec_perm( v1, v1, vecSecondHalf );
9390  v9 = v8;
9391 
9392  v2 = vec_perm( v2, v2, storePerm );
9393  v3 = vec_perm( v3, v3, storePerm );
9394  v4 = vec_perm( v4, v4, storePerm );
9395  v5 = vec_perm( v5, v5, storePerm );
9396  v6 = vec_perm( v6, v6, storePerm );
9397  v7 = vec_perm( v7, v7, storePerm );
9398  v8 = vec_perm( v8, v8, storePerm );
9399  v9 = vec_perm( v9, v9, storePerm );
9400 
9401  // store results
9402  vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
9403  vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
9404  vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
9405  vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
9406  vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
9407  vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
9408  vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
9409  vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
9410  vecDest = vec_sel( v9, vecDestEnd, mask );
9411  vec_st( vecDest, 127, &dest[i*4] );
9412  }
9413 
9414  for ( ; i < numSamples; i += 2 ) {
9415  dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
9416  dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
9417  }
9418  }
9419  } else if ( kHz == 22050 ) {
9420  if ( numChannels == 1 ) {
9421  int i;
9422  vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9423  vector signed short vsOld = vec_ld( 0, &src[0] );
9424 
9425  for ( i = 0; i+7 < numSamples; i += 8 ) {
9426  // load src
9427  vs1 = vec_ld( 15, &src[i] ); // quadword containing src[i+7]
9428  vs0 = vec_perm( vsOld, vs1, permVec );
9429  vsOld = vs1;
9430  vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
9431 
9432  // unpack shorts to ints
9433  vi0 = vec_unpackh( vs0 );
9434  vi1 = vec_unpackl( vs0 );
9435  // convert ints to floats
9436  v0 = vec_ctf( vi0, 0 );
9437  v1 = vec_ctf( vi1, 0 );
9438  // put into vectors in order to store
9439  v2 = vec_perm( v0, v0, vecBottom );
9440  v3 = vec_perm( v0, v0, vecTop );
9441  v4 = vec_perm( v1, v1, vecBottom );
9442  v5 = vec_perm( v1, v1, vecTop );
9443 
9444  v2 = vec_perm( v2, v2, storePerm );
9445  v3 = vec_perm( v3, v3, storePerm );
9446  v4 = vec_perm( v4, v4, storePerm );
9447  v5 = vec_perm( v5, v5, storePerm );
9448 
9449  // store results
9450  vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
9451  vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
9452  vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
9453  vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
9454  vecDest = vec_sel( v5, vecDestEnd, mask );
9455  vec_st( vecDest, 63, &dest[i*2] );
9456 
9457  }
9458  // cleanup
9459  for ( ; i < numSamples; i++ ) {
9460  dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
9461  }
9462  } else {
9463  int i;
9464  vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9465  vector signed short vsOld = vec_ld( 0, &src[0] );
9466 
9467  for ( i = 0; i+7 < numSamples; i += 8 ) {
9468  // load src
9469  vs1 = vec_ld( 15, &src[i] );
9470  vs0 = vec_perm( vsOld, vs1, permVec );
9471  vsOld = vs1;
9472  vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
9473 
9474  // unpack shorts to ints
9475  vi0 = vec_unpackh( vs0 );
9476  vi1 = vec_unpackl( vs0 );
9477  // convert ints to floats
9478  v0 = vec_ctf( vi0, 0 );
9479  v1 = vec_ctf( vi1, 0 );
9480  // put into vectors in order to store
9481  v2 = vec_perm( v0, v0, vecFirstHalf );
9482  v3 = vec_perm( v0, v0, vecSecondHalf );
9483  v4 = vec_perm( v1, v1, vecFirstHalf );
9484  v5 = vec_perm( v1, v1, vecSecondHalf );
9485 
9486  v2 = vec_perm( v2, v2, storePerm );
9487  v3 = vec_perm( v3, v3, storePerm );
9488  v4 = vec_perm( v4, v4, storePerm );
9489  v5 = vec_perm( v5, v5, storePerm );
9490 
9491  // store results
9492  vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
9493  vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
9494  vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
9495  vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
9496  vecDest = vec_sel( v5, vecDestEnd, mask );
9497  vec_st( vecDest, 63, &dest[i*2] );
9498  }
9499  // cleanup
9500  for ( ; i < numSamples; i += 2 ) {
9501  dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
9502  dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
9503  }
9504  }
9505  } else if ( kHz == 44100 ) {
9506  int i;
9507  vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9508  vector signed short vsOld = vec_ld( 0, &src[0] );
9509 
9510  for ( i = 0; i+7 < numSamples; i += 8 ) {
9512  vs1 = vec_ld( 15, &src[i] );
9513  vs0 = vec_perm( vsOld, vs1, permVec );
9514  vsOld = vs1;
9515  vector float vecDestEnd = vec_ld( 31, &dest[i] );
9516 
9517  //unpack shorts to ints
9518  vi0 = vec_unpackh( vs0 );
9519  vi1 = vec_unpackl( vs0 );
9520 
9521  //convert ints to floats
9522  v0 = vec_ctf( vi0, 0 );
9523  v1 = vec_ctf( vi1, 0 );
9524 
9525  v0 = vec_perm( v0, v0, storePerm );
9526  v1 = vec_perm( v1, v1, storePerm );
9527 
9528  // store results
9529  vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
9530  vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
9531  vecDest = vec_sel( v1, vecDestEnd, mask );
9532  vec_st( vecDest, 31, &dest[i] );
9533  }
9534  // cleanup
9535  for ( ; i < numSamples; i++ ) {
9536  dest[i] = (float) src[i];
9537  }
9538  } else {
9539  assert( 0 );
9540  }
9541 }
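/*
 Illustrative sketch, not from the original source: the short-to-float
 widening performed at the top of every loop in this function.
 vec_unpackh / vec_unpackl sign-extend the eight 16-bit samples into two
 vectors of 32-bit ints, and vec_ctf( v, 0 ) converts those ints to floats
 with a scale factor of 2^0, i.e. no scaling. The helper name is ours.
*/
static inline void ConvertShortsToFloats( vector signed short vs, vector float *hi4, vector float *lo4 ) {
	vector signed int hi = vec_unpackh( vs );	// samples 0..3, sign-extended
	vector signed int lo = vec_unpackl( vs );	// samples 4..7, sign-extended
	*hi4 = vec_ctf( hi, 0 );
	*lo4 = vec_ctf( lo, 0 );
}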
9542 
9543 #endif
9544 
9545 #ifdef SOUND_DEST_ALIGNED
9546 /*
9547 ============
9548 idSIMD_AltiVec::UpSampleOGGTo44kHz
9549 
9550  Duplicate samples for 44kHz output.
9551 
9552  Assumptions:
9553  Assumes that dest starts at an aligned address
9554 ============
9555 */
9556 void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
9557  // dest is aligned
9558  assert( IS_16BYTE_ALIGNED( dest[0] ) );
9559 
9560  register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
9561  register vector float constVec, zeroVector;
9562  register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
9563  vector unsigned char vecPerm1;
9564  vector unsigned char vecPerm2;
9565 
9566  vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
9567  vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
9568  vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
9569  vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
9570  vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
9571  vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
9572 
9573  constVec = (vector float)(32768.0f);
9574  zeroVector = (vector float)(0.0);
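// Decoded OGG samples are floats in [-1, 1]; multiplying by 32768 rescales
// them to the signed 16-bit range the mixer works in. AltiVec has no plain
// float multiply, so vec_madd with a zero addend is the idiomatic spelling
// of v * constVec used throughout this function.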
9575 
9576  if ( kHz == 11025 ) {
9577  if ( numChannels == 1 ) {
9578  // calculate perm vector and do first load
9579  vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9580  v10 = vec_ld( 0, &ogg[0][0] );
9581 
9582  int i;
9583  for ( i = 0; i+7 < numSamples; i += 8 ) {
9584  // as it happens, ogg[0][i] through ogg[0][i+7] are contiguous in memory
9585  v8 = v10;
9586  v9 = vec_ld( 15, &ogg[0][i] );
9587  v10 = vec_ld( 31, &ogg[0][i] );
9588  v0 = vec_perm( v8, v9, vecPerm1 );
9589  v1 = vec_perm( v9, v10, vecPerm1 );
9590 
9591  // now that the elements are in a vector, splat each one
9592  // across its own vector
9593  oggVec1 = vec_splat( v0, 0 );
9594  oggVec2 = vec_splat( v0, 1 );
9595  oggVec3 = vec_splat( v0, 2 );
9596  oggVec4 = vec_splat( v0, 3 );
9597  oggVec5 = vec_splat( v1, 0 );
9598  oggVec6 = vec_splat( v1, 1 );
9599  oggVec7 = vec_splat( v1, 2 );
9600  oggVec8 = vec_splat( v1, 3 );
9601 
9602  v0 = vec_madd( oggVec1, constVec, zeroVector );
9603  v1 = vec_madd( oggVec2, constVec, zeroVector );
9604  v2 = vec_madd( oggVec3, constVec, zeroVector );
9605  v3 = vec_madd( oggVec4, constVec, zeroVector );
9606  v4 = vec_madd( oggVec5, constVec, zeroVector );
9607  v5 = vec_madd( oggVec6, constVec, zeroVector );
9608  v6 = vec_madd( oggVec7, constVec, zeroVector );
9609  v7 = vec_madd( oggVec8, constVec, zeroVector );
9610 
9611  //store results
9612  ALIGNED_STORE8( &dest[i*4], v0, v1, v2, v3, v4, v5, v6, v7 );
9613 
9614  }
9615 
9616  //cleanup
9617  for ( ; i < numSamples; i++ ) {
9618  dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
9619  }
9620 
9621  } else {
9622 
9623  // calculate perm vec for ogg
9624  vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9625  vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
9626  v7 = vec_ld( 0, &ogg[1][0] );
9627  v9 = vec_ld( 0, &ogg[0][0] );
9628  int i;
9629 
9630  for ( i = 0; i+3 < numSamples >> 1; i += 4 ) { // four stereo frames per iteration
9631  // load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
9632  v8 = v9;
9633  v9 = vec_ld( 15, &ogg[0][i] );
9634  v0 = vec_perm( v8, v9, vecPerm1 );
9635 
9636  // now that the elements are in a vector, splat each one
9637  // across its own vector
9638  oggVec1 = vec_splat( v0, 0 );
9639  oggVec2 = vec_splat( v0, 1 );
9640  oggVec3 = vec_splat( v0, 2 );
9641  oggVec4 = vec_splat( v0, 3 );
9642 
9643  // load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
9644  v6 = v7;
9645  v7 = vec_ld( 15, &ogg[1][i] );
9646  v1 = vec_perm( v6, v7, vecPerm2 );
9647 
9648  // now that the elements are in a vector, splat each one
9649  // across its own vector
9650  oggVec5 = vec_splat( v1, 0 );
9651  oggVec6 = vec_splat( v1, 1 );
9652  oggVec7 = vec_splat( v1, 2 );
9653  oggVec8 = vec_splat( v1, 3 );
9654 
9655  oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
9656  oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
9657  oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
9658  oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
9659  oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
9660  oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
9661  oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
9662  oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
9663 
9664  //merge generates the interleaved pattern that we want and it
9665  //doesn't require a permute vector, so use that instead
9666  v0 = vec_mergeh( oggVec1, oggVec5 );
9667  v1 = vec_mergel( oggVec1, oggVec5 );
9668  v2 = vec_mergeh( oggVec2, oggVec6 );
9669  v3 = vec_mergel( oggVec2, oggVec6 );
9670 
9671  v4 = vec_mergeh( oggVec3, oggVec7 );
9672  v5 = vec_mergel( oggVec3, oggVec7 );
9673  v6 = vec_mergeh( oggVec4, oggVec8 );
9674  v10 = vec_mergel( oggVec4, oggVec8 );
9675 
9676  //store results
9677  ALIGNED_STORE8( &dest[i*8], v0, v1, v2, v3, v4, v5, v6, v10 );
9678  }
9679 
9680  //cleanup
9681  for ( ; i < numSamples >> 1; i++ ) {
9682  dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
9683  dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
9684  }
9685  }
9686  } else if ( kHz == 22050 ) {
9687  if ( numChannels == 1 ) {
9688 
9689  // calculate perm vector and do first load
9690  vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9691  v10 = vec_ld( 0, &ogg[0][0] );
9692 
9693  int i;
9694 
9695  for ( i = 0; i+7 < numSamples; i += 8 ) {
9696  // load values from ogg
9697  v8 = v10;
9698  v9 = vec_ld( 15, &ogg[0][i] );
9699  v10 = vec_ld( 31, &ogg[0][i] );
9700  v0 = vec_perm( v8, v9, vecPerm1 );
9701  v1 = vec_perm( v9, v10, vecPerm1 );
9702 
9703  // multiply
9704  v0 = vec_madd( v0, constVec, zeroVector );
9705  v1 = vec_madd( v1, constVec, zeroVector );
9706 
9707  // permute into result vectors to store
9708  v5 = vec_perm( v0, v0, vecOneTwo );
9709  v6 = vec_perm( v0, v0, vecThreeFour );
9710  v7 = vec_perm( v1, v1, vecOneTwo );
9711  v8 = vec_perm( v1, v1, vecThreeFour );
9712 
9713  //store results
9714  ALIGNED_STORE4( &dest[i*2], v5, v6, v7, v8 );
9715  }
9716  // cleanup
9717  for ( ; i < numSamples; i++ ) {
9718  dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
9719  }
9720  } else {
9721 
9722  // calculate perm vector and do first load
9723  vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9724  vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
9725  v7 = vec_ld( 0, &ogg[1][0] );
9726  v9 = vec_ld( 0, &ogg[0][0] );
9727 
9728  int i;
9729  for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
9730  // load ogg[0][i] to ogg[0][i+3]
9731  v8 = v9;
9732  v9 = vec_ld( 15, &ogg[0][i] );
9733  v0 = vec_perm( v8, v9, vecPerm1 );
9734 
9735  // load ogg[1][i] to ogg[1][i+3]
9736  v6 = v7;
9737  v7 = vec_ld( 15, &ogg[1][i] );
9738  v1 = vec_perm( v6, v7, vecPerm2 );
9739 
9740  // multiply
9741  v0 = vec_madd( v0, constVec, zeroVector );
9742  v1 = vec_madd( v1, constVec, zeroVector );
9743 
9744  // generate result vectors to store
9745  v2 = vec_perm( v0, v1, vecFirst );
9746  v3 = vec_perm( v0, v1, vecSecond );
9747  v4 = vec_perm( v0, v1, vecThird );
9748  v5 = vec_perm( v0, v1, vecFourth );
9749 
9750  // store results
9751  ALIGNED_STORE4( &dest[i*4], v2, v3, v4, v5 );
9752  }
9753  // cleanup
9754  for ( ; i < numSamples >> 1; i++ ) {
9755  dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
9756  dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
9757  }
9758  }
9759  } else if ( kHz == 44100 ) {
9760  if ( numChannels == 1 ) {
9761  // calculate perm vector and do first load
9762  vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9763 
9764  v9 = vec_ld( 0, &ogg[0][0] );
9765  int i;
9766 
9767  for ( i = 0; i+7 < numSamples; i += 8 ) {
9768  // load values from ogg
9769  v8 = v9;
9770  v7 = vec_ld( 15, &ogg[0][i] );
9771  v6 = v7;
9772  v9 = vec_ld( 31, &ogg[0][i] );
9773 
9774  v0 = vec_perm( v8, v7, vecPerm1 );
9775  v1 = vec_perm( v6, v9, vecPerm1 );
9776 
9777  // multiply
9778  v0 = vec_madd( v0, constVec, zeroVector );
9779  v1 = vec_madd( v1, constVec, zeroVector );
9780 
9781  ALIGNED_STORE2( &dest[i], v0, v1 );
9782  }
9783 
9784  // cleanup
9785  for ( ; i < numSamples; i++ ) {
9786  dest[i] = ogg[0][i] * 32768.0f;
9787  }
9788  } else {
9789 
9790  // calculate perm vector and do first load
9791  vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9792  vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
9793  v7 = vec_ld( 0, &ogg[1][0] );
9794  v9 = vec_ld( 0, &ogg[0][0] );
9795  int i;
9796 
9797  for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
9798  v8 = v9;
9799  v9 = vec_ld( 15, &ogg[0][i] );
9800  v0 = vec_perm( v8, v9, vecPerm1 );
9801 
9802  // load ogg[1][i] to ogg[1][i+3]
9803  v6 = v7;
9804  v7 = vec_ld( 15, &ogg[1][i] );
9805  v1 = vec_perm( v6, v7, vecPerm2 );
9806 
9807  // multiply
9808  v0 = vec_madd( v0, constVec, zeroVector );
9809  v1 = vec_madd( v1, constVec, zeroVector );
9810 
9811  // generate result vectors
9812  v2 = vec_mergeh( v0, v1 );
9813  v3 = vec_mergel( v0, v1 );
9814 
9815  // store results
9816  ALIGNED_STORE2( &dest[i*2], v2, v3 );
9817  }
9818  // cleanup
9819  for ( ; i < numSamples >> 1; i++ ) {
9820  dest[i*2+0] = ogg[0][i] * 32768.0f;
9821  dest[i*2+1] = ogg[1][i] * 32768.0f;
9822  }
9823  }
9824  } else {
9825  assert( 0 );
9826  }
9827 }
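/*
 Illustrative sketch, not part of the original code: the vec_splat fan-out
 behind the 11 kHz (4x duplication) paths above. Broadcasting each lane of
 one input vector yields the four output vectors of a 4x upsample; the
 helper name is our own.
*/
static inline void Splat4x( vector float in, vector float out[4] ) {
	out[0] = vec_splat( in, 0 );	// { s0, s0, s0, s0 }
	out[1] = vec_splat( in, 1 );	// { s1, s1, s1, s1 }
	out[2] = vec_splat( in, 2 );	// { s2, s2, s2, s2 }
	out[3] = vec_splat( in, 3 );	// { s3, s3, s3, s3 }
}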
9828 
9829 #else
9830 
9831 /*
9832 ============
9833 idSIMD_AltiVec::UpSampleOGGTo44kHz
9834 
9835  Duplicate samples for 44kHz output.
9836 
9837  Assumptions:
9838  No assumptions
9839 ============
9840 */
9841 void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
9842 
9843  register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
9844  register vector float constVec, zeroVector;
9845  register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
9846  vector unsigned char vecPerm1;
9847  vector unsigned char vecPerm2;
9848 
9849  vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
9850  vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
9851  vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
9852  vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
9853  vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
9854  vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
9855 
9856  vector unsigned char storePerm;
9857 
9858  constVec = (vector float)(32768.0f);
9859  zeroVector = (vector float)(0.0);
9860 
9861  // calculate perm vector and masks for stores
9862  storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
9863  // original values of dest
9864  vector float vecDest = vec_ld( 0, &dest[0] );
9865  vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
9866 
9867  if ( kHz == 11025 ) {
9868  if ( numChannels == 1 ) {
9869  // calculate perm vector and do first load
9870  vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9871  v10 = vec_ld( 0, &ogg[0][0] );
9872 
9873  int i;
9874  for ( i = 0; i+7 < numSamples; i += 8 ) {
9875  // as it happens, ogg[0][i] through ogg[0][i+7] are contiguous in memory
9876  v8 = v10;
9877  v9 = vec_ld( 15, &ogg[0][i] );
9878  v10 = vec_ld( 31, &ogg[0][i] );
9879  vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
9880  v0 = vec_perm( v8, v9, vecPerm1 );
9881  v1 = vec_perm( v9, v10, vecPerm1 );
9882 
9883  // now that the elements are in a vector, splat each one
9884  // across its own vector
9885  oggVec1 = vec_splat( v0, 0 );
9886  oggVec2 = vec_splat( v0, 1 );
9887  oggVec3 = vec_splat( v0, 2 );
9888  oggVec4 = vec_splat( v0, 3 );
9889  oggVec5 = vec_splat( v1, 0 );
9890  oggVec6 = vec_splat( v1, 1 );
9891  oggVec7 = vec_splat( v1, 2 );
9892  oggVec8 = vec_splat( v1, 3 );
9893 
9894  v0 = vec_madd( oggVec1, constVec, zeroVector );
9895  v1 = vec_madd( oggVec2, constVec, zeroVector );
9896  v2 = vec_madd( oggVec3, constVec, zeroVector );
9897  v3 = vec_madd( oggVec4, constVec, zeroVector );
9898  v4 = vec_madd( oggVec5, constVec, zeroVector );
9899  v5 = vec_madd( oggVec6, constVec, zeroVector );
9900  v6 = vec_madd( oggVec7, constVec, zeroVector );
9901  v7 = vec_madd( oggVec8, constVec, zeroVector );
9902 
9903  // rotate input data
9904  v0 = vec_perm( v0, v0, storePerm );
9905  v1 = vec_perm( v1, v1, storePerm );
9906  v2 = vec_perm( v2, v2, storePerm );
9907  v3 = vec_perm( v3, v3, storePerm );
9908  v4 = vec_perm( v4, v4, storePerm );
9909  v5 = vec_perm( v5, v5, storePerm );
9910  v6 = vec_perm( v6, v6, storePerm );
9911  v7 = vec_perm( v7, v7, storePerm );
9912 
9913  // store results
9914  vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*4] );
9915  vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*4] );
9916  vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*4] );
9917  vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*4] );
9918  vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*4] );
9919  vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*4] );
9920  vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*4] );
9921  vec_st( vec_sel( v6, v7, mask ), 111, &dest[i*4] );
9922  vecDest = vec_sel( v7, vecDestEnd, mask );
9923  vec_st( vecDest, 127, &dest[i*4] );
9924  }
9925 
9926  //cleanup
9927  for ( ; i < numSamples; i++ ) {
9928  dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
9929  }
9930 
9931  } else {
9932 
9933  // calculate perm vec for ogg
9934  vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9935  vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
9936  v7 = vec_ld( 0, &ogg[1][0] );
9937  v9 = vec_ld( 0, &ogg[0][0] );
9938  int i;
9939 
9940  for ( i = 0; i+3 < numSamples >> 1; i += 4 ) { // four stereo frames per iteration
9941  // load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
9942  v8 = v9;
9943  v9 = vec_ld( 15, &ogg[0][i] );
9944  vector float vecDestEnd = vec_ld( 127, &dest[i*8] );
9945  v0 = vec_perm( v8, v9, vecPerm1 );
9946 
9947  // now that the elements are in a vector, splat each one
9948  // across its own vector
9949  oggVec1 = vec_splat( v0, 0 );
9950  oggVec2 = vec_splat( v0, 1 );
9951  oggVec3 = vec_splat( v0, 2 );
9952  oggVec4 = vec_splat( v0, 3 );
9953 
9954  // load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
9955  v6 = v7;
9956  v7 = vec_ld( 15, &ogg[1][i] );
9957  v1 = vec_perm( v6, v7, vecPerm2 );
9958 
9959  // now that the elements are in a vector, splat each one
9960  // across its own vector
9961  oggVec5 = vec_splat( v1, 0 );
9962  oggVec6 = vec_splat( v1, 1 );
9963  oggVec7 = vec_splat( v1, 2 );
9964  oggVec8 = vec_splat( v1, 3 );
9965 
9966  oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
9967  oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
9968  oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
9969  oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
9970  oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
9971  oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
9972  oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
9973  oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
9974 
9975  //merge generates the interleaved pattern that we want and it
9976  //doesn't require a permute vector, so use that instead
9977  v0 = vec_mergeh( oggVec1, oggVec5 );
9978  v1 = vec_mergel( oggVec1, oggVec5 );
9979  v2 = vec_mergeh( oggVec2, oggVec6 );
9980  v3 = vec_mergel( oggVec2, oggVec6 );
9981 
9982  v4 = vec_mergeh( oggVec3, oggVec7 );
9983  v5 = vec_mergel( oggVec3, oggVec7 );
9984  v6 = vec_mergeh( oggVec4, oggVec8 );
9985  v10 = vec_mergel( oggVec4, oggVec8 );
9986 
9987  // rotate input data
9988  v0 = vec_perm( v0, v0, storePerm );
9989  v1 = vec_perm( v1, v1, storePerm );
9990  v2 = vec_perm( v2, v2, storePerm );
9991  v3 = vec_perm( v3, v3, storePerm );
9992  v4 = vec_perm( v4, v4, storePerm );
9993  v5 = vec_perm( v5, v5, storePerm );
9994  v6 = vec_perm( v6, v6, storePerm );
9995  v10 = vec_perm( v10, v10, storePerm );
9996 
9997  // store results
9998  vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*8] );
9999  vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*8] );
10000  vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*8] );
10001  vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*8] );
10002  vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*8] );
10003  vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*8] );
10004  vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*8] );
10005  vec_st( vec_sel( v6, v10, mask ), 111, &dest[i*8] );
10006  vecDest = vec_sel( v10, vecDestEnd, mask );
10007  vec_st( vecDest, 127, &dest[i*8] );
10008  }
10009 
10010  //cleanup
10011  for ( ; i < numSamples >> 1; i++ ) {
10012  dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
10013  dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
10014  }
10015  }
10016  } else if ( kHz == 22050 ) {
10017  if ( numChannels == 1 ) {
10018 
10019  // calculate perm vector and do first load
10020  vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
10021  v10 = vec_ld( 0, &ogg[0][0] );
10022 
10023  int i;
10024 
10025  for ( i = 0; i+7 < numSamples; i += 8 ) {
10026 
10027  // load values from ogg
10028  v8 = v10;
10029  v9 = vec_ld( 15, &ogg[0][i] );
10030  v10 = vec_ld( 31, &ogg[0][i] );
10031  vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
10032  v0 = vec_perm( v8, v9, vecPerm1 );
10033  v1 = vec_perm( v9, v10, vecPerm1 );
10034 
10035  // multiply
10036  v0 = vec_madd( v0, constVec, zeroVector );
10037  v1 = vec_madd( v1, constVec, zeroVector );
10038 
10039  // permute into result vectors to store
10040  v5 = vec_perm( v0, v0, vecOneTwo );
10041  v6 = vec_perm( v0, v0, vecThreeFour );
10042  v7 = vec_perm( v1, v1, vecOneTwo );
10043  v8 = vec_perm( v1, v1, vecThreeFour );
10044 
10045  // rotate input data
10046  v5 = vec_perm( v5, v5, storePerm );
10047  v6 = vec_perm( v6, v6, storePerm );
10048  v7 = vec_perm( v7, v7, storePerm );
10049  v8 = vec_perm( v8, v8, storePerm );
10050 
10051  // store results
10052  vec_st( vec_sel( vecDest, v5, mask ), 0, &dest[i*2] );
10053  vec_st( vec_sel( v5, v6, mask ), 15, &dest[i*2] );
10054  vec_st( vec_sel( v6, v7, mask ), 31, &dest[i*2] );
10055  vec_st( vec_sel( v7, v8, mask ), 47, &dest[i*2] );
10056  vecDest = vec_sel( v8, vecDestEnd, mask );
10057  vec_st( vecDest, 63, &dest[i*2] );
10058  }
10059 
10060  // cleanup
10061  for ( ; i < numSamples; i++ ) {
10062  dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
10063  }
10064  } else {
10065 
10066  // calculate perm vector and do first load
10067  vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
10068  vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
10069  v7 = vec_ld( 0, &ogg[1][0] );
10070  v9 = vec_ld( 0, &ogg[0][0] );
10071 
10072  int i;
10073  for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
10074  // load ogg[0][i] to ogg[0][i+3]
10075  v8 = v9;
10076  v9 = vec_ld( 15, &ogg[0][i] );
10077  vector float vecDestEnd = vec_ld( 63, &dest[i*4] );
10078  v0 = vec_perm( v8, v9, vecPerm1 );
10079 
10080  // load ogg[1][i] to ogg[1][i+3]
10081  v6 = v7;
10082  v7 = vec_ld( 15, &ogg[1][i] );
10083  v1 = vec_perm( v6, v7, vecPerm2 );
10084 
10085  // multiply
10086  v0 = vec_madd( v0, constVec, zeroVector );
10087  v1 = vec_madd( v1, constVec, zeroVector );
10088 
10089  // generate result vectors to store
10090  v2 = vec_perm( v0, v1, vecFirst );
10091  v3 = vec_perm( v0, v1, vecSecond );
10092  v4 = vec_perm( v0, v1, vecThird );
10093  v5 = vec_perm( v0, v1, vecFourth );
10094 
10095  // rotate input data
10096  v2 = vec_perm( v2, v2, storePerm );
10097  v3 = vec_perm( v3, v3, storePerm );
10098  v4 = vec_perm( v4, v4, storePerm );
10099  v5 = vec_perm( v5, v5, storePerm );
10100 
10101  // store results
10102  vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
10103  vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
10104  vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
10105  vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
10106  vecDest = vec_sel( v5, vecDestEnd, mask );
10107  vec_st( vecDest, 63, &dest[i*4] );
10108  }
10109 
10110  // cleanup
10111  for ( ; i < numSamples >> 1; i++ ) {
10112  dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
10113  dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
10114  }
10115  }
10116  } else if ( kHz == 44100 ) {
10117  if ( numChannels == 1 ) {
10118  // calculate perm vector and do first load
10119  vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
10120 
10121  v9 = vec_ld( 0, &ogg[0][0] );
10122  int i;
10123 
10124  for ( i = 0; i+7 < numSamples; i += 8 ) {
10125  // load values from ogg
10126  v8 = v9;
10127  v7 = vec_ld( 15, &ogg[0][i] );
10128  v6 = v7;
10129  v9 = vec_ld( 31, &ogg[0][i] );
10130  vector float vecDestEnd = vec_ld( 31, &dest[i] );
10131 
10132  v0 = vec_perm( v8, v7, vecPerm1 );
10133  v1 = vec_perm( v6, v9, vecPerm1 );
10134 
10135  // multiply
10136  v0 = vec_madd( v0, constVec, zeroVector );
10137  v1 = vec_madd( v1, constVec, zeroVector );
10138 
10139  // rotate data
10140  v0 = vec_perm( v0, v0, storePerm );
10141  v1 = vec_perm( v1, v1, storePerm );
10142 
10143  // store results
10144  vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
10145  vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
10146  vecDest = vec_sel( v1, vecDestEnd, mask );
10147  vec_st( vecDest, 31, &dest[i] );
10148  }
10149 
10150  // cleanup
10151  for ( ; i < numSamples; i++ ) {
10152  dest[i] = ogg[0][i] * 32768.0f;
10153  }
10154  } else {
10155 
10156  // calculate perm vector and do first load
10157  vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
10158  vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
10159  v7 = vec_ld( 0, &ogg[1][0] );
10160  v9 = vec_ld( 0, &ogg[0][0] );
10161  int i;
10162 
10163  for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
10164  v8 = v9;
10165  v9 = vec_ld( 15, &ogg[0][i] );
10166  v0 = vec_perm( v8, v9, vecPerm1 );
10167 
10168  // load ogg[1][i] to ogg[1][i+3]
10169  v6 = v7;
10170  v7 = vec_ld( 15, &ogg[1][i] );
10171  v1 = vec_perm( v6, v7, vecPerm2 );
10172 
10173  // multiply
10174  v0 = vec_madd( v0, constVec, zeroVector );
10175  v1 = vec_madd( v1, constVec, zeroVector );
10176 
10177  // generate result vectors
10178  v2 = vec_mergeh( v0, v1 );
10179  v3 = vec_mergel( v0, v1 );
10180 
10181  // store results
10182  UNALIGNED_STORE2( &dest[i*2], v2, v3 );
10183  }
10184  // cleanup
10185  for ( ; i < numSamples >> 1; i++ ) {
10186  dest[i*2+0] = ogg[0][i] * 32768.0f;
10187  dest[i*2+1] = ogg[1][i] * 32768.0f;
10188  }
10189  }
10190  } else {
10191  assert( 0 );
10192  }
10193 }
10194 #endif /* SOUND_DEST_ALIGNED */
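/*
 Illustrative sketch, not from the original source: the vec_mergeh /
 vec_mergel interleave used by the 44 kHz stereo paths above. Merging
 alternates lanes from the two inputs, which is why those paths need no
 permute constant to produce L/R-interleaved output. The helper name is
 ours.
*/
static inline void InterleaveStereo( vector float left, vector float right,
									 vector float *first, vector float *second ) {
	*first = vec_mergeh( left, right );		// { L0, R0, L1, R1 }
	*second = vec_mergel( left, right );	// { L2, R2, L3, R3 }
}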
10195 
10196 #ifdef SOUND_DEST_ALIGNED
10197 /*
10198 ============
10199 idSIMD_AltiVec::MixSoundTwoSpeakerMono
10200 
10201  Assumptions:
10202  Assumes that mixBuffer starts at an aligned address
10203 ============
10204 */
10205 void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
10206 
10207  // mixBuffer is aligned
10208  assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
10209 
10210  int i;
10211  float inc[2];
10212  float spkr[4];
10213 
10214  register vector float vecInc;
10215  register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
10216  register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
10217  register vector float vecSamplesLd1, vecSamplesLd2;
10218  register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
10219 
10220  register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
10221  register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
10222  register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
10223  register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
10224 
10225  //constants
10226  vector float fourVec = (vector float)(4.0);
10227  vector float zeroVec = (vector float)(0.0);
10228 
10229  inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10230  inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10231 
10232  spkr[0] = lastV[0];
10233  spkr[1] = lastV[1];
10234  spkr[2] = lastV[0] + inc[0];
10235  spkr[3] = lastV[1] + inc[1];
10236 
10237  assert( numSamples == MIXBUFFER_SAMPLES );
10238 
10239  inc[0] *= 2;
10240  inc[1] *= 2;
10241 
10242  //load data into registers
10243  vector float v0 = loadSplatUnalignedScalar( &inc[0] );
10244  vector float v1 = loadSplatUnalignedScalar( &inc[1] );
10245  vecInc = vec_mergeh( v0, v1 );
10246 
10247  vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
10248  vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
10249  vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
10250  vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
10251 
10252  // load spkr array
10253  v0 = vec_mergeh( v2, v4 );
10254  v1 = vec_mergeh( v3, v5 );
10255  vecSpeaker1 = vec_mergeh( v0, v1 );
10256 
10257  vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
10258  vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
10259  vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
10260  vecInc = vec_madd( vecInc, fourVec, zeroVec );
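// At this point vecSpeaker1..4 hold the interpolated { L, R, L, R } gain
// pairs for frames 0-1, 2-3, 4-5 and 6-7 of an iteration, and vecInc
// (already doubled above) is scaled by four more so that a single vec_add
// per gain vector advances the ramp a full eight frames.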
10261 
10262  vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10263  vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10264 
10265  //since MIXBUFFER_SAMPLES is a multiple of 8, we don't
10266  //need a cleanup loop
10267  for ( i = 0; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
10268 
10269  //load samples and mix buffers
10270  vecSamplesLd1 = vecSamplesLast;
10271  vecSamplesLd2 = vec_ld( 15, &samples[i] );
10272  vecSamplesLast = vec_ld( 31, &samples[i] );
10273 
10274  vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
10275  vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
10276 
10277  vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
10278  vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
10279  vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
10280  vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
10281 
10282  vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
10283  vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
10284  vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
10285  vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
10286 
10287  vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
10288  vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
10289  vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
10290  vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
10291 
10292  // store results
10293  ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
10294 
10295  //add for next iteration
10296  vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
10297  vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
10298  vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
10299  vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
10300  }
10301 }
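/*
 Scalar equivalent of the vector loop above, for reference only (our sketch,
 not the engine's shipped fallback): each mono sample is scaled by a
 linearly ramped left and right gain and accumulated into the interleaved
 stereo mix buffer.
*/
static void MixTwoSpeakerMonoScalar( float *mixBuffer, const float *samples,
									 const float lastV[2], const float currentV[2] ) {
	float incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	float incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	float gainL = lastV[0];
	float gainR = lastV[1];
	for ( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) {
		mixBuffer[i*2+0] += samples[i] * gainL;
		mixBuffer[i*2+1] += samples[i] * gainR;
		gainL += incL;
		gainR += incR;
	}
}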
10302 
10303 #else
10304 
10305 /*
10306 ============
10307 idSIMD_AltiVec::MixSoundTwoSpeakerMono
10308 
10309  Assumptions:
10310  No assumptions
10311 ============
10312 */
10313 void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
10314 
10315  int i;
10316  float inc[2];
10317  float spkr[4];
10318 
10319  register vector float vecInc;
10320  register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
10321  register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
10322  register vector float vecSamplesLd1, vecSamplesLd2;
10323  register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
10324 
10325  register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
10326  register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
10327  register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
10328  register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
10329 
10330  //constants
10331  vector float fourVec = (vector float)(4.0);
10332  vector float zeroVec = (vector float)(0.0);
10333 
10334  inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10335  inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10336 
10337  spkr[0] = lastV[0];
10338  spkr[1] = lastV[1];
10339  spkr[2] = lastV[0] + inc[0];
10340  spkr[3] = lastV[1] + inc[1];
10341 
10342  assert( numSamples == MIXBUFFER_SAMPLES );
10343 
10344  inc[0] *= 2;
10345  inc[1] *= 2;
10346 
10347  //load data into registers
10348  vector float v0 = loadSplatUnalignedScalar( &inc[0] );
10349  vector float v1 = loadSplatUnalignedScalar( &inc[1] );
10350  vecInc = vec_mergeh( v0, v1 );
10351 
10352  vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
10353  vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
10354  vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
10355  vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
10356 
10357  // load spkr array
10358  v0 = vec_mergeh( v2, v4 );
10359  v1 = vec_mergeh( v3, v5 );
10360  vecSpeaker1 = vec_mergeh( v0, v1 );
10361 
10362  vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
10363  vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
10364  vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
10365  vecInc = vec_madd( vecInc, fourVec, zeroVec );
10366 
10367  vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10368  vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
10369  vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10370  vector float vecDest = vec_ld( 0, &mixBuffer[0] );
10371 
10372  //since MIXBUFFER_SAMPLES is a multiple of 8, we don't
10373  //need a cleanup loop
10374  for ( i = 0; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
10375 
10376  //load samples and mix buffers
10377  vecSamplesLd1 = vecSamplesLast;
10378  vecSamplesLd2 = vec_ld( 15, &samples[i] );
10379  vecSamplesLast = vec_ld( 31, &samples[i] );
10380 
10381  vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
10382  vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
10383 
10384  vecMixBuffer1 = vecDest;
10385  vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
10386  vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
10387  vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
10388  vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
10389 
10390  vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
10391  vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
10392  vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
10393  vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
10394 
10395  vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
10396  vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
10397  vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
10398  vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
10399 
10400  vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
10401  vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
10402  vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
10403  vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
10404 
10405  // store results
10406  UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
10407 
10408  //add for next iteration
10409  vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
10410  vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
10411  vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
10412  vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
10413  }
10414 }
10415 
10416 #endif /* SOUND_DEST_ALIGNED */
10417 
10418 #ifdef SOUND_DEST_ALIGNED
10419 /*
10420 ============
10421 idSIMD_AltiVec::MixSoundTwoSpeakerStereo
10422 
10423  Assumptions:
10424  Assumes that mixBuffer starts at an aligned address
10425 ============
10426 */
10427 void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
10428  // mixBuffer is aligned
10429  assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
10430 
10431  int i, k;
10432  float inc[2];
10433  float spkr[4];
10434 
10435  // mix buffer accumulators
10436  register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
10437  // sample load registers
10438  register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
10439  register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
10440  register vector float vecInc;
10441  vector float fourVec = (vector float)(4.0);
10442  vector float zeroVec = (vector float)(0.0);
10443 
10444  assert( numSamples == MIXBUFFER_SAMPLES );
10445 
10446  inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10447  inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10448 
10449  spkr[0] = lastV[0];
10450  spkr[1] = lastV[1];
10451  spkr[2] = lastV[0] + inc[0];
10452  spkr[3] = lastV[1] + inc[1];
10453 
10454  for ( k = 0; k < 2; k++ ) {
10455  inc[k] *= 2;
10456  }
10457 
10458  // load data in vectors
10459  vector float v0 = loadSplatUnalignedScalar( &inc[0] );
10460  vector float v1 = loadSplatUnalignedScalar( &inc[1] );
10461  vecInc = vec_mergeh( v0, v1 );
10462 
10463  vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
10464  vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
10465  vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
10466  vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
10467 
10468  // load spkr array
10469  v0 = vec_mergeh( v2, v4 );
10470  v1 = vec_mergeh( v3, v5 );
10471  vecSpeaker1 = vec_mergeh( v0, v1 );
10472 
10473  vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
10474  vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
10475  vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
10476  vecInc = vec_madd( vecInc, fourVec, zeroVec );
10477 
10478  vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10479  vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10480 
10481  //since MIXBUFFER_SAMPLES is a multiple of 8, we don't
10482  //need a cleanup loop
10483  for ( i = 0; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
10484  // load mix buffers and samples
10485  vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
10486  vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
10487  vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
10488  vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
10489 
10490  vecSamples1 = vecSamplesLast;
10491  vecSamples2 = vec_ld( 15, &samples[i*2] );
10492  vecSamples3 = vec_ld( 31, &samples[i*2] );
10493  vecSamples4 = vec_ld( 47, &samples[i*2] );
10494  vecSamplesLast = vec_ld( 63, &samples[i*2] );
10495 
10496  vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
10497  vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
10498  vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
10499  vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
10500 
10501  vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
10502  vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
10503  vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
10504  vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
10505 
10506  vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
10507  vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
10508  vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
10509  vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
10510 
10511  //store results
10512  ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
10513  }
10514 }
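// Note: unlike the mono path above, samples[] here is already L/R
// interleaved, so the { L, R, L, R } gain vectors line up with the input
// directly and no sample shuffle is needed, only the unaligned sample
// loads stitched together with samplesPerm.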
10515 #else
10516 
10517 /*
10518 ============
10519 idSIMD_AltiVec::MixSoundTwoSpeakerStereo
10520 
10521  Assumptions:
10522  No assumptions
10523 ============
10524 */
10525 void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
10526 
10527  int i, k;
10528  float inc[2];
10529  float spkr[4];
10530  // mix buffer accumulators
10531  register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
10532  // sample load registers
10533  register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
10534  register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
10535  register vector float vecInc;
10536  vector float fourVec = (vector float)(4.0);
10537  vector float zeroVec = (vector float)(0.0);
10538 
10539  assert( numSamples == MIXBUFFER_SAMPLES );
10540 
10541  inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10542  inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10543 
10544  spkr[0] = lastV[0];
10545  spkr[1] = lastV[1];
10546  spkr[2] = lastV[0] + inc[0];
10547  spkr[3] = lastV[1] + inc[1];
10548 
10549  for ( k = 0; k < 2; k++ ) {
10550  inc[k] *= 2;
10551  }
10552 
10553  // load data in vectors
10554  vector float v0 = loadSplatUnalignedScalar( &inc[0] );
10555  vector float v1 = loadSplatUnalignedScalar( &inc[1] );
10556  vecInc = vec_mergeh( v0, v1 );
10557 
10558  vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
10559  vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
10560  vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
10561  vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
10562 
10563  // load spkr array
10564  v0 = vec_mergeh( v2, v4 );
10565  v1 = vec_mergeh( v3, v5 );
10566  vecSpeaker1 = vec_mergeh( v0, v1 );
10567 
10568  vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
10569  vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
10570  vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
10571  vecInc = vec_madd( vecInc, fourVec, zeroVec );
10572 
10573  vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10574  vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
10575  vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10576  vector float vecDest = vec_ld( 0, &mixBuffer[0] );
10577 
10578  //since MIXBUFFER_SAMPLES is a multiple of 8, we don't
10579  //need a cleanup loop
10580  for ( i = 0; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
10581  // load mix buffers and samples
10582  vecMixBuffer1 = vecDest;
10583  vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
10584  vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
10585  vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
10586  vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
10587 
10588  vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
10589  vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
10590  vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
10591  vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
10592 
10593  vecSamples1 = vecSamplesLast;
10594  vecSamples2 = vec_ld( 15, &samples[i*2] );
10595  vecSamples3 = vec_ld( 31, &samples[i*2] );
10596  vecSamples4 = vec_ld( 47, &samples[i*2] );
10597  vecSamplesLast = vec_ld( 63, &samples[i*2] );
10598 
10599  vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
10600  vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
10601  vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
10602  vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
10603 
10604  vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
10605  vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
10606  vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
10607  vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
10608 
10609  vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
10610  vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
10611  vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
10612  vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
10613 
10614  // store results
10615  UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
10616  }
10617 }
10618 
10619 #endif /* SOUND_DEST_ALIGNED */
10620 
10621 #ifdef SOUND_DEST_ALIGNED
10622 /*
10623 ============
10624 idSIMD_AltiVec::MixSoundSixSpeakerMono
10625 
10626  Assumptions:
10627  Assumes that mixBuffer starts at an aligned address
10628 ============
10629 */
10630 void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
10631 
10632  // mixBuffer is aligned
10633  assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
10634 
10635  float incL[24];
10636  float sL[24];
10637  int i, k;
10638 
10639  vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
10640  vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
10641  vector float vecSamplesLd;
10642  vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
10643  vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
10644  // permute vectors for sample
10645  vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
10646  vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
10647 
10648  assert( numSamples == MIXBUFFER_SAMPLES );
10649  assert( SPEAKER_RIGHT == 1 );
10650  assert( SPEAKER_BACKRIGHT == 5 );
10651 
10652  // incL array, 6 elements repeated
10653  incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10654  incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10655  incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
10656  incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
10657  incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
10658  incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
10659 
10660  // sL array repeated
10661  for ( k = 0; k < 6; k++ ) {
10662  sL[k] = lastV[k];
10663  }
10664  for ( k = 6; k < 12; k++ ) {
10665  sL[k] = lastV[k-6] + incL[k];
10666  }
10667  for ( k = 12; k < 18; k++ ) {
10668  sL[k] = lastV[k-12] + incL[k] + incL[k];
10669  }
10670  for ( k = 18; k < 24; k++ ) {
10671  sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
10672  }
10673 
10674  // multiply the increments by 4 since the loop below mixes four samples per iteration
10675  for ( k = 0; k < 24; k++ ) {
10676  incL[k] *= 4;
10677  }
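// incL and sL are laid out as four consecutive frames of six channel gains
// ( 4 * 6 = 24 floats = six vectors ), so each pass of the loop below mixes
// four mono samples into all six speaker channels with six vec_madds; the
// *= 4 above advances the gains by the four frames consumed per iteration.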
10678 
10679  //load the data
10680  vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
10681  vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
10682 
10683  vecIncl1 = vec_ld( 0, &incL[0] );
10684  vecIncl2 = vec_ld( 15, &incL[0] );
10685  vecIncl3 = vec_ld( 31, &incL[0] );
10686  vecIncl4 = vec_ld( 47, &incL[0] );
10687  vecIncl5 = vec_ld( 63, &incL[0] );
10688  vecIncl6 = vec_ld( 79, &incL[0] );
10689  vecIncl7 = vec_ld( 95, &incL[0] );
10690 
10691  vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
10692  vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
10693  vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
10694  vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
10695  vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
10696  vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );
10697 
10698  vecSL1 = vec_ld( 0, &sL[0] );
10699  vecSL2 = vec_ld( 15, &sL[0] );
10700  vecSL3 = vec_ld( 31, &sL[0] );
10701  vecSL4 = vec_ld( 47, &sL[0] );
10702  vecSL5 = vec_ld( 63, &sL[0] );
10703  vecSL6 = vec_ld( 79, &sL[0] );
10704  vecSL7 = vec_ld( 95, &sL[0] );
10705 
10706  vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
10707  vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
10708  vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
10709  vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
10710  vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
10711  vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );
10712 
10713 
10714  vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10715  vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10716 
10717  //since MIXBUFFER_SAMPLES is a multiple of 4, we don't
10718  //need a cleanup loop
10719  for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
10720  //load mix buffer into vectors, assume aligned
10721  vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
10722  vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
10723  vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
10724  vecMixBuffer4 = vec_ld( 0, &mixBuffer[(i*6)+12] );
10725  vecMixBuffer5 = vec_ld( 0, &mixBuffer[(i*6)+16] );
10726  vecMixBuffer6 = vec_ld( 0, &mixBuffer[(i*6)+20] );
10727 
10728  //load samples into vector
10729  vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
10730  vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
10731  vecSamplesLast = vecSamplesLd2;
10732 
10733  //permute to get them ordered how we want
10734  vecSamples1 = vec_splat( vecSamplesLd, 0 );
10735  vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
10736  vecSamples3 = vec_splat( vecSamplesLd, 1 );
10737  vecSamples4 = vec_splat( vecSamplesLd, 2 );
10738  vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
10739  vecSamples6 = vec_splat( vecSamplesLd, 3 );
10740 
10741  //do calculation
10742  vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
10743  vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
10744  vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
10745  vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
10746  vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
10747  vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );
10748 
10749  //store out results
10750  ALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
10751 
10752  // add for next iteration
10753  vecSL1 = vec_add( vecSL1, vecIncl1 );
10754  vecSL2 = vec_add( vecSL2, vecIncl2 );
10755  vecSL3 = vec_add( vecSL3, vecIncl3 );
10756  vecSL4 = vec_add( vecSL4, vecIncl4 );
10757  vecSL5 = vec_add( vecSL5, vecIncl5 );
10758  vecSL6 = vec_add( vecSL6, vecIncl6 );
10759  }
10760 }
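/*
	Scalar reference for the loop above (a sketch; each channel gain ramps
	linearly from lastV[ch] to currentV[ch] across the mix buffer):

	for ( int j = 0; j < MIXBUFFER_SAMPLES; j++ ) {
		for ( int ch = 0; ch < 6; ch++ ) {
			mixBuffer[j*6+ch] += samples[j] *
				( lastV[ch] + j * ( currentV[ch] - lastV[ch] ) / MIXBUFFER_SAMPLES );
		}
	}
*/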
10761 #else
10762 
10763 /*
10764 ============
10765 idSIMD_AltiVec::MixSoundSixSpeakerMono
10766 
10767  Assumptions:
10768  No assumptions
10769 ============
10770 */
10771 void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
10772 
10773  float incL[24];
10774  float sL[24];
10775  int i, k;
10776 
10777  vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
10778  vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
10779  vector float vecSamplesLd;
10780  vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
10781  vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
10782  // permute vectors for sample
10783  register vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
10784  register vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
10785 
10786  assert( numSamples == MIXBUFFER_SAMPLES );
10787  assert( SPEAKER_RIGHT == 1 );
10788  assert( SPEAKER_BACKRIGHT == 5 );
10789 
10790  // incL array, 6 elements repeated
10791  incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10792  incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10793  incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
10794  incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
10795  incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
10796  incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
10797 
10798  // sL array repeated
10799  for ( k = 0; k < 6; k++ ) {
10800  sL[k] = lastV[k];
10801  }
10802  for ( k = 6; k < 12; k++ ) {
10803  sL[k] = lastV[k-6] + incL[k];
10804  }
10805  for ( k = 12; k < 18; k++ ) {
10806  sL[k] = lastV[k-12] + incL[k] + incL[k];
10807  }
10808  for ( k = 18; k < 24; k++ ) {
10809  sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
10810  }
10811 
10812  // multiply by 4 since each pass covers 4 samples (24 values)
10813  for ( k = 0; k < 24; k++ ) {
10814  incL[k] *= 4;
10815  }
10816 
10817  // load the data
10818  vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
10819  vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
10820 
10821  vecIncl1 = vec_ld( 0, &incL[0] );
10822  vecIncl2 = vec_ld( 15, &incL[0] );
10823  vecIncl3 = vec_ld( 31, &incL[0] );
10824  vecIncl4 = vec_ld( 47, &incL[0] );
10825  vecIncl5 = vec_ld( 63, &incL[0] );
10826  vecIncl6 = vec_ld( 79, &incL[0] );
10827  vecIncl7 = vec_ld( 95, &incL[0] );
10828 
10829  vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
10830  vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
10831  vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
10832  vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
10833  vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
10834  vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );
10835 
10836  vecSL1 = vec_ld( 0, &sL[0] );
10837  vecSL2 = vec_ld( 15, &sL[0] );
10838  vecSL3 = vec_ld( 31, &sL[0] );
10839  vecSL4 = vec_ld( 47, &sL[0] );
10840  vecSL5 = vec_ld( 63, &sL[0] );
10841  vecSL6 = vec_ld( 79, &sL[0] );
10842  vecSL7 = vec_ld( 95, &sL[0] );
10843 
10844  vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
10845  vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
10846  vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
10847  vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
10848  vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
10849  vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );
10850 
10851  vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10852  vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
10853  vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10854  vector float vecDest = vec_ld( 0, &mixBuffer[0] );
10855 
10856  //since MIXBUFFER_SAMPLES is a multiple of 4, we don't
10857  //need a cleanup loop
10858  for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
10859  //load mix buffer into vectors
10860  vecMixBuffer1 = vecDest;
10861  vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
10862  vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
10863  vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*6] );
10864  vecMixBuffer5 = vec_ld( 63, &mixBuffer[i*6] );
10865  vecMixBuffer6 = vec_ld( 79, &mixBuffer[i*6] );
10866  vector float vecDestEnd = vec_ld( 95, &mixBuffer[i*6] );
10867 
10868  vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
10869  vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
10870  vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
10871  vecMixBuffer4 = vec_perm( vecMixBuffer4, vecMixBuffer5, mixBufferPerm );
10872  vecMixBuffer5 = vec_perm( vecMixBuffer5, vecMixBuffer6, mixBufferPerm );
10873  vecMixBuffer6 = vec_perm( vecMixBuffer6, vecDestEnd, mixBufferPerm );
10874 
10875  //load samples into vector
10876  vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
10877  vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
10878  vecSamplesLast = vecSamplesLd2;
10879 
10880  //permute to get them ordered how we want
10881  vecSamples1 = vec_splat( vecSamplesLd, 0 );
10882  vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
10883  vecSamples3 = vec_splat( vecSamplesLd, 1 );
10884  vecSamples4 = vec_splat( vecSamplesLd, 2 );
10885  vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
10886  vecSamples6 = vec_splat( vecSamplesLd, 3 );
10887 
10888  //do calculation
10889  vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
10890  vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
10891  vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
10892  vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
10893  vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
10894  vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );
10895 
10896  // store results
10897  UNALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
  // carry the trailing aligned block into the next iteration's first
  // permuted load; without this, vecMixBuffer1 would reuse stale data
  vecDest = vecDestEnd;
10898 
10899  // add for next iteration
10900  vecSL1 = vec_add( vecSL1, vecIncl1 );
10901  vecSL2 = vec_add( vecSL2, vecIncl2 );
10902  vecSL3 = vec_add( vecSL3, vecIncl3 );
10903  vecSL4 = vec_add( vecSL4, vecIncl4 );
10904  vecSL5 = vec_add( vecSL5, vecIncl5 );
10905  vecSL6 = vec_add( vecSL6, vecIncl6 );
10906  }
10907 }
10908 
10909 #endif /* SOUND_DEST_ALIGNED */
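// Note the two strategies above: the SOUND_DEST_ALIGNED path issues direct
// vec_ld / ALIGNED_STORE6 on 16-byte boundaries, while the fallback rebuilds
// misaligned vectors with vec_perm, writes back through UNALIGNED_STORE6, and
// carries the trailing aligned block (vecDestEnd) into the next iteration
// rather than re-loading it.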
10910 
10911 #ifdef SOUND_DEST_ALIGNED
10912 /*
10913 ============
10914 idSIMD_AltiVec::MixSoundSixSpeakerStereo
10915 
10916  Assumptions:
10917  Assumes that mixBuffer starts at a 16-byte aligned address
10918 ============
10919 */
10920 
10921 void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
10922 
10923  // mixBuffer is aligned
10924  assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
10925 
10926  float incL[12];
10927  float sL[12];
10928  int i;
10929  vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
10930  vector float vecSL1, vecSL2, vecSL3, vecSL4;
10931  vector float vecSamplesLd;
10932  vector float vecSamples1, vecSamples2, vecSamples3;
10933  vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;
10934  // permute vectors for sample
10935  vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
10936  vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);
10937 
10938  assert( numSamples == MIXBUFFER_SAMPLES );
10939  assert( SPEAKER_RIGHT == 1 );
10940  assert( SPEAKER_BACKRIGHT == 5 );
10941 
10942  // incL array, 6 elements repeated
10943  incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10944  incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10945  incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
10946  incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
10947  incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
10948  incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
10949 
10950  // sL array repeated
10951  sL[0] = lastV[0];
10952  sL[1] = lastV[1];
10953  sL[2] = lastV[2];
10954  sL[3] = lastV[3];
10955  sL[4] = lastV[4];
10956  sL[5] = lastV[5];
10957  sL[6] = lastV[0] + incL[0];
10958  sL[7] = lastV[1] + incL[1];
10959  sL[8] = lastV[2] + incL[2];
10960  sL[9] = lastV[3] + incL[3];
10961  sL[10] = lastV[4] + incL[4];
10962  sL[11] = lastV[5] + incL[5];
10963 
10964  // multiply by 2 since each pass covers 2 samples (12 values)
10965  incL[0] *= 2;
10966  incL[1] *= 2;
10967  incL[2] *= 2;
10968  incL[3] *= 2;
10969  incL[4] *= 2;
10970  incL[5] *= 2;
10971  incL[6] *= 2;
10972  incL[7] *= 2;
10973  incL[8] *= 2;
10974  incL[9] *= 2;
10975  incL[10] *= 2;
10976  incL[11] *= 2;
10977 
10978  // incL and sL are stack arrays with no alignment guarantee, so load them with permutes
10979  vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
10980  vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
10981  vecIncl1 = vec_ld( 0, &incL[0] );
10982  vecIncl2 = vec_ld( 15, &incL[0] );
10983  vecIncl3 = vec_ld( 31, &incL[0] );
10984  vecIncl4 = vec_ld( 47, &incL[0] );
10985 
10986  vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
10987  vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
10988  vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
10989 
10990  vecSL1 = vec_ld( 0, &sL[0] );
10991  vecSL2 = vec_ld( 15, &sL[0] );
10992  vecSL3 = vec_ld( 31, &sL[0] );
10993  vecSL4 = vec_ld( 47, &sL[0] );
10994 
10995  vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
10996  vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
10997  vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
10998 
10999  vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
11000  vector float vecSamplesLast = vec_ld( 0, &samples[0] );
11001 
11002  for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
11003 
11004  //load mix buffer into vectors, assume aligned
11005  vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
11006  vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
11007  vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
11008 
11009  //load samples into vector
11010  vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
11011  vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
11012  vecSamplesLast = vecSamplesLd2;
11013 
11014  //permute to get them ordered how we want. For the 2nd vector,
11015  //the order happens to be the same as the order we loaded them
11016  //in, so there's no need to permute that one
11017  vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
11018  vecSamples2 = vecSamplesLd;
11019  vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );
11020 
11021  //do calculation
11022  vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
11023  vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
11024  vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
11025 
11026  //store out results
11027  ALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );
11028 
11029  // add for next iteration
11030  vecSL1 = vec_add( vecSL1, vecIncl1 );
11031  vecSL2 = vec_add( vecSL2, vecIncl2 );
11032  vecSL3 = vec_add( vecSL3, vecIncl3 );
11033  }
11034 }
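/*
	Scalar reference for the loop above (a sketch). The permute vectors
	imply this channel mapping: the left input feeds channels 0, 2, 3
	and 4, the right input feeds channels 1 (SPEAKER_RIGHT) and 5
	(SPEAKER_BACKRIGHT):

	for ( int j = 0; j < MIXBUFFER_SAMPLES; j++ ) {
		float gain[6];
		for ( int ch = 0; ch < 6; ch++ ) {
			gain[ch] = lastV[ch] + j * ( currentV[ch] - lastV[ch] ) / MIXBUFFER_SAMPLES;
		}
		mixBuffer[j*6+0] += samples[j*2+0] * gain[0];
		mixBuffer[j*6+1] += samples[j*2+1] * gain[1];
		mixBuffer[j*6+2] += samples[j*2+0] * gain[2];
		mixBuffer[j*6+3] += samples[j*2+0] * gain[3];
		mixBuffer[j*6+4] += samples[j*2+0] * gain[4];
		mixBuffer[j*6+5] += samples[j*2+1] * gain[5];
	}
*/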
11035 #else
11036 
11037 /*
11038 ============
11039 idSIMD_AltiVec::MixSoundSixSpeakerStereo
11040 
11041  Assumptions:
11042  No assumptions
11043 ============
11044 */
11045 void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
11046 
11047  float incL[12];
11048  float sL[12];
11049 
11050  int i;
11051  vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
11052  vector float vecSL1, vecSL2, vecSL3, vecSL4;
11053  vector float vecSamplesLd;
11054  vector float vecSamples1, vecSamples2, vecSamples3;
11055  vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;
11056  // permute vectors for sample
11057  vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
11058  vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);
11059 
11060  assert( numSamples == MIXBUFFER_SAMPLES );
11061  assert( SPEAKER_RIGHT == 1 );
11062  assert( SPEAKER_BACKRIGHT == 5 );
11063 
11064  // incL array, 6 elements repeated
11065  incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
11066  incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
11067  incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
11068  incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
11069  incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
11070  incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
11071 
11072  // sL array repeated
11073  sL[0] = lastV[0];
11074  sL[1] = lastV[1];
11075  sL[2] = lastV[2];
11076  sL[3] = lastV[3];
11077  sL[4] = lastV[4];
11078  sL[5] = lastV[5];
11079  sL[6] = lastV[0] + incL[0];
11080  sL[7] = lastV[1] + incL[1];
11081  sL[8] = lastV[2] + incL[2];
11082  sL[9] = lastV[3] + incL[3];
11083  sL[10] = lastV[4] + incL[4];
11084  sL[11] = lastV[5] + incL[5];
11085 
11086  // multiply by 2 since each pass covers 2 samples (12 values)
11087  incL[0] *= 2;
11088  incL[1] *= 2;
11089  incL[2] *= 2;
11090  incL[3] *= 2;
11091  incL[4] *= 2;
11092  incL[5] *= 2;
11093  incL[6] *= 2;
11094  incL[7] *= 2;
11095  incL[8] *= 2;
11096  incL[9] *= 2;
11097  incL[10] *= 2;
11098  incL[11] *= 2;
11099 
11100  // load the data
11101  vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
11102  vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
11103  vecIncl1 = vec_ld( 0, &incL[0] );
11104  vecIncl2 = vec_ld( 15, &incL[0] );
11105  vecIncl3 = vec_ld( 31, &incL[0] );
11106  vecIncl4 = vec_ld( 47, &incL[0] );
11107 
11108  vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
11109  vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
11110  vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
11111 
11112  vecSL1 = vec_ld( 0, &sL[0] );
11113  vecSL2 = vec_ld( 15, &sL[0] );
11114  vecSL3 = vec_ld( 31, &sL[0] );
11115  vecSL4 = vec_ld( 47, &sL[0] );
11116 
11117  vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
11118  vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
11119  vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
11120 
11121  vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
11122  vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
11123  vector float vecSamplesLast = vec_ld( 0, &samples[0] );
11124  vector float vecDest = vec_ld( 0, &mixBuffer[0] );
11125 
11126  for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
11127 
11128  //load mix buffer into vectors
11129  vecMixBuffer1 = vecDest;
11130  vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
11131  vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
11132  vector float vecDestEnd = vec_ld( 47, &mixBuffer[i*6] );
11133 
11134  vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
11135  vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
11136  vecMixBuffer3 = vec_perm( vecMixBuffer3, vecDestEnd, mixBufferPerm );
11137 
11138  //load samples into vector
11139  vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
11140  vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
11141  vecSamplesLast = vecSamplesLd2;
11142 
11143  //permute to get them ordered how we want. For the 2nd vector,
11144  //the order happens to be the same as the order we loaded them
11145  //in, so there's no need to permute that one
11146  vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
11147  vecSamples2 = vecSamplesLd;
11148  vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );
11149 
11150  //do calculation
11151  vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
11152  vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
11153  vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
11154 
11155  // store results
11156  UNALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );
  // carry the trailing aligned block into the next iteration's first
  // permuted load; without this, vecMixBuffer1 would reuse stale data
  vecDest = vecDestEnd;
11157 
11158  // add for next iteration
11159  vecSL1 = vec_add( vecSL1, vecIncl1 );
11160  vecSL2 = vec_add( vecSL2, vecIncl2 );
11161  vecSL3 = vec_add( vecSL3, vecIncl3 );
11162  }
11163 }
11164 
11165 #endif
11166 
11167 /*
11168 ============
11169 idSIMD_AltiVec::MixedSoundToSamples
11170 ============
11171 */
11172 void VPCALL idSIMD_AltiVec::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
11173  //this is basically a clamp for sound mixing
11174  register vector float v0, v1, v2, v3, v4, v5, v6, v7;
11175  register vector signed int vi0, vi1, vi2, vi3;
11176  register vector signed short vs0, vs1;
11177  register vector float minVec, maxVec, constVec;
11178  int i = 0;
11179 
11180  // scalar loop until samples[i] reaches a 16-byte boundary (runs zero times if samples is already aligned)
11181  for ( ; NOT_16BYTE_ALIGNED( samples[i] ) && ( i < numSamples ); i++ ) {
11182  samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
11183  }
11184 
11185  constVec = (vector float)(65536.0f);
11186 
11187  //splat min/max into a vector
11188  minVec = (vector float)(-32768.0f);
11189  maxVec = (vector float)(32767.0f);
11190 
11191  vector float vecOld = vec_ld( 0, &mixBuffer[i] );
11192  vector unsigned char permVec = vec_add( vec_lvsl( -1, &mixBuffer[i] ), (vector unsigned char)(1) );
11193 
11194  //vectorize!
11195  for ( ; i+15 < numSamples; i += 16 ) {
11196  //load source
11197  v0 = vecOld;
11198  v1 = vec_ld( 15, &mixBuffer[i] );
11199  v2 = vec_ld( 31, &mixBuffer[i] );
11200  v3 = vec_ld( 47, &mixBuffer[i] );
11201  vecOld = vec_ld( 63, &mixBuffer[i] );	// 16 floats = 64 bytes per pass, so blocks through offset 63 are needed
11202 
11203  v0 = vec_perm( v0, v1, permVec );
11204  v1 = vec_perm( v1, v2, permVec );
11205  v2 = vec_perm( v2, v3, permVec );
11206  v3 = vec_perm( v3, vecOld, permVec );
11207 
11208  //apply minimum
11209  v4 = vec_max( v0, minVec );
11210  v5 = vec_max( v1, minVec );
11211  v6 = vec_max( v2, minVec );
11212  v7 = vec_max( v3, minVec );
11213 
11214  //apply maximum
11215  v4 = vec_min( v4, maxVec );
11216  v5 = vec_min( v5, maxVec );
11217  v6 = vec_min( v6, maxVec );
11218  v7 = vec_min( v7, maxVec );
11219 
11220  // convert floats to ints
11221  vi0 = vec_cts( v4, 0 );
11222  vi1 = vec_cts( v5, 0 );
11223  vi2 = vec_cts( v6, 0 );
11224  vi3 = vec_cts( v7, 0 );
11225 
11226  // pack ints into shorts; vec_pack keeps the low 16 bits, safe because values were clamped to [-32768, 32767] above
11227  vs0 = vec_pack( vi0, vi1 );
11228  vs1 = vec_pack( vi2, vi3 );
11229  ALIGNED_STORE2( &samples[i], vs0, vs1 );
11230  }
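	// conversion semantics assumed above (a sketch, per element):
	//
	//	vec_cts( v, 0 )   // float -> int32, truncating toward zero, saturated
	//	vec_pack( a, b )  // int32 -> int16, keeping the low halfword
	//
	// because every lane was clamped to [-32768.0, 32767.0] first, the
	// low-halfword pack cannot wrap, so the vector path matches the
	// scalar cleanup loop below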
11231 
11232  //handle cleanup
11233  for ( ; i < numSamples ; i++ ) {
11234  samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
11235  }
11236 }
11237 #endif /* ENABLE_SOUND_ROUTINES */
11238 
11239 #endif /* MACOS_X */