#include "../precompiled.h"
#include <ppc_intrinsics.h>

#if defined(MACOS_X) && defined(__ppc__)
#ifndef DRAWVERT_PADDED
    // an unpadded idDrawVert is 15 floats wide
    #define DRAWVERT_OFFSET 15
#else
    // a padded idDrawVert is 16 floats wide
    #define DRAWVERT_OFFSET 16
#endif

#define PLANE_OFFSET 4
#define IDVEC4_OFFSET 4

#define IS_16BYTE_ALIGNED( x ) ( ( (unsigned long)&x & 0x0F ) == 0 )
#define NOT_16BYTE_ALIGNED( x ) ( ( (unsigned long)&x & 0x0F ) != 0 )
#define ALIGNED_STORE2( ADDR, V0, V1 ) \
    vec_st( V0, 0, ADDR ); \
    vec_st( V1, 16, ADDR )

#define ALIGNED_STORE3( ADDR, V0, V1, V2 ) \
    vec_st( V0, 0, ADDR ); \
    vec_st( V1, 16, ADDR ); \
    vec_st( V2, 32, ADDR )

#define ALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) \
    vec_st( V0, 0, ADDR ); \
    vec_st( V1, 16, ADDR ); \
    vec_st( V2, 32, ADDR ); \
    vec_st( V3, 48, ADDR )

#define ALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) \
    vec_st( V0, 0, ADDR ); \
    vec_st( V1, 16, ADDR ); \
    vec_st( V2, 32, ADDR ); \
    vec_st( V3, 48, ADDR ); \
    vec_st( V4, 64, ADDR ); \
    vec_st( V5, 80, ADDR )

#define ALIGNED_STORE8( ADDR, V0, V1, V2, V3, V4, V5, V6, V7 ) \
    vec_st( V0, 0, ADDR ); \
    vec_st( V1, 16, ADDR ); \
    vec_st( V2, 32, ADDR ); \
    vec_st( V3, 48, ADDR ); \
    vec_st( V4, 64, ADDR ); \
    vec_st( V5, 80, ADDR ); \
    vec_st( V6, 96, ADDR ); \
    vec_st( V7, 112, ADDR )
#define UNALIGNED_STORE1( ADDR, V0 ) { \
    /* rotate the data into store position, then store one word at a time */ \
    vector unsigned char ULStoreMacroPerm = vec_lvsr( 0, ADDR ); \
    V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
    vec_ste( V0, 0, ADDR ); \
    vec_ste( V0, 4, ADDR ); \
    vec_ste( V0, 8, ADDR ); \
    vec_ste( V0, 12, ADDR ); }
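/*
    Illustrative sketch (not part of the original file): how UNALIGNED_STORE1
    behaves. vec_lvsr builds a permute map from the low four address bits, so
    the vec_perm rotates each element into the slot vec_ste expects for that
    misalignment; the four vec_ste calls then each store a single float,
    leaving the surrounding bytes untouched.
*/
#if 0
static void ExampleUnalignedStore1( float *p ) {    // p may be quadword-misaligned
    vector float v = (vector float)( 1.0f, 2.0f, 3.0f, 4.0f );
    UNALIGNED_STORE1( p, v );    // p[0..3] = 1,2,3,4 for any 4-byte aligned p
}
#endif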
#define UNALIGNED_STORE2( ADDR, V0, V1 ) { \
    /* load the quadwords that are already at the edges of the destination */ \
    vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
    vector float ULStoreMacro2 = vec_ld( 31, ADDR ); \
    /* build the permute map and select mask for this misalignment */ \
    vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
    vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
    /* rotate the data into store position */ \
    V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
    V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
    /* select between the existing and the new data */ \
    vector float ULStoreVal1, ULStoreVal2, ULStoreVal3; \
    ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
    ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
    ULStoreVal3 = vec_sel( V1, ULStoreMacro2, ULStoreMacroMask ); \
    /* store the results */ \
    vec_st( ULStoreVal1, 0, ADDR ); \
    vec_st( ULStoreVal2, 15, ADDR ); \
    vec_st( ULStoreVal3, 31, ADDR ); }
#define UNALIGNED_STORE3( ADDR, V0, V1, V2 ) { \
    vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
    vector float ULStoreMacro2 = vec_ld( 47, ADDR ); \
    vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
    vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
    V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
    V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
    V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
    vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4; \
    ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
    ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
    ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
    ULStoreVal4 = vec_sel( V2, ULStoreMacro2, ULStoreMacroMask ); \
    vec_st( ULStoreVal1, 0, ADDR ); \
    vec_st( ULStoreVal2, 15, ADDR ); \
    vec_st( ULStoreVal3, 31, ADDR ); \
    vec_st( ULStoreVal4, 47, ADDR ); }

#define UNALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) { \
    vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
    vector float ULStoreMacro2 = vec_ld( 63, ADDR ); \
    vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
    vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
    V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
    V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
    V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
    V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
    vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5; \
    ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
    ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
    ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
    ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
    ULStoreVal5 = vec_sel( V3, ULStoreMacro2, ULStoreMacroMask ); \
    vec_st( ULStoreVal1, 0, ADDR ); \
    vec_st( ULStoreVal2, 15, ADDR ); \
    vec_st( ULStoreVal3, 31, ADDR ); \
    vec_st( ULStoreVal4, 47, ADDR ); \
    vec_st( ULStoreVal5, 63, ADDR ); }

#define UNALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) { \
    vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
    vector float ULStoreMacro2 = vec_ld( 95, ADDR ); \
    vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
    vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
    V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
    V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
    V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
    V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
    V4 = vec_perm( V4, V4, ULStoreMacroPerm ); \
    V5 = vec_perm( V5, V5, ULStoreMacroPerm ); \
    vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5, ULStoreVal6, ULStoreVal7; \
    ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
    ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
    ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
    ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
    ULStoreVal5 = vec_sel( V3, V4, ULStoreMacroMask ); \
    ULStoreVal6 = vec_sel( V4, V5, ULStoreMacroMask ); \
    ULStoreVal7 = vec_sel( V5, ULStoreMacro2, ULStoreMacroMask ); \
    vec_st( ULStoreVal1, 0, ADDR ); \
    vec_st( ULStoreVal2, 15, ADDR ); \
    vec_st( ULStoreVal3, 31, ADDR ); \
    vec_st( ULStoreVal4, 47, ADDR ); \
    vec_st( ULStoreVal5, 63, ADDR ); \
    vec_st( ULStoreVal6, 79, ADDR ); \
    vec_st( ULStoreVal7, 95, ADDR ); }

#define UNALIGNED_STORE9( ADDR, V0, V1, V2, V3, V4, V5, V6, V7, V8 ) { \
    vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
    vector float ULStoreMacro2 = vec_ld( 143, ADDR ); \
    vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
    vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
    V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
    V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
    V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
    V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
    V4 = vec_perm( V4, V4, ULStoreMacroPerm ); \
    V5 = vec_perm( V5, V5, ULStoreMacroPerm ); \
    V6 = vec_perm( V6, V6, ULStoreMacroPerm ); \
    V7 = vec_perm( V7, V7, ULStoreMacroPerm ); \
    V8 = vec_perm( V8, V8, ULStoreMacroPerm ); \
    vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5, ULStoreVal6, ULStoreVal7; \
    vector float ULStoreVal8, ULStoreVal9, ULStoreVal10; \
    ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
    ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
    ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
    ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
    ULStoreVal5 = vec_sel( V3, V4, ULStoreMacroMask ); \
    ULStoreVal6 = vec_sel( V4, V5, ULStoreMacroMask ); \
    ULStoreVal7 = vec_sel( V5, V6, ULStoreMacroMask ); \
    ULStoreVal8 = vec_sel( V6, V7, ULStoreMacroMask ); \
    ULStoreVal9 = vec_sel( V7, V8, ULStoreMacroMask ); \
    ULStoreVal10 = vec_sel( V8, ULStoreMacro2, ULStoreMacroMask ); \
    vec_st( ULStoreVal1, 0, ADDR ); \
    vec_st( ULStoreVal2, 15, ADDR ); \
    vec_st( ULStoreVal3, 31, ADDR ); \
    vec_st( ULStoreVal4, 47, ADDR ); \
    vec_st( ULStoreVal5, 63, ADDR ); \
    vec_st( ULStoreVal6, 79, ADDR ); \
    vec_st( ULStoreVal7, 95, ADDR ); \
    vec_st( ULStoreVal8, 111, ADDR ); \
    vec_st( ULStoreVal9, 127, ADDR ); \
    vec_st( ULStoreVal10, 143, ADDR ); }
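/*
    Sketch of the pattern the UNALIGNED_STOREn macros share (illustrative, not
    from the original file). The first and last quadwords the destination
    overlaps are read up front, every source vector is rotated with one
    lvsr-derived permute map, and vec_sel stitches adjacent rotated vectors
    together so the bytes before and after the destination are written back
    unchanged. Offsets of 15, 31, ... work because vec_ld/vec_st clear the low
    four address bits, so ADDR+15 names the next aligned quadword.
*/
#if 0
static void ExampleUnalignedStore2( float *p ) {    // p may be quadword-misaligned
    vector float a = (vector float)( 0.0f, 1.0f, 2.0f, 3.0f );
    vector float b = (vector float)( 4.0f, 5.0f, 6.0f, 7.0f );
    UNALIGNED_STORE2( p, a, b );    // p[0..7] = 0..7, neighboring memory preserved
}
#endif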
// debug output helpers: print a vector with a label, one overload per element type
inline void debugPrintVector( vector float v, char *msg ) {
    printf( "%s -- %vf\n", msg, v );
}

inline void debugPrintVector( vector unsigned int v, char *msg ) {
    printf( "%s -- %vd\n", msg, v );
}

inline void debugPrintVector( vector bool int v, char *msg ) {
    printf( "%s -- %vi\n", msg, v );
}

inline void debugPrintVector( vector unsigned char v, char *msg ) {
    printf( "%s -- %vuc\n", msg, v );
}

inline void debugPrintVector( vector unsigned short v, char *msg ) {
    printf( "%s -- %vs\n", msg, v );
}
/*
    Reciprocal

    For each element, calculates 1/x with one round of Newton-Raphson
    refinement of the hardware estimate.
*/
inline vector float Reciprocal( vector float v ) {
    // get the hardware estimate of 1/v
    vector float estimate = vec_re( v );
    // refine: e' = e + e * ( 1 - v * e )
    return vec_madd( vec_nmsub( estimate, v, (vector float)(1.0) ), estimate, estimate );
}
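/*
    Why a single refinement suffices (sketch, not from the original): for
    f(x) = 1/x - v, Newton-Raphson gives x' = x + x*(1 - v*x), which is the
    vec_nmsub/vec_madd pair above. Each step roughly doubles the number of
    correct bits, so the ~12-bit vec_re estimate reaches nearly full single
    precision. Scalar equivalent:
*/
#if 0
static float ScalarReciprocalSketch( float v ) {
    float e = 1.0f / v;                 // stand-in for the hardware estimate vec_re gives
    return e + e * ( 1.0f - v * e );    // one Newton-Raphson step
}
#endif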
/*
    ReciprocalSquareRoot

    For each element, calculates 1/sqrt(x) with one round of Newton-Raphson
    refinement of the hardware estimate.
*/
inline vector float ReciprocalSquareRoot( vector float v ) {
    vector float zero = (vector float)(0);
    vector float oneHalf = (vector float)(0.5);
    vector float one = (vector float)(1.0);
    // clamp to FLT_MIN so a zero input doesn't turn the estimate into a NaN
    vector float estimate = vec_rsqrte( vec_max( v, (vector float)(FLT_MIN) ) );

    // refine: e' = e + 0.5 * e * ( 1 - v * e * e )
    vector float estimateSquared = vec_madd( estimate, estimate, zero );
    vector float halfEstimate = vec_madd( estimate, oneHalf, zero );
    return vec_madd( vec_nmsub( v, estimateSquared, one ), halfEstimate, estimate );
}
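/*
    Sketch (not from the original): for f(x) = 1/x^2 - v the Newton-Raphson
    update is x' = x + 0.5*x*(1 - v*x*x), which is exactly what the
    estimateSquared/halfEstimate lines compute. Scalar equivalent (assumes
    <math.h> for sqrtf):
*/
#if 0
static float ScalarRSqrtSketch( float v ) {
    float e = 1.0f / sqrtf( v );                    // stand-in for vec_rsqrte
    return e + 0.5f * e * ( 1.0f - v * e * e );     // one refinement step
}
#endif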
/*
    Divide

    For each element, calculates a / b using the refined reciprocal.
*/
inline vector float Divide( vector float a, vector float b ) {
    return vec_madd( a, Reciprocal( b ), (vector float)(0) );
}
/*
    loadSplatUnalignedScalar

    Loads a float from a (possibly quadword-misaligned) address and splats
    it across all four elements of a vector.
*/
inline vector float loadSplatUnalignedScalar( const float *s ) {
    vector unsigned char splatMap = vec_lvsl( 0, s );
    vector float v = vec_ld( 0, s );
    splatMap = (vector unsigned char) vec_splat( (vector float) splatMap, 0 );
    return vec_perm( v, v, splatMap );
}
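/*
    Usage sketch (illustrative): vec_lvsl yields the permute map that would
    shift the addressed scalar into element 0; splatting the first four bytes
    of that map across the vector and permuting with it instead replicates the
    addressed float into all four lanes. Any 4-byte aligned float works:
*/
#if 0
static vector float ExampleSplatY( const idVec3 &v ) {
    // broadcast v.y into all four elements, whatever the alignment of v
    return loadSplatUnalignedScalar( v.ToFloatPtr() + 1 );
}
#endif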
/*
    VectorATan16

    For each element, calculates atan( x / y ) using a polynomial
    approximation.
*/
inline vector float VectorATan16( vector float x, vector float y ) {
    vector float xDivY = Divide( x, y );
    vector float yDivX = Divide( y, x );
    vector float zeroVector = (vector float)(0);

    // work on the ratio with the smaller magnitude
    vector bool int vecCmp = vec_cmpgt( vec_abs( y ), vec_abs( x ) );
    vector float vecA = vec_sel( yDivX, xDivY, vecCmp );
    vector bool int vecCmp2 = vec_cmplt( vecA, zeroVector );
    vector float vecS = vec_madd( vecA, vecA, (vector float)(0) );

    // evaluate the polynomial with Horner's scheme
    vector float vecWork1 = vec_madd( (vector float)(0.0028662257f), vecS, (vector float)(-0.0161657367f) );
    vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.0429096138f) );
    vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.0752896400f) );
    vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1065626393f) );
    vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.1420889944f) );
    vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1999355085f) );
    vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.3333314528f) );
    vecWork1 = vec_madd( vecWork1, vecS, (vector float)(1) );

    vecS = vec_madd( vecWork1, vecA, (vector float)(0) );

    // correct by +/- half pi depending on the sign of the ratio
    vector float negSPlusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(0.5f * 3.14159265358979323846f) );
    vector float negSMinusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(-0.5f * 3.14159265358979323846f) );
    vector float modRet = vec_sel( negSPlusHalfPI, negSMinusHalfPI, vecCmp2 );

    return vec_sel( modRet, vecS, vecCmp );
}
/*
    VectorSin16

    For each element, calculates sin( x ) using a polynomial approximation
    after reducing the angle into the polynomial's accurate range.
*/
inline vector float VectorSin16( vector float v ) {
    vector float zero = (vector float)(0);

#if 0
    // load up half pi and use it to calculate the other constants. This is
    // sometimes cheaper than loading them from memory
    vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
    vector float PI = vec_add( halfPI, halfPI );
    vector float oneandhalfPI = vec_add( PI, halfPI );
    vector float twoPI = vec_add( oneandhalfPI, halfPI );
#else
    vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
    vector float PI = (vector float)(3.14159265358979323846f);
    vector float oneandhalfPI = (vector float)(3.14159265358979323846f + ( 0.5f * 3.14159265358979323846f ) );
    vector float twoPI = (vector float)( 2.0f * 3.14159265358979323846f );
#endif

    vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4;
    vector float vecMod;
    vector float vecResult;

    // wrap the angle into [ 0, 2pi )
    vecMod = vec_floor( Divide( v, twoPI ) );
    vecResult = vec_nmsub( vecMod, twoPI, v );

    vector float vecPIminusA = vec_sub( PI, vecResult );
    vector float vecAminus2PI = vec_sub( vecResult, twoPI );

    vecCmp1 = vec_cmplt( vecResult, PI );
    vecCmp2 = vec_cmpgt( vecResult, halfPI );

    vecCmp3 = vec_cmpgt( vecResult, oneandhalfPI );

    // mask for angles in [ pi, 1.5pi ]
    vecCmp4 = vec_and( vec_xor( vecCmp3, (vector bool int)(1) ), vec_xor( vecCmp1, (vector bool int)(1) ) );

    // second and third quadrants use pi - a, fourth quadrant uses a - 2pi
    vecCmp1 = vec_and( vecCmp1, vecCmp2 );
    vecCmp1 = vec_or( vecCmp1, vecCmp4 );

    vecResult = vec_sel( vecResult, vecPIminusA, vecCmp1 );
    vecResult = vec_sel( vecResult, vecAminus2PI, vecCmp3 );

    // evaluate the sin polynomial
    vector float vecASquared = vec_madd( vecResult, vecResult, zero );
    vector float vecEst = vec_madd( (vector float)(-2.39e-08f), vecASquared, (vector float)(2.7526e-06f) );
    vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.98409e-04f) );
    vecEst = vec_madd( vecEst, vecASquared, (vector float)(8.3333315e-03f) );
    vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.666666664e-01f) );
    vecEst = vec_madd( vecEst, vecASquared, (vector float)(1.0f) );
    return vec_madd( vecResult, vecEst, zero );
}
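/*
    Sketch of the range reduction above (not from the original): the angle is
    wrapped into [0, 2*pi) via a = v - floor(v / 2*pi)*2*pi, then folded into
    [-pi/2, pi/2] where the polynomial is accurate. Scalar outline of the fold
    the vec_cmp/vec_sel chain implements:
*/
#if 0
static float ScalarSin16Fold( float a ) {    // a already in [0, 2*pi)
    const float PI = 3.14159265358979323846f;
    if ( a > 0.5f * PI && a <= 1.5f * PI ) {
        a = PI - a;             // second and third quadrants: sin(a) == sin(pi - a)
    } else if ( a > 1.5f * PI ) {
        a = a - 2.0f * PI;      // fourth quadrant: sin(a) == sin(a - 2*pi)
    }
    return a;                   // feed this to the sin polynomial
}
#endif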
/*
    vecSplatWithRunTime

    Splats element i (a runtime value) of v across all four elements.
*/
inline vector float vecSplatWithRunTime( vector float v, int i ) {
    vector unsigned char rotate = vec_lvsl( i * sizeof( float ), (int*) 0L );
    v = vec_perm( v, v, rotate );
    return vec_splat( v, 0 );
}
/*
    FastScalarInvSqrt

    Calculates 1/sqrt(x) for one float using the PPC hardware estimate plus
    two rounds of Newton-Raphson refinement.
*/
inline float FastScalarInvSqrt( float f ) {
#ifdef PPC_INTRINSICS
    float estimate;
    const float kSmallestFloat = FLT_MIN;

    // calculate a 5-bit starting estimate for the reciprocal sqrt
    estimate = __frsqrte( f + kSmallestFloat );

    // refine the estimate; drop an iteration if less precision is acceptable
    estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
    estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );

    return estimate;
#else
    return idMath::InvSqrt( f );
#endif
}
/*
    FastScalarInvSqrt_x3

    Calculates 1/sqrt(x) for three floats at once; the three independent
    estimate chains can overlap in the FPU pipeline.
*/
inline void FastScalarInvSqrt_x3( float *arg1, float *arg2, float *arg3 ) {
#ifdef PPC_INTRINSICS
    register float estimate1, estimate2, estimate3;
    const float kSmallestFloat = FLT_MIN;

    // calculate 5-bit starting estimates
    estimate1 = __frsqrte( *arg1 + kSmallestFloat );
    estimate2 = __frsqrte( *arg2 + kSmallestFloat );
    estimate3 = __frsqrte( *arg3 + kSmallestFloat );

    // two rounds of Newton-Raphson refinement each
    estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
    estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
    estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
    estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
    estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
    estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );

    *arg1 = estimate1;
    *arg2 = estimate2;
    *arg3 = estimate3;
#else
    *arg1 = idMath::InvSqrt( *arg1 );
    *arg2 = idMath::InvSqrt( *arg2 );
    *arg3 = idMath::InvSqrt( *arg3 );
#endif
}
/*
    FastScalarInvSqrt_x6

    Calculates 1/sqrt(x) for six floats at once; the six independent
    estimate chains can overlap in the FPU pipeline.
*/
inline void FastScalarInvSqrt_x6( float *arg1, float *arg2, float *arg3, float *arg4, float *arg5, float *arg6 ) {
#ifdef PPC_INTRINSICS
    register float estimate1, estimate2, estimate3, estimate4, estimate5, estimate6;
    const float kSmallestFloat = FLT_MIN;

    // calculate 5-bit starting estimates
    estimate1 = __frsqrte( *arg1 + kSmallestFloat );
    estimate2 = __frsqrte( *arg2 + kSmallestFloat );
    estimate3 = __frsqrte( *arg3 + kSmallestFloat );
    estimate4 = __frsqrte( *arg4 + kSmallestFloat );
    estimate5 = __frsqrte( *arg5 + kSmallestFloat );
    estimate6 = __frsqrte( *arg6 + kSmallestFloat );

    // two rounds of Newton-Raphson refinement each
    estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
    estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
    estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
    estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
    estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
    estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );

    estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
    estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
    estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
    estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
    estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
    estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );

    *arg1 = estimate1;
    *arg2 = estimate2;
    *arg3 = estimate3;
    *arg4 = estimate4;
    *arg5 = estimate5;
    *arg6 = estimate6;
#else
    *arg1 = idMath::InvSqrt( *arg1 );
    *arg2 = idMath::InvSqrt( *arg2 );
    *arg3 = idMath::InvSqrt( *arg3 );
    *arg4 = idMath::InvSqrt( *arg4 );
    *arg5 = idMath::InvSqrt( *arg5 );
    *arg6 = idMath::InvSqrt( *arg6 );
#endif
}
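/*
    Why the batched variants exist (sketch, not from the original file): each
    refinement line depends only on its own previous estimate, so the _x3 and
    _x6 forms hand the FPU three or six independent dependency chains that can
    overlap in the pipeline instead of running one chain serially. Typical use:
*/
#if 0
static void ExampleBatchedInvSqrt( float &a, float &b, float &c ) {
    // refine all three estimates in flight at once
    FastScalarInvSqrt_x3( &a, &b, &c );
}
#endif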
#ifdef ENABLE_SIMPLE_MATH

/*
============
idSIMD_AltiVec::Add

  dst[i] = constant + src[i];
============
*/
void VPCALL idSIMD_AltiVec::Add( float *dst, const float constant, const float *src, const int count ) {
    register vector float v0, v1, v2, v3;
    vector float v0_low, v0_hi, v1_hi;
    vector unsigned char permVec;
    vector float constVec;
    int i;

    // handle unaligned case at beginning
    for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant + src[i];
    }

    // splat constant into a vector
    constVec = loadSplatUnalignedScalar( &constant );

    // calculate permute vector and do first load
    permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), (vector unsigned char)(1) );
    v1_hi = vec_ld( 0, &src[i] );

    // vectorize!
    for ( ; i+7 < count; i += 8 ) {
        // load source, reusing the previous high load as the new low half
        v0_low = v1_hi;
        v0_hi = vec_ld( 15, &src[i] );
        v1_hi = vec_ld( 31, &src[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec );
        v1 = vec_perm( v0_hi, v1_hi, permVec );

        v2 = vec_add( v0, constVec );
        v3 = vec_add( v1, constVec );

        // store results
        ALIGNED_STORE2( &dst[i], v2, v3 );
    }

    // handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] = constant + src[i];
    }
}
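/*
    Sketch of the load idiom Add and the routines below share (not from the
    original file): vec_lvsl( -1, ptr ) plus 1 builds a permute map for the
    misalignment of ptr, one quadword is preloaded before the loop, and each
    iteration reuses the previous high load as its new low half, so four
    floats cost one fresh vec_ld. The "+15" load still lands on the right
    quadword when ptr happens to be aligned, because vec_ld drops the low
    four address bits.
*/
#if 0
static void ExampleMisalignedLoadLoop( const float *src, float *dst16, int count ) {
    // dst16 assumed 16-byte aligned; src may be unaligned
    vector unsigned char perm = vec_add( vec_lvsl( -1, (int*) src ), (vector unsigned char)(1) );
    vector float hi = vec_ld( 0, src );
    for ( int i = 0; i+3 < count; i += 4 ) {
        vector float low = hi;                        // reuse the previous load
        hi = vec_ld( 15, &src[i] );                   // next overlapping quadword
        vector float v = vec_perm( low, hi, perm );   // aligned view of src[i..i+3]
        vec_st( v, 0, &dst16[i] );
    }
}
#endif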
/*
============
idSIMD_AltiVec::Add

  dst[i] = src0[i] + src1[i];
============
*/
void VPCALL idSIMD_AltiVec::Add( float *dst, const float *src0, const float *src1, const int count ) {
    register vector float v0, v1, v2, v3, v4, v5;
    // source 0
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    // source 1
    register vector float v1_low, v1_hi, v3_low, v3_hi;
    // permute vectors
    register vector unsigned char permVec1, permVec2;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = src0[i] + src1[i];
    }

    // calculate permute vectors and do first loads
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
    permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src0[i] );
    v3_hi = vec_ld( 0, &src1[i] );

    // vectorize!
    for ( ; i+7 < count; i += 8 ) {
        // load sources, reusing the previous high loads as the new low halves
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src0[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src0[i] );

        v1_low = v3_hi;
        v1_hi = vec_ld( 15, &src1[i] );
        v3_low = v1_hi;
        v3_hi = vec_ld( 31, &src1[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v1 = vec_perm( v1_low, v1_hi, permVec2 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );
        v3 = vec_perm( v3_low, v3_hi, permVec2 );

        v4 = vec_add( v0, v1 );
        v5 = vec_add( v2, v3 );

        ALIGNED_STORE2( &dst[i], v4, v5 );
    }

    // handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] = src0[i] + src1[i];
    }
}
/*
============
idSIMD_AltiVec::Sub

  dst[i] = constant - src[i];
============
*/
void VPCALL idSIMD_AltiVec::Sub( float *dst, const float constant, const float *src, const int count ) {
    register vector float v0, v1, v2, v3;
    register vector float v0_low, v0_hi, v1_low, v1_hi;
    register vector unsigned char permVec;
    register vector float constVec;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant - src[i];
    }

    // splat constant into a vector
    constVec = loadSplatUnalignedScalar( &constant );

    // calculate permute vector and do first load
    permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
    v1_hi = vec_ld( 0, &src[i] );

    // vectorize!
    for ( ; i+7 < count; i += 8 ) {
        // load source, reusing the previous high load as the new low half
        v0_low = v1_hi;
        v0_hi = vec_ld( 15, &src[i] );
        v1_low = v0_hi;
        v1_hi = vec_ld( 31, &src[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec );
        v1 = vec_perm( v1_low, v1_hi, permVec );

        v2 = vec_sub( constVec, v0 );
        v3 = vec_sub( constVec, v1 );

        ALIGNED_STORE2( &dst[i], v2, v3 );
    }

    // handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] = constant - src[i];
    }
}
/*
============
idSIMD_AltiVec::Sub

  dst[i] = src0[i] - src1[i];
============
*/
void VPCALL idSIMD_AltiVec::Sub( float *dst, const float *src0, const float *src1, const int count ) {
    register vector float v0, v1, v2, v3, v4, v5;
    // source 0
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    // source 1
    register vector float v1_low, v1_hi, v3_low, v3_hi;
    register vector unsigned char permVec1, permVec2;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = src0[i] - src1[i];
    }

    // calculate permute vectors and do first loads
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
    permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src0[i] );
    v3_hi = vec_ld( 0, &src1[i] );

    // vectorize!
    for ( ; i+7 < count; i += 8 ) {
        // load sources, reusing the previous high loads as the new low halves
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src0[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src0[i] );

        v1_low = v3_hi;
        v1_hi = vec_ld( 15, &src1[i] );
        v3_low = v1_hi;
        v3_hi = vec_ld( 31, &src1[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v1 = vec_perm( v1_low, v1_hi, permVec2 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );
        v3 = vec_perm( v3_low, v3_hi, permVec2 );

        v4 = vec_sub( v0, v1 );
        v5 = vec_sub( v2, v3 );

        ALIGNED_STORE2( &dst[i], v4, v5 );
    }

    // handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] = src0[i] - src1[i];
    }
}
/*
============
idSIMD_AltiVec::Mul

  dst[i] = constant * src[i];
============
*/
void VPCALL idSIMD_AltiVec::Mul( float *dst, const float constant, const float *src, const int count ) {
    register vector float v0, v0_low, v0_hi, v1_low, v1_hi, v1, v2, v3;
    register vector float constVec;
    register vector unsigned char permVec;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    register vector float zeroVector = (vector float)(0.0);
    int i;

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant * src[i];
    }

    // splat constant into a vector
    constVec = loadSplatUnalignedScalar( &constant );

    // calculate permute vector and do first load
    permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
    v1_hi = vec_ld( 0, &src[i] );

    // vectorize!
    for ( ; i+7 < count; i += 8 ) {
        // load source, reusing the previous high load as the new low half
        v0_low = v1_hi;
        v0_hi = vec_ld( 15, &src[i] );
        v1_low = v0_hi;
        v1_hi = vec_ld( 31, &src[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec );
        v1 = vec_perm( v1_low, v1_hi, permVec );

        // multiply (madd with a zero addend)
        v2 = vec_madd( constVec, v0, zeroVector );
        v3 = vec_madd( constVec, v1, zeroVector );

        ALIGNED_STORE2( &dst[i], v2, v3 );
    }

    // handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] = constant * src[i];
    }
}
/*
============
idSIMD_AltiVec::Mul

  dst[i] = src0[i] * src1[i];
============
*/
void VPCALL idSIMD_AltiVec::Mul( float *dst, const float *src0, const float *src1, const int count ) {
    register vector float v0, v1, v2, v3, v4, v5;
    // source 0
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    // source 1
    register vector float v1_low, v1_hi, v3_low, v3_hi;
    // permute vectors
    register vector unsigned char permVec1, permVec2;
    // zero vector used as the madd addend
    register vector float constVec = (vector float)(0.0);
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    // handle unaligned case at beginning
    for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = src0[i] * src1[i];
    }

    // calculate permute vectors and do first loads
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
    permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src0[i] );
    v3_hi = vec_ld( 0, &src1[i] );

    // vectorize!
    for ( ; i+7 < count; i += 8 ) {
        // load sources, reusing the previous high loads as the new low halves
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src0[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src0[i] );

        v1_low = v3_hi;
        v1_hi = vec_ld( 15, &src1[i] );
        v3_low = v1_hi;
        v3_hi = vec_ld( 31, &src1[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v1 = vec_perm( v1_low, v1_hi, permVec2 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );
        v3 = vec_perm( v3_low, v3_hi, permVec2 );

        // multiply (madd with a zero addend)
        v4 = vec_madd( v0, v1, constVec );
        v5 = vec_madd( v2, v3, constVec );

        ALIGNED_STORE2( &dst[i], v4, v5 );
    }

    // handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] = src0[i] * src1[i];
    }
}
/*
============
idSIMD_AltiVec::Div

  dst[i] = constant / divisor[i];
============
*/
void VPCALL idSIMD_AltiVec::Div( float *dst, const float constant, const float *divisor, const int count ) {
    register vector float v0, v1, v2, v3;
    register vector float v0_low, v0_hi, v1_low, v1_hi;
    register vector unsigned char permVec;
    register vector float constVec;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant / divisor[i];
    }

    // splat constant into a vector
    constVec = loadSplatUnalignedScalar( &constant );

    // calculate permute vector and do first load
    permVec = vec_add( vec_lvsl( -1, (int*) &divisor[i] ), oneCharVector );
    v1_hi = vec_ld( 0, &divisor[i] );

    // vectorize!
    for ( ; i+7 < count; i += 8 ) {
        // load source, reusing the previous high load as the new low half
        v0_low = v1_hi;
        v0_hi = vec_ld( 15, &divisor[i] );
        v1_low = v0_hi;
        v1_hi = vec_ld( 31, &divisor[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec );
        v1 = vec_perm( v1_low, v1_hi, permVec );

        v2 = Divide( constVec, v0 );
        v3 = Divide( constVec, v1 );

        ALIGNED_STORE2( &dst[i], v2, v3 );
    }

    // handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] = constant / divisor[i];
    }
}
/*
============
idSIMD_AltiVec::Div

  dst[i] = src0[i] / src1[i];
============
*/
void VPCALL idSIMD_AltiVec::Div( float *dst, const float *src0, const float *src1, const int count ) {
    register vector float v0, v1, v2, v3, v4, v5;
    // source 0
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    // source 1
    register vector float v1_low, v1_hi, v3_low, v3_hi;
    register vector unsigned char permVec1, permVec2;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    // handle unaligned case at beginning
    for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = src0[i] / src1[i];
    }

    // calculate permute vectors and do first loads
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
    permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src0[i] );
    v3_hi = vec_ld( 0, &src1[i] );

    // vectorize!
    for ( ; i+7 < count; i += 8 ) {
        // load sources, reusing the previous high loads as the new low halves
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src0[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src0[i] );

        v1_low = v3_hi;
        v1_hi = vec_ld( 15, &src1[i] );
        v3_low = v1_hi;
        v3_hi = vec_ld( 31, &src1[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v1 = vec_perm( v1_low, v1_hi, permVec2 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );
        v3 = vec_perm( v3_low, v3_hi, permVec2 );

        v4 = Divide( v0, v1 );
        v5 = Divide( v2, v3 );

        ALIGNED_STORE2( &dst[i], v4, v5 );
    }

    // handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] = src0[i] / src1[i];
    }
}
/*
============
idSIMD_AltiVec::MulAdd

  dst[i] += constant * src[i];
============
*/
void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float constant, const float *src, const int count ) {
    register vector float v0, v1, v2, v3, v4, v5;
    register vector float constVec;
    // source
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    // permute vector
    register vector unsigned char permVec1;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] += constant * src[i];
    }

    // splat constant into a vector
    constVec = loadSplatUnalignedScalar( &constant );

    // calculate permute vector and do first load
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src[i] );

    // vectorize!
    for ( ; i+7 < count; i += 8 ) {
        // load source, reusing the previous high load as the new low half
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );

        // the alignment loop above guarantees dst is aligned here
        v1 = vec_ld( 0, &dst[i] );
        v3 = vec_ld( 16, &dst[i] );

        v4 = vec_madd( constVec, v0, v1 );
        v5 = vec_madd( constVec, v2, v3 );

        ALIGNED_STORE2( &dst[i], v4, v5 );
    }

    // handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] += constant * src[i];
    }
}
/*
============
idSIMD_AltiVec::MulAdd

  dst[i] += src0[i] * src1[i];
============
*/
void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
    register vector float v0, v1, v2, v3, v4, v5, v6, v7;
    // source 0
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    // source 1
    register vector float v1_low, v1_hi, v3_low, v3_hi;
    register vector unsigned char permVec1, permVec2;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] += src0[i] * src1[i];
    }

    // calculate permute vectors and do first loads
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
    permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src0[i] );
    v3_hi = vec_ld( 0, &src1[i] );

    // vectorize!
    for ( ; i+7 < count; i += 8 ) {
        // load sources, reusing the previous high loads as the new low halves
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src0[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src0[i] );

        v1_low = v3_hi;
        v1_hi = vec_ld( 15, &src1[i] );
        v3_low = v1_hi;
        v3_hi = vec_ld( 31, &src1[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v1 = vec_perm( v1_low, v1_hi, permVec2 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );
        v3 = vec_perm( v3_low, v3_hi, permVec2 );

        // the alignment loop above guarantees dst is aligned here
        v4 = vec_ld( 0, &dst[i] );
        v5 = vec_ld( 16, &dst[i] );

        v6 = vec_madd( v0, v1, v4 );
        v7 = vec_madd( v2, v3, v5 );

        ALIGNED_STORE2( &dst[i], v6, v7 );
    }

    // handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] += src0[i] * src1[i];
    }
}
/*
============
idSIMD_AltiVec::MulSub

  dst[i] -= constant * src[i];
============
*/
void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float constant, const float *src, const int count ) {
    register vector float v0, v1, v2, v3, v4, v5;
    register vector float constVec;
    // source
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    // permute vector
    register vector unsigned char permVec1;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] -= constant * src[i];
    }

    // splat constant into a vector
    constVec = loadSplatUnalignedScalar( &constant );

    // calculate permute vector and do first load
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src[i] );

    // vectorize!
    for ( ; i+7 < count; i += 8 ) {
        // load source, reusing the previous high load as the new low half
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );

        // the alignment loop above guarantees dst is aligned here
        v1 = vec_ld( 0, &dst[i] );
        v3 = vec_ld( 16, &dst[i] );

        // nmsub computes dst - src * constant
        v4 = vec_nmsub( v0, constVec, v1 );
        v5 = vec_nmsub( v2, constVec, v3 );

        ALIGNED_STORE2( &dst[i], v4, v5 );
    }

    // handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] -= constant * src[i];
    }
}
/*
============
idSIMD_AltiVec::MulSub

  dst[i] -= src0[i] * src1[i];
============
*/
void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
    register vector float v0, v1, v2, v3, v4, v5, v6, v7;
    // source 0
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    // source 1
    register vector float v1_low, v1_hi, v3_low, v3_hi;
    register vector unsigned char permVec1, permVec2;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i;

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] -= src0[i] * src1[i];
    }

    // calculate permute vectors and do first loads
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
    permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src0[i] );
    v3_hi = vec_ld( 0, &src1[i] );

    // vectorize!
    for ( ; i+7 < count; i += 8 ) {
        // load sources, reusing the previous high loads as the new low halves
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src0[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src0[i] );

        v1_low = v3_hi;
        v1_hi = vec_ld( 15, &src1[i] );
        v3_low = v1_hi;
        v3_hi = vec_ld( 31, &src1[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v1 = vec_perm( v1_low, v1_hi, permVec2 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );
        v3 = vec_perm( v3_low, v3_hi, permVec2 );

        // the alignment loop above guarantees dst is aligned here
        v4 = vec_ld( 0, &dst[i] );
        v5 = vec_ld( 16, &dst[i] );

        v6 = vec_nmsub( v0, v1, v4 );
        v7 = vec_nmsub( v2, v3, v5 );

        ALIGNED_STORE2( &dst[i], v6, v7 );
    }

    // handle cleanup
    for ( ; i < count ; i++ ) {
        dst[i] -= src0[i] * src1[i];
    }
}

#endif /* ENABLE_SIMPLE_MATH */

#ifdef ENABLE_DOT
/*
============
idSIMD_AltiVec::Dot

  dst[i] = constant * src[i];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
    register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
    register vector float vecX, vecY, vecZ;
    vector float vecX2, vecY2, vecZ2;
    register vector float zeroVector = (vector float)(0.0);
    register vector float vecConstX, vecConstY, vecConstZ;

    // permute vectors to gather the X, Y and Z elements out of each run of three quadwords
    register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31);
    register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);

    register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
    register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);

    register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
    register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);

    int i;
    float constVal[3];
    const float *addr = src->ToFloatPtr();

    constVal[0] = constant[0];
    constVal[1] = constant[1];
    constVal[2] = constant[2];

    // load and splat the constant
    vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
    vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
    vecLd2 = vec_ld( 11, constant.ToFloatPtr() );
    vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );

    vecConstX = vec_splat( vecLd1, 0 );
    vecConstY = vec_splat( vecLd1, 1 );
    vecConstZ = vec_splat( vecLd1, 2 );

    vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
    vector float vecOld = vec_ld( 0, addr );

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant * src[i];
    }

    for ( ; i + 7 < count; i += 8 ) {
        float *vecPtr = (float*)( addr + (i*3) );
        vector float v0, v1, v2, v3, v4, v5;

        v0 = vecOld;    // reuse the last quadword loaded in the previous iteration
        v1 = vec_ld( 15, vecPtr );
        v2 = vec_ld( 31, vecPtr );
        v3 = vec_ld( 47, vecPtr );
        v4 = vec_ld( 63, vecPtr );
        v5 = vec_ld( 79, vecPtr );
        vecOld = vec_ld( 95, vecPtr );

        vecLd1 = vec_perm( v0, v1, permVec );
        vecLd2 = vec_perm( v1, v2, permVec );
        vecLd3 = vec_perm( v2, v3, permVec );

        vecLd4 = vec_perm( v3, v4, permVec );
        vecLd5 = vec_perm( v4, v5, permVec );
        vecLd6 = vec_perm( v5, vecOld, permVec );

        // gather X, Y and Z into their own vectors
        vecX = vec_perm( vecLd1, vecLd2, permX1 );
        vecY = vec_perm( vecLd1, vecLd2, permY1 );
        vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
        vecX = vec_perm( vecX, vecLd3, permX2 );
        vecY = vec_perm( vecY, vecLd3, permY2 );
        vecZ = vec_perm( vecZ, vecLd3, permZ2 );

        vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
        vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
        vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
        vecX2 = vec_perm( vecX2, vecLd6, permX2 );
        vecY2 = vec_perm( vecY2, vecLd6, permY2 );
        vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );

        // calculate the dot products
        vecX = vec_madd( vecX, vecConstX, zeroVector );
        vecY = vec_madd( vecY, vecConstY, vecX );
        vecZ = vec_madd( vecZ, vecConstZ, vecY );

        vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
        vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
        vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );

        // store results
        ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
    }

    // cleanup
    for ( ; i < count; i++ ) {
        float tempVal[3];
        tempVal[0] = *( addr + (i*3) + 0 );
        tempVal[1] = *( addr + (i*3) + 1 );
        tempVal[2] = *( addr + (i*3) + 2 );
        dst[i] = constVal[0] * tempVal[0] + constVal[1] * tempVal[1] + constVal[2] * tempVal[2];
    }
}
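/*
    Sketch of the deinterleave above (not from the original file): eight
    idVec3s are 24 floats, six quadwords laid out XYZX YZXY ZXYZ ... . The
    permX/permY/permZ constants gather the X, Y and Z lanes out of each run
    of three quadwords, converting array-of-structures input into the
    structure-of-arrays form the three vec_madds want. What permX1 followed
    by permX2 computes, per block of four vectors, in scalar terms:
*/
#if 0
static void ExampleGatherX( const float *p, float outX[4] ) {    // p = 4 packed idVec3s
    outX[0] = p[0]; outX[1] = p[3]; outX[2] = p[6]; outX[3] = p[9];
}
#endif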
/*
============
idSIMD_AltiVec::Dot

  dst[i] = constant * src[i].Normal() + src[i][3];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
    vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
    vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
    vector float vecX, vecY, vecZ, vecI3;
    vector float vecX2, vecY2, vecZ2, vecI32;
    vector float vecConstX, vecConstY, vecConstZ;

    int i;
    float constVal[3];
    const float *addr = src->ToFloatPtr();

    constVal[0] = constant[0];
    constVal[1] = constant[1];
    constVal[2] = constant[2];

    // load and splat the constant
    vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
    vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
    vector float v1 = vec_ld( 11, constant.ToFloatPtr() );
    vector float vecConst = vec_perm( v0, v1, constPerm );

    vecConstX = vec_splat( vecConst, 0 );
    vecConstY = vec_splat( vecConst, 1 );
    vecConstZ = vec_splat( vecConst, 2 );

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant * src[i].Normal() + src[i][3];
    }

    vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
    vector float vecOld = vec_ld( 0, addr );

    for ( ; i + 7 < count; i += 8 ) {
        float *planePtr = (float*)( addr + (i*PLANE_OFFSET) );
        vector float v0, v1, v2, v3, v4, v5, v6, v7;

        v0 = vecOld;    // reuse the last quadword from the previous iteration
        v1 = vec_ld( 15, planePtr );
        v2 = vec_ld( 31, planePtr );
        v3 = vec_ld( 47, planePtr );
        v4 = vec_ld( 63, planePtr );
        v5 = vec_ld( 79, planePtr );
        v6 = vec_ld( 95, planePtr );
        v7 = vec_ld( 111, planePtr );
        vecOld = vec_ld( 127, planePtr );

        vecPlaneLd1 = vec_perm( v0, v1, permVec );
        vecPlaneLd2 = vec_perm( v1, v2, permVec );
        vecPlaneLd3 = vec_perm( v2, v3, permVec );
        vecPlaneLd4 = vec_perm( v3, v4, permVec );

        vecPlaneLd5 = vec_perm( v4, v5, permVec );
        vecPlaneLd6 = vec_perm( v5, v6, permVec );
        vecPlaneLd7 = vec_perm( v6, v7, permVec );
        vecPlaneLd8 = vec_perm( v7, vecOld, permVec );

        // transpose the 4x4 blocks to get X, Y, Z and distance columns
        v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
        v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
        v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
        v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );

        vecX = vec_mergeh( v0, v1 );
        vecY = vec_mergel( v0, v1 );
        vecZ = vec_mergeh( v2, v3 );
        vecI3 = vec_mergel( v2, v3 );

        v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
        v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
        v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
        v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );

        vecX2 = vec_mergeh( v4, v5 );
        vecY2 = vec_mergel( v4, v5 );
        vecZ2 = vec_mergeh( v6, v7 );
        vecI32 = vec_mergel( v6, v7 );

        // calculate the dot products, seeding the madd chain with the distance column
        v6 = vec_madd( vecZ, vecConstZ, vecI3 );
        v5 = vec_madd( vecY, vecConstY, v6 );
        v4 = vec_madd( vecX, vecConstX, v5 );

        v0 = vec_madd( vecZ2, vecConstZ, vecI32 );
        v1 = vec_madd( vecY2, vecConstY, v0 );
        v2 = vec_madd( vecX2, vecConstX, v1 );

        // store results
        ALIGNED_STORE2( &dst[i], v4, v2 );
    }

    // cleanup
    for ( ; i < count; i++ ) {
        float srcVal[3];
        float srcI3, tempVal;

        srcVal[0] = *(addr + (i*PLANE_OFFSET) + 0 );
        srcVal[1] = *(addr + (i*PLANE_OFFSET) + 1 );
        srcVal[2] = *(addr + (i*PLANE_OFFSET) + 2 );

        srcI3 = *(addr + (i*PLANE_OFFSET) + 3 );

        tempVal = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
        dst[i] = tempVal + srcI3;
    }
}
#ifndef DRAWVERT_PADDED
/*
============
idSIMD_AltiVec::Dot

  dst[i] = constant * src[i].xyz;
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
    // an unpadded idDrawVert is 60 bytes, so the xyz of consecutive verts cycles
    // through four quadword misalignments; hence four permute vectors
    register vector float v0, v1, v2, v3, v4, v5, v6, v7;
    register vector float vecConstX, vecConstY, vecConstZ;
    register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
    register vector float zeroVector = (vector float)(0.0);
    vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
    int i;

    // load and splat the constant
    vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
    v0 = vec_ld( 0, constant.ToFloatPtr() );
    v1 = vec_ld( 11, constant.ToFloatPtr() );
    v0 = vec_perm( v0, v1, constPerm );

    vecConstX = vec_splat( v0, 0 );
    vecConstY = vec_splat( v0, 1 );
    vecConstZ = vec_splat( v0, 2 );

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant * src[i].xyz;
    }

    // calculate the permute vectors; the misalignment pattern repeats every four verts
    if ( i+3 < count ) {
        vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
        vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
        vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
        vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
    }

    for ( ; i+3 < count; i += 4 ) {
        const float *vertPtr = src[i].xyz.ToFloatPtr();
        const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
        const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
        const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();

        v0 = vec_ld( 0, vertPtr );
        v1 = vec_ld( 11, vertPtr );
        v2 = vec_ld( 0, vertPtr2 );
        v3 = vec_ld( 11, vertPtr2 );
        v4 = vec_ld( 0, vertPtr3 );
        v5 = vec_ld( 11, vertPtr3 );
        v6 = vec_ld( 0, vertPtr4 );
        v7 = vec_ld( 11, vertPtr4 );

        v0 = vec_perm( v0, v1, vertPerm1 );
        v2 = vec_perm( v2, v3, vertPerm2 );
        v4 = vec_perm( v4, v5, vertPerm3 );
        v6 = vec_perm( v6, v7, vertPerm4 );

        // transpose the four xyz vectors into X, Y and Z vectors
        v1 = vec_mergeh( v0, v4 );
        v3 = vec_mergeh( v2, v6 );
        v5 = vec_mergel( v0, v4 );
        v7 = vec_mergel( v2, v6 );

        vecSrcX1 = vec_mergeh( v1, v3 );
        vecSrcY1 = vec_mergel( v1, v3 );
        vecSrcZ1 = vec_mergeh( v5, v7 );

        // calculate the dot products
        vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
        vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
        vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );

        // store results
        vec_st( vecSrcZ1, 0, &dst[i] );
    }

    // cleanup
    for ( ; i < count; i++ ) {
        dst[i] = constant * src[i].xyz;
    }
}
#else
/*
============
idSIMD_AltiVec::Dot

  dst[i] = constant * src[i].xyz;
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
    register vector float v0, v1, v2, v3, v4, v5, v6, v7;
    register vector float vecConstX, vecConstY, vecConstZ;
    register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
    register vector float zeroVector = (vector float)(0.0);
    vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
    int i;

    // load and splat the constant
    vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
    v0 = vec_ld( 0, constant.ToFloatPtr() );
    v1 = vec_ld( 11, constant.ToFloatPtr() );
    v0 = vec_perm( v0, v1, constPerm );

    vecConstX = vec_splat( v0, 0 );
    vecConstY = vec_splat( v0, 1 );
    vecConstZ = vec_splat( v0, 2 );

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant * src[i].xyz;
    }

    for ( ; i+3 < count; i += 4 ) {
        const float *vertPtr = src[i].xyz.ToFloatPtr();
        const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
        const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
        const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();

        // padded verts are quadword aligned, so a single aligned load each
        v0 = vec_ld( 0, vertPtr );
        v2 = vec_ld( 0, vertPtr2 );
        v4 = vec_ld( 0, vertPtr3 );
        v6 = vec_ld( 0, vertPtr4 );

        // transpose the four xyz vectors into X, Y and Z vectors
        v1 = vec_mergeh( v0, v4 );
        v3 = vec_mergeh( v2, v6 );
        v5 = vec_mergel( v0, v4 );
        v7 = vec_mergel( v2, v6 );

        vecSrcX1 = vec_mergeh( v1, v3 );
        vecSrcY1 = vec_mergel( v1, v3 );
        vecSrcZ1 = vec_mergeh( v5, v7 );

        // calculate the dot products
        vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
        vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
        vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );

        // store results
        vec_st( vecSrcZ1, 0, &dst[i] );
    }

    // cleanup
    for ( ; i < count; i++ ) {
        dst[i] = constant * src[i].xyz;
    }
}
#endif /* DRAWVERT_PADDED */
/*
============
idSIMD_AltiVec::Dot

  dst[i] = constant.Normal() * src[i] + constant[3];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
    register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
    register vector float vecX, vecY, vecZ, vecX2, vecY2, vecZ2;
    register vector float zeroVector = (vector float)(0.0);
    register vector float vecConstX, vecConstY, vecConstZ;
    register vector float vecConst3;

    idVec3 constNormal = constant.Normal();
    float const3 = constant[3];

    // permute vectors to gather the X, Y and Z elements out of each run of three quadwords
    register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31);
    register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);

    register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
    register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);

    register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
    register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);

    int i;
    const float *addr = src->ToFloatPtr();

    // load and splat the constant
    vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
    vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
    vecLd2 = vec_ld( 15, constant.ToFloatPtr() );
    vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );

    vecConstX = vec_splat( vecLd1, 0 );
    vecConstY = vec_splat( vecLd1, 1 );
    vecConstZ = vec_splat( vecLd1, 2 );

    // splat constant[3]
    vecConst3 = loadSplatUnalignedScalar( &const3 );

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant.Normal() * src[i] + constant[3];
    }

    vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
    vector float vecOld = vec_ld( 0, addr );

    for ( ; i+7 < count; i += 8 ) {
        float *vecPtr = (float*)( addr + (i*3) );
        vector float v0, v1, v2, v3, v4, v5;

        v0 = vecOld;    // reuse the last quadword from the previous iteration
        v1 = vec_ld( 15, vecPtr );
        v2 = vec_ld( 31, vecPtr );
        v3 = vec_ld( 47, vecPtr );
        v4 = vec_ld( 63, vecPtr );
        v5 = vec_ld( 79, vecPtr );
        vecOld = vec_ld( 95, vecPtr );

        vecLd1 = vec_perm( v0, v1, permVec );
        vecLd2 = vec_perm( v1, v2, permVec );
        vecLd3 = vec_perm( v2, v3, permVec );

        vecLd4 = vec_perm( v3, v4, permVec );
        vecLd5 = vec_perm( v4, v5, permVec );
        vecLd6 = vec_perm( v5, vecOld, permVec );

        // gather X, Y and Z into their own vectors
        vecX = vec_perm( vecLd1, vecLd2, permX1 );
        vecY = vec_perm( vecLd1, vecLd2, permY1 );
        vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
        vecX = vec_perm( vecX, vecLd3, permX2 );
        vecY = vec_perm( vecY, vecLd3, permY2 );
        vecZ = vec_perm( vecZ, vecLd3, permZ2 );

        vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
        vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
        vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
        vecX2 = vec_perm( vecX2, vecLd6, permX2 );
        vecY2 = vec_perm( vecY2, vecLd6, permY2 );
        vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );

        // calculate the dot products
        vecX = vec_madd( vecX, vecConstX, zeroVector );
        vecY = vec_madd( vecY, vecConstY, vecX );
        vecZ = vec_madd( vecZ, vecConstZ, vecY );

        vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
        vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
        vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );

        // add constant[3]
        vecZ = vec_add( vecZ, vecConst3 );
        vecZ2 = vec_add( vecZ2, vecConst3 );

        // store results
        ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
    }

    // cleanup
    for ( ; i < count; i++ ) {
        dst[i] = constNormal * src[i] + const3;
    }
}
/*
============
idSIMD_AltiVec::Dot

  dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
    const float *constPtr = constant.ToFloatPtr();
    const float *srcPtr = src->ToFloatPtr();

    register vector float vecX, vecY, vecZ, vecI3;
    register vector float vecX2, vecY2, vecZ2, vecI32;

    vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
    vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
    register vector float zeroVector = (vector float)(0.0);
    register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;

    int i;
    float constVal[4];

    constVal[0] = *(constPtr);
    constVal[1] = *(constPtr+1);
    constVal[2] = *(constPtr+2);
    constVal[3] = *(constPtr+3);

    // load and splat the constant
    vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
    vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
    vector float v1 = vec_ld( 15, constant.ToFloatPtr() );
    vector float vecConst = vec_perm( v0, v1, constPerm );

    vecConstX = vec_splat( vecConst, 0 );
    vecConstY = vec_splat( vecConst, 1 );
    vecConstZ = vec_splat( vecConst, 2 );
    vecConstI3 = vec_splat( vecConst, 3 );

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
    }

    vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
    vector float vecOld = vec_ld( 0, srcPtr );

    for ( ; i+7 < count; i += 8 ) {
        float *planePtr = (float*)( srcPtr + (i*PLANE_OFFSET) );
        vector float v0, v1, v2, v3, v4, v5, v6, v7;

        v0 = vecOld;    // reuse the last quadword from the previous iteration
        v1 = vec_ld( 15, planePtr );
        v2 = vec_ld( 31, planePtr );
        v3 = vec_ld( 47, planePtr );
        v4 = vec_ld( 63, planePtr );
        v5 = vec_ld( 79, planePtr );
        v6 = vec_ld( 95, planePtr );
        v7 = vec_ld( 111, planePtr );
        vecOld = vec_ld( 127, planePtr );

        vecPlaneLd1 = vec_perm( v0, v1, permVec );
        vecPlaneLd2 = vec_perm( v1, v2, permVec );
        vecPlaneLd3 = vec_perm( v2, v3, permVec );
        vecPlaneLd4 = vec_perm( v3, v4, permVec );

        vecPlaneLd5 = vec_perm( v4, v5, permVec );
        vecPlaneLd6 = vec_perm( v5, v6, permVec );
        vecPlaneLd7 = vec_perm( v6, v7, permVec );
        vecPlaneLd8 = vec_perm( v7, vecOld, permVec );

        // transpose the 4x4 blocks to get X, Y, Z and distance columns
        v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
        v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
        v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
        v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );

        vecX = vec_mergeh( v0, v1 );
        vecY = vec_mergel( v0, v1 );
        vecZ = vec_mergeh( v2, v3 );
        vecI3 = vec_mergel( v2, v3 );

        v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
        v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
        v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
        v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );

        vecX2 = vec_mergeh( v4, v5 );
        vecY2 = vec_mergel( v4, v5 );
        vecZ2 = vec_mergeh( v6, v7 );
        vecI32 = vec_mergel( v6, v7 );

        // calculate the dot products
        v4 = vec_madd( vecConstX, vecX, zeroVector );
        v5 = vec_madd( vecConstY, vecY, v4 );
        v6 = vec_madd( vecConstZ, vecZ, v5 );
        v7 = vec_madd( vecConstI3, vecI3, v6 );

        v0 = vec_madd( vecConstX, vecX2, zeroVector );
        v1 = vec_madd( vecConstY, vecY2, v0 );
        v2 = vec_madd( vecConstZ, vecZ2, v1 );
        v3 = vec_madd( vecConstI3, vecI32, v2 );

        // store results
        ALIGNED_STORE2( &dst[i], v7, v3 );
    }

    // cleanup
    for ( ; i < count; i++ ) {
        float srcVal[4];

        srcVal[0] = *(srcPtr + (i*PLANE_OFFSET) + 0 );
        srcVal[1] = *(srcPtr + (i*PLANE_OFFSET) + 1 );
        srcVal[2] = *(srcPtr + (i*PLANE_OFFSET) + 2 );
        srcVal[3] = *(srcPtr + (i*PLANE_OFFSET) + 3 );

        dst[i] = srcVal[0] * constVal[0] + srcVal[1] * constVal[1] + srcVal[2] * constVal[2] + constVal[3] * srcVal[3];
    }
}
#ifndef DRAWVERT_PADDED
/*
============
idSIMD_AltiVec::Dot

  dst[i] = constant.Normal() * src[i].xyz + constant[3];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
    const float *constPtr = constant.ToFloatPtr();
    // xyz is the first member of idDrawVert, so this is the vert array seen as floats
    const float *srcPtr = src->xyz.ToFloatPtr();
    int i;
    float constVal[4];
    float srcVal[3];

    register vector float v0, v1, v2, v3, v4, v5, v6, v7;
    register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
    register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
    register vector float vecDest1;
    register vector float zeroVector = (vector float)(0.0);
    vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;

    constVal[0] = *(constPtr+0);
    constVal[1] = *(constPtr+1);
    constVal[2] = *(constPtr+2);
    constVal[3] = *(constPtr+3);

    // load and splat the constant
    vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
    v0 = vec_ld( 0, constant.ToFloatPtr() );
    v1 = vec_ld( 15, constant.ToFloatPtr() );
    v0 = vec_perm( v0, v1, constPerm );

    vecConstX = vec_splat( v0, 0 );
    vecConstY = vec_splat( v0, 1 );
    vecConstZ = vec_splat( v0, 2 );
    vecConstI3 = vec_splat( v0, 3 );

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant.Normal() * src[i].xyz + constant[3];
    }

    // calculate the permute vectors; the misalignment pattern repeats every four verts
    if ( i+3 < count ) {
        vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
        vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
        vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
        vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
    }

    for ( ; i+3 < count; i+=4 ) {
        const float *vertPtr = src[i].xyz.ToFloatPtr();
        const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
        const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
        const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();

        v0 = vec_ld( 0, vertPtr );
        v1 = vec_ld( 11, vertPtr );
        v2 = vec_ld( 0, vertPtr2 );
        v3 = vec_ld( 11, vertPtr2 );
        v4 = vec_ld( 0, vertPtr3 );
        v5 = vec_ld( 11, vertPtr3 );
        v6 = vec_ld( 0, vertPtr4 );
        v7 = vec_ld( 11, vertPtr4 );

        v0 = vec_perm( v0, v1, vertPerm1 );
        v2 = vec_perm( v2, v3, vertPerm2 );
        v4 = vec_perm( v4, v5, vertPerm3 );
        v6 = vec_perm( v6, v7, vertPerm4 );

        // transpose the four xyz vectors into X, Y and Z vectors
        v1 = vec_mergeh( v0, v4 );
        v3 = vec_mergeh( v2, v6 );
        v5 = vec_mergel( v0, v4 );
        v7 = vec_mergel( v2, v6 );

        vecSrcX1 = vec_mergeh( v1, v3 );
        vecSrcY1 = vec_mergel( v1, v3 );
        vecSrcZ1 = vec_mergeh( v5, v7 );

        // calculate the dot products and add the plane distance
        vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
        vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
        vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
        vecDest1 = vec_add( vecSrcZ1, vecConstI3 );

        // store results
        vec_st( vecDest1, 0, &dst[i] );
    }

    // cleanup
    for ( ; i < count; i++ ) {
        srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
        srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
        srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );

        dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
        dst[i] += constVal[3];
    }
}
#else
/*
============
idSIMD_AltiVec::Dot

  dst[i] = constant.Normal() * src[i].xyz + constant[3];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
    const float *constPtr = constant.ToFloatPtr();
    // xyz is the first member of idDrawVert, so this is the vert array seen as floats
    const float *srcPtr = src->xyz.ToFloatPtr();
    int i;
    float constVal[4];
    float srcVal[3];

    register vector float v0, v1, v2, v3, v4, v5, v6, v7;
    register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
    register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
    register vector float vecDest1;
    register vector float zeroVector = (vector float)(0.0);
    vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;

    constVal[0] = *(constPtr+0);
    constVal[1] = *(constPtr+1);
    constVal[2] = *(constPtr+2);
    constVal[3] = *(constPtr+3);

    // load and splat the constant
    vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
    v0 = vec_ld( 0, constant.ToFloatPtr() );
    v1 = vec_ld( 15, constant.ToFloatPtr() );
    v0 = vec_perm( v0, v1, constPerm );

    vecConstX = vec_splat( v0, 0 );
    vecConstY = vec_splat( v0, 1 );
    vecConstZ = vec_splat( v0, 2 );
    vecConstI3 = vec_splat( v0, 3 );

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = constant.Normal() * src[i].xyz + constant[3];
    }

    for ( ; i+3 < count; i+=4 ) {
        const float *vertPtr = src[i].xyz.ToFloatPtr();
        const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
        const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
        const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();

        // padded verts are quadword aligned, so a single aligned load each
        v0 = vec_ld( 0, vertPtr );
        v2 = vec_ld( 0, vertPtr2 );
        v4 = vec_ld( 0, vertPtr3 );
        v6 = vec_ld( 0, vertPtr4 );

        // transpose the four xyz vectors into X, Y and Z vectors
        v1 = vec_mergeh( v0, v4 );
        v3 = vec_mergeh( v2, v6 );
        v5 = vec_mergel( v0, v4 );
        v7 = vec_mergel( v2, v6 );

        vecSrcX1 = vec_mergeh( v1, v3 );
        vecSrcY1 = vec_mergel( v1, v3 );
        vecSrcZ1 = vec_mergeh( v5, v7 );

        // calculate the dot products and add the plane distance
        vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
        vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
        vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
        vecDest1 = vec_add( vecSrcZ1, vecConstI3 );

        // store results
        vec_st( vecDest1, 0, &dst[i] );
    }

    // cleanup
    for ( ; i < count; i++ ) {
        srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
        srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
        srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );

        dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
        dst[i] += constVal[3];
    }
}
#endif /* DRAWVERT_PADDED */
/*
============
idSIMD_AltiVec::Dot

  dst[i] = src0[i] * src1[i];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
    register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
    vector float vecLd7, vecLd8, vecLd9, vecLd10, vecLd11, vecLd12;
    register vector float vecX0, vecY0, vecZ0, vecX1, vecY1, vecZ1;
    register vector float vecX02, vecY02, vecZ02, vecX12, vecY12, vecZ12;
    register vector float zeroVector = (vector float)(0.0);

    // permute vectors to gather the X, Y and Z elements out of each run of three quadwords
    register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31);
    register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
    register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
    register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
    register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
    register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);

    int i;
    const float *src0Ptr = src0->ToFloatPtr();
    const float *src1Ptr = src1->ToFloatPtr();

    // handle unaligned case at beginning
    for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
        dst[i] = src0[i] * src1[i];
    }

    // calculate permute vectors and preload the first quadwords
    vector unsigned char permVec1 = vec_add( vec_lvsl( -1, src0Ptr ), (vector unsigned char)(1) );
    vector unsigned char permVec2 = vec_add( vec_lvsl( -1, src1Ptr ), (vector unsigned char)(1) );
    vector float vecOld0 = vec_ld( 0, src0Ptr );
    vector float vecOld1 = vec_ld( 0, src1Ptr );

    for ( ; i+7 < count; i += 8 ) {
        float *s0Ptr = (float*)( src0Ptr + (i*3) );
        float *s1Ptr = (float*)( src1Ptr + (i*3) );

        vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;

        v0 = vecOld0;   // reuse the last quadword from the previous iteration
        v1 = vec_ld( 15, s0Ptr );
        v2 = vec_ld( 31, s0Ptr );
        v3 = vec_ld( 47, s0Ptr );
        v4 = vec_ld( 63, s0Ptr );
        v5 = vec_ld( 79, s0Ptr );
        vecOld0 = vec_ld( 95, s0Ptr );

        v6 = vecOld1;
        v7 = vec_ld( 15, s1Ptr );
        v8 = vec_ld( 31, s1Ptr );
        v9 = vec_ld( 47, s1Ptr );
        v10 = vec_ld( 63, s1Ptr );
        v11 = vec_ld( 79, s1Ptr );
        vecOld1 = vec_ld( 95, s1Ptr );

        vecLd1 = vec_perm( v0, v1, permVec1 );
        vecLd2 = vec_perm( v1, v2, permVec1 );
        vecLd3 = vec_perm( v2, v3, permVec1 );
        vecLd4 = vec_perm( v3, v4, permVec1 );
        vecLd5 = vec_perm( v4, v5, permVec1 );
        vecLd6 = vec_perm( v5, vecOld0, permVec1 );

        vecLd7 = vec_perm( v6, v7, permVec2 );
        vecLd8 = vec_perm( v7, v8, permVec2 );
        vecLd9 = vec_perm( v8, v9, permVec2 );
        vecLd10 = vec_perm( v9, v10, permVec2 );
        vecLd11 = vec_perm( v10, v11, permVec2 );
        vecLd12 = vec_perm( v11, vecOld1, permVec2 );

        // gather X, Y and Z into their own vectors
        vecX0 = vec_perm( vecLd1, vecLd2, permX1 );
        vecY0 = vec_perm( vecLd1, vecLd2, permY1 );
        vecZ0 = vec_perm( vecLd1, vecLd2, permZ1 );
        vecX0 = vec_perm( vecX0, vecLd3, permX2 );
        vecY0 = vec_perm( vecY0, vecLd3, permY2 );
        vecZ0 = vec_perm( vecZ0, vecLd3, permZ2 );

        vecX02 = vec_perm( vecLd4, vecLd5, permX1 );
        vecY02 = vec_perm( vecLd4, vecLd5, permY1 );
        vecZ02 = vec_perm( vecLd4, vecLd5, permZ1 );
        vecX02 = vec_perm( vecX02, vecLd6, permX2 );
        vecY02 = vec_perm( vecY02, vecLd6, permY2 );
        vecZ02 = vec_perm( vecZ02, vecLd6, permZ2 );

        vecX1 = vec_perm( vecLd7, vecLd8, permX1 );
        vecY1 = vec_perm( vecLd7, vecLd8, permY1 );
        vecZ1 = vec_perm( vecLd7, vecLd8, permZ1 );
        vecX1 = vec_perm( vecX1, vecLd9, permX2 );
        vecY1 = vec_perm( vecY1, vecLd9, permY2 );
        vecZ1 = vec_perm( vecZ1, vecLd9, permZ2 );

        vecX12 = vec_perm( vecLd10, vecLd11, permX1 );
        vecY12 = vec_perm( vecLd10, vecLd11, permY1 );
        vecZ12 = vec_perm( vecLd10, vecLd11, permZ1 );
        vecX12 = vec_perm( vecX12, vecLd12, permX2 );
        vecY12 = vec_perm( vecY12, vecLd12, permY2 );
        vecZ12 = vec_perm( vecZ12, vecLd12, permZ2 );

        // calculate the dot products
        vecX0 = vec_madd( vecX0, vecX1, zeroVector );
        vecY0 = vec_madd( vecY0, vecY1, vecX0 );
        vecZ0 = vec_madd( vecZ0, vecZ1, vecY0 );
        vecX02 = vec_madd( vecX02, vecX12, zeroVector );
        vecY02 = vec_madd( vecY02, vecY12, vecX02 );
        vecZ02 = vec_madd( vecZ02, vecZ12, vecY02 );

        // store results
        ALIGNED_STORE2( &dst[i], vecZ0, vecZ02 );
    }

    // cleanup
    for ( ; i < count; i++ ) {
        float src0Val[3];
        float src1Val[3];

        src0Val[0] = *( src0Ptr + (i*3) + 0 );
        src0Val[1] = *( src0Ptr + (i*3) + 1 );
        src0Val[2] = *( src0Ptr + (i*3) + 2 );

        src1Val[0] = *( src1Ptr + (i*3) + 0 );
        src1Val[1] = *( src1Ptr + (i*3) + 1 );
        src1Val[2] = *( src1Ptr + (i*3) + 2 );

        dst[i] = src0Val[0] * src1Val[0] + src0Val[1] * src1Val[1] + src0Val[2] * src1Val[2];
    }
}
/*
============
idSIMD_AltiVec::Dot

  dot = src1[0] * src2[0] + src1[1] * src2[1] + ... + src1[count-1] * src2[count-1];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float &dot, const float *src1, const float *src2, const int count ) {
    register vector float v0, v1, v2, v3;
    register vector float zeroVector;
    register vector float runningTotal1, runningTotal2;
    // source 1
    register vector float v0_low, v0_hi, v2_low, v2_hi;
    // source 2
    register vector float v1_low, v1_hi, v3_low, v3_hi;
    register vector unsigned char permVec1, permVec2;
    vector unsigned char oneCharVector = (vector unsigned char)(1);
    int i = 0;

    runningTotal1 = (vector float)(0.0);
    runningTotal2 = (vector float)(0.0);
    zeroVector = (vector float)(0.0);

    // calculate permute vectors and do first loads
    permVec1 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
    permVec2 = vec_add( vec_lvsl( -1, (int*) &src2[i] ), oneCharVector );
    v2_hi = vec_ld( 0, &src1[i] );
    v3_hi = vec_ld( 0, &src2[i] );

    // vectorize!
    for ( ; i+7 < count; i += 8 ) {
        // load sources, reusing the previous high loads as the new low halves
        v0_low = v2_hi;
        v0_hi = vec_ld( 15, &src1[i] );
        v2_low = v0_hi;
        v2_hi = vec_ld( 31, &src1[i] );

        v1_low = v3_hi;
        v1_hi = vec_ld( 15, &src2[i] );
        v3_low = v1_hi;
        v3_hi = vec_ld( 31, &src2[i] );

        v0 = vec_perm( v0_low, v0_hi, permVec1 );
        v1 = vec_perm( v1_low, v1_hi, permVec2 );
        v2 = vec_perm( v2_low, v2_hi, permVec1 );
        v3 = vec_perm( v3_low, v3_hi, permVec2 );

        // accumulate
        runningTotal1 = vec_madd( v0, v1, runningTotal1 );
        runningTotal2 = vec_madd( v2, v3, runningTotal2 );
    }

    runningTotal1 = vec_add( runningTotal1, runningTotal2 );

    // sum across the vector and write one copy of the total out
    v0 = vec_add( runningTotal1, vec_sld( runningTotal1, runningTotal1, 8 ) );
    v1 = vec_add( v0, vec_sld( v0, v0, 4 ) );
    runningTotal1 = vec_splat( v1, 0 );
    vec_ste( runningTotal1, 0, &dot );

    // handle cleanup
    for ( ; i < count ; i++ ) {
        dot += src1[i] * src2[i];
    }
}

#endif /* ENABLE_DOT */
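/*
    Sketch of the reduction above (not from the original): vec_sld rotates the
    accumulator by 8 and then 4 bytes, so two vec_adds fold all four lanes
    into every lane; the splat and vec_ste then write one copy of the total to
    the scalar destination. Scalar equivalent of the fold:
*/
#if 0
static float ExampleHorizontalSum( const float lanes[4] ) {
    // (a+c) and (b+d) first, then their sum, same association as the vector code
    return ( lanes[0] + lanes[2] ) + ( lanes[1] + lanes[3] );
}
#endif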
2338 #ifdef ENABLE_COMPARES
register vector float v0, v1, v2, v3;
register vector bool int vr1, vr2, vr3, vr4;
register vector bool short vs1, vs2;
register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
register vector unsigned char vc1;
register vector bool char vbc1;
register vector float constVec;
register vector unsigned char oneVector = (vector unsigned char)(1);
register vector unsigned char permVec;
int i = 0;

// handle unaligned destination elements first
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
	dst[i] = src0[i] > constant;
}

// splat constant into a vector
constVec = loadSplatUnalignedScalar( &constant );

// calculate permute vector and do the first load
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
v3_hi = vec_ld( 0, &src0[i] );

// vectorize!
for ( ; i+15 < count; i += 16 ) {
	// load sources
	v0_low = v3_hi;
	v0_hi = vec_ld( 15, &src0[i] );
	v1_low = v0_hi;
	v1_hi = vec_ld( 31, &src0[i] );
	v2_low = v1_hi;
	v2_hi = vec_ld( 47, &src0[i] );
	v3_low = v2_hi;
	v3_hi = vec_ld( 63, &src0[i] );

	// realign
	v0 = vec_perm( v0_low, v0_hi, permVec );
	v1 = vec_perm( v1_low, v1_hi, permVec );
	v2 = vec_perm( v2_low, v2_hi, permVec );
	v3 = vec_perm( v3_low, v3_hi, permVec );

	// compare
	vr1 = vec_cmpgt( v0, constVec );
	vr2 = vec_cmpgt( v1, constVec );
	vr3 = vec_cmpgt( v2, constVec );
	vr4 = vec_cmpgt( v3, constVec );

	// pack results down: ints -> shorts -> chars
	vs1 = vec_pack(vr1, vr2);
	vs2 = vec_pack(vr3, vr4);
	vbc1 = vec_pack(vs1, vs2);

	// AND with 1 to get bytes of 0 or 1 rather than 0 or 255
	vc1 = vec_and( vbc1, oneVector );

	// store results
	vec_st( vc1, 0, &dst[i] );
}

// cleanup
for ( ; i < count ; i++ ) {
	dst[i] = src0[i] > constant;
}
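/*
	The "<< bitNum" variants below OR the compare result into an existing
	bit plane instead of overwriting dst.  The shift count is splatted
	across a byte vector ( lvsl + vec_splat over bitNum ) so vec_sl can
	shift all sixteen result bytes at once, and dst is loaded, ORed, and
	stored back in place:

		dst[i] |= ( src0[i] > constant ) << bitNum;
*/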
register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
register vector bool short vtbs0, vtbs1;
register vector bool char vtbc0;
register vector unsigned char vtuc0;
register vector unsigned char permVec, permVec2;
// destination vector
register vector unsigned char vd;
// bitNum splatted across a vector
register vector unsigned char bitNumVec;
// source vectors
register vector float vs0, vs1, vs2, vs3;
register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
// constant vector
register vector float constVec;
register vector unsigned char oneVector = (vector unsigned char)(1);
int i = 0;

// handle unaligned destination elements first
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
	dst[i] |= ( src0[i] > constant ) << bitNum;
}

// splat constant into a vector
constVec = loadSplatUnalignedScalar( &constant );

// splat the bit number into each byte of a vector
permVec2 = vec_lvsl( 0, &bitNum );
vtuc0 = vec_ld( 0, &bitNum );
bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
bitNumVec = vec_splat( bitNumVec, 0 );

// calculate permute vector and do the first load
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
vs3_hi = vec_ld( 0, &src0[i] );

// vectorize!
for ( ; i+15 < count; i += 16 ) {
	// load sources
	vs0_low = vs3_hi;
	vs0_hi = vec_ld( 15, &src0[i] );
	vs1_low = vs0_hi;
	vs1_hi = vec_ld( 31, &src0[i] );
	vs2_low = vs1_hi;
	vs2_hi = vec_ld( 47, &src0[i] );
	vs3_low = vs2_hi;
	vs3_hi = vec_ld( 63, &src0[i] );

	// realign
	vs0 = vec_perm( vs0_low, vs0_hi, permVec );
	vs1 = vec_perm( vs1_low, vs1_hi, permVec );
	vs2 = vec_perm( vs2_low, vs2_hi, permVec );
	vs3 = vec_perm( vs3_low, vs3_hi, permVec );

	// load dest for the read-modify-write
	vd = vec_ld( 0, &dst[i] );

	// compare
	vtbi0 = vec_cmpgt( vs0, constVec );
	vtbi1 = vec_cmpgt( vs1, constVec );
	vtbi2 = vec_cmpgt( vs2, constVec );
	vtbi3 = vec_cmpgt( vs3, constVec );

	// pack results down: ints -> shorts -> chars
	vtbs0 = vec_pack(vtbi0, vtbi1);
	vtbs1 = vec_pack(vtbi2, vtbi3);
	vtbc0 = vec_pack(vtbs0, vtbs1);

	// convert to 0/1 and shift into the requested bit position
	vtuc0 = vec_and(vtbc0, oneVector);
	vtuc0 = vec_sl(vtuc0, bitNumVec );

	// OR into the destination and store
	vd = vec_or( vd, vtuc0 );
	vec_st( vd, 0, &dst[i] );
}

// cleanup
for ( ; i < count ; i++ ) {
	dst[i] |= ( src0[i] > constant ) << bitNum;
}
register vector float v0, v1, v2, v3;
register vector bool int vr1, vr2, vr3, vr4;
register vector bool short vs1, vs2;
register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
register vector unsigned char vc1;
register vector bool char vbc1;
register vector float constVec;
register vector unsigned char oneVector = (vector unsigned char)(1);
register vector unsigned char permVec;
int i = 0;

// handle unaligned destination elements first
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
	dst[i] = src0[i] >= constant;
}

constVec = loadSplatUnalignedScalar( &constant );

permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
v3_hi = vec_ld( 0, &src0[i] );

// vectorize!
for ( ; i+15 < count; i += 16 ) {
	v0_low = v3_hi;
	v0_hi = vec_ld( 15, &src0[i] );
	v1_low = v0_hi;
	v1_hi = vec_ld( 31, &src0[i] );
	v2_low = v1_hi;
	v2_hi = vec_ld( 47, &src0[i] );
	v3_low = v2_hi;
	v3_hi = vec_ld( 63, &src0[i] );

	v0 = vec_perm( v0_low, v0_hi, permVec );
	v1 = vec_perm( v1_low, v1_hi, permVec );
	v2 = vec_perm( v2_low, v2_hi, permVec );
	v3 = vec_perm( v3_low, v3_hi, permVec );

	vr1 = vec_cmpge( v0, constVec );
	vr2 = vec_cmpge( v1, constVec );
	vr3 = vec_cmpge( v2, constVec );
	vr4 = vec_cmpge( v3, constVec );

	vs1 = vec_pack(vr1, vr2);
	vs2 = vec_pack(vr3, vr4);
	vbc1 = vec_pack(vs1, vs2);

	vc1 = vec_and( vbc1, oneVector );

	vec_st( vc1, 0, &dst[i] );
}

// cleanup
for ( ; i < count ; i++ ) {
	dst[i] = src0[i] >= constant;
}
register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
register vector bool short vtbs0, vtbs1;
register vector bool char vtbc0;
register vector unsigned char vtuc0;
register vector unsigned char permVec, permVec2;
register vector unsigned char vd;
register vector unsigned char bitNumVec;
register vector float vs0, vs1, vs2, vs3;
register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
register vector float constVec;
register vector unsigned char oneVector = (vector unsigned char)(1);
int i = 0;

// handle unaligned destination elements first
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
	dst[i] |= ( src0[i] >= constant ) << bitNum;
}

constVec = loadSplatUnalignedScalar( &constant );

// splat the bit number into each byte of a vector
permVec2 = vec_lvsl( 0, &bitNum );
vtuc0 = vec_ld( 0, &bitNum );
bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
bitNumVec = vec_splat( bitNumVec, 0 );

permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
vs3_hi = vec_ld( 0, &src0[i] );

// vectorize!
for ( ; i+15 < count; i += 16 ) {
	vs0_low = vs3_hi;
	vs0_hi = vec_ld( 15, &src0[i] );
	vs1_low = vs0_hi;
	vs1_hi = vec_ld( 31, &src0[i] );
	vs2_low = vs1_hi;
	vs2_hi = vec_ld( 47, &src0[i] );
	vs3_low = vs2_hi;
	vs3_hi = vec_ld( 63, &src0[i] );

	vs0 = vec_perm( vs0_low, vs0_hi, permVec );
	vs1 = vec_perm( vs1_low, vs1_hi, permVec );
	vs2 = vec_perm( vs2_low, vs2_hi, permVec );
	vs3 = vec_perm( vs3_low, vs3_hi, permVec );

	vd = vec_ld( 0, &dst[i] );

	vtbi0 = vec_cmpge( vs0, constVec );
	vtbi1 = vec_cmpge( vs1, constVec );
	vtbi2 = vec_cmpge( vs2, constVec );
	vtbi3 = vec_cmpge( vs3, constVec );

	vtbs0 = vec_pack(vtbi0, vtbi1);
	vtbs1 = vec_pack(vtbi2, vtbi3);
	vtbc0 = vec_pack(vtbs0, vtbs1);

	vtuc0 = vec_and(vtbc0, oneVector);
	vtuc0 = vec_sl(vtuc0, bitNumVec );

	vd = vec_or( vd, vtuc0 );
	vec_st( vd, 0, &dst[i] );
}

// cleanup
for ( ; i < count ; i++ ) {
	dst[i] |= ( src0[i] >= constant ) << bitNum;
}
register vector float v0, v1, v2, v3;
register vector bool int vr1, vr2, vr3, vr4;
register vector bool short vs1, vs2;
register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
register vector unsigned char vc1;
register vector bool char vbc1;
register vector float constVec;
register vector unsigned char oneVector = (vector unsigned char)(1);
register vector unsigned char permVec;
int i = 0;

// handle unaligned destination elements first
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
	dst[i] = src0[i] < constant;
}

constVec = loadSplatUnalignedScalar( &constant );

permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
v3_hi = vec_ld( 0, &src0[i] );

// vectorize!
for ( ; i+15 < count; i += 16 ) {
	v0_low = v3_hi;
	v0_hi = vec_ld( 15, &src0[i] );
	v1_low = v0_hi;
	v1_hi = vec_ld( 31, &src0[i] );
	v2_low = v1_hi;
	v2_hi = vec_ld( 47, &src0[i] );
	v3_low = v2_hi;
	v3_hi = vec_ld( 63, &src0[i] );

	v0 = vec_perm( v0_low, v0_hi, permVec );
	v1 = vec_perm( v1_low, v1_hi, permVec );
	v2 = vec_perm( v2_low, v2_hi, permVec );
	v3 = vec_perm( v3_low, v3_hi, permVec );

	vr1 = vec_cmplt( v0, constVec );
	vr2 = vec_cmplt( v1, constVec );
	vr3 = vec_cmplt( v2, constVec );
	vr4 = vec_cmplt( v3, constVec );

	vs1 = vec_pack(vr1, vr2);
	vs2 = vec_pack(vr3, vr4);
	vbc1 = vec_pack(vs1, vs2);

	vc1 = vec_and( vbc1, oneVector );

	vec_st( vc1, 0, &dst[i] );
}

// cleanup
for ( ; i < count ; i++ ) {
	dst[i] = src0[i] < constant;
}
register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
register vector bool short vtbs0, vtbs1;
register vector bool char vtbc0;
register vector unsigned char vtuc0;
register vector unsigned char permVec, permVec2;
register vector unsigned char vd;
register vector unsigned char bitNumVec;
register vector float vs0, vs1, vs2, vs3;
register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
register vector float constVec;
register vector unsigned char oneVector = (vector unsigned char)(1);
int i = 0;

// handle unaligned destination elements first
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
	dst[i] |= ( src0[i] < constant ) << bitNum;
}

constVec = loadSplatUnalignedScalar( &constant );

// splat the bit number into each byte of a vector
permVec2 = vec_lvsl( 0, &bitNum );
vtuc0 = vec_ld( 0, &bitNum );
bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
bitNumVec = vec_splat( bitNumVec, 0 );

permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
vs3_hi = vec_ld( 0, &src0[i] );

// vectorize!
for ( ; i+15 < count; i += 16 ) {
	vs0_low = vs3_hi;
	vs0_hi = vec_ld( 15, &src0[i] );
	vs1_low = vs0_hi;
	vs1_hi = vec_ld( 31, &src0[i] );
	vs2_low = vs1_hi;
	vs2_hi = vec_ld( 47, &src0[i] );
	vs3_low = vs2_hi;
	vs3_hi = vec_ld( 63, &src0[i] );

	vs0 = vec_perm( vs0_low, vs0_hi, permVec );
	vs1 = vec_perm( vs1_low, vs1_hi, permVec );
	vs2 = vec_perm( vs2_low, vs2_hi, permVec );
	vs3 = vec_perm( vs3_low, vs3_hi, permVec );

	vd = vec_ld( 0, &dst[i] );

	vtbi0 = vec_cmplt( vs0, constVec );
	vtbi1 = vec_cmplt( vs1, constVec );
	vtbi2 = vec_cmplt( vs2, constVec );
	vtbi3 = vec_cmplt( vs3, constVec );

	vtbs0 = vec_pack(vtbi0, vtbi1);
	vtbs1 = vec_pack(vtbi2, vtbi3);
	vtbc0 = vec_pack(vtbs0, vtbs1);

	vtuc0 = vec_and(vtbc0, oneVector);
	vtuc0 = vec_sl(vtuc0, bitNumVec );

	vd = vec_or( vd, vtuc0 );
	vec_st( vd, 0, &dst[i] );
}

// cleanup
for ( ; i < count ; i++ ) {
	dst[i] |= ( src0[i] < constant ) << bitNum;
}
register vector float v0, v1, v2, v3;
register vector bool int vr1, vr2, vr3, vr4;
register vector bool short vs1, vs2;
register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
register vector unsigned char vc1;
register vector bool char vbc1;
register vector float constVec;
register vector unsigned char oneVector = (vector unsigned char)(1);
register vector unsigned char permVec;
int i = 0;

// handle unaligned destination elements first
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
	dst[i] = src0[i] <= constant;
}

constVec = loadSplatUnalignedScalar( &constant );

permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
v3_hi = vec_ld( 0, &src0[i] );

// vectorize!
for ( ; i+15 < count; i += 16 ) {
	v0_low = v3_hi;
	v0_hi = vec_ld( 15, &src0[i] );
	v1_low = v0_hi;
	v1_hi = vec_ld( 31, &src0[i] );
	v2_low = v1_hi;
	v2_hi = vec_ld( 47, &src0[i] );
	v3_low = v2_hi;
	v3_hi = vec_ld( 63, &src0[i] );

	v0 = vec_perm( v0_low, v0_hi, permVec );
	v1 = vec_perm( v1_low, v1_hi, permVec );
	v2 = vec_perm( v2_low, v2_hi, permVec );
	v3 = vec_perm( v3_low, v3_hi, permVec );

	vr1 = vec_cmple( v0, constVec );
	vr2 = vec_cmple( v1, constVec );
	vr3 = vec_cmple( v2, constVec );
	vr4 = vec_cmple( v3, constVec );

	vs1 = vec_pack(vr1, vr2);
	vs2 = vec_pack(vr3, vr4);
	vbc1 = vec_pack(vs1, vs2);

	vc1 = vec_and( vbc1, oneVector );

	vec_st( vc1, 0, &dst[i] );
}

// cleanup
for ( ; i < count ; i++ ) {
	dst[i] = src0[i] <= constant;
}
register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
register vector bool short vtbs0, vtbs1;
register vector bool char vtbc0;
register vector unsigned char vtuc0;
register vector unsigned char permVec, permVec2;
register vector unsigned char vd;
register vector unsigned char bitNumVec;
register vector float vs0, vs1, vs2, vs3;
register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
register vector float constVec;
register vector unsigned char oneVector = (vector unsigned char)(1);
int i = 0;

// handle unaligned destination elements first
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
	dst[i] |= ( src0[i] <= constant ) << bitNum;
}

constVec = loadSplatUnalignedScalar( &constant );

// splat the bit number into each byte of a vector
permVec2 = vec_lvsl( 0, &bitNum );
vtuc0 = vec_ld( 0, &bitNum );
bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
bitNumVec = vec_splat( bitNumVec, 0 );

permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
vs3_hi = vec_ld( 0, &src0[i] );

// vectorize!
for ( ; i+15 < count; i += 16 ) {
	vs0_low = vs3_hi;
	vs0_hi = vec_ld( 15, &src0[i] );
	vs1_low = vs0_hi;
	vs1_hi = vec_ld( 31, &src0[i] );
	vs2_low = vs1_hi;
	vs2_hi = vec_ld( 47, &src0[i] );
	vs3_low = vs2_hi;
	vs3_hi = vec_ld( 63, &src0[i] );

	vs0 = vec_perm( vs0_low, vs0_hi, permVec );
	vs1 = vec_perm( vs1_low, vs1_hi, permVec );
	vs2 = vec_perm( vs2_low, vs2_hi, permVec );
	vs3 = vec_perm( vs3_low, vs3_hi, permVec );

	vd = vec_ld( 0, &dst[i] );

	vtbi0 = vec_cmple( vs0, constVec );
	vtbi1 = vec_cmple( vs1, constVec );
	vtbi2 = vec_cmple( vs2, constVec );
	vtbi3 = vec_cmple( vs3, constVec );

	vtbs0 = vec_pack(vtbi0, vtbi1);
	vtbs1 = vec_pack(vtbi2, vtbi3);
	vtbc0 = vec_pack(vtbs0, vtbs1);

	vtuc0 = vec_and(vtbc0, oneVector);
	vtuc0 = vec_sl(vtuc0, bitNumVec );

	vd = vec_or( vd, vtuc0 );
	vec_st( vd, 0, &dst[i] );
}

// cleanup
for ( ; i < count ; i++ ) {
	dst[i] |= ( src0[i] <= constant ) << bitNum;
}

#endif //ENABLE_COMPARES

#ifdef ENABLE_MINMAX
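/*
	MinMax routines.  All of them keep running vector minima and maxima
	and reduce at the end by folding the vector onto itself with vec_sld
	( rotate by 8 bytes, then 4 where the layout allows it ) before storing
	individual lanes out with vec_ste.  A scalar cleanup loop handles
	whatever the vector loop could not consume.
*/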
register vector float v0, v1, v2, v3;
register vector float maxVec, minVec, tempMin, tempMax;
register vector unsigned char permVec;
register vector float v0_low, v0_hi, v1_low, v1_hi;
vector unsigned char oneCharVector = (vector unsigned char)(1);
int i = 0;

// calculate permute vector and do the first load
permVec = vec_add( vec_lvsl( -1, (int*) &src[0] ), oneCharVector );
v1_hi = vec_ld( 0, &src[0] );

maxVec = loadSplatUnalignedScalar( &max );
minVec = loadSplatUnalignedScalar( &min );

// vectorize!
for ( ; i+7 < count; i += 8 ) {
	// load sources
	v0_low = v1_hi;
	v0_hi = vec_ld( 15, &src[i] );
	v1_low = v0_hi;
	v1_hi = vec_ld( 31, &src[i] );
	v0 = vec_perm( v0_low, v0_hi, permVec );
	v1 = vec_perm( v1_low, v1_hi, permVec );

	// minimum
	v2 = vec_min( v0, v1 );
	minVec = vec_min( minVec, v2 );
	// maximum
	v3 = vec_max( v0, v1 );
	maxVec = vec_max( maxVec, v3 );
}

// fold the per-lane results down to a single lane
tempMin = minVec;
tempMax = maxVec;
tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 8 ) );
tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 8 ) );
tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 4 ) );
tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 4 ) );
minVec = vec_splat( tempMin, 0 );
maxVec = vec_splat( tempMax, 0 );
vec_ste( minVec, 0, &min );
vec_ste( maxVec, 0, &max );

// cleanup
for ( ; i < count; i++ ) {
	if ( src[i] < min ) {
		min = src[i];
	}
	if ( src[i] > max ) {
		max = src[i];
	}
}
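/*
	The idVec2 version below treats the array as an interleaved stream of
	x,y pairs: x values land in even lanes and y values in odd lanes, so
	min/max runs per lane and only a single 8-byte fold plus two
	vec_splat/vec_ste pairs per output are needed to extract min.x, min.y,
	max.x and max.y at the end.
*/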
register vector float vecLd1, vecLd2, vecLd3, vecLd4;
register vector float vecMin, vecMax;
register vector float v0, v1, v2, v3;

// seed the reductions; note FLT_MIN is the smallest *positive* float,
// so -FLT_MAX is used for the maximum instead
vecMin = (vector float)(FLT_MAX);
vecMax = (vector float)(-FLT_MAX);

const float *srcPtr = src[0].ToFloatPtr();
vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
vector float vecOld = vec_ld( 0, srcPtr );

for ( i = 0, j = 0; i+7 < count; i += 8, j += 4) {
	// load data
	float *vecPtr = (float*)( srcPtr + (j*4) );
	v0 = vecOld;
	v1 = vec_ld( 15, vecPtr );
	v2 = vec_ld( 31, vecPtr );
	v3 = vec_ld( 47, vecPtr );
	vecOld = vec_ld( 63, vecPtr );

	vecLd1 = vec_perm( v0, v1, permVec );
	vecLd2 = vec_perm( v1, v2, permVec );
	vecLd3 = vec_perm( v2, v3, permVec );
	vecLd4 = vec_perm( v3, vecOld, permVec );

	// minimum
	v0 = vec_min( vecLd1, vecLd2 );
	v1 = vec_min( vecLd3, vecLd4 );
	v0 = vec_min( v0, v1 );
	// maximum
	v2 = vec_max( vecLd1, vecLd2 );
	v3 = vec_max( vecLd3, vecLd4 );
	v2 = vec_max( v2, v3 );

	// x in even lanes, y in odd lanes
	vecMin = vec_min( v0, vecMin );
	vecMax = vec_max( v2, vecMax );
}

// fold the two x/y pairs onto each other and extract
vecMin = vec_min( vecMin, vec_sld( vecMin, vecMin, 8 ) );
vecMax = vec_max( vecMax, vec_sld( vecMax, vecMax, 8 ) );
v0 = vec_splat( vecMin, 0 );
v1 = vec_splat( vecMin, 1 );
v2 = vec_splat( vecMax, 0 );
v3 = vec_splat( vecMax, 1 );

vec_ste( v0, 0, &min[0] );
vec_ste( v1, 0, &min[1] );
vec_ste( v2, 0, &max[0] );
vec_ste( v3, 0, &max[1] );

// cleanup
for ( ; i < count; i++ ) {
	v = src[i];

	if ( v[0] < min[0] ) {
		min[0] = v[0];
	}
	if ( v[0] > max[0] ) {
		max[0] = v[0];
	}
	if ( v[1] < min[1] ) {
		min[1] = v[1];
	}
	if ( v[1] > max[1] ) {
		max[1] = v[1];
	}
}
register vector float vecLd1, vecLd2, vecLd3;
register vector float vecMin, vecMax;
register vector float vecSrc1, vecSrc2, vecSrc3, vecSrc4;
register vector float vecMin1, vecMin2, vecMax1, vecMax2;
int i = 0;

// seed the reductions ( -FLT_MAX rather than the positive FLT_MIN )
vecMin = (vector float)(FLT_MAX);
vecMax = (vector float)(-FLT_MAX);

const float *srcPtr = src[0].ToFloatPtr();
vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr), (vector unsigned char)(1) );
vector float vecOld = vec_ld( 0, srcPtr );

// process four idVec3's ( twelve floats ) per iteration
for ( ; i+3 < count; i += 4 ) {
	float *vecPtr = (float*)( srcPtr + (i*3) );
	vector float v0, v1, v2;
	v0 = vecOld;
	v1 = vec_ld( 15, vecPtr );
	v2 = vec_ld( 31, vecPtr );
	vecOld = vec_ld( 47, vecPtr );

	vecLd1 = vec_perm( v0, v1, permVec );
	vecLd2 = vec_perm( v1, v2, permVec );
	vecLd3 = vec_perm( v2, vecOld, permVec );

	// shift so that x, y, z of each idVec3 line up across the four vectors
	vecSrc1 = vecLd1;
	vecSrc2 = vec_sld( vecLd1, vecLd2, 12 );
	vecSrc3 = vec_sld( vecLd2, vecLd3, 8 );
	vecSrc4 = vec_sld( vecLd3, vecLd3, 4 );

	// minimum
	vecMin1 = vec_min( vecSrc1, vecSrc2 );
	vecMin2 = vec_min( vecSrc3, vecSrc4 );
	vecMin1 = vec_min( vecMin1, vecMin2 );
	vecMin = vec_min( vecMin, vecMin1 );
	// maximum
	vecMax1 = vec_max( vecSrc1, vecSrc2 );
	vecMax2 = vec_max( vecSrc3, vecSrc4 );
	vecMax1 = vec_max( vecMax1, vecMax2 );
	vecMax = vec_max( vecMax1, vecMax );
}

// extract the x, y, z results
vector float v0, v1, v2, v3, v4, v5;
v0 = vec_splat( vecMin, 0 );
v1 = vec_splat( vecMin, 1 );
v2 = vec_splat( vecMin, 2 );
v3 = vec_splat( vecMax, 0 );
v4 = vec_splat( vecMax, 1 );
v5 = vec_splat( vecMax, 2 );

vec_ste( v0, 0, &min[0] );
vec_ste( v1, 0, &min[1] );
vec_ste( v2, 0, &min[2] );
vec_ste( v3, 0, &max[0] );
vec_ste( v4, 0, &max[1] );
vec_ste( v5, 0, &max[2] );

// cleanup
for ( ; i < count; i ++ ) {
	v = src[i];

	if ( v[0] < min[0] ) {
		min[0] = v[0];
	}
	if ( v[0] > max[0] ) {
		max[0] = v[0];
	}
	if ( v[1] < min[1] ) {
		min[1] = v[1];
	}
	if ( v[1] > max[1] ) {
		max[1] = v[1];
	}
	if ( v[2] < min[2] ) {
		min[2] = v[2];
	}
	if ( v[2] > max[2] ) {
		max[2] = v[2];
	}
}

#ifndef DRAWVERT_PADDED
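/*
	idDrawVert variants follow.  Without DRAWVERT_PADDED the xyz member of
	a drawvert can straddle a 16-byte boundary, so each vertex takes two
	loads ( offsets 0 and 11 together cover the twelve xyz bytes ) plus an
	lvsl permute to realign.  With padding, the xyz of each vertex can be
	fetched with a single vec_ld, which is the point of the padded layout.
*/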
register vector float vecMin, vecMax;
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
register vector float vecMin1, vecMin2, vecMax1, vecMax2;
int i = 0;

vecMin = (vector float)(FLT_MAX);
vecMax = (vector float)(-FLT_MAX);

// the drawvert stride keeps the misalignment constant, so the permute
// vectors can be computed once, outside the loop
vector unsigned char vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vector unsigned char vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vector unsigned char vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vector unsigned char vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );

for ( ; i+3 < count; i += 4) {
	const float *vertPtr = src[i].xyz.ToFloatPtr();
	const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
	const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
	const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();

	v0 = vec_ld( 0, vertPtr );
	v1 = vec_ld( 11, vertPtr );
	v2 = vec_ld( 0, vertPtr2 );
	v3 = vec_ld( 11, vertPtr2 );
	v4 = vec_ld( 0, vertPtr3 );
	v5 = vec_ld( 11, vertPtr3 );
	v6 = vec_ld( 0, vertPtr4 );
	v7 = vec_ld( 11, vertPtr4 );

	v0 = vec_perm( v0, v1, vertPerm1 );
	v2 = vec_perm( v2, v3, vertPerm2 );
	v4 = vec_perm( v4, v5, vertPerm3 );
	v6 = vec_perm( v6, v7, vertPerm4 );

	vecMin1 = vec_min( v0, v2 );
	vecMin2 = vec_min( v4, v6 );
	vecMin1 = vec_min( vecMin1, vecMin2 );
	vecMin = vec_min( vecMin, vecMin1 );

	vecMax1 = vec_max( v0, v2 );
	vecMax2 = vec_max( v4, v6 );
	vecMax1 = vec_max( vecMax1, vecMax2 );
	vecMax = vec_max( vecMax, vecMax1 );
}

// extract results
v0 = vec_splat( vecMin, 0 );
v1 = vec_splat( vecMin, 1 );
v2 = vec_splat( vecMin, 2 );
v3 = vec_splat( vecMax, 0 );
v4 = vec_splat( vecMax, 1 );
v5 = vec_splat( vecMax, 2 );

vec_ste( v0, 0, &min[0] );
vec_ste( v1, 0, &min[1] );
vec_ste( v2, 0, &min[2] );
vec_ste( v3, 0, &max[0] );
vec_ste( v4, 0, &max[1] );
vec_ste( v5, 0, &max[2] );

// cleanup
for ( ; i < count; i++ ) {
	v = src[i].xyz;

	if ( v[0] < min[0] ) {
		min[0] = v[0];
	}
	if ( v[0] > max[0] ) {
		max[0] = v[0];
	}
	if ( v[1] < min[1] ) {
		min[1] = v[1];
	}
	if ( v[1] > max[1] ) {
		max[1] = v[1];
	}
	if ( v[2] > max[2] ) {
		max[2] = v[2];
	}
	if ( v[2] < min[2] ) {
		min[2] = v[2];
	}
}
#else

register vector float vecMin, vecMax;
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
register vector float vecMin1, vecMin2, vecMax1, vecMax2;
int i = 0;

vecMin = (vector float)(FLT_MAX);
vecMax = (vector float)(-FLT_MAX);

for ( ; i+3 < count; i += 4) {
	const float *vertPtr = src[i].xyz.ToFloatPtr();
	const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
	const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
	const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();

	// padded drawverts need only one aligned load per vertex
	v0 = vec_ld( 0, vertPtr );
	v2 = vec_ld( 0, vertPtr2 );
	v4 = vec_ld( 0, vertPtr3 );
	v6 = vec_ld( 0, vertPtr4 );

	vecMin1 = vec_min( v0, v2 );
	vecMin2 = vec_min( v4, v6 );
	vecMin1 = vec_min( vecMin1, vecMin2 );
	vecMin = vec_min( vecMin, vecMin1 );

	vecMax1 = vec_max( v0, v2 );
	vecMax2 = vec_max( v4, v6 );
	vecMax1 = vec_max( vecMax1, vecMax2 );
	vecMax = vec_max( vecMax, vecMax1 );
}

// extract results
v0 = vec_splat( vecMin, 0 );
v1 = vec_splat( vecMin, 1 );
v2 = vec_splat( vecMin, 2 );
v3 = vec_splat( vecMax, 0 );
v4 = vec_splat( vecMax, 1 );
v5 = vec_splat( vecMax, 2 );

vec_ste( v0, 0, &min[0] );
vec_ste( v1, 0, &min[1] );
vec_ste( v2, 0, &min[2] );
vec_ste( v3, 0, &max[0] );
vec_ste( v4, 0, &max[1] );
vec_ste( v5, 0, &max[2] );

// cleanup
for ( ; i < count; i++ ) {
	v = src[i].xyz;

	if ( v[0] < min[0] ) {
		min[0] = v[0];
	}
	if ( v[0] > max[0] ) {
		max[0] = v[0];
	}
	if ( v[1] < min[1] ) {
		min[1] = v[1];
	}
	if ( v[1] > max[1] ) {
		max[1] = v[1];
	}
	if ( v[2] > max[2] ) {
		max[2] = v[2];
	}
	if ( v[2] < min[2] ) {
		min[2] = v[2];
	}
}

#endif // DRAWVERT_PADDED

#ifndef DRAWVERT_PADDED
register vector float vecMin, vecMax;
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
register vector float vecMin1, vecMin2, vecMax1, vecMax2;
int i = 0;

vecMin = (vector float)(FLT_MAX);
vecMax = (vector float)(-FLT_MAX);

vector unsigned char vertPerm1;
vector unsigned char vertPerm2;
vector unsigned char vertPerm3;
vector unsigned char vertPerm4;

for ( ; i+3 < count; i += 4) {
	const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
	const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
	const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
	const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();

	// indexed verts have no common stride, so compute the permutes per vertex
	vertPerm1 = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
	vertPerm2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
	vertPerm3 = vec_add( vec_lvsl( -1, vertPtr3 ), (vector unsigned char)(1) );
	vertPerm4 = vec_add( vec_lvsl( -1, vertPtr4 ), (vector unsigned char)(1) );

	v0 = vec_ld( 0, vertPtr );
	v1 = vec_ld( 15, vertPtr );
	v2 = vec_ld( 0, vertPtr2 );
	v3 = vec_ld( 15, vertPtr2 );
	v4 = vec_ld( 0, vertPtr3 );
	v5 = vec_ld( 15, vertPtr3 );
	v6 = vec_ld( 0, vertPtr4 );
	v7 = vec_ld( 15, vertPtr4 );

	v0 = vec_perm( v0, v1, vertPerm1 );
	v2 = vec_perm( v2, v3, vertPerm2 );
	v4 = vec_perm( v4, v5, vertPerm3 );
	v6 = vec_perm( v6, v7, vertPerm4 );

	vecMin1 = vec_min( v0, v2 );
	vecMin2 = vec_min( v4, v6 );
	vecMin1 = vec_min( vecMin1, vecMin2 );
	vecMin = vec_min( vecMin, vecMin1 );

	vecMax1 = vec_max( v0, v2 );
	vecMax2 = vec_max( v4, v6 );
	vecMax1 = vec_max( vecMax1, vecMax2 );
	vecMax = vec_max( vecMax, vecMax1 );
}

// extract results
v0 = vec_splat( vecMin, 0 );
v1 = vec_splat( vecMin, 1 );
v2 = vec_splat( vecMin, 2 );
v3 = vec_splat( vecMax, 0 );
v4 = vec_splat( vecMax, 1 );
v5 = vec_splat( vecMax, 2 );

vec_ste( v0, 0, &min[0] );
vec_ste( v1, 0, &min[1] );
vec_ste( v2, 0, &min[2] );
vec_ste( v3, 0, &max[0] );
vec_ste( v4, 0, &max[1] );
vec_ste( v5, 0, &max[2] );

// cleanup
for ( ; i < count; i++ ) {
	v = src[indexes[i]].xyz;

	if ( v[0] < min[0] ) {
		min[0] = v[0];
	}
	if ( v[0] > max[0] ) {
		max[0] = v[0];
	}
	if ( v[1] < min[1] ) {
		min[1] = v[1];
	}
	if ( v[1] > max[1] ) {
		max[1] = v[1];
	}
	if ( v[2] > max[2] ) {
		max[2] = v[2];
	}
	if ( v[2] < min[2] ) {
		min[2] = v[2];
	}
}
#else

register vector float vecMin, vecMax;
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
register vector float vecMin1, vecMin2, vecMax1, vecMax2;
int i = 0;

vecMin = (vector float)(FLT_MAX);
vecMax = (vector float)(-FLT_MAX);

vector unsigned char vertPerm1;
vector unsigned char vertPerm2;
vector unsigned char vertPerm3;
vector unsigned char vertPerm4;

for ( ; i+3 < count; i += 4) {
	const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
	const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
	const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
	const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();

	v0 = vec_ld( 0, vertPtr );
	v2 = vec_ld( 0, vertPtr2 );
	v4 = vec_ld( 0, vertPtr3 );
	v6 = vec_ld( 0, vertPtr4 );

	vecMin1 = vec_min( v0, v2 );
	vecMin2 = vec_min( v4, v6 );
	vecMin1 = vec_min( vecMin1, vecMin2 );
	vecMin = vec_min( vecMin, vecMin1 );

	vecMax1 = vec_max( v0, v2 );
	vecMax2 = vec_max( v4, v6 );
	vecMax1 = vec_max( vecMax1, vecMax2 );
	vecMax = vec_max( vecMax, vecMax1 );
}

// extract results
v0 = vec_splat( vecMin, 0 );
v1 = vec_splat( vecMin, 1 );
v2 = vec_splat( vecMin, 2 );
v3 = vec_splat( vecMax, 0 );
v4 = vec_splat( vecMax, 1 );
v5 = vec_splat( vecMax, 2 );

vec_ste( v0, 0, &min[0] );
vec_ste( v1, 0, &min[1] );
vec_ste( v2, 0, &min[2] );
vec_ste( v3, 0, &max[0] );
vec_ste( v4, 0, &max[1] );
vec_ste( v5, 0, &max[2] );

// cleanup
for ( ; i < count; i++ ) {
	v = src[indexes[i]].xyz;

	if ( v[0] < min[0] ) {
		min[0] = v[0];
	}
	if ( v[0] > max[0] ) {
		max[0] = v[0];
	}
	if ( v[1] < min[1] ) {
		min[1] = v[1];
	}
	if ( v[1] > max[1] ) {
		max[1] = v[1];
	}
	if ( v[2] > max[2] ) {
		max[2] = v[2];
	}
	if ( v[2] < min[2] ) {
		min[2] = v[2];
	}
}

#endif // DRAWVERT_PADDED

#endif //ENABLE_MINMAX
register vector float v0, v1, v2, v3, v4, v5;
register vector unsigned char permVec;
register vector float v0_low, v0_hi, v1_low, v1_hi;
vector unsigned char oneVector = (vector unsigned char)(1);
register vector float minVec, maxVec;
int i = 0;

// handle unaligned destination elements first
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
	dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
}

// splat min and max into vectors
minVec = loadSplatUnalignedScalar( &min );
maxVec = loadSplatUnalignedScalar( &max );

// calculate permute vector and do the first load
permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
v1_hi = vec_ld( 0, &src[i] );

// vectorize!
for ( ; i+7 < count; i += 8 ) {
	// load source
	v0_low = v1_hi;
	v0_hi = vec_ld( 15, &src[i] );
	v1_low = v0_hi;
	v1_hi = vec_ld( 31, &src[i] );

	v0 = vec_perm( v0_low, v0_hi, permVec );
	v1 = vec_perm( v1_low, v1_hi, permVec );

	// apply minimum
	v2 = vec_max( v0, minVec );
	v3 = vec_max( v1, minVec );

	// apply maximum
	v4 = vec_min( v2, maxVec );
	v5 = vec_min( v3, maxVec );

	ALIGNED_STORE2( &dst[i], v4, v5 );
}

// cleanup
for ( ; i < count ; i++ ) {
	dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
}
register vector float v0, v1, v2, v3;
register vector unsigned char permVec;
register vector float v0_low, v0_hi, v1_low, v1_hi;
register vector float constVec;
vector unsigned char oneVector = (vector unsigned char)(1);
int i = 0;

// handle unaligned destination elements first
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
	dst[i] = src[i] < min ? min : src[i];
}

constVec = loadSplatUnalignedScalar( &min );

permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
v1_hi = vec_ld( 0, &src[i] );

// vectorize!
for ( ; i+7 < count; i += 8 ) {
	v0_low = v1_hi;
	v0_hi = vec_ld( 15, &src[i] );
	v1_low = v0_hi;
	v1_hi = vec_ld( 31, &src[i] );

	v0 = vec_perm( v0_low, v0_hi, permVec );
	v1 = vec_perm( v1_low, v1_hi, permVec );

	v2 = vec_max( v0, constVec );
	v3 = vec_max( v1, constVec );

	ALIGNED_STORE2( &dst[i], v2, v3 );
}

// cleanup
for ( ; i < count ; i++ ) {
	dst[i] = src[i] < min ? min : src[i];
}
register vector float v0, v1, v2, v3;
register vector unsigned char permVec;
register vector float constVec;
register vector float v0_low, v0_hi, v1_low, v1_hi;
vector unsigned char oneVector = (vector unsigned char)(1);
int i = 0;

// handle unaligned destination elements first
// ( the comparison must be "> max", matching the vec_min below )
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
	dst[i] = src[i] > max ? max : src[i];
}

constVec = loadSplatUnalignedScalar( &max );

permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
v1_hi = vec_ld( 0, &src[i] );

// vectorize!
for ( ; i+7 < count; i += 8 ) {
	v0_low = v1_hi;
	v0_hi = vec_ld( 15, &src[i] );
	v1_low = v0_hi;
	v1_hi = vec_ld( 31, &src[i] );

	v0 = vec_perm( v0_low, v0_hi, permVec );
	v1 = vec_perm( v1_low, v1_hi, permVec );
	v2 = vec_min( v0, constVec );
	v3 = vec_min( v1, constVec );

	ALIGNED_STORE2( &dst[i], v2, v3 );
}

// cleanup
for ( ; i < count ; i++ ) {
	dst[i] = src[i] > max ? max : src[i];
}

#ifdef ENABLE_16ROUTINES
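/*
	The *16 routines assume what their name advertises: destinations ( and
	sources, where asserted ) are 16-byte aligned, and the buffers are
	padded so the count can be rounded up with count2 = ( count + 3 ) & ~3.
	That removes the unaligned lead-in and the scalar cleanup loops; only
	an 8-wide main loop and a 4-wide tail remain.
*/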
memset( dst, 0, count * sizeof( float ) );

// negate: dst[i] = -dst[i] over an aligned, padded buffer
assert( IS_16BYTE_ALIGNED( dst[0] ) );

// round count up to the next multiple of four
int count2 = ( count + 3 ) & ~3;
register vector float v0, v1, v2, v3;
int i = 0;

for ( ; i + 7 < count2; i += 8 ) {
	v0 = vec_ld( 0, &dst[i] );
	v1 = vec_ld( 16, &dst[i] );
	v2 = vec_sub( (vector float)(0), v0 );
	v3 = vec_sub( (vector float)(0), v1 );
	ALIGNED_STORE2( &dst[i], v2, v3 );
}

for ( ; i < count2; i += 4 ) {
	v0 = vec_ld( 0, &dst[i] );
	v1 = vec_sub( (vector float)(0), v0 );
	vec_st( v1, 0, &dst[i] );
}
memcpy( dst, src, sizeof( float ) * count );

// dst = src1 + src2
assert( IS_16BYTE_ALIGNED( dst[0] ) );
assert( IS_16BYTE_ALIGNED( src1[0] ) );
assert( IS_16BYTE_ALIGNED( src2[0] ) );

int count2 = ( count + 3 ) & ~3;
register vector float v0, v1, v2, v3, v4, v5;
int i = 0;

for ( ; i+7 < count2; i += 8 ) {
	v0 = vec_ld( 0, &src1[i] );
	v1 = vec_ld( 16, &src1[i] );
	v2 = vec_ld( 0, &src2[i] );
	v3 = vec_ld( 16, &src2[i] );
	v4 = vec_add( v0, v2 );
	v5 = vec_add( v1, v3 );

	ALIGNED_STORE2( &dst[i], v4, v5 );
}

for ( ; i < count2; i += 4 ) {
	v0 = vec_ld( 0, &src1[i] );
	v1 = vec_ld( 0, &src2[i] );
	v2 = vec_add( v0, v1 );
	vec_st( v2, 0, &dst[i] );
}
// dst = src1 - src2
assert( IS_16BYTE_ALIGNED( dst[0] ) );
assert( IS_16BYTE_ALIGNED( src1[0] ) );
assert( IS_16BYTE_ALIGNED( src2[0] ) );

int count2 = ( count + 3 ) & ~3;
register vector float v0, v1, v2, v3, v4, v5;
int i = 0;

for ( ; i+7 < count2; i += 8 ) {
	v0 = vec_ld( 0, &src1[i] );
	v1 = vec_ld( 16, &src1[i] );
	v2 = vec_ld( 0, &src2[i] );
	v3 = vec_ld( 16, &src2[i] );
	v4 = vec_sub( v0, v2 );
	v5 = vec_sub( v1, v3 );

	ALIGNED_STORE2( &dst[i], v4, v5 );
}

for ( ; i < count2; i += 4 ) {
	v0 = vec_ld( 0, &src1[i] );
	v1 = vec_ld( 0, &src2[i] );
	v2 = vec_sub( v0, v1 );
	vec_st( v2, 0, &dst[i] );
}
// dst = constant * src1
assert( IS_16BYTE_ALIGNED( dst[0] ) );
assert( IS_16BYTE_ALIGNED( src1[0] ) );

int count2 = ( count + 3 ) & ~3;
register vector float v0, v1, v2, v3;
register vector float constVec;
register vector float zeroVector = (vector float)(0.0);
int i = 0;

// splat the constant into a vector
constVec = loadSplatUnalignedScalar( &constant );

for ( ; i+7 < count2; i += 8 ) {
	v0 = vec_ld( 0, &src1[i] );
	v1 = vec_ld( 16, &src1[i] );
	v2 = vec_madd( constVec, v0, zeroVector );
	v3 = vec_madd( constVec, v1, zeroVector );
	ALIGNED_STORE2( &dst[i], v2, v3 );
}

for ( ; i < count2; i += 4 ) {
	v0 = vec_ld( 0, &src1[i] );
	v1 = vec_madd( constVec, v0, zeroVector );
	vec_st( v1, 0, &dst[i] );
}
// dst += src
assert( IS_16BYTE_ALIGNED( dst[0] ) );
assert( IS_16BYTE_ALIGNED( src[0] ) );

int count2 = ( count + 3 ) & ~3;
register vector float v0, v1, v2, v3, v4, v5;
int i = 0;

for ( ; i+7 < count2; i += 8 ) {
	v0 = vec_ld( 0, &src[i] );
	v1 = vec_ld( 16, &src[i] );
	v2 = vec_ld( 0, &dst[i] );
	v3 = vec_ld( 16, &dst[i] );
	v4 = vec_add( v0, v2 );
	v5 = vec_add( v1, v3 );
	ALIGNED_STORE2( &dst[i], v4, v5 );
}

for ( ; i < count2; i += 4 ) {
	v0 = vec_ld( 0, &src[i] );
	v1 = vec_ld( 0, &dst[i] );
	v2 = vec_add( v0, v1 );
	vec_st( v2, 0, &dst[i] );
}
// dst -= src
register vector float v0, v1, v2, v3, v4, v5;

assert( IS_16BYTE_ALIGNED( dst[0] ) );
assert( IS_16BYTE_ALIGNED( src[0] ) );

int count2 = ( count + 3 ) & ~3;
int i = 0;

for ( ; i+7 < count2; i += 8 ) {
	v0 = vec_ld( 0, &src[i] );
	v1 = vec_ld( 16, &src[i] );
	v2 = vec_ld( 0, &dst[i] );
	v3 = vec_ld( 16, &dst[i] );
	v4 = vec_sub( v2, v0 );
	v5 = vec_sub( v3, v1 );
	ALIGNED_STORE2( &dst[i], v4, v5 );
}

for ( ; i < count2; i += 4 ) {
	v0 = vec_ld( 0, &src[i] );
	v1 = vec_ld( 0, &dst[i] );
	v2 = vec_sub( v1, v0 );
	vec_st( v2, 0, &dst[i] );
}
// dst *= constant
assert( IS_16BYTE_ALIGNED( dst[0] ) );

int count2 = ( count + 3 ) & ~3;
register vector float v0, v1, v2, v3;
register vector float constVec;
register vector float zeroVector = (vector float)(0.0);
int i = 0;

// splat the constant into a vector
constVec = loadSplatUnalignedScalar( &constant );

for ( ; i+7 < count2; i += 8 ) {
	v0 = vec_ld( 0, &dst[i] );
	v1 = vec_ld( 16, &dst[i] );
	v2 = vec_madd( v0, constVec, zeroVector );
	v3 = vec_madd( v1, constVec, zeroVector );
	ALIGNED_STORE2( &dst[i], v2, v3 );
}

for ( ; i < count2; i += 4 ) {
	v0 = vec_ld( 0, &dst[i] );
	v1 = vec_madd( v0, constVec, zeroVector );
	vec_st( v1, 0, &dst[i] );
}

#endif //ENABLE_16ROUTINES

#ifdef ENABLE_LOWER_TRIANGULAR
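/*
	Lower triangular solve ( forward substitution ) for L x = b with a unit
	diagonal, presumably idSIMD_AltiVec::MatX_LowerTriangularSolve; the
	signature itself is not part of this excerpt:

		x[i] = b[i] - sum( L[i][j] * x[j], j = 0..i-1 )

	The code below computes the dot products of four consecutive rows of L
	against the already-solved prefix of x with vector madds, then resolves
	the last few terms and the dependencies among the four rows in scalar
	code.
*/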
vector float vecSum1 = (vector float)(0.0);
vector float vecSum2 = (vector float)(0.0);
vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
vector float zeroVector = (vector float)(0.0);
vector float vecSum3, vecSum4, vecSum5, vecSum6, vecSum7, vecSum8;

vector unsigned char vecPermX = vec_add( vec_lvsl( -1, &x[0] ), (vector unsigned char)(1) );

// process four rows of L at a time
for ( i = skip; i+3 < n; i+=4 ) {

	vecSum1 = zeroVector;
	vecSum2 = zeroVector;
	vecSum3 = vecSum4 = vecSum5 = vecSum6 = vecSum7 = vecSum8 = zeroVector;

	vector unsigned char vecPermLptr1 = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
	vector unsigned char vecPermLptr2 = vec_add( vec_lvsl( -1, lptr2 ), (vector unsigned char)(1) );
	vector unsigned char vecPermLptr3 = vec_add( vec_lvsl( -1, lptr3 ), (vector unsigned char)(1) );
	vector unsigned char vecPermLptr4 = vec_add( vec_lvsl( -1, lptr4 ), (vector unsigned char)(1) );

	for ( j = 0 ; j+7 < i; j+=8 ) {
		// load x
		v0 = vec_ld( 0, &x[j] );
		v1 = vec_ld( 15, &x[j] );
		vector float vecExtraX = vec_ld( 31, &x[j] );
		v0 = vec_perm( v0, v1, vecPermX );
		v1 = vec_perm( v1, vecExtraX, vecPermX );

		// load row i
		v2 = vec_ld( 0, lptr + j );
		v3 = vec_ld( 15, lptr + j );
		vector float vecExtra1 = vec_ld( 31, lptr + j );
		v2 = vec_perm( v2, v3, vecPermLptr1 );
		v3 = vec_perm( v3, vecExtra1, vecPermLptr1 );

		// load row i+1
		v4 = vec_ld( 0, lptr2 + j );
		v5 = vec_ld( 15, lptr2 + j );
		vector float vecExtra2 = vec_ld( 31, lptr2 + j );
		v4 = vec_perm( v4, v5, vecPermLptr2 );
		v5 = vec_perm( v5, vecExtra2, vecPermLptr2 );

		// load row i+2
		v6 = vec_ld( 0, lptr3 + j );
		v7 = vec_ld( 15, lptr3 + j );
		vector float vecExtra3 = vec_ld( 31, lptr3 + j );
		v6 = vec_perm( v6, v7, vecPermLptr3 );
		v7 = vec_perm( v7, vecExtra3, vecPermLptr3 );

		// load row i+3
		v8 = vec_ld( 0, lptr4 + j );
		v9 = vec_ld( 15, lptr4 + j );
		vector float vecExtra4 = vec_ld( 31, lptr4 + j );
		v8 = vec_perm( v8, v9, vecPermLptr4 );
		v9 = vec_perm( v9, vecExtra4, vecPermLptr4 );

		// accumulate the dot products of each row against x
		vecSum1 = vec_madd( v2, v0, vecSum1 );
		vecSum2 = vec_madd( v3, v1, vecSum2 );
		vecSum3 = vec_madd( v4, v0, vecSum3 );
		vecSum4 = vec_madd( v5, v1, vecSum4 );
		vecSum5 = vec_madd( v6, v0, vecSum5 );
		vecSum6 = vec_madd( v7, v1, vecSum6 );
		vecSum7 = vec_madd( v8, v0, vecSum7 );
		vecSum8 = vec_madd( v9, v1, vecSum8 );
	}

	// combine the paired accumulators
	vecSum1 = vec_add( vecSum1, vecSum2 );
	vecSum3 = vec_add( vecSum3, vecSum4 );
	vecSum5 = vec_add( vecSum5, vecSum6 );
	vecSum7 = vec_add( vecSum7, vecSum8 );

	// sum each accumulator across its lanes
	vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
	vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
	vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 8 ) );
	vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 4 ) );
	vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 8 ) );
	vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 4 ) );
	vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 8 ) );
	vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 4 ) );

	// store out the partial sums
	vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
	vec_ste( vec_splat( vecSum3, 0 ), 0, &tempSum2 );
	vec_ste( vec_splat( vecSum5, 0 ), 0, &tempSum3 );
	vec_ste( vec_splat( vecSum7, 0 ), 0, &tempSum4 );

	// finish the last few terms in scalar code
	for ( ; j < i; j++ ) {
		sum -= lptr[j] * x[j];
		sum2 -= lptr2[j] * x[j];
		sum3 -= lptr3[j] * x[j];
		sum4 -= lptr4[j] * x[j];
	}

	// propagate the dependencies between the four rows
	sum2 -= ( lptr2[i] * sum );
	sum3 = sum3 - ( lptr3[i+1] * sum2 ) - ( lptr3[i] * sum );
	sum4 = sum4 - ( lptr4[i+2] * sum3 ) - ( lptr4[i+1] * sum2 ) - ( lptr4[i] * sum );

	x[i] = sum;
	x[i+1] = sum2;
	x[i+2] = sum3;
	x[i+3] = sum4;
}
// handle the remaining rows one at a time
for ( ; i < n; i++ ) {

	vecSum1 = zeroVector;
	vecSum2 = zeroVector;

	vector unsigned char vecPermLptr = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );

	for ( j = 0 ; j+7 < i; j+=8 ) {
		// load x
		v0 = vec_ld( 0, &x[j] );
		v2 = vec_ld( 15, &x[j] );
		vector float vecExtraX = vec_ld( 31, &x[j] );
		v0 = vec_perm( v0, v2, vecPermX );
		v2 = vec_perm( v2, vecExtraX, vecPermX );

		// load row i
		v1 = vec_ld( 0, lptr + j );
		v3 = vec_ld( 15, lptr + j );
		vector float vecExtra = vec_ld( 31, lptr + j );
		v1 = vec_perm( v1, v3, vecPermLptr );
		v3 = vec_perm( v3, vecExtra, vecPermLptr );

		vecSum1 = vec_madd( v1, v0, vecSum1 );
		vecSum2 = vec_madd( v3, v2, vecSum2 );
	}

	// sum across the accumulators and store the partial sum
	vecSum1 = vec_add( vecSum1, vecSum2 );
	vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
	vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
	vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );

	// finish the last few terms in scalar code
	for ( ; j < i; j++ ) {
		sum -= lptr[j] * x[j];
	}

	x[i] = sum;
}
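/*
	Solving the transposed system L^T x = b runs the substitution from the
	bottom up, since row i only depends on the rows below it.  For small n
	the solver is fully unrolled; each block starts from x[n-1] = b[n-1]
	and substitutes downwards, e.g. for n == 3:

		x2 = b[2];
		x1 = b[1] - lptr[2*nc+1] * x2;
		x0 = b[0] - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
*/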
float x0, x1, x2, x3, x4, x5, x6;

// n == 2
x1 = b[1];
x0 = b[0] - lptr[1*nc+0] * x1;

// n == 3
x2 = b[2];
x1 = b[1] - lptr[2*nc+1] * x2;
x0 = b[0] - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;

// n == 4
x3 = b[3];
x2 = b[2] - lptr[3*nc+2] * x3;
x1 = b[1] - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
x0 = b[0] - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;

// n == 5
x4 = b[4];
x3 = b[3] - lptr[4*nc+3] * x4;
x2 = b[2] - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
x1 = b[1] - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
x0 = b[0] - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;

// n == 6
x5 = b[5];
x4 = b[4] - lptr[5*nc+4] * x5;
x3 = b[3] - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
x2 = b[2] - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
x1 = b[1] - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
x0 = b[0] - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;

// n == 7
x6 = b[6];
x5 = b[5] - lptr[6*nc+5] * x6;
x4 = b[4] - lptr[6*nc+4] * x6 - lptr[5*nc+4] * x5;
x3 = b[3] - lptr[6*nc+3] * x6 - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
x2 = b[2] - lptr[6*nc+2] * x6 - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
x1 = b[1] - lptr[6*nc+1] * x6 - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
x0 = b[0] - lptr[6*nc+0] * x6 - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
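/*
	General case of the transposed solve: four rows are retired per outer
	iteration.  The inner j loop subtracts the contributions of the rows
	already solved ( s0..s3 accumulate one row each ), and the six cross
	terms between the block's own four rows are folded in afterwards using
	the negative-stride lptr[0-1*nc] style indexing.
*/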
register float s0, s1, s2, s3;

for ( i = n; i >= 4; i -= 4 ) {

	for ( j = 0; j < n-i; j += 4 ) {
		s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
		s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
		s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
		s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
		s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
		s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
		s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
		s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
		s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
		s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
		s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
		s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
		s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
		s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
		s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
		s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
	}

	s0 -= lptr[0-1*nc] * s3;
	s1 -= lptr[1-1*nc] * s3;
	s2 -= lptr[2-1*nc] * s3;
	s0 -= lptr[0-2*nc] * s2;
	s1 -= lptr[1-2*nc] * s2;
	s0 -= lptr[0-3*nc] * s1;
}

for ( i--; i >= 0; i-- ) {

	for ( j = i + 1; j < n; j++ ) {
		s0 -= lptr[j*nc] * x[j];
	}
}
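/*
	In-place LDL^T factorization of a symmetric matrix: A = L D L^T with a
	unit lower triangular L and diagonal D.  invDiag[i] caches 1.0f / D[i]
	so the solves can multiply rather than divide, and a zero pivot aborts
	the factorization.  The first four rows are peeled off explicitly; the
	general loop uses the scratch arrays v[] and diag[] plus fall-through
	switches for the remainders of the unrolled dot products.
*/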
float *v, *diag, *mptr;
float s0, s1, s2, s3, sum, d;
float s0_2, s1_2, s2_2, s3_2, sum_2;

v = (float *) _alloca16( n * sizeof( float ) );
diag = (float *) _alloca16( n * sizeof( float ) );

// row 0
sum = mptr[0];
if ( sum == 0.0f ) {
	return false;
}

invDiag[0] = d = 1.0f / sum;

for ( j = 1; j < n; j++ ) {
	mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
}

// row 1
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
sum = mptr[1] - s0;

if ( sum == 0.0f ) {
	return false;
}

invDiag[1] = d = 1.0f / sum;

for ( j = 2; j < n; j++ ) {
	mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
}

// row 2
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
sum = mptr[2] - s0 - s1;

if ( sum == 0.0f ) {
	return false;
}

invDiag[2] = d = 1.0f / sum;

for ( j = 3; j < n; j++ ) {
	mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
}

// row 3
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
sum = mptr[3] - s0 - s1 - s2;

if ( sum == 0.0f ) {
	return false;
}

invDiag[3] = d = 1.0f / sum;

for ( j = 4; j < n; j++ ) {
	mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
}

// remaining rows
for ( i = 4; i < n; i++ ) {

	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
	v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
	v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
	for ( k = 4; k < i-3; k += 4 ) {
		v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
		v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
		v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
		v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
	}
	switch( i - k ) {
		case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
		case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
		case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
	}

	sum = s0 + s1 + s2 + s3;
	sum = mptr[i] - sum;

	if ( sum == 0.0f ) {
		return false;
	}

	invDiag[i] = d = 1.0f / sum;

	// factor the rows below row i, two at a time
	mptr2 = mat[i+1] + nc;

	for ( j = i+1; j+1 < n; j+=2 ) {
		s0 = mptr[0] * v[0];
		s1 = mptr[1] * v[1];
		s2 = mptr[2] * v[2];
		s3 = mptr[3] * v[3];

		s0_2 = mptr2[0] * v[0];
		s1_2 = mptr2[1] * v[1];
		s2_2 = mptr2[2] * v[2];
		s3_2 = mptr2[3] * v[3];

		for ( k = 4; k < i-7; k += 8 ) {
			s0 += mptr[k+0] * v[k+0];
			s1 += mptr[k+1] * v[k+1];
			s2 += mptr[k+2] * v[k+2];
			s3 += mptr[k+3] * v[k+3];
			s0 += mptr[k+4] * v[k+4];
			s1 += mptr[k+5] * v[k+5];
			s2 += mptr[k+6] * v[k+6];
			s3 += mptr[k+7] * v[k+7];

			s0_2 += mptr2[k+0] * v[k+0];
			s1_2 += mptr2[k+1] * v[k+1];
			s2_2 += mptr2[k+2] * v[k+2];
			s3_2 += mptr2[k+3] * v[k+3];
			s0_2 += mptr2[k+4] * v[k+4];
			s1_2 += mptr2[k+5] * v[k+5];
			s2_2 += mptr2[k+6] * v[k+6];
			s3_2 += mptr2[k+7] * v[k+7];
		}
		switch( i - k ) {
			case 7: s0 += mptr[k+6] * v[k+6]; s0_2 += mptr2[k+6] * v[k+6];
			case 6: s1 += mptr[k+5] * v[k+5]; s1_2 += mptr2[k+5] * v[k+5];
			case 5: s2 += mptr[k+4] * v[k+4]; s2_2 += mptr2[k+4] * v[k+4];
			case 4: s3 += mptr[k+3] * v[k+3]; s3_2 += mptr2[k+3] * v[k+3];
			case 3: s0 += mptr[k+2] * v[k+2]; s0_2 += mptr2[k+2] * v[k+2];
			case 2: s1 += mptr[k+1] * v[k+1]; s1_2 += mptr2[k+1] * v[k+1];
			case 1: s2 += mptr[k+0] * v[k+0]; s2_2 += mptr2[k+0] * v[k+0];
		}

		sum = s0 + s1 + s2 + s3;
		sum_2 = s1_2 + s3_2;
		sum_2 += s0_2 + s2_2;

		mptr[i] = ( mptr[i] - sum ) * d;
		mptr2[i] = ( mptr2[i] - sum_2 ) * d;
	}

	// handle an odd leftover row
	for ( ; j < n; j++ ) {
		s0 = mptr[0] * v[0];
		s1 = mptr[1] * v[1];
		s2 = mptr[2] * v[2];
		s3 = mptr[3] * v[3];
		for ( k = 4; k < i-7; k += 8 ) {
			s0 += mptr[k+0] * v[k+0];
			s1 += mptr[k+1] * v[k+1];
			s2 += mptr[k+2] * v[k+2];
			s3 += mptr[k+3] * v[k+3];
			s0 += mptr[k+4] * v[k+4];
			s1 += mptr[k+5] * v[k+5];
			s2 += mptr[k+6] * v[k+6];
			s3 += mptr[k+7] * v[k+7];
		}
		switch( i - k ) {
			case 7: s0 += mptr[k+6] * v[k+6];
			case 6: s1 += mptr[k+5] * v[k+5];
			case 5: s2 += mptr[k+4] * v[k+4];
			case 4: s3 += mptr[k+3] * v[k+3];
			case 3: s0 += mptr[k+2] * v[k+2];
			case 2: s1 += mptr[k+1] * v[k+1];
			case 1: s2 += mptr[k+0] * v[k+0];
		}

		sum = s0 + s1 + s2 + s3;
		mptr[i] = ( mptr[i] - sum ) * d;
	}
}

#endif //ENABLE_LOWER_TRIANGULAR
#ifdef LIVE_VICARIOUSLY

// joint blending: joints = lerp( joints, blendJoints ) over the indexed set
if ( lerp <= 0.0f ) {
	return;
}
if ( lerp >= 1.0f ) {
	// full weight: take the blend joints outright
	for ( i = 0; i < numJoints; i++ ) {
		int j = index[i];
		joints[j] = blendJoints[j];
	}
	return;
}

vector float vecLerp = loadSplatUnalignedScalar( &lerp );
vector float zeroVector = (vector float)(0);
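/*
	Quaternion blend, four joints at a time.  The xyzw components are
	transposed into vecFromX..W and vecToX..W so each vector op handles one
	component of four quats.  Standard slerp follows: cosom = from . to,
	"to" negated where cosom < 0 to take the short arc, then

		scale0 = sin( ( 1 - lerp ) * omega ) / sin( omega )
		scale1 = sin( lerp * omega ) / sin( omega )

	when the angle is large enough, with plain ( 1 - lerp ) and lerp
	weights selected per lane otherwise.
*/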
for ( i = 0; i+3 < numJoints; i+=4 ) {
	int j1 = index[i];
	int j2 = index[i+1];
	int j3 = index[i+2];
	int j4 = index[i+3];

	const float *jointPtr = joints[j1].q.ToFloatPtr();
	const float *blendPtr = blendJoints[j1].q.ToFloatPtr();
	const float *jointPtr2 = joints[j2].q.ToFloatPtr();
	const float *blendPtr2 = blendJoints[j2].q.ToFloatPtr();
	const float *jointPtr3 = joints[j3].q.ToFloatPtr();
	const float *blendPtr3 = blendJoints[j3].q.ToFloatPtr();
	const float *jointPtr4 = joints[j4].q.ToFloatPtr();
	const float *blendPtr4 = blendJoints[j4].q.ToFloatPtr();

	vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
	vector unsigned char permVec2 = vec_add( vec_lvsl( -1, jointPtr2 ), (vector unsigned char)(1) );
	vector unsigned char permVec3 = vec_add( vec_lvsl( -1, jointPtr3 ), (vector unsigned char)(1) );
	vector unsigned char permVec4 = vec_add( vec_lvsl( -1, jointPtr4 ), (vector unsigned char)(1) );

	vector unsigned char permVec5 = vec_add( vec_lvsl( -1, blendPtr ), (vector unsigned char)(1) );
	vector unsigned char permVec6 = vec_add( vec_lvsl( -1, blendPtr2 ), (vector unsigned char)(1) );
	vector unsigned char permVec7 = vec_add( vec_lvsl( -1, blendPtr3 ), (vector unsigned char)(1) );
	vector unsigned char permVec8 = vec_add( vec_lvsl( -1, blendPtr4 ), (vector unsigned char)(1) );

	vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
	vector float v12, v13, v14, v15, v16;
	vector float vecFromX, vecFromY, vecFromZ, vecFromW;
	vector float vecToX, vecToY, vecToZ, vecToW;

	// load the four "from" quaternions
	v0 = vec_ld( 0, jointPtr );
	v1 = vec_ld( 15, jointPtr );
	v2 = vec_perm( v0, v1, permVec );

	v3 = vec_ld( 0, jointPtr2 );
	v4 = vec_ld( 15, jointPtr2 );
	v5 = vec_perm( v3, v4, permVec2 );

	v6 = vec_ld( 0, jointPtr3 );
	v7 = vec_ld( 15, jointPtr3 );
	v8 = vec_perm( v6, v7, permVec3 );

	v9 = vec_ld( 0, jointPtr4 );
	v10 = vec_ld( 15, jointPtr4 );
	v11 = vec_perm( v9, v10, permVec4 );

	// transpose so each vector holds one component of four quats
	v0 = vec_mergeh( v2, v8 );
	v1 = vec_mergeh( v5, v11 );
	v3 = vec_mergel( v2, v8 );
	v4 = vec_mergel( v5, v11 );

	vecFromX = vec_mergeh( v0, v1 );
	vecFromY = vec_mergel( v0, v1 );
	vecFromZ = vec_mergeh( v3, v4 );
	vecFromW = vec_mergel( v3, v4 );

	// load and transpose the four "to" ( blend ) quaternions
	v5 = vec_ld( 0, blendPtr );
	v6 = vec_ld( 15, blendPtr );
	v7 = vec_perm( v5, v6, permVec5 );

	v8 = vec_ld( 0, blendPtr2 );
	v9 = vec_ld( 15, blendPtr2 );
	v10 = vec_perm( v8, v9, permVec6 );

	v11 = vec_ld( 0, blendPtr3 );
	v12 = vec_ld( 15, blendPtr3 );
	v13 = vec_perm( v11, v12, permVec7 );

	v14 = vec_ld( 0, blendPtr4 );
	v15 = vec_ld( 15, blendPtr4 );
	v16 = vec_perm( v14, v15, permVec8 );

	v5 = vec_mergeh( v7, v13 );
	v6 = vec_mergeh( v10, v16 );
	v8 = vec_mergel( v7, v13 );
	v9 = vec_mergel( v10, v16 );

	vecToX = vec_mergeh( v5, v6 );
	vecToY = vec_mergel( v5, v6 );
	vecToZ = vec_mergeh( v8, v9 );
	vecToW = vec_mergel( v8, v9 );

	// cosom = from . to for all four quats at once
	vector float vecCosom = vec_madd( vecFromX, vecToX, (vector float)(0) );
	vecCosom = vec_madd( vecFromY, vecToY, vecCosom );
	vecCosom = vec_madd( vecFromZ, vecToZ, vecCosom );
	vecCosom = vec_madd( vecFromW, vecToW, vecCosom );

	// take the shortest arc: negate "to" wherever cosom < 0
	vector bool int vecCmp, vecCmp2;
	vecCmp = vec_cmplt( vecCosom, zeroVector );

	vecToX = vec_sel( vecToX, vec_madd( vecToX, (vector float)(-1), zeroVector ), vecCmp );
	vecToY = vec_sel( vecToY, vec_madd( vecToY, (vector float)(-1), zeroVector ), vecCmp );
	vecToZ = vec_sel( vecToZ, vec_madd( vecToZ, (vector float)(-1), zeroVector ), vecCmp );
	vecToW = vec_sel( vecToW, vec_madd( vecToW, (vector float)(-1), zeroVector ), vecCmp );
	vecCosom = vec_sel( vecCosom, vec_madd( vecCosom, (vector float)(-1), zeroVector ), vecCmp );

	// use full slerp weights only where the angle is large enough
	vecCmp2 = vec_cmpgt( vec_sub( (vector float)(1), vecCosom ), (vector float)(1e-6f) );
	vector float vecScale0 = vec_sub( (vector float)(1), vecLerp );
	vector float vecScale1 = vec_splat( vecLerp, 0 );

	vector float vecWork1 = vec_sub( (vector float)(1), vec_madd( vecCosom, vecCosom, zeroVector ) );
	vector float vecWork2 = ReciprocalSquareRoot( vecWork1 );
	vector float vecWork3 = VectorATan16( vec_madd( vecWork1, vecWork2, zeroVector ), vecCosom );

	vecWork1 = vec_madd( VectorSin16( vec_madd( vecScale0, vecWork3, zeroVector ) ), vecWork2, zeroVector );
	vecWork2 = vec_madd( VectorSin16( vec_madd( vecLerp, vecWork3, zeroVector ) ), vecWork2, zeroVector );

	vecScale0 = vec_sel( vecScale0, vecWork1, vecCmp2 );
	vecScale1 = vec_sel( vecScale1, vecWork2, vecCmp2 );

	// result = scale0 * from + scale1 * to
	vecFromX = vec_madd( vecFromX, vecScale0, zeroVector );
	vecFromY = vec_madd( vecFromY, vecScale0, zeroVector );
	vecFromZ = vec_madd( vecFromZ, vecScale0, zeroVector );
	vecFromW = vec_madd( vecFromW, vecScale0, zeroVector );

	vecFromX = vec_madd( vecToX, vecScale1, vecFromX );
	vecFromY = vec_madd( vecToY, vecScale1, vecFromY );
	vecFromZ = vec_madd( vecToZ, vecScale1, vecFromZ );
	vecFromW = vec_madd( vecToW, vecScale1, vecFromW );

	// transpose back to xyzw quaternions
	v5 = vec_mergeh( vecFromX, vecFromZ );
	v6 = vec_mergeh( vecFromY, vecFromW );
	v8 = vec_mergel( vecFromX, vecFromZ );
	v9 = vec_mergel( vecFromY, vecFromW );

	vecToX = vec_mergeh( v5, v6 );
	vecToY = vec_mergel( v5, v6 );
	vecToZ = vec_mergeh( v8, v9 );
	vecToW = vec_mergel( v8, v9 );

	// rotate into position for the unaligned scatter stores
	vector unsigned char storePerm1 = vec_lvsr( 0, jointPtr );
	vector unsigned char storePerm2 = vec_lvsr( 0, jointPtr2 );
	vector unsigned char storePerm3 = vec_lvsr( 0, jointPtr3 );
	vector unsigned char storePerm4 = vec_lvsr( 0, jointPtr4 );

	vecToX = vec_perm( vecToX, vecToX, storePerm1 );
	vecToY = vec_perm( vecToY, vecToY, storePerm2 );
	vecToZ = vec_perm( vecToZ, vecToZ, storePerm3 );
	vecToW = vec_perm( vecToW, vecToW, storePerm4 );

	vec_ste( vecToX, 0, (float*) jointPtr );
	vec_ste( vecToX, 4, (float*) jointPtr );
	vec_ste( vecToX, 8, (float*) jointPtr );
	vec_ste( vecToX, 12, (float*) jointPtr );

	vec_ste( vecToY, 0, (float*) jointPtr2 );
	vec_ste( vecToY, 4, (float*) jointPtr2 );
	vec_ste( vecToY, 8, (float*) jointPtr2 );
	vec_ste( vecToY, 12, (float*) jointPtr2 );

	vec_ste( vecToZ, 0, (float*) jointPtr3 );
	vec_ste( vecToZ, 4, (float*) jointPtr3 );
	vec_ste( vecToZ, 8, (float*) jointPtr3 );
	vec_ste( vecToZ, 12, (float*) jointPtr3 );

	vec_ste( vecToW, 0, (float*) jointPtr4 );
	vec_ste( vecToW, 4, (float*) jointPtr4 );
	vec_ste( vecToW, 8, (float*) jointPtr4 );
	vec_ste( vecToW, 12, (float*) jointPtr4 );
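	/*
		Translation part of the blend: load the three xyz floats of each
		joint ( offsets 0 and 11 cover all twelve bytes ), transpose into
		X/Y/Z vectors, lerp as t + lerp * ( blend_t - t ), transpose back
		and scatter three floats per joint with vec_ste.
	*/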
	float *jointVecPtr = (float*)( jointPtr + 4 );
	float *jointVecPtr2 = (float*)( jointPtr2 + 4 );
	float *jointVecPtr3 = (float*)( jointPtr3 + 4 );
	float *jointVecPtr4 = (float*)( jointPtr4 + 4 );

	v0 = vec_ld( 0, jointVecPtr );
	v1 = vec_ld( 11, jointVecPtr );
	vector float vecLd1 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, jointVecPtr ), (vector unsigned char)(1) ) );

	v2 = vec_ld( 0, jointVecPtr2 );
	v3 = vec_ld( 11, jointVecPtr2 );
	vector float vecLd2 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, jointVecPtr2 ), (vector unsigned char)(1) ) );

	v4 = vec_ld( 0, jointVecPtr3 );
	v5 = vec_ld( 11, jointVecPtr3 );
	vector float vecLd3 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, jointVecPtr3 ), (vector unsigned char)(1) ) );

	v6 = vec_ld( 0, jointVecPtr4 );
	v7 = vec_ld( 11, jointVecPtr4 );
	vector float vecLd4 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, jointVecPtr4 ), (vector unsigned char)(1) ) );

	vector float vecVecX, vecVecY, vecVecZ;
	vecVecX = vecVecY = vecVecZ = zeroVector;

	// transpose the joint translations into X/Y/Z vectors
	v0 = vec_mergeh( vecLd1, vecLd3 );
	v1 = vec_mergeh( vecLd2, vecLd4 );
	v3 = vec_mergel( vecLd1, vecLd3 );
	v4 = vec_mergel( vecLd2, vecLd4 );

	vecVecX = vec_mergeh( v0, v1 );
	vecVecY = vec_mergel( v0, v1 );
	vecVecZ = vec_mergeh( v3, v4 );

	// load the blend translations
	float *blendVecPtr = (float*)( blendPtr + 4 );
	float *blendVecPtr2 = (float*)( blendPtr2 + 4 );
	float *blendVecPtr3 = (float*)( blendPtr3 + 4 );
	float *blendVecPtr4 = (float*)( blendPtr4 + 4 );

	v0 = vec_ld( 0, blendVecPtr );
	v1 = vec_ld( 11, blendVecPtr );
	vector float vecLd5 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, blendVecPtr ), (vector unsigned char)(1) ) );

	v2 = vec_ld( 0, blendVecPtr2 );
	v3 = vec_ld( 11, blendVecPtr2 );
	vector float vecLd6 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, blendVecPtr2 ), (vector unsigned char)(1) ) );

	v4 = vec_ld( 0, blendVecPtr3 );
	v5 = vec_ld( 11, blendVecPtr3 );
	vector float vecLd7 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, blendVecPtr3 ), (vector unsigned char)(1) ) );

	v6 = vec_ld( 0, blendVecPtr4 );
	v7 = vec_ld( 11, blendVecPtr4 );
	vector float vecLd8 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, blendVecPtr4 ), (vector unsigned char)(1) ) );

	vector float vecBlendX, vecBlendY, vecBlendZ;
	vecBlendX = vecBlendY = vecBlendZ = zeroVector;

	// transpose the blend translations as well
	v0 = vec_mergeh( vecLd5, vecLd7 );
	v1 = vec_mergeh( vecLd6, vecLd8 );
	v3 = vec_mergel( vecLd5, vecLd7 );
	v4 = vec_mergel( vecLd6, vecLd8 );

	vecBlendX = vec_mergeh( v0, v1 );
	vecBlendY = vec_mergel( v0, v1 );
	vecBlendZ = vec_mergeh( v3, v4 );

	// lerp: v + lerp * ( blend - v )
	vecWork1 = vec_sub( vecBlendX, vecVecX );
	vecWork2 = vec_sub( vecBlendY, vecVecY );
	vecWork3 = vec_sub( vecBlendZ, vecVecZ );

	vecVecX = vec_madd( vecWork1, vecLerp, vecVecX );
	vecVecY = vec_madd( vecWork2, vecLerp, vecVecY );
	vecVecZ = vec_madd( vecWork3, vecLerp, vecVecZ );

	// transpose back into xyz triples
	v0 = vec_mergeh( vecVecX, vecVecZ );
	v1 = vec_mergeh( vecVecY, zeroVector );
	v3 = vec_mergel( vecVecX, vecVecZ );
	v4 = vec_mergel( vecVecY, zeroVector );

	vecWork1 = vec_mergeh( v0, v1 );
	vecWork2 = vec_mergel( v0, v1 );
	vecWork3 = vec_mergeh( v3, v4 );
	vector float vecWork4 = vec_mergel( v3, v4 );

	// rotate into store position
	storePerm1 = vec_lvsr( 0, jointVecPtr );
	storePerm2 = vec_lvsr( 0, jointVecPtr2 );
	storePerm3 = vec_lvsr( 0, jointVecPtr3 );
	storePerm4 = vec_lvsr( 0, jointVecPtr4 );

	vecWork1 = vec_perm( vecWork1, vecWork1, storePerm1 );
	vecWork2 = vec_perm( vecWork2, vecWork2, storePerm2 );
	vecWork3 = vec_perm( vecWork3, vecWork3, storePerm3 );
	vecWork4 = vec_perm( vecWork4, vecWork4, storePerm4 );

	// store three floats per joint
	vec_ste( vecWork1, 0, (float*) jointVecPtr );
	vec_ste( vecWork1, 4, (float*) jointVecPtr );
	vec_ste( vecWork1, 8, (float*) jointVecPtr );

	vec_ste( vecWork2, 0, (float*) jointVecPtr2 );
	vec_ste( vecWork2, 4, (float*) jointVecPtr2 );
	vec_ste( vecWork2, 8, (float*) jointVecPtr2 );

	vec_ste( vecWork3, 0, (float*) jointVecPtr3 );
	vec_ste( vecWork3, 4, (float*) jointVecPtr3 );
	vec_ste( vecWork3, 8, (float*) jointVecPtr3 );

	vec_ste( vecWork4, 0, (float*) jointVecPtr4 );
	vec_ste( vecWork4, 4, (float*) jointVecPtr4 );
	vec_ste( vecWork4, 8, (float*) jointVecPtr4 );
}
// scalar cleanup for the joints the vector loop did not reach
for ( ; i < numJoints; i++ ) {
	int j = index[i];
	joints[j].q.Slerp( joints[j].q, blendJoints[j].q, lerp );
	joints[j].t.Lerp( joints[j].t, blendJoints[j].t, lerp );
}
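/*
	Quaternion to 3x4 matrix conversion.  With x2 = 2 * q.x and so on, the
	products below are xx = 2x^2, xy = 2xy, wx = 2wx, ..., giving the usual
	identities such as m[0][0] = 1 - 2y^2 - 2z^2.  The off-diagonal entries
	each combine one xy-style and one w-style product, and the joint
	translation is copied into the fourth column.
*/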
for ( int i = 0; i < numJoints; i++ ) {
	const float *q = jointQuats[i].q.ToFloatPtr();
	float *m = jointMats[i].ToFloatPtr();

	float x2 = q[0] + q[0];
	float y2 = q[1] + q[1];
	float z2 = q[2] + q[2];

	// diagonal terms
	float xx = q[0] * x2;
	float yy = q[1] * y2;
	float zz = q[2] * z2;

	m[0*4+0] = 1.0f - yy - zz;
	m[1*4+1] = 1.0f - xx - zz;
	m[2*4+2] = 1.0f - xx - yy;

	// off-diagonal products
	float yz = q[1] * z2;
	float wx = q[3] * x2;

	float xy = q[0] * y2;
	float wz = q[3] * z2;

	float xz = q[0] * z2;
	float wy = q[3] * y2;
}
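/*
	Matrix to quaternion conversion using Shoemake's method: a positive
	trace yields the quaternion directly from the trace, otherwise the
	largest diagonal element is located and the quaternion is rebuilt
	around that axis, with next[3] = { 1, 2, 0 } cycling i -> j -> k.
	FastScalarInvSqrt( t ) * 0.5f supplies the shared 1 / ( 2 * sqrt( t ) )
	factor.
*/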
for ( index = 0; index < numJoints; index++ ) {
	idJointQuat jq;
	float trace, s, t;

	static int next[3] = { 1, 2, 0 };

	float *mat = (float*)( jointMats[index].ToFloatPtr() );
	trace = mat[0 * 4 + 0] + mat[1 * 4 + 1] + mat[2 * 4 + 2];

	if ( trace > 0.0f ) {

		t = trace + 1.0f;
		s = FastScalarInvSqrt( t ) * 0.5f;

		jq.q[3] = s * t;
		jq.q[0] = ( mat[1 * 4 + 2] - mat[2 * 4 + 1] ) * s;
		jq.q[1] = ( mat[2 * 4 + 0] - mat[0 * 4 + 2] ) * s;
		jq.q[2] = ( mat[0 * 4 + 1] - mat[1 * 4 + 0] ) * s;

	} else {

		int i = 0;
		if ( mat[1 * 4 + 1] > mat[0 * 4 + 0] ) {
			i = 1;
		}
		if ( mat[2 * 4 + 2] > mat[i * 4 + i] ) {
			i = 2;
		}
		int j = next[i];
		int k = next[j];

		t = ( mat[i * 4 + i] - ( mat[j * 4 + j] + mat[k * 4 + k] ) ) + 1.0f;
		s = FastScalarInvSqrt( t ) * 0.5f;

		jq.q[i] = s * t;
		jq.q[3] = ( mat[j * 4 + k] - mat[k * 4 + j] ) * s;
		jq.q[j] = ( mat[i * 4 + j] + mat[j * 4 + i] ) * s;
		jq.q[k] = ( mat[i * 4 + k] + mat[k * 4 + i] ) * s;
	}

	jq.t[0] = mat[0 * 4 + 3];
	jq.t[1] = mat[1 * 4 + 3];
	jq.t[2] = mat[2 * 4 + 3];
	jointQuats[index] = jq;
}
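// transform joints: concatenate each joint matrix with its parent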
for( i = firstJoint; i <= lastJoint; i++ ) {
	assert( parents[i] < i );
	jointMats[i] *= jointMats[parents[i]];
}
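// AltiVec path: load both 3x4 matrices with unaligned loads and build the
// product in three madd passes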
for ( i = firstJoint; i <= lastJoint; i++ ) {
	assert( parents[i] < i );

	float *jointPtr = jointMats[i].ToFloatPtr();
	float *parentPtr = jointMats[parents[i]].ToFloatPtr();

	vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
	vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
	vector float v0, v1, v2, v3, v4, v5, v6, v7;
	v0 = vec_ld( 0, jointPtr );
	v1 = vec_ld( 15, jointPtr );
	v2 = vec_ld( 31, jointPtr );
	v3 = vec_ld( 47, jointPtr );

	v4 = vec_ld( 0, parentPtr );
	v5 = vec_ld( 15, parentPtr );
	v6 = vec_ld( 31, parentPtr );
	v7 = vec_ld( 47, parentPtr );

	vector float vecJointMat1 = vec_perm( v0, v1, permVec );
	vector float vecJointMat2 = vec_perm( v1, v2, permVec );
	vector float vecJointMat3 = vec_perm( v2, v3, permVec );

	vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
	vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
	vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );

	vector float zero = (vector float)(0);
	vector float C1, C2, C3;
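	// each result row is a combination of the joint rows weighted by one
	// parent row, i.e. result = parent * joint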
	C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero );
	C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat2, 0 ), zero );
	C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat3, 0 ), zero );

	C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat1, 1 ), C1 );
	C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 );
	C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat3, 1 ), C3 );

	C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat1, 2 ), C1 );
	C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat2, 2 ), C2 );
	C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
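	// fold in the parent translation: permZeroAndLast pulls the last element
	// of each parent row (the translation) and adds it as (0,0,0,t)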
	vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
	C1 = vec_add( C1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
	C2 = vec_add( C2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
	C3 = vec_add( C3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );

	UNALIGNED_STORE3( (float*) jointPtr, C1, C2, C3 );
for( i = lastJoint; i >= firstJoint; i-- ) {
	assert( parents[i] < i );
	jointMats[i] /= jointMats[parents[i]];
}
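// AltiVec untransform: strip the parent transform back out, walking the
// hierarchy from the leaves down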
for ( i = lastJoint; i >= firstJoint; i-- ) {
	assert( parents[i] < i );

	float *jointPtr = jointMats[i].ToFloatPtr();
	float *parentPtr = jointMats[parents[i]].ToFloatPtr();

	vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
	vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
	vector float v0, v1, v2, v3, v4, v5, v6, v7;
	v0 = vec_ld( 0, jointPtr );
	v1 = vec_ld( 15, jointPtr );
	v2 = vec_ld( 31, jointPtr );
	v3 = vec_ld( 47, jointPtr );

	v4 = vec_ld( 0, parentPtr );
	v5 = vec_ld( 15, parentPtr );
	v6 = vec_ld( 31, parentPtr );
	v7 = vec_ld( 47, parentPtr );

	vector float vecJointMat1 = vec_perm( v0, v1, permVec );
	vector float vecJointMat2 = vec_perm( v1, v2, permVec );
	vector float vecJointMat3 = vec_perm( v2, v3, permVec );

	vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
	vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
	vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );

	vector float zero = (vector float)(0);
	vector float C1, C2, C3;
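	// undo the parent transform: subtract the parent translation first, then
	// multiply by the transpose of the parent rotation (the splat indices walk
	// down a single parent row instead of across the rows)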
	vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
	vecJointMat1 = vec_sub( vecJointMat1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
	vecJointMat2 = vec_sub( vecJointMat2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
	vecJointMat3 = vec_sub( vecJointMat3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );

	C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero );
	C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 1 ), zero );
	C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 2 ), zero );

	C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 0 ), C1 );
	C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 );
	C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 2 ), C3 );

	C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 0 ), C1 );
	C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 1 ), C2 );
	C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );

	vector unsigned char storePerm = vec_lvsr( 0, jointPtr );

	C1 = vec_perm( C1, C1, storePerm );
	C2 = vec_perm( C2, C2, storePerm );
	C3 = vec_perm( C3, C3, storePerm );

	vec_ste( C1, 0, (float*) jointPtr );
	vec_ste( C1, 4, (float*) jointPtr );
	vec_ste( C1, 8, (float*) jointPtr );
	vec_ste( C1, 12, (float*) jointPtr );

	vec_ste( C2, 16, (float*) jointPtr );
	vec_ste( C2, 20, (float*) jointPtr );
	vec_ste( C2, 24, (float*) jointPtr );
	vec_ste( C2, 28, (float*) jointPtr );

	vec_ste( C3, 32, (float*) jointPtr );
	vec_ste( C3, 36, (float*) jointPtr );
	vec_ste( C3, 40, (float*) jointPtr );
	vec_ste( C3, 44, (float*) jointPtr );
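// transform verts: build each vertex position by accumulating its weighted
// joint matrices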
const byte *jointsPtr = (byte *)joints;

for( j = i = 0; i < numVerts; i++ ) {

	float *matPtrOrig = ( *(idJointMat *)( jointsPtr + index[j*2] ) ).ToFloatPtr();
	float *weightPtr = (float*) weights[j].ToFloatPtr();

	v[0] = matPtrOrig[0] * weightPtr[0];
	v[0] += matPtrOrig[1] * weightPtr[1];
	v[0] += matPtrOrig[2] * weightPtr[2];
	v[0] += matPtrOrig[3] * weightPtr[3];

	v[1] = matPtrOrig[4] * weightPtr[0];
	v[1] += matPtrOrig[5] * weightPtr[1];
	v[1] += matPtrOrig[6] * weightPtr[2];
	v[1] += matPtrOrig[7] * weightPtr[3];

	v[2] = matPtrOrig[8] * weightPtr[0];
	v[2] += matPtrOrig[9] * weightPtr[1];
	v[2] += matPtrOrig[10] * weightPtr[2];
	v[2] += matPtrOrig[11] * weightPtr[3];
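	// a second index entry of zero means this vertex has more weights to add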
	while( index[j*2+1] == 0 ) {
		j++;

		float *matPtr = ( *(idJointMat *)( jointsPtr + index[j*2] ) ).ToFloatPtr();
		weightPtr = (float*) weights[j].ToFloatPtr();

		v[0] += matPtr[0] * weightPtr[0];
		v[0] += matPtr[1] * weightPtr[1];
		v[0] += matPtr[2] * weightPtr[2];
		v[0] += matPtr[3] * weightPtr[3];

		v[1] += matPtr[4] * weightPtr[0];
		v[1] += matPtr[5] * weightPtr[1];
		v[1] += matPtr[6] * weightPtr[2];
		v[1] += matPtr[7] * weightPtr[3];

		v[2] += matPtr[8] * weightPtr[0];
		v[2] += matPtr[9] * weightPtr[1];
		v[2] += matPtr[10] * weightPtr[2];
		v[2] += matPtr[11] * weightPtr[3];
	}
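// trace point cull: test four verts at a time against four planes, checking
// the sign of both d + radius and d - radius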
#ifndef DRAWVERT_PADDED

const float *planePtr = planes[0].ToFloatPtr();

vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
vector unsigned char vecPerm;
vector float v0, v1, v2, v3, v4, v5, v6, v7;
vector float zeroVector = (vector float)(0);
vector float vecRadius;
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
vector bool int oneIntVector = (vector bool int)(1);
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
vector unsigned int vecTotals;
vector unsigned int tempIntSum;
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
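// load the four planes with unaligned loads and transpose them so each
// vector holds one plane component across all four planes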
vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );

v0 = vec_ld( 0, planePtr );
v1 = vec_ld( 15, planePtr );
vecPlane0 = vec_perm( v0, v1, vecPerm );

v2 = vec_ld( 0, planePtr + 4 );
v3 = vec_ld( 15, planePtr + 4 );
vecPlane1 = vec_perm( v2, v3, vecPerm );

v0 = vec_ld( 0, planePtr + 8 );
v1 = vec_ld( 15, planePtr + 8 );
vecPlane2 = vec_perm( v0, v1, vecPerm );

v2 = vec_ld( 0, planePtr + 12 );
v3 = vec_ld( 15, planePtr + 12 );
vecPlane3 = vec_perm( v2, v3, vecPerm );

v0 = vec_mergeh( vecPlane0, vecPlane2 );
v1 = vec_mergeh( vecPlane1, vecPlane3 );
v2 = vec_mergel( vecPlane0, vecPlane2 );
v3 = vec_mergel( vecPlane1, vecPlane3 );

vecPlane0 = vec_mergeh( v0, v1 );
vecPlane1 = vec_mergel( v0, v1 );
vecPlane2 = vec_mergeh( v2, v3 );
vecPlane3 = vec_mergel( v2, v3 );

vecRadius = loadSplatUnalignedScalar( &radius );

unsigned int cullBitVal[4];
vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
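// the per-vertex load permutes only need to be computed once: the stride of
// four idDrawVerts is a multiple of 16 bytes, so the alignment pattern
// repeats every four verts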
if ( i+3 < numVerts ) {
	vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
}
for ( ; i+3 < numVerts; i+=4 ) {
	const float *vertPtr = verts[i].xyz.ToFloatPtr();
	const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
	const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
	const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();

	v0 = vec_ld( 0, vertPtr );
	v1 = vec_ld( 15, vertPtr );
	v2 = vec_ld( 0, vertPtr2 );
	v3 = vec_ld( 15, vertPtr2 );
	v4 = vec_ld( 0, vertPtr3 );
	v5 = vec_ld( 15, vertPtr3 );
	v6 = vec_ld( 0, vertPtr4 );
	v7 = vec_ld( 15, vertPtr4 );

	vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
	vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
	vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
	vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
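	// distance of each vert to all four planes: splat x, y, z across the
	// transposed plane vectors and accumulate, then add the plane constants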
	vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
	vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
	vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
	vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );

	vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
	vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
	vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
	vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );

	vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
	vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
	vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
	vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );

	vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
	vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
	vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
	vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );
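	// form d + radius and d - radius for each vert and grab the sign bits
	// with compares against zero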
	vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
	vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
	vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
	vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
	vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
	vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
	vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
	vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );

	vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
	vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
	vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
	vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
	vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
	vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
	vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
	vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );

	vecCmp1 = vec_and( vecCmp1, oneIntVector );
	vecCmp2 = vec_and( vecCmp2, oneIntVector );
	vecCmp3 = vec_and( vecCmp3, oneIntVector );
	vecCmp4 = vec_and( vecCmp4, oneIntVector );
	vecCmp5 = vec_and( vecCmp5, oneIntVector );
	vecCmp6 = vec_and( vecCmp6, oneIntVector );
	vecCmp7 = vec_and( vecCmp7, oneIntVector );
	vecCmp8 = vec_and( vecCmp8, oneIntVector );
	vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
	vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
	vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
	vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
	vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
	vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
	vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
	vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
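	// sum the shifted bits across each vector so every vertex collapses to a
	// single mask, then merge the four masks into one vector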
	vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
	vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
	vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
	vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );

	vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
	vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
	tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
	tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
	vecTotals = vec_mergeh( vecTotals, tempIntSum );
	tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
	tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
	vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
	tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
	tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
	vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );

	vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
	tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
	vec_ste( tempSt, 0, &cullBitVal[0] );
	vec_ste( tempSt, 4, &cullBitVal[0] );
	vec_ste( tempSt, 8, &cullBitVal[0] );
	vec_ste( tempSt, 12, &cullBitVal[0] );

	tOr |= cullBitVal[0];
	tOr |= cullBitVal[1];
	tOr |= cullBitVal[2];
	tOr |= cullBitVal[3];

	cullBits[i] = cullBitVal[0];
	cullBits[i+1] = cullBitVal[1];
	cullBits[i+2] = cullBitVal[2];
	cullBits[i+3] = cullBitVal[3];
for ( ; i < numVerts; i++ ) {

	float d0, d1, d2, d3, t;
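// DRAWVERT_PADDED variant: vertex positions are 16-byte aligned, so the xyz
// can be loaded directly without the lvsl permute fixup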
const float *planePtr = planes[0].ToFloatPtr();

vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
vector unsigned char vecPerm;
vector float v0, v1, v2, v3, v4, v5, v6, v7;
vector float zeroVector = (vector float)(0);
vector float vecRadius;
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
vector bool int oneIntVector = (vector bool int)(1);
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
vector unsigned int vecTotals;
vector unsigned int tempIntSum;
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;

vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );

v0 = vec_ld( 0, planePtr );
v1 = vec_ld( 15, planePtr );
vecPlane0 = vec_perm( v0, v1, vecPerm );

v2 = vec_ld( 0, planePtr + 4 );
v3 = vec_ld( 15, planePtr + 4 );
vecPlane1 = vec_perm( v2, v3, vecPerm );

v0 = vec_ld( 0, planePtr + 8 );
v1 = vec_ld( 15, planePtr + 8 );
vecPlane2 = vec_perm( v0, v1, vecPerm );

v2 = vec_ld( 0, planePtr + 12 );
v3 = vec_ld( 15, planePtr + 12 );
vecPlane3 = vec_perm( v2, v3, vecPerm );

v0 = vec_mergeh( vecPlane0, vecPlane2 );
v1 = vec_mergeh( vecPlane1, vecPlane3 );
v2 = vec_mergel( vecPlane0, vecPlane2 );
v3 = vec_mergel( vecPlane1, vecPlane3 );

vecPlane0 = vec_mergeh( v0, v1 );
vecPlane1 = vec_mergel( v0, v1 );
vecPlane2 = vec_mergeh( v2, v3 );
vecPlane3 = vec_mergel( v2, v3 );

vecRadius = loadSplatUnalignedScalar( &radius );

unsigned int cullBitVal[4];
vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
for ( ; i+3 < numVerts; i+=4 ) {
	const float *vertPtr = verts[i].xyz.ToFloatPtr();
	const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
	const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
	const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();

	vecXYZ1 = vec_ld( 0, vertPtr );
	vecXYZ2 = vec_ld( 0, vertPtr2 );
	vecXYZ3 = vec_ld( 0, vertPtr3 );
	vecXYZ4 = vec_ld( 0, vertPtr4 );
	vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
	vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
	vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
	vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );

	vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
	vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
	vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
	vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );

	vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
	vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
	vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
	vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );

	vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
	vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
	vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
	vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );

	vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
	vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
	vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
	vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
	vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
	vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
	vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
	vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );

	vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
	vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
	vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
	vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
	vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
	vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
	vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
	vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );

	vecCmp1 = vec_and( vecCmp1, oneIntVector );
	vecCmp2 = vec_and( vecCmp2, oneIntVector );
	vecCmp3 = vec_and( vecCmp3, oneIntVector );
	vecCmp4 = vec_and( vecCmp4, oneIntVector );
	vecCmp5 = vec_and( vecCmp5, oneIntVector );
	vecCmp6 = vec_and( vecCmp6, oneIntVector );
	vecCmp7 = vec_and( vecCmp7, oneIntVector );
	vecCmp8 = vec_and( vecCmp8, oneIntVector );

	vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
	vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
	vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
	vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
	vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
	vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
	vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
	vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );

	vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
	vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
	vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
	vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );

	vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
	vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
	tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
	tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
	vecTotals = vec_mergeh( vecTotals, tempIntSum );
	tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
	tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
	vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
	tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
	tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
	vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );

	vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
	tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
	vec_ste( tempSt, 0, &cullBitVal[0] );
	vec_ste( tempSt, 4, &cullBitVal[0] );
	vec_ste( tempSt, 8, &cullBitVal[0] );
	vec_ste( tempSt, 12, &cullBitVal[0] );

	tOr |= cullBitVal[0];
	tOr |= cullBitVal[1];
	tOr |= cullBitVal[2];
	tOr |= cullBitVal[3];

	cullBits[i] = cullBitVal[0];
	cullBits[i+1] = cullBitVal[1];
	cullBits[i+2] = cullBitVal[2];
	cullBits[i+3] = cullBitVal[3];

for ( ; i < numVerts; i++ ) {

	float d0, d1, d2, d3, t;
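// decal point cull: six planes, no radius; yields a six-bit mask per vertex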
#ifndef DRAWVERT_PADDED

const float *planePtr = planes[0].ToFloatPtr();

vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
vector float zeroVector = (vector float)(0.0);
vector unsigned char vecPerm;
vector float v0, v1, v2, v3, v4, v5, v6, v7;

vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );

v0 = vec_ld( 0, planePtr );
v1 = vec_ld( 15, planePtr );
vecPlane0 = vec_perm( v0, v1, vecPerm );

v2 = vec_ld( 0, planePtr + 4 );
v3 = vec_ld( 15, planePtr + 4 );
vecPlane1 = vec_perm( v2, v3, vecPerm );

v0 = vec_ld( 0, planePtr + 8 );
v1 = vec_ld( 15, planePtr + 8 );
vecPlane2 = vec_perm( v0, v1, vecPerm );

v2 = vec_ld( 0, planePtr + 12 );
v3 = vec_ld( 15, planePtr + 12 );
vecPlane3 = vec_perm( v2, v3, vecPerm );

v0 = vec_ld( 0, planePtr + 16 );
v1 = vec_ld( 15, planePtr + 16 );
vecPlane4 = vec_perm( v0, v1, vecPerm );

v2 = vec_ld( 0, planePtr + 20 );
v3 = vec_ld( 15, planePtr + 20 );
vecPlane5 = vec_perm( v2, v3, vecPerm );
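// transpose: planes 0-3 fill the first four vectors; planes 4 and 5 are
// merged with zeros so the second accumulation pass can use the same
// four-wide form with two live lanes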
v0 = vec_mergeh( vecPlane0, vecPlane2 );
v1 = vec_mergeh( vecPlane1, vecPlane3 );
v2 = vec_mergel( vecPlane0, vecPlane2 );
v3 = vec_mergel( vecPlane1, vecPlane3 );

vecPlane0 = vec_mergeh( v0, v1 );
vecPlane1 = vec_mergel( v0, v1 );
vecPlane2 = vec_mergeh( v2, v3 );
vecPlane3 = vec_mergel( v2, v3 );

v0 = vec_mergeh( vecPlane4, zeroVector );
v1 = vec_mergeh( vecPlane5, zeroVector );
v2 = vec_mergel( vecPlane4, zeroVector );
v3 = vec_mergel( vecPlane5, zeroVector );

vecPlane4 = vec_mergeh( v0, v1 );
vecPlane5 = vec_mergel( v0, v1 );
vecPlane6 = vec_mergeh( v2, v3 );
vecPlane7 = vec_mergel( v2, v3 );

vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
vector bool int oneIntVector = (vector bool int)(1);
vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
vector unsigned int vecR1, vecR2, vecR3, vecR4;
vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
unsigned int vBits[4];
vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
if ( i+3 < numVerts ) {
	vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
}
for ( ; i+3 < numVerts; i+=4 ) {
	const float *vertPtr = verts[i].xyz.ToFloatPtr();
	const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
	const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
	const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();

	v0 = vec_ld( 0, vertPtr );
	v1 = vec_ld( 15, vertPtr );
	v2 = vec_ld( 0, vertPtr2 );
	v3 = vec_ld( 15, vertPtr2 );
	v4 = vec_ld( 0, vertPtr3 );
	v5 = vec_ld( 15, vertPtr3 );
	v6 = vec_ld( 0, vertPtr4 );
	v7 = vec_ld( 15, vertPtr4 );

	vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
	vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
	vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
	vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
	vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
	vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
	vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
	vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );

	vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
	vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
	vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
	vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );

	vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
	vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
	vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
	vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );

	vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
	vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
	vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
	vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );

	vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
	vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
	vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
	vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );

	vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
	vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
	vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
	vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );

	vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
	vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
	vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
	vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );

	vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
	vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
	vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
	vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
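	// compare all eight distance vectors against zero; the first of each pair
	// packs into bits 0-3, the second into bits 4-5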
	vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
	vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
	vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
	vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
	vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
	vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
	vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
	vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );

	vecCmp1 = vec_and( vecCmp1, oneIntVector );
	vecCmp2 = vec_and( vecCmp2, oneIntVector );
	vecCmp3 = vec_and( vecCmp3, oneIntVector );
	vecCmp4 = vec_and( vecCmp4, oneIntVector );
	vecCmp5 = vec_and( vecCmp5, oneIntVector );
	vecCmp6 = vec_and( vecCmp6, oneIntVector );
	vecCmp7 = vec_and( vecCmp7, oneIntVector );
	vecCmp8 = vec_and( vecCmp8, oneIntVector );

	vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
	vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
	vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
	vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
	vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
	vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
	vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
	vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
	vecR1 = (vector unsigned int)(0);
	vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
	vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
	vecR1 = vec_add( vecR1, vecBitShifted2 );
	vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );

	vecR2 = (vector unsigned int)(0);
	vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
	vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
	vecR2 = vec_add( vecR2, vecBitShifted4 );
	vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );

	vecR3 = (vector unsigned int)(0);
	vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
	vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
	vecR3 = vec_add( vecR3, vecBitShifted6 );
	vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );

	vecR4 = (vector unsigned int)(0);
	vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
	vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
	vecR4 = vec_add( vecR4, vecBitShifted8 );
	vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );

	vecR1 = vec_mergeh( vecR1, vecR2 );
	vecR3 = vec_mergeh( vecR3, vecR4 );
	vecR1 = vec_perm( vecR1, vecR3, permHalves );

	vecR1 = vec_xor( vecR1, vecFlipBits );

	vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
	vec_ste( vecR1, 0, &vBits[0] );
	vec_ste( vecR1, 4, &vBits[0] );
	vec_ste( vecR1, 8, &vBits[0] );
	vec_ste( vecR1, 12, &vBits[0] );

	cullBits[i] = vBits[0];
	cullBits[i+1] = vBits[1];
	cullBits[i+2] = vBits[2];
	cullBits[i+3] = vBits[3];

for ( ; i < numVerts; i++ ) {

	float d0, d1, d2, d3, d4, d5;

	cullBits[i] = bits ^ 0x3F;
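// DRAWVERT_PADDED variant of the decal cull: aligned loads, no permute fixup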
const float *planePtr = planes[0].ToFloatPtr();

vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
vector float zeroVector = (vector float)(0.0);
vector unsigned char vecPerm;
vector float v0, v1, v2, v3, v4, v5, v6, v7;

vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );

v0 = vec_ld( 0, planePtr );
v1 = vec_ld( 15, planePtr );
vecPlane0 = vec_perm( v0, v1, vecPerm );

v2 = vec_ld( 0, planePtr + 4 );
v3 = vec_ld( 15, planePtr + 4 );
vecPlane1 = vec_perm( v2, v3, vecPerm );

v0 = vec_ld( 0, planePtr + 8 );
v1 = vec_ld( 15, planePtr + 8 );
vecPlane2 = vec_perm( v0, v1, vecPerm );

v2 = vec_ld( 0, planePtr + 12 );
v3 = vec_ld( 15, planePtr + 12 );
vecPlane3 = vec_perm( v2, v3, vecPerm );

v0 = vec_ld( 0, planePtr + 16 );
v1 = vec_ld( 15, planePtr + 16 );
vecPlane4 = vec_perm( v0, v1, vecPerm );

v2 = vec_ld( 0, planePtr + 20 );
v3 = vec_ld( 15, planePtr + 20 );
vecPlane5 = vec_perm( v2, v3, vecPerm );

v0 = vec_mergeh( vecPlane0, vecPlane2 );
v1 = vec_mergeh( vecPlane1, vecPlane3 );
v2 = vec_mergel( vecPlane0, vecPlane2 );
v3 = vec_mergel( vecPlane1, vecPlane3 );

vecPlane0 = vec_mergeh( v0, v1 );
vecPlane1 = vec_mergel( v0, v1 );
vecPlane2 = vec_mergeh( v2, v3 );
vecPlane3 = vec_mergel( v2, v3 );

v0 = vec_mergeh( vecPlane4, zeroVector );
v1 = vec_mergeh( vecPlane5, zeroVector );
v2 = vec_mergel( vecPlane4, zeroVector );
v3 = vec_mergel( vecPlane5, zeroVector );

vecPlane4 = vec_mergeh( v0, v1 );
vecPlane5 = vec_mergel( v0, v1 );
vecPlane6 = vec_mergeh( v2, v3 );
vecPlane7 = vec_mergel( v2, v3 );

vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
vector bool int oneIntVector = (vector bool int)(1);
vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
vector unsigned int vecR1, vecR2, vecR3, vecR4;
vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
unsigned int vBits[4];
vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
for ( ; i+3 < numVerts; i+=4 ) {
	const float *vertPtr = verts[i].xyz.ToFloatPtr();
	const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
	const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
	const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();

	vecXYZ1 = vec_ld( 0, vertPtr );
	vecXYZ2 = vec_ld( 0, vertPtr2 );
	vecXYZ3 = vec_ld( 0, vertPtr3 );
	vecXYZ4 = vec_ld( 0, vertPtr4 );
	vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
	vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
	vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
	vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );

	vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
	vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
	vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
	vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );

	vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
	vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
	vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
	vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );

	vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
	vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
	vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
	vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );

	vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
	vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
	vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
	vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );

	vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
	vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
	vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
	vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );

	vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
	vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
	vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
	vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );

	vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
	vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
	vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
	vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );

	vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
	vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
	vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
	vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
	vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
	vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
	vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
	vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );

	vecCmp1 = vec_and( vecCmp1, oneIntVector );
	vecCmp2 = vec_and( vecCmp2, oneIntVector );
	vecCmp3 = vec_and( vecCmp3, oneIntVector );
	vecCmp4 = vec_and( vecCmp4, oneIntVector );
	vecCmp5 = vec_and( vecCmp5, oneIntVector );
	vecCmp6 = vec_and( vecCmp6, oneIntVector );
	vecCmp7 = vec_and( vecCmp7, oneIntVector );
	vecCmp8 = vec_and( vecCmp8, oneIntVector );

	vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
	vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
	vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
	vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
	vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
	vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
	vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
	vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );

	vecR1 = (vector unsigned int)(0);
	vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
	vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
	vecR1 = vec_add( vecR1, vecBitShifted2 );
	vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );

	vecR2 = (vector unsigned int)(0);
	vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
	vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
	vecR2 = vec_add( vecR2, vecBitShifted4 );
	vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );

	vecR3 = (vector unsigned int)(0);
	vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
	vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
	vecR3 = vec_add( vecR3, vecBitShifted6 );
	vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );

	vecR4 = (vector unsigned int)(0);
	vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
	vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
	vecR4 = vec_add( vecR4, vecBitShifted8 );
	vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );

	vecR1 = vec_mergeh( vecR1, vecR2 );
	vecR3 = vec_mergeh( vecR3, vecR4 );
	vecR1 = vec_perm( vecR1, vecR3, permHalves );

	vecR1 = vec_xor( vecR1, vecFlipBits );

	vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
	vec_ste( vecR1, 0, &vBits[0] );
	vec_ste( vecR1, 4, &vBits[0] );
	vec_ste( vecR1, 8, &vBits[0] );
	vec_ste( vecR1, 12, &vBits[0] );

	cullBits[i] = vBits[0];
	cullBits[i+1] = vBits[1];
	cullBits[i+2] = vBits[2];
	cullBits[i+3] = vBits[3];

for ( ; i < numVerts; i++ ) {

	float d0, d1, d2, d3, d4, d5;

	cullBits[i] = bits ^ 0x3F;
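// overlay point cull: two planes; writes the texture coordinates for each
// vert plus four cull bits (d < 0 and d > 1 per plane)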
#ifndef DRAWVERT_PADDED

float p0x, p0y, p0z, p0d;
float p1x, p1y, p1z, p1d;

const float *planePtr = planes[0].ToFloatPtr();

vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
vector float v0, v1, v2, v3, v4, v5, v6, v7;
vector unsigned char vecPerm;
vector float zeroVector = (vector float)(0);

p0x = *(planePtr + 0);
p0y = *(planePtr + 1);
p0z = *(planePtr + 2);
p0d = *(planePtr + 3);
p1x = *(planePtr + 4);
p1y = *(planePtr + 5);
p1z = *(planePtr + 6);
p1d = *(planePtr + 7);
vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
v0 = vec_ld( 0, planePtr );
v1 = vec_ld( 15, planePtr );
vecPlane0 = vec_perm( v0, v1, vecPerm );

v2 = vec_ld( 31, planePtr );
vecPlane1 = vec_perm( v1, v2, vecPerm );

v0 = vec_mergeh( vecPlane0, vecPlane0 );
v1 = vec_mergeh( vecPlane1, vecPlane1 );
v2 = vec_mergel( vecPlane0, vecPlane0 );
v3 = vec_mergel( vecPlane1, vecPlane1 );

vecPlane0 = vec_mergeh( v0, v1 );
vecPlane1 = vec_mergel( v0, v1 );
vecPlane2 = vec_mergeh( v2, v3 );
vecPlane3 = vec_mergel( v2, v3 );
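// each vecPlane now holds one component of both planes, duplicated and
// interleaved, so one madd chain evaluates two verts against both planes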
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
vector float oneVector = (vector float)(1);
vector float vecSum1, vecSum2, vecSum1Inv, vecSum2Inv;
vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
vector float negTwoVector = (vector float)(-2);
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
vector bool int oneIntVector = (vector bool int)(1);
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
unsigned int cullBitVal[4];
vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
if ( i+3 < numVerts ) {
	vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
}
for ( ; i+3 < numVerts; i+=4 ) {
	const float *vertPtr = verts[i].xyz.ToFloatPtr();
	const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
	const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
	const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();

	v0 = vec_ld( 0, vertPtr );
	v1 = vec_ld( 15, vertPtr );
	v2 = vec_ld( 0, vertPtr2 );
	v3 = vec_ld( 15, vertPtr2 );
	v4 = vec_ld( 0, vertPtr3 );
	v5 = vec_ld( 15, vertPtr3 );
	v6 = vec_ld( 0, vertPtr4 );
	v7 = vec_ld( 15, vertPtr4 );

	vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
	vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
	vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
	vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
	vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
	vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ), vecPlane1, vecSum1 );
	vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
	vecSum1 = vec_add( vecSum1, vecPlane3 );

	vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
	vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ), vecPlane1, vecSum2 );
	vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
	vecSum2 = vec_add( vecSum2, vecPlane3 );

	UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );
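	// the sums are the texcoords themselves; the cull bits flag d < 0 in bits
	// 0-1 and d > 1 in bits 2-3, the latter tested as 1 - d < 0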
	vecCmp1 = vec_cmplt( vecSum1, zeroVector );
	vecCmp2 = vec_cmplt( vecSum2, zeroVector );

	vecCmp1 = vec_and( vecCmp1, oneIntVector );
	vecCmp2 = vec_and( vecCmp2, oneIntVector );

	vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
	vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
	vecSum1Inv = vec_add( vecSum1Inv, oneVector );
	vecSum2Inv = vec_add( vecSum2Inv, oneVector );

	vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
	vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );

	vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
	vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );

	vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
	vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
	vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
	vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );

	vector unsigned int vecResult;
	vector unsigned int vecResult2;
	vector unsigned int vecResult3;
	vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
	vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );

	vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
	vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
	vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
	vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );

	vecResult = vec_add( vecResult, vecResult2 );

	vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
	vec_ste( vecResult, 0, &cullBitVal[0] );
	vec_ste( vecResult, 4, &cullBitVal[0] );
	vec_ste( vecResult, 8, &cullBitVal[0] );
	vec_ste( vecResult, 12, &cullBitVal[0] );

	cullBits[i] = cullBitVal[0];
	cullBits[i+1] = cullBitVal[1];
	cullBits[i+2] = cullBitVal[2];
	cullBits[i+3] = cullBitVal[3];
for ( ; i < numVerts; i++ ) {
	byte bits;
	float d0, d1;
	float vx, vy, vz;

	vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
	vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
	vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );

	d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
	d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
	texCoords[i][0] = d0;
	texCoords[i][1] = d1;

	bits = ( d0 >= 0 ) ? 0 : 1;
	d0 = 1.0f - d0;
	bits |= ( d1 >= 0 ) ? 0 : 1*2;
	d1 = 1.0f - d1;

	bits |= ( d0 >= 0 ) ? 0 : 1*4;
	bits |= ( d1 >= 0 ) ? 0 : 1*8;
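// DRAWVERT_PADDED variant of the overlay cull: aligned vertex loads,
// otherwise identical to the path above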
float p0x, p0y, p0z, p0d;
float p1x, p1y, p1z, p1d;

const float *planePtr = planes[0].ToFloatPtr();

vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
vector float v0, v1, v2, v3, v4, v5, v6, v7;
vector unsigned char vecPerm;
vector float zeroVector = (vector float)(0);

p0x = *(planePtr + 0);
p0y = *(planePtr + 1);
p0z = *(planePtr + 2);
p0d = *(planePtr + 3);
p1x = *(planePtr + 4);
p1y = *(planePtr + 5);
p1z = *(planePtr + 6);
p1d = *(planePtr + 7);

vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
v0 = vec_ld( 0, planePtr );
v1 = vec_ld( 15, planePtr );
vecPlane0 = vec_perm( v0, v1, vecPerm );

v2 = vec_ld( 31, planePtr );
vecPlane1 = vec_perm( v1, v2, vecPerm );

v0 = vec_mergeh( vecPlane0, vecPlane0 );
v1 = vec_mergeh( vecPlane1, vecPlane1 );
v2 = vec_mergel( vecPlane0, vecPlane0 );
v3 = vec_mergel( vecPlane1, vecPlane1 );

vecPlane0 = vec_mergeh( v0, v1 );
vecPlane1 = vec_mergel( v0, v1 );
vecPlane2 = vec_mergeh( v2, v3 );
vecPlane3 = vec_mergel( v2, v3 );

vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
vector float oneVector = (vector float)(1);
vector float vecSum1, vecSum2, vecSum1Inv, vecSum2Inv;
vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
vector float negTwoVector = (vector float)(-2);
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
vector bool int oneIntVector = (vector bool int)(1);
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
unsigned int cullBitVal[4];
vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
for ( ; i+3 < numVerts; i+=4 ) {
	const float *vertPtr = verts[i].xyz.ToFloatPtr();
	const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
	const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
	const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();

	vecXYZ1 = vec_ld( 0, vertPtr );
	vecXYZ2 = vec_ld( 0, vertPtr2 );
	vecXYZ3 = vec_ld( 0, vertPtr3 );
	vecXYZ4 = vec_ld( 0, vertPtr4 );
	vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
	vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ), vecPlane1, vecSum1 );
	vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
	vecSum1 = vec_add( vecSum1, vecPlane3 );

	vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
	vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ), vecPlane1, vecSum2 );
	vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
	vecSum2 = vec_add( vecSum2, vecPlane3 );

	UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );

	vecCmp1 = vec_cmplt( vecSum1, zeroVector );
	vecCmp2 = vec_cmplt( vecSum2, zeroVector );

	vecCmp1 = vec_and( vecCmp1, oneIntVector );
	vecCmp2 = vec_and( vecCmp2, oneIntVector );

	vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
	vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
	vecSum1Inv = vec_add( vecSum1Inv, oneVector );
	vecSum2Inv = vec_add( vecSum2Inv, oneVector );

	vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
	vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );

	vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
	vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );

	vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
	vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
	vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
	vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );

	vector unsigned int vecResult;
	vector unsigned int vecResult2;
	vector unsigned int vecResult3;
	vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
	vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );

	vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
	vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
	vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
	vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );

	vecResult = vec_add( vecResult, vecResult2 );

	vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
	vec_ste( vecResult, 0, &cullBitVal[0] );
	vec_ste( vecResult, 4, &cullBitVal[0] );
	vec_ste( vecResult, 8, &cullBitVal[0] );
	vec_ste( vecResult, 12, &cullBitVal[0] );

	cullBits[i] = cullBitVal[0];
	cullBits[i+1] = cullBitVal[1];
	cullBits[i+2] = cullBitVal[2];
	cullBits[i+3] = cullBitVal[3];
for ( ; i < numVerts; i++ ) {
	byte bits;
	float d0, d1;
	float vx, vy, vz;

	vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
	vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
	vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );

	d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
	d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
	texCoords[i][0] = d0;
	texCoords[i][1] = d1;

	bits = ( d0 >= 0 ) ? 0 : 1;
	d0 = 1.0f - d0;
	bits |= ( d1 >= 0 ) ? 0 : 1*2;
	d1 = 1.0f - d1;

	bits |= ( d0 >= 0 ) ? 0 : 1*4;
	bits |= ( d1 >= 0 ) ? 0 : 1*8;
#ifdef ENABLE_DERIVE
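// derive triangle planes: process four triangles per iteration; each normal
// comes from the cross product of the triangle's two edge vectors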
vector float vecD0, vecD1, vecD2, vecD3, vecD4, vecD5, vecD6, vecD7;
vector float vecVertA, vecVertB, vecVertC;
vector float vecVertA2, vecVertB2, vecVertC2;
vector float vecVertA3, vecVertB3, vecVertC3;
vector float vecVertA4, vecVertB4, vecVertC4;

vector float vecN, vecN2, vecN3, vecN4;
vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
vector unsigned char vecPerm1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
vector unsigned char vecPerm2 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);

vector float vecF1, vecF2, vecF3, vecF4;
vector float zeroVector = (vector float)(0);
vector float vecNegOne = (vector float)(-1);
vector float vecSecondHalf, vecFirstHalf, vecSecondHalf2, vecFirstHalf2, vecSecondHalf3, vecFirstHalf3, vecFirstHalf4, vecSecondHalf4;

vector unsigned char vecPermA, vecPermA2, vecPermA3, vecPermA4;
vector unsigned char vecPermB, vecPermB2, vecPermB3, vecPermB4;
vector unsigned char vecPermC, vecPermC2, vecPermC3, vecPermC4;

vector unsigned char oneVector = (vector unsigned char)(1);
vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
for ( j = 0, i = 0; i+11 < numIndexes; i += 12, j += 4 ) {

#ifndef DRAWVERT_PADDED
	vecPermA = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) ), oneVector );
	vecPermB = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) ), oneVector );
	vecPermC = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) ), oneVector );
	vecPermA2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) ), oneVector );
	vecPermB2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) ), oneVector );
	vecPermC2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) ), oneVector );
	vecPermA3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) ), oneVector );
	vecPermB3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) ), oneVector );
	vecPermC3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) ), oneVector );
	vecPermA4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) ), oneVector );
	vecPermB4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) ), oneVector );
	vecPermC4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) ), oneVector );
#endif
#ifndef DRAWVERT_PADDED
	vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
	vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
	vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
	vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
	vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
	vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );

	vecVertA = vec_perm( vecLd1, vecLd2, vecPermA );
	vecVertB = vec_perm( vecLd3, vecLd4, vecPermB );
	vecVertC = vec_perm( vecLd5, vecLd6, vecPermC );

	vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
	vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
	vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );

	vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
	vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
	vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
	vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
	vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
	vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );

	vecVertA2 = vec_perm( vecLd1, vecLd2, vecPermA2 );
	vecVertB2 = vec_perm( vecLd3, vecLd4, vecPermB2 );
	vecVertC2 = vec_perm( vecLd5, vecLd6, vecPermC2 );

	vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
	vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
	vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );
	vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
	vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
	vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
	vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
	vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
	vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );

	vecVertA3 = vec_perm( vecLd1, vecLd2, vecPermA3 );
	vecVertB3 = vec_perm( vecLd3, vecLd4, vecPermB3 );
	vecVertC3 = vec_perm( vecLd5, vecLd6, vecPermC3 );

	vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
	vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
	vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );
6919 vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
6920 vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
6921 vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
6922 vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
6923 vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
6924 vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
6926 vecVertA4 = vec_perm( vecLd1, vecLd2, vecPermA4 );
6927 vecVertB4 = vec_perm( vecLd3, vecLd4, vecPermB4 );
6928 vecVertC4 = vec_perm( vecLd5, vecLd6, vecPermC4 );
6931 vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
6932 vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
6933 vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
#else
        // padded drawverts are 16-byte aligned, so the xyz data can be loaded directly
        vecVertA = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
        vecVertB = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
        vecVertC = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );

        vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
        vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
        vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );

        vecVertA2 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
        vecVertB2 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
        vecVertC2 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );

        vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
        vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
        vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );

        vecVertA3 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
        vecVertB3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
        vecVertC3 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );

        vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
        vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
        vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );

        vecVertA4 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
        vecVertB4 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
        vecVertC4 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );

        vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
        vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
        vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
#endif
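        /*
         * With the twelve vertices in registers, each triangle's two edge
         * vectors are formed (B - A and C - A).  The cross product is then
         * computed as FirstHalf - SecondHalf, where each half is an
         * element-shuffled multiply: vecPerm1/vecPerm2 (set up earlier) rotate
         * the xyz components so a single vec_madd produces terms matching the
         * scalar n[0] = d1[1]*d0[2] - d1[2]*d0[1] form used in the cleanup loop.
         */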
        vecD0 = vec_sub( vecVertB, vecVertA );
        vecD1 = vec_sub( vecVertC, vecVertA );

        vecD2 = vec_sub( vecVertB2, vecVertA2 );
        vecD3 = vec_sub( vecVertC2, vecVertA2 );

        vecD4 = vec_sub( vecVertB3, vecVertA3 );
        vecD5 = vec_sub( vecVertC3, vecVertA3 );

        vecD6 = vec_sub( vecVertB4, vecVertA4 );
        vecD7 = vec_sub( vecVertC4, vecVertA4 );

        vecWork1 = vec_perm( vecD0, vecD0, vecPerm1 );
        vecWork2 = vec_perm( vecD1, vecD1, vecPerm2 );
        vecWork3 = vec_perm( vecD2, vecD2, vecPerm1 );
        vecWork4 = vec_perm( vecD3, vecD3, vecPerm2 );
        vecWork5 = vec_perm( vecD4, vecD4, vecPerm1 );
        vecWork6 = vec_perm( vecD5, vecD5, vecPerm2 );
        vecWork7 = vec_perm( vecD6, vecD6, vecPerm1 );
        vecWork8 = vec_perm( vecD7, vecD7, vecPerm2 );

        vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
        vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
        vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
        vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );

        vecWork1 = vec_perm( vecD1, vecD1, vecPerm1 );
        vecWork2 = vec_perm( vecD0, vecD0, vecPerm2 );
        vecWork3 = vec_perm( vecD3, vecD3, vecPerm1 );
        vecWork4 = vec_perm( vecD2, vecD2, vecPerm2 );
        vecWork5 = vec_perm( vecD5, vecD5, vecPerm1 );
        vecWork6 = vec_perm( vecD4, vecD4, vecPerm2 );
        vecWork7 = vec_perm( vecD7, vecD7, vecPerm1 );
        vecWork8 = vec_perm( vecD6, vecD6, vecPerm2 );

        vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
        vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
        vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
        vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );

        vecN = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
        vecN2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
        vecN3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
        vecN4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );
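        /*
         * Standard AltiVec 4x4 transpose via merge instructions: two rounds of
         * vec_mergeh/vec_mergel turn the four normals (one per register, xyzw)
         * into component-major form -- all x's in one register, all y's in the
         * next, and so on.  That lets a single ReciprocalSquareRoot() call
         * normalize all four plane normals at once.
         */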
        v0 = vec_mergeh( vecN, vecN3 );
        v1 = vec_mergeh( vecN2, vecN4 );
        v2 = vec_mergel( vecN, vecN3 );
        v3 = vec_mergel( vecN2, vecN4 );

        vecN = vec_mergeh( v0, v1 );
        vecN2 = vec_mergel( v0, v1 );
        vecN3 = vec_mergeh( v2, v3 );
        vecN4 = vec_mergel( v2, v3 );

        vecF = vec_madd( vecN, vecN, zeroVector );
        vecF = vec_madd( vecN2, vecN2, vecF );
        vecF = vec_madd( vecN3, vecN3, vecF );

        vecF = ReciprocalSquareRoot( vecF );

        vecF1 = vec_madd( vecF, vecN, zeroVector );
        vecF2 = vec_madd( vecF, vecN2, zeroVector );
        vecF3 = vec_madd( vecF, vecN3, zeroVector );
        vecF4 = vec_madd( vecF, vecN4, zeroVector );
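        /*
         * The plane distance term is d = -(N . A): the A vertices are
         * transposed into component-major form the same way, the dot product
         * is accumulated with multiply-adds (the zero-padded w row contributes
         * nothing), and the result is negated by a multiply with -1.
         */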
        vector float v8, v9, v10, v11;
        // v8..v11 hold the normalized normal components (inferred from the dot products below)
        v8 = vecF1;
        v9 = vecF2;
        v10 = vecF3;
        v11 = vecF4;
        v0 = vec_mergeh( vecVertA, vecVertA3 );
        v1 = vec_mergeh( vecVertA2, vecVertA4 );
        v2 = vec_mergel( vecVertA, vecVertA3 );
        v3 = vec_mergel( vecVertA2, vecVertA4 );

        vecVertA = vec_mergeh( v0, v1 );
        vecVertA2 = vec_mergel( v0, v1 );
        vecVertA3 = vec_mergeh( v2, v3 );
        vecVertA4 = vec_mergel( v2, v3 );

        vector float vecTotals;
        vecTotals = vec_madd( vecVertA, v8, zeroVector );
        vecTotals = vec_madd( vecVertA2, v9, vecTotals );
        vecTotals = vec_madd( vecVertA3, v10, vecTotals );
        vecTotals = vec_madd( vecVertA4, v11, vecTotals );
        vecF = vec_madd( vecTotals, vecNegOne, zeroVector );

        // transpose back to one plane per register: [ nx ny nz dist ]
        v0 = vec_mergeh( vecF1, vecF3 );
        v1 = vec_mergeh( vecF2, vecF );
        v2 = vec_mergel( vecF1, vecF3 );
        v3 = vec_mergel( vecF2, vecF );

        vecF1 = vec_mergeh( v0, v1 );
        vecF2 = vec_mergel( v0, v1 );
        vecF3 = vec_mergeh( v2, v3 );
        vecF4 = vec_mergel( v2, v3 );

        // store the four planes
        UNALIGNED_STORE4( planePtr + ( j * PLANE_OFFSET ), vecF1, vecF2, vecF3, vecF4 );
    }
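    // any leftover triangles (numIndexes not a multiple of 12) fall through to
    // the scalar path below, which mirrors the vector math exactly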
    for ( ; i < numIndexes; i += 3, j++ ) {
        const idDrawVert *a, *b, *c;
        float d0[3], d1[3], f;
        idVec3 n;

        a = verts + indexes[i + 0];
        b = verts + indexes[i + 1];
        c = verts + indexes[i + 2];

        d0[0] = b->xyz[0] - a->xyz[0];
        d0[1] = b->xyz[1] - a->xyz[1];
        d0[2] = b->xyz[2] - a->xyz[2];

        d1[0] = c->xyz[0] - a->xyz[0];
        d1[1] = c->xyz[1] - a->xyz[1];
        d1[2] = c->xyz[2] - a->xyz[2];

        n[0] = d1[1] * d0[2] - d1[2] * d0[1];
        n[1] = d1[2] * d0[0] - d1[0] * d0[2];
        n[2] = d1[0] * d0[1] - d1[1] * d0[0];

        f = FastScalarInvSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
    bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
    memset( used, 0, numVerts * sizeof( used[0] ) );
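    /*
     * Per-triangle tangent basis derivation: for every triangle the face
     * normal and the two texture-space tangents are computed, each scaled by
     * the reciprocal square root of its own squared length.  d0[] and d1[]
     * hold the five deltas (xyz and st) of the two edges; `area` is the signed
     * area of the triangle in texture space and flips the tangents for
     * mirrored UVs.
     */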
    for ( i = 0; i < numIndexes; i += 3 ) {
        idDrawVert *a, *b, *c;
        float d0[5], d1[5], area;
        idVec3 n, t0, t1;
        float f1, f2, f3;

        int v0 = indexes[i + 0];
        int v1 = indexes[i + 1];
        int v2 = indexes[i + 2];

        a = verts + v0;
        b = verts + v1;
        c = verts + v2;

        d0[0] = b->xyz[0] - a->xyz[0];
        d0[1] = b->xyz[1] - a->xyz[1];
        d0[2] = b->xyz[2] - a->xyz[2];
        d0[3] = b->st[0] - a->st[0];
        d0[4] = b->st[1] - a->st[1];

        d1[0] = c->xyz[0] - a->xyz[0];
        d1[1] = c->xyz[1] - a->xyz[1];
        d1[2] = c->xyz[2] - a->xyz[2];
        d1[3] = c->st[0] - a->st[0];
        d1[4] = c->st[1] - a->st[1];

        // normal
        n[0] = d1[1] * d0[2] - d1[2] * d0[1];
        n[1] = d1[2] * d0[0] - d1[0] * d0[2];
        n[2] = d1[0] * d0[1] - d1[1] * d0[0];

        f1 = n.x * n.x + n.y * n.y + n.z * n.z;

        // signed area of the triangle in texture space
        area = d0[3] * d1[4] - d0[4] * d1[3];

        // first tangent
        t0[0] = d0[0] * d1[4] - d0[4] * d1[0];
        t0[1] = d0[1] * d1[4] - d0[4] * d1[1];
        t0[2] = d0[2] * d1[4] - d0[4] * d1[2];

        f2 = t0.x * t0.x + t0.y * t0.y + t0.z * t0.z;

        // second tangent
        t1[0] = d0[3] * d1[0] - d0[0] * d1[3];
        t1[1] = d0[3] * d1[1] - d0[1] * d1[3];
        t1[2] = d0[3] * d1[2] - d0[2] * d1[3];

        f3 = t1.x * t1.x + t1.y * t1.y + t1.z * t1.z;

        FastScalarInvSqrt_x3( &f1, &f2, &f3 );
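        // FastScalarInvSqrt_x3() batches the three reciprocal square roots so
        // their estimates and refinements can overlap; the sign selects below
        // then flip the tangents of texture-mirrored triangles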
#ifdef PPC_INTRINSICS
        f2 = __fsel( area, f2, -f2 );
        f3 = __fsel( area, f3, -f3 );
#else
        f2 = ( area < 0.0f ) ? -f2 : f2;
        f3 = ( area < 0.0f ) ? -f3 : f3;
#endif
#ifdef DERIVE_UNSMOOTH_DRAWVERT_ALIGNED

#define DERIVE_UNSMOOTHED_BITANGENT

    assert( IS_16BYTE_ALIGNED( verts[0] ) );
    vector float vecVertA, vecVertB, vecVertC;
    vector float vecVertA2, vecVertB2, vecVertC2;
    vector float vecVertA3, vecVertB3, vecVertC3;
    vector float vecVertA4, vecVertB4, vecVertC4;

    vector float v0, v1, v2, v3, v4, v5, v6, v7, v8;
    vector float vecS0, vecS1, vecS2;
    vector float vecS0_2, vecS1_2, vecS2_2;
    vector float vecS0_3, vecS1_3, vecS2_3;
    vector float vecS0_4, vecS1_4, vecS2_4;

    vector float vecD1, vecD2, vecD3, vecD4, vecD5, vecD6;
    vector float vecD7, vecD8, vecD9, vecD10, vecD11, vecD12;
    vector float vecT1, vecT1_2, vecT1_3, vecT1_4, vecT2, vecT2_2, vecT2_3, vecT2_4;
    vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
    vector float vecN, vecN2, vecN3, vecN4;

    vector unsigned char vecPermN0 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
    vector unsigned char vecPermN1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
    vector unsigned char vecPermT0 = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3);
    vector unsigned char vecPermT1 = (vector unsigned char)(8,9,10,11,8,9,10,11,8,9,10,11,8,9,10,11);
    vector float zeroVector = (vector float)(0);

    vector float vecNegOne = (vector float)(-1.0);

    vector float vecStore1, vecStore2, vecStore3;
    vector unsigned char vecPermFirstThreeLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
    vector unsigned char vecPermStoreSecond = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
    vector unsigned char vecPermLeadAndThree = (vector unsigned char)(0,1,2,3,16,17,18,19,20,21,22,23,24,25,26,27);
    vector unsigned char vecPermStore2 = (vector unsigned char)(4,5,6,7,8,9,10,11,24,25,26,27,28,29,30,31);
    vector unsigned char vecPermStore3 = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
    vector unsigned char vecPermStore4 = (vector unsigned char)(8,9,10,11,16,17,18,19,20,21,22,23,24,25,26,27);
    vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);

    vector float vecLd1, vecLd2, vecLd3;
    vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3, vecPerm4;

    vector float vecFirstHalf, vecSecondHalf;
    vector float vecFirstHalf2, vecSecondHalf2;
    vector float vecFirstHalf3, vecSecondHalf3;
    vector float vecFirstHalf4, vecSecondHalf4;
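    /*
     * Unsmoothed tangents: each vertex takes its normal and tangents from a
     * single dominant triangle (vertex offsets v2 and v3 plus the three
     * normalizationScale factors stored in dominantTris).  Four vertices are
     * processed per pass; the scalar cleanup loop below spells out the same
     * formulas, e.g. n0 = s2 * ( d6 * d2 - d7 * d1 ).
     */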
    for ( i = 0; i+3 < numVerts; i+=4 ) {
        int bOffset1, bOffset2, bOffset3, bOffset4;
        int cOffset1, cOffset2, cOffset3, cOffset4;

        bOffset1 = dominantTris[i].v2;
        cOffset1 = dominantTris[i].v3;
        bOffset2 = dominantTris[i+1].v2;
        cOffset2 = dominantTris[i+1].v3;
        bOffset3 = dominantTris[i+2].v2;
        cOffset3 = dominantTris[i+2].v3;
        bOffset4 = dominantTris[i+3].v2;
        cOffset4 = dominantTris[i+3].v3;
        vecPerm0 = vec_lvsl( 0, xyzPtr + ( i * DRAWVERT_OFFSET ) );
        v0 = vec_ld( 0, xyzPtr + ( i * DRAWVERT_OFFSET ) );
        v1 = vec_ld( 16, xyzPtr + ( i * DRAWVERT_OFFSET ) );
        vecVertA = vec_perm( v0, v1, vecPerm0 );

        vecPerm1 = vec_lvsl( 0, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
        v2 = vec_ld( 0, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
        v3 = vec_ld( 16, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
        vecVertB = vec_perm( v2, v3, vecPerm1 );

        vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
        v4 = vec_ld( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
        v5 = vec_ld( 16, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
        vecVertC = vec_perm( v4, v5, vecPerm2 );

        // pack the st coordinates of the first triangle
        v1 = vec_perm( v1, v1, vecPerm0 );
        v3 = vec_perm( v3, v3, vecPerm1 );
        v5 = vec_perm( v5, v5, vecPerm2 );

        v1 = vec_mergeh( v1, v5 );
        v2 = vec_mergeh( v3, zeroVector );
        v2 = vec_mergeh( v1, v2 );
        v2 = vec_perm( v2, v2, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );

        vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
        v0 = vec_ld( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
        v1 = vec_ld( 16, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
        vecVertA2 = vec_perm( v0, v1, vecPerm0 );

        vecPerm3 = vec_lvsl( 0, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
        v3 = vec_ld( 0, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
        v4 = vec_ld( 16, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
        vecVertB2 = vec_perm( v3, v4, vecPerm3 );

        vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
        v5 = vec_ld( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
        v6 = vec_ld( 16, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
        vecVertC2 = vec_perm( v5, v6, vecPerm4 );

        v1 = vec_perm( v1, v1, vecPerm0 );
        v4 = vec_perm( v4, v4, vecPerm3 );
        v5 = vec_perm( v6, v6, vecPerm4 );

        v1 = vec_mergeh( v1, v5 );
        v3 = vec_mergeh( v4, zeroVector );
        v3 = vec_mergeh( v1, v3 );
        v3 = vec_perm( v3, v3, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );

        vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
        v0 = vec_ld( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
        v1 = vec_ld( 16, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
        vecVertA3 = vec_perm( v0, v1, vecPerm0 );

        vecPerm1 = vec_lvsl( 0, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
        v4 = vec_ld( 0, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
        v5 = vec_ld( 16, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
        vecVertB3 = vec_perm( v4, v5, vecPerm1 );

        vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
        v6 = vec_ld( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
        v7 = vec_ld( 16, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
        vecVertC3 = vec_perm( v6, v7, vecPerm2 );

        v1 = vec_perm( v1, v1, vecPerm0 );
        v5 = vec_perm( v5, v5, vecPerm1 );
        v7 = vec_perm( v7, v7, vecPerm2 );

        v1 = vec_mergeh( v1, v7 );
        v4 = vec_mergeh( v5, zeroVector );
        v4 = vec_mergeh( v1, v4 );
        v4 = vec_perm( v4, v4, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );

        vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
        v0 = vec_ld( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
        v1 = vec_ld( 16, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
        vecVertA4 = vec_perm( v0, v1, vecPerm0 );

        vecPerm3 = vec_lvsl( 0, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
        v5 = vec_ld( 0, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
        v6 = vec_ld( 16, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
        vecVertB4 = vec_perm( v5, v6, vecPerm3 );

        vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
        v7 = vec_ld( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
        v8 = vec_ld( 16, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
        vecVertC4 = vec_perm( v7, v8, vecPerm4 );

        v1 = vec_perm( v1, v1, vecPerm0 );
        v6 = vec_perm( v6, v6, vecPerm3 );
        v8 = vec_perm( v8, v8, vecPerm4 );

        v1 = vec_mergeh( v1, v8 );
        v5 = vec_mergeh( v6, zeroVector );
        v5 = vec_mergeh( v1, v5 );
        v5 = vec_perm( v5, v5, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
        // b - a
        vecD1 = vec_sub( vecVertB, vecVertA );
        vecD4 = vec_sub( vecVertB2, vecVertA2 );
        vecD7 = vec_sub( vecVertB3, vecVertA3 );
        vecD10 = vec_sub( vecVertB4, vecVertA4 );

        // c - a
        vecD2 = vec_sub( vecVertC, vecVertA );
        vecD5 = vec_sub( vecVertC2, vecVertA2 );
        vecD8 = vec_sub( vecVertC3, vecVertA3 );
        vecD11 = vec_sub( vecVertC4, vecVertA4 );

        // st deltas
        vecD3 = vec_sub( v2, vec_sld( v2, v2, 4 ) );
        vecD6 = vec_sub( v3, vec_sld( v3, v3, 4 ) );
        vecD9 = vec_sub( v4, vec_sld( v4, v4, 4 ) );
        vecD12 = vec_sub( v5, vec_sld( v5, v5, 4 ) );
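        // vecD3/vecD6/vecD9/vecD12 hold the packed texture-coordinate deltas;
        // vecPermT0 and vecPermT1 later splat single st components out of them
        // so a whole xyz edge can be scaled with one vec_madd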
        vecPerm1 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i].normalizationScale[0] ), (vector unsigned char)(1) );
        vecPerm2 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+1].normalizationScale[0] ), (vector unsigned char)(1) );
        vecPerm3 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+2].normalizationScale[0] ), (vector unsigned char)(1) );
        vecPerm4 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+3].normalizationScale[0] ), (vector unsigned char)(1) );

        v0 = vec_ld( 0, &dominantTris[i].normalizationScale[0] );
        v1 = vec_ld( 11, &dominantTris[i].normalizationScale[0] );
        v2 = vec_ld( 0, &dominantTris[i+1].normalizationScale[0] );
        v3 = vec_ld( 11, &dominantTris[i+1].normalizationScale[0] );
        v4 = vec_ld( 0, &dominantTris[i+2].normalizationScale[0] );
        v5 = vec_ld( 11, &dominantTris[i+2].normalizationScale[0] );
        v6 = vec_ld( 0, &dominantTris[i+3].normalizationScale[0] );
        v7 = vec_ld( 11, &dominantTris[i+3].normalizationScale[0] );

        v0 = vec_perm( v0, v1, vecPerm1 );
        v2 = vec_perm( v2, v3, vecPerm2 );
        v4 = vec_perm( v4, v5, vecPerm3 );
        v6 = vec_perm( v6, v7, vecPerm4 );

        vecS0 = vec_splat( v0, 0 );
        vecS1 = vec_splat( v0, 1 );
        vecS2 = vec_splat( v0, 2 );

        vecS0_2 = vec_splat( v2, 0 );
        vecS1_2 = vec_splat( v2, 1 );
        vecS2_2 = vec_splat( v2, 2 );

        vecS0_3 = vec_splat( v4, 0 );
        vecS1_3 = vec_splat( v4, 1 );
        vecS2_3 = vec_splat( v4, 2 );

        vecS0_4 = vec_splat( v6, 0 );
        vecS1_4 = vec_splat( v6, 1 );
        vecS2_4 = vec_splat( v6, 2 );
        vecWork1 = vec_perm( vecD2, vecD2, vecPermN1 );
        vecWork2 = vec_perm( vecD1, vecD1, vecPermN0 );
        vecWork3 = vec_perm( vecD5, vecD5, vecPermN1 );
        vecWork4 = vec_perm( vecD4, vecD4, vecPermN0 );
        vecWork5 = vec_perm( vecD8, vecD8, vecPermN1 );
        vecWork6 = vec_perm( vecD7, vecD7, vecPermN0 );
        vecWork7 = vec_perm( vecD11, vecD11, vecPermN1 );
        vecWork8 = vec_perm( vecD10, vecD10, vecPermN0 );

        vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
        vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
        vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
        vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );

        vecWork1 = vec_perm( vecD2, vecD2, vecPermN0 );
        vecWork2 = vec_perm( vecD1, vecD1, vecPermN1 );
        vecWork3 = vec_perm( vecD5, vecD5, vecPermN0 );
        vecWork4 = vec_perm( vecD4, vecD4, vecPermN1 );
        vecWork5 = vec_perm( vecD8, vecD8, vecPermN0 );
        vecWork6 = vec_perm( vecD7, vecD7, vecPermN1 );
        vecWork7 = vec_perm( vecD11, vecD11, vecPermN0 );
        vecWork8 = vec_perm( vecD10, vecD10, vecPermN1 );

        vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
        vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
        vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
        vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );

        vecN = vec_madd( vecS2, vecSecondHalf, zeroVector );
        vecN2 = vec_madd( vecS2_2, vecSecondHalf2, zeroVector );
        vecN3 = vec_madd( vecS2_3, vecSecondHalf3, zeroVector );
        vecN4 = vec_madd( vecS2_4, vecSecondHalf4, zeroVector );
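        // the normals are done (s2 times the edge cross product, built from a
        // vec_madd plus a vec_nmsub); the same shuffled multiply-add pattern
        // now produces the first tangent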
        vecWork1 = vecD1;
        vecWork2 = vec_perm( vecD3, vecD3, vecPermT1 );
        vecWork3 = vecD4;
        vecWork4 = vec_perm( vecD6, vecD6, vecPermT1 );
        vecWork5 = vecD7;
        vecWork6 = vec_perm( vecD9, vecD9, vecPermT1 );
        vecWork7 = vecD10;
        vecWork8 = vec_perm( vecD12, vecD12, vecPermT1 );

        vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
        vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
        vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
        vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );

        vecWork1 = vecD2;
        vecWork2 = vec_perm( vecD3, vecD3, vecPermT0 );
        vecWork3 = vecD5;
        vecWork4 = vec_perm( vecD6, vecD6, vecPermT0 );
        vecWork5 = vecD8;
        vecWork6 = vec_perm( vecD9, vecD9, vecPermT0 );
        vecWork7 = vecD11;
        vecWork8 = vec_perm( vecD12, vecD12, vecPermT0 );

        vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
        vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
        vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
        vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );

        vecT1 = vec_madd( vecS0, vecSecondHalf, zeroVector );
        vecT1_2 = vec_madd( vecS0_2, vecSecondHalf2, zeroVector );
        vecT1_3 = vec_madd( vecS0_3, vecSecondHalf3, zeroVector );
        vecT1_4 = vec_madd( vecS0_4, vecSecondHalf4, zeroVector );
#ifndef DERIVE_UNSMOOTHED_BITANGENT
        vecWork2 = vec_perm( vecD2, vecD2, vecPermT2 );
        vecWork4 = vec_perm( vecD5, vecD5, vecPermT2 );
        vecWork6 = vec_perm( vecD8, vecD8, vecPermT2 );
        vecWork8 = vec_perm( vecD11, vecD11, vecPermT2 );

        vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
        vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
        vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
        vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );

        vecWork1 = vec_perm( vecD1, vecD1, vecPermT2 );
        vecWork3 = vec_perm( vecD4, vecD4, vecPermT2 );
        vecWork5 = vec_perm( vecD7, vecD7, vecPermT2 );
        vecWork7 = vec_perm( vecD10, vecD10, vecPermT2 );

        vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
        vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
        vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
        vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
#else
        // bitangent from the cross product of the first tangent and the normal
        vecWork1 = vec_perm( vecN, vecN, vecPermN1 );
        vecWork2 = vec_perm( vecT1, vecT1, vecPermN0 );
        vecWork3 = vec_perm( vecN2, vecN2, vecPermN1 );
        vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN0 );
        vecWork5 = vec_perm( vecN3, vecN3, vecPermN1 );
        vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN0 );
        vecWork7 = vec_perm( vecN4, vecN4, vecPermN1 );
        vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN0 );

        vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
        vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
        vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
        vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );

        vecWork1 = vec_perm( vecN, vecN, vecPermN0 );
        vecWork2 = vec_perm( vecT1, vecT1, vecPermN1 );
        vecWork3 = vec_perm( vecN2, vecN2, vecPermN0 );
        vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN1 );
        vecWork5 = vec_perm( vecN3, vecN3, vecPermN0 );
        vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN1 );
        vecWork7 = vec_perm( vecN4, vecN4, vecPermN0 );
        vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN1 );

        vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
        vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
        vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
        vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
#endif
        vecSecondHalf = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
        vecSecondHalf2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
        vecSecondHalf3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
        vecSecondHalf4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );

        vecT2 = vec_madd( vecS1, vecSecondHalf, zeroVector );
        vecT2_2 = vec_madd( vecS1_2, vecSecondHalf2, zeroVector );
        vecT2_3 = vec_madd( vecS1_3, vecSecondHalf3, zeroVector );
        vecT2_4 = vec_madd( vecS1_4, vecSecondHalf4, zeroVector );
        // store the normal and both tangents back into the drawverts, merging
        // with the surrounding fields already in memory
        vecLd1 = vec_ld( 0, normalPtr + ( i * DRAWVERT_OFFSET ) );
        vecLd2 = vec_ld( 32, normalPtr + ( i * DRAWVERT_OFFSET ) );

        vecStore1 = vec_perm( vecLd1, vecN, vecPermLeadAndThree );
        vecStore2 = vec_perm( vecT1, vecT2, vecPermFirstThreeLast );
        vecStore3 = vec_perm( vecT2, vecLd2, vecPermStore2 );

        ALIGNED_STORE3( normalPtr + ( i * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );

        vecLd3 = vec_ld( 32, normalPtr + ( (i+1) * DRAWVERT_OFFSET ) );

        vecStore1 = vec_perm( vecN2, vecT1_2, vecPermFirstThreeLast );
        vecStore2 = vec_perm( vecT1_2, vecT2_2, vecPermStoreSecond );
        vecStore3 = vec_perm( vecT2_2, vecLd3, (vector unsigned char)(8,9,10,11,20,21,22,23,24,25,26,27,28,29,30,31) );

        ALIGNED_STORE3( normalPtr + ((i+1) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );

        vecLd1 = vec_ld( 0, normalPtr + ( (i+2) * DRAWVERT_OFFSET ) );

        vecStore1 = vec_perm( vecLd1, vecN3, vecPermFirstThreeLast );
        vecStore2 = vec_perm( vecN3, vecT1_3, vecPermStore3 );
        vecStore3 = vec_perm( vecT1_3, vecT2_3, vecPermStore4 );

        ALIGNED_STORE3( normalPtr + ((i+2) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );

        vecLd2 = vec_ld( 0, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
        vecLd3 = vec_ld( 32, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );

        vecStore1 = vec_perm( vecLd2, vecN4, vecPermHalves );
        vecStore2 = vec_perm( vecN4, vecT1_4, vecPermStore4 );
        vecStore3 = vec_perm( vecT2_4, vecLd3, vecPermFirstThreeLast );

        ALIGNED_STORE3( normalPtr + ((i+3) * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
    }
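    // leftover vertices (numVerts not a multiple of 4) are handled by the
    // scalar version of the same dominant-triangle math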
    for ( ; i < numVerts; i++ ) {
        idDrawVert *a, *b, *c;
        float d0, d1, d2, d3, d4;
        float d5, d6, d7, d8, d9;
        float s0, s1, s2;
        float n0, n1, n2;
        float t0, t1, t2, t3, t4, t5;

        s0 = dominantTris[i].normalizationScale[0];
        s1 = dominantTris[i].normalizationScale[1];
        s2 = dominantTris[i].normalizationScale[2];

        a = verts + i;
        b = verts + dominantTris[i].v2;
        c = verts + dominantTris[i].v3;

        d0 = b->xyz[0] - a->xyz[0];
        d1 = b->xyz[1] - a->xyz[1];
        d2 = b->xyz[2] - a->xyz[2];
        d3 = b->st[0] - a->st[0];
        d4 = b->st[1] - a->st[1];

        d5 = c->xyz[0] - a->xyz[0];
        d6 = c->xyz[1] - a->xyz[1];
        d7 = c->xyz[2] - a->xyz[2];
        d8 = c->st[0] - a->st[0];
        d9 = c->st[1] - a->st[1];

        n0 = s2 * ( d6 * d2 - d7 * d1 );
        n1 = s2 * ( d7 * d0 - d5 * d2 );
        n2 = s2 * ( d5 * d1 - d6 * d0 );

        t0 = s0 * ( d0 * d9 - d4 * d5 );
        t1 = s0 * ( d1 * d9 - d4 * d6 );
        t2 = s0 * ( d2 * d9 - d4 * d7 );

#ifndef DERIVE_UNSMOOTHED_BITANGENT
        t3 = s1 * ( d3 * d5 - d0 * d8 );
        t4 = s1 * ( d3 * d6 - d1 * d8 );
        t5 = s1 * ( d3 * d7 - d2 * d8 );
#else
        t3 = s1 * ( n2 * t1 - n1 * t2 );
        t4 = s1 * ( n0 * t2 - n2 * t0 );
        t5 = s1 * ( n1 * t0 - n0 * t1 );
#endif
#define DERIVE_UNSMOOTHED_BITANGENT
    for ( i = 0; i < numVerts; i++ ) {
        idDrawVert *a, *b, *c;
        float d0, d1, d2, d3, d4;
        float d5, d6, d7, d8, d9;
        float s0, s1, s2;
        float n0, n1, n2;
        float t0, t1, t2, t3, t4, t5;

        s0 = dominantTris[i].normalizationScale[0];
        s1 = dominantTris[i].normalizationScale[1];
        s2 = dominantTris[i].normalizationScale[2];

        a = verts + i;
        b = verts + dominantTris[i].v2;
        c = verts + dominantTris[i].v3;

        d0 = b->xyz[0] - a->xyz[0];
        d1 = b->xyz[1] - a->xyz[1];
        d2 = b->xyz[2] - a->xyz[2];
        d3 = b->st[0] - a->st[0];
        d4 = b->st[1] - a->st[1];

        d5 = c->xyz[0] - a->xyz[0];
        d6 = c->xyz[1] - a->xyz[1];
        d7 = c->xyz[2] - a->xyz[2];
        d8 = c->st[0] - a->st[0];
        d9 = c->st[1] - a->st[1];

        n0 = s2 * ( d6 * d2 - d7 * d1 );
        n1 = s2 * ( d7 * d0 - d5 * d2 );
        n2 = s2 * ( d5 * d1 - d6 * d0 );

        t0 = s0 * ( d0 * d9 - d4 * d5 );
        t1 = s0 * ( d1 * d9 - d4 * d6 );
        t2 = s0 * ( d2 * d9 - d4 * d7 );

#ifndef DERIVE_UNSMOOTHED_BITANGENT
        t3 = s1 * ( d3 * d5 - d0 * d8 );
        t4 = s1 * ( d3 * d6 - d1 * d8 );
        t5 = s1 * ( d3 * d7 - d2 * d8 );
#else
        t3 = s1 * ( n2 * t1 - n1 * t2 );
        t4 = s1 * ( n0 * t2 - n2 * t0 );
        t5 = s1 * ( n1 * t0 - n0 * t1 );
#endif
    vector float v5, v6, v7, v8;
    vector float vec1T0, vec1T1, vec2T0, vec2T1, vec3T0, vec3T1, vec4T0, vec4T1;
    vector float vecSum, vecTSum1, vecTSum2, tempSum, tempSum2, tempSum3;
    vector float vecF, vecF2;
    vector float vecTemp, vecTemp2, vecTemp3, vecTemp4;

    register vector float zeroVector = (vector float)(0.0);

    vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
    vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
    vector unsigned char vecPermSplatFirstWithZero = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,16,17,18,19);
    vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3;
    vector unsigned char storePerm0, storePerm1, storePerm2, storePerm3;

    vector float vecTan11, vecTan12, vecTan13, vecTan21, vecTan22, vecTan23;
    vector float vecTan31, vecTan32, vecTan33, vecTan41, vecTan42, vecTan43;

    vector unsigned char vec1T0Perm, vec1T1Perm, vec2T0Perm, vec2T1Perm, vec3T0Perm, vec3T1Perm, vec4T0Perm, vec4T1Perm;
    vector unsigned char storeT11, storeT12, storeT21, storeT22, storeT31, storeT32;
    vector unsigned char storeT41, storeT42;
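    /*
     * This routine leans on one idiom throughout: a three-element dot product
     * done as an element-wise vec_madd followed by a rotate-and-add reduction.
     * For v = (a, b, c, 0):
     *
     *   sum = vec_add( v,   vec_sld( v, v, 8 ) );     // ( a+c, b, ... )
     *   sum = vec_add( sum, vec_sld( sum, sum, 4 ) ); // a+b+c in element 0
     *
     * The w components are zeroed (vecPermLast) beforehand so they never
     * pollute the sums.
     */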
    if ( i+3 < numVerts ) {
        vecPerm0 = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
        vecPerm1 = vec_add( vec_lvsl( -1, addr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
        vecPerm2 = vec_add( vec_lvsl( -1, addr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
        vecPerm3 = vec_add( vec_lvsl( -1, addr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );

        vec1T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
        vec1T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
        vec2T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
        vec2T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
        vec3T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
        vec3T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
        vec4T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
        vec4T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );

        storePerm0 = vec_lvsr( 0, addr );
        storePerm1 = vec_lvsr( 0, addr + ( 1 * DRAWVERT_OFFSET ) );
        storePerm2 = vec_lvsr( 0, addr + ( 2 * DRAWVERT_OFFSET ) );
        storePerm3 = vec_lvsr( 0, addr + ( 3 * DRAWVERT_OFFSET ) );

        storeT11 = vec_lvsr( 0, tAddr + ( 0 * DRAWVERT_OFFSET ) );
        storeT12 = vec_lvsr( 12, tAddr + ( 0 * DRAWVERT_OFFSET ) );
        storeT21 = vec_lvsr( 0, tAddr + ( 1 * DRAWVERT_OFFSET ) );
        storeT22 = vec_lvsr( 12, tAddr + ( 1 * DRAWVERT_OFFSET ) );
        storeT31 = vec_lvsr( 0, tAddr + ( 2 * DRAWVERT_OFFSET ) );
        storeT32 = vec_lvsr( 12, tAddr + ( 2 * DRAWVERT_OFFSET ) );
        storeT41 = vec_lvsr( 0, tAddr + ( 3 * DRAWVERT_OFFSET ) );
        storeT42 = vec_lvsr( 12, tAddr + ( 3 * DRAWVERT_OFFSET ) );
    }
    for ( ; i+3 < numVerts; i+=4 ) {
        vector float vecNormal11 = vec_ld( 0, addr + ( i * DRAWVERT_OFFSET ) );
        vector float vecNormal12 = vec_ld( 15, addr + ( i * DRAWVERT_OFFSET ) );
        v0 = vec_perm( vecNormal11, vecNormal12, vecPerm0 );

        vector float vecNormal21 = vec_ld( 0, addr + ((i+1) * DRAWVERT_OFFSET ) );
        vector float vecNormal22 = vec_ld( 15, addr + ((i+1) * DRAWVERT_OFFSET ) );
        v1 = vec_perm( vecNormal21, vecNormal22, vecPerm1 );

        vector float vecNormal31 = vec_ld( 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
        vector float vecNormal32 = vec_ld( 15, addr + ( (i+2) * DRAWVERT_OFFSET ) );
        v2 = vec_perm( vecNormal31, vecNormal32, vecPerm2 );

        vector float vecNormal41 = vec_ld( 0, addr + ((i+3) * DRAWVERT_OFFSET ) );
        vector float vecNormal42 = vec_ld( 15, addr + ((i+3) * DRAWVERT_OFFSET ) );
        v3 = vec_perm( vecNormal41, vecNormal42, vecPerm3 );

        // zero out the w components
        v0 = vec_perm( v0, zeroVector, vecPermLast );
        v1 = vec_perm( v1, zeroVector, vecPermLast );
        v2 = vec_perm( v2, zeroVector, vecPermLast );
        v3 = vec_perm( v3, zeroVector, vecPermLast );

        // squared length of each normal, gathered into one vector
        vecTemp = vec_madd( v0, v0, zeroVector );
        vecSum = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
        vecSum = vec_add( vecSum, vec_sld( vecSum, vecSum, 4 ) );

        vecTemp2 = vec_madd( v1, v1, zeroVector );
        tempSum = vec_add( vecTemp2, vec_sld( vecTemp2, vecTemp2, 8 ) );
        tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
        vecSum = vec_mergeh( vecSum, tempSum );

        vecTemp3 = vec_madd( v2, v2, zeroVector );
        tempSum = vec_add( vecTemp3, vec_sld( vecTemp3, vecTemp3, 8 ) );
        tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
        vecSum = vec_perm( vecSum, tempSum, vecPermHalves );

        vecTemp4 = vec_madd( v3, v3, zeroVector );
        tempSum = vec_add( vecTemp4, vec_sld( vecTemp4, vecTemp4, 8 ) );
        tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
        vecSum = vec_perm( vecSum, tempSum, vecPermLast );

        vecF = ReciprocalSquareRoot( vecSum );

        // normalize the four normals
        v5 = vec_madd( v0, vec_splat( vecF, 0 ), zeroVector );
        v6 = vec_madd( v1, vec_splat( vecF, 1 ), zeroVector );
        v7 = vec_madd( v2, vec_splat( vecF, 2 ), zeroVector );
        v8 = vec_madd( v3, vec_splat( vecF, 3 ), zeroVector );
        // load both tangents of each vertex
        vecTan11 = vec_ld( 0, tAddr + ( i * DRAWVERT_OFFSET ) );
        vecTan12 = vec_ld( 11, tAddr + ( i * DRAWVERT_OFFSET ) );
        vecTan13 = vec_ld( 23, tAddr + ( i * DRAWVERT_OFFSET ) );

        vecTan21 = vec_ld( 0, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
        vecTan22 = vec_ld( 11, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
        vecTan23 = vec_ld( 23, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );

        vecTan31 = vec_ld( 0, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
        vecTan32 = vec_ld( 11, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
        vecTan33 = vec_ld( 23, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );

        vecTan41 = vec_ld( 0, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
        vecTan42 = vec_ld( 11, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
        vecTan43 = vec_ld( 23, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );

        vec1T0 = vec_perm( vecTan11, vecTan12, vec1T0Perm );
        vec1T1 = vec_perm( vecTan12, vecTan13, vec1T1Perm );
        vec2T0 = vec_perm( vecTan21, vecTan22, vec2T0Perm );
        vec2T1 = vec_perm( vecTan22, vecTan23, vec2T1Perm );
        vec3T0 = vec_perm( vecTan31, vecTan32, vec3T0Perm );
        vec3T1 = vec_perm( vecTan32, vecTan33, vec3T1Perm );
        vec4T0 = vec_perm( vecTan41, vecTan42, vec4T0Perm );
        vec4T1 = vec_perm( vecTan42, vecTan43, vec4T1Perm );

        // zero out the w components
        vec1T0 = vec_perm( vec1T0, zeroVector, vecPermLast );
        vec1T1 = vec_perm( vec1T1, zeroVector, vecPermLast );
        vec2T0 = vec_perm( vec2T0, zeroVector, vecPermLast );
        vec2T1 = vec_perm( vec2T1, zeroVector, vecPermLast );
        vec3T0 = vec_perm( vec3T0, zeroVector, vecPermLast );
        vec3T1 = vec_perm( vec3T1, zeroVector, vecPermLast );
        vec4T0 = vec_perm( vec4T0, zeroVector, vecPermLast );
        vec4T1 = vec_perm( vec4T1, zeroVector, vecPermLast );
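        /*
         * Gram-Schmidt step, four vertices at a time: each tangent has its
         * projection onto the (already normalized) vertex normal subtracted,
         * t = t - (t . n) * n, before being renormalized.  The dot product
         * uses the reduction idiom above, and vecPermSplatFirstWithZero
         * broadcasts it to x, y and z while keeping w zero.
         */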
        tempSum = zeroVector;
        tempSum = vec_madd( vec1T0, v5, tempSum );
        vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
        vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
        vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
        vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
        vec1T0 = vec_sub( vec1T0, vecTSum1 );

        tempSum = zeroVector;
        tempSum = vec_madd( vec2T0, v6, tempSum );
        vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
        vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
        vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
        vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
        vec2T0 = vec_sub( vec2T0, vecTSum1 );

        tempSum = zeroVector;
        tempSum = vec_madd( vec3T0, v7, tempSum );
        vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
        vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
        vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
        vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
        vec3T0 = vec_sub( vec3T0, vecTSum1 );

        tempSum = zeroVector;
        tempSum = vec_madd( vec4T0, v8, tempSum );
        vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
        vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
        vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
        vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
        vec4T0 = vec_sub( vec4T0, vecTSum1 );

        tempSum = zeroVector;
        tempSum = vec_madd( vec1T1, v5, tempSum );
        vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
        vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
        vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
        vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
        vec1T1 = vec_sub( vec1T1, vecTSum1 );

        tempSum = zeroVector;
        tempSum = vec_madd( vec2T1, v6, tempSum );
        vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
        vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
        vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
        vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
        vec2T1 = vec_sub( vec2T1, vecTSum1 );

        tempSum = zeroVector;
        tempSum = vec_madd( vec3T1, v7, tempSum );
        vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
        vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
        vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
        vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
        vec3T1 = vec_sub( vec3T1, vecTSum1 );

        tempSum = zeroVector;
        tempSum = vec_madd( vec4T1, v8, tempSum );
        vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
        vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
        vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
        vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
        vec4T1 = vec_sub( vec4T1, vecTSum1 );
        // squared lengths of the projected tangents, gathered per group
        vecTemp = vec_madd( vec1T0, vec1T0, zeroVector );
        vecTSum1 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
        vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );

        vecTemp = vec_madd( vec2T0, vec2T0, zeroVector );
        tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
        tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );

        vecTemp = vec_madd( vec3T0, vec3T0, zeroVector );
        vecTSum1 = vec_mergeh( vecTSum1, tempSum2 );
        tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
        tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );

        vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermHalves );
        vecTemp = vec_madd( vec4T0, vec4T0, zeroVector );
        tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
        tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );

        vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermLast );

        vecTemp = vec_madd( vec1T1, vec1T1, zeroVector );
        vecTSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
        vecTSum2 = vec_add( vecTSum2, vec_sld( vecTSum2, vecTSum2, 4 ) );

        vecTemp = vec_madd( vec2T1, vec2T1, zeroVector );
        tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
        tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );

        vecTSum2 = vec_mergeh( vecTSum2, tempSum3 );
        vecTemp = vec_madd( vec3T1, vec3T1, zeroVector );
        tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
        tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );

        vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermHalves );
        vecTemp = vec_madd( vec4T1, vec4T1, zeroVector );
        tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
        tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );

        vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermLast );

        vecF = ReciprocalSquareRoot( vecTSum1 );
        vecF2 = ReciprocalSquareRoot( vecTSum2 );

        // normalize the tangents
        vec1T0 = vec_madd( vec1T0, vec_splat( vecF, 0 ), zeroVector );
        vec2T0 = vec_madd( vec2T0, vec_splat( vecF, 1 ), zeroVector );
        vec3T0 = vec_madd( vec3T0, vec_splat( vecF, 2 ), zeroVector );
        vec4T0 = vec_madd( vec4T0, vec_splat( vecF, 3 ), zeroVector );

        vec1T1 = vec_madd( vec1T1, vec_splat( vecF2, 0 ), zeroVector );
        vec2T1 = vec_madd( vec2T1, vec_splat( vecF2, 1 ), zeroVector );
        vec3T1 = vec_madd( vec3T1, vec_splat( vecF2, 2 ), zeroVector );
        vec4T1 = vec_madd( vec4T1, vec_splat( vecF2, 3 ), zeroVector );
        // rotate the results for the unaligned element stores
        v5 = vec_perm( v5, v5, storePerm0 );
        v6 = vec_perm( v6, v6, storePerm1 );
        v7 = vec_perm( v7, v7, storePerm2 );
        v8 = vec_perm( v8, v8, storePerm3 );

        vec_ste( v5, 0, addr + ( (i+0) * DRAWVERT_OFFSET ) );
        vec_ste( v5, 4, addr + ( (i+0) * DRAWVERT_OFFSET ) );
        vec_ste( v5, 8, addr + ( (i+0) * DRAWVERT_OFFSET ) );

        vec_ste( v6, 0, addr + ( (i+1) * DRAWVERT_OFFSET ) );
        vec_ste( v6, 4, addr + ( (i+1) * DRAWVERT_OFFSET ) );
        vec_ste( v6, 8, addr + ( (i+1) * DRAWVERT_OFFSET ) );

        vec_ste( v7, 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
        vec_ste( v7, 4, addr + ( (i+2) * DRAWVERT_OFFSET ) );
        vec_ste( v7, 8, addr + ( (i+2) * DRAWVERT_OFFSET ) );

        vec_ste( v8, 0, addr + ( (i+3) * DRAWVERT_OFFSET ) );
        vec_ste( v8, 4, addr + ( (i+3) * DRAWVERT_OFFSET ) );
        vec_ste( v8, 8, addr + ( (i+3) * DRAWVERT_OFFSET ) );

        vec1T0 = vec_perm( vec1T0, vec1T0, storeT11 );
        vec1T1 = vec_perm( vec1T1, vec1T1, storeT12 );

        vec_ste( vec1T0, 0, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
        vec_ste( vec1T0, 4, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
        vec_ste( vec1T0, 8, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
        vec_ste( vec1T1, 12, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
        vec_ste( vec1T1, 16, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
        vec_ste( vec1T1, 20, tAddr + ((i+0) * DRAWVERT_OFFSET ) );

        vec2T0 = vec_perm( vec2T0, vec2T0, storeT21 );
        vec2T1 = vec_perm( vec2T1, vec2T1, storeT22 );

        vec_ste( vec2T0, 0, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
        vec_ste( vec2T0, 4, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
        vec_ste( vec2T0, 8, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
        vec_ste( vec2T1, 12, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
        vec_ste( vec2T1, 16, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
        vec_ste( vec2T1, 20, tAddr + ((i+1) * DRAWVERT_OFFSET ) );

        vec3T0 = vec_perm( vec3T0, vec3T0, storeT31 );
        vec3T1 = vec_perm( vec3T1, vec3T1, storeT32 );

        vec_ste( vec3T0, 0, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
        vec_ste( vec3T0, 4, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
        vec_ste( vec3T0, 8, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
        vec_ste( vec3T1, 12, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
        vec_ste( vec3T1, 16, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
        vec_ste( vec3T1, 20, tAddr + ((i+2) * DRAWVERT_OFFSET ) );

        vec4T0 = vec_perm( vec4T0, vec4T0, storeT41 );
        vec4T1 = vec_perm( vec4T1, vec4T1, storeT42 );

        vec_ste( vec4T0, 0, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
        vec_ste( vec4T0, 4, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
        vec_ste( vec4T0, 8, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
        vec_ste( vec4T1, 12, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
        vec_ste( vec4T1, 16, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
        vec_ste( vec4T1, 20, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
    }
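    // scalar cleanup: normalize the remaining normals, then project and
    // normalize their two tangents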
    for ( ; i < numVerts; i++ ) {
        idVec3 &v = verts[i].normal;
        float f;

        f = FastScalarInvSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
        v.x *= f; v.y *= f; v.z *= f;

        for ( int j = 0; j < 2; j++ ) {
            idVec3 &t = verts[i].tangents[j];

            t -= ( t * v ) * v;
            f = FastScalarInvSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
            t.x *= f; t.y *= f; t.z *= f;
        }
    }
#ifdef ENABLE_CREATE

    bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
    memset( used, 0, numVerts * sizeof( used[0] ) );
    // mark referenced vertices, eight indexes at a time
    for ( i = 0; i+7 < numIndexes; i+= 8 ) {
        used[indexes[i]] = true;
        used[indexes[i+1]] = true;
        used[indexes[i+2]] = true;
        used[indexes[i+3]] = true;
        used[indexes[i+4]] = true;
        used[indexes[i+5]] = true;
        used[indexes[i+6]] = true;
        used[indexes[i+7]] = true;
    }

    for ( ; i < numIndexes; i++ ) {
        used[indexes[i]] = true;
    }
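    /*
     * Two vertices per iteration: the unnormalized light direction is taken
     * from each vertex to the light origin and projected into that vertex's
     * tangent space (tangents[0], tangents[1], normal) to produce the
     * texture-space light vector.
     */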
    for ( i = 0; i+1 < numVerts; i+=2 ) {
        const idDrawVert *v = &verts[i];
        const idDrawVert *v2 = &verts[i+1];
        float x, y, z, x2, y2, z2;
        idVec3 lightDir, lightDir2;

        lightDir[0] = lightOrigin[0] - v->xyz[0];
        lightDir[1] = lightOrigin[1] - v->xyz[1];
        lightDir[2] = lightOrigin[2] - v->xyz[2];

        lightDir2[0] = lightOrigin[0] - v2->xyz[0];
        lightDir2[1] = lightOrigin[1] - v2->xyz[1];
        lightDir2[2] = lightOrigin[2] - v2->xyz[2];

        x = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
        y = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
        z = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];

        x2 = lightDir2[0] * v2->tangents[0][0] + lightDir2[1] * v2->tangents[0][1] + lightDir2[2] * v2->tangents[0][2];
        y2 = lightDir2[0] * v2->tangents[1][0] + lightDir2[1] * v2->tangents[1][1] + lightDir2[2] * v2->tangents[1][2];
        z2 = lightDir2[0] * v2->normal[0] + lightDir2[1] * v2->normal[1] + lightDir2[2] * v2->normal[2];

        lightVectors[i][0] = x;
        lightVectors[i][1] = y;
        lightVectors[i][2] = z;

        lightVectors[i+1][0] = x2;
        lightVectors[i+1][1] = y2;
        lightVectors[i+1][2] = z2;
    }
    // cleanup
    for ( ; i < numVerts; i++ ) {
        if ( !used[i] ) {
            continue;
        }

        const idDrawVert *v = &verts[i];
        idVec3 lightDir;

        lightDir[0] = lightOrigin[0] - v->xyz[0];
        lightDir[1] = lightOrigin[1] - v->xyz[1];
        lightDir[2] = lightOrigin[2] - v->xyz[2];

        lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
        lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
        lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
    }
    bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
    memset( used, 0, numVerts * sizeof( used[0] ) );
    // mark referenced vertices, eight indexes at a time
    for ( i = 0; i+7 < numIndexes; i+= 8 ) {
        used[indexes[i]] = true;
        used[indexes[i+1]] = true;
        used[indexes[i+2]] = true;
        used[indexes[i+3]] = true;
        used[indexes[i+4]] = true;
        used[indexes[i+5]] = true;
        used[indexes[i+6]] = true;
        used[indexes[i+7]] = true;
    }

    for ( ; i < numIndexes; i++ ) {
        used[indexes[i]] = true;
    }
    const float *lightOriginPtr = lightOrigin.ToFloatPtr();
    const float *viewOriginPtr = viewOrigin.ToFloatPtr();

    vector unsigned char permVec = vec_lvsl( 0, lightOriginPtr );
    vector unsigned char permVec2 = vec_lvsl( 0, viewOriginPtr );
    vector float v0 = vec_ld( 0, lightOriginPtr );
    vector float v1 = vec_ld( 15, lightOriginPtr );
    vector float v2 = vec_ld( 0, viewOriginPtr );
    vector float v3 = vec_ld( 15, viewOriginPtr );
    vector float vecLightOrigin = vec_perm( v0, v1, permVec );
    vector float vecViewOrigin = vec_perm( v2, v3, permVec2 );
    const vector float zeroVector = (vector float)(0);
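    /*
     * Specular texture coordinates: for each vertex the light and view
     * directions are both normalized (via ReciprocalSquareRoot of their
     * squared lengths) and summed into an unnormalized half-angle vector,
     * which is then dotted with the two tangents and the normal; the w
     * coordinate is stored as 1.0.  Two vertices are processed per iteration.
     */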
    for ( index = 0; index+1 < numVerts; index+=2 ) {
        const float *vertPtr = verts[index].xyz.ToFloatPtr();
        const float *vertPtr2 = verts[index+1].xyz.ToFloatPtr();

        permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
        permVec2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );

        v0 = vec_ld( 0, vertPtr );
        v1 = vec_ld( 15, vertPtr );
        vector float v2 = vec_ld( 31, vertPtr );
        vector float v3 = vec_ld( 47, vertPtr );
        vector float v4 = vec_ld( 63, vertPtr );

        vector float v5 = vec_ld( 0, vertPtr2 );
        vector float v6 = vec_ld( 15, vertPtr2 );
        vector float v7 = vec_ld( 31, vertPtr2 );
        vector float v8 = vec_ld( 47, vertPtr2 );
        vector float v9 = vec_ld( 63, vertPtr2 );

        // pick out the xyz, normal and both tangents of each vertex
        vector float vecXYZ = vec_perm( v0, v1, permVec );
        vector float vecNormal = vec_perm( v1, v2, permVec );
        vecNormal = vec_sld( vecNormal, vecNormal, 4 );
        const vector float vecTangent0 = vec_perm( v2, v3, permVec );
        permVec = vec_add( permVec, (vector unsigned char)(-4) );
        const vector float vecTangent1 = vec_perm( v3, v4, permVec );

        vector float vecXYZ2 = vec_perm( v5, v6, permVec2 );
        vector float vecNormal2 = vec_perm( v6, v7, permVec2 );
        vecNormal2 = vec_sld( vecNormal2, vecNormal2, 4 );
        const vector float vecTangent02 = vec_perm( v7, v8, permVec2 );
        permVec2 = vec_add( permVec2, (vector unsigned char)(-4) );
        const vector float vecTangent12 = vec_perm( v8, v9, permVec2 );
        vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
        vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );

        vector float vecLightDir2 = vec_sub( vecLightOrigin, vecXYZ2 );
        vector float vecViewDir2 = vec_sub( vecViewOrigin, vecXYZ2 );

        vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
        vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );

        vector float vecTempLight2 = vec_madd( vecLightDir2, vecLightDir2, zeroVector );
        vector float vecTempView2 = vec_madd( vecViewDir2, vecViewDir2, zeroVector );

        vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
        vecTempLight = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
        vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
        vecTempView = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 8 ) );

        vector float tempSum4 = vec_add( vecTempLight2, vec_sld( vecTempLight2, vecTempLight2, 4 ) );
        vecTempLight2 = vec_add( tempSum4, vec_sld( tempSum4, tempSum4, 8 ) );
        vector float tempSum5 = vec_add( vecTempView2, vec_sld( vecTempView2, vecTempView2, 4 ) );
        vecTempView2 = vec_add( tempSum5, vec_sld( tempSum5, tempSum5, 8 ) );

        vecTempLight = vec_splat( vecTempLight, 0 );
        vecTempView = vec_splat( vecTempView, 0 );

        vecTempLight2 = vec_splat( vecTempLight2, 0 );
        vecTempView2 = vec_splat( vecTempView2, 0 );

        vecTempLight = ReciprocalSquareRoot( vecTempLight );
        vecTempView = ReciprocalSquareRoot( vecTempView );

        vecTempLight2 = ReciprocalSquareRoot( vecTempLight2 );
        vecTempView2 = ReciprocalSquareRoot( vecTempView2 );

        // normalize the view direction, then add the normalized light direction
        vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
        vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );

        vecViewDir2 = vec_madd( vecViewDir2, vecTempView2, zeroVector );
        vecLightDir2 = vec_madd( vecLightDir2, vecTempLight2, vecViewDir2 );
        vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
        vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
        vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );

        vector float vecTC3 = vec_madd( vecLightDir2, vecTangent02, zeroVector );
        vector float vecTC4 = vec_madd( vecLightDir2, vecTangent12, zeroVector );
        vector float vecTC5 = vec_madd( vecLightDir2, vecNormal2, zeroVector );

        vector float tempSum3;
        tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
        vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
        tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
        vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
        tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
        vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );

        tempSum4 = vec_add( vecTC3, vec_sld( vecTC3, vecTC3, 4 ) );
        vecTC3 = vec_add( tempSum4, vec_sld( vecTC3, vecTC3, 8 ) );
        tempSum5 = vec_add( vecTC4, vec_sld( vecTC4, vecTC4, 4 ) );
        vecTC4 = vec_add( tempSum5, vec_sld( vecTC4, vecTC4, 8 ) );
        vector float tempSum6 = vec_add( vecTC5, vec_sld( vecTC5, vecTC5, 4 ) );
        vecTC5 = vec_add( tempSum6, vec_sld( vecTC5, vecTC5, 8 ) );

        vecTC0 = vec_splat( vecTC0, 0 );
        vecTC1 = vec_splat( vecTC1, 0 );
        vecTC2 = vec_splat( vecTC2, 0 );

        vecTC3 = vec_splat( vecTC3, 0 );
        vecTC4 = vec_splat( vecTC4, 0 );
        vecTC5 = vec_splat( vecTC5, 0 );

        if ( used[index] ) {
            vec_ste( vecTC0, 0, &texCoords[index][0] );
            vec_ste( vecTC1, 0, &texCoords[index][1] );
            vec_ste( vecTC2, 0, &texCoords[index][2] );
            vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
        }

        if ( used[index+1] ) {
            vec_ste( vecTC3, 0, &texCoords[index+1][0] );
            vec_ste( vecTC4, 0, &texCoords[index+1][1] );
            vec_ste( vecTC5, 0, &texCoords[index+1][2] );
            vec_ste( (vector float)(1.0), 0, &texCoords[index+1][3] );
        }
    }
    // cleanup
    for ( ; index < numVerts; index++ ) {
        if ( !used[index] ) {
            continue;
        }

        const float *vertPtr = verts[index].xyz.ToFloatPtr();

        permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );

        v0 = vec_ld( 0, vertPtr );
        v1 = vec_ld( 15, vertPtr );
        vector float v2 = vec_ld( 31, vertPtr );
        vector float v3 = vec_ld( 47, vertPtr );
        vector float v4 = vec_ld( 63, vertPtr );

        vector float vecXYZ = vec_perm( v0, v1, permVec );
        vector float vecNormal = vec_perm( v1, v2, permVec );
        vecNormal = vec_sld( vecNormal, vecNormal, 4 );
        const vector float vecTangent0 = vec_perm( v2, v3, permVec );
        permVec = vec_add( permVec, (vector unsigned char)(-4) );
        const vector float vecTangent1 = vec_perm( v3, v4, permVec );

        vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
        vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );

        vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
        vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );

        vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
        vecTempLight = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
        vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
        vecTempView = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 8 ) );

        vecTempLight = vec_splat( vecTempLight, 0 );
        vecTempView = vec_splat( vecTempView, 0 );

        vecTempLight = ReciprocalSquareRoot( vecTempLight );
        vecTempView = ReciprocalSquareRoot( vecTempView );

        vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
        vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );

        vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
        vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
        vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );

        vector float tempSum3;
        tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
        vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
        tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
        vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
        tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
        vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );

        vecTC0 = vec_splat( vecTC0, 0 );
        vecTC1 = vec_splat( vecTC1, 0 );
        vecTC2 = vec_splat( vecTC2, 0 );

        vec_ste( vecTC0, 0, &texCoords[index][0] );
        vec_ste( vecTC1, 0, &texCoords[index][1] );
        vec_ste( vecTC2, 0, &texCoords[index][2] );
        vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
    }
#ifdef VERTEXCACHE_ALIGNED

    assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
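    /*
     * Shadow volume cache: every remapped vertex emits two four-component
     * entries -- the vertex itself with w = 1 and the vertex minus the light
     * origin with w = 0 -- as the scalar cleanup loop below shows.  The vector
     * path builds both from one load: v7 holds the raw xyz, one permute sets
     * w = 0, another sets w = 1.
     */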
    register vector float v0, v1, v2, v3, v4, v5, v6, v7;
    register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
    register vector float zeroVector = (vector float)(0.0);
    register vector float oneVector = (vector float)(1);
    register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);

    const float *lPtr = lightOrigin.ToFloatPtr();

    // load the light origin and zero its w component
    vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
    v0 = vec_ld( 0, lPtr );
    v1 = vec_ld( 15, lPtr );
    v0 = vec_perm( v0, v1, vecPerm );
    v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
    for ( ; i+3 < numVerts; i+= 4 ) {
        if ( ! vertRemap[i] ) {
            const float *vPtr = verts[i].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
            vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
            v2 = vec_ld( 0, vPtr );
            v3 = vec_ld( 15, vPtr );
            v7 = vec_perm( v2, v3, vecPerm2 );
#else
            v7 = vec_ld( 0, vPtr );
#endif
            v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
            v3 = vec_perm( v7, oneVector, vecPermZeroLast );
            v1 = vec_sub( v2, v0 );

            vec_st( v3, 0, &vertexCache[outVerts][0] );
            vec_st( v1, 0, &vertexCache[outVerts+1][0] );

            vertRemap[i] = outVerts;
            outVerts += 2;
        }

        if ( ! vertRemap[i+1] ) {
            const float *vPtr2 = verts[i+1].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
            vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
            v4 = vec_ld( 0, vPtr2 );
            v5 = vec_ld( 15, vPtr2 );
            v6 = vec_perm( v4, v5, vecPerm3 );
#else
            v6 = vec_ld( 0, vPtr2 );
#endif
            v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
            v5 = vec_perm( v6, oneVector, vecPermZeroLast );
            v6 = vec_sub( v4, v0 );

            vec_st( v5, 0, &vertexCache[outVerts][0] );
            vec_st( v6, 0, &vertexCache[outVerts+1][0] );

            vertRemap[i+1] = outVerts;
            outVerts += 2;
        }

        if ( ! vertRemap[i+2] ) {
            const float *vPtr3 = verts[i+2].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
            vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
            v1 = vec_ld( 0, vPtr3 );
            v2 = vec_ld( 15, vPtr3 );
            v3 = vec_perm( v1, v2, vecPerm4 );
#else
            v3 = vec_ld( 0, vPtr3 );
#endif
            v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
            v2 = vec_perm( v3, oneVector, vecPermZeroLast );
            v3 = vec_sub( v1, v0 );

            vec_st( v2, 0, &vertexCache[outVerts][0] );
            vec_st( v3, 0, &vertexCache[outVerts+1][0] );

            vertRemap[i+2] = outVerts;
            outVerts += 2;
        }

        if ( ! vertRemap[i+3] ) {
            const float *vPtr4 = verts[i+3].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
            vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
            v4 = vec_ld( 0, vPtr4 );
            v5 = vec_ld( 16, vPtr4 );
            v6 = vec_perm( v4, v5, vecPerm5 );
#else
            v6 = vec_ld( 0, vPtr4 );
#endif
            v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
            v5 = vec_perm( v6, oneVector, vecPermZeroLast );
            v6 = vec_sub( v4, v0 );

            vec_st( v5, 0, &vertexCache[outVerts][0] );
            vec_st( v6, 0, &vertexCache[outVerts+1][0] );

            vertRemap[i+3] = outVerts;
            outVerts += 2;
        }
    }
	// cleanup
	for (; i < numVerts; i++ ) {
		if ( vertRemap[i] ) {
			continue;
		}
		const float *v = verts[i].xyz.ToFloatPtr();	// assumed declaration
		vertexCache[outVerts+0][0] = v[0];
		vertexCache[outVerts+0][1] = v[1];
		vertexCache[outVerts+0][2] = v[2];
		vertexCache[outVerts+0][3] = 1.0f;

		// the w == 0 copy holds the vertex minus the light origin
		vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
		vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
		vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
		vertexCache[outVerts+1][3] = 0.0f;
		vertRemap[i] = outVerts;
		outVerts += 2;
	}
	return outVerts;
}
#else
	// unaligned vertexCache variant
	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
	register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
	register vector float zeroVector = (vector float)(0.0);
	register vector float oneVector = (vector float)(1);
	register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);

	const float *lPtr = lightOrigin.ToFloatPtr();

	// put lightOrigin into a vector with its w component zeroed out
	vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
	v0 = vec_ld( 0, lPtr );
	v1 = vec_ld( 15, lPtr );
	v0 = vec_perm( v0, v1, vecPerm );
	v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
	for ( ; i+3 < numVerts; i+= 4 ) {
		if ( ! vertRemap[i] ) {
			// assumed declaration: position pointer for this vertex
			const float *vPtr = (const float *) verts[i].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
			vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
			v2 = vec_ld( 0, vPtr );
			v3 = vec_ld( 15, vPtr );
			v7 = vec_perm( v2, v3, vecPerm2 );
#else
			v7 = vec_ld( 0, vPtr );
#endif
			v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
			v3 = vec_perm( v7, oneVector, vecPermZeroLast );
			v1 = vec_sub( v2, v0 );

			// store ( xyz, 1 ) and ( vertex - lightOrigin, 0 )
			UNALIGNED_STORE2( &vertexCache[outVerts][0], v3, v1 );

			vertRemap[i] = outVerts;
			outVerts += 2;
		}
		if ( ! vertRemap[i+1] ) {
			// assumed declaration: position pointer for this vertex
			const float *vPtr2 = (const float *) verts[i+1].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
			vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
			v4 = vec_ld( 0, vPtr2 );
			v5 = vec_ld( 15, vPtr2 );
			v6 = vec_perm( v4, v5, vecPerm3 );
#else
			v6 = vec_ld( 0, vPtr2 );
#endif
			v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
			v5 = vec_perm( v6, oneVector, vecPermZeroLast );
			v6 = vec_sub( v4, v0 );

			UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );

			vertRemap[i+1] = outVerts;
			outVerts += 2;
		}
		if ( ! vertRemap[i+2] ) {
			// assumed declaration: position pointer for this vertex
			const float *vPtr3 = (const float *) verts[i+2].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
			vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
			v1 = vec_ld( 0, vPtr3 );
			v2 = vec_ld( 15, vPtr3 );
			v3 = vec_perm( v1, v2, vecPerm4 );
#else
			v3 = vec_ld( 0, vPtr3 );
#endif
			v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
			v2 = vec_perm( v3, oneVector, vecPermZeroLast );
			v3 = vec_sub( v1, v0 );

			UNALIGNED_STORE2( &vertexCache[outVerts][0], v2, v3 );

			vertRemap[i+2] = outVerts;
			outVerts += 2;
		}
		if ( ! vertRemap[i+3] ) {
			// assumed declaration: position pointer for this vertex
			const float *vPtr4 = (const float *) verts[i+3].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
			vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
			v4 = vec_ld( 0, vPtr4 );
			v5 = vec_ld( 15, vPtr4 );	// 15, not 16, so an aligned pointer reads its own quadword
			v6 = vec_perm( v4, v5, vecPerm5 );
#else
			v6 = vec_ld( 0, vPtr4 );
#endif
			v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
			v5 = vec_perm( v6, oneVector, vecPermZeroLast );
			v6 = vec_sub( v4, v0 );

			UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );

			vertRemap[i+3] = outVerts;
			outVerts += 2;
		}
	}
	// cleanup
	for (; i < numVerts; i++ ) {
		if ( vertRemap[i] ) {
			continue;
		}
		const float *v = verts[i].xyz.ToFloatPtr();	// assumed declaration
		vertexCache[outVerts+0][0] = v[0];
		vertexCache[outVerts+0][1] = v[1];
		vertexCache[outVerts+0][2] = v[2];
		vertexCache[outVerts+0][3] = 1.0f;

		// the w == 0 copy holds the vertex minus the light origin
		vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
		vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
		vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
		vertexCache[outVerts+1][3] = 0.0f;
		vertRemap[i] = outVerts;
		outVerts += 2;
	}
	return outVerts;
}
#endif /* VERTEXCACHE_ALIGNED */
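/*
	The next routine fills a vertex-program shadow cache: unlike the remapped
	cache above, every vertex is emitted unconditionally as the pair
	( x, y, z, 1 ) and ( x, y, z, 0 ), so the output is always numVerts * 2
	entries and the light-origin work is presumably left to the vertex program.
*/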
#ifdef VERTEXCACHE_ALIGNED
	// vertexCache is 16-byte aligned
	assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
	assert( sizeof( idVec4 ) == IDVEC4_OFFSET * sizeof( float ) );

	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
	register vector float zeroVector = (vector float)(0.0);
	register vector float oneVector = (vector float)(1);
	register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
	int i = 0;	// assumed initialization

#ifndef DRAWVERT_PADDED
	// with the unpadded draw vert stride, every fourth vertex has the same
	// alignment, so four permute vectors computed up front cover the array
	if ( i+3 < numVerts ) {
		vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	}
#endif
	for ( ; i+3 < numVerts; i+=4 ) {
		// assumed declarations: position pointers for the four vertices
		const float *vertPtr = verts[i].xyz.ToFloatPtr();
		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();

#ifndef DRAWVERT_PADDED
		v0 = vec_ld( 0, vertPtr );
		v1 = vec_ld( 15, vertPtr );
		v2 = vec_ld( 0, vertPtr2 );
		v3 = vec_ld( 15, vertPtr2 );
		v4 = vec_ld( 0, vertPtr3 );
		v5 = vec_ld( 15, vertPtr3 );
		v6 = vec_ld( 0, vertPtr4 );
		v7 = vec_ld( 15, vertPtr4 );

		v0 = vec_perm( v0, v1, vertPerm1 );
		v1 = vec_perm( v2, v3, vertPerm2 );
		v2 = vec_perm( v4, v5, vertPerm3 );
		v3 = vec_perm( v6, v7, vertPerm4 );
#else
		v0 = vec_ld( 0, vertPtr );
		v1 = vec_ld( 0, vertPtr2 );
		v2 = vec_ld( 0, vertPtr3 );
		v3 = vec_ld( 0, vertPtr4 );
#endif
		// build ( x, y, z, 1 ) and ( x, y, z, 0 ) for each vertex
		v0 = vec_perm( v0, oneVector, vecPermThreeOne );
		v4 = vec_perm( v0, zeroVector, vecPermThreeOne );

		v1 = vec_perm( v1, oneVector, vecPermThreeOne );
		v5 = vec_perm( v1, zeroVector, vecPermThreeOne );

		v2 = vec_perm( v2, oneVector, vecPermThreeOne );
		v6 = vec_perm( v2, zeroVector, vecPermThreeOne );

		v3 = vec_perm( v3, oneVector, vecPermThreeOne );
		v7 = vec_perm( v3, zeroVector, vecPermThreeOne );

		// store results
		ALIGNED_STORE4( &vertexCache[i*2][0], v0, v4, v1, v5 );
		ALIGNED_STORE4( &vertexCache[(i+2)*2][0], v2, v6, v3, v7 );
	}
	// cleanup
	for ( ; i < numVerts; i++ ) {
		const float *v = verts[i].xyz.ToFloatPtr();	// assumed declaration
		vertexCache[i*2+0][0] = v[0];
		vertexCache[i*2+1][0] = v[0];
		vertexCache[i*2+0][1] = v[1];
		vertexCache[i*2+1][1] = v[1];
		vertexCache[i*2+0][2] = v[2];
		vertexCache[i*2+1][2] = v[2];
		vertexCache[i*2+0][3] = 1.0f;
		vertexCache[i*2+1][3] = 0.0f;
	}
	return numVerts * 2;
}
#else
	// unaligned vertexCache variant
	assert( sizeof( idVec4 ) == IDVEC4_OFFSET * sizeof( float ) );

	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
	register vector float zeroVector = (vector float)(0.0);
	register vector float oneVector = (vector float)(1);
	register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
	int i = 0;	// assumed initialization

#ifndef DRAWVERT_PADDED
	// with the unpadded draw vert stride, every fourth vertex has the same
	// alignment, so four permute vectors computed up front cover the array
	if ( i+3 < numVerts ) {
		vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	}
#endif
	for ( ; i+3 < numVerts; i+=4 ) {
		// assumed declarations: position pointers for the four vertices
		const float *vertPtr = verts[i].xyz.ToFloatPtr();
		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();

#ifndef DRAWVERT_PADDED
		v0 = vec_ld( 0, vertPtr );
		v1 = vec_ld( 15, vertPtr );
		v2 = vec_ld( 0, vertPtr2 );
		v3 = vec_ld( 15, vertPtr2 );
		v4 = vec_ld( 0, vertPtr3 );
		v5 = vec_ld( 15, vertPtr3 );
		v6 = vec_ld( 0, vertPtr4 );
		v7 = vec_ld( 15, vertPtr4 );

		v0 = vec_perm( v0, v1, vertPerm1 );
		v1 = vec_perm( v2, v3, vertPerm2 );
		v2 = vec_perm( v4, v5, vertPerm3 );
		v3 = vec_perm( v6, v7, vertPerm4 );
#else
		v0 = vec_ld( 0, vertPtr );
		v1 = vec_ld( 0, vertPtr2 );
		v2 = vec_ld( 0, vertPtr3 );
		v3 = vec_ld( 0, vertPtr4 );
#endif
		// build ( x, y, z, 1 ) and ( x, y, z, 0 ) for each vertex
		v0 = vec_perm( v0, oneVector, vecPermThreeOne );
		v4 = vec_perm( v0, zeroVector, vecPermThreeOne );

		v1 = vec_perm( v1, oneVector, vecPermThreeOne );
		v5 = vec_perm( v1, zeroVector, vecPermThreeOne );

		v2 = vec_perm( v2, oneVector, vecPermThreeOne );
		v6 = vec_perm( v2, zeroVector, vecPermThreeOne );

		v3 = vec_perm( v3, oneVector, vecPermThreeOne );
		v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
		// store the eight vectors to the unaligned destination
		vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &vertexCache[i*2][0] ), (vector unsigned char)(1) );
		vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
		vector float vc1 = vec_ld( 0, &vertexCache[i*2][0] );
		vector float vc2 = vec_ld( 127, &vertexCache[i*2][0] );

		// rotate into store alignment
		v0 = vec_perm( v0, v0, storePerm );
		v4 = vec_perm( v4, v4, storePerm );
		v1 = vec_perm( v1, v1, storePerm );
		v5 = vec_perm( v5, v5, storePerm );
		v2 = vec_perm( v2, v2, storePerm );
		v6 = vec_perm( v6, v6, storePerm );
		v3 = vec_perm( v3, v3, storePerm );
		v7 = vec_perm( v7, v7, storePerm );

		vec_st( vec_sel( vc1, v0, mask ), 0, &vertexCache[i*2][0] );
		vec_st( vec_sel( v0, v4, mask ), 15, &vertexCache[i*2][0] );
		vec_st( vec_sel( v4, v1, mask ), 31, &vertexCache[i*2][0] );
		vec_st( vec_sel( v1, v5, mask ), 47, &vertexCache[i*2][0] );
		vec_st( vec_sel( v5, v2, mask ), 63, &vertexCache[i*2][0] );
		vec_st( vec_sel( v2, v6, mask ), 79, &vertexCache[i*2][0] );
		vec_st( vec_sel( v6, v3, mask ), 95, &vertexCache[i*2][0] );
		vec_st( vec_sel( v3, v7, mask ), 111, &vertexCache[i*2][0] );
		vec_st( vec_sel( v7, vc2, mask ), 127, &vertexCache[i*2][0] );
	}
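	// The chain above stores nine quadwords at offsets 0, 15, 31, ... 127:
	// each vec_st hits the next aligned quadword spanned by the unaligned
	// 128-byte destination, and vec_sel merges in bytes from the neighboring
	// vectors (and the preloaded boundary values vc1/vc2) so memory outside
	// the destination range is never clobbered.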
	// cleanup
	for ( ; i < numVerts; i++ ) {
		const float *v = verts[i].xyz.ToFloatPtr();	// assumed declaration
		vertexCache[i*2+0][0] = v[0];
		vertexCache[i*2+1][0] = v[0];
		vertexCache[i*2+0][1] = v[1];
		vertexCache[i*2+1][1] = v[1];
		vertexCache[i*2+0][2] = v[2];
		vertexCache[i*2+1][2] = v[2];
		vertexCache[i*2+0][3] = 1.0f;
		vertexCache[i*2+1][3] = 0.0f;
	}
	return numVerts * 2;
}
#endif /* VERTEXCACHE_ALIGNED */
#ifdef ENABLE_SOUND_ROUTINES

#ifdef SOUND_DEST_ALIGNED
	// this routine assumes dest is 16-byte aligned
	assert( IS_16BYTE_ALIGNED( dest[0] ) );

	vector signed short vs0, vs1;
	register vector signed int vi0, vi1;
	register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;

	// permute vectors for duplicating stereo pairs
	register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
	register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);

	// permute vectors for duplicating mono samples
	register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
	register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);

	int i = 0;	// assumed initialization
	assert( numSamples >= 12 );
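	// The rate cases below duplicate each source sample 4x for 11025 Hz input
	// and 2x for 22050 Hz input, and pass 44100 Hz input straight through, so
	// the output always plays back at 44.1 kHz.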
	if ( kHz == 11025 ) {
		if ( numChannels == 1 ) {
			// calculate the permute vector and do the first load
			vector signed short vsOld = vec_ld( 0, &src[i] );
			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );

			for ( ; i+7 < numSamples; i+= 8 ) {
				// load src
				vs1 = vec_ld( 15, &src[i] );
				vs0 = vec_perm( vsOld, vs1, permVec );
				vsOld = vs1;	// assumed: slide the window for the next pass

				// unpack shorts into ints, convert to floats
				vi0 = vec_unpackh( vs0 );
				vi1 = vec_unpackl( vs0 );
				v0 = vec_ctf( vi0, 0 );
				v1 = vec_ctf( vi1, 0 );

				// duplicate each sample 4x
				v2 = vec_splat( v0, 0 );
				v3 = vec_splat( v0, 1 );
				v4 = vec_splat( v0, 2 );
				v5 = vec_splat( v0, 3 );
				v6 = vec_splat( v1, 0 );
				v7 = vec_splat( v1, 1 );
				v8 = vec_splat( v1, 2 );
				v9 = vec_splat( v1, 3 );

				// store results
				ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
			}
			// cleanup
			for (; i < numSamples; i++ ) {
				dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
			}
		} else {
			// two channels
			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
			vector signed short vsOld = vec_ld( 0, &src[0] );

			for ( ; i+7 < numSamples; i += 8 ) {
				// load src
				vs1 = vec_ld( 15, &src[i] );
				vs0 = vec_perm( vsOld, vs1, permVec );
				vsOld = vs1;	// assumed: slide the window for the next pass

				// unpack shorts into ints, convert to floats
				vi0 = vec_unpackh( vs0 );
				vi1 = vec_unpackl( vs0 );
				v0 = vec_ctf( vi0, 0 );
				v1 = vec_ctf( vi1, 0 );

				// duplicate each stereo pair 4x; the mirrored registers are
				// assumed copies, since eight vectors are stored below
				v2 = vec_perm( v0, v0, vecFirstHalf );
				v3 = v2;
				v4 = vec_perm( v0, v0, vecSecondHalf );
				v5 = v4;
				v6 = vec_perm( v1, v1, vecFirstHalf );
				v7 = v6;
				v8 = vec_perm( v1, v1, vecSecondHalf );
				v9 = v8;

				// store results
				ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
			}
			// cleanup
			for ( ; i < numSamples; i += 2 ) {
				dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
				dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
			}
		}
	} else if ( kHz == 22050 ) {
		if ( numChannels == 1 ) {
			// calculate the permute vector and do the first load
			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
			vector signed short vsOld = vec_ld( 0, &src[0] );

			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// load src; offset 15 (not 0) fetches the straddling quadword
				vs1 = vec_ld( 15, &src[i] );
				vs0 = vec_perm( vsOld, vs1, permVec );
				vsOld = vs1;	// assumed: slide the window for the next pass

				// unpack shorts into ints, convert to floats
				vi0 = vec_unpackh( vs0 );
				vi1 = vec_unpackl( vs0 );
				v0 = vec_ctf( vi0, 0 );
				v1 = vec_ctf( vi1, 0 );

				// duplicate each sample 2x
				v2 = vec_perm( v0, v0, vecBottom );
				v3 = vec_perm( v0, v0, vecTop );
				v4 = vec_perm( v1, v1, vecBottom );
				v5 = vec_perm( v1, v1, vecTop );

				// store results
				ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
			}
			// cleanup
			for ( ; i < numSamples; i++ ) {
				dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
			}
		} else {
			// two channels
			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
			vector signed short vsOld = vec_ld( 0, &src[0] );

			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// load src
				vs1 = vec_ld( 15, &src[i] );
				vs0 = vec_perm( vsOld, vs1, permVec );
				vsOld = vs1;	// assumed: slide the window for the next pass

				// unpack shorts into ints, convert to floats
				vi0 = vec_unpackh( vs0 );
				vi1 = vec_unpackl( vs0 );
				v0 = vec_ctf( vi0, 0 );
				v1 = vec_ctf( vi1, 0 );

				// duplicate each stereo pair 2x
				v2 = vec_perm( v0, v0, vecFirstHalf );
				v3 = vec_perm( v0, v0, vecSecondHalf );
				v4 = vec_perm( v1, v1, vecFirstHalf );
				v5 = vec_perm( v1, v1, vecSecondHalf );

				// store results
				ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
			}
			// cleanup
			for ( ; i < numSamples; i += 2 ) {
				dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
				dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
			}
		}
	} else if ( kHz == 44100 ) {
		// calculate the permute vector and do the first load
		vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
		vector signed short vsOld = vec_ld( 0, &src[0] );

		for ( i = 0; i+7 < numSamples; i += 8 ) {
			vs1 = vec_ld( 15, &src[i] );
			vs0 = vec_perm( vsOld, vs1, permVec );
			vsOld = vs1;	// assumed: slide the window for the next pass

			// unpack shorts into ints, convert to floats
			vi0 = vec_unpackh( vs0 );
			vi1 = vec_unpackl( vs0 );
			v0 = vec_ctf( vi0, 0 );
			v1 = vec_ctf( vi1, 0 );

			// store results
			ALIGNED_STORE2( &dest[i], v0, v1 );
		}
		// cleanup
		for ( ; i < numSamples; i++ ) {
			dest[i] = (float) src[i];
		}
	}
}
#else
	// unaligned-destination variant of the same upsampler
	vector signed short vs0, vs1;
	register vector signed int vi0, vi1;
	register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;

	// permute vectors for duplicating stereo pairs
	register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
	register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);

	// permute vectors for duplicating mono samples
	register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
	register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);

	int i = 0;	// assumed initialization

	// calculate the store rotation and boundary data for the unaligned dest
	vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
	// preload the original boundary quadword so its bytes are preserved
	vector float vecDest = vec_ld( 0, &dest[0] );
	vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
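	// For the unaligned destination, vec_lvsr( 15, dest ) minus one gives the
	// store rotation, and the mask selects which bytes of each aligned
	// quadword belong to the output; vecDest carries the boundary quadword
	// from one store chain into the next.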
	if ( kHz == 11025 ) {
		if ( numChannels == 1 ) {
			// calculate the permute vector and do the first load
			vector signed short vsOld = vec_ld( 0, &src[i] );
			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );

			for ( ; i+7 < numSamples; i+= 8 ) {
				// load src
				vs1 = vec_ld( 15, &src[i] );
				vs0 = vec_perm( vsOld, vs1, permVec );
				vsOld = vs1;	// assumed: slide the window for the next pass
				// boundary quadword past the 128 bytes we are about to write
				vector float vecDestEnd = vec_ld( 127, &dest[i*4] );

				// unpack shorts into ints, convert to floats
				vi0 = vec_unpackh( vs0 );
				vi1 = vec_unpackl( vs0 );
				v0 = vec_ctf( vi0, 0 );
				v1 = vec_ctf( vi1, 0 );

				// duplicate each sample 4x
				v2 = vec_splat( v0, 0 );
				v3 = vec_splat( v0, 1 );
				v4 = vec_splat( v0, 2 );
				v5 = vec_splat( v0, 3 );
				v6 = vec_splat( v1, 0 );
				v7 = vec_splat( v1, 1 );
				v8 = vec_splat( v1, 2 );
				v9 = vec_splat( v1, 3 );

				// rotate into store alignment
				v2 = vec_perm( v2, v2, storePerm );
				v3 = vec_perm( v3, v3, storePerm );
				v4 = vec_perm( v4, v4, storePerm );
				v5 = vec_perm( v5, v5, storePerm );
				v6 = vec_perm( v6, v6, storePerm );
				v7 = vec_perm( v7, v7, storePerm );
				v8 = vec_perm( v8, v8, storePerm );
				v9 = vec_perm( v9, v9, storePerm );

				// store the chain, merging boundary bytes with vec_sel
				vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
				vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
				vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
				vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
				vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
				vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
				vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
				vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
				vecDest = vec_sel( v9, vecDestEnd, mask );
				vec_st( vecDest, 127, &dest[i*4] );
			}
			// cleanup
			for (; i < numSamples; i++ ) {
				dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
			}
		} else {
			// two channels
			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
			vector signed short vsOld = vec_ld( 0, &src[0] );

			for ( ; i+7 < numSamples; i += 8 ) {
				// load src
				vs1 = vec_ld( 15, &src[i] );
				vs0 = vec_perm( vsOld, vs1, permVec );
				vsOld = vs1;	// assumed: slide the window for the next pass
				// boundary quadword past the 128 bytes we are about to write
				vector float vecDestEnd = vec_ld( 127, &dest[i*4] );

				// unpack shorts into ints, convert to floats
				vi0 = vec_unpackh( vs0 );
				vi1 = vec_unpackl( vs0 );
				v0 = vec_ctf( vi0, 0 );
				v1 = vec_ctf( vi1, 0 );

				// duplicate each stereo pair 4x; the mirrored registers are
				// assumed copies, since eight vectors are stored below
				v2 = vec_perm( v0, v0, vecFirstHalf );
				v3 = v2;
				v4 = vec_perm( v0, v0, vecSecondHalf );
				v5 = v4;
				v6 = vec_perm( v1, v1, vecFirstHalf );
				v7 = v6;
				v8 = vec_perm( v1, v1, vecSecondHalf );
				v9 = v8;

				// rotate into store alignment
				v2 = vec_perm( v2, v2, storePerm );
				v3 = vec_perm( v3, v3, storePerm );
				v4 = vec_perm( v4, v4, storePerm );
				v5 = vec_perm( v5, v5, storePerm );
				v6 = vec_perm( v6, v6, storePerm );
				v7 = vec_perm( v7, v7, storePerm );
				v8 = vec_perm( v8, v8, storePerm );
				v9 = vec_perm( v9, v9, storePerm );

				// store the chain, merging boundary bytes with vec_sel
				vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
				vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
				vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
				vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
				vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
				vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
				vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
				vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
				vecDest = vec_sel( v9, vecDestEnd, mask );
				vec_st( vecDest, 127, &dest[i*4] );
			}
			// cleanup
			for ( ; i < numSamples; i += 2 ) {
				dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
				dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
			}
		}
	} else if ( kHz == 22050 ) {
		if ( numChannels == 1 ) {
			// calculate the permute vector and do the first load
			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
			vector signed short vsOld = vec_ld( 0, &src[0] );

			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// load src; offset 15 (not 0) fetches the straddling quadword
				vs1 = vec_ld( 15, &src[i] );
				vs0 = vec_perm( vsOld, vs1, permVec );
				vsOld = vs1;	// assumed: slide the window for the next pass
				// boundary quadword past the 64 bytes we are about to write
				vector float vecDestEnd = vec_ld( 63, &dest[i*2] );

				// unpack shorts into ints, convert to floats
				vi0 = vec_unpackh( vs0 );
				vi1 = vec_unpackl( vs0 );
				v0 = vec_ctf( vi0, 0 );
				v1 = vec_ctf( vi1, 0 );

				// duplicate each sample 2x
				v2 = vec_perm( v0, v0, vecBottom );
				v3 = vec_perm( v0, v0, vecTop );
				v4 = vec_perm( v1, v1, vecBottom );
				v5 = vec_perm( v1, v1, vecTop );

				// rotate into store alignment
				v2 = vec_perm( v2, v2, storePerm );
				v3 = vec_perm( v3, v3, storePerm );
				v4 = vec_perm( v4, v4, storePerm );
				v5 = vec_perm( v5, v5, storePerm );

				// store the chain, merging boundary bytes with vec_sel
				vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
				vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
				vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
				vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
				vecDest = vec_sel( v5, vecDestEnd, mask );
				vec_st( vecDest, 63, &dest[i*2] );
			}
			// cleanup
			for ( ; i < numSamples; i++ ) {
				dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
			}
		} else {
			// two channels
			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
			vector signed short vsOld = vec_ld( 0, &src[0] );

			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// load src
				vs1 = vec_ld( 15, &src[i] );
				vs0 = vec_perm( vsOld, vs1, permVec );
				vsOld = vs1;	// assumed: slide the window for the next pass
				// boundary quadword past the 64 bytes we are about to write
				vector float vecDestEnd = vec_ld( 63, &dest[i*2] );

				// unpack shorts into ints, convert to floats
				vi0 = vec_unpackh( vs0 );
				vi1 = vec_unpackl( vs0 );
				v0 = vec_ctf( vi0, 0 );
				v1 = vec_ctf( vi1, 0 );

				// duplicate each stereo pair 2x
				v2 = vec_perm( v0, v0, vecFirstHalf );
				v3 = vec_perm( v0, v0, vecSecondHalf );
				v4 = vec_perm( v1, v1, vecFirstHalf );
				v5 = vec_perm( v1, v1, vecSecondHalf );

				// rotate into store alignment
				v2 = vec_perm( v2, v2, storePerm );
				v3 = vec_perm( v3, v3, storePerm );
				v4 = vec_perm( v4, v4, storePerm );
				v5 = vec_perm( v5, v5, storePerm );

				// store the chain, merging boundary bytes with vec_sel
				vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
				vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
				vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
				vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
				vecDest = vec_sel( v5, vecDestEnd, mask );
				vec_st( vecDest, 63, &dest[i*2] );
			}
			// cleanup
			for ( ; i < numSamples; i += 2 ) {
				dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
				dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
			}
		}
	} else if ( kHz == 44100 ) {
		// calculate the permute vector and do the first load
		vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
		vector signed short vsOld = vec_ld( 0, &src[0] );

		for ( i = 0; i+7 < numSamples; i += 8 ) {
			vs1 = vec_ld( 15, &src[i] );
			vs0 = vec_perm( vsOld, vs1, permVec );
			vsOld = vs1;	// assumed: slide the window for the next pass
			// boundary quadword past the 32 bytes we are about to write
			vector float vecDestEnd = vec_ld( 31, &dest[i] );

			// unpack shorts into ints, convert to floats
			vi0 = vec_unpackh( vs0 );
			vi1 = vec_unpackl( vs0 );
			v0 = vec_ctf( vi0, 0 );
			v1 = vec_ctf( vi1, 0 );

			// rotate into store alignment
			v0 = vec_perm( v0, v0, storePerm );
			v1 = vec_perm( v1, v1, storePerm );

			// store the chain, merging boundary bytes with vec_sel
			vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
			vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
			vecDest = vec_sel( v1, vecDestEnd, mask );
			vec_st( vecDest, 31, &dest[i] );
		}
		// cleanup
		for ( ; i < numSamples; i++ ) {
			dest[i] = (float) src[i];
		}
	}
}
#endif /* SOUND_DEST_ALIGNED */
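/*
	The OGG path below mirrors the PCM upsampler, but its source is the
	decoder's float output (one array per channel), so there is no
	short-to-float conversion; samples are instead scaled and duplicated.
*/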
#ifdef SOUND_DEST_ALIGNED
	// this routine assumes dest is 16-byte aligned
	assert( IS_16BYTE_ALIGNED( dest[0] ) );

	register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
	register vector float constVec, zeroVector;
	register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
	vector unsigned char vecPerm1;
	vector unsigned char vecPerm2;

	// permute vectors for duplicating mono samples and interleaving two channels
	vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
	vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
	vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
	vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
	vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
	vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);

	int i;	// assumed declaration

	constVec = (vector float)(32768.0f);
	zeroVector = (vector float)(0.0);
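	// The decoder's floats are assumed to be normalized to roughly [-1, 1],
	// so multiplying by 32768.0f rescales them to the 16-bit-style range the
	// rest of the mixer expects.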
	if ( kHz == 11025 ) {
		if ( numChannels == 1 ) {
			// calculate the permute vector and do the first load
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			v10 = vec_ld( 0, &ogg[0][0] );

			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// load and align eight samples, sliding the window as we go
				v8 = v10;	// assumed: carry the last quadword forward
				v9 = vec_ld( 15, &ogg[0][i] );
				v10 = vec_ld( 31, &ogg[0][i] );
				v0 = vec_perm( v8, v9, vecPerm1 );
				v1 = vec_perm( v9, v10, vecPerm1 );

				// duplicate each sample 4x
				oggVec1 = vec_splat( v0, 0 );
				oggVec2 = vec_splat( v0, 1 );
				oggVec3 = vec_splat( v0, 2 );
				oggVec4 = vec_splat( v0, 3 );
				oggVec5 = vec_splat( v1, 0 );
				oggVec6 = vec_splat( v1, 1 );
				oggVec7 = vec_splat( v1, 2 );
				oggVec8 = vec_splat( v1, 3 );

				// scale to the 16-bit range
				v0 = vec_madd( oggVec1, constVec, zeroVector );
				v1 = vec_madd( oggVec2, constVec, zeroVector );
				v2 = vec_madd( oggVec3, constVec, zeroVector );
				v3 = vec_madd( oggVec4, constVec, zeroVector );
				v4 = vec_madd( oggVec5, constVec, zeroVector );
				v5 = vec_madd( oggVec6, constVec, zeroVector );
				v6 = vec_madd( oggVec7, constVec, zeroVector );
				v7 = vec_madd( oggVec8, constVec, zeroVector );

				// store results
				ALIGNED_STORE8( &dest[i*4], v0, v1, v2, v3, v4, v5, v6, v7 );
			}
			// cleanup
			for ( ; i < numSamples; i++ ) {
				dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
			}
		} else {
			// two channels
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
			v7 = vec_ld( 0, &ogg[1][0] );
			v9 = vec_ld( 0, &ogg[0][0] );

			for ( i = 0; i+3 < numSamples >> 1; i+=4 ) {
				// load and align four left-channel samples
				v8 = v9;	// assumed: slide window
				v9 = vec_ld( 15, &ogg[0][i] );
				v0 = vec_perm( v8, v9, vecPerm1 );

				// duplicate each 4x
				oggVec1 = vec_splat( v0, 0 );
				oggVec2 = vec_splat( v0, 1 );
				oggVec3 = vec_splat( v0, 2 );
				oggVec4 = vec_splat( v0, 3 );

				// load and align four right-channel samples
				v6 = v7;	// assumed: slide window
				v7 = vec_ld( 15, &ogg[1][i] );
				v1 = vec_perm( v6, v7, vecPerm2 );

				oggVec5 = vec_splat( v1, 0 );
				oggVec6 = vec_splat( v1, 1 );
				oggVec7 = vec_splat( v1, 2 );
				oggVec8 = vec_splat( v1, 3 );

				// scale to the 16-bit range
				oggVec1 = vec_madd( oggVec1, constVec, zeroVector );
				oggVec2 = vec_madd( oggVec2, constVec, zeroVector );
				oggVec3 = vec_madd( oggVec3, constVec, zeroVector );
				oggVec4 = vec_madd( oggVec4, constVec, zeroVector );
				oggVec5 = vec_madd( oggVec5, constVec, zeroVector );
				oggVec6 = vec_madd( oggVec6, constVec, zeroVector );
				oggVec7 = vec_madd( oggVec7, constVec, zeroVector );
				oggVec8 = vec_madd( oggVec8, constVec, zeroVector );

				// interleave the channels
				v0 = vec_mergeh( oggVec1, oggVec5 );
				v1 = vec_mergel( oggVec1, oggVec5 );
				v2 = vec_mergeh( oggVec2, oggVec6 );
				v3 = vec_mergel( oggVec2, oggVec6 );
				v4 = vec_mergeh( oggVec3, oggVec7 );
				v5 = vec_mergel( oggVec3, oggVec7 );
				v6 = vec_mergeh( oggVec4, oggVec8 );
				v10 = vec_mergel( oggVec4, oggVec8 );

				// store results
				ALIGNED_STORE8( &dest[i*8], v0, v1, v2, v3, v4, v5, v6, v10 );
			}
			// cleanup
			for ( ; i < numSamples >> 1; i++ ) {
				dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
				dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
			}
		}
	} else if ( kHz == 22050 ) {
		if ( numChannels == 1 ) {
			// calculate the permute vector and do the first load
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			v10 = vec_ld( 0, &ogg[0][0] );

			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// load and align eight samples
				v8 = v10;	// assumed: slide window
				v9 = vec_ld( 15, &ogg[0][i] );
				v10 = vec_ld( 31, &ogg[0][i] );
				v0 = vec_perm( v8, v9, vecPerm1 );
				v1 = vec_perm( v9, v10, vecPerm1 );

				// scale to the 16-bit range
				v0 = vec_madd( v0, constVec, zeroVector );
				v1 = vec_madd( v1, constVec, zeroVector );

				// duplicate each sample 2x
				v5 = vec_perm( v0, v0, vecOneTwo );
				v6 = vec_perm( v0, v0, vecThreeFour );
				v7 = vec_perm( v1, v1, vecOneTwo );
				v8 = vec_perm( v1, v1, vecThreeFour );

				// store results
				ALIGNED_STORE4( &dest[i*2], v5, v6, v7, v8 );
			}
			// cleanup
			for ( ; i < numSamples; i++ ) {
				dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
			}
		} else {
			// two channels
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
			v7 = vec_ld( 0, &ogg[1][0] );
			v9 = vec_ld( 0, &ogg[0][0] );

			for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
				// load and align four samples from each channel
				v8 = v9;	// assumed: slide window
				v9 = vec_ld( 15, &ogg[0][i] );
				v0 = vec_perm( v8, v9, vecPerm1 );

				v6 = v7;	// assumed: slide window
				v7 = vec_ld( 15, &ogg[1][i] );
				v1 = vec_perm( v6, v7, vecPerm2 );

				// scale to the 16-bit range
				v0 = vec_madd( v0, constVec, zeroVector );
				v1 = vec_madd( v1, constVec, zeroVector );

				// interleave and duplicate each stereo pair
				v2 = vec_perm( v0, v1, vecFirst );
				v3 = vec_perm( v0, v1, vecSecond );
				v4 = vec_perm( v0, v1, vecThird );
				v5 = vec_perm( v0, v1, vecFourth );

				// store results
				ALIGNED_STORE4( &dest[i*4], v2, v3, v4, v5 );
			}
			// cleanup
			for ( ; i < numSamples >> 1; i++ ) {
				dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
				dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
			}
		}
	} else if ( kHz == 44100 ) {
		if ( numChannels == 1 ) {
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			v9 = vec_ld( 0, &ogg[0][0] );

			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// load and align eight samples
				v8 = v9;	// assumed: slide window
				v7 = vec_ld( 15, &ogg[0][i] );
				v6 = v7;	// assumed copy, matching the perm below
				v9 = vec_ld( 31, &ogg[0][i] );

				v0 = vec_perm( v8, v7, vecPerm1 );
				v1 = vec_perm( v6, v9, vecPerm1 );

				// scale to the 16-bit range
				v0 = vec_madd( v0, constVec, zeroVector );
				v1 = vec_madd( v1, constVec, zeroVector );

				ALIGNED_STORE2( &dest[i], v0, v1 );
			}
			// cleanup
			for ( ; i < numSamples; i++ ) {
				dest[i*1+0] = ogg[0][i] * 32768.0f;
			}
		} else {
			// two channels
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
			v7 = vec_ld( 0, &ogg[1][0] );
			v9 = vec_ld( 0, &ogg[0][0] );

			for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
				v8 = v9;	// assumed: slide window
				v9 = vec_ld( 15, &ogg[0][i] );
				v0 = vec_perm( v8, v9, vecPerm1 );

				v6 = v7;	// assumed: slide window
				v7 = vec_ld( 15, &ogg[1][i] );
				v1 = vec_perm( v6, v7, vecPerm2 );

				// scale to the 16-bit range
				v0 = vec_madd( v0, constVec, zeroVector );
				v1 = vec_madd( v1, constVec, zeroVector );

				// interleave left and right
				v2 = vec_mergeh( v0, v1 );
				v3 = vec_mergel( v0, v1 );

				ALIGNED_STORE2( &dest[i*2], v2, v3 );
			}
			// cleanup
			for ( ; i < numSamples >> 1; i++ ) {
				dest[i*2+0] = ogg[0][i] * 32768.0f;
				dest[i*2+1] = ogg[1][i] * 32768.0f;
			}
		}
	}
}
#else
	// unaligned-destination variant of the same OGG upsampler
	register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
	register vector float constVec, zeroVector;
	register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
	vector unsigned char vecPerm1;
	vector unsigned char vecPerm2;

	// permute vectors for duplicating mono samples and interleaving two channels
	vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
	vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
	vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
	vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
	vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
	vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);

	vector unsigned char storePerm;
	int i;	// assumed declaration

	constVec = (vector float)(32768.0f);
	zeroVector = (vector float)(0.0);

	// calculate the store rotation and boundary data for the unaligned dest
	storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
	vector float vecDest = vec_ld( 0, &dest[0] );
	vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
	if ( kHz == 11025 ) {
		if ( numChannels == 1 ) {
			// calculate the permute vector and do the first load
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			v10 = vec_ld( 0, &ogg[0][0] );

			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// load and align eight samples
				v8 = v10;	// assumed: slide window
				v9 = vec_ld( 15, &ogg[0][i] );
				v10 = vec_ld( 31, &ogg[0][i] );
				vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
				v0 = vec_perm( v8, v9, vecPerm1 );
				v1 = vec_perm( v9, v10, vecPerm1 );

				// duplicate each sample 4x
				oggVec1 = vec_splat( v0, 0 );
				oggVec2 = vec_splat( v0, 1 );
				oggVec3 = vec_splat( v0, 2 );
				oggVec4 = vec_splat( v0, 3 );
				oggVec5 = vec_splat( v1, 0 );
				oggVec6 = vec_splat( v1, 1 );
				oggVec7 = vec_splat( v1, 2 );
				oggVec8 = vec_splat( v1, 3 );

				// scale to the 16-bit range
				v0 = vec_madd( oggVec1, constVec, zeroVector );
				v1 = vec_madd( oggVec2, constVec, zeroVector );
				v2 = vec_madd( oggVec3, constVec, zeroVector );
				v3 = vec_madd( oggVec4, constVec, zeroVector );
				v4 = vec_madd( oggVec5, constVec, zeroVector );
				v5 = vec_madd( oggVec6, constVec, zeroVector );
				v6 = vec_madd( oggVec7, constVec, zeroVector );
				v7 = vec_madd( oggVec8, constVec, zeroVector );

				// rotate into store alignment
				v0 = vec_perm( v0, v0, storePerm );
				v1 = vec_perm( v1, v1, storePerm );
				v2 = vec_perm( v2, v2, storePerm );
				v3 = vec_perm( v3, v3, storePerm );
				v4 = vec_perm( v4, v4, storePerm );
				v5 = vec_perm( v5, v5, storePerm );
				v6 = vec_perm( v6, v6, storePerm );
				v7 = vec_perm( v7, v7, storePerm );

				// store the chain, merging boundary bytes with vec_sel
				vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*4] );
				vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*4] );
				vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*4] );
				vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*4] );
				vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*4] );
				vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*4] );
				vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*4] );
				vec_st( vec_sel( v6, v7, mask ), 111, &dest[i*4] );
				vecDest = vec_sel( v7, vecDestEnd, mask );
				vec_st( vecDest, 127, &dest[i*4] );
			}
			// cleanup
			for ( ; i < numSamples; i++ ) {
				dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
			}
		} else {
			// two channels
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
			v7 = vec_ld( 0, &ogg[1][0] );
			v9 = vec_ld( 0, &ogg[0][0] );

			for ( i = 0; i+3 < numSamples >> 1; i+=4 ) {
				// load and align four left-channel samples
				v8 = v9;	// assumed: slide window
				v9 = vec_ld( 15, &ogg[0][i] );
				vector float vecDestEnd = vec_ld( 127, &dest[i*8] );
				v0 = vec_perm( v8, v9, vecPerm1 );

				// duplicate each 4x
				oggVec1 = vec_splat( v0, 0 );
				oggVec2 = vec_splat( v0, 1 );
				oggVec3 = vec_splat( v0, 2 );
				oggVec4 = vec_splat( v0, 3 );

				// load and align four right-channel samples
				v6 = v7;	// assumed: slide window
				v7 = vec_ld( 15, &ogg[1][i] );
				v1 = vec_perm( v6, v7, vecPerm2 );

				oggVec5 = vec_splat( v1, 0 );
				oggVec6 = vec_splat( v1, 1 );
				oggVec7 = vec_splat( v1, 2 );
				oggVec8 = vec_splat( v1, 3 );

				// scale to the 16-bit range
				oggVec1 = vec_madd( oggVec1, constVec, zeroVector );
				oggVec2 = vec_madd( oggVec2, constVec, zeroVector );
				oggVec3 = vec_madd( oggVec3, constVec, zeroVector );
				oggVec4 = vec_madd( oggVec4, constVec, zeroVector );
				oggVec5 = vec_madd( oggVec5, constVec, zeroVector );
				oggVec6 = vec_madd( oggVec6, constVec, zeroVector );
				oggVec7 = vec_madd( oggVec7, constVec, zeroVector );
				oggVec8 = vec_madd( oggVec8, constVec, zeroVector );

				// interleave the channels
				v0 = vec_mergeh( oggVec1, oggVec5 );
				v1 = vec_mergel( oggVec1, oggVec5 );
				v2 = vec_mergeh( oggVec2, oggVec6 );
				v3 = vec_mergel( oggVec2, oggVec6 );
				v4 = vec_mergeh( oggVec3, oggVec7 );
				v5 = vec_mergel( oggVec3, oggVec7 );
				v6 = vec_mergeh( oggVec4, oggVec8 );
				v10 = vec_mergel( oggVec4, oggVec8 );

				// rotate into store alignment
				v0 = vec_perm( v0, v0, storePerm );
				v1 = vec_perm( v1, v1, storePerm );
				v2 = vec_perm( v2, v2, storePerm );
				v3 = vec_perm( v3, v3, storePerm );
				v4 = vec_perm( v4, v4, storePerm );
				v5 = vec_perm( v5, v5, storePerm );
				v6 = vec_perm( v6, v6, storePerm );
				v10 = vec_perm( v10, v10, storePerm );

				// store the chain, merging boundary bytes with vec_sel
				vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*8] );
				vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*8] );
				vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*8] );
				vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*8] );
				vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*8] );
				vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*8] );
				vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*8] );
				vec_st( vec_sel( v6, v10, mask ), 111, &dest[i*8] );
				vecDest = vec_sel( v10, vecDestEnd, mask );
				vec_st( vecDest, 127, &dest[i*8] );
			}
			// cleanup
			for ( ; i < numSamples >> 1; i++ ) {
				dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
				dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
			}
		}
	} else if ( kHz == 22050 ) {
		if ( numChannels == 1 ) {
			// calculate the permute vector and do the first load
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			v10 = vec_ld( 0, &ogg[0][0] );

			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// load and align eight samples
				v8 = v10;	// assumed: slide window
				v9 = vec_ld( 15, &ogg[0][i] );
				v10 = vec_ld( 31, &ogg[0][i] );
				vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
				v0 = vec_perm( v8, v9, vecPerm1 );
				v1 = vec_perm( v9, v10, vecPerm1 );

				// scale to the 16-bit range
				v0 = vec_madd( v0, constVec, zeroVector );
				v1 = vec_madd( v1, constVec, zeroVector );

				// duplicate each sample 2x
				v5 = vec_perm( v0, v0, vecOneTwo );
				v6 = vec_perm( v0, v0, vecThreeFour );
				v7 = vec_perm( v1, v1, vecOneTwo );
				v8 = vec_perm( v1, v1, vecThreeFour );

				// rotate into store alignment
				v5 = vec_perm( v5, v5, storePerm );
				v6 = vec_perm( v6, v6, storePerm );
				v7 = vec_perm( v7, v7, storePerm );
				v8 = vec_perm( v8, v8, storePerm );

				// store the chain, merging boundary bytes with vec_sel
				vec_st( vec_sel( vecDest, v5, mask ), 0, &dest[i*2] );
				vec_st( vec_sel( v5, v6, mask ), 15, &dest[i*2] );
				vec_st( vec_sel( v6, v7, mask ), 31, &dest[i*2] );
				vec_st( vec_sel( v7, v8, mask ), 47, &dest[i*2] );
				vecDest = vec_sel( v8, vecDestEnd, mask );
				vec_st( vecDest, 63, &dest[i*2] );
			}
			// cleanup
			for ( ; i < numSamples; i++ ) {
				dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
			}
		} else {
			// two channels
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
			v7 = vec_ld( 0, &ogg[1][0] );
			v9 = vec_ld( 0, &ogg[0][0] );

			for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
				// load and align four samples from each channel
				v8 = v9;	// assumed: slide window
				v9 = vec_ld( 15, &ogg[0][i] );
				vector float vecDestEnd = vec_ld( 63, &dest[i*4] );
				v0 = vec_perm( v8, v9, vecPerm1 );

				v6 = v7;	// assumed: slide window
				v7 = vec_ld( 15, &ogg[1][i] );
				v1 = vec_perm( v6, v7, vecPerm2 );

				// scale to the 16-bit range
				v0 = vec_madd( v0, constVec, zeroVector );
				v1 = vec_madd( v1, constVec, zeroVector );

				// interleave and duplicate each stereo pair
				v2 = vec_perm( v0, v1, vecFirst );
				v3 = vec_perm( v0, v1, vecSecond );
				v4 = vec_perm( v0, v1, vecThird );
				v5 = vec_perm( v0, v1, vecFourth );

				// rotate into store alignment
				v2 = vec_perm( v2, v2, storePerm );
				v3 = vec_perm( v3, v3, storePerm );
				v4 = vec_perm( v4, v4, storePerm );
				v5 = vec_perm( v5, v5, storePerm );

				// store the chain, merging boundary bytes with vec_sel
				vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
				vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
				vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
				vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
				vecDest = vec_sel( v5, vecDestEnd, mask );
				vec_st( vecDest, 63, &dest[i*4] );
			}
			// cleanup
			for ( ; i < numSamples >> 1; i++ ) {
				dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
				dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
			}
		}
	} else if ( kHz == 44100 ) {
		if ( numChannels == 1 ) {
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			v9 = vec_ld( 0, &ogg[0][0] );

			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// load and align eight samples
				v8 = v9;	// assumed: slide window
				v7 = vec_ld( 15, &ogg[0][i] );
				v6 = v7;	// assumed copy, matching the perm below
				v9 = vec_ld( 31, &ogg[0][i] );
				vector float vecDestEnd = vec_ld( 31, &dest[i] );

				v0 = vec_perm( v8, v7, vecPerm1 );
				v1 = vec_perm( v6, v9, vecPerm1 );

				// scale to the 16-bit range
				v0 = vec_madd( v0, constVec, zeroVector );
				v1 = vec_madd( v1, constVec, zeroVector );

				// rotate into store alignment
				v0 = vec_perm( v0, v0, storePerm );
				v1 = vec_perm( v1, v1, storePerm );

				// store the chain, merging boundary bytes with vec_sel
				vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
				vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
				vecDest = vec_sel( v1, vecDestEnd, mask );
				vec_st( vecDest, 31, &dest[i] );
			}
			// cleanup
			for ( ; i < numSamples; i++ ) {
				dest[i*1+0] = ogg[0][i] * 32768.0f;
			}
		} else {
			// two channels
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
			v7 = vec_ld( 0, &ogg[1][0] );
			v9 = vec_ld( 0, &ogg[0][0] );

			for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
				v8 = v9;	// assumed: slide window
				v9 = vec_ld( 15, &ogg[0][i] );
				v0 = vec_perm( v8, v9, vecPerm1 );

				v6 = v7;	// assumed: slide window
				v7 = vec_ld( 15, &ogg[1][i] );
				v1 = vec_perm( v6, v7, vecPerm2 );

				// scale to the 16-bit range
				v0 = vec_madd( v0, constVec, zeroVector );
				v1 = vec_madd( v1, constVec, zeroVector );

				// interleave left and right
				v2 = vec_mergeh( v0, v1 );
				v3 = vec_mergel( v0, v1 );

				// store results through the unaligned-store macro
				UNALIGNED_STORE2( &dest[i*2], v2, v3 );
			}
			// cleanup
			for ( ; i < numSamples >> 1; i++ ) {
				dest[i*2+0] = ogg[0][i] * 32768.0f;
				dest[i*2+1] = ogg[1][i] * 32768.0f;
			}
		}
	}
}
#endif /* SOUND_DEST_ALIGNED */
#ifdef SOUND_DEST_ALIGNED
	// this routine assumes mixBuffer is 16-byte aligned
	assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );

	int i;
	float inc[2];
	float spkr[4];	// assumed declarations

	register vector float vecInc;
	register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
	register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
	register vector float vecSamplesLd1, vecSamplesLd2;
	register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;

	// permute vectors that duplicate one mono sample for the left and right channels
	register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
	register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
	register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23);
	register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31);

	vector float fourVec = (vector float)(4.0);
	vector float zeroVec = (vector float)(0.0);

	// assumed: per-sample ramp of the two speaker gains across the mix buffer
	inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	spkr[0] = lastV[0];
	spkr[1] = lastV[1];
	spkr[2] = lastV[0] + inc[0];
	spkr[3] = lastV[1] + inc[1];

	// assumed: each speaker vector covers two samples, so the increment doubles
	inc[0] *= 2;
	inc[1] *= 2;
	// load the speaker values and increments into vectors
	vector float v0 = loadSplatUnalignedScalar( &inc[0] );
	vector float v1 = loadSplatUnalignedScalar( &inc[1] );
	vecInc = vec_mergeh( v0, v1 );

	vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
	vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
	vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
	vector float v5 = loadSplatUnalignedScalar( &spkr[3] );

	// vecSpeaker1 = ( spkr[0], spkr[1], spkr[2], spkr[3] )
	v0 = vec_mergeh( v2, v4 );
	v1 = vec_mergeh( v3, v5 );
	vecSpeaker1 = vec_mergeh( v0, v1 );

	vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
	vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
	vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
	vecInc = vec_madd( vecInc, fourVec, zeroVec );

	// unaligned-load setup for the mono samples
	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
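	// vecSpeaker1..4 hold the interpolated left/right gains for consecutive
	// pairs of samples; adding vecInc (scaled by four above) after each pass
	// ramps the gains smoothly from lastV toward currentV across the buffer.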
	for ( i = 0; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {	// assumed loop bounds: 8 mono samples per pass
		// load eight samples with the sliding window
		vecSamplesLd1 = vecSamplesLast;
		vecSamplesLd2 = vec_ld( 15, &samples[i] );
		vecSamplesLast = vec_ld( 31, &samples[i] );

		vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
		vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );

		// load the mix buffer
		vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
		vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
		vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
		vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );

		// duplicate each mono sample into a left/right pair
		vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
		vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
		vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
		vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );

		// mixBuffer += sample * speaker gain
		vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
		vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );

		// store results
		ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );

		// advance the speaker gains
		vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
		vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
		vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
		vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
	}
}
#else
	// unaligned mixBuffer variant
	int i;
	float inc[2];
	float spkr[4];	// assumed declarations

	register vector float vecInc;
	register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
	register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
	register vector float vecSamplesLd1, vecSamplesLd2;
	register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;

	// permute vectors that duplicate one mono sample for the left and right channels
	register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
	register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
	register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23);
	register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31);

	vector float fourVec = (vector float)(4.0);
	vector float zeroVec = (vector float)(0.0);

	// assumed: per-sample ramp of the two speaker gains across the mix buffer
	inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	spkr[0] = lastV[0];
	spkr[1] = lastV[1];
	spkr[2] = lastV[0] + inc[0];
	spkr[3] = lastV[1] + inc[1];

	// assumed: each speaker vector covers two samples, so the increment doubles
	inc[0] *= 2;
	inc[1] *= 2;

	// load the speaker values and increments into vectors
	vector float v0 = loadSplatUnalignedScalar( &inc[0] );
	vector float v1 = loadSplatUnalignedScalar( &inc[1] );
	vecInc = vec_mergeh( v0, v1 );

	vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
	vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
	vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
	vector float v5 = loadSplatUnalignedScalar( &spkr[3] );

	// vecSpeaker1 = ( spkr[0], spkr[1], spkr[2], spkr[3] )
	v0 = vec_mergeh( v2, v4 );
	v1 = vec_mergeh( v3, v5 );
	vecSpeaker1 = vec_mergeh( v0, v1 );

	vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
	vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
	vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
	vecInc = vec_madd( vecInc, fourVec, zeroVec );

	// calculate permute vectors and initial loads for the unaligned source and dest
	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
	vector float vecDest = vec_ld( 0, &mixBuffer[0] );
	for ( i = 0; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {	// assumed loop bounds: 8 mono samples per pass
		// load eight samples with the sliding window
		vecSamplesLd1 = vecSamplesLast;
		vecSamplesLd2 = vec_ld( 15, &samples[i] );
		vecSamplesLast = vec_ld( 31, &samples[i] );

		vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
		vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );

		// load and realign the mix buffer
		vecMixBuffer1 = vecDest;
		vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
		vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
		vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
		vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );

		vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
		vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
		vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
		vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );

		// duplicate each mono sample into a left/right pair
		vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
		vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
		vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
		vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );

		// mixBuffer += sample * speaker gain
		vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
		vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );

		// store results through the unaligned-store macro
		UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );

		// advance the speaker gains
		vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
		vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
		vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
		vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
	}
}
#endif /* SOUND_DEST_ALIGNED */
#ifdef SOUND_DEST_ALIGNED
	// this routine assumes mixBuffer is 16-byte aligned
	assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );

	int i, k;
	float inc[2];
	float spkr[4];	// assumed declarations

	register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
	register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
	register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
	register vector float vecInc;
	vector float fourVec = (vector float)(4.0);
	vector float zeroVec = (vector float)(0.0);

	// assumed: per-sample ramp of the two speaker gains across the mix buffer
	inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	spkr[0] = lastV[0];
	spkr[1] = lastV[1];
	spkr[2] = lastV[0] + inc[0];
	spkr[3] = lastV[1] + inc[1];

	for ( k = 0; k < 2; k++ ) {
		inc[k] *= 2;	// assumed body: each speaker vector covers two samples
	}

	// load the speaker values and increments into vectors
	vector float v0 = loadSplatUnalignedScalar( &inc[0] );
	vector float v1 = loadSplatUnalignedScalar( &inc[1] );
	vecInc = vec_mergeh( v0, v1 );

	vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
	vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
	vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
	vector float v5 = loadSplatUnalignedScalar( &spkr[3] );

	// vecSpeaker1 = ( spkr[0], spkr[1], spkr[2], spkr[3] )
	v0 = vec_mergeh( v2, v4 );
	v1 = vec_mergeh( v3, v5 );
	vecSpeaker1 = vec_mergeh( v0, v1 );

	vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
	vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
	vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
	vecInc = vec_madd( vecInc, fourVec, zeroVec );

	// unaligned-load setup for the stereo samples
	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
	for ( i = 0; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {	// assumed loop bounds: 8 stereo samples per pass
		// load the mix buffer
		vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
		vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
		vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
		vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );

		// load and realign the interleaved stereo samples
		vecSamples1 = vecSamplesLast;
		vecSamples2 = vec_ld( 15, &samples[i*2] );
		vecSamples3 = vec_ld( 31, &samples[i*2] );
		vecSamples4 = vec_ld( 47, &samples[i*2] );
		vecSamplesLast = vec_ld( 63, &samples[i*2] );

		vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
		vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
		vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
		vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );

		// mixBuffer += sample * speaker gain
		vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
		vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );

		// advance the speaker gains
		vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
		vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
		vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
		vecSpeaker4 = vec_add( vecSpeaker4, vecInc );

		// store results
		ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
	}
}
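	// Unlike the mono routine above, the stereo path needs no duplication
	// permutes: the incoming samples are already interleaved left/right, so
	// they only have to be realigned before the multiply-accumulate.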
#else
	// unaligned mixBuffer variant
	int i, k;
	float inc[2];
	float spkr[4];	// assumed declarations

	register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
	register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
	register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
	register vector float vecInc;
	vector float fourVec = (vector float)(4.0);
	vector float zeroVec = (vector float)(0.0);

	// assumed: per-sample ramp of the two speaker gains across the mix buffer
	inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	spkr[0] = lastV[0];
	spkr[1] = lastV[1];
	spkr[2] = lastV[0] + inc[0];
	spkr[3] = lastV[1] + inc[1];

	for ( k = 0; k < 2; k++ ) {
		inc[k] *= 2;	// assumed body: each speaker vector covers two samples
	}

	// load the speaker values and increments into vectors
	vector float v0 = loadSplatUnalignedScalar( &inc[0] );
	vector float v1 = loadSplatUnalignedScalar( &inc[1] );
	vecInc = vec_mergeh( v0, v1 );

	vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
	vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
	vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
	vector float v5 = loadSplatUnalignedScalar( &spkr[3] );

	// vecSpeaker1 = ( spkr[0], spkr[1], spkr[2], spkr[3] )
	v0 = vec_mergeh( v2, v4 );
	v1 = vec_mergeh( v3, v5 );
	vecSpeaker1 = vec_mergeh( v0, v1 );

	vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
	vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
	vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
	vecInc = vec_madd( vecInc, fourVec, zeroVec );

	// calculate permute vectors and initial loads for the unaligned source and dest
	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
	vector float vecDest = vec_ld( 0, &mixBuffer[0] );
	for ( i = 0; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {	// assumed loop bounds: 8 stereo samples per pass
		// load and realign the mix buffer
		vecMixBuffer1 = vecDest;
		vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
		vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
		vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
		vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );

		vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
		vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
		vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
		vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );

		// load and realign the interleaved stereo samples
		vecSamples1 = vecSamplesLast;
		vecSamples2 = vec_ld( 15, &samples[i*2] );
		vecSamples3 = vec_ld( 31, &samples[i*2] );
		vecSamples4 = vec_ld( 47, &samples[i*2] );
		vecSamplesLast = vec_ld( 63, &samples[i*2] );

		vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
		vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
		vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
		vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );

		// mixBuffer += sample * speaker gain
		vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
		vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );

		// advance the speaker gains
		vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
		vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
		vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
		vecSpeaker4 = vec_add( vecSpeaker4, vecInc );

		// store results through the unaligned-store macro
		UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
	}
}
#endif /* SOUND_DEST_ALIGNED */
#ifdef SOUND_DEST_ALIGNED
	// this routine assumes mixBuffer is 16-byte aligned
	assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );

	int i, k;
	float incL[24];
	float sL[24];	// assumed declarations: gains for four samples x six speakers

	vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
	vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
	vector float vecSamplesLd;
	vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
	vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;

	// permute vectors for broadcasting samples across speaker slots
	vector unsigned char samplePerm2 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
	vector unsigned char samplePerm5 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);

	// per-sample increment for each of the six speaker gains, replicated for four samples
	incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
	// starting gains for samples 0..3
	for ( k = 0; k < 6; k++ ) {
		sL[k] = lastV[k];	// assumed loop body
	}
	for ( k = 6; k < 12; k++ ) {
		sL[k] = lastV[k-6] + incL[k];
	}
	for ( k = 12; k < 18; k++ ) {
		sL[k] = lastV[k-12] + incL[k] + incL[k];
	}
	for ( k = 18; k < 24; k++ ) {
		sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
	}

	// each vector loop iteration advances four samples
	for ( k = 0; k < 24; k++ ) {
		incL[k] *= 4;	// assumed loop body
	}
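	// sL[0..23] now holds the six speaker gains for four consecutive output
	// samples in sample-major order, matching the 24 floats of mixBuffer that
	// each pass below touches; incL advances all 24 gains at once.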
	// load the increments and starting gains, realigning from the scalar arrays
	vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
	vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );

	vecIncl1 = vec_ld( 0, &incL[0] );
	vecIncl2 = vec_ld( 15, &incL[0] );
	vecIncl3 = vec_ld( 31, &incL[0] );
	vecIncl4 = vec_ld( 47, &incL[0] );
	vecIncl5 = vec_ld( 63, &incL[0] );
	vecIncl6 = vec_ld( 79, &incL[0] );
	vecIncl7 = vec_ld( 95, &incL[0] );

	vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
	vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
	vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
	vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
	vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
	vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );

	vecSL1 = vec_ld( 0, &sL[0] );
	vecSL2 = vec_ld( 15, &sL[0] );
	vecSL3 = vec_ld( 31, &sL[0] );
	vecSL4 = vec_ld( 47, &sL[0] );
	vecSL5 = vec_ld( 63, &sL[0] );
	vecSL6 = vec_ld( 79, &sL[0] );
	vecSL7 = vec_ld( 95, &sL[0] );

	vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
	vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
	vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
	vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
	vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
	vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );

	// unaligned-load setup for the mono samples
	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
		vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
		vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
		vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
		vecMixBuffer4 = vec_ld( 0, &mixBuffer[(i*6)+12] );
		vecMixBuffer5 = vec_ld( 0, &mixBuffer[(i*6)+16] );
		vecMixBuffer6 = vec_ld( 0, &mixBuffer[(i*6)+20] );

		vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
		vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
		vecSamplesLast = vecSamplesLd2;
		vecSamples1 = vec_splat( vecSamplesLd, 0 );
		vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
		vecSamples3 = vec_splat( vecSamplesLd, 1 );
		vecSamples4 = vec_splat( vecSamplesLd, 2 );
		vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
		vecSamples6 = vec_splat( vecSamplesLd, 3 );

		vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
		vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
		vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
		vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );

		ALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
		vecSL1 = vec_add( vecSL1, vecIncl1 );
		vecSL2 = vec_add( vecSL2, vecIncl2 );
		vecSL3 = vec_add( vecSL3, vecIncl3 );
		vecSL4 = vec_add( vecSL4, vecIncl4 );
		vecSL5 = vec_add( vecSL5, vecIncl5 );
		vecSL6 = vec_add( vecSL6, vecIncl6 );
	}
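
// for intuition, one pass of the loop above computes the scalar equivalent of:
//
//	for ( int f = 0; f < 4; f++ ) {
//		for ( int c = 0; c < 6; c++ ) {
//			mixBuffer[(i + f) * 6 + c] += samples[i + f] * sL[f * 6 + c];
//		}
//	}
//
// with sL advancing by four per-sample increments each pass

// same six-speaker mono mix for a mix buffer that is not 16-byte aligned: the
// destination is realigned on load and written back with UNALIGNED_STORE6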
	vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
	vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
	vector float vecSamplesLd;
	vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
	vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;

	register vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7 );
	register vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15 );
	incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
	for ( k = 0; k < 6; k++ ) {
		sL[k] = lastV[k];
	}
	for ( k = 6; k < 12; k++ ) {
		sL[k] = lastV[k-6] + incL[k];
	}
	for ( k = 12; k < 18; k++ ) {
		sL[k] = lastV[k-12] + incL[k] + incL[k];
	}
	for ( k = 18; k < 24; k++ ) {
		sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
	}
	// again, four samples are mixed per pass
	for ( k = 0; k < 24; k++ ) {
		incL[k] *= 4;
	}
	vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
	vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );

	vecIncl1 = vec_ld( 0, &incL[0] );
	vecIncl2 = vec_ld( 15, &incL[0] );
	vecIncl3 = vec_ld( 31, &incL[0] );
	vecIncl4 = vec_ld( 47, &incL[0] );
	vecIncl5 = vec_ld( 63, &incL[0] );
	vecIncl6 = vec_ld( 79, &incL[0] );
	vecIncl7 = vec_ld( 95, &incL[0] );

	vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
	vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
	vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
	vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
	vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
	vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );

	vecSL1 = vec_ld( 0, &sL[0] );
	vecSL2 = vec_ld( 15, &sL[0] );
	vecSL3 = vec_ld( 31, &sL[0] );
	vecSL4 = vec_ld( 47, &sL[0] );
	vecSL5 = vec_ld( 63, &sL[0] );
	vecSL6 = vec_ld( 79, &sL[0] );
	vecSL7 = vec_ld( 95, &sL[0] );

	vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
	vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
	vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
	vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
	vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
	vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );
	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
	vector float vecDest = vec_ld( 0, &mixBuffer[0] );

	for ( i = 0; i + 3 < numSamples; i += 4 ) {
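
		// the destination is misaligned: pull seven raw vectors spanning
		// mixBuffer[i*6 .. i*6+23] and realign them with mixBufferPerm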
		vecMixBuffer1 = vecDest;
		vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
		vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
		vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*6] );
		vecMixBuffer5 = vec_ld( 63, &mixBuffer[i*6] );
		vecMixBuffer6 = vec_ld( 79, &mixBuffer[i*6] );
		vector float vecDestEnd = vec_ld( 95, &mixBuffer[i*6] );

		vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
		vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
		vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
		vecMixBuffer4 = vec_perm( vecMixBuffer4, vecMixBuffer5, mixBufferPerm );
		vecMixBuffer5 = vec_perm( vecMixBuffer5, vecMixBuffer6, mixBufferPerm );
		vecMixBuffer6 = vec_perm( vecMixBuffer6, vecDestEnd, mixBufferPerm );

		vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
		vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
		vecSamplesLast = vecSamplesLd2;
		vecSamples1 = vec_splat( vecSamplesLd, 0 );
		vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
		vecSamples3 = vec_splat( vecSamplesLd, 1 );
		vecSamples4 = vec_splat( vecSamplesLd, 2 );
		vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
		vecSamples6 = vec_splat( vecSamplesLd, 3 );

		vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
		vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
		vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
		vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );

		UNALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
		vecSL1 = vec_add( vecSL1, vecIncl1 );
		vecSL2 = vec_add( vecSL2, vecIncl2 );
		vecSL3 = vec_add( vecSL3, vecIncl3 );
		vecSL4 = vec_add( vecSL4, vecIncl4 );
		vecSL5 = vec_add( vecSL5, vecIncl5 );
		vecSL6 = vec_add( vecSL6, vecIncl6 );
	}

#ifdef SOUND_DEST_ALIGNED
	assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
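
	// six-speaker stereo mix, 16-byte aligned output: each pass fades two stereo
	// frames ( four floats ) into 12 interleaved floats ( 2 frames x 6 channels );
	// per the permutes below, the left sample feeds channels 0, 2, 3 and 4 and
	// the right sample feeds channels 1 and 5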
	vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
	vector float vecSL1, vecSL2, vecSL3, vecSL4;
	vector float vecSamplesLd;
	vector float vecSamples1, vecSamples2, vecSamples3;
	vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;

	vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3 );
	vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15 );
	sL[6] = lastV[0] + incL[0];
	sL[7] = lastV[1] + incL[1];
	sL[8] = lastV[2] + incL[2];
	sL[9] = lastV[3] + incL[3];
	sL[10] = lastV[4] + incL[4];
	sL[11] = lastV[5] + incL[5];
	vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
	vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );

	vecIncl1 = vec_ld( 0, &incL[0] );
	vecIncl2 = vec_ld( 15, &incL[0] );
	vecIncl3 = vec_ld( 31, &incL[0] );
	vecIncl4 = vec_ld( 47, &incL[0] );

	vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
	vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
	vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );

	vecSL1 = vec_ld( 0, &sL[0] );
	vecSL2 = vec_ld( 15, &sL[0] );
	vecSL3 = vec_ld( 31, &sL[0] );
	vecSL4 = vec_ld( 47, &sL[0] );

	vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
	vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
	vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );

	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );

	for ( i = 0; i + 1 < numSamples; i += 2 ) {
		vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
		vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
		vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );

		vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
		vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
		vecSamplesLast = vecSamplesLd2;
		vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
		vecSamples2 = vecSamplesLd;
		vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );

		vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );

		ALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );
		vecSL1 = vec_add( vecSL1, vecIncl1 );
		vecSL2 = vec_add( vecSL2, vecIncl2 );
		vecSL3 = vec_add( vecSL3, vecIncl3 );
	}
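
// the same six-speaker stereo mix for a mix buffer that is not 16-byte aligned:
// destination vectors are realigned on load and stored with UNALIGNED_STORE3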
	vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
	vector float vecSL1, vecSL2, vecSL3, vecSL4;
	vector float vecSamplesLd;
	vector float vecSamples1, vecSamples2, vecSamples3;
	vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;

	vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3 );
	vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15 );
	sL[6] = lastV[0] + incL[0];
	sL[7] = lastV[1] + incL[1];
	sL[8] = lastV[2] + incL[2];
	sL[9] = lastV[3] + incL[3];
	sL[10] = lastV[4] + incL[4];
	sL[11] = lastV[5] + incL[5];
	vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
	vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );

	vecIncl1 = vec_ld( 0, &incL[0] );
	vecIncl2 = vec_ld( 15, &incL[0] );
	vecIncl3 = vec_ld( 31, &incL[0] );
	vecIncl4 = vec_ld( 47, &incL[0] );

	vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
	vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
	vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );

	vecSL1 = vec_ld( 0, &sL[0] );
	vecSL2 = vec_ld( 15, &sL[0] );
	vecSL3 = vec_ld( 31, &sL[0] );
	vecSL4 = vec_ld( 47, &sL[0] );

	vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
	vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
	vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );

	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
	vector float vecDest = vec_ld( 0, &mixBuffer[0] );

	for ( i = 0; i + 1 < numSamples; i += 2 ) {
		vecMixBuffer1 = vecDest;
		vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
		vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
		vector float vecDestEnd = vec_ld( 47, &mixBuffer[i*6] );

		vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
		vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
		vecMixBuffer3 = vec_perm( vecMixBuffer3, vecDestEnd, mixBufferPerm );

		vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
		vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
		vecSamplesLast = vecSamplesLd2;
		vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
		vecSamples2 = vecSamplesLd;
		vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );

		vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );

		UNALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );
		vecSL1 = vec_add( vecSL1, vecIncl1 );
		vecSL2 = vec_add( vecSL2, vecIncl2 );
		vecSL3 = vec_add( vecSL3, vecIncl3 );
	}
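
// MixedSoundToSamples: clamp the float mix buffer to the signed 16-bit range
// and convert it to short output samples, 16 at a time once the output pointer
// is 16-byte aligned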
	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
	register vector signed int vi0, vi1, vi2, vi3;
	register vector signed short vs0, vs1;
	register vector float minVec, maxVec, constVec;
	// convert one sample at a time until samples[i] reaches a 16-byte boundary
	for ( ; NOT_16BYTE_ALIGNED( samples[i] ) && ( i < numSamples ); i++ ) {
		samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
	}
	constVec = (vector float)(65536.0f);

	minVec = (vector float)(-32768.0f);
	maxVec = (vector float)(32767.0f);

	vector float vecOld = vec_ld( 0, &mixBuffer[i] );
	vector unsigned char permVec = vec_add( vec_lvsl( -1, &mixBuffer[i] ), (vector unsigned char)(1) );
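
	// process 16 floats ( 64 bytes, four vectors ) per pass; mixBuffer may be
	// misaligned, so the four data vectors are stitched from five raw loads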
	for ( ; i+15 < numSamples; i += 16 ) {
		// reuse the last raw vector from the previous pass, then read the rest
		// of the block; the final load carries over into the next pass
		v0 = vecOld;
		v1 = vec_ld( 15, &mixBuffer[i] );
		v2 = vec_ld( 31, &mixBuffer[i] );
		v3 = vec_ld( 47, &mixBuffer[i] );
		vecOld = vec_ld( 63, &mixBuffer[i] );

		v0 = vec_perm( v0, v1, permVec );
		v1 = vec_perm( v1, v2, permVec );
		v2 = vec_perm( v2, v3, permVec );
		v3 = vec_perm( v3, vecOld, permVec );
		v4 = vec_max( v0, minVec );
		v5 = vec_max( v1, minVec );
		v6 = vec_max( v2, minVec );
		v7 = vec_max( v3, minVec );

		v4 = vec_min( v4, maxVec );
		v5 = vec_min( v5, maxVec );
		v6 = vec_min( v6, maxVec );
		v7 = vec_min( v7, maxVec );
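
		// vec_cts( v, 0 ) converts float to signed int32 with no extra scaling;
		// the values were already clamped to [-32768, 32767] above, so vec_pack
		// can safely narrow each pair of int vectors down to eight shorts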
		vi0 = vec_cts( v4, 0 );
		vi1 = vec_cts( v5, 0 );
		vi2 = vec_cts( v6, 0 );
		vi3 = vec_cts( v7, 0 );

		vs0 = vec_pack( vi0, vi1 );
		vs1 = vec_pack( vi2, vi3 );

		ALIGNED_STORE2( &samples[i], vs0, vs1 );
	}
	// finish off any leftover samples
	for ( ; i < numSamples ; i++ ) {
		samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
	}