29 #include "../precompiled.h"
55 SIMDProcessor =
generic;
71 newProcessor =
generic;
80 }
else if ( ( cpuid & CPUID_MMX ) && ( cpuid & CPUID_SSE ) && ( cpuid & CPUID_SSE2 ) ) {
82 }
else if ( ( cpuid & CPUID_MMX ) && ( cpuid & CPUID_SSE ) ) {
84 }
else if ( ( cpuid & CPUID_MMX ) && ( cpuid &
CPUID_3DNOW ) ) {
86 }
else if ( ( cpuid & CPUID_MMX ) ) {
91 processor->
cpuid = cpuid;
97 if ( newProcessor != SIMDProcessor ) {
98 SIMDProcessor = newProcessor;
119 if ( processor !=
generic ) {
125 SIMDProcessor =
NULL;
// Shared test-harness constants.
// COUNT sizes the aligned scratch arrays and bounds the fill/compare loops in the tests below.
135 #define COUNT 1024 // data count
136 #define NUMTESTS 2048 // number of tests
// Fixed seed value; the trailing comment keeps a clock-tick based alternative that is not in use.
138 #define RANDOM_SEED 1013904223L //((int)idLib::sys->GetClockTicks())
146 #define TIME_TYPE int
148 #pragma warning(disable : 4731) // frame pointer register 'ebx' modified by inline assembly code
152 #define StartRecordTime( start ) \
153 __asm mov saved_ebx, ebx \
157 __asm mov start, eax \
161 #define StopRecordTime( end ) \
166 __asm mov ebx, saved_ebx \
174 #include <sys/time.h>
175 #include <sys/resource.h>
176 #include <mach/mach_time.h>
178 double ticksPerNanosecond;
180 #define TIME_TYPE uint64_t
182 #ifdef __MWERKS__ //time_in_millisec is missing
210 asm void GetTB(U64 *
in)
232 double TBToDoubleNano( U64 startTime, U64 stopTime,
double ticksPerNanosecond );
235 asm void GetTB( U64 * );
240 double TBToDoubleNano( U64 startTime, U64 stopTime,
double ticksPerNanosecond ) {
241 #define K_2POWER32 4294967296.0
242 #define TICKS_PER_NANOSECOND 0.025
247 diffTime.hi = stopTime.hi - startTime.hi;
248 diffTime.lo = stopTime.lo - startTime.lo;
251 nanoTime = (double)(diffTime.hi)*((double)K_2POWER32) + (double)(diffTime.lo);
252 nanoTime = nanoTime/ticksPerNanosecond;
257 #define K_2POWER32 4294967296.0
258 #define TICKS_PER_NANOSECOND 0.025
261 double nanoTime, milliTime;
266 nanoTime = (double)(the_time.hi)*((double)K_2POWER32) + (double)(the_time.lo);
267 nanoTime = nanoTime/ticksPerNanosecond;
270 milliTime = nanoTime * 1000000.0;
272 printf(
"ticks per nanosec -- %lf\n", ticksPerNanosecond );
273 printf(
"nanoTime is %lf -- milliTime is %lf -- as int is %i\n", nanoTime, milliTime, (
int)milliTime );
275 return (
int)milliTime;
// Timing hooks built on time_in_millisec(): each macro assigns the current reading
// to the variable passed as its argument.
// NOTE(review): the replacement text already ends in ';', so an invocation written as
// `StartRecordTime( t );` expands with an extra empty statement.
278 #define StartRecordTime( start ) \
279 start = time_in_millisec();
// Counterpart of StartRecordTime: stores the reading taken after the measured call.
281 #define StopRecordTime( end ) \
282 end = time_in_millisec();
// Timing hooks built on mach_absolute_time(): each macro assigns the current reading
// to the variable passed as its argument.
// NOTE(review): this is a second definition of the same macro names, so it presumably sits
// in the alternative branch of a preprocessor conditional that is not visible here - confirm.
286 #define StartRecordTime( start ) \
287 start = mach_absolute_time();
// Counterpart of StartRecordTime: stores the reading taken after the measured call.
289 #define StopRecordTime( end ) \
290 end = mach_absolute_time();
294 #define TIME_TYPE int
296 #define StartRecordTime( start ) \
299 #define StopRecordTime( end ) \
304 #define GetBest( start, end, best ) \
305 if ( !best || end - start < best ) { \
306 best = end - start; \
315 void PrintClocks(
const char *
string,
int dataCount,
int clocks,
int otherClocks = 0 ) {
323 if ( otherClocks && clocks ) {
325 int p = (
int) ( (
float) ( otherClocks - clocks ) * 100.0
f / (
float) otherClocks );
344 GetBest( start, end, bestClocks );
357 ALIGN16(
float fdst0[
COUNT] );
358 ALIGN16(
float fdst1[COUNT] );
359 ALIGN16(
float fsrc0[COUNT] );
360 ALIGN16(
float fsrc1[COUNT] );
365 for ( i = 0; i <
COUNT; i++ ) {
372 bestClocksGeneric = 0;
375 p_generic->
Add( fdst0, 4.0
f, fsrc1, COUNT );
377 GetBest( start, end, bestClocksGeneric );
379 PrintClocks(
"generic->Add( float + float[] )", COUNT, bestClocksGeneric );
384 p_simd->
Add( fdst1, 4.0
f, fsrc1, COUNT );
386 GetBest( start, end, bestClocksSIMD );
389 for ( i = 0; i <
COUNT; i++ ) {
395 PrintClocks(
va(
" simd->Add( float + float[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
397 bestClocksGeneric = 0;
400 p_generic->
Add( fdst0, fsrc0, fsrc1, COUNT );
402 GetBest( start, end, bestClocksGeneric );
404 PrintClocks(
"generic->Add( float[] + float[] )", COUNT, bestClocksGeneric );
409 p_simd->
Add( fdst1, fsrc0, fsrc1, COUNT );
411 GetBest( start, end, bestClocksSIMD );
414 for ( i = 0; i <
COUNT; i++ ) {
420 PrintClocks(
va(
" simd->Add( float[] + float[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
431 ALIGN16(
float fdst0[
COUNT] );
432 ALIGN16(
float fdst1[COUNT] );
433 ALIGN16(
float fsrc0[COUNT] );
434 ALIGN16(
float fsrc1[COUNT] );
439 for ( i = 0; i <
COUNT; i++ ) {
446 bestClocksGeneric = 0;
449 p_generic->
Sub( fdst0, 4.0
f, fsrc1, COUNT );
451 GetBest( start, end, bestClocksGeneric );
453 PrintClocks(
"generic->Sub( float + float[] )", COUNT, bestClocksGeneric );
458 p_simd->
Sub( fdst1, 4.0
f, fsrc1, COUNT );
460 GetBest( start, end, bestClocksSIMD );
463 for ( i = 0; i <
COUNT; i++ ) {
469 PrintClocks(
va(
" simd->Sub( float + float[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
471 bestClocksGeneric = 0;
474 p_generic->
Sub( fdst0, fsrc0, fsrc1, COUNT );
476 GetBest( start, end, bestClocksGeneric );
478 PrintClocks(
"generic->Sub( float[] + float[] )", COUNT, bestClocksGeneric );
483 p_simd->
Sub( fdst1, fsrc0, fsrc1, COUNT );
485 GetBest( start, end, bestClocksSIMD );
488 for ( i = 0; i <
COUNT; i++ ) {
494 PrintClocks(
va(
" simd->Sub( float[] + float[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
505 ALIGN16(
float fdst0[
COUNT] );
506 ALIGN16(
float fdst1[COUNT] );
507 ALIGN16(
float fsrc0[COUNT] );
508 ALIGN16(
float fsrc1[COUNT] );
513 for ( i = 0; i <
COUNT; i++ ) {
520 bestClocksGeneric = 0;
523 p_generic->
Mul( fdst0, 4.0
f, fsrc1, COUNT );
525 GetBest( start, end, bestClocksGeneric );
527 PrintClocks(
"generic->Mul( float * float[] )", COUNT, bestClocksGeneric );
532 p_simd->
Mul( fdst1, 4.0
f, fsrc1, COUNT );
534 GetBest( start, end, bestClocksSIMD );
537 for ( i = 0; i <
COUNT; i++ ) {
543 PrintClocks(
va(
" simd->Mul( float * float[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
546 bestClocksGeneric = 0;
549 p_generic->
Mul( fdst0, fsrc0, fsrc1, COUNT );
551 GetBest( start, end, bestClocksGeneric );
553 PrintClocks(
"generic->Mul( float[] * float[] )", COUNT, bestClocksGeneric );
558 p_simd->
Mul( fdst1, fsrc0, fsrc1, COUNT );
560 GetBest( start, end, bestClocksSIMD );
563 for ( i = 0; i <
COUNT; i++ ) {
569 PrintClocks(
va(
" simd->Mul( float[] * float[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
580 ALIGN16(
float fdst0[
COUNT] );
581 ALIGN16(
float fdst1[COUNT] );
582 ALIGN16(
float fsrc0[COUNT] );
583 ALIGN16(
float fsrc1[COUNT] );
588 for ( i = 0; i <
COUNT; i++ ) {
598 bestClocksGeneric = 0;
601 p_generic->
Div( fdst0, 4.0
f, fsrc1, COUNT );
603 GetBest( start, end, bestClocksGeneric );
605 PrintClocks(
"generic->Div( float * float[] )", COUNT, bestClocksGeneric );
610 p_simd->
Div( fdst1, 4.0
f, fsrc1, COUNT );
612 GetBest( start, end, bestClocksSIMD );
615 for ( i = 0; i <
COUNT; i++ ) {
621 PrintClocks(
va(
" simd->Div( float * float[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
624 bestClocksGeneric = 0;
627 p_generic->
Div( fdst0, fsrc0, fsrc1, COUNT );
629 GetBest( start, end, bestClocksGeneric );
631 PrintClocks(
"generic->Div( float[] * float[] )", COUNT, bestClocksGeneric );
636 p_simd->
Div( fdst1, fsrc0, fsrc1, COUNT );
638 GetBest( start, end, bestClocksSIMD );
641 for ( i = 0; i <
COUNT; i++ ) {
647 PrintClocks(
va(
" simd->Div( float[] * float[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
658 ALIGN16(
float fdst0[
COUNT] );
659 ALIGN16(
float fdst1[COUNT] );
660 ALIGN16(
float fsrc0[COUNT] );
665 for ( i = 0; i <
COUNT; i++ ) {
671 for ( j = 0; j < 50 && j <
COUNT; j++ ) {
673 bestClocksGeneric = 0;
675 for (
int k = 0; k <
COUNT; k++ ) {
679 p_generic->
MulAdd( fdst0, 0.123
f, fsrc0, j );
681 GetBest( start, end, bestClocksGeneric );
683 PrintClocks(
va(
"generic->MulAdd( float * float[%2d] )", j ), 1, bestClocksGeneric );
687 for (
int k = 0; k <
COUNT; k++ ) {
691 p_simd->
MulAdd( fdst1, 0.123
f, fsrc0, j );
693 GetBest( start, end, bestClocksSIMD );
696 for ( i = 0; i <
COUNT; i++ ) {
702 PrintClocks(
va(
" simd->MulAdd( float * float[%2d] ) %s", j, result ), 1, bestClocksSIMD, bestClocksGeneric );
714 ALIGN16(
float fdst0[
COUNT] );
715 ALIGN16(
float fdst1[COUNT] );
716 ALIGN16(
float fsrc0[COUNT] );
721 for ( i = 0; i <
COUNT; i++ ) {
727 for ( j = 0; j < 50 && j <
COUNT; j++ ) {
729 bestClocksGeneric = 0;
731 for (
int k = 0; k <
COUNT; k++ ) {
735 p_generic->
MulSub( fdst0, 0.123
f, fsrc0, j );
737 GetBest( start, end, bestClocksGeneric );
739 PrintClocks(
va(
"generic->MulSub( float * float[%2d] )", j ), 1, bestClocksGeneric );
743 for (
int k = 0; k <
COUNT; k++ ) {
747 p_simd->
MulSub( fdst1, 0.123
f, fsrc0, j );
749 GetBest( start, end, bestClocksSIMD );
752 for ( i = 0; i <
COUNT; i++ ) {
758 PrintClocks(
va(
" simd->MulSub( float * float[%2d] ) %s", j, result ), 1, bestClocksSIMD, bestClocksGeneric );
770 ALIGN16(
float fdst0[
COUNT] );
771 ALIGN16(
float fdst1[COUNT] );
772 ALIGN16(
float fsrc0[COUNT] );
773 ALIGN16(
float fsrc1[COUNT] );
774 ALIGN16(
idVec3 v3src0[COUNT] );
775 ALIGN16(
idVec3 v3src1[COUNT] );
776 ALIGN16(
idVec3 v3constant ) ( 1.0f, 2.0f, 3.0f );
777 ALIGN16(
idPlane v4src0[COUNT] );
778 ALIGN16(
idPlane v4constant ) (1.0f, 2.0f, 3.0f, 4.0f);
784 for ( i = 0; i <
COUNT; i++ ) {
793 v4src0[
i] = v3src0[
i];
795 drawVerts[
i].xyz = v3src0[
i];
801 bestClocksGeneric = 0;
804 p_generic->
Dot( fdst0, v3constant, v3src0, COUNT );
806 GetBest( start, end, bestClocksGeneric );
808 PrintClocks(
"generic->Dot( idVec3 * idVec3[] )", COUNT, bestClocksGeneric );
813 p_simd->
Dot( fdst1, v3constant, v3src0, COUNT );
815 GetBest( start, end, bestClocksSIMD );
818 for ( i = 0; i <
COUNT; i++ ) {
824 PrintClocks(
va(
" simd->Dot( idVec3 * idVec3[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
827 bestClocksGeneric = 0;
830 p_generic->
Dot( fdst0, v3constant, v4src0, COUNT );
832 GetBest( start, end, bestClocksGeneric );
834 PrintClocks(
"generic->Dot( idVec3 * idPlane[] )", COUNT, bestClocksGeneric );
839 p_simd->
Dot( fdst1, v3constant, v4src0, COUNT );
841 GetBest( start, end, bestClocksSIMD );
844 for ( i = 0; i <
COUNT; i++ ) {
850 PrintClocks(
va(
" simd->Dot( idVec3 * idPlane[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
853 bestClocksGeneric = 0;
856 p_generic->
Dot( fdst0, v3constant, drawVerts, COUNT );
858 GetBest( start, end, bestClocksGeneric );
860 PrintClocks(
"generic->Dot( idVec3 * idDrawVert[] )", COUNT, bestClocksGeneric );
865 p_simd->
Dot( fdst1, v3constant, drawVerts, COUNT );
867 GetBest( start, end, bestClocksSIMD );
870 for ( i = 0; i <
COUNT; i++ ) {
876 PrintClocks(
va(
" simd->Dot( idVec3 * idDrawVert[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
879 bestClocksGeneric = 0;
882 p_generic->
Dot( fdst0, v4constant, v3src0, COUNT );
884 GetBest( start, end, bestClocksGeneric );
886 PrintClocks(
"generic->Dot( idPlane * idVec3[] )", COUNT, bestClocksGeneric );
891 p_simd->
Dot( fdst1, v4constant, v3src0, COUNT );
893 GetBest( start, end, bestClocksSIMD );
896 for ( i = 0; i <
COUNT; i++ ) {
902 PrintClocks(
va(
" simd->Dot( idPlane * idVec3[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
905 bestClocksGeneric = 0;
908 p_generic->
Dot( fdst0, v4constant, v4src0, COUNT );
910 GetBest( start, end, bestClocksGeneric );
912 PrintClocks(
"generic->Dot( idPlane * idPlane[] )", COUNT, bestClocksGeneric );
917 p_simd->
Dot( fdst1, v4constant, v4src0, COUNT );
919 GetBest( start, end, bestClocksSIMD );
922 for ( i = 0; i <
COUNT; i++ ) {
928 PrintClocks(
va(
" simd->Dot( idPlane * idPlane[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
931 bestClocksGeneric = 0;
934 p_generic->
Dot( fdst0, v4constant, drawVerts, COUNT );
936 GetBest( start, end, bestClocksGeneric );
938 PrintClocks(
"generic->Dot( idPlane * idDrawVert[] )", COUNT, bestClocksGeneric );
943 p_simd->
Dot( fdst1, v4constant, drawVerts, COUNT );
945 GetBest( start, end, bestClocksSIMD );
948 for ( i = 0; i <
COUNT; i++ ) {
954 PrintClocks(
va(
" simd->Dot( idPlane * idDrawVert[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
957 bestClocksGeneric = 0;
960 p_generic->
Dot( fdst0, v3src0, v3src1, COUNT );
962 GetBest( start, end, bestClocksGeneric );
964 PrintClocks(
"generic->Dot( idVec3[] * idVec3[] )", COUNT, bestClocksGeneric );
969 p_simd->
Dot( fdst1, v3src0, v3src1, COUNT );
971 GetBest( start, end, bestClocksSIMD );
974 for ( i = 0; i <
COUNT; i++ ) {
980 PrintClocks(
va(
" simd->Dot( idVec3[] * idVec3[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
985 float dot1 = 0.0f, dot2 = 0.0f;
986 for ( j = 0; j < 50 && j <
COUNT; j++ ) {
988 bestClocksGeneric = 0;
991 p_generic->
Dot( dot1, fsrc0, fsrc1, j );
993 GetBest( start, end, bestClocksGeneric );
995 PrintClocks(
va(
"generic->Dot( float[%2d] * float[%2d] )", j, j ), 1, bestClocksGeneric );
1000 p_simd->
Dot( dot2, fsrc0, fsrc1, j );
1002 GetBest( start, end, bestClocksSIMD );
1005 PrintClocks(
va(
" simd->Dot( float[%2d] * float[%2d] ) %s", j, j, result ), 1, bestClocksSIMD, bestClocksGeneric );
1017 ALIGN16(
float fsrc0[
COUNT] );
1018 ALIGN16(
byte bytedst[COUNT] );
1019 ALIGN16(
byte bytedst2[COUNT] );
1024 for ( i = 0; i <
COUNT; i++ ) {
1030 bestClocksGeneric = 0;
1033 p_generic->
CmpGT( bytedst, fsrc0, 0.0
f, COUNT );
1035 GetBest( start, end, bestClocksGeneric );
1037 PrintClocks(
"generic->CmpGT( float[] >= float )", COUNT, bestClocksGeneric );
1042 p_simd->
CmpGT( bytedst2, fsrc0, 0.0
f, COUNT );
1044 GetBest( start, end, bestClocksSIMD );
1047 for ( i = 0; i <
COUNT; i++ ) {
1048 if ( bytedst[i] != bytedst2[i] ) {
1053 PrintClocks(
va(
" simd->CmpGT( float[] >= float ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
1055 bestClocksGeneric = 0;
1057 memset( bytedst, 0, COUNT );
1059 p_generic->
CmpGT( bytedst, 2, fsrc0, 0.0
f, COUNT );
1061 GetBest( start, end, bestClocksGeneric );
1063 PrintClocks(
"generic->CmpGT( 2, float[] >= float )", COUNT, bestClocksGeneric );
1067 memset( bytedst2, 0, COUNT );
1069 p_simd->
CmpGT( bytedst2, 2, fsrc0, 0.0
f, COUNT );
1071 GetBest( start, end, bestClocksSIMD );
1074 for ( i = 0; i <
COUNT; i++ ) {
1075 if ( bytedst[i] != bytedst2[i] ) {
1080 PrintClocks(
va(
" simd->CmpGT( 2, float[] >= float ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
1084 bestClocksGeneric = 0;
1087 p_generic->
CmpGE( bytedst, fsrc0, 0.0
f, COUNT );
1089 GetBest( start, end, bestClocksGeneric );
1091 PrintClocks(
"generic->CmpGE( float[] >= float )", COUNT, bestClocksGeneric );
1096 p_simd->
CmpGE( bytedst2, fsrc0, 0.0
f, COUNT );
1098 GetBest( start, end, bestClocksSIMD );
1101 for ( i = 0; i <
COUNT; i++ ) {
1102 if ( bytedst[i] != bytedst2[i] ) {
1107 PrintClocks(
va(
" simd->CmpGE( float[] >= float ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
1109 bestClocksGeneric = 0;
1111 memset( bytedst, 0, COUNT );
1113 p_generic->
CmpGE( bytedst, 2, fsrc0, 0.0
f, COUNT );
1115 GetBest( start, end, bestClocksGeneric );
1117 PrintClocks(
"generic->CmpGE( 2, float[] >= float )", COUNT, bestClocksGeneric );
1121 memset( bytedst2, 0, COUNT );
1123 p_simd->
CmpGE( bytedst2, 2, fsrc0, 0.0
f, COUNT );
1125 GetBest( start, end, bestClocksSIMD );
1128 for ( i = 0; i <
COUNT; i++ ) {
1129 if ( bytedst[i] != bytedst2[i] ) {
1134 PrintClocks(
va(
" simd->CmpGE( 2, float[] >= float ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
1138 bestClocksGeneric = 0;
1141 p_generic->
CmpLT( bytedst, fsrc0, 0.0
f, COUNT );
1143 GetBest( start, end, bestClocksGeneric );
1145 PrintClocks(
"generic->CmpLT( float[] >= float )", COUNT, bestClocksGeneric );
1150 p_simd->
CmpLT( bytedst2, fsrc0, 0.0
f, COUNT );
1152 GetBest( start, end, bestClocksSIMD );
1155 for ( i = 0; i <
COUNT; i++ ) {
1156 if ( bytedst[i] != bytedst2[i] ) {
1161 PrintClocks(
va(
" simd->CmpLT( float[] >= float ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
1163 bestClocksGeneric = 0;
1165 memset( bytedst, 0, COUNT );
1167 p_generic->
CmpLT( bytedst, 2, fsrc0, 0.0
f, COUNT );
1169 GetBest( start, end, bestClocksGeneric );
1171 PrintClocks(
"generic->CmpLT( 2, float[] >= float )", COUNT, bestClocksGeneric );
1175 memset( bytedst2, 0, COUNT );
1177 p_simd->
CmpLT( bytedst2, 2, fsrc0, 0.0
f, COUNT );
1179 GetBest( start, end, bestClocksSIMD );
1182 for ( i = 0; i <
COUNT; i++ ) {
1183 if ( bytedst[i] != bytedst2[i] ) {
1188 PrintClocks(
va(
" simd->CmpLT( 2, float[] >= float ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
1192 bestClocksGeneric = 0;
1195 p_generic->
CmpLE( bytedst, fsrc0, 0.0
f, COUNT );
1197 GetBest( start, end, bestClocksGeneric );
1199 PrintClocks(
"generic->CmpLE( float[] >= float )", COUNT, bestClocksGeneric );
1204 p_simd->
CmpLE( bytedst2, fsrc0, 0.0
f, COUNT );
1206 GetBest( start, end, bestClocksSIMD );
1209 for ( i = 0; i <
COUNT; i++ ) {
1210 if ( bytedst[i] != bytedst2[i] ) {
1215 PrintClocks(
va(
" simd->CmpLE( float[] >= float ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
1217 bestClocksGeneric = 0;
1219 memset( bytedst, 0, COUNT );
1221 p_generic->
CmpLE( bytedst, 2, fsrc0, 0.0
f, COUNT );
1223 GetBest( start, end, bestClocksGeneric );
1225 PrintClocks(
"generic->CmpLE( 2, float[] >= float )", COUNT, bestClocksGeneric );
1229 memset( bytedst2, 0, COUNT );
1231 p_simd->
CmpLE( bytedst2, 2, fsrc0, 0.0
f, COUNT );
1233 GetBest( start, end, bestClocksSIMD );
1236 for ( i = 0; i <
COUNT; i++ ) {
1237 if ( bytedst[i] != bytedst2[i] ) {
1242 PrintClocks(
va(
" simd->CmpLE( 2, float[] >= float ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
1253 ALIGN16(
float fsrc0[
COUNT] );
1254 ALIGN16(
idVec2 v2src0[COUNT] );
1255 ALIGN16(
idVec3 v3src0[COUNT] );
1257 ALIGN16(
int indexes[COUNT] );
1258 float min = 0.0f,
max = 0.0f, min2 = 0.0f, max2 = 0.0f;
1259 idVec2 v2min, v2max, v2min2, v2max2;
1260 idVec3 vmin, vmax, vmin2, vmax2;
1265 for ( i = 0; i <
COUNT; i++ ) {
1272 drawVerts[
i].xyz = v3src0[
i];
1278 bestClocksGeneric = 0;
1283 p_generic->
MinMax( min,
max, fsrc0, COUNT );
1285 GetBest( start, end, bestClocksGeneric );
1287 PrintClocks(
"generic->MinMax( float[] )", COUNT, bestClocksGeneric );
1292 p_simd->
MinMax( min2, max2, fsrc0, COUNT );
1294 GetBest( start, end, bestClocksSIMD );
1298 PrintClocks(
va(
" simd->MinMax( float[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
1300 bestClocksGeneric = 0;
1303 p_generic->
MinMax( v2min, v2max, v2src0, COUNT );
1305 GetBest( start, end, bestClocksGeneric );
1307 PrintClocks(
"generic->MinMax( idVec2[] )", COUNT, bestClocksGeneric );
1312 p_simd->
MinMax( v2min2, v2max2, v2src0, COUNT );
1314 GetBest( start, end, bestClocksSIMD );
1317 result = ( v2min == v2min2 && v2max == v2max2 ) ?
"ok" :
S_COLOR_RED"X";
1318 PrintClocks(
va(
" simd->MinMax( idVec2[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
1320 bestClocksGeneric = 0;
1323 p_generic->
MinMax( vmin, vmax, v3src0, COUNT );
1325 GetBest( start, end, bestClocksGeneric );
1327 PrintClocks(
"generic->MinMax( idVec3[] )", COUNT, bestClocksGeneric );
1332 p_simd->
MinMax( vmin2, vmax2, v3src0, COUNT );
1334 GetBest( start, end, bestClocksSIMD );
1337 result = ( vmin == vmin2 && vmax == vmax2 ) ?
"ok" :
S_COLOR_RED"X";
1338 PrintClocks(
va(
" simd->MinMax( idVec3[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
1340 bestClocksGeneric = 0;
1343 p_generic->
MinMax( vmin, vmax, drawVerts, COUNT );
1345 GetBest( start, end, bestClocksGeneric );
1347 PrintClocks(
"generic->MinMax( idDrawVert[] )", COUNT, bestClocksGeneric );
1352 p_simd->
MinMax( vmin2, vmax2, drawVerts, COUNT );
1354 GetBest( start, end, bestClocksSIMD );
1357 result = ( vmin == vmin2 && vmax == vmax2 ) ?
"ok" :
S_COLOR_RED"X";
1358 PrintClocks(
va(
" simd->MinMax( idDrawVert[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
1360 bestClocksGeneric = 0;
1363 p_generic->
MinMax( vmin, vmax, drawVerts, indexes, COUNT );
1365 GetBest( start, end, bestClocksGeneric );
1367 PrintClocks(
"generic->MinMax( idDrawVert[], indexes[] )", COUNT, bestClocksGeneric );
1372 p_simd->
MinMax( vmin2, vmax2, drawVerts, indexes, COUNT );
1374 GetBest( start, end, bestClocksSIMD );
1377 result = ( vmin == vmin2 && vmax == vmax2 ) ?
"ok" :
S_COLOR_RED"X";
1378 PrintClocks(
va(
" simd->MinMax( idDrawVert[], indexes[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
1389 ALIGN16(
float fdst0[
COUNT] );
1390 ALIGN16(
float fdst1[COUNT] );
1391 ALIGN16(
float fsrc0[COUNT] );
1396 for ( i = 0; i <
COUNT; i++ ) {
1402 bestClocksGeneric = 0;
1405 p_generic->
Clamp( fdst0, fsrc0, -1.0
f, 1.0
f, COUNT );
1407 GetBest( start, end, bestClocksGeneric );
1409 PrintClocks(
"generic->Clamp( float[] )", COUNT, bestClocksGeneric );
1414 p_simd->
Clamp( fdst1, fsrc0, -1.0
f, 1.0
f, COUNT );
1416 GetBest( start, end, bestClocksSIMD );
1419 for ( i = 0; i <
COUNT; i++ ) {
1420 if ( fdst0[i] != fdst1[i] ) {
1425 PrintClocks(
va(
" simd->Clamp( float[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
1428 bestClocksGeneric = 0;
1431 p_generic->
ClampMin( fdst0, fsrc0, -1.0
f, COUNT );
1433 GetBest( start, end, bestClocksGeneric );
1435 PrintClocks(
"generic->ClampMin( float[] )", COUNT, bestClocksGeneric );
1440 p_simd->
ClampMin( fdst1, fsrc0, -1.0
f, COUNT );
1442 GetBest( start, end, bestClocksSIMD );
1445 for ( i = 0; i <
COUNT; i++ ) {
1446 if ( fdst0[i] != fdst1[i] ) {
1451 PrintClocks(
va(
" simd->ClampMin( float[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
1454 bestClocksGeneric = 0;
1457 p_generic->
ClampMax( fdst0, fsrc0, 1.0
f, COUNT );
1459 GetBest( start, end, bestClocksGeneric );
1461 PrintClocks(
"generic->ClampMax( float[] )", COUNT, bestClocksGeneric );
1466 p_simd->
ClampMax( fdst1, fsrc0, 1.0
f, COUNT );
1468 GetBest( start, end, bestClocksSIMD );
1471 for ( i = 0; i <
COUNT; i++ ) {
1472 if ( fdst0[i] != fdst1[i] ) {
1477 PrintClocks(
va(
" simd->ClampMax( float[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
1494 for ( i = 5; i < 8192; i += 31 ) {
1495 for ( j = 0; j <
i; j++ ) {
1498 p_simd->
Memcpy( test1, test0, 8192 );
1499 for ( j = 0; j <
i; j++ ) {
1500 if ( test1[j] != test0[j] ) {
1518 for ( i = 0; i < 8192; i++ ) {
1522 for ( i = 5; i < 8192; i += 31 ) {
1523 for ( j = -1; j <= 1; j++ ) {
1524 p_simd->
Memset( test, j, i );
1525 for ( k = 0; k <
i; k++ ) {
1526 if ( test[k] != (
byte)j ) {
1536 #define MATX_SIMD_EPSILON 1e-5f
1561 for ( i = 1; i <= 6; i++ ) {
1564 bestClocksGeneric = 0;
1570 GetBest( start, end, bestClocksGeneric );
1574 PrintClocks(
va(
"generic->MatX_MultiplyVecX %dx%d*%dx1", i, i, i ), 1, bestClocksGeneric );
1582 GetBest( start, end, bestClocksSIMD );
1586 PrintClocks(
va(
" simd->MatX_MultiplyVecX %dx%d*%dx1 %s", i, i, i, result ), 1, bestClocksSIMD, bestClocksGeneric );
1591 for ( i = 1; i <= 6; i++ ) {
1594 bestClocksGeneric = 0;
1600 GetBest( start, end, bestClocksGeneric );
1604 PrintClocks(
va(
"generic->MatX_MultiplyVecX %dx6*6x1", i ), 1, bestClocksGeneric );
1612 GetBest( start, end, bestClocksSIMD );
1616 PrintClocks(
va(
" simd->MatX_MultiplyVecX %dx6*6x1 %s", i, result ), 1, bestClocksSIMD, bestClocksGeneric );
1621 for ( i = 1; i <= 6; i++ ) {
1624 bestClocksGeneric = 0;
1630 GetBest( start, end, bestClocksGeneric );
1634 PrintClocks(
va(
"generic->MatX_MultiplyVecX 6x%d*%dx1", i, i ), 1, bestClocksGeneric );
1641 GetBest( start, end, bestClocksSIMD );
1645 PrintClocks(
va(
" simd->MatX_MultiplyVecX 6x%d*%dx1 %s", i, i, result ), 1, bestClocksSIMD, bestClocksGeneric );
1672 for ( i = 1; i <= 6; i++ ) {
1675 bestClocksGeneric = 0;
1681 GetBest( start, end, bestClocksGeneric );
1685 PrintClocks(
va(
"generic->MatX_MultiplyAddVecX %dx%d*%dx1", i, i, i ), 1, bestClocksGeneric );
1693 GetBest( start, end, bestClocksSIMD );
1697 PrintClocks(
va(
" simd->MatX_MultiplyAddVecX %dx%d*%dx1 %s", i, i, i, result ), 1, bestClocksSIMD, bestClocksGeneric );
1702 for ( i = 1; i <= 6; i++ ) {
1705 bestClocksGeneric = 0;
1711 GetBest( start, end, bestClocksGeneric );
1715 PrintClocks(
va(
"generic->MatX_MultiplyAddVecX %dx6*6x1", i ), 1, bestClocksGeneric );
1723 GetBest( start, end, bestClocksSIMD );
1727 PrintClocks(
va(
" simd->MatX_MultiplyAddVecX %dx6*6x1 %s", i, result ), 1, bestClocksSIMD, bestClocksGeneric );
1732 for ( i = 1; i <= 6; i++ ) {
1735 bestClocksGeneric = 0;
1741 GetBest( start, end, bestClocksGeneric );
1745 PrintClocks(
va(
"generic->MatX_MultiplyAddVecX 6x%d*%dx1", i, i ), 1, bestClocksGeneric );
1753 GetBest( start, end, bestClocksSIMD );
1757 PrintClocks(
va(
" simd->MatX_MultiplyAddVecX 6x%d*%dx1 %s", i, i, result ), 1, bestClocksSIMD, bestClocksGeneric );
1784 for ( i = 1; i <= 6; i++ ) {
1787 bestClocksGeneric = 0;
1793 GetBest( start, end, bestClocksGeneric );
1797 PrintClocks(
va(
"generic->MatX_TransposeMulVecX %dx6*%dx1", i, i ), 1, bestClocksGeneric );
1805 GetBest( start, end, bestClocksSIMD );
1809 PrintClocks(
va(
" simd->MatX_TransposeMulVecX %dx6*%dx1 %s", i, i, result ), 1, bestClocksSIMD, bestClocksGeneric );
1814 for ( i = 1; i <= 6; i++ ) {
1817 bestClocksGeneric = 0;
1823 GetBest( start, end, bestClocksGeneric );
1827 PrintClocks(
va(
"generic->MatX_TransposeMulVecX 6x%d*6x1", i ), 1, bestClocksGeneric );
1835 GetBest( start, end, bestClocksSIMD );
1839 PrintClocks(
va(
" simd->MatX_TransposeMulVecX 6x%d*6x1 %s", i, result ), 1, bestClocksSIMD, bestClocksGeneric );
1866 for ( i = 1; i <= 6; i++ ) {
1869 bestClocksGeneric = 0;
1875 GetBest( start, end, bestClocksGeneric );
1879 PrintClocks(
va(
"generic->MatX_TransposeMulAddVecX %dx6*%dx1", i, i ), 1, bestClocksGeneric );
1887 GetBest( start, end, bestClocksSIMD );
1891 PrintClocks(
va(
" simd->MatX_TransposeMulAddVecX %dx6*%dx1 %s", i, i, result ), 1, bestClocksSIMD, bestClocksGeneric );
1896 for ( i = 1; i <= 6; i++ ) {
1899 bestClocksGeneric = 0;
1905 GetBest( start, end, bestClocksGeneric );
1909 PrintClocks(
va(
"generic->MatX_TransposeMulAddVecX 6x%d*6x1", i ), 1, bestClocksGeneric );
1917 GetBest( start, end, bestClocksSIMD );
1921 PrintClocks(
va(
" simd->MatX_TransposeMulAddVecX 6x%d*6x1 %s", i, result ), 1, bestClocksSIMD, bestClocksGeneric );
1930 #define TEST_VALUE_RANGE 10.0f
1931 #define MATX_MATX_SIMD_EPSILON 1e-4f
1942 for ( i = 1; i <= 5; i++ ) {
1947 bestClocksGeneric = 0;
1952 GetBest( start, end, bestClocksGeneric );
1956 PrintClocks(
va(
"generic->MatX_MultiplyMatX %dx%d*%dx6", i, i, i ), 1, bestClocksGeneric );
1963 GetBest( start, end, bestClocksSIMD );
1967 PrintClocks(
va(
" simd->MatX_MultiplyMatX %dx%d*%dx6 %s", i, i, i, result ), 1, bestClocksSIMD, bestClocksGeneric );
1973 for ( i = 1; i <= 5; i++ ) {
1978 bestClocksGeneric = 0;
1983 GetBest( start, end, bestClocksGeneric );
1987 PrintClocks(
va(
"generic->MatX_MultiplyMatX 6x%d*%dx6", i, i ), 1, bestClocksGeneric );
1994 GetBest( start, end, bestClocksSIMD );
1998 PrintClocks(
va(
" simd->MatX_MultiplyMatX 6x%d*%dx6 %s", i, i, result ), 1, bestClocksSIMD, bestClocksGeneric );
2004 for ( i = 1; i <= 5; i++ ) {
2009 bestClocksGeneric = 0;
2014 GetBest( start, end, bestClocksGeneric );
2018 PrintClocks(
va(
"generic->MatX_MultiplyMatX %dx6*6x%d", i, i ), 1, bestClocksGeneric );
2025 GetBest( start, end, bestClocksSIMD );
2029 PrintClocks(
va(
" simd->MatX_MultiplyMatX %dx6*6x%d %s", i, i, result ), 1, bestClocksSIMD, bestClocksGeneric );
2035 for ( i = 1; i <= 6; i++ ) {
2040 bestClocksGeneric = 0;
2045 GetBest( start, end, bestClocksGeneric );
2049 PrintClocks(
va(
"generic->MatX_MultiplyMatX 6x6*6x%d", i ), 1, bestClocksGeneric );
2056 GetBest( start, end, bestClocksSIMD );
2060 PrintClocks(
va(
" simd->MatX_MultiplyMatX 6x6*6x%d %s", i, result ), 1, bestClocksSIMD, bestClocksGeneric );
2078 for ( i = 1; i <= 5; i++ ) {
2083 bestClocksGeneric = 0;
2088 GetBest( start, end, bestClocksGeneric );
2092 PrintClocks(
va(
"generic->MatX_TransMultiplyMatX %dx6*%dx%d", i, i, i ), 1, bestClocksGeneric );
2099 GetBest( start, end, bestClocksSIMD );
2103 PrintClocks(
va(
" simd->MatX_TransMultiplyMatX %dx6*%dx%d %s", i, i, i, result ), 1, bestClocksSIMD, bestClocksGeneric );
2109 for ( i = 1; i <= 6; i++ ) {
2114 bestClocksGeneric = 0;
2119 GetBest( start, end, bestClocksGeneric );
2123 PrintClocks(
va(
"generic->MatX_TransMultiplyMatX 6x%d*6x6", i ), 1, bestClocksGeneric );
2130 GetBest( start, end, bestClocksSIMD );
2134 PrintClocks(
va(
" simd->MatX_TransMultiplyMatX 6x%d*6x6 %s", i, result ), 1, bestClocksSIMD, bestClocksGeneric );
2138 #define MATX_LTS_SIMD_EPSILON 1.0f
2139 #define MATX_LTS_SOLVE_SIZE 100
2163 bestClocksGeneric = 0;
2168 GetBest( start, end, bestClocksGeneric );
2173 PrintClocks(
va(
"generic->MatX_LowerTriangularSolve %dx%d", i, i ), 1, bestClocksGeneric );
2180 GetBest( start, end, bestClocksSIMD );
2184 PrintClocks(
va(
" simd->MatX_LowerTriangularSolve %dx%d %s", i, i, result ), 1, bestClocksSIMD, bestClocksGeneric );
2210 bestClocksGeneric = 0;
2215 GetBest( start, end, bestClocksGeneric );
2220 PrintClocks(
va(
"generic->MatX_LowerTriangularSolveT %dx%d", i, i ), 1, bestClocksGeneric );
2227 GetBest( start, end, bestClocksSIMD );
2231 PrintClocks(
va(
" simd->MatX_LowerTriangularSolveT %dx%d %s", i, i, result ), 1, bestClocksSIMD, bestClocksGeneric );
2235 #define MATX_LDLT_SIMD_EPSILON 0.1f
2236 #define MATX_LDLT_FACTOR_SOLVE_SIZE 64
2248 idVecX invDiag1, invDiag2;
2258 bestClocksGeneric = 0;
2261 invDiag1.
Zero( MATX_LDLT_FACTOR_SOLVE_SIZE );
2265 GetBest( start, end, bestClocksGeneric );
2268 PrintClocks(
va(
"generic->MatX_LDLTFactor %dx%d", i, i ), 1, bestClocksGeneric );
2273 invDiag2.
Zero( MATX_LDLT_FACTOR_SOLVE_SIZE );
2277 GetBest( start, end, bestClocksSIMD );
2281 PrintClocks(
va(
" simd->MatX_LDLTFactor %dx%d %s", i, i, result ), 1, bestClocksSIMD, bestClocksGeneric );
2297 ALIGN16(
int index[COUNT] );
2303 for ( i = 0; i <
COUNT; i++ ) {
2308 baseJoints[
i].q = angles.
ToQuat();
2315 blendJoints[
i].q = angles.
ToQuat();
2322 bestClocksGeneric = 0;
2324 for ( j = 0; j <
COUNT; j++ ) {
2325 joints1[
j] = baseJoints[
j];
2330 GetBest( start, end, bestClocksGeneric );
2332 PrintClocks(
"generic->BlendJoints()", COUNT, bestClocksGeneric );
2336 for ( j = 0; j <
COUNT; j++ ) {
2337 joints2[
j] = baseJoints[
j];
2342 GetBest( start, end, bestClocksSIMD );
2345 for ( i = 0; i <
COUNT; i++ ) {
2346 if ( !joints1[i].
t.Compare( joints2[i].t, 1e-3
f ) ) {
2349 if ( !joints1[i].
q.Compare( joints2[i].q, 1e-2
f ) ) {
2354 PrintClocks(
va(
" simd->BlendJoints() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
2372 for ( i = 0; i <
COUNT; i++ ) {
2377 baseJoints[
i].q = angles.
ToQuat();
2383 bestClocksGeneric = 0;
2388 GetBest( start, end, bestClocksGeneric );
2390 PrintClocks(
"generic->ConvertJointQuatsToJointMats()", COUNT, bestClocksGeneric );
2397 GetBest( start, end, bestClocksSIMD );
2400 for ( i = 0; i <
COUNT; i++ ) {
2401 if ( !joints1[i].
Compare( joints2[i], 1e-4
f ) ) {
2406 PrintClocks(
va(
" simd->ConvertJointQuatsToJointMats() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
2424 for ( i = 0; i <
COUNT; i++ ) {
2429 baseJoints[
i].SetRotation( angles.
ToMat3() );
2434 baseJoints[
i].SetTranslation( v );
2437 bestClocksGeneric = 0;
2442 GetBest( start, end, bestClocksGeneric );
2444 PrintClocks(
"generic->ConvertJointMatsToJointQuats()", COUNT, bestClocksGeneric );
2451 GetBest( start, end, bestClocksSIMD );
2454 for ( i = 0; i <
COUNT; i++ ) {
2455 if ( !joints1[i].
q.Compare( joints2[i].q, 1e-4
f ) ) {
2459 if ( !joints1[i].
t.Compare( joints2[i].t, 1e-4
f ) ) {
2465 PrintClocks(
va(
" simd->ConvertJointMatsToJointQuats() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
2479 ALIGN16(
int parents[
COUNT+1] );
2484 for ( i = 0; i <=
COUNT; i++ ) {
2489 joints[
i].SetRotation( angles.
ToMat3() );
2494 joints[
i].SetTranslation( v );
2498 bestClocksGeneric = 0;
2500 for ( j = 0; j <=
COUNT; j++ ) {
2501 joints1[
j] = joints[
j];
2506 GetBest( start, end, bestClocksGeneric );
2512 for ( j = 0; j <=
COUNT; j++ ) {
2513 joints2[
j] = joints[
j];
2518 GetBest( start, end, bestClocksSIMD );
2521 for ( i = 0; i <
COUNT; i++ ) {
2522 if ( !joints1[i+1].
Compare( joints2[i+1], 1e-4
f ) ) {
2527 PrintClocks(
va(
" simd->TransformJoints() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
2541 ALIGN16(
int parents[
COUNT+1] );
2546 for ( i = 0; i <=
COUNT; i++ ) {
2551 joints[
i].SetRotation( angles.
ToMat3() );
2556 joints[
i].SetTranslation( v );
2560 bestClocksGeneric = 0;
2562 for ( j = 0; j <=
COUNT; j++ ) {
2563 joints1[
j] = joints[
j];
2568 GetBest( start, end, bestClocksGeneric );
2574 for ( j = 0; j <=
COUNT; j++ ) {
2575 joints2[
j] = joints[
j];
2580 GetBest( start, end, bestClocksSIMD );
2583 for ( i = 0; i <
COUNT; i++ ) {
2584 if ( !joints1[i+1].
Compare( joints2[i+1], 1e-4
f ) ) {
2589 PrintClocks(
va(
" simd->UntransformJoints() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
2597 #define NUMJOINTS 64
2598 #define NUMVERTS COUNT/2
2606 ALIGN16(
int weightIndex[COUNT*2] );
2616 joints[
i].SetRotation( angles.
ToMat3() );
2621 joints[
i].SetTranslation( v );
2624 for ( i = 0; i <
COUNT; i++ ) {
2629 weightIndex[i*2+0] = ( i * NUMJOINTS /
COUNT ) *
sizeof(
idJointMat );
2630 weightIndex[i*2+1] = i & 1;
2633 bestClocksGeneric = 0;
2638 GetBest( start, end, bestClocksGeneric );
2640 PrintClocks(
"generic->TransformVerts()", COUNT, bestClocksGeneric );
2647 GetBest( start, end, bestClocksSIMD );
2651 if ( !drawVerts1[i].xyz.Compare( drawVerts2[i].xyz, 0.5f ) ) {
2656 PrintClocks(
va(
" simd->TransformVerts() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
2669 ALIGN16(
byte cullBits1[COUNT] );
2670 ALIGN16(
byte cullBits2[COUNT] );
2671 byte totalOr1 = 0, totalOr2 = 0;
2680 planes[0][3] = -5.3f;
2681 planes[1][3] = 5.3f;
2682 planes[2][3] = -3.4f;
2683 planes[3][3] = 3.4f;
2685 for ( i = 0; i <
COUNT; i++ ) {
2686 for ( j = 0; j < 3; j++ ) {
2691 bestClocksGeneric = 0;
2694 p_generic->
TracePointCull( cullBits1, totalOr1, 0.0
f, planes, drawVerts, COUNT );
2696 GetBest( start, end, bestClocksGeneric );
2698 PrintClocks(
"generic->TracePointCull()", COUNT, bestClocksGeneric );
2703 p_simd->
TracePointCull( cullBits2, totalOr2, 0.0
f, planes, drawVerts, COUNT );
2705 GetBest( start, end, bestClocksSIMD );
2708 for ( i = 0; i <
COUNT; i++ ) {
2709 if ( cullBits1[i] != cullBits2[i] ) {
2713 result = ( i >= COUNT && totalOr1 == totalOr2 ) ?
"ok" :
S_COLOR_RED"X";
2714 PrintClocks(
va(
" simd->TracePointCull() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
2727 ALIGN16(
byte cullBits1[COUNT] );
2728 ALIGN16(
byte cullBits2[COUNT] );
2739 planes[0][3] = -5.3f;
2740 planes[1][3] = 5.3f;
2741 planes[2][3] = -4.4f;
2742 planes[3][3] = 4.4f;
2743 planes[4][3] = -3.5f;
2744 planes[5][3] = 3.5f;
2746 for ( i = 0; i <
COUNT; i++ ) {
2747 for ( j = 0; j < 3; j++ ) {
2752 bestClocksGeneric = 0;
2755 p_generic->
DecalPointCull( cullBits1, planes, drawVerts, COUNT );
2757 GetBest( start, end, bestClocksGeneric );
2759 PrintClocks(
"generic->DecalPointCull()", COUNT, bestClocksGeneric );
2766 GetBest( start, end, bestClocksSIMD );
2769 for ( i = 0; i <
COUNT; i++ ) {
2770 if ( cullBits1[i] != cullBits2[i] ) {
2775 PrintClocks(
va(
" simd->DecalPointCull() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
2788 ALIGN16(
byte cullBits1[COUNT] );
2789 ALIGN16(
byte cullBits2[COUNT] );
2790 ALIGN16(
idVec2 texCoords1[COUNT] );
2791 ALIGN16(
idVec2 texCoords2[COUNT] );
2798 planes[0][3] = -5.3f;
2799 planes[1][3] = -4.3f;
2801 for ( i = 0; i <
COUNT; i++ ) {
2802 for ( j = 0; j < 3; j++ ) {
2807 bestClocksGeneric = 0;
2810 p_generic->
OverlayPointCull( cullBits1, texCoords1, planes, drawVerts, COUNT );
2812 GetBest( start, end, bestClocksGeneric );
2814 PrintClocks(
"generic->OverlayPointCull()", COUNT, bestClocksGeneric );
2819 p_simd->
OverlayPointCull( cullBits2, texCoords2, planes, drawVerts, COUNT );
2821 GetBest( start, end, bestClocksSIMD );
2824 for ( i = 0; i <
COUNT; i++ ) {
2825 if ( cullBits1[i] != cullBits2[i] ) {
2828 if ( !texCoords1[i].
Compare( texCoords2[i], 1e-4
f ) ) {
2833 PrintClocks(
va(
" simd->OverlayPointCull() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
2846 ALIGN16(
idPlane planes1[COUNT] );
2847 ALIGN16(
idPlane planes2[COUNT] );
2848 ALIGN16(
int indexes[COUNT*3] );
2853 for ( i = 0; i <
COUNT; i++ ) {
2854 for ( j = 0; j < 3; j++ ) {
2857 for ( j = 0; j < 2; j++ ) {
2860 drawVerts2[
i] = drawVerts1[
i];
2863 for ( i = 0; i <
COUNT; i++ ) {
2864 indexes[i*3+0] = ( i + 0 ) % COUNT;
2865 indexes[i*3+1] = ( i + 1 ) % COUNT;
2866 indexes[i*3+2] = ( i + 2 ) % COUNT;
2869 bestClocksGeneric = 0;
2872 p_generic->
DeriveTriPlanes( planes1, drawVerts1, COUNT, indexes, COUNT*3 );
2874 GetBest( start, end, bestClocksGeneric );
2876 PrintClocks(
"generic->DeriveTriPlanes()", COUNT, bestClocksGeneric );
2881 p_simd->
DeriveTriPlanes( planes2, drawVerts2, COUNT, indexes, COUNT*3 );
2883 GetBest( start, end, bestClocksSIMD );
2886 for ( i = 0; i <
COUNT; i++ ) {
2887 if ( !planes1[i].
Compare( planes2[i], 1e-1
f, 1e-1
f ) ) {
2892 PrintClocks(
va(
" simd->DeriveTriPlanes() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
2905 ALIGN16(
idPlane planes1[COUNT] );
2906 ALIGN16(
idPlane planes2[COUNT] );
2907 ALIGN16(
int indexes[COUNT*3] );
2912 for ( i = 0; i <
COUNT; i++ ) {
2913 for ( j = 0; j < 3; j++ ) {
2916 for ( j = 0; j < 2; j++ ) {
2919 drawVerts2[
i] = drawVerts1[
i];
2922 for ( i = 0; i <
COUNT; i++ ) {
2923 indexes[i*3+0] = ( i + 0 ) % COUNT;
2924 indexes[i*3+1] = ( i + 1 ) % COUNT;
2925 indexes[i*3+2] = ( i + 2 ) % COUNT;
2928 bestClocksGeneric = 0;
2931 p_generic->
DeriveTangents( planes1, drawVerts1, COUNT, indexes, COUNT*3 );
2933 GetBest( start, end, bestClocksGeneric );
2935 PrintClocks(
"generic->DeriveTangents()", COUNT, bestClocksGeneric );
2940 p_simd->
DeriveTangents( planes2, drawVerts2, COUNT, indexes, COUNT*3 );
2942 GetBest( start, end, bestClocksSIMD );
2945 for ( i = 0; i <
COUNT; i++ ) {
2948 v1 = drawVerts1[
i].normal;
2950 v2 = drawVerts2[
i].normal;
2956 v1 = drawVerts1[
i].tangents[0];
2958 v2 = drawVerts2[
i].tangents[0];
2964 v1 = drawVerts1[
i].tangents[1];
2966 v2 = drawVerts2[
i].tangents[1];
2972 if ( !planes1[i].
Compare( planes2[i], 1e-1
f, 1e-1
f ) ) {
2977 PrintClocks(
va(
" simd->DeriveTangents() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
2995 for ( i = 0; i <
COUNT; i++ ) {
2996 for ( j = 0; j < 3; j++ ) {
2999 for ( j = 0; j < 2; j++ ) {
3002 drawVerts2[
i] = drawVerts1[
i];
3004 dominantTris[
i].v2 = ( i + 1 + srnd.
RandomInt( 8 ) ) % COUNT;
3005 dominantTris[
i].v3 = ( i + 9 + srnd.
RandomInt( 8 ) ) % COUNT;
3006 dominantTris[
i].normalizationScale[0] = srnd.
CRandomFloat();
3007 dominantTris[
i].normalizationScale[1] = srnd.
CRandomFloat();
3008 dominantTris[
i].normalizationScale[2] = srnd.
CRandomFloat();
3011 bestClocksGeneric = 0;
3016 GetBest( start, end, bestClocksGeneric );
3018 PrintClocks(
"generic->DeriveUnsmoothedTangents()", COUNT, bestClocksGeneric );
3025 GetBest( start, end, bestClocksSIMD );
3028 for ( i = 0; i <
COUNT; i++ ) {
3031 v1 = drawVerts1[
i].normal;
3033 v2 = drawVerts2[
i].normal;
3038 v1 = drawVerts1[
i].tangents[0];
3040 v2 = drawVerts2[
i].tangents[0];
3045 v1 = drawVerts1[
i].tangents[1];
3047 v2 = drawVerts2[
i].tangents[1];
3054 PrintClocks(
va(
" simd->DeriveUnsmoothedTangents() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
3071 for ( i = 0; i <
COUNT; i++ ) {
3072 for ( j = 0; j < 3; j++ ) {
3077 drawVerts2[
i] = drawVerts1[
i];
3080 bestClocksGeneric = 0;
3085 GetBest( start, end, bestClocksGeneric );
3087 PrintClocks(
"generic->NormalizeTangents()", COUNT, bestClocksGeneric );
3094 GetBest( start, end, bestClocksSIMD );
3097 for ( i = 0; i <
COUNT; i++ ) {
3098 if ( !drawVerts1[i].normal.Compare( drawVerts2[i].normal, 1e-2
f ) ) {
3101 if ( !drawVerts1[i].tangents[0].
Compare( drawVerts2[i].tangents[0], 1e-2
f ) ) {
3104 if ( !drawVerts1[i].tangents[1].
Compare( drawVerts2[i].tangents[1], 1e-2
f ) ) {
3110 if ( !drawVerts1[i].xyz.Compare( drawVerts2[i].xyz, 1e-2
f ) ) {
3115 PrintClocks(
va(
" simd->NormalizeTangents() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
3127 ALIGN16(
idVec4 texCoords1[COUNT] );
3128 ALIGN16(
idVec4 texCoords2[COUNT] );
3129 ALIGN16(
int indexes[COUNT*3] );
3130 ALIGN16(
idVec3 lightVectors1[COUNT] );
3131 ALIGN16(
idVec3 lightVectors2[COUNT] );
3137 for ( i = 0; i <
COUNT; i++ ) {
3138 for ( j = 0; j < 3; j++ ) {
3146 for ( i = 0; i <
COUNT; i++ ) {
3147 indexes[i*3+0] = ( i + 0 ) % COUNT;
3148 indexes[i*3+1] = ( i + 1 ) % COUNT;
3149 indexes[i*3+2] = ( i + 2 ) % COUNT;
3156 bestClocksGeneric = 0;
3161 GetBest( start, end, bestClocksGeneric );
3163 PrintClocks(
"generic->CreateTextureSpaceLightVectors()", COUNT, bestClocksGeneric );
3170 GetBest( start, end, bestClocksSIMD );
3173 for ( i = 0; i <
COUNT; i++ ) {
3174 if ( !lightVectors1[i].
Compare( lightVectors2[i], 1e-4
f ) ) {
3179 PrintClocks(
va(
" simd->CreateTextureSpaceLightVectors() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
3191 ALIGN16(
idVec4 texCoords1[COUNT] );
3192 ALIGN16(
idVec4 texCoords2[COUNT] );
3193 ALIGN16(
int indexes[COUNT*3] );
3194 ALIGN16(
idVec3 lightVectors1[COUNT] );
3195 ALIGN16(
idVec3 lightVectors2[COUNT] );
3196 idVec3 lightOrigin, viewOrigin;
3201 for ( i = 0; i <
COUNT; i++ ) {
3202 for ( j = 0; j < 3; j++ ) {
3210 for ( i = 0; i <
COUNT; i++ ) {
3211 indexes[i*3+0] = ( i + 0 ) % COUNT;
3212 indexes[i*3+1] = ( i + 1 ) % COUNT;
3213 indexes[i*3+2] = ( i + 2 ) % COUNT;
3223 bestClocksGeneric = 0;
3228 GetBest( start, end, bestClocksGeneric );
3230 PrintClocks(
"generic->CreateSpecularTextureCoords()", COUNT, bestClocksGeneric );
3237 GetBest( start, end, bestClocksSIMD );
3240 for ( i = 0; i <
COUNT; i++ ) {
3241 if ( !texCoords1[i].
Compare( texCoords2[i], 1e-2
f ) ) {
3246 PrintClocks(
va(
" simd->CreateSpecularTextureCoords() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
3258 ALIGN16(
idVec4 vertexCache1[COUNT*2] );
3259 ALIGN16(
idVec4 vertexCache2[COUNT*2] );
3260 ALIGN16(
int originalVertRemap[COUNT] );
3261 ALIGN16(
int vertRemap1[COUNT] );
3262 ALIGN16(
int vertRemap2[COUNT] );
3263 ALIGN16(
idVec3 lightOrigin );
3264 int numVerts1 = 0, numVerts2 = 0;
3269 for ( i = 0; i <
COUNT; i++ ) {
3273 originalVertRemap[
i] = ( srnd.
CRandomFloat() > 0.0f ) ? -1 : 0;
3279 bestClocksGeneric = 0;
3281 for ( j = 0; j <
COUNT; j++ ) {
3282 vertRemap1[
j] = originalVertRemap[
j];
3285 numVerts1 =p_generic->
CreateShadowCache( vertexCache1, vertRemap1, lightOrigin, drawVerts, COUNT );
3287 GetBest( start, end, bestClocksGeneric );
3289 PrintClocks(
"generic->CreateShadowCache()", COUNT, bestClocksGeneric );
3293 for ( j = 0; j <
COUNT; j++ ) {
3294 vertRemap2[
j] = originalVertRemap[
j];
3297 numVerts2 = p_simd->
CreateShadowCache( vertexCache2, vertRemap2, lightOrigin, drawVerts, COUNT );
3299 GetBest( start, end, bestClocksSIMD );
3302 for ( i = 0; i <
COUNT; i++ ) {
3303 if ( i < ( numVerts1 / 2 ) ) {
3304 if ( !vertexCache1[i*2+0].
Compare( vertexCache2[i*2+0], 1e-2
f ) ) {
3307 if ( !vertexCache1[i*2+1].
Compare( vertexCache2[i*2+1], 1e-2
f ) ) {
3311 if ( vertRemap1[i] != vertRemap2[i] ) {
3316 result = ( i >= COUNT && numVerts1 == numVerts2 ) ?
"ok" :
S_COLOR_RED"X";
3317 PrintClocks(
va(
" simd->CreateShadowCache() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
3319 bestClocksGeneric = 0;
3324 GetBest( start, end, bestClocksGeneric );
3326 PrintClocks(
"generic->CreateVertexProgramShadowCache()", COUNT, bestClocksGeneric );
3333 GetBest( start, end, bestClocksSIMD );
3336 for ( i = 0; i <
COUNT; i++ ) {
3337 if ( !vertexCache1[i*2+0].
Compare( vertexCache2[i*2+0], 1e-2
f ) ) {
3340 if ( !vertexCache1[i*2+1].
Compare( vertexCache2[i*2+1], 1e-2
f ) ) {
3345 PrintClocks(
va(
" simd->CreateVertexProgramShadowCache() %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
3353 #define SOUND_UPSAMPLE_EPSILON 1.0f
3359 ALIGN16(
float ogg0[MIXBUFFER_SAMPLES*2] );
3360 ALIGN16(
float ogg1[MIXBUFFER_SAMPLES*2] );
3361 ALIGN16(
float samples1[MIXBUFFER_SAMPLES*2] );
3362 ALIGN16(
float samples2[MIXBUFFER_SAMPLES*2] );
3364 int kHz, numSpeakers;
3369 for ( i = 0; i < MIXBUFFER_SAMPLES*2; i++ ) {
3370 pcm[
i] = srnd.
RandomInt( (1<<16) ) - (1<<15);
3378 for ( numSpeakers = 1; numSpeakers <= 2; numSpeakers++ ) {
3380 for ( kHz = 11025; kHz <= 44100; kHz *= 2 ) {
3381 bestClocksGeneric = 0;
3384 p_generic->
UpSamplePCMTo44kHz( samples1, pcm, MIXBUFFER_SAMPLES*numSpeakers*kHz/44100, kHz, numSpeakers );
3386 GetBest( start, end, bestClocksGeneric );
3388 PrintClocks(
va(
"generic->UpSamplePCMTo44kHz( %d, %d )", kHz, numSpeakers ), MIXBUFFER_SAMPLES*numSpeakers*kHz/44100, bestClocksGeneric );
3393 p_simd->
UpSamplePCMTo44kHz( samples2, pcm, MIXBUFFER_SAMPLES*numSpeakers*kHz/44100, kHz, numSpeakers );
3395 GetBest( start, end, bestClocksSIMD );
3398 for ( i = 0; i < MIXBUFFER_SAMPLES*numSpeakers; i++ ) {
3403 result = ( i >= MIXBUFFER_SAMPLES*numSpeakers ) ?
"ok" :
S_COLOR_RED"X";
3404 PrintClocks(
va(
" simd->UpSamplePCMTo44kHz( %d, %d ) %s", kHz, numSpeakers, result ), MIXBUFFER_SAMPLES*numSpeakers*kHz/44100, bestClocksSIMD, bestClocksGeneric );
3408 for ( numSpeakers = 1; numSpeakers <= 2; numSpeakers++ ) {
3410 for ( kHz = 11025; kHz <= 44100; kHz *= 2 ) {
3411 bestClocksGeneric = 0;
3414 p_generic->
UpSampleOGGTo44kHz( samples1, ogg, MIXBUFFER_SAMPLES*numSpeakers*kHz/44100, kHz, numSpeakers );
3416 GetBest( start, end, bestClocksGeneric );
3418 PrintClocks(
va(
"generic->UpSampleOGGTo44kHz( %d, %d )", kHz, numSpeakers ), MIXBUFFER_SAMPLES*numSpeakers*kHz/44100, bestClocksGeneric );
3423 p_simd->
UpSampleOGGTo44kHz( samples2, ogg, MIXBUFFER_SAMPLES*numSpeakers*kHz/44100, kHz, numSpeakers );
3425 GetBest( start, end, bestClocksSIMD );
3428 for ( i = 0; i < MIXBUFFER_SAMPLES*numSpeakers; i++ ) {
3434 PrintClocks(
va(
" simd->UpSampleOGGTo44kHz( %d, %d ) %s", kHz, numSpeakers, result ), MIXBUFFER_SAMPLES*numSpeakers*kHz/44100, bestClocksSIMD, bestClocksGeneric );
3444 #define SOUND_MIX_EPSILON 2.0f
3450 ALIGN16(
float mixBuffer1[MIXBUFFER_SAMPLES*6] );
3451 ALIGN16(
float mixBuffer2[MIXBUFFER_SAMPLES*6] );
3452 ALIGN16(
float samples[MIXBUFFER_SAMPLES*6] );
3453 ALIGN16(
short outSamples1[MIXBUFFER_SAMPLES*6] );
3454 ALIGN16(
short outSamples2[MIXBUFFER_SAMPLES*6] );
3461 for ( i = 0; i < 6; i++ ) {
3466 for ( i = 0; i < MIXBUFFER_SAMPLES*6; i++ ) {
3468 samples[
i] = srnd.
RandomInt( (1<<16) ) - (1<<15);
3471 bestClocksGeneric = 0;
3473 for ( j = 0; j < MIXBUFFER_SAMPLES*6; j++ ) {
3474 mixBuffer1[
j] = origMixBuffer[
j];
3479 GetBest( start, end, bestClocksGeneric );
3481 PrintClocks(
"generic->MixSoundTwoSpeakerMono()", MIXBUFFER_SAMPLES, bestClocksGeneric );
3486 for ( j = 0; j < MIXBUFFER_SAMPLES*6; j++ ) {
3487 mixBuffer2[
j] = origMixBuffer[
j];
3492 GetBest( start, end, bestClocksSIMD );
3495 for ( i = 0; i < MIXBUFFER_SAMPLES*6; i++ ) {
3500 result = ( i >= MIXBUFFER_SAMPLES*6 ) ?
"ok" :
S_COLOR_RED"X";
3501 PrintClocks(
va(
" simd->MixSoundTwoSpeakerMono() %s", result ), MIXBUFFER_SAMPLES, bestClocksSIMD, bestClocksGeneric );
3503 bestClocksGeneric = 0;
3505 for ( j = 0; j < MIXBUFFER_SAMPLES*6; j++ ) {
3506 mixBuffer1[
j] = origMixBuffer[
j];
3511 GetBest( start, end, bestClocksGeneric );
3513 PrintClocks(
"generic->MixSoundTwoSpeakerStereo()", MIXBUFFER_SAMPLES, bestClocksGeneric );
3518 for ( j = 0; j < MIXBUFFER_SAMPLES*6; j++ ) {
3519 mixBuffer2[
j] = origMixBuffer[
j];
3524 GetBest( start, end, bestClocksSIMD );
3527 for ( i = 0; i < MIXBUFFER_SAMPLES*6; i++ ) {
3532 result = ( i >= MIXBUFFER_SAMPLES*6 ) ?
"ok" :
S_COLOR_RED"X";
3533 PrintClocks(
va(
" simd->MixSoundTwoSpeakerStereo() %s", result ), MIXBUFFER_SAMPLES, bestClocksSIMD, bestClocksGeneric );
3536 bestClocksGeneric = 0;
3538 for ( j = 0; j < MIXBUFFER_SAMPLES*6; j++ ) {
3539 mixBuffer1[
j] = origMixBuffer[
j];
3544 GetBest( start, end, bestClocksGeneric );
3546 PrintClocks(
"generic->MixSoundSixSpeakerMono()", MIXBUFFER_SAMPLES, bestClocksGeneric );
3551 for ( j = 0; j < MIXBUFFER_SAMPLES*6; j++ ) {
3552 mixBuffer2[
j] = origMixBuffer[
j];
3557 GetBest( start, end, bestClocksSIMD );
3560 for ( i = 0; i < MIXBUFFER_SAMPLES*6; i++ ) {
3565 result = ( i >= MIXBUFFER_SAMPLES*6 ) ?
"ok" :
S_COLOR_RED"X";
3566 PrintClocks(
va(
" simd->MixSoundSixSpeakerMono() %s", result ), MIXBUFFER_SAMPLES, bestClocksSIMD, bestClocksGeneric );
3568 bestClocksGeneric = 0;
3570 for ( j = 0; j < MIXBUFFER_SAMPLES*6; j++ ) {
3571 mixBuffer1[
j] = origMixBuffer[
j];
3576 GetBest( start, end, bestClocksGeneric );
3578 PrintClocks(
"generic->MixSoundSixSpeakerStereo()", MIXBUFFER_SAMPLES, bestClocksGeneric );
3583 for ( j = 0; j < MIXBUFFER_SAMPLES*6; j++ ) {
3584 mixBuffer2[
j] = origMixBuffer[
j];
3589 GetBest( start, end, bestClocksSIMD );
3592 for ( i = 0; i < MIXBUFFER_SAMPLES*6; i++ ) {
3597 result = ( i >= MIXBUFFER_SAMPLES*6 ) ?
"ok" :
S_COLOR_RED"X";
3598 PrintClocks(
va(
" simd->MixSoundSixSpeakerStereo() %s", result ), MIXBUFFER_SAMPLES, bestClocksSIMD, bestClocksGeneric );
3601 for ( i = 0; i < MIXBUFFER_SAMPLES*6; i++ ) {
3602 origMixBuffer[
i] = srnd.
RandomInt( (1<<17) ) - (1<<16);
3605 bestClocksGeneric = 0;
3607 for ( j = 0; j < MIXBUFFER_SAMPLES*6; j++ ) {
3608 mixBuffer1[
j] = origMixBuffer[
j];
3613 GetBest( start, end, bestClocksGeneric );
3615 PrintClocks(
"generic->MixedSoundToSamples()", MIXBUFFER_SAMPLES, bestClocksGeneric );
3619 for ( j = 0; j < MIXBUFFER_SAMPLES*6; j++ ) {
3620 mixBuffer2[
j] = origMixBuffer[
j];
3625 GetBest( start, end, bestClocksSIMD );
3628 for ( i = 0; i < MIXBUFFER_SAMPLES*6; i++ ) {
3629 if ( outSamples1[i] != outSamples2[i] ) {
3633 result = ( i >= MIXBUFFER_SAMPLES*6 ) ?
"ok" :
S_COLOR_RED"X";
3634 PrintClocks(
va(
" simd->MixedSoundToSamples() %s", result ), MIXBUFFER_SAMPLES, bestClocksSIMD, bestClocksGeneric );
3650 float testvar = 1.0f;
3659 GetBest( start, end, bestClocks );
3660 testvar = ( testvar + tst ) * tst;
3669 int tmp = * (
int * ) &tst;
3671 tst = * (
float * ) &tmp;
3673 GetBest( start, end, bestClocks );
3674 testvar = ( testvar + tst ) * tst;
3677 PrintClocks(
" idMath::Fabs( tst )", 1, bestClocks );
3685 GetBest( start, end, bestClocks );
3686 testvar = ( testvar + tst ) * tst * 0.01
f;
3697 GetBest( start, end, bestClocks );
3698 testvar = ( testvar + tst ) * tst;
3701 PrintClocks(
" idMath::Sqrt( tst )", 1, bestClocks );
3709 GetBest( start, end, bestClocks );
3710 testvar = ( testvar + tst ) * tst;
3713 PrintClocks(
" idMath::Sqrt16( tst )", 1, bestClocks );
3721 GetBest( start, end, bestClocks );
3722 testvar = ( testvar + tst ) * tst;
3725 PrintClocks(
" idMath::Sqrt64( tst )", 1, bestClocks );
3733 GetBest( start, end, bestClocks );
3734 testvar = ( testvar + tst ) * tst;
3737 PrintClocks(
" idMath::RSqrt( tst )", 1, bestClocks );
3745 GetBest( start, end, bestClocks );
3746 testvar = ( testvar + tst ) * tst;
3749 PrintClocks(
" idMath::Sin( tst )", 1, bestClocks );
3757 GetBest( start, end, bestClocks );
3758 testvar = ( testvar + tst ) * tst;
3761 PrintClocks(
" idMath::Sin16( tst )", 1, bestClocks );
3769 GetBest( start, end, bestClocks );
3770 testvar = ( testvar + tst ) * tst;
3773 PrintClocks(
" idMath::Cos( tst )", 1, bestClocks );
3781 GetBest( start, end, bestClocks );
3782 testvar = ( testvar + tst ) * tst;
3785 PrintClocks(
" idMath::Cos16( tst )", 1, bestClocks );
3793 GetBest( start, end, bestClocks );
3794 testvar = ( testvar + tst ) * tst;
3797 PrintClocks(
" idMath::SinCos( tst )", 1, bestClocks );
3805 GetBest( start, end, bestClocks );
3806 testvar = ( testvar + tst ) * tst;
3809 PrintClocks(
"idMath::SinCos16( tst )", 1, bestClocks );
3817 GetBest( start, end, bestClocks );
3818 testvar = ( testvar + tst ) * tst;
3821 PrintClocks(
" idMath::Tan( tst )", 1, bestClocks );
3829 GetBest( start, end, bestClocks );
3830 testvar = ( testvar + tst ) * tst;
3833 PrintClocks(
" idMath::Tan16( tst )", 1, bestClocks );
3841 GetBest( start, end, bestClocks );
3842 testvar = ( testvar + tst ) * tst * ( 1.0
f /
idMath::PI );
3845 PrintClocks(
" idMath::ASin( tst )", 1, bestClocks );
3853 GetBest( start, end, bestClocks );
3854 testvar = ( testvar + tst ) * tst * ( 1.0
f /
idMath::PI );
3857 PrintClocks(
" idMath::ASin16( tst )", 1, bestClocks );
3865 GetBest( start, end, bestClocks );
3866 testvar = ( testvar + tst ) * tst * ( 1.0
f /
idMath::PI );
3869 PrintClocks(
" idMath::ACos( tst )", 1, bestClocks );
3877 GetBest( start, end, bestClocks );
3878 testvar = ( testvar + tst ) * tst * ( 1.0
f /
idMath::PI );
3881 PrintClocks(
" idMath::ACos16( tst )", 1, bestClocks );
3889 GetBest( start, end, bestClocks );
3890 testvar = ( testvar + tst ) * tst;
3893 PrintClocks(
" idMath::ATan( tst )", 1, bestClocks );
3901 GetBest( start, end, bestClocks );
3902 testvar = ( testvar + tst ) * tst;
3905 PrintClocks(
" idMath::ATan16( tst )", 1, bestClocks );
3913 GetBest( start, end, bestClocks );
3914 testvar = ( testvar + tst ) * tst * 0.1
f;
3917 PrintClocks(
" idMath::Pow( tst )", 1, bestClocks );
3925 GetBest( start, end, bestClocks );
3926 testvar = ( testvar + tst ) * tst * 0.1
f;
3929 PrintClocks(
" idMath::Pow16( tst )", 1, bestClocks );
3937 GetBest( start, end, bestClocks );
3938 testvar = ( testvar + tst ) * tst * 0.1
f;
3941 PrintClocks(
" idMath::Exp( tst )", 1, bestClocks );
3949 GetBest( start, end, bestClocks );
3950 testvar = ( testvar + tst ) * tst * 0.1
f;
3953 PrintClocks(
" idMath::Exp16( tst )", 1, bestClocks );
3958 tst = fabs( tst ) + 1.0f;
3962 GetBest( start, end, bestClocks );
3963 testvar = ( testvar + tst ) * tst;
3966 PrintClocks(
" idMath::Log( tst )", 1, bestClocks );
3971 tst = fabs( tst ) + 1.0f;
3975 GetBest( start, end, bestClocks );
3976 testvar = ( testvar + tst ) * tst;
3979 PrintClocks(
" idMath::Log16( tst )", 1, bestClocks );
3984 idQuat fromQuat, toQuat, resultQuat;
3996 resultMat3 = fromQuat.
ToMat3();
3998 GetBest( start, end, bestClocks );
4000 PrintClocks(
" idQuat::ToMat3()", 1, bestClocks );
4005 resultQuat.
Slerp( fromQuat, toQuat, 0.3
f );
4007 GetBest( start, end, bestClocks );
4014 resultQuat = cq.
ToQuat();
4016 GetBest( start, end, bestClocks );
4018 PrintClocks(
" idCQuat::ToQuat()", 1, bestClocks );
4023 resultQuat = ang.
ToQuat();
4025 GetBest( start, end, bestClocks );
4027 PrintClocks(
" idAngles::ToQuat()", 1, bestClocks );
4032 resultMat3 = ang.
ToMat3();
4034 GetBest( start, end, bestClocks );
4036 PrintClocks(
" idAngles::ToMat3()", 1, bestClocks );
4049 ALIGN16(
float fsrc0[
COUNT] );
4050 ALIGN16(
float fsrc1[COUNT] );
4051 ALIGN16(
float fsrc2[COUNT] );
4057 for ( i = 0; i <
COUNT; i++ ) {
4064 bestClocksGeneric = 0;
4067 memcpy( &fsrc1[0], &fsrc0[0], COUNT *
sizeof(
float) );
4070 p_generic->
Negate16( fsrc1, COUNT );
4072 GetBest( start, end, bestClocksGeneric );
4074 PrintClocks(
"generic->Negate16( float[] )", COUNT, bestClocksGeneric );
4079 memcpy( &fsrc2[0], &fsrc0[0], COUNT *
sizeof(
float) );
4084 GetBest( start, end, bestClocksSIMD );
4087 for ( i = 0; i <
COUNT; i++ ) {
4088 if ( fsrc1[i] != fsrc2[i] ) {
4093 PrintClocks(
va(
" simd->Negate16( float[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric );
4105 SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL );
4109 p_generic =
generic;
4123 }
else if (
idStr::Icmp( argString,
"3DNow" ) == 0 ) {
4125 common->
Printf(
"CPU does not support MMX & 3DNow\n" );
4129 }
else if (
idStr::Icmp( argString,
"SSE" ) == 0 ) {
4135 }
else if (
idStr::Icmp( argString,
"SSE2" ) == 0 ) {
4137 common->
Printf(
"CPU does not support MMX & SSE & SSE2\n" );
4141 }
else if (
idStr::Icmp( argString,
"SSE3" ) == 0 ) {
4143 common->
Printf(
"CPU does not support MMX & SSE & SSE2 & SSE3\n" );
4147 }
else if (
idStr::Icmp( argString,
"AltiVec" ) == 0 ) {
4154 common->
Printf(
"invalid argument, use: MMX, 3DNow, SSE, SSE2, SSE3, AltiVec\n" );
4216 if ( p_simd != processor ) {
4223 SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_NORMAL );
GLdouble GLdouble GLdouble GLdouble q
#define MATX_LDLT_SIMD_EPSILON
static float ATan16(float a)
void TestMatXLowerTriangularSolve(void)
idVecX TransposeMultiply(const idVecX &vec) const
void TestGetTextureSpaceLightVectors(void)
void TestMatXTransposeMultiplyMatX(void)
virtual void VPCALL ConvertJointQuatsToJointMats(idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints)=0
static float Log16(float f)
static const float INFINITY
void TestBlendJoints(void)
bool Compare(const idVec3 &a) const
virtual void VPCALL MatX_TransposeMultiplyAddVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)=0
static double Sqrt64(float x)
void TestConvertJointQuatsToJointMats(void)
static float Log(float f)
void TestSoundUpSampling(void)
virtual void VPCALL Sub(float *dst, const float constant, const float *src, const int count)=0
bool Compare(const idMat3 &a) const
virtual void VPCALL UpSamplePCMTo44kHz(float *dest, const short *pcm, const int numSamples, const int kHz, const int numChannels)=0
static float Exp16(float f)
static float Tan16(float a)
static float ACos16(float a)
virtual void VPCALL DeriveTangents(idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes)=0
idMat3 ToMat3(void) const
void TestDeriveTangents(void)
const int MIXBUFFER_SAMPLES
static void Test_f(const class idCmdArgs &args)
#define SOUND_UPSAMPLE_EPSILON
virtual void VPCALL MatX_TransposeMultiplyMatX(idMatX &dst, const idMatX &m1, const idMatX &m2)=0
#define StartRecordTime(start)
virtual const char *VPCALL GetName(void) const =0
virtual void VPCALL CreateTextureSpaceLightVectors(idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes)=0
#define SOUND_MIX_EPSILON
virtual void VPCALL Div(float *dst, const float constant, const float *src, const int count)=0
virtual void VPCALL Clamp(float *dst, const float *src, const float min, const float max, const int count)=0
virtual void VPCALL CmpGE(byte *dst, const float *src0, const float constant, const int count)=0
void TestMatXLowerTriangularSolveTranspose(void)
const char * Args(int start=1, int end=-1, bool escapeArgs=false) const
virtual void VPCALL MixSoundTwoSpeakerMono(float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2])=0
virtual void VPCALL MatX_MultiplyMatX(idMatX &dst, const idMatX &m1, const idMatX &m2)=0
static float Sqrt(float x)
void TestUntransformJoints(void)
virtual void VPCALL OverlayPointCull(byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts)=0
virtual void VPCALL Memset(void *dst, const int val, const int count)=0
void SetNormal(const idVec3 &normal)
static float Sqrt16(float x)
#define MATX_LTS_SOLVE_SIZE
#define MATX_MATX_SIMD_EPSILON
idQuat & Slerp(const idQuat &from, const idQuat &to, float t)
virtual void VPCALL MatX_LowerTriangularSolveTranspose(const idMatX &L, float *x, const float *b, const int n)=0
static void SinCos(float a, float &s, float &c)
virtual void VPCALL MatX_TransposeMultiplyVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)=0
int Icmp(const char *text) const
static float ASin(float a)
static float Cos16(float a)
virtual void VPCALL ConvertJointMatsToJointQuats(idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints)=0
static float Sin16(float a)
static void Shutdown(void)
void TestCreateShadowCache(void)
virtual void VPCALL CmpLE(byte *dst, const float *src0, const float constant, const int count)=0
virtual void FPU_SetDAZ(bool enable)=0
GLfloat GLfloat GLfloat v2
#define StopRecordTime(end)
static float Pow16(float x, float y)
virtual void VPCALL CreateSpecularTextureCoords(idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes)=0
#define MATX_LTS_SIMD_EPSILON
void Random(int seed, float l=0.0f, float u=1.0f)
virtual void VPCALL MinMax(float &min, float &max, const float *src, const int count)=0
static float ASin16(float a)
virtual void VPCALL CmpLT(byte *dst, const float *src0, const float constant, const int count)=0
virtual void VPCALL BlendJoints(idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints)=0
virtual void VPCALL TransformVerts(idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights)=0
static float ATan(float a)
virtual void VPCALL MatX_MultiplyVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)=0
virtual void VPCALL UpSampleOGGTo44kHz(float *dest, const float *const *ogg, const int numSamples, const int kHz, const int numChannels)=0
static float Sin(float a)
void TestMatXMultiplyAddVecX(void)
virtual void VPCALL MulAdd(float *dst, const float constant, const float *src, const int count)=0
static float Fabs(float f)
virtual void VPCALL DeriveTriPlanes(idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes)=0
virtual cpuid_t GetProcessorId(void)=0
void TestDeriveUnsmoothedTangents(void)
virtual void VPCALL Add(float *dst, const float constant, const float *src, const int count)=0
void TestMatXMultiplyVecX(void)
idQuat ToQuat(void) const
virtual void VPCALL MixSoundSixSpeakerStereo(float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6])=0
void TestTransformVerts(void)
const float * ToFloatPtr(void) const
virtual void VPCALL MixSoundSixSpeakerMono(float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6])=0
virtual void VPCALL MatX_LowerTriangularSolve(const idMatX &L, float *x, const float *b, const int n, int skip=0)=0
virtual void VPCALL Dot(float *dst, const idVec3 &constant, const idVec3 *src, const int count)=0
virtual void VPCALL MixSoundTwoSpeakerStereo(float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2])=0
void TestNormalizeTangents(void)
void Random(int seed, float l=0.0f, float u=1.0f)
static void InitProcessor(const char *module, bool forceGeneric)
virtual void VPCALL Mul(float *dst, const float constant, const float *src, const int count)=0
idCQuat ToCQuat(void) const
static float Tan(float a)
void TestMatXTransposeMultiplyAddVecX(void)
virtual void Printf(const char *fmt,...) id_attribute((format(printf
static float Exp(float f)
static void SinCos16(float a, float &s, float &c)
void TestMatXLDLTFactor(void)
#define MATX_LDLT_FACTOR_SOLVE_SIZE
virtual bool VPCALL MatX_LDLTFactor(idMatX &mat, idVecX &invDiag, const int n)=0
const char * ToString(int precision=2) const
virtual void VPCALL DeriveUnsmoothedTangents(idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts)=0
virtual void VPCALL Memcpy(void *dst, const void *src, const int count)=0
virtual void VPCALL UntransformJoints(idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint)=0
static float Pow(float x, float y)
virtual void VPCALL TransformJoints(idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint)=0
void TestDecalPointCull(void)
virtual void VPCALL TracePointCull(byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts)=0
void SetSize(int rows, int columns)
virtual void VPCALL MatX_MultiplyAddVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)=0
void TestMatXTransposeMultiplyVecX(void)
virtual void VPCALL MulSub(float *dst, const float constant, const float *src, const int count)=0
void TestTracePointCull(void)
void PrintClocks(const char *string, int dataCount, int clocks, int otherClocks=0)
idMat3 ToMat3(void) const
virtual void FPU_SetFTZ(bool enable)=0
void TestTransformJoints(void)
void TestConvertJointMatsToJointQuats(void)
idQuat ToQuat(void) const
void TestDeriveTriPlanes(void)
virtual void VPCALL NormalizeTangents(idDrawVert *verts, const int numVerts)=0
idSIMDProcessor * p_generic
virtual void VPCALL MixedSoundToSamples(short *samples, const float *mixBuffer, const int numSamples)=0
idSIMDProcessor * processor
bool Compare(const idVecX &a) const
virtual void VPCALL CmpGT(byte *dst, const float *src0, const float constant, const int count)=0
void TestMatXMultiplyMatX(void)
void TestOverlayPointCull(void)
const char * Argv(int arg) const
void TestGetSpecularTextureCoords(void)
virtual void SetRefreshOnPrint(bool set)=0
#define GetBest(start, end, best)
char * va(const char *fmt,...)
static float ACos(float a)
void TestSoundMixing(void)
virtual void VPCALL Negate16(float *dst, const int count)=0
void Replace(const char *old, const char *nw)
virtual void VPCALL ClampMin(float *dst, const float *src, const float min, const int count)=0
#define MATX_SIMD_EPSILON
int LengthWithoutColors(void) const
virtual void VPCALL ClampMax(float *dst, const float *src, const float max, const int count)=0
static float RSqrt(float x)
bool Compare(const idMatX &a) const
virtual void VPCALL DecalPointCull(byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts)=0
static class idCommon * common
virtual int VPCALL CreateShadowCache(idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts)=0
virtual int VPCALL CreateVertexProgramShadowCache(idVec4 *vertexCache, const idDrawVert *verts, const int numVerts)=0
static float Cos(float a)
idSIMDProcessor * SIMDProcessor