#include "../precompiled.h"

#if defined(MACOS_X) && defined(__i386__)

#include <xmmintrin.h>

#define DRAWVERT_SIZE 60
#define DRAWVERT_XYZ_OFFSET (0*4)
#define DRAWVERT_ST_OFFSET (3*4)
#define DRAWVERT_NORMAL_OFFSET (5*4)
#define DRAWVERT_TANGENT0_OFFSET (8*4)
#define DRAWVERT_TANGENT1_OFFSET (11*4)
#define DRAWVERT_COLOR_OFFSET (14*4)

#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
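// SHUFFLEPS places its first argument in the high two bits of the shufps immediate,
// so the arguments are listed from the highest lane down; R_SHUFFLEPS reverses the
// order, making R_SHUFFLEPS( 0, 1, 2, 3 ) the identity shuffle with the arguments
// listed in low-to-high (array) order.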
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

char *constant_p = (char *)&constant;
char *src_p = (char *) src;
char *dst_p = (char *) dst;

count_l4 = count_l4 & ~3;
xmm4 = _mm_load_ss((float *) (constant_p));
xmm4 = _mm_shuffle_ps(xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm5 = _mm_load_ss((float *) (constant_p + 4));
xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm6 = _mm_load_ss((float *) (constant_p + 8));
xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm7 = _mm_load_ss((float *) (constant_p + 12));
xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));

count_l4 = count_l4 * DRAWVERT_SIZE;
src_p = src_p + count_l4;
count_l4 = -count_l4;
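// The code below gathers the x, y and z components of four vertex positions
// (DRAWVERT_SIZE bytes apart) and transposes them so that xmm0 holds four x values,
// xmm1 four y values and xmm2 four z values; with the four components of constant
// broadcast into xmm4-xmm7 above, each iteration produces four
// x*c0 + y*c1 + z*c2 + c3 results.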
xmm0 = _mm_load_ss((float *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0));
xmm2 = _mm_load_ss((float *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8));
xmm0 = _mm_loadh_pi(xmm0, (__m64 *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0));
xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4));
xmm2 = _mm_shuffle_ps(xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ));
xmm3 = _mm_load_ss((float *) (src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0));
xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0));
xmm0 = _mm_shuffle_ps(xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ));
xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4));
xmm1 = _mm_shuffle_ps(xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ));
xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8));
xmm2 = _mm_shuffle_ps(xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ));

count_l4 = count_l4 + 4*DRAWVERT_SIZE;

xmm0 = _mm_mul_ps(xmm0, xmm4);
xmm1 = _mm_mul_ps(xmm1, xmm5);
xmm2 = _mm_mul_ps(xmm2, xmm6);
xmm0 = _mm_add_ps(xmm0, xmm7);
xmm0 = _mm_add_ps(xmm0, xmm1);
xmm0 = _mm_add_ps(xmm0, xmm2);

_mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
_mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
} while(count_l4 < 0);
count_l1 = count_l1 & 3;

xmm0 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+0));
xmm1 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+4));
xmm2 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+8));
xmm0 = _mm_mul_ss(xmm0, xmm4);
xmm1 = _mm_mul_ss(xmm1, xmm5);
xmm2 = _mm_mul_ss(xmm2, xmm6);
xmm0 = _mm_add_ss(xmm0, xmm7);
xmm0 = _mm_add_ss(xmm0, xmm1);
count_l4 = count_l4 + DRAWVERT_SIZE;
xmm0 = _mm_add_ss(xmm0, xmm2);
count_l1 = count_l1 - 1;
_mm_store_ss((float *) (dst_p-4), xmm0);
} while( count_l1 != 0);
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

xmm1 = _mm_xor_ps(xmm0, xmm0);
xmm0 = _mm_shuffle_ps(xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm1 = _mm_sub_ps(xmm1, xmm0);

indexes_p = (char *) indexes;
src_p = (char *) src;
count_l = count_l & ~3;

count_l = count_l << 2;
indexes_p = indexes_p + count_l;

edx = *((int*)(indexes_p+count_l+0));
edx = edx * DRAWVERT_SIZE;
xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
xmm0 = _mm_min_ps(xmm0, xmm4);
xmm1 = _mm_max_ps(xmm1, xmm4);

edx = *((int*)(indexes_p+count_l+4));
edx = edx * DRAWVERT_SIZE;
xmm5 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
xmm5 = _mm_loadh_pi(xmm5, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
xmm2 = _mm_min_ps(xmm2, xmm5);
xmm3 = _mm_max_ps(xmm3, xmm5);

edx = *((int*)(indexes_p+count_l+8));
edx = edx * DRAWVERT_SIZE;
xmm6 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
xmm6 = _mm_loadh_pi(xmm6, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
xmm0 = _mm_min_ps(xmm0, xmm6);
xmm1 = _mm_max_ps(xmm1, xmm6);

edx = *((int*)(indexes_p+count_l+12));
edx = edx * DRAWVERT_SIZE;
xmm7 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
xmm7 = _mm_loadh_pi(xmm7, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
xmm2 = _mm_min_ps(xmm2, xmm7);
xmm3 = _mm_max_ps(xmm3, xmm7);

count_l = count_l + 4*4;
} while (count_l < 0);

count_l = count_l & 3;

count_l = count_l << 2;
indexes_p = indexes_p + count_l;

edx = *((int*)(indexes_p+count_l+0));
edx = edx * DRAWVERT_SIZE;
xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
xmm0 = _mm_min_ps(xmm0, xmm4);
xmm1 = _mm_max_ps(xmm1, xmm4);

count_l = count_l + 4;
} while (count_l < 0);

xmm2 = _mm_shuffle_ps(xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ));
xmm3 = _mm_shuffle_ps(xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ));
xmm0 = _mm_min_ps(xmm0, xmm2);
xmm1 = _mm_max_ps(xmm1, xmm3);
min_p = (char *) &min;
_mm_storeh_pi((__m64 *)(min_p), xmm0);
_mm_store_ss((float *)(min_p+8), xmm0);
max_p = (char *) &max;
_mm_storeh_pi((__m64 *)(max_p), xmm1);
_mm_store_ss((float *)(max_p+8), xmm1);
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

constant_p = (char *) &constant;
src_p = (char *) src;
dst_p = (char *) dst;
count_l4 = count_l4 & ~3;

xmm5 = _mm_load_ss((float *) (constant_p+0));
xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm6 = _mm_load_ss((float *) (constant_p+4));
xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm7 = _mm_load_ss((float *) (constant_p+8));
xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));

count_l4 = count_l4 * 16;
src_p = src_p + count_l4;
count_l4 = -count_l4;

xmm1 = _mm_loadl_pi(xmm1, (__m64 *)(src_p+count_l4+ 0));
xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+ 8));
xmm1 = _mm_loadh_pi(xmm1, (__m64 *)(src_p+count_l4+16));
xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+24));
xmm2 = _mm_loadl_pi(xmm2, (__m64 *)(src_p+count_l4+32));
xmm4 = _mm_loadl_pi(xmm4, (__m64 *)(src_p+count_l4+40));
xmm2 = _mm_loadh_pi(xmm2, (__m64 *)(src_p+count_l4+48));
xmm4 = _mm_loadh_pi(xmm4, (__m64 *)(src_p+count_l4+56));

xmm0 = _mm_shuffle_ps(xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ));
xmm1 = _mm_shuffle_ps(xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ));
xmm2 = _mm_shuffle_ps(xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ));
xmm3 = _mm_shuffle_ps(xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ));

count_l4 = count_l4 + 4*16;

xmm0 = _mm_mul_ps(xmm0, xmm5);
xmm1 = _mm_mul_ps(xmm1, xmm6);
xmm2 = _mm_mul_ps(xmm2, xmm7);
xmm0 = _mm_add_ps(xmm0, xmm3);
xmm0 = _mm_add_ps(xmm0, xmm1);
xmm0 = _mm_add_ps(xmm0, xmm2);

_mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
_mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
} while (count_l4 < 0);

count_l1 = count_l1 & 3;

xmm0 = _mm_load_ss((float *) (src_p+count_l4+ 0));
xmm1 = _mm_load_ss((float *) (src_p+count_l4+ 4));
xmm2 = _mm_load_ss((float *) (src_p+count_l4+ 8));
xmm3 = _mm_load_ss((float *) (src_p+count_l4+12));

xmm0 = _mm_mul_ss(xmm0, xmm5);
xmm1 = _mm_mul_ss(xmm1, xmm6);
xmm2 = _mm_mul_ss(xmm2, xmm7);

xmm0 = _mm_add_ss(xmm0, xmm3);
xmm0 = _mm_add_ss(xmm0, xmm1);
count_l4 = count_l4 + 16;
xmm0 = _mm_add_ss(xmm0, xmm2);
count_l1 = count_l1 - 1;
_mm_store_ss((float *) (dst_p-4), xmm0);
} while (count_l1 != 0);
#elif defined(_WIN32)

#include <xmmintrin.h>

#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
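// The TRANSPOSE_*/TRANPOSE_* macros below transpose 4x4 and 4x3 blocks of floats
// between xmm registers and memory using unpcklps/unpckhps and shufps; the 4x4
// register variants take an extra register argument that is used as scratch.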
#define TRANSPOSE_4x4( reg0, reg1, reg2, reg3, reg4 ) \
__asm movaps reg4, reg2 \
__asm unpcklps reg2, reg3 \
__asm unpckhps reg4, reg3 \
__asm movaps reg3, reg0 \
__asm unpcklps reg0, reg1 \
__asm unpckhps reg3, reg1 \
__asm movaps reg1, reg0 \
__asm shufps reg0, reg2, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps reg1, reg2, R_SHUFFLEPS( 2, 3, 2, 3 ) \
__asm movaps reg2, reg3 \
__asm shufps reg2, reg4, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps reg3, reg4, R_SHUFFLEPS( 2, 3, 2, 3 )

#define TRANPOSE_4x4_FROM_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \
__asm movlps reg1, [address+ 0] \
__asm movlps reg3, [address+ 8] \
__asm movhps reg1, [address+16] \
__asm movhps reg3, [address+24] \
__asm movlps reg2, [address+32] \
__asm movlps reg4, [address+40] \
__asm movhps reg2, [address+48] \
__asm movhps reg4, [address+56] \
__asm movaps reg0, reg1 \
__asm shufps reg0, reg2, R_SHUFFLEPS( 0, 2, 0, 2 ) \
__asm shufps reg1, reg2, R_SHUFFLEPS( 1, 3, 1, 3 ) \
__asm movaps reg2, reg3 \
__asm shufps reg2, reg4, R_SHUFFLEPS( 0, 2, 0, 2 ) \
__asm shufps reg3, reg4, R_SHUFFLEPS( 1, 3, 1, 3 )

#define TRANPOSE_4x4_TO_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \
__asm movaps reg4, reg0 \
__asm unpcklps reg0, reg1 \
__asm unpckhps reg4, reg1 \
__asm movaps reg1, reg2 \
__asm unpcklps reg2, reg3 \
__asm unpckhps reg1, reg3 \
__asm movlps [address+ 0], reg0 \
__asm movlps [address+ 8], reg2 \
__asm movhps [address+16], reg0 \
__asm movhps [address+24], reg2 \
__asm movlps [address+32], reg4 \
__asm movlps [address+40], reg1 \
__asm movhps [address+48], reg4 \
__asm movhps [address+56], reg1

#define TRANSPOSE_4x3( reg0, reg1, reg2, reg3 ) \
__asm movaps reg3, reg2 \
__asm shufps reg3, reg1, R_SHUFFLEPS( 2, 3, 0, 1 ) \
__asm shufps reg2, reg0, R_SHUFFLEPS( 0, 1, 2, 3 ) \
__asm shufps reg1, reg0, R_SHUFFLEPS( 2, 3, 0, 1 ) \
__asm movaps reg0, reg1 \
__asm shufps reg0, reg2, R_SHUFFLEPS( 2, 0, 3, 1 ) \
__asm shufps reg1, reg3, R_SHUFFLEPS( 3, 1, 2, 0 ) \
__asm shufps reg2, reg3, R_SHUFFLEPS( 2, 0, 3, 1 )

#define TRANSPOSE_4x3_FROM_MEMORY( address, reg0, reg1, reg2, reg3 ) \
__asm movlps reg1, [address+ 0] \
__asm movlps reg2, [address+ 8] \
__asm movlps reg3, [address+16] \
__asm movhps reg1, [address+24] \
__asm movhps reg2, [address+32] \
__asm movhps reg3, [address+40] \
__asm movaps reg0, reg1 \
__asm shufps reg0, reg2, R_SHUFFLEPS( 0, 2, 1, 3 ) \
__asm shufps reg1, reg3, R_SHUFFLEPS( 1, 3, 0, 2 ) \
__asm shufps reg2, reg3, R_SHUFFLEPS( 0, 2, 1, 3 )

#define TRANSPOSE_4x3_TO_MEMORY( address, reg0, reg1, reg2, reg3 ) \
__asm movhlps reg3, reg0 \
__asm unpcklps reg0, reg1 \
__asm unpckhps reg1, reg2 \
__asm unpcklps reg2, reg3 \
__asm movlps [address+ 0], reg0 \
__asm movlps [address+ 8], reg2 \
__asm movlps [address+16], reg1 \
__asm movhps [address+24], reg0 \
__asm movhps [address+32], reg2 \
__asm movhps [address+40], reg1
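// The KFLOATINIT* macros below set up the registers shared by the unrolled float
// kernels (edi = destination, edx/esi = sources, ebx = loop counter); judging from
// the masking with 0xfffffff8, the main loops work on blocks of eight floats and the
// leftover head/tail elements are handled by the PRE/POST scalar paths.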
#define KFLOATINITS( SRC0, COUNT, PRE, POST ) KFLOATINITDSS( SRC0,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITD( DST, COUNT, PRE, POST ) KFLOATINITDSS( DST,DST,DST,COUNT,PRE,POST )
#define KFLOATINITDS( DST, SRC0, COUNT, PRE, POST ) KFLOATINITDSS( DST,SRC0,SRC0,COUNT,PRE,POST )

#define KFLOATINITDSS( DST, SRC0, SRC1, COUNT, PRE, POST )\
__asm mov ebx,COUNT \
__asm jge noUnderFlow \
__asm mov ecx,COUNT \
__asm and ebx,0xfffffff8 \
__asm lea ecx,[ecx*4+ebx] \

#define KFLOATINITS_NA( SRC0, COUNT, PRE, POST ) KFLOATINITDSS_NA( SRC0,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITD_NA( DST, COUNT, PRE, POST ) KFLOATINITDSS_NA( DST,DST,DST,COUNT,PRE,POST )
#define KFLOATINITDS_NA( DST, SRC0, COUNT, PRE, POST ) KFLOATINITDSS_NA( DST,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITDSS_NA( DST, SRC0, SRC1, COUNT, PRE, POST )\
__asm mov eax,COUNT \
__asm mov ebx,COUNT \
__asm and ebx,0xfffffff8 \
#define KMOVDS1( DST, SRC0 ) \
__asm movss xmm2,SRC0 \
#define KMOVDS4( DST, SRC0 ) \
__asm movups xmm2,SRC0 \
__asm movups DST,xmm2
#define KMINDS1( DST, SRC0 ) \
__asm movss xmm2,SRC0 \
#define KMAXDS1( DST, SRC0 ) \
__asm movss xmm2,SRC0 \

#define KALUDSS1( OP, DST, SRC0, SRC1 ) \
__asm movss xmm2,SRC0 \
__asm OP##ss xmm2,SRC1 \
#define KALUDSS4( OP, DST, SRC0, SRC1 ) \
__asm movups xmm2,SRC0 \
__asm movups xmm3,SRC1 \
__asm OP##ps xmm2,xmm3 \
__asm movups DST,xmm2

#define KADDDSS1( DST, SRC0, SRC1 ) KALUDSS1( add, DST,SRC0,SRC1 )
#define KADDDSS4( DST, SRC0, SRC1 ) KALUDSS4( add, DST,SRC0,SRC1 )
#define KSUBDSS1( DST, SRC0, SRC1 ) KALUDSS1( sub, DST,SRC0,SRC1 )
#define KSUBDSS4( DST, SRC0, SRC1 ) KALUDSS4( sub, DST,SRC0,SRC1 )
#define KMULDSS1( DST, SRC0, SRC1 ) KALUDSS1( mul, DST,SRC0,SRC1 )
#define KMULDSS4( DST, SRC0, SRC1 ) KALUDSS4( mul, DST,SRC0,SRC1 )
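// The KDIVDSS macros below divide through a reciprocal estimate: rcpss/rcpps give
// r0 ~= 1/SRC1, the mul/add/sub sequence applies one Newton-Raphson refinement,
// r1 = r0 * ( 2 - SRC1 * r0 ), and the refined reciprocal is multiplied by SRC0.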
#define KDIVDSS1( DST, SRC0, SRC1 ) \
__asm movss xmm2,SRC1 \
__asm rcpss xmm3,xmm2 \
__asm mulss xmm2,xmm3 \
__asm mulss xmm2,xmm3 \
__asm addss xmm3,xmm3 \
__asm subss xmm3,xmm2 \
__asm mulss xmm3,SRC0 \
#define KDIVDSS4( DST, SRC0, SRC1 ) \
__asm movups xmm2,SRC1 \
__asm rcpps xmm3,xmm2 \
__asm mulps xmm2,xmm3 \
__asm mulps xmm2,xmm3 \
__asm addps xmm3,xmm3 \
__asm subps xmm3,xmm2 \
__asm movups xmm2,SRC0 \
__asm mulps xmm3,xmm2 \
__asm movups DST,xmm3
#define KF2IDS1( SRC0 ) \
__asm movss xmm2,SRC0 \
__asm cvttps2pi mm2,xmm2 \
__asm movd [edi+ebx],mm2
#define KF2IDS4( SRC0 ) \
__asm movups xmm2,SRC0 \
__asm cvttps2pi mm2,xmm2 \
__asm movq [edi+ebx+0],mm2 \
__asm shufps xmm2,xmm2,SHUFFLEPS(1,0,3,2) \
__asm cvttps2pi mm2,xmm2 \
__asm movq [edi+ebx+8],mm2
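// The KISQRTDS macros below refine the rsqrtss/rsqrtps estimate r0 with one
// Newton-Raphson step; xmm1 and xmm0 are expected to hold the iteration constants
// (presumably 3.0 and -0.5, as in SSE_InvSqrt further down), giving
// DST = xmm0 * r0 * ( SRC0 * r0 * r0 - xmm1 ).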
#define KISQRTDS1( DST,SRC0 ) \
__asm movss xmm2,SRC0 \
__asm rsqrtss xmm3,xmm2 \
__asm mulss xmm2,xmm3 \
__asm mulss xmm2,xmm3 \
__asm subss xmm2,xmm1 \
__asm mulss xmm3,xmm0 \
__asm mulss xmm3,xmm2 \
#define KISQRTDS4( DST,SRC0 ) \
__asm movups xmm2,SRC0 \
__asm rsqrtps xmm3,xmm2 \
__asm mulps xmm2,xmm3 \
__asm mulps xmm2,xmm3 \
__asm subps xmm2,xmm1 \
__asm mulps xmm3,xmm0 \
__asm mulps xmm3,xmm2 \
__asm movups DST,xmm3
#define KANDREGDSV( DST, SRC0, VALUE ) \

#define KEXPANDFLOAT( DST, SRC ) \
__asm movss DST,SRC \
__asm shufps DST,DST,0

#define KADDDS1( DST,SRC ) KADDDSS1( DST,DST,SRC )
#define KADDDS4( DST,SRC ) KADDDSS4( DST,DST,SRC )
#define KSUBDS1( DST,SRC ) KSUBDSS1( DST,DST,SRC )
#define KSUBDS4( DST,SRC ) KSUBDSS4( DST,DST,SRC )
#define KMULDS1( DST,SRC ) KMULDSS1( DST,DST,SRC )
#define KMULDS4( DST,SRC ) KMULDSS4( DST,DST,SRC )
#define KDIVDS1( DST,SRC ) KDIVDSS1( DST,DST,SRC )
#define KDIVDS4( DST,SRC ) KDIVDSS4( DST,DST,SRC )

#define KFLOATOPER( OPER, OPER4, COUNT ) \
__asm mov ebx,COUNT \
__asm cmovl ecx,COUNT \
__asm mov ebx,COUNT \

#define KFLOAT_CA( ALUOP, DST, SRC, CONSTANT, COUNT ) \
__asm movss xmm0,CONSTANT \
__asm shufps xmm0,xmm0,0 \
KFLOATINITDS( DST, SRC, COUNT, pre, post ) \
__asm prefetchnta [edx+ebx+64] \
__asm movaps xmm1,xmm0 \
__asm movaps xmm2,xmm0 \
__asm ALUOP##ps xmm1,[edx+ebx] \
__asm ALUOP##ps xmm2,[edx+ebx+16] \
__asm movaps [edi+ebx],xmm1 \
__asm movaps [edi+ebx+16],xmm2 \
__asm prefetchnta [edx+ebx+64] \
__asm movaps xmm1,xmm0 \
__asm movaps xmm2,xmm0 \
__asm movups xmm3,[edx+ebx] \
__asm movups xmm4,[edx+ebx+16] \
__asm ALUOP##ps xmm1,xmm3 \
__asm ALUOP##ps xmm2,xmm4 \
__asm movaps [edi+ebx],xmm1 \
__asm movaps [edi+ebx+16],xmm2 \
__asm KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), \
__asm KALUDSS4( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), COUNT )

#define KFLOAT_AA( ALUOP, DST, SRC0, SRC1, COUNT ) \
KFLOATINITDSS( DST, SRC0, SRC1, COUNT, pre, post ) \
__asm movaps xmm1,[edx+ebx] \
__asm movaps xmm2,[edx+ebx+16] \
__asm ALUOP##ps xmm1,[esi+ebx] \
__asm ALUOP##ps xmm2,[esi+ebx+16] \
__asm prefetchnta [edx+ebx+64] \
__asm prefetchnta [esi+ebx+64] \
__asm movaps [edi+ebx],xmm1 \
__asm movaps [edi+ebx+16],xmm2 \
__asm movups xmm1,[edx+ebx] \
__asm movups xmm2,[edx+ebx+16] \
__asm movups xmm3,[esi+ebx] \
__asm movups xmm4,[esi+ebx+16] \
__asm prefetchnta [edx+ebx+64] \
__asm prefetchnta [esi+ebx+64] \
__asm ALUOP##ps xmm1,xmm3 \
__asm ALUOP##ps xmm2,xmm4 \
__asm movaps [edi+ebx],xmm1 \
__asm movaps [edi+ebx+16],xmm2 \
KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), \
KALUDSS4( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), COUNT )
#define DRAWVERT_SIZE 60
#define DRAWVERT_XYZ_OFFSET (0*4)
#define DRAWVERT_ST_OFFSET (3*4)
#define DRAWVERT_NORMAL_OFFSET (5*4)
#define DRAWVERT_TANGENT0_OFFSET (8*4)
#define DRAWVERT_TANGENT1_OFFSET (11*4)
#define DRAWVERT_COLOR_OFFSET (14*4)

#define JOINTQUAT_SIZE (7*4)
#define JOINTMAT_SIZE (4*3*4)
#define JOINTWEIGHT_SIZE (4*4)

#define ALIGN4_INIT1( X, INIT ) ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT }
#define ALIGN4_INIT4( X, I0, I1, I2, I3 ) ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
#define ALIGN8_INIT1( X, INIT ) ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }
ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );

ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle0, (3<<0)|(2<<8)|(1<<16)|(0<<24) );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle1, (0<<0)|(1<<8)|(2<<16)|(3<<24) );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle2, (1<<0)|(0<<8)|(3<<16)|(2<<24) );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle3, (2<<0)|(3<<8)|(0<<16)|(1<<24) );

ALIGN4_INIT4( unsigned long SIMD_SP_singleSignBitMask, (unsigned long) ( 1 << 31 ), 0, 0, 0 );
ALIGN4_INIT1( unsigned long SIMD_SP_signBitMask, (unsigned long) ( 1 << 31 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_absMask, (unsigned long) ~( 1 << 31 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_infinityMask, (unsigned long) ~( 1 << 23 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_not, 0xFFFFFFFF );
ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
ALIGN4_INIT1( float SIMD_SP_half, 0.5f );
ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );

ALIGN4_INIT4( float SIMD_SP_lastOne, 0.0f, 0.0f, 0.0f, 1.0f );
ALIGN4_INIT1( float SIMD_SP_rsqrt_c0, 3.0f );
ALIGN4_INIT1( float SIMD_SP_rsqrt_c1, -0.5f );
ALIGN4_INIT1( float SIMD_SP_mat2quat_rsqrt_c1, -0.5f*0.5f );

ALIGN4_INIT1( float SIMD_SP_sin_c0, -2.39e-08f );
ALIGN4_INIT1( float SIMD_SP_sin_c1, 2.7526e-06f );
ALIGN4_INIT1( float SIMD_SP_sin_c2, -1.98409e-04f );
ALIGN4_INIT1( float SIMD_SP_sin_c3, 8.3333315e-03f );
ALIGN4_INIT1( float SIMD_SP_sin_c4, -1.666666664e-01f );

ALIGN4_INIT1( float SIMD_SP_cos_c0, -2.605e-07f );
ALIGN4_INIT1( float SIMD_SP_cos_c1, 2.47609e-05f );
ALIGN4_INIT1( float SIMD_SP_cos_c2, -1.3888397e-03f );
ALIGN4_INIT1( float SIMD_SP_cos_c3, 4.16666418e-02f );
ALIGN4_INIT1( float SIMD_SP_cos_c4, -4.999999963e-01f );

ALIGN4_INIT1( float SIMD_SP_atan_c0, 0.0028662257f );
ALIGN4_INIT1( float SIMD_SP_atan_c1, -0.0161657367f );
ALIGN4_INIT1( float SIMD_SP_atan_c2, 0.0429096138f );
ALIGN4_INIT1( float SIMD_SP_atan_c3, -0.0752896400f );
ALIGN4_INIT1( float SIMD_SP_atan_c4, 0.1065626393f );
ALIGN4_INIT1( float SIMD_SP_atan_c5, -0.1420889944f );
ALIGN4_INIT1( float SIMD_SP_atan_c6, 0.1999355085f );
ALIGN4_INIT1( float SIMD_SP_atan_c7, -0.3333314528f );
float SSE_InvSqrt( float x ) {
subss xmm0, SIMD_SP_rsqrt_c0
mulss xmm1, SIMD_SP_rsqrt_c1
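// With SIMD_SP_rsqrt_c0 = 3.0f and SIMD_SP_rsqrt_c1 = -0.5f, the subss/mulss pair
// above forms part of the Newton-Raphson refinement of the rsqrtss estimate r0:
// 1/sqrt(x) ~= -0.5f * r0 * ( x * r0 * r0 - 3.0f ).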
void SSE_InvSqrt4( float x[4] ) {
subps xmm0, SIMD_SP_rsqrt_c0
mulps xmm1, SIMD_SP_rsqrt_c1

float SSE_SinZeroHalfPI( float a ) {
movss xmm2, SIMD_SP_sin_c0
addss xmm2, SIMD_SP_sin_c1
addss xmm2, SIMD_SP_sin_c2
addss xmm2, SIMD_SP_sin_c3
addss xmm2, SIMD_SP_sin_c4
addss xmm2, SIMD_SP_one
t += 8.3333315e-03f;
t += -1.666666664e-01f;
void SSE_Sin4ZeroHalfPI( float a[4], float s[4] ) {
movaps xmm2, SIMD_SP_sin_c0
addps xmm2, SIMD_SP_sin_c1
addps xmm2, SIMD_SP_sin_c2
addps xmm2, SIMD_SP_sin_c3
addps xmm2, SIMD_SP_sin_c4
addps xmm2, SIMD_SP_one

float SSE_Sin( float a ) {
mulss xmm2, SIMD_SP_oneOverTwoPI
cmpltss xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
mulss xmm2, SIMD_SP_twoPI
movss xmm0, SIMD_SP_PI
andps xmm1, SIMD_SP_signBitMask
cmpnltss xmm2, SIMD_SP_halfPI
movss xmm3, SIMD_SP_PI
andps xmm2, SIMD_SP_signBitMask
movss xmm2, SIMD_SP_sin_c0
addss xmm2, SIMD_SP_sin_c1
addss xmm2, SIMD_SP_sin_c2
addss xmm2, SIMD_SP_sin_c3
addss xmm2, SIMD_SP_sin_c4
addss xmm2, SIMD_SP_one
t += 8.3333315e-03f;
t += -1.666666664e-01f;
void SSE_Sin4( float a[4], float s[4] ) {
mulps xmm2, SIMD_SP_oneOverTwoPI
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
cmpltps xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
mulps xmm2, SIMD_SP_twoPI
movaps xmm0, SIMD_SP_PI
andps xmm1, SIMD_SP_signBitMask
cmpnltps xmm2, SIMD_SP_halfPI
movaps xmm3, SIMD_SP_PI
andps xmm2, SIMD_SP_signBitMask
movaps xmm2, SIMD_SP_sin_c0
addps xmm2, SIMD_SP_sin_c1
addps xmm2, SIMD_SP_sin_c2
addps xmm2, SIMD_SP_sin_c3
addps xmm2, SIMD_SP_sin_c4
addps xmm2, SIMD_SP_one

float SSE_CosZeroHalfPI( float a ) {
movss xmm1, SIMD_SP_cos_c0
addss xmm1, SIMD_SP_cos_c1
addss xmm1, SIMD_SP_cos_c2
addss xmm1, SIMD_SP_cos_c3
addss xmm1, SIMD_SP_cos_c4
addss xmm1, SIMD_SP_one
t += -1.3888397e-03f;
t += 4.16666418e-02f;
t += -4.999999963e-01f;
void SSE_Cos4ZeroHalfPI( float a[4], float c[4] ) {
movaps xmm1, SIMD_SP_cos_c0
addps xmm1, SIMD_SP_cos_c1
addps xmm1, SIMD_SP_cos_c2
addps xmm1, SIMD_SP_cos_c3
addps xmm1, SIMD_SP_cos_c4
addps xmm1, SIMD_SP_one

float SSE_Cos( float a ) {
mulss xmm2, SIMD_SP_oneOverTwoPI
cmpltss xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
mulss xmm2, SIMD_SP_twoPI
movss xmm0, SIMD_SP_PI
andps xmm1, SIMD_SP_signBitMask
cmpnltss xmm2, SIMD_SP_halfPI
movss xmm3, SIMD_SP_PI
andps xmm2, SIMD_SP_signBitMask
movss xmm1, SIMD_SP_cos_c0
addss xmm1, SIMD_SP_cos_c1
addss xmm1, SIMD_SP_cos_c2
addss xmm1, SIMD_SP_cos_c3
addss xmm1, SIMD_SP_cos_c4
addss xmm1, SIMD_SP_one
xorps xmm2, SIMD_SP_signBitMask
t += -1.3888397e-03f;
t += 4.16666418e-02f;
t += -4.999999963e-01f;
void SSE_Cos4( float a[4], float c[4] ) {
mulps xmm2, SIMD_SP_oneOverTwoPI
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
cmpltps xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
mulps xmm2, SIMD_SP_twoPI
movaps xmm0, SIMD_SP_PI
andps xmm1, SIMD_SP_signBitMask
cmpnltps xmm2, SIMD_SP_halfPI
movaps xmm3, SIMD_SP_PI
andps xmm2, SIMD_SP_signBitMask
movaps xmm1, SIMD_SP_cos_c0
addps xmm1, SIMD_SP_cos_c1
addps xmm1, SIMD_SP_cos_c2
addps xmm1, SIMD_SP_cos_c3
addps xmm1, SIMD_SP_cos_c4
addps xmm1, SIMD_SP_one
xorps xmm2, SIMD_SP_signBitMask

void SSE_SinCos( float a, float &s, float &c ) {
mulss xmm2, SIMD_SP_oneOverTwoPI
cmpltss xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
mulss xmm2, SIMD_SP_twoPI
movss xmm0, SIMD_SP_PI
andps xmm1, SIMD_SP_signBitMask
cmpnltss xmm2, SIMD_SP_halfPI
movss xmm3, SIMD_SP_PI
andps xmm2, SIMD_SP_signBitMask
movss xmm3, SIMD_SP_sin_c0
movss xmm4, SIMD_SP_cos_c0
addss xmm3, SIMD_SP_sin_c1
addss xmm4, SIMD_SP_cos_c1
addss xmm3, SIMD_SP_sin_c2
addss xmm4, SIMD_SP_cos_c2
addss xmm3, SIMD_SP_sin_c3
addss xmm4, SIMD_SP_cos_c3
addss xmm3, SIMD_SP_sin_c4
addss xmm4, SIMD_SP_cos_c4
addss xmm3, SIMD_SP_one
addss xmm4, SIMD_SP_one
xorps xmm2, SIMD_SP_signBitMask
void SSE_SinCos4( float a[4], float s[4], float c[4] ) {
mulps xmm2, SIMD_SP_oneOverTwoPI
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
cmpltps xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
mulps xmm2, SIMD_SP_twoPI
movaps xmm0, SIMD_SP_PI
andps xmm1, SIMD_SP_signBitMask
cmpnltps xmm2, SIMD_SP_halfPI
movaps xmm3, SIMD_SP_PI
andps xmm2, SIMD_SP_signBitMask
movaps xmm3, SIMD_SP_sin_c0
movaps xmm4, SIMD_SP_cos_c0
addps xmm3, SIMD_SP_sin_c1
addps xmm4, SIMD_SP_cos_c1
addps xmm3, SIMD_SP_sin_c2
addps xmm4, SIMD_SP_cos_c2
addps xmm3, SIMD_SP_sin_c3
addps xmm4, SIMD_SP_cos_c3
addps xmm3, SIMD_SP_sin_c4
addps xmm4, SIMD_SP_cos_c4
addps xmm3, SIMD_SP_one
addps xmm4, SIMD_SP_one
xorps xmm2, SIMD_SP_signBitMask

float SSE_ATanPositive( float y, float x ) {
andps xmm1, SIMD_SP_signBitMask
andps xmm3, SIMD_SP_halfPI
movss xmm2, SIMD_SP_atan_c0
addss xmm2, SIMD_SP_atan_c1
addss xmm2, SIMD_SP_atan_c2
addss xmm2, SIMD_SP_atan_c3
addss xmm2, SIMD_SP_atan_c4
addss xmm2, SIMD_SP_atan_c5
addss xmm2, SIMD_SP_atan_c6
addss xmm2, SIMD_SP_atan_c7
addss xmm2, SIMD_SP_one
t += -0.0161657367f;
t += -0.0752896400f;
t += -0.1420889944f;
t += -0.3333314528f;
void SSE_ATan4Positive( float y[4], float x[4], float at[4] ) {
andps xmm1, SIMD_SP_signBitMask
andps xmm3, SIMD_SP_halfPI
movaps xmm2, SIMD_SP_atan_c0
addps xmm2, SIMD_SP_atan_c1
addps xmm2, SIMD_SP_atan_c2
addps xmm2, SIMD_SP_atan_c3
addps xmm2, SIMD_SP_atan_c4
addps xmm2, SIMD_SP_atan_c5
addps xmm2, SIMD_SP_atan_c6
addps xmm2, SIMD_SP_atan_c7
addps xmm2, SIMD_SP_one

float SSE_ATan( float y, float x ) {
andps xmm0, SIMD_SP_absMask
andps xmm1, SIMD_SP_absMask
andps xmm4, SIMD_SP_signBitMask
andps xmm1, SIMD_SP_signBitMask
orps xmm4, SIMD_SP_halfPI
movss xmm2, SIMD_SP_atan_c0
addss xmm2, SIMD_SP_atan_c1
addss xmm2, SIMD_SP_atan_c2
addss xmm2, SIMD_SP_atan_c3
addss xmm2, SIMD_SP_atan_c4
addss xmm2, SIMD_SP_atan_c5
addss xmm2, SIMD_SP_atan_c6
addss xmm2, SIMD_SP_atan_c7
addss xmm2, SIMD_SP_one
if ( fabs( y ) > fabs( x ) ) {
*((unsigned long *)&d) ^= ( *((unsigned long *)&x) ^ *((unsigned long *)&y) ) & (1<<31);
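// Branch-free sign fixup: XORing the sign bits of x and y into d flips the sign of
// the result exactly when x and y have opposite signs.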
t += -0.0161657367f;
t += -0.0752896400f;
t += -0.1420889944f;
t += -0.3333314528f;

void SSE_ATan4( float y[4], float x[4], float at[4] ) {
andps xmm0, SIMD_SP_absMask
andps xmm1, SIMD_SP_absMask
andps xmm4, SIMD_SP_signBitMask
andps xmm1, SIMD_SP_signBitMask
orps xmm4, SIMD_SP_halfPI
movaps xmm2, SIMD_SP_atan_c0
addps xmm2, SIMD_SP_atan_c1
addps xmm2, SIMD_SP_atan_c2
addps xmm2, SIMD_SP_atan_c3
addps xmm2, SIMD_SP_atan_c4
addps xmm2, SIMD_SP_atan_c5
addps xmm2, SIMD_SP_atan_c6
addps xmm2, SIMD_SP_atan_c7
addps xmm2, SIMD_SP_one
void SSE_TestTrigonometry( void ) {
float a, s1, s2, c1, c2;

for ( i = 0; i < 100; i++ ) {
s2 = SSE_SinZeroHalfPI( a );
if ( fabs( s1 - s2 ) > 1e-7f ) {
c2 = SSE_CosZeroHalfPI( a );
if ( fabs( c1 - c2 ) > 1e-7f ) {

for ( i = -200; i < 200; i++ ) {
if ( fabs( s1 - s2 ) > 1e-6f ) {
if ( fabs( c1 - c2 ) > 1e-6f ) {
SSE_SinCos( a, s2, c2 );
if ( fabs( s1 - s2 ) > 1e-6f || fabs( c1 - c2 ) > 1e-6f ) {
void VPCALL idSIMD_SSE::Add( float *dst, const float constant, const float *src, const int count ) {
KFLOAT_CA( add, dst, src, constant, count )

KFLOAT_AA( add, dst, src0, src1, count )

void VPCALL idSIMD_SSE::Sub( float *dst, const float constant, const float *src, const int count ) {
KFLOAT_CA( sub, dst, src, constant, count )

KFLOAT_AA( sub, dst, src0, src1, count )

void VPCALL idSIMD_SSE::Mul( float *dst, const float constant, const float *src, const int count ) {
KFLOAT_CA( mul, dst, src, constant, count )

KFLOAT_AA( mul, dst, src0, src1, count )
void VPCALL idSIMD_SSE::Div( float *dst, const float constant, const float *src, const int count ) {
KFLOATINITDS( dst, src, count, pre, post )
movaps xmm2,[edx+ebx]
movaps xmm3,[edx+ebx+16]
prefetchnta [edx+ebx+64]
movaps [edi+ebx],xmm4
movaps [edi+ebx+16],xmm5
movups xmm2,[edx+ebx]
movups xmm3,[edx+ebx+16]
prefetchnta [edx+ebx+64]
movaps [edi+ebx],xmm4
movaps [edi+ebx+16],xmm5
KFLOATOPER( KDIVDSS1( [edi+ebx],xmm1,[edx+ebx] ),
KDIVDSS4( [edi+ebx],xmm1,[edx+ebx] ), count )

KFLOATINITDSS( dst, src0, src1, count, pre, post )
movaps xmm2,[esi+ebx]
movaps xmm3,[esi+ebx+16]
prefetchnta [esi+ebx+64]
mulps xmm4,[edx+ebx]
mulps xmm5,[edx+ebx+16]
movaps [edi+ebx],xmm4
movaps [edi+ebx+16],xmm5
movups xmm2,[esi+ebx]
movups xmm3,[esi+ebx+16]
prefetchnta [esi+ebx+64]
movups xmm2,[edx+ebx]
movups xmm3,[edx+ebx+16]
movaps [edi+ebx],xmm4
movaps [edi+ebx+16],xmm5
KFLOATOPER( KDIVDSS1( [edi+ebx],[edx+ebx],[esi+ebx] ),
KDIVDSS4( [edi+ebx],[edx+ebx],[esi+ebx] ), count )
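// Simd_MulAdd below computes dst[i] += constant * src[i]: the constant is broadcast
// into xmm1 and the bulk of the array is processed four floats per iteration, with
// separate 16-byte-aligned (movaps) and 8-byte (movlps/movhps) SSE paths and x87
// pre/post loops for the unaligned head and tail elements.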
static void Simd_MulAdd( float *dst, const float constant, const float *src, const int count ) {
__asm mov eax, count
__asm jz SimdMulAdd16
__asm jnz SimdMulAdd8

__asm loopPreMulAdd16:
__asm fmul dword ptr [edi+ecx]
__asm fadd dword ptr [esi+ecx]
__asm fstp dword ptr [esi+ecx]
__asm jl loopPreMulAdd16

__asm movss xmm1, constant
__asm shufps xmm1, xmm1, 0x00
__asm movaps xmm0, [edi+eax]
__asm mulps xmm0, xmm1
__asm addps xmm0, [esi+eax]
__asm movaps [esi+eax], xmm0
__asm jl loopMulAdd16
__asm jmp postMulAdd

__asm jz SimdMulAdd8

__asm loopPreMulAdd8:
__asm fmul dword ptr [edi+ecx]
__asm fadd dword ptr [esi+ecx]
__asm fstp dword ptr [esi+ecx]
__asm jl loopPreMulAdd8

__asm movss xmm1, constant
__asm shufps xmm1, xmm1, 0x00
__asm movlps xmm0, [edi+eax]
__asm movhps xmm0, [edi+eax+8]
__asm mulps xmm0, xmm1
__asm movlps xmm2, [esi+eax]
__asm movhps xmm2, [esi+eax+8]
__asm addps xmm0, xmm2
__asm movlps [esi+eax], xmm0
__asm movhps [esi+eax+8], xmm0
__asm jl loopMulAdd8
__asm jmp postMulAdd

__asm loopPostMulAdd:
__asm fmul dword ptr [edi+edx]
__asm fadd dword ptr [esi+edx]
__asm fstp dword ptr [esi+edx]
__asm jl loopPostMulAdd
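// MULADD_FEW below expands dst[i] OPER c * src[i] as straight-line scalar code for
// small element counts, so the SSE setup cost in Simd_MulAdd is presumably only paid
// for longer arrays.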
#define MULADD_FEW( OPER ) \
dst[0] OPER c * src[0]; \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
dst[8] OPER c * src[8]; \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; dst[10] OPER c * src[10]; \
Simd_MulAdd( dst, constant, src, count );

for ( int i = 0; i < count; i++ ) {
dst[i] += src0[i] * src1[i];

Simd_MulAdd( dst, -constant, src, count );

for ( int i = 0; i < count; i++ ) {
dst[i] -= src0[i] * src1[i];
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )

movlps xmm1, [esi+eax+ 0]
movlps xmm2, [esi+eax+ 8]
movlps xmm3, [esi+eax+16]
movhps xmm1, [esi+eax+24]
movhps xmm2, [esi+eax+32]
movhps xmm3, [esi+eax+40]
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
shufps xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )

shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )
movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0

movss xmm0, [esi+eax+0]
movss xmm1, [esi+eax+4]
movss xmm2, [esi+eax+8]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

movlps xmm1, [esi+eax+ 0]
movlps xmm3, [esi+eax+ 8]
movhps xmm1, [esi+eax+16]
movhps xmm3, [esi+eax+24]
movlps xmm2, [esi+eax+32]
movlps xmm4, [esi+eax+40]
movhps xmm2, [esi+eax+48]
movhps xmm4, [esi+eax+56]
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
shufps xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )

movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0

movss xmm0, [esi+eax+0]
movss xmm1, [esi+eax+4]
movss xmm2, [esi+eax+8]
addss xmm0, [esi+eax+12]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
imul eax, DRAWVERT_SIZE

movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )
movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )
movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )
add eax, 4*DRAWVERT_SIZE

movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0

movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
add eax, DRAWVERT_SIZE
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [edi+12]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

movlps xmm1, [esi+eax+ 0]
movlps xmm2, [esi+eax+ 8]
movlps xmm3, [esi+eax+16]
movhps xmm1, [esi+eax+24]
movhps xmm2, [esi+eax+32]
movhps xmm3, [esi+eax+40]
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
shufps xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )

shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )
movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0

movss xmm0, [esi+eax+0]
movss xmm1, [esi+eax+4]
movss xmm2, [esi+eax+8]
#define SINGLE_OP(SRC, DEST) \
__asm movlps xmm0,[SRC] \
__asm movlps xmm1,[SRC+8] \
__asm mulps xmm0,xmm4 \
__asm mulps xmm1,xmm5 \
__asm addps xmm0,xmm1 \
__asm movaps xmm1,xmm0 \
__asm shufps xmm1,xmm1,SHUFFLEPS(1,1,1,1) \
__asm addss xmm0,xmm1 \
__asm movss [DEST],xmm0 \

#define DUAL_OP(SRC, DEST) \
__asm movlps xmm0,[SRC] \
__asm movlps xmm1,[SRC+8] \
__asm movhps xmm0,[SRC+16] \
__asm movhps xmm1,[SRC+24] \
__asm mulps xmm0,xmm4 \
__asm mulps xmm1,xmm5 \
__asm addps xmm0,xmm1 \
__asm shufps xmm1,xmm0,SHUFFLEPS(2,0,1,0) \
__asm shufps xmm0,xmm0,SHUFFLEPS(3,1,2,0) \
__asm addps xmm0,xmm1 \
__asm movhps [DEST],xmm0 \

shufps xmm4, xmm4, SHUFFLEPS(1,0,1,0)
movlps xmm5, [ebx+8]
shufps xmm5, xmm5, SHUFFLEPS(1,0,1,0)

lea eax, [eax+ecx*4]

movlps xmm0, [eax+ecx*4]
movhps xmm0, [eax+ecx*4+16]
movlps xmm2, [eax+ecx*4+32]
movhps xmm2, [eax+ecx*4+48]

prefetchnta [eax+ecx*4+128]
movlps xmm0, [eax+ecx*4]
movhps xmm0, [eax+ecx*4+16]
movlps xmm2, [eax+ecx*4+32]
movhps xmm2, [eax+ecx*4+48]
movaps [edx+ecx-16],xmm1
movlps xmm1, [eax+ecx*4+8]
movhps xmm1, [eax+ecx*4+24]
movlps xmm3, [eax+ecx*4+40]
movhps xmm3, [eax+ecx*4+56]

shufps xmm0, xmm2, SHUFFLEPS(2,0,2,0)
shufps xmm1, xmm2, SHUFFLEPS(3,1,3,1)
movaps [edx+ecx-16], xmm1
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [edi+12]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
imul eax, DRAWVERT_SIZE

movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )
movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )
movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )
add eax, 4*DRAWVERT_SIZE

movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0

movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
add eax, DRAWVERT_SIZE
movlps xmm0, [esi+eax]
movlps xmm3, [edi+eax]
movlps xmm1, [esi+eax+8]
movlps xmm4, [edi+eax+8]
movhps xmm0, [esi+eax+24]
movhps xmm3, [edi+eax+24]
movhps xmm1, [esi+eax+32]
movhps xmm4, [edi+eax+32]
movlps xmm2, [esi+eax+16]
movlps xmm5, [edi+eax+16]
movhps xmm2, [esi+eax+40]
movhps xmm5, [edi+eax+40]

shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
shufps xmm0, xmm2, R_SHUFFLEPS( 1, 3, 0, 2 )
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )

shufps xmm7, xmm7, R_SHUFFLEPS( 0, 2, 1, 3 )
movlps [ecx-16+0], xmm7
movhps [ecx-16+8], xmm7

movss xmm0, [esi+eax+0]
movss xmm3, [edi+eax+0]
movss xmm1, [esi+eax+4]
movss xmm4, [edi+eax+4]
movss xmm2, [esi+eax+8]
movss xmm5, [edi+eax+8]

dot = src1[0] * src2[0];
dot = src1[0] * src2[0] + src1[1] * src2[1];
dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2];
movups xmm0, [ecx+eax]
movups xmm1, [edx+eax]
movups xmm1, [ecx+eax]
movups xmm2, [edx+eax]

movaps xmm0, [ecx+eax]
movaps xmm1, [edx+eax]
movaps xmm1, [ecx+eax]
movaps xmm2, [edx+eax]

switch( count & 3 ) {
movhps xmm1, [ecx+4]
movhps xmm2, [edx+4]
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
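// COMPARECONSTANT below compares four floats at a time against a broadcast constant:
// movmskps collects the four comparison results into four mask bits, which are then
// spread across the four bytes of ebx (masked with 0x01010101) and stored as byte
// booleans in DST, while the pre/post scalar loops handle the unaligned ends.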
#define COMPARECONSTANT( DST, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP ) \
int i, cnt, pre, post; \
if ( ((int) SRC0) & 3 ) { \
post = COUNT - (cnt<<2); \
__asm mov edx, cnt \
__asm test edx, edx \
__asm mov esi, SRC0 \
__asm prefetchnta [esi+64] \
__asm movss xmm1, CONSTANT \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mov edi, DST \
__asm mov ecx, 0x01010101 \
__asm movups xmm0, [esi] \
__asm prefetchnta [esi+128] \
__asm CMPSIMD xmm0, xmm1 \
__asm movmskps eax, xmm0 \
__asm and ebx, ecx \
__asm mov dword ptr [edi], ebx \
aligned = (float *) ((((int) SRC0) + 15) & ~15); \
if ( (int)aligned > ((int)src0) + COUNT ) { \
pre = aligned - SRC0; \
cnt = (COUNT - pre) >> 2; \
post = COUNT - pre - (cnt<<2); \
__asm mov edx, cnt \
__asm test edx, edx \
__asm mov esi, aligned \
__asm prefetchnta [esi+64] \
__asm movss xmm1, CONSTANT \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mov edi, DST \
__asm add edi, pre \
__asm mov ecx, 0x01010101 \
__asm movaps xmm0, [esi] \
__asm prefetchnta [esi+128] \
__asm CMPSIMD xmm0, xmm1 \
__asm movmskps eax, xmm0 \
__asm and ebx, ecx \
__asm mov dword ptr [edi], ebx \
double c = constant; \
for ( i = 0; i < pre; i++ ) { \
dst[i] = src0[i] CMP c; \
for ( i = count - post; i < count; i++ ) { \
dst[i] = src0[i] CMP c; \
#define COMPAREBITCONSTANT( DST, BITNUM, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP ) \
int i, cnt, pre, post; \
if ( ((int) SRC0) & 3 ) { \
post = COUNT - (cnt<<2); \
__asm mov edx, cnt \
__asm test edx, edx \
__asm mov esi, SRC0 \
__asm prefetchnta [esi+64] \
__asm movss xmm1, CONSTANT \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mov edi, DST \
__asm mov cl, bitNum \
__asm movups xmm0, [esi] \
__asm prefetchnta [esi+128] \
__asm CMPSIMD xmm0, xmm1 \
__asm movmskps eax, xmm0 \
__asm and ebx, 0x01010101 \
__asm or ebx, dword ptr [edi] \
__asm mov dword ptr [edi], ebx \
aligned = (float *) ((((int) SRC0) + 15) & ~15); \
if ( (int)aligned > ((int)src0) + COUNT ) { \
pre = aligned - SRC0; \
cnt = (COUNT - pre) >> 2; \
post = COUNT - pre - (cnt<<2); \
__asm mov edx, cnt \
__asm test edx, edx \
__asm mov esi, aligned \
__asm prefetchnta [esi+64] \
__asm movss xmm1, CONSTANT \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mov edi, DST \
__asm add edi, pre \
__asm mov cl, bitNum \
__asm movaps xmm0, [esi] \
__asm prefetchnta [esi+128] \
__asm CMPSIMD xmm0, xmm1 \
__asm movmskps eax, xmm0 \
__asm and ebx, 0x01010101 \
__asm or ebx, dword ptr [edi] \
__asm mov dword ptr [edi], ebx \
float c = constant; \
for ( i = 0; i < pre; i++ ) { \
dst[i] |= ( src0[i] CMP c ) << BITNUM; \
for ( i = count - post; i < count; i++ ) { \
dst[i] |= ( src0[i] CMP c ) << BITNUM; \
COMPARECONSTANT( dst, src0, constant, count, >, cmpnleps, NOFLIP )
COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >, cmpnleps, NOFLIP )
COMPARECONSTANT( dst, src0, constant, count, >=, cmpnltps, NOFLIP )
COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >=, cmpnltps, NOFLIP )
COMPARECONSTANT( dst, src0, constant, count, <, cmpltps, NOFLIP )
COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <, cmpltps, NOFLIP )
COMPARECONSTANT( dst, src0, constant, count, <=, cmpnleps, FLIP )
COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <=, cmpnleps, FLIP )
shufps xmm0, xmm0, 0
shufps xmm1, xmm1, 0
KFLOATINITS( src, count, pre, post )
movups xmm2, [edx+ebx]
movups xmm3, [edx+ebx+16]
prefetchnta [edx+ebx+64]
movaps xmm2, [edx+ebx]
movaps xmm3, [edx+ebx+16]
prefetchnta [edx+ebx+64]
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )

for ( i = 0; i < pre; i++ ) {
for ( i = count - post; i < count; i++ ) {
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )

movlps xmm2, [esi+eax]
movhps xmm2, [esi+eax+8]
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
shufps xmm3, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )

shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )

movss xmm4, [esi+eax+0*12+8]
movhps xmm4, [esi+eax+0*12+0]
movss xmm5, [esi+eax+1*12+0]
movhps xmm5, [esi+eax+1*12+4]
movss xmm6, [esi+eax+2*12+8]
movhps xmm6, [esi+eax+2*12+0]
movss xmm7, [esi+eax+3*12+0]
movhps xmm7, [esi+eax+3*12+4]

movss xmm4, [esi+eax+0*12+8]
movhps xmm4, [esi+eax+0*12+0]
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
imul eax, DRAWVERT_SIZE

movss xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movhps xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
movss xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movss xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movhps xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
add eax, 4*DRAWVERT_SIZE

imul eax, DRAWVERT_SIZE
movss xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
add eax, DRAWVERT_SIZE
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )

mov edx, [edi+eax+0]
imul edx, DRAWVERT_SIZE
movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
mov edx, [edi+eax+4]
imul edx, DRAWVERT_SIZE
movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
mov edx, [edi+eax+8]
imul edx, DRAWVERT_SIZE
movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
mov edx, [edi+eax+12]
imul edx, DRAWVERT_SIZE
movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]

mov edx, [edi+eax+0]
imul edx, DRAWVERT_SIZE
movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
KFLOATINITDS( dst, src, count, pre, post )
movaps xmm2,[edx+ebx]
movaps xmm3,[edx+ebx+16]
prefetchnta [edx+ebx+64]
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3
movups xmm2,[edx+ebx]
movups xmm3,[edx+ebx+16]
prefetchnta [edx+ebx+64]
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3

for ( i = 0; i < pre; i++ ) {
else if ( src[i] > max )
for ( i = count - post; i < count; i++ ) {
else if ( src[i] > max )
KFLOATINITDS( dst, src, count, pre, post )
movaps xmm2,[edx+ebx]
movaps xmm3,[edx+ebx+16]
prefetchnta [edx+ebx+64]
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3
movups xmm2,[edx+ebx]
movups xmm3,[edx+ebx+16]
prefetchnta [edx+ebx+64]
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3

for( i = 0; i < pre; i++ ) {
for( i = count - post; i < count; i++ ) {
KFLOATINITDS( dst, src, count, pre, post )
movaps xmm2,[edx+ebx]
movaps xmm3,[edx+ebx+16]
prefetchnta [edx+ebx+64]
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3
movups xmm2,[edx+ebx]
movups xmm3,[edx+ebx+16]
prefetchnta [edx+ebx+64]
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3

for( i = 0; i < pre; i++ ) {
for( i = count - post; i < count; i++ ) {
movaps [edx+eax], xmm0

movss xmm0, SIMD_SP_signBitMask
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, [edx+eax]
movaps [edx+eax], xmm1

movaps xmm0, [ecx+eax]
movaps [edx+eax], xmm0

movaps xmm0, [ecx+eax]
addps xmm0, [edx+eax]
movaps [esi+eax], xmm0

movaps xmm0, [ecx+eax]
subps xmm0, [edx+eax]
movaps [esi+eax], xmm0

movss xmm1, constant
shufps xmm1, xmm1, 0x00
movaps xmm0, [edx+eax]
movaps [ecx+eax], xmm0

movaps xmm0, [ecx+eax]
addps xmm0, [edx+eax]
movaps [ecx+eax], xmm0

movaps xmm0, [ecx+eax]
subps xmm0, [edx+eax]
movaps [ecx+eax], xmm0

movss xmm1, constant
shufps xmm1, xmm1, 0x00
movaps xmm0, [ecx+eax]
movaps [ecx+eax], xmm0
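// The STORE* macros below write one, two or four results from an xmm register to the
// destination at [eax+offset]; the extra reg2 argument is unused here but matches the
// add-and-store variants of these macros defined further down, which need it as a
// scratch register to load, add and store.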
#define STORE1( offset, reg1, reg2 ) \
__asm movss [eax+offset], reg1
#define STORE2LO( offset, reg1, reg2 ) \
__asm movlps [eax+offset], reg1
#define STORE2HI( offset, reg1, reg2 ) \
__asm movhps [eax+offset], reg1
#define STORE4( offset, reg1, reg2 ) \
__asm movlps [eax+offset], reg1 \
__asm movhps [eax+offset+8], reg1
const float *mPtr, *vPtr;

STORE1( 0, xmm0, xmm1 )

shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm1, [edi+16]
STORE4( 0, xmm0, xmm2 )
STORE2LO( 16, xmm1, xmm2 )

for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0];

STORE1( 0, xmm2, xmm4 )
mulss xmm1, [edi+8+4]
STORE1( 4, xmm0, xmm4 )

shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movaps xmm1, [edi+16]
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
movaps xmm3, [edi+32]
STORE4( 0, xmm0, xmm4 )
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
STORE2LO( 16, xmm3, xmm4 )

for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];

movss xmm3, [edi+12]
STORE1( 0, xmm4, xmm7 );
movss xmm5, [edi+12+4]
movss xmm6, [edi+12+8]
mulss xmm0, [edi+24]
mulss xmm1, [edi+24+4]
STORE1( 4, xmm3, xmm7 );
mulss xmm2, [edi+24+8]
STORE1( 8, xmm0, xmm7 );

shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm1, [edi+4*4]
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 )
movlps xmm2, [edi+6*4]
movhps xmm2, [edi+8*4]
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 )
movlps xmm3, [edi+10*4]
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 )
shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 )
shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 )
STORE4( 0, xmm0, xmm4 )
movss xmm1, [edi+12*4]
movss xmm2, [edi+13*4]
movss xmm3, [edi+14*4]
STORE1( 16, xmm1, xmm4 )
mulss xmm5, [edi+15*4]
mulss xmm6, [edi+16*4]
mulss xmm7, [edi+17*4]
STORE1( 20, xmm5, xmm4 )

for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
movlps xmm6, qword ptr [esi ]
movlps xmm0, qword ptr [edi ]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
movlps xmm3, qword ptr [edi+40]
movhps xmm3, qword ptr [edi+56]
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
STORE4( 0, xmm0, xmm2 )

movlps xmm6, qword ptr [esi+ 0]
movlps xmm0, qword ptr [edi+ 0]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
movlps xmm3, qword ptr [edi+40]
movhps xmm3, qword ptr [edi+56]
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
movlps xmm1, qword ptr [edi+64]
movhps xmm1, qword ptr [edi+80]
STORE4( 0, xmm0, xmm4 )
movlps xmm2, qword ptr [edi+72]
movhps xmm2, qword ptr [edi+88]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
STORE2LO( 16, xmm1, xmm4 )

for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
movss xmm0, [edi+5*4]
movhps xmm0, [edi+0*4]
movss xmm5, [edi+15*4]
movhps xmm5, [edi+10*4]
shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 )
movlps xmm1, [edi+6*4]
movlps xmm5, [edi+16*4]
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )
movhps xmm2, [edi+2*4]
movhps xmm5, [edi+12*4]
shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 )
movlps xmm3, [edi+8*4]
movlps xmm5, [edi+18*4]
movss xmm4, [edi+4*4]
shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )
movhps xmm5, [edi+14*4]
shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 )
movss xmm7, [esi+0*4]
shufps xmm7, xmm7, 0
movss xmm5, [esi+1*4]
shufps xmm5, xmm5, 0
movss xmm6, [esi+2*4]
shufps xmm6, xmm6, 0
movss xmm1, [esi+3*4]
shufps xmm1, xmm1, 0
movss xmm2, [esi+4*4]
shufps xmm2, xmm2, 0
mulss xmm7, [edi+20*4]
mulss xmm5, [edi+21*4]
mulss xmm6, [edi+22*4]
mulss xmm1, [edi+23*4]
mulss xmm2, [edi+24*4]
STORE4( 0, xmm0, xmm3 )
STORE1( 16, xmm7, xmm4 )

shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm3, [edi+8]
movaps xmm1, [edi+16]
movlps xmm2, [edi+32]
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )
movlps xmm2, [edi+40]
movhps xmm5, [edi+40+8]
movlps xmm3, [edi+40+16]
movhps xmm3, [edi+40+24]
movlps xmm4, [edi+40+32]
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 )
shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 )
shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 )
movss xmm5, [esi+16]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
STORE4( 0, xmm0, xmm2 )
movlps xmm4, [edi+80]
movhps xmm3, [edi+80+8]
movaps xmm1, [edi+80+16]
movlps xmm2, [edi+80+32]
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )
shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
STORE2LO( 16, xmm4, xmm2 )

for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
4837 movss xmm3, [esi+12]
4839 mulss xmm3, [edi+12]
4840 movss xmm4, [esi+16]
4842 mulss xmm4, [edi+16]
4843 movss xmm5, [esi+20]
4845 mulss xmm5, [edi+20]
4846 movss xmm6, [esi+24]
4848 mulss xmm6, [edi+24]
4850 STORE1( 0, xmm0, xmm7 )
4861 movhps xmm4, [esi+8]
4862 movlps xmm5, [esi+16]
4868 movaps xmm1, [edi+16]
4869 movaps xmm2, [edi+32]
4876 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
4878 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
4881 STORE2LO( 0, xmm0, xmm3 )
4892 movhps xmm4, [esi+8]
4893 movlps xmm5, [esi+16]
4899 movaps xmm1, [edi+16]
4900 movaps xmm2, [edi+32]
4907 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
4909 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
4912 STORE2LO( 0, xmm0, xmm3 )
4914 movaps xmm0, [edi+48]
4915 movaps xmm1, [edi+48+16]
4922 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
4924 STORE1( 8, xmm0, xmm3 )
4935 movhps xmm4, [esi+8]
4936 movlps xmm5, [esi+16]
4942 movaps xmm1, [edi+16]
4943 movaps xmm2, [edi+32]
4950 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
4953 movaps xmm0, [edi+48]
4954 movaps xmm1, [edi+48+16]
4955 movaps xmm2, [edi+48+32]
4962 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
4966 shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
4967 shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
4969 STORE4( 0, xmm0, xmm4 )
4980 movhps xmm4, [esi+8]
4981 movlps xmm5, [esi+16]
4987 movaps xmm1, [edi+16]
4988 movaps xmm2, [edi+32]
4995 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
4998 movaps xmm0, [edi+48]
4999 movaps xmm1, [edi+48+16]
5000 movaps xmm2, [edi+48+32]
5007 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
5011 shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
5012 shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
5014 STORE4( 0, xmm0, xmm3 )
5016 movaps xmm0, [edi+96]
5017 movaps xmm1, [edi+96+16]
5024 shufps xmm1, xmm1, 0x01
5026 STORE1( 16, xmm0, xmm3 )
5035 movlps xmm7, qword ptr [esi]
5036 movlps xmm6, qword ptr [esi+8]
5037 shufps xmm7, xmm7, 0x44
5038 shufps xmm6, xmm6, 0x44
5039 movlps xmm0, qword ptr [edi ]
5040 movhps xmm0, qword ptr [edi+ 24]
5042 movlps xmm3, qword ptr [edi+ 8]
5043 movhps xmm3, qword ptr [edi+ 32]
5045 movlps xmm1, qword ptr [edi+ 48]
5046 movhps xmm1, qword ptr [edi+ 72]
5048 movlps xmm2, qword ptr [edi+ 96]
5049 movhps xmm2, qword ptr [edi+120]
5051 movlps xmm4, qword ptr [edi+ 56]
5052 movhps xmm4, qword ptr [edi+ 80]
5053 movlps xmm5, qword ptr [edi+104]
5054 movhps xmm5, qword ptr [edi+128]
5056 movlps xmm7, qword ptr [esi+16]
5058 shufps xmm7, xmm7, 0x44
5061 movlps xmm3, qword ptr [edi+ 16]
5062 movhps xmm3, qword ptr [edi+ 40]
5064 movlps xmm4, qword ptr [edi+ 64]
5065 movhps xmm4, qword ptr [edi+ 88]
5067 movlps xmm5, qword ptr [edi+112]
5068 movhps xmm5, qword ptr [edi+136]
5075 shufps xmm0, xmm1, 0x88
5076 shufps xmm6, xmm1, 0xDD
5078 shufps xmm7, xmm2, 0x88
5079 shufps xmm2, xmm2, 0xDD
5082 STORE4( 0, xmm0, xmm3 )
5083 STORE2LO( 16, xmm2, xmm4 )
5088 for ( int i = 0; i < numRows; i++ ) {
5089 dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
5090 mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
5100 for ( int i = 0; i < numRows; i++ ) {
5101 float sum = mPtr[0] * vPtr[0];
5102 for ( int j = 1; j < numColumns; j++ ) {
5103 sum += mPtr[j] * vPtr[j];
5105 dstPtr[i] STOREC sum;
5133 #define STORE1( offset, reg1, reg2 ) \
5134 __asm movss reg2, [eax+offset] \
5135 __asm addss reg2, reg1 \
5136 __asm movss [eax+offset], reg2
5137 #define STORE2LO( offset, reg1, reg2 ) \
5138 __asm movlps reg2, [eax+offset] \
5139 __asm addps reg2, reg1 \
5140 __asm movlps [eax+offset], reg2
5141 #define STORE2HI( offset, reg1, reg2 ) \
5142 __asm movhps reg2, [eax+offset] \
5143 __asm addps reg2, reg1 \
5144 __asm movhps [eax+offset], reg2
5145 #define STORE4( offset, reg1, reg2 ) \
5146 __asm movlps reg2, [eax+offset] \
5147 __asm movhps reg2, [eax+offset+8] \
5148 __asm addps reg2, reg1 \
5149 __asm movlps [eax+offset], reg2 \
5150 __asm movhps [eax+offset+8], reg2
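// The four STORE* macros above first load the current destination values, add the
// freshly computed register to them, and write the sum back, so the unrolled
// matrix-vector kernels that follow accumulate into dstPtr (dst += M * v) rather
// than overwrite it. A minimal scalar sketch of the semantics these kernels
// implement (illustrative only; the identifiers below are not from this source):
//
//   void MultiplyAddVecSketch( float *dst, const float *m, const float *v,
//                              int numRows, int numColumns ) {
//       for ( int i = 0; i < numRows; i++ ) {
//           float sum = 0.0f;
//           for ( int j = 0; j < numColumns; j++ ) {
//               sum += m[i * numColumns + j] * v[j];   // row i of M dotted with v
//           }
//           dst[i] += sum;                             // STOREC acts as '+=' here
//       }
//   }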
5154 const float *mPtr, *vPtr;
5174 STORE1( 0, xmm0, xmm1 )
5184 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
5187 mulps xmm1, [edi+16]
5188 STORE4( 0, xmm0, xmm2 )
5189 STORE2LO( 16, xmm1, xmm2 )
5194 for ( int i = 0; i < numRows; i++ ) {
5195 dstPtr[i] STOREC mPtr[0] * vPtr[0];
5217 STORE1( 0, xmm2, xmm4 )
5219 mulss xmm1, [edi+8+4]
5221 STORE1( 4, xmm0, xmm4 )
5231 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
5234 movaps xmm1, [edi+16]
5237 shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
5238 shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
5239 movaps xmm3, [edi+32]
5242 STORE4( 0, xmm0, xmm4 )
5243 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
5246 STORE2LO( 16, xmm3, xmm4 )
5251 for ( int i = 0; i < numRows; i++ ) {
5252 dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
5278 movss xmm3, [edi+12]
5280 STORE1( 0, xmm4, xmm7 );
5281 movss xmm5, [edi+12+4]
5284 movss xmm6, [edi+12+8]
5287 mulss xmm0, [edi+24]
5288 mulss xmm1, [edi+24+4]
5289 STORE1( 4, xmm3, xmm7 );
5291 mulss xmm2, [edi+24+8]
5293 STORE1( 8, xmm0, xmm7 );
5303 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
5305 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
5307 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
5309 movlps xmm1, [edi+4*4]
5310 shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 )
5311 movlps xmm2, [edi+6*4]
5312 movhps xmm2, [edi+8*4]
5313 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 )
5315 movlps xmm3, [edi+10*4]
5316 shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 )
5318 shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 )
5320 shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 )
5324 STORE4( 0, xmm0, xmm4 )
5325 movss xmm1, [edi+12*4]
5327 movss xmm2, [edi+13*4]
5329 movss xmm3, [edi+14*4]
5333 STORE1( 16, xmm1, xmm4 )
5334 mulss xmm5, [edi+15*4]
5335 mulss xmm6, [edi+16*4]
5336 mulss xmm7, [edi+17*4]
5339 STORE1( 20, xmm5, xmm4 )
5344 for ( int i = 0; i < numRows; i++ ) {
5345 dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
5360 movlps xmm6, qword ptr [esi ]
5361 movlps xmm0, qword ptr [edi ]
5362 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
5363 movhps xmm0, qword ptr [edi+16]
5365 movlps xmm7, qword ptr [esi+ 8]
5366 movlps xmm2, qword ptr [edi+ 8]
5367 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
5368 movhps xmm2, qword ptr [edi+24]
5370 movlps xmm1, qword ptr [edi+32]
5371 movhps xmm1, qword ptr [edi+48]
5373 movlps xmm3, qword ptr [edi+40]
5375 movhps xmm3, qword ptr [edi+56]
5379 shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
5380 shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
5382 STORE4( 0, xmm0, xmm2 )
5391 movlps xmm6, qword ptr [esi+ 0]
5392 movlps xmm0, qword ptr [edi+ 0]
5393 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
5394 movhps xmm0, qword ptr [edi+16]
5396 movlps xmm7, qword ptr [esi+ 8]
5397 movlps xmm2, qword ptr [edi+ 8]
5398 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
5399 movhps xmm2, qword ptr [edi+24]
5401 movlps xmm1, qword ptr [edi+32]
5402 movhps xmm1, qword ptr [edi+48]
5404 movlps xmm3, qword ptr [edi+40]
5406 movhps xmm3, qword ptr [edi+56]
5410 shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
5411 shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
5413 movlps xmm1, qword ptr [edi+64]
5414 movhps xmm1, qword ptr [edi+80]
5415 STORE4( 0, xmm0, xmm4 )
5417 movlps xmm2, qword ptr [edi+72]
5418 movhps xmm2, qword ptr [edi+88]
5421 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
5424 STORE2LO( 16, xmm1, xmm4 )
5429 for ( int i = 0; i < numRows; i++ ) {
5430 dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
5445 movss xmm0, [edi+5*4]
5446 movhps xmm0, [edi+0*4]
5447 movss xmm5, [edi+15*4]
5448 movhps xmm5, [edi+10*4]
5450 shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 )
5451 movlps xmm1, [edi+6*4]
5452 movlps xmm5, [edi+16*4]
5454 shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )
5455 movhps xmm2, [edi+2*4]
5456 movhps xmm5, [edi+12*4]
5458 shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 )
5459 movlps xmm3, [edi+8*4]
5460 movlps xmm5, [edi+18*4]
5461 movss xmm4, [edi+4*4]
5463 shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )
5464 movhps xmm5, [edi+14*4]
5465 shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 )
5466 movss xmm7, [esi+0*4]
5467 shufps xmm7, xmm7, 0
5469 movss xmm5, [esi+1*4]
5470 shufps xmm5, xmm5, 0
5473 movss xmm6, [esi+2*4]
5474 shufps xmm6, xmm6, 0
5477 movss xmm1, [esi+3*4]
5478 shufps xmm1, xmm1, 0
5481 movss xmm2, [esi+4*4]
5482 shufps xmm2, xmm2, 0
5485 mulss xmm7, [edi+20*4]
5486 mulss xmm5, [edi+21*4]
5488 mulss xmm6, [edi+22*4]
5490 mulss xmm1, [edi+23*4]
5492 mulss xmm2, [edi+24*4]
5494 STORE4( 0, xmm0, xmm3 )
5495 STORE1( 16, xmm7, xmm4 )
5505 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
5506 movlps xmm7, [esi+8]
5507 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
5509 movhps xmm3, [edi+8]
5510 movaps xmm1, [edi+16]
5511 movlps xmm2, [edi+32]
5512 shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )
5513 shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )
5514 shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )
5517 movlps xmm2, [edi+40]
5519 movhps xmm5, [edi+40+8]
5520 movlps xmm3, [edi+40+16]
5521 movhps xmm3, [edi+40+24]
5522 movlps xmm4, [edi+40+32]
5523 shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 )
5524 shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 )
5525 shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 )
5529 movss xmm5, [esi+16]
5530 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
5532 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
5533 shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
5534 shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
5538 STORE4( 0, xmm0, xmm2 )
5539 movlps xmm4, [edi+80]
5540 movhps xmm3, [edi+80+8]
5541 movaps xmm1, [edi+80+16]
5542 movlps xmm2, [edi+80+32]
5543 shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )
5544 shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )
5545 shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )
5550 shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
5551 shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
5553 shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
5555 STORE2LO( 16, xmm4, xmm2 )
5560 for ( int i = 0; i < numRows; i++ ) {
5561 dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
5583 movss xmm3, [esi+12]
5585 mulss xmm3, [edi+12]
5586 movss xmm4, [esi+16]
5588 mulss xmm4, [edi+16]
5589 movss xmm5, [esi+20]
5591 mulss xmm5, [edi+20]
5592 movss xmm6, [esi+24]
5594 mulss xmm6, [edi+24]
5596 STORE1( 0, xmm0, xmm7 )
5607 movhps xmm4, [esi+8]
5608 movlps xmm5, [esi+16]
5614 movaps xmm1, [edi+16]
5615 movaps xmm2, [edi+32]
5622 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
5624 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
5627 STORE2LO( 0, xmm0, xmm3 )
5638 movhps xmm4, [esi+8]
5639 movlps xmm5, [esi+16]
5645 movaps xmm1, [edi+16]
5646 movaps xmm2, [edi+32]
5653 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
5655 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
5658 STORE2LO( 0, xmm0, xmm3 )
5660 movaps xmm0, [edi+48]
5661 movaps xmm1, [edi+48+16]
5668 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
5670 STORE1( 8, xmm0, xmm3 )
5681 movhps xmm4, [esi+8]
5682 movlps xmm5, [esi+16]
5688 movaps xmm1, [edi+16]
5689 movaps xmm2, [edi+32]
5696 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
5699 movaps xmm0, [edi+48]
5700 movaps xmm1, [edi+48+16]
5701 movaps xmm2, [edi+48+32]
5708 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
5712 shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
5713 shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
5715 STORE4( 0, xmm0, xmm4 )
5726 movhps xmm4, [esi+8]
5727 movlps xmm5, [esi+16]
5733 movaps xmm1, [edi+16]
5734 movaps xmm2, [edi+32]
5741 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
5744 movaps xmm0, [edi+48]
5745 movaps xmm1, [edi+48+16]
5746 movaps xmm2, [edi+48+32]
5753 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
5757 shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
5758 shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
5760 STORE4( 0, xmm0, xmm3 )
5762 movaps xmm0, [edi+96]
5763 movaps xmm1, [edi+96+16]
5770 shufps xmm1, xmm1, 0x01
5772 STORE1( 16, xmm0, xmm3 )
5781 movlps xmm7, qword ptr [esi]
5782 movlps xmm6, qword ptr [esi+8]
5783 shufps xmm7, xmm7, 0x44
5784 shufps xmm6, xmm6, 0x44
5785 movlps xmm0, qword ptr [edi ]
5786 movhps xmm0, qword ptr [edi+ 24]
5788 movlps xmm3, qword ptr [edi+ 8]
5789 movhps xmm3, qword ptr [edi+ 32]
5791 movlps xmm1, qword ptr [edi+ 48]
5792 movhps xmm1, qword ptr [edi+ 72]
5794 movlps xmm2, qword ptr [edi+ 96]
5795 movhps xmm2, qword ptr [edi+120]
5797 movlps xmm4, qword ptr [edi+ 56]
5798 movhps xmm4, qword ptr [edi+ 80]
5799 movlps xmm5, qword ptr [edi+104]
5800 movhps xmm5, qword ptr [edi+128]
5802 movlps xmm7, qword ptr [esi+16]
5804 shufps xmm7, xmm7, 0x44
5807 movlps xmm3, qword ptr [edi+ 16]
5808 movhps xmm3, qword ptr [edi+ 40]
5810 movlps xmm4, qword ptr [edi+ 64]
5811 movhps xmm4, qword ptr [edi+ 88]
5813 movlps xmm5, qword ptr [edi+112]
5814 movhps xmm5, qword ptr [edi+136]
5821 shufps xmm0, xmm1, 0x88
5822 shufps xmm6, xmm1, 0xDD
5824 shufps xmm7, xmm2, 0x88
5825 shufps xmm2, xmm2, 0xDD
5828 STORE4( 0, xmm0, xmm3 )
5829 STORE2LO( 16, xmm2, xmm4 )
5834 for ( int i = 0; i < numRows; i++ ) {
5835 dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
5836 mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
5846 for ( int i = 0; i < numRows; i++ ) {
5847 float sum = mPtr[0] * vPtr[0];
5848 for ( int j = 1; j < numColumns; j++ ) {
5849 sum += mPtr[j] * vPtr[j];
5851 dstPtr[i] STOREC sum;
5879 #define STORE1( offset, reg1, reg2 ) \
5880 __asm movss reg2, [eax+offset] \
5881 __asm subss reg2, reg1 \
5882 __asm movss [eax+offset], reg2
5883 #define STORE2LO( offset, reg1, reg2 ) \
5884 __asm movlps reg2, [eax+offset] \
5885 __asm subps reg2, reg1 \
5886 __asm movlps [eax+offset], reg2
5887 #define STORE2HI( offset, reg1, reg2 ) \
5888 __asm movhps reg2, [eax+offset] \
5889 __asm subps reg2, reg1 \
5890 __asm movhps [eax+offset], reg2
5891 #define STORE4( offset, reg1, reg2 ) \
5892 __asm movlps reg2, [eax+offset] \
5893 __asm movhps reg2, [eax+offset+8] \
5894 __asm subps reg2, reg1 \
5895 __asm movlps [eax+offset], reg2 \
5896 __asm movhps [eax+offset+8], reg2
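// Same STORE* macro family as above, but built around subss/subps instead of
// addss/addps: the kernels that follow subtract the matrix-vector product from
// the destination (dst -= M * v). The unrolled cases and the generic fallback
// loop are otherwise structured identically to the accumulate variant.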
5900 const float *mPtr, *vPtr;
5920 STORE1( 0, xmm0, xmm1 )
5930 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
5933 mulps xmm1, [edi+16]
5934 STORE4( 0, xmm0, xmm2 )
5935 STORE2LO( 16, xmm1, xmm2 )
5940 for ( int i = 0; i < numRows; i++ ) {
5941 dstPtr[i] STOREC mPtr[0] * vPtr[0];
5963 STORE1( 0, xmm2, xmm4 )
5965 mulss xmm1, [edi+8+4]
5967 STORE1( 4, xmm0, xmm4 )
5977 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
5980 movaps xmm1, [edi+16]
5983 shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
5984 shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
5985 movaps xmm3, [edi+32]
5988 STORE4( 0, xmm0, xmm4 )
5989 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
5992 STORE2LO( 16, xmm3, xmm4 )
5997 for ( int i = 0; i < numRows; i++ ) {
5998 dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
6024 movss xmm3, [edi+12]
6026 STORE1( 0, xmm4, xmm7 );
6027 movss xmm5, [edi+12+4]
6030 movss xmm6, [edi+12+8]
6033 mulss xmm0, [edi+24]
6034 mulss xmm1, [edi+24+4]
6035 STORE1( 4, xmm3, xmm7 );
6037 mulss xmm2, [edi+24+8]
6039 STORE1( 8, xmm0, xmm7 );
6049 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
6051 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6053 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6055 movlps xmm1, [edi+4*4]
6056 shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 )
6057 movlps xmm2, [edi+6*4]
6058 movhps xmm2, [edi+8*4]
6059 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 )
6061 movlps xmm3, [edi+10*4]
6062 shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 )
6064 shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 )
6066 shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 )
6070 STORE4( 0, xmm0, xmm4 )
6071 movss xmm1, [edi+12*4]
6073 movss xmm2, [edi+13*4]
6075 movss xmm3, [edi+14*4]
6079 STORE1( 16, xmm1, xmm4 )
6080 mulss xmm5, [edi+15*4]
6081 mulss xmm6, [edi+16*4]
6082 mulss xmm7, [edi+17*4]
6085 STORE1( 20, xmm5, xmm4 )
6090 for ( int i = 0; i < numRows; i++ ) {
6091 dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
6106 movlps xmm6, qword ptr [esi ]
6107 movlps xmm0, qword ptr [edi ]
6108 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
6109 movhps xmm0, qword ptr [edi+16]
6111 movlps xmm7, qword ptr [esi+ 8]
6112 movlps xmm2, qword ptr [edi+ 8]
6113 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
6114 movhps xmm2, qword ptr [edi+24]
6116 movlps xmm1, qword ptr [edi+32]
6117 movhps xmm1, qword ptr [edi+48]
6119 movlps xmm3, qword ptr [edi+40]
6121 movhps xmm3, qword ptr [edi+56]
6125 shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
6126 shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
6128 STORE4( 0, xmm0, xmm2 )
6137 movlps xmm6, qword ptr [esi+ 0]
6138 movlps xmm0, qword ptr [edi+ 0]
6139 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
6140 movhps xmm0, qword ptr [edi+16]
6142 movlps xmm7, qword ptr [esi+ 8]
6143 movlps xmm2, qword ptr [edi+ 8]
6144 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
6145 movhps xmm2, qword ptr [edi+24]
6147 movlps xmm1, qword ptr [edi+32]
6148 movhps xmm1, qword ptr [edi+48]
6150 movlps xmm3, qword ptr [edi+40]
6152 movhps xmm3, qword ptr [edi+56]
6156 shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
6157 shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
6159 movlps xmm1, qword ptr [edi+64]
6160 movhps xmm1, qword ptr [edi+80]
6161 STORE4( 0, xmm0, xmm4 )
6163 movlps xmm2, qword ptr [edi+72]
6164 movhps xmm2, qword ptr [edi+88]
6167 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
6170 STORE2LO( 16, xmm1, xmm4 )
6175 for ( int i = 0; i < numRows; i++ ) {
6176 dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
6191 movss xmm0, [edi+5*4]
6192 movhps xmm0, [edi+0*4]
6193 movss xmm5, [edi+15*4]
6194 movhps xmm5, [edi+10*4]
6196 shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 )
6197 movlps xmm1, [edi+6*4]
6198 movlps xmm5, [edi+16*4]
6200 shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )
6201 movhps xmm2, [edi+2*4]
6202 movhps xmm5, [edi+12*4]
6204 shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 )
6205 movlps xmm3, [edi+8*4]
6206 movlps xmm5, [edi+18*4]
6207 movss xmm4, [edi+4*4]
6209 shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )
6210 movhps xmm5, [edi+14*4]
6211 shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 )
6212 movss xmm7, [esi+0*4]
6213 shufps xmm7, xmm7, 0
6215 movss xmm5, [esi+1*4]
6216 shufps xmm5, xmm5, 0
6219 movss xmm6, [esi+2*4]
6220 shufps xmm6, xmm6, 0
6223 movss xmm1, [esi+3*4]
6224 shufps xmm1, xmm1, 0
6227 movss xmm2, [esi+4*4]
6228 shufps xmm2, xmm2, 0
6231 mulss xmm7, [edi+20*4]
6232 mulss xmm5, [edi+21*4]
6234 mulss xmm6, [edi+22*4]
6236 mulss xmm1, [edi+23*4]
6238 mulss xmm2, [edi+24*4]
6240 STORE4( 0, xmm0, xmm3 )
6241 STORE1( 16, xmm7, xmm4 )
6251 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
6252 movlps xmm7, [esi+8]
6253 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
6255 movhps xmm3, [edi+8]
6256 movaps xmm1, [edi+16]
6257 movlps xmm2, [edi+32]
6258 shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )
6259 shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )
6260 shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )
6263 movlps xmm2, [edi+40]
6265 movhps xmm5, [edi+40+8]
6266 movlps xmm3, [edi+40+16]
6267 movhps xmm3, [edi+40+24]
6268 movlps xmm4, [edi+40+32]
6269 shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 )
6270 shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 )
6271 shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 )
6275 movss xmm5, [esi+16]
6276 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
6278 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
6279 shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
6280 shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
6284 STORE4( 0, xmm0, xmm2 )
6285 movlps xmm4, [edi+80]
6286 movhps xmm3, [edi+80+8]
6287 movaps xmm1, [edi+80+16]
6288 movlps xmm2, [edi+80+32]
6289 shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )
6290 shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )
6291 shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )
6296 shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
6297 shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
6299 shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
6301 STORE2LO( 16, xmm4, xmm2 )
6306 for ( int i = 0; i < numRows; i++ ) {
6307 dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
6329 movss xmm3, [esi+12]
6331 mulss xmm3, [edi+12]
6332 movss xmm4, [esi+16]
6334 mulss xmm4, [edi+16]
6335 movss xmm5, [esi+20]
6337 mulss xmm5, [edi+20]
6338 movss xmm6, [esi+24]
6340 mulss xmm6, [edi+24]
6342 STORE1( 0, xmm0, xmm7 )
6353 movhps xmm4, [esi+8]
6354 movlps xmm5, [esi+16]
6360 movaps xmm1, [edi+16]
6361 movaps xmm2, [edi+32]
6368 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
6370 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
6373 STORE2LO( 0, xmm0, xmm3 )
6384 movhps xmm4, [esi+8]
6385 movlps xmm5, [esi+16]
6391 movaps xmm1, [edi+16]
6392 movaps xmm2, [edi+32]
6399 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
6401 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
6404 STORE2LO( 0, xmm0, xmm3 )
6406 movaps xmm0, [edi+48]
6407 movaps xmm1, [edi+48+16]
6414 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
6416 STORE1( 8, xmm0, xmm3 )
6427 movhps xmm4, [esi+8]
6428 movlps xmm5, [esi+16]
6434 movaps xmm1, [edi+16]
6435 movaps xmm2, [edi+32]
6442 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
6445 movaps xmm0, [edi+48]
6446 movaps xmm1, [edi+48+16]
6447 movaps xmm2, [edi+48+32]
6454 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
6458 shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
6459 shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
6461 STORE4( 0, xmm0, xmm4 )
6472 movhps xmm4, [esi+8]
6473 movlps xmm5, [esi+16]
6479 movaps xmm1, [edi+16]
6480 movaps xmm2, [edi+32]
6487 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
6490 movaps xmm0, [edi+48]
6491 movaps xmm1, [edi+48+16]
6492 movaps xmm2, [edi+48+32]
6499 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
6503 shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
6504 shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
6506 STORE4( 0, xmm0, xmm3 )
6508 movaps xmm0, [edi+96]
6509 movaps xmm1, [edi+96+16]
6516 shufps xmm1, xmm1, 0x01
6518 STORE1( 16, xmm0, xmm3 )
6527 movlps xmm7, qword ptr [esi]
6528 movlps xmm6, qword ptr [esi+8]
6529 shufps xmm7, xmm7, 0x44
6530 shufps xmm6, xmm6, 0x44
6531 movlps xmm0, qword ptr [edi ]
6532 movhps xmm0, qword ptr [edi+ 24]
6534 movlps xmm3, qword ptr [edi+ 8]
6535 movhps xmm3, qword ptr [edi+ 32]
6537 movlps xmm1, qword ptr [edi+ 48]
6538 movhps xmm1, qword ptr [edi+ 72]
6540 movlps xmm2, qword ptr [edi+ 96]
6541 movhps xmm2, qword ptr [edi+120]
6543 movlps xmm4, qword ptr [edi+ 56]
6544 movhps xmm4, qword ptr [edi+ 80]
6545 movlps xmm5, qword ptr [edi+104]
6546 movhps xmm5, qword ptr [edi+128]
6548 movlps xmm7, qword ptr [esi+16]
6550 shufps xmm7, xmm7, 0x44
6553 movlps xmm3, qword ptr [edi+ 16]
6554 movhps xmm3, qword ptr [edi+ 40]
6556 movlps xmm4, qword ptr [edi+ 64]
6557 movhps xmm4, qword ptr [edi+ 88]
6559 movlps xmm5, qword ptr [edi+112]
6560 movhps xmm5, qword ptr [edi+136]
6567 shufps xmm0, xmm1, 0x88
6568 shufps xmm6, xmm1, 0xDD
6570 shufps xmm7, xmm2, 0x88
6571 shufps xmm2, xmm2, 0xDD
6574 STORE4( 0, xmm0, xmm3 )
6575 STORE2LO( 16, xmm2, xmm4 )
6580 for ( int i = 0; i < numRows; i++ ) {
6581 dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
6582 mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
6592 for ( int i = 0; i < numRows; i++ ) {
6593 float sum = mPtr[0] * vPtr[0];
6594 for ( int j = 1; j < numColumns; j++ ) {
6595 sum += mPtr[j] * vPtr[j];
6597 dstPtr[i] STOREC sum;
6624 #define STORE1( offset, reg1, reg2 ) \
6625 __asm movss [eax+offset], reg1
6626 #define STORE2LO( offset, reg1, reg2 ) \
6627 __asm movlps [eax+offset], reg1
6628 #define STORE2HI( offset, reg1, reg2 ) \
6629 __asm movhps [eax+offset], reg1
6630 #define STORE4( offset, reg1, reg2 ) \
6631 __asm movlps [eax+offset], reg1 \
6632 __asm movhps [eax+offset+8], reg1
6636 const float *mPtr, *vPtr;
6648 switch( numColumns ) {
6655 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
6658 mulps xmm1, [edi+16]
6659 STORE4( 0, xmm0, xmm2 )
6660 STORE2LO( 16, xmm1, xmm3 )
6665 for ( int i = 0; i < numColumns; i++ ) {
6666 dstPtr[i] STOREC *(mPtr) * vPtr[0];
6674 switch( numColumns ) {
6682 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
6683 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
6686 movlps xmm3, [edi+24]
6687 movhps xmm3, [edi+32]
6690 shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
6691 movlps xmm4, [edi+16]
6692 movhps xmm4, [edi+40]
6696 STORE4( 0, xmm2, xmm5 )
6697 STORE2LO( 16, xmm3, xmm6 )
6702 for ( int i = 0; i < numColumns; i++ ) {
6703 dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
6711 switch( numColumns ) {
6717 movlps xmm0, [esi+0*4]
6718 movss xmm1, [esi+2*4]
6719 movlps xmm3, [edi+(0*6+0)*4]
6720 movhps xmm3, [edi+(0*6+2)*4]
6722 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
6724 movlps xmm5, [edi+(1*6+0)*4]
6725 movhps xmm5, [edi+(1*6+2)*4]
6727 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
6730 movlps xmm4, [edi+(2*6+0)*4]
6731 movhps xmm4, [edi+(2*6+2)*4]
6732 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
6735 STORE4( 0, xmm3, xmm7 )
6736 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
6737 movlps xmm3, [edi+(0*6+4)*4]
6738 movhps xmm3, [edi+(1*6+4)*4]
6742 movlps xmm5, [edi+(2*6+4)*4]
6745 STORE2LO( 16, xmm3, xmm7 )
6750 for ( int i = 0; i < numColumns; i++ ) {
6751 dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
6759 switch( numColumns ) {
6765 movlps xmm0, [esi+0*4]
6766 movlps xmm1, [esi+2*4]
6768 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
6769 mulps xmm3, [edi+(0*6+0)*4]
6770 movlps xmm5, [edi+(1*6+0)*4]
6771 movhps xmm5, [edi+(1*6+2)*4]
6773 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
6776 movlps xmm4, [edi+(2*6+0)*4]
6777 movhps xmm4, [edi+(2*6+2)*4]
6779 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6782 movlps xmm5, [edi+(3*6+0)*4]
6783 movhps xmm5, [edi+(3*6+2)*4]
6785 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
6788 STORE4( 0, xmm3, xmm7 )
6789 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
6790 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
6791 movlps xmm3, [edi+(0*6+4)*4]
6792 movhps xmm3, [edi+(1*6+4)*4]
6794 movlps xmm4, [edi+(2*6+4)*4]
6795 movhps xmm4, [edi+(3*6+4)*4]
6800 STORE2LO( 16, xmm3, xmm7 )
6805 for ( int i = 0; i < numColumns; i++ ) {
6806 dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
6807 *(mPtr+3*numColumns) * vPtr[3];
6815 switch( numColumns ) {
6821 movlps xmm0, [esi+0*4]
6822 movlps xmm1, [esi+2*4]
6823 movss xmm2, [esi+4*4]
6825 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
6826 mulps xmm3, [edi+(0*6+0)*4]
6827 movlps xmm5, [edi+(1*6+0)*4]
6828 movhps xmm5, [edi+(1*6+2)*4]
6830 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
6834 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6835 mulps xmm6, [edi+(2*6+0)*4]
6837 movlps xmm5, [edi+(3*6+0)*4]
6838 movhps xmm5, [edi+(3*6+2)*4]
6840 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
6843 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
6845 mulps xmm4, [edi+(4*6+0)*4]
6847 STORE4( 0, xmm3, xmm7 )
6848 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
6849 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
6850 movlps xmm3, [edi+(0*6+4)*4]
6851 movhps xmm3, [edi+(1*6+4)*4]
6853 movlps xmm4, [edi+(2*6+4)*4]
6854 movhps xmm4, [edi+(3*6+4)*4]
6859 movlps xmm5, [edi+(4*6+4)*4]
6862 STORE2LO( 16, xmm3, xmm7 )
6867 for ( int i = 0; i < numColumns; i++ ) {
6868 dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
6869 *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
6877 switch( numColumns ) {
6884 movhps xmm0, [esi+8]
6885 movlps xmm1, [esi+16]
6887 mulps xmm1, [edi+16]
6888 shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
6892 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
6894 STORE1( 0, xmm2, xmm3 )
6903 movlps xmm0, [esi+0*4]
6904 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
6905 movaps xmm6, [edi+0*4]
6907 movlps xmm1, [esi+2*4]
6908 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
6909 movaps xmm7, [edi+4*4]
6912 movlps xmm2, [esi+4*4]
6913 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
6914 movaps xmm7, [edi+8*4]
6919 STORE2LO( 0, xmm3, xmm7 )
6928 movss xmm0, [edi+(0*3+2)*4]
6929 movhps xmm0, [edi+(0*3+0)*4]
6930 shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
6931 movss xmm6, [esi+0*4]
6932 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6934 movss xmm1, [edi+(1*3+0)*4]
6935 movhps xmm1, [edi+(1*3+1)*4]
6936 movss xmm7, [esi+1*4]
6937 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6940 movss xmm2, [edi+(2*3+2)*4]
6941 movhps xmm2, [edi+(2*3+0)*4]
6942 shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
6943 movss xmm7, [esi+2*4]
6944 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6947 movss xmm3, [edi+(3*3+0)*4]
6948 movhps xmm3, [edi+(3*3+1)*4]
6949 movss xmm7, [esi+3*4]
6950 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6953 movss xmm4, [edi+(4*3+2)*4]
6954 movhps xmm4, [edi+(4*3+0)*4]
6955 shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
6956 movss xmm7, [esi+4*4]
6957 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6960 movss xmm5, [edi+(5*3+0)*4]
6961 movhps xmm5, [edi+(5*3+1)*4]
6962 movss xmm7, [esi+5*4]
6963 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6966 STORE1( 0, xmm6, xmm7 )
6967 STORE2HI( 4, xmm6, xmm7 )
6976 movlps xmm3, [edi+(0*4+0)*4]
6977 movhps xmm3, [edi+(0*4+2)*4]
6978 movss xmm4, [esi+0*4]
6979 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
6981 movlps xmm5, [edi+(1*4+0)*4]
6982 movhps xmm5, [edi+(1*4+2)*4]
6983 movss xmm6, [esi+1*4]
6984 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6987 movlps xmm4, [edi+(2*4+0)*4]
6988 movhps xmm4, [edi+(2*4+2)*4]
6989 movss xmm6, [esi+2*4]
6990 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6993 movlps xmm5, [edi+(3*4+0)*4]
6994 movhps xmm5, [edi+(3*4+2)*4]
6995 movss xmm6, [esi+3*4]
6996 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6999 movlps xmm4, [edi+(4*4+0)*4]
7000 movhps xmm4, [edi+(4*4+2)*4]
7001 movss xmm6, [esi+4*4]
7002 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7005 movlps xmm5, [edi+(5*4+0)*4]
7006 movhps xmm5, [edi+(5*4+2)*4]
7007 movss xmm6, [esi+5*4]
7008 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7011 STORE4( 0, xmm3, xmm7 )
7020 movlps xmm6, [edi+(0*5+0)*4]
7021 movhps xmm6, [edi+(0*5+2)*4]
7022 movss xmm0, [esi+0*4]
7023 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
7025 movlps xmm7, [edi+(1*5+0)*4]
7026 movhps xmm7, [edi+(1*5+2)*4]
7027 movss xmm1, [esi+1*4]
7028 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
7031 movlps xmm7, [edi+(2*5+0)*4]
7032 movhps xmm7, [edi+(2*5+2)*4]
7033 movss xmm2, [esi+2*4]
7034 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
7037 movlps xmm7, [edi+(3*5+0)*4]
7038 movhps xmm7, [edi+(3*5+2)*4]
7039 movss xmm3, [esi+3*4]
7040 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7043 movlps xmm7, [edi+(4*5+0)*4]
7044 movhps xmm7, [edi+(4*5+2)*4]
7045 movss xmm4, [esi+4*4]
7046 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
7049 movlps xmm7, [edi+(5*5+0)*4]
7050 movhps xmm7, [edi+(5*5+2)*4]
7051 movss xmm5, [esi+5*4]
7052 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
7055 STORE4( 0, xmm6, xmm7 )
7056 movss xmm6, [edi+(0*5+4)*4]
7058 movss xmm7, [edi+(1*5+4)*4]
7061 movss xmm7, [edi+(2*5+4)*4]
7064 movss xmm7, [edi+(3*5+4)*4]
7067 movss xmm7, [edi+(4*5+4)*4]
7070 movss xmm7, [edi+(5*5+4)*4]
7073 STORE1( 16, xmm6, xmm7 )
7082 movlps xmm0, [esi+0*4]
7083 movlps xmm1, [esi+2*4]
7084 movlps xmm2, [esi+4*4]
7086 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7087 mulps xmm3, [edi+(0*6+0)*4]
7088 movlps xmm5, [edi+(1*6+0)*4]
7089 movhps xmm5, [edi+(1*6+2)*4]
7091 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7095 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7096 mulps xmm6, [edi+(2*6+0)*4]
7099 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7100 movlps xmm5, [edi+(3*6+0)*4]
7101 movhps xmm5, [edi+(3*6+2)*4]
7105 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7106 mulps xmm6, [edi+(4*6+0)*4]
7109 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7110 movlps xmm5, [edi+(5*6+0)*4]
7111 movhps xmm5, [edi+(5*6+2)*4]
7114 STORE4( 0, xmm3, xmm7 )
7115 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7116 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7117 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
7118 movlps xmm3, [edi+(0*6+4)*4]
7119 movhps xmm3, [edi+(1*6+4)*4]
7121 movlps xmm4, [edi+(2*6+4)*4]
7122 movhps xmm4, [edi+(3*6+4)*4]
7125 movlps xmm5, [edi+(4*6+4)*4]
7126 movhps xmm5, [edi+(5*6+4)*4]
7131 STORE2LO( 16, xmm3, xmm7 )
7136 for ( int i = 0; i < numColumns; i++ ) {
7137 dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
7138 *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
7147 for ( int i = 0; i < numColumns; i++ ) {
7149 float sum = mPtr[0] * vPtr[0];
7150 for ( int j = 1; j < numRows; j++ ) {
7152 sum += mPtr[0] * vPtr[j];
7154 dstPtr[i] STOREC sum;
7179 #define STORE1( offset, reg1, reg2 ) \
7180 __asm movss reg2, [eax+offset] \
7181 __asm addss reg2, reg1 \
7182 __asm movss [eax+offset], reg2
7183 #define STORE2LO( offset, reg1, reg2 ) \
7184 __asm movlps reg2, [eax+offset] \
7185 __asm addps reg2, reg1 \
7186 __asm movlps [eax+offset], reg2
7187 #define STORE2HI( offset, reg1, reg2 ) \
7188 __asm movhps reg2, [eax+offset] \
7189 __asm addps reg2, reg1 \
7190 __asm movhps [eax+offset], reg2
7191 #define STORE4( offset, reg1, reg2 ) \
7192 __asm movlps reg2, [eax+offset] \
7193 __asm movhps reg2, [eax+offset+8] \
7194 __asm addps reg2, reg1 \
7195 __asm movlps [eax+offset], reg2 \
7196 __asm movhps [eax+offset+8], reg2
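// Accumulating variant of the transposed multiply: the STORE* macros above add
// the computed values to the destination, so the kernels below effectively
// perform dst += M^T * v, with the same unrolled column counts and the same
// generic fallback loop as the preceding section.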
7200 const float *mPtr, *vPtr;
7212 switch( numColumns ) {
7219 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
7222 mulps xmm1, [edi+16]
7223 STORE4( 0, xmm0, xmm2 )
7224 STORE2LO( 16, xmm1, xmm3 )
7229 for ( int i = 0; i < numColumns; i++ ) {
7230 dstPtr[i] STOREC *(mPtr) * vPtr[0];
7238 switch( numColumns ) {
7246 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
7247 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
7250 movlps xmm3, [edi+24]
7251 movhps xmm3, [edi+32]
7254 shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
7255 movlps xmm4, [edi+16]
7256 movhps xmm4, [edi+40]
7260 STORE4( 0, xmm2, xmm5 )
7261 STORE2LO( 16, xmm3, xmm6 )
7266 for ( int i = 0; i < numColumns; i++ ) {
7267 dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
7275 switch( numColumns ) {
7281 movlps xmm0, [esi+0*4]
7282 movss xmm1, [esi+2*4]
7283 movlps xmm3, [edi+(0*6+0)*4]
7284 movhps xmm3, [edi+(0*6+2)*4]
7286 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
7288 movlps xmm5, [edi+(1*6+0)*4]
7289 movhps xmm5, [edi+(1*6+2)*4]
7291 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7294 movlps xmm4, [edi+(2*6+0)*4]
7295 movhps xmm4, [edi+(2*6+2)*4]
7296 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
7299 STORE4( 0, xmm3, xmm7 )
7300 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7301 movlps xmm3, [edi+(0*6+4)*4]
7302 movhps xmm3, [edi+(1*6+4)*4]
7306 movlps xmm5, [edi+(2*6+4)*4]
7309 STORE2LO( 16, xmm3, xmm7 )
7314 for ( int i = 0; i < numColumns; i++ ) {
7315 dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
7323 switch( numColumns ) {
7329 movlps xmm0, [esi+0*4]
7330 movlps xmm1, [esi+2*4]
7332 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7333 mulps xmm3, [edi+(0*6+0)*4]
7334 movlps xmm5, [edi+(1*6+0)*4]
7335 movhps xmm5, [edi+(1*6+2)*4]
7337 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7340 movlps xmm4, [edi+(2*6+0)*4]
7341 movhps xmm4, [edi+(2*6+2)*4]
7343 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7346 movlps xmm5, [edi+(3*6+0)*4]
7347 movhps xmm5, [edi+(3*6+2)*4]
7349 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7352 STORE4( 0, xmm3, xmm7 )
7353 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7354 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7355 movlps xmm3, [edi+(0*6+4)*4]
7356 movhps xmm3, [edi+(1*6+4)*4]
7358 movlps xmm4, [edi+(2*6+4)*4]
7359 movhps xmm4, [edi+(3*6+4)*4]
7364 STORE2LO( 16, xmm3, xmm7 )
7369 for ( int i = 0; i < numColumns; i++ ) {
7370 dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
7371 *(mPtr+3*numColumns) * vPtr[3];
7379 switch( numColumns ) {
7385 movlps xmm0, [esi+0*4]
7386 movlps xmm1, [esi+2*4]
7387 movss xmm2, [esi+4*4]
7389 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7390 mulps xmm3, [edi+(0*6+0)*4]
7391 movlps xmm5, [edi+(1*6+0)*4]
7392 movhps xmm5, [edi+(1*6+2)*4]
7394 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7398 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7399 mulps xmm6, [edi+(2*6+0)*4]
7401 movlps xmm5, [edi+(3*6+0)*4]
7402 movhps xmm5, [edi+(3*6+2)*4]
7404 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7407 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
7409 mulps xmm4, [edi+(4*6+0)*4]
7411 STORE4( 0, xmm3, xmm7 )
7412 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7413 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7414 movlps xmm3, [edi+(0*6+4)*4]
7415 movhps xmm3, [edi+(1*6+4)*4]
7417 movlps xmm4, [edi+(2*6+4)*4]
7418 movhps xmm4, [edi+(3*6+4)*4]
7423 movlps xmm5, [edi+(4*6+4)*4]
7426 STORE2LO( 16, xmm3, xmm7 )
7431 for ( int i = 0; i < numColumns; i++ ) {
7432 dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
7433 *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
7441 switch( numColumns ) {
7448 movhps xmm0, [esi+8]
7449 movlps xmm1, [esi+16]
7451 mulps xmm1, [edi+16]
7452 shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
7456 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
7458 STORE1( 0, xmm2, xmm3 )
7467 movlps xmm0, [esi+0*4]
7468 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7469 movaps xmm6, [edi+0*4]
7471 movlps xmm1, [esi+2*4]
7472 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7473 movaps xmm7, [edi+4*4]
7476 movlps xmm2, [esi+4*4]
7477 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
7478 movaps xmm7, [edi+8*4]
7483 STORE2LO( 0, xmm3, xmm7 )
7492 movss xmm0, [edi+(0*3+2)*4]
7493 movhps xmm0, [edi+(0*3+0)*4]
7494 shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
7495 movss xmm6, [esi+0*4]
7496 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7498 movss xmm1, [edi+(1*3+0)*4]
7499 movhps xmm1, [edi+(1*3+1)*4]
7500 movss xmm7, [esi+1*4]
7501 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
7504 movss xmm2, [edi+(2*3+2)*4]
7505 movhps xmm2, [edi+(2*3+0)*4]
7506 shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
7507 movss xmm7, [esi+2*4]
7508 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
7511 movss xmm3, [edi+(3*3+0)*4]
7512 movhps xmm3, [edi+(3*3+1)*4]
7513 movss xmm7, [esi+3*4]
7514 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
7517 movss xmm4, [edi+(4*3+2)*4]
7518 movhps xmm4, [edi+(4*3+0)*4]
7519 shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
7520 movss xmm7, [esi+4*4]
7521 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
7524 movss xmm5, [edi+(5*3+0)*4]
7525 movhps xmm5, [edi+(5*3+1)*4]
7526 movss xmm7, [esi+5*4]
7527 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
7530 STORE1( 0, xmm6, xmm7 )
7531 STORE2HI( 4, xmm6, xmm7 )
7540 movlps xmm3, [edi+(0*4+0)*4]
7541 movhps xmm3, [edi+(0*4+2)*4]
7542 movss xmm4, [esi+0*4]
7543 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
7545 movlps xmm5, [edi+(1*4+0)*4]
7546 movhps xmm5, [edi+(1*4+2)*4]
7547 movss xmm6, [esi+1*4]
7548 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7551 movlps xmm4, [edi+(2*4+0)*4]
7552 movhps xmm4, [edi+(2*4+2)*4]
7553 movss xmm6, [esi+2*4]
7554 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7557 movlps xmm5, [edi+(3*4+0)*4]
7558 movhps xmm5, [edi+(3*4+2)*4]
7559 movss xmm6, [esi+3*4]
7560 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7563 movlps xmm4, [edi+(4*4+0)*4]
7564 movhps xmm4, [edi+(4*4+2)*4]
7565 movss xmm6, [esi+4*4]
7566 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7569 movlps xmm5, [edi+(5*4+0)*4]
7570 movhps xmm5, [edi+(5*4+2)*4]
7571 movss xmm6, [esi+5*4]
7572 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7575 STORE4( 0, xmm3, xmm7 )
7584 movlps xmm6, [edi+(0*5+0)*4]
7585 movhps xmm6, [edi+(0*5+2)*4]
7586 movss xmm0, [esi+0*4]
7587 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
7589 movlps xmm7, [edi+(1*5+0)*4]
7590 movhps xmm7, [edi+(1*5+2)*4]
7591 movss xmm1, [esi+1*4]
7592 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
7595 movlps xmm7, [edi+(2*5+0)*4]
7596 movhps xmm7, [edi+(2*5+2)*4]
7597 movss xmm2, [esi+2*4]
7598 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
7601 movlps xmm7, [edi+(3*5+0)*4]
7602 movhps xmm7, [edi+(3*5+2)*4]
7603 movss xmm3, [esi+3*4]
7604 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7607 movlps xmm7, [edi+(4*5+0)*4]
7608 movhps xmm7, [edi+(4*5+2)*4]
7609 movss xmm4, [esi+4*4]
7610 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
7613 movlps xmm7, [edi+(5*5+0)*4]
7614 movhps xmm7, [edi+(5*5+2)*4]
7615 movss xmm5, [esi+5*4]
7616 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
7619 STORE4( 0, xmm6, xmm7 )
7620 movss xmm6, [edi+(0*5+4)*4]
7622 movss xmm7, [edi+(1*5+4)*4]
7625 movss xmm7, [edi+(2*5+4)*4]
7628 movss xmm7, [edi+(3*5+4)*4]
7631 movss xmm7, [edi+(4*5+4)*4]
7634 movss xmm7, [edi+(5*5+4)*4]
7637 STORE1( 16, xmm6, xmm7 )
7646 movlps xmm0, [esi+0*4]
7647 movlps xmm1, [esi+2*4]
7648 movlps xmm2, [esi+4*4]
7650 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7651 mulps xmm3, [edi+(0*6+0)*4]
7652 movlps xmm5, [edi+(1*6+0)*4]
7653 movhps xmm5, [edi+(1*6+2)*4]
7655 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7659 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7660 mulps xmm6, [edi+(2*6+0)*4]
7663 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7664 movlps xmm5, [edi+(3*6+0)*4]
7665 movhps xmm5, [edi+(3*6+2)*4]
7669 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7670 mulps xmm6, [edi+(4*6+0)*4]
7673 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7674 movlps xmm5, [edi+(5*6+0)*4]
7675 movhps xmm5, [edi+(5*6+2)*4]
7678 STORE4( 0, xmm3, xmm7 )
7679 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7680 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7681 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
7682 movlps xmm3, [edi+(0*6+4)*4]
7683 movhps xmm3, [edi+(1*6+4)*4]
7685 movlps xmm4, [edi+(2*6+4)*4]
7686 movhps xmm4, [edi+(3*6+4)*4]
7689 movlps xmm5, [edi+(4*6+4)*4]
7690 movhps xmm5, [edi+(5*6+4)*4]
7695 STORE2LO( 16, xmm3, xmm7 )
7700 for ( int i = 0; i < numColumns; i++ ) {
7701 dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
7702 *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
7711 for ( int i = 0; i < numColumns; i++ ) {
7713 float sum = mPtr[0] * vPtr[0];
7714 for ( int j = 1; j < numRows; j++ ) {
7716 sum += mPtr[0] * vPtr[j];
7718 dstPtr[i] STOREC sum;
7743 #define STORE1( offset, reg1, reg2 ) \
7744 __asm movss reg2, [eax+offset] \
7745 __asm subss reg2, reg1 \
7746 __asm movss [eax+offset], reg2
7747 #define STORE2LO( offset, reg1, reg2 ) \
7748 __asm movlps reg2, [eax+offset] \
7749 __asm subps reg2, reg1 \
7750 __asm movlps [eax+offset], reg2
7751 #define STORE2HI( offset, reg1, reg2 ) \
7752 __asm movhps reg2, [eax+offset] \
7753 __asm subps reg2, reg1 \
7754 __asm movhps [eax+offset], reg2
7755 #define STORE4( offset, reg1, reg2 ) \
7756 __asm movlps reg2, [eax+offset] \
7757 __asm movhps reg2, [eax+offset+8] \
7758 __asm subps reg2, reg1 \
7759 __asm movlps [eax+offset], reg2 \
7760 __asm movhps [eax+offset+8], reg2
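// Subtracting variant of the transposed multiply: identical structure, but the
// STORE* macros use subss/subps, so the kernels below effectively perform
// dst -= M^T * v.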
7764 const float *mPtr, *vPtr;
7776 switch( numColumns ) {
7783 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
7786 mulps xmm1, [edi+16]
7787 STORE4( 0, xmm0, xmm2 )
7788 STORE2LO( 16, xmm1, xmm3 )
7793 for ( int i = 0; i < numColumns; i++ ) {
7794 dstPtr[i] STOREC *(mPtr) * vPtr[0];
7802 switch( numColumns ) {
7810 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
7811 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
7814 movlps xmm3, [edi+24]
7815 movhps xmm3, [edi+32]
7818 shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
7819 movlps xmm4, [edi+16]
7820 movhps xmm4, [edi+40]
7824 STORE4( 0, xmm2, xmm5 )
7825 STORE2LO( 16, xmm3, xmm6 )
7830 for ( int i = 0; i < numColumns; i++ ) {
7831 dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
7839 switch( numColumns ) {
7845 movlps xmm0, [esi+0*4]
7846 movss xmm1, [esi+2*4]
7847 movlps xmm3, [edi+(0*6+0)*4]
7848 movhps xmm3, [edi+(0*6+2)*4]
7850 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
7852 movlps xmm5, [edi+(1*6+0)*4]
7853 movhps xmm5, [edi+(1*6+2)*4]
7855 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7858 movlps xmm4, [edi+(2*6+0)*4]
7859 movhps xmm4, [edi+(2*6+2)*4]
7860 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
7863 STORE4( 0, xmm3, xmm7 )
7864 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7865 movlps xmm3, [edi+(0*6+4)*4]
7866 movhps xmm3, [edi+(1*6+4)*4]
7870 movlps xmm5, [edi+(2*6+4)*4]
7873 STORE2LO( 16, xmm3, xmm7 )
7878 for ( int i = 0; i < numColumns; i++ ) {
7879 dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
7887 switch( numColumns ) {
7893 movlps xmm0, [esi+0*4]
7894 movlps xmm1, [esi+2*4]
7896 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7897 mulps xmm3, [edi+(0*6+0)*4]
7898 movlps xmm5, [edi+(1*6+0)*4]
7899 movhps xmm5, [edi+(1*6+2)*4]
7901 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7904 movlps xmm4, [edi+(2*6+0)*4]
7905 movhps xmm4, [edi+(2*6+2)*4]
7907 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7910 movlps xmm5, [edi+(3*6+0)*4]
7911 movhps xmm5, [edi+(3*6+2)*4]
7913 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7916 STORE4( 0, xmm3, xmm7 )
7917 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7918 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7919 movlps xmm3, [edi+(0*6+4)*4]
7920 movhps xmm3, [edi+(1*6+4)*4]
7922 movlps xmm4, [edi+(2*6+4)*4]
7923 movhps xmm4, [edi+(3*6+4)*4]
7928 STORE2LO( 16, xmm3, xmm7 )
7933 for ( int i = 0; i < numColumns; i++ ) {
7934 dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
7935 *(mPtr+3*numColumns) * vPtr[3];
7943 switch( numColumns ) {
7949 movlps xmm0, [esi+0*4]
7950 movlps xmm1, [esi+2*4]
7951 movss xmm2, [esi+4*4]
7953 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7954 mulps xmm3, [edi+(0*6+0)*4]
7955 movlps xmm5, [edi+(1*6+0)*4]
7956 movhps xmm5, [edi+(1*6+2)*4]
7958 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7962 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7963 mulps xmm6, [edi+(2*6+0)*4]
7965 movlps xmm5, [edi+(3*6+0)*4]
7966 movhps xmm5, [edi+(3*6+2)*4]
7968 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7971 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
7973 mulps xmm4, [edi+(4*6+0)*4]
7975 STORE4( 0, xmm3, xmm7 )
7976 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7977 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7978 movlps xmm3, [edi+(0*6+4)*4]
7979 movhps xmm3, [edi+(1*6+4)*4]
7981 movlps xmm4, [edi+(2*6+4)*4]
7982 movhps xmm4, [edi+(3*6+4)*4]
7987 movlps xmm5, [edi+(4*6+4)*4]
7990 STORE2LO( 16, xmm3, xmm7 )
7995 for ( int i = 0; i < numColumns; i++ ) {
7996 dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
7997 *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
8005 switch( numColumns ) {
8012 movhps xmm0, [esi+8]
8013 movlps xmm1, [esi+16]
8015 mulps xmm1, [edi+16]
8016 shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
8020 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
8022 STORE1( 0, xmm2, xmm3 )
8031 movlps xmm0, [esi+0*4]
8032 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
8033 movaps xmm6, [edi+0*4]
8035 movlps xmm1, [esi+2*4]
8036 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
8037 movaps xmm7, [edi+4*4]
8040 movlps xmm2, [esi+4*4]
8041 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
8042 movaps xmm7, [edi+8*4]
8047 STORE2LO( 0, xmm3, xmm7 )
8056 movss xmm0, [edi+(0*3+2)*4]
8057 movhps xmm0, [edi+(0*3+0)*4]
8058 shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
8059 movss xmm6, [esi+0*4]
8060 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8062 movss xmm1, [edi+(1*3+0)*4]
8063 movhps xmm1, [edi+(1*3+1)*4]
8064 movss xmm7, [esi+1*4]
8065 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
8068 movss xmm2, [edi+(2*3+2)*4]
8069 movhps xmm2, [edi+(2*3+0)*4]
8070 shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
8071 movss xmm7, [esi+2*4]
8072 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
8075 movss xmm3, [edi+(3*3+0)*4]
8076 movhps xmm3, [edi+(3*3+1)*4]
8077 movss xmm7, [esi+3*4]
8078 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
8081 movss xmm4, [edi+(4*3+2)*4]
8082 movhps xmm4, [edi+(4*3+0)*4]
8083 shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
8084 movss xmm7, [esi+4*4]
8085 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
8088 movss xmm5, [edi+(5*3+0)*4]
8089 movhps xmm5, [edi+(5*3+1)*4]
8090 movss xmm7, [esi+5*4]
8091 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
8094 STORE1( 0, xmm6, xmm7 )
8095 STORE2HI( 4, xmm6, xmm7 )
8104 movlps xmm3, [edi+(0*4+0)*4]
8105 movhps xmm3, [edi+(0*4+2)*4]
8106 movss xmm4, [esi+0*4]
8107 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
8109 movlps xmm5, [edi+(1*4+0)*4]
8110 movhps xmm5, [edi+(1*4+2)*4]
8111 movss xmm6, [esi+1*4]
8112 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8115 movlps xmm4, [edi+(2*4+0)*4]
8116 movhps xmm4, [edi+(2*4+2)*4]
8117 movss xmm6, [esi+2*4]
8118 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8121 movlps xmm5, [edi+(3*4+0)*4]
8122 movhps xmm5, [edi+(3*4+2)*4]
8123 movss xmm6, [esi+3*4]
8124 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8127 movlps xmm4, [edi+(4*4+0)*4]
8128 movhps xmm4, [edi+(4*4+2)*4]
8129 movss xmm6, [esi+4*4]
8130 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8133 movlps xmm5, [edi+(5*4+0)*4]
8134 movhps xmm5, [edi+(5*4+2)*4]
8135 movss xmm6, [esi+5*4]
8136 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8139 STORE4( 0, xmm3, xmm7 )
8148 movlps xmm6, [edi+(0*5+0)*4]
8149 movhps xmm6, [edi+(0*5+2)*4]
8150 movss xmm0, [esi+0*4]
8151 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
8153 movlps xmm7, [edi+(1*5+0)*4]
8154 movhps xmm7, [edi+(1*5+2)*4]
8155 movss xmm1, [esi+1*4]
8156 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
8159 movlps xmm7, [edi+(2*5+0)*4]
8160 movhps xmm7, [edi+(2*5+2)*4]
8161 movss xmm2, [esi+2*4]
8162 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
8165 movlps xmm7, [edi+(3*5+0)*4]
8166 movhps xmm7, [edi+(3*5+2)*4]
8167 movss xmm3, [esi+3*4]
8168 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
8171 movlps xmm7, [edi+(4*5+0)*4]
8172 movhps xmm7, [edi+(4*5+2)*4]
8173 movss xmm4, [esi+4*4]
8174 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
8177 movlps xmm7, [edi+(5*5+0)*4]
8178 movhps xmm7, [edi+(5*5+2)*4]
8179 movss xmm5, [esi+5*4]
8180 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
8183 STORE4( 0, xmm6, xmm7 )
8184 movss xmm6, [edi+(0*5+4)*4]
8186 movss xmm7, [edi+(1*5+4)*4]
8189 movss xmm7, [edi+(2*5+4)*4]
8192 movss xmm7, [edi+(3*5+4)*4]
8195 movss xmm7, [edi+(4*5+4)*4]
8198 movss xmm7, [edi+(5*5+4)*4]
8201 STORE1( 16, xmm6, xmm7 )
8210 movlps xmm0, [esi+0*4]
8211 movlps xmm1, [esi+2*4]
8212 movlps xmm2, [esi+4*4]
8214 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
8215 mulps xmm3, [edi+(0*6+0)*4]
8216 movlps xmm5, [edi+(1*6+0)*4]
8217 movhps xmm5, [edi+(1*6+2)*4]
8219 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
8223 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8224 mulps xmm6, [edi+(2*6+0)*4]
8227 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
8228 movlps xmm5, [edi+(3*6+0)*4]
8229 movhps xmm5, [edi+(3*6+2)*4]
8233 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8234 mulps xmm6, [edi+(4*6+0)*4]
8237 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
8238 movlps xmm5, [edi+(5*6+0)*4]
8239 movhps xmm5, [edi+(5*6+2)*4]
8242 STORE4( 0, xmm3, xmm7 )
8243 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
8244 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
8245 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
8246 movlps xmm3, [edi+(0*6+4)*4]
8247 movhps xmm3, [edi+(1*6+4)*4]
8249 movlps xmm4, [edi+(2*6+4)*4]
8250 movhps xmm4, [edi+(3*6+4)*4]
8253 movlps xmm5, [edi+(4*6+4)*4]
8254 movhps xmm5, [edi+(5*6+4)*4]
8259 STORE2LO( 16, xmm3, xmm7 )
8264 for ( int i = 0; i < numColumns; i++ ) {
8265 dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
8266 *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
8275 for ( int i = 0; i < numColumns; i++ ) {
8277 float sum = mPtr[0] * vPtr[0];
8278 for ( int j = 1; j < numRows; j++ ) {
8280 sum += mPtr[0] * vPtr[j];
8282 dstPtr[i] STOREC sum;
8315 const float *m1Ptr, *m2Ptr;
8337 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
8341 movlps xmm2, [esi+16]
8343 movlps [eax+16], xmm2
8354 movlps xmm1, [edi+16]
8361 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
8363 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
8365 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
8370 movaps [eax+16], xmm5
8371 movaps [eax+32], xmm6
8374 shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 )
8376 shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 )
8377 shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
8381 movaps [eax+48], xmm4
8382 movaps [eax+64], xmm5
8383 movaps [eax+80], xmm3
8385 movlps xmm3, [esi+16]
8387 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
8389 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
8390 shufps xmm3, xmm3, R_SHUFFLEPS( 1, 1, 1, 1 )
8394 movaps [eax+96], xmm4
8395 movaps [eax+112], xmm5
8396 movaps [eax+128], xmm3
8402 for ( i = 0; i < k; i++ ) {
8404 for ( j = 0; j < l; j++ ) {
8405 *dstPtr++ = m1Ptr[0] * m2Ptr[0];
8417 #define MUL_Nx2_2x6_INIT \
8418 __asm mov esi, m2Ptr \
8419 __asm mov edi, m1Ptr \
8420 __asm mov eax, dstPtr \
8421 __asm movaps xmm0, [esi] \
8422 __asm movlps xmm1, [esi+16] \
8423 __asm movhps xmm1, [esi+40] \
8424 __asm movlps xmm2, [esi+24] \
8425 __asm movhps xmm2, [esi+32]
8427 #define MUL_Nx2_2x6_ROW2( row ) \
8428 __asm movaps xmm3, [edi+row*16] \
8429 __asm movaps xmm5, xmm0 \
8430 __asm movaps xmm4, xmm3 \
8431 __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8432 __asm mulps xmm5, xmm4 \
8433 __asm movaps xmm4, xmm3 \
8434 __asm movaps xmm6, xmm2 \
8435 __asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 1, 1 ) \
8436 __asm mulps xmm6, xmm4 \
8437 __asm addps xmm5, xmm6 \
8438 __asm movaps [eax+row*48], xmm5 \
8439 __asm movaps xmm4, xmm3 \
8440 __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \
8441 __asm movaps xmm7, xmm1 \
8442 __asm mulps xmm7, xmm4 \
8443 __asm movaps xmm4, xmm3 \
8444 __asm movaps xmm5, xmm0 \
8445 __asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 ) \
8446 __asm mulps xmm5, xmm4 \
8447 __asm movaps xmm4, xmm3 \
8448 __asm movaps xmm6, xmm2 \
8449 __asm shufps xmm4, xmm4, R_SHUFFLEPS( 3, 3, 3, 3 ) \
8450 __asm mulps xmm6, xmm4 \
8451 __asm addps xmm5, xmm6 \
8452 __asm shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 ) \
8453 __asm movaps xmm6, xmm1 \
8454 __asm mulps xmm6, xmm3 \
8455 __asm movaps xmm4, xmm7 \
8456 __asm movlhps xmm7, xmm6 \
8457 __asm movhlps xmm6, xmm4 \
8458 __asm addps xmm6, xmm7 \
8459 __asm movlps [eax+row*48+16], xmm6 \
8460 __asm movlps [eax+row*48+24], xmm5 \
8461 __asm movhps [eax+row*48+32], xmm5 \
8462 __asm movhps [eax+row*48+40], xmm6
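// MUL_Nx2_2x6_INIT keeps the 2x6 right-hand matrix resident in registers:
// xmm0 holds the first four columns of row 0, xmm2 the first four columns of
// row 1, and xmm1 the last two columns of both rows. MUL_Nx2_2x6_ROW2 appears
// to consume two rows of the Nx2 left-hand matrix per invocation ([edi+row*16]
// is four floats) and writes two 6-float result rows per 48-byte block of
// dstPtr, so the single invocation below handles a two-row case and the three
// invocations a six-row case.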
8465 MUL_Nx2_2x6_ROW2( 0 )
8472 MUL_Nx2_2x6_ROW2( 0 )
8473 MUL_Nx2_2x6_ROW2( 1 )
8474 MUL_Nx2_2x6_ROW2( 2 )
8480 for ( i = 0; i < k; i++ ) {
8482 for ( j = 0; j < l; j++ ) {
8483 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l];
8498 movaps xmm5, xmmword ptr [esi]
8499 movlps xmm6, qword ptr [esi+24]
8500 movhps xmm6, qword ptr [esi+32]
8501 movaps xmm7, xmmword ptr [esi+48]
8502 movss xmm0, dword ptr [edi]
8503 shufps xmm0, xmm0, 0
8505 movss xmm1, dword ptr [edi+4]
8506 shufps xmm1, xmm1, 0
8508 movss xmm2, dword ptr [edi+8]
8509 shufps xmm2, xmm2, 0
8513 movaps xmmword ptr [eax], xmm0
8514 movss xmm3, dword ptr [edi+12]
8515 shufps xmm3, xmm3, 0
8517 movss xmm4, dword ptr [edi+16]
8518 shufps xmm4, xmm4, 0
8520 movss xmm0, dword ptr [edi+20]
8521 shufps xmm0, xmm0, 0
8525 movlps qword ptr [eax+24], xmm0
8526 movhps qword ptr [eax+32], xmm0
8527 movss xmm1, dword ptr [edi+24]
8528 shufps xmm1, xmm1, 0
8530 movss xmm2, dword ptr [edi+28]
8531 shufps xmm2, xmm2, 0
8533 movss xmm3, dword ptr [edi+32]
8534 shufps xmm3, xmm3, 0
8538 movaps xmmword ptr [eax+48], xmm1
8539 movlps xmm5, qword ptr [esi+16]
8540 movlps xmm6, qword ptr [esi+40]
8541 movlps xmm7, qword ptr [esi+64]
8542 shufps xmm5, xmm5, 0x44
8543 shufps xmm6, xmm6, 0x44
8544 shufps xmm7, xmm7, 0x44
8545 movaps xmm3, xmmword ptr [edi]
8546 movlps xmm4, qword ptr [edi+16]
8548 shufps xmm0, xmm0, 0xF0
8551 shufps xmm1, xmm4, 0x05
8553 shufps xmm3, xmm4, 0x5A
8557 movlps qword ptr [eax+16], xmm1
8558 movhps qword ptr [eax+40], xmm1
8559 movss xmm0, dword ptr [edi+24]
8560 shufps xmm0, xmm0, 0
8562 movss xmm2, dword ptr [edi+28]
8563 shufps xmm2, xmm2, 0
8565 movss xmm4, dword ptr [edi+32]
8566 shufps xmm4, xmm4, 0
8570 movlps qword ptr [eax+64], xmm0
8575 #define MUL_Nx3_3x6_FIRST4COLUMNS_INIT \
8576 __asm mov esi, m2Ptr \
8577 __asm mov edi, m1Ptr \
8578 __asm mov eax, dstPtr \
8579 __asm movlps xmm0, [esi+ 0*4] \
8580 __asm movhps xmm0, [esi+ 2*4] \
8581 __asm movlps xmm1, [esi+ 6*4] \
8582 __asm movhps xmm1, [esi+ 8*4] \
8583 __asm movlps xmm2, [esi+12*4] \
8584 __asm movhps xmm2, [esi+14*4]
8586 #define MUL_Nx3_3x6_FIRST4COLUMNS_ROW( row ) \
8587 __asm movss xmm3, [edi+(row*3+0)*4] \
8588 __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8589 __asm mulps xmm3, xmm0 \
8590 __asm movss xmm4, [edi+(row*3+1)*4] \
8591 __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8592 __asm mulps xmm4, xmm1 \
8593 __asm addps xmm3, xmm4 \
8594 __asm movss xmm5, [edi+(row*3+2)*4] \
8595 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8596 __asm mulps xmm5, xmm2 \
8597 __asm addps xmm3, xmm5 \
8598 __asm movlps [eax+(row*6+0)*4], xmm3 \
8599 __asm movhps [eax+(row*6+2)*4], xmm3
8601 #define MUL_Nx3_3x6_LAST2COLUMNS_ROW6 \
8602 __asm movlps xmm0, [esi+ 4*4] \
8603 __asm movlps xmm1, [esi+10*4] \
8604 __asm movlps xmm2, [esi+16*4] \
8605 __asm shufps xmm0, xmm0, 0x44 \
8606 __asm shufps xmm1, xmm1, 0x44 \
8607 __asm shufps xmm2, xmm2, 0x44 \
8608 __asm movlps xmm3, [edi+0*4] \
8609 __asm movhps xmm3, [edi+2*4] \
8610 __asm movaps xmm4, xmm3 \
8611 __asm movaps xmm5, xmm3 \
8612 __asm shufps xmm3, xmm3, 0xF0 \
8613 __asm mulps xmm3, xmm0 \
8614 __asm movlps xmm6, [edi+4*4] \
8615 __asm movhps xmm6, [edi+6*4] \
8616 __asm shufps xmm4, xmm6, 0x05 \
8617 __asm mulps xmm4, xmm1 \
8618 __asm addps xmm3, xmm4 \
8619 __asm shufps xmm5, xmm6, 0x5A \
8620 __asm mulps xmm5, xmm2 \
8621 __asm addps xmm3, xmm5 \
8622 __asm movlps [eax+4*4], xmm3 \
8623 __asm movhps [eax+10*4], xmm3 \
8624 __asm movaps xmm5, xmm6 \
8625 __asm movlps xmm3, [edi+8*4] \
8626 __asm movhps xmm3, [edi+10*4] \
8627 __asm movaps xmm4, xmm3 \
8628 __asm shufps xmm5, xmm3, 0x5A \
8629 __asm mulps xmm5, xmm0 \
8630 __asm shufps xmm6, xmm3, 0xAF \
8631 __asm mulps xmm6, xmm1 \
8632 __asm addps xmm5, xmm6 \
8633 __asm shufps xmm4, xmm4, 0xF0 \
8634 __asm mulps xmm4, xmm2 \
8635 __asm addps xmm4, xmm5 \
8636 __asm movlps [eax+16*4], xmm4 \
8637 __asm movhps [eax+22*4], xmm4 \
8638 __asm movlps xmm6, [edi+12*4] \
8639 __asm movhps xmm6, [edi+14*4] \
8640 __asm movaps xmm5, xmm6 \
8641 __asm movaps xmm4, xmm6 \
8642 __asm shufps xmm6, xmm6, 0xF0 \
8643 __asm mulps xmm6, xmm0 \
8644 __asm movlps xmm3, [edi+16*4] \
8645 __asm shufps xmm5, xmm3, 0x05 \
8646 __asm mulps xmm5, xmm1 \
8647 __asm addps xmm5, xmm6 \
8648 __asm shufps xmm4, xmm3, 0x5A \
8649 __asm mulps xmm4, xmm2 \
8650 __asm addps xmm4, xmm5 \
8651 __asm movlps [eax+28*4], xmm4 \
8652 __asm movhps [eax+34*4], xmm4
8654 MUL_Nx3_3x6_FIRST4COLUMNS_INIT
8655 MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 0 )
8656 MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 1 )
8657 MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 2 )
8658 MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 3 )
8659 MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 4 )
8660 MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 5 )
8661 MUL_Nx3_3x6_LAST2COLUMNS_ROW6
8667 for ( i = 0; i < k; i++ ) {
8669 for ( j = 0; j < l; j++ ) {
8670 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l];
8682 #define MUL_Nx4_4x6_FIRST4COLUMNS_INIT \
8683 __asm mov esi, m2Ptr \
8684 __asm mov edi, m1Ptr \
8685 __asm mov eax, dstPtr \
8686 __asm movlps xmm0, [esi+ 0*4] \
8687 __asm movhps xmm0, [esi+ 2*4] \
8688 __asm movlps xmm1, [esi+ 6*4] \
8689 __asm movhps xmm1, [esi+ 8*4] \
8690 __asm movlps xmm2, [esi+12*4] \
8691 __asm movhps xmm2, [esi+14*4] \
8692 __asm movlps xmm3, [esi+18*4] \
8693 __asm movhps xmm3, [esi+20*4]
8695 #define MUL_Nx4_4x6_FIRST4COLUMNS_ROW( row ) \
8696 __asm movss xmm4, [edi+row*16+0*4] \
8697 __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8698 __asm mulps xmm4, xmm0 \
8699 __asm movss xmm5, [edi+row*16+1*4] \
8700 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8701 __asm mulps xmm5, xmm1 \
8702 __asm addps xmm4, xmm5 \
8703 __asm movss xmm6, [edi+row*16+2*4] \
8704 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8705 __asm mulps xmm6, xmm2 \
8706 __asm addps xmm4, xmm6 \
8707 __asm movss xmm7, [edi+row*16+3*4] \
8708 __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8709 __asm mulps xmm7, xmm3 \
8710 __asm addps xmm4, xmm7 \
8711 __asm movlps [eax+row*24+0], xmm4 \
8712 __asm movhps [eax+row*24+8], xmm4
8714 #define MUL_Nx4_4x6_LAST2COLUMNS_INIT \
8715 __asm movlps xmm0, [esi+ 4*4] \
8716 __asm movlps xmm1, [esi+10*4] \
8717 __asm movlps xmm2, [esi+16*4] \
8718 __asm movlps xmm3, [esi+22*4] \
8719 __asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
8720 __asm shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
8721 __asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
8722 __asm shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
8724 #define MUL_Nx4_4x6_LAST2COLUMNS_ROW2( row ) \
8725 __asm movlps xmm7, [edi+row*32+ 0*4] \
8726 __asm movhps xmm7, [edi+row*32+ 4*4] \
8727 __asm movaps xmm6, xmm7 \
8728 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 3, 3 ) \
8729 __asm mulps xmm6, xmm0 \
8730 __asm shufps xmm7, xmm7, R_SHUFFLEPS( 1, 1, 2, 2 ) \
8731 __asm mulps xmm7, xmm1 \
8732 __asm addps xmm6, xmm7 \
8733 __asm movlps xmm4, [edi+row*32+ 2*4] \
8734 __asm movhps xmm4, [edi+row*32+ 6*4] \
8735 __asm movaps xmm5, xmm4 \
8736 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 3, 3 ) \
8737 __asm mulps xmm5, xmm2 \
8738 __asm addps xmm6, xmm5 \
8739 __asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 2, 2 ) \
8740 __asm mulps xmm4, xmm3 \
8741 __asm addps xmm6, xmm4 \
8742 __asm movlps [eax+row*48+ 4*4], xmm6 \
8743 __asm movhps [eax+row*48+10*4], xmm6
8745 MUL_Nx4_4x6_FIRST4COLUMNS_INIT
8746 MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
8747 MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
8748 MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
8749 MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
8750 MUL_Nx4_4x6_LAST2COLUMNS_INIT
8751 MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
8752 MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
8758 MUL_Nx4_4x6_FIRST4COLUMNS_INIT
8759 MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
8760 MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
8761 MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
8762 MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
8763 MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 4 )
8764 MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 5 )
8765 MUL_Nx4_4x6_LAST2COLUMNS_INIT
8766 MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
8767 MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
8768 MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 2 )
8774 for ( i = 0; i < k; i++ ) {
8776 for ( j = 0; j < l; j++ ) {
8777 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
8778 m1Ptr[3] * m2Ptr[3*l];
8790 #define MUL_Nx5_5x6_FIRST4COLUMNS_INIT \
8791 __asm mov esi, m2Ptr \
8792 __asm mov edi, m1Ptr \
8793 __asm mov eax, dstPtr \
8794 __asm movlps xmm0, [esi+ 0*4] \
8795 __asm movhps xmm0, [esi+ 2*4] \
8796 __asm movlps xmm1, [esi+ 6*4] \
8797 __asm movhps xmm1, [esi+ 8*4] \
8798 __asm movlps xmm2, [esi+12*4] \
8799 __asm movhps xmm2, [esi+14*4] \
8800 __asm movlps xmm3, [esi+18*4] \
8801 __asm movhps xmm3, [esi+20*4] \
8802 __asm movlps xmm4, [esi+24*4] \
8803 __asm movhps xmm4, [esi+26*4]
8805 #define MUL_Nx5_5x6_FIRST4COLUMNS_ROW( row ) \
8806 __asm movss xmm6, [edi+row*20+0*4] \
8807 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8808 __asm mulps xmm6, xmm0 \
8809 __asm movss xmm5, [edi+row*20+1*4] \
8810 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8811 __asm mulps xmm5, xmm1 \
8812 __asm addps xmm6, xmm5 \
8813 __asm movss xmm5, [edi+row*20+2*4] \
8814 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8815 __asm mulps xmm5, xmm2 \
8816 __asm addps xmm6, xmm5 \
8817 __asm movss xmm5, [edi+row*20+3*4] \
8818 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8819 __asm mulps xmm5, xmm3 \
8820 __asm addps xmm6, xmm5 \
8821 __asm movss xmm5, [edi+row*20+4*4] \
8822 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
8823 __asm mulps xmm5, xmm4 \
8824 __asm addps xmm6, xmm5 \
8825 __asm movlps [eax+row*24+0], xmm6 \
8826 __asm movhps [eax+row*24+8], xmm6
8828 #define MUL_Nx5_5x6_LAST2COLUMNS_INIT \
8829 __asm movlps xmm0, [esi+ 4*4] \
8830 __asm movlps xmm1, [esi+10*4] \
8831 __asm movlps xmm2, [esi+16*4] \
8832 __asm movlps xmm3, [esi+22*4] \
8833 __asm movlps xmm4, [esi+28*4] \
8834 __asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
8835 __asm shufps xmm1, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \
8836 __asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
8837 __asm shufps xmm3, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \
8838 __asm shufps xmm4, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )
8840 #define MUL_Nx5_5x6_LAST2COLUMNS_ROW2( row ) \
8841 __asm movlps xmm7, [edi+row*40+ 0*4] \
8842 __asm movhps xmm7, [edi+row*40+ 6*4] \
8843 __asm movaps xmm6, xmm7 \
8844 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 2, 2 ) \
8845 __asm mulps xmm6, xmm0 \
8846 __asm movaps xmm5, xmm7 \
8847 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \
8848 __asm mulps xmm5, xmm1 \
8849 __asm addps xmm6, xmm5 \
8850 __asm movlps xmm7, [edi+row*40+ 2*4] \
8851 __asm movhps xmm7, [edi+row*40+ 8*4] \
8852 __asm movaps xmm5, xmm7 \
8853 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 2, 2 ) \
8854 __asm mulps xmm5, xmm2 \
8855 __asm addps xmm6, xmm5 \
8856 __asm movaps xmm5, xmm7 \
8857 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \
8858 __asm mulps xmm5, xmm3 \
8859 __asm addps xmm6, xmm5 \
8860 __asm movlps xmm5, [edi+row*40+ 4*4] \
8861 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \
8862 __asm mulps xmm5, xmm4 \
8863 __asm addps xmm6, xmm5 \
8864 __asm movlps [eax+row*48+ 4*4], xmm6 \
8865 __asm movhps [eax+row*48+10*4], xmm6
8867 #define MUL_Nx5_5x6_LAST2COLUMNS_ROW( row ) \
8868 __asm movlps xmm6, [edi+20*4+0*4] \
8869 __asm unpcklps xmm6, xmm6 \
8870 __asm mulps xmm6, xmm0 \
8871 __asm movlps xmm5, [edi+20*4+2*4] \
8872 __asm unpcklps xmm5, xmm5 \
8873 __asm mulps xmm5, xmm2 \
8874 __asm addps xmm6, xmm5 \
8875 __asm movss xmm5, [edi+20*4+4*4] \
8876 __asm unpcklps xmm5, xmm5 \
8877 __asm mulps xmm5, xmm4 \
8878 __asm addps xmm6, xmm5 \
8879 __asm movhlps xmm7, xmm6 \
8880 __asm addps xmm6, xmm7 \
8881 __asm movlps [eax+row*24+4*4], xmm6
8883 MUL_Nx5_5x6_FIRST4COLUMNS_INIT
8884 MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
8885 MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
8886 MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
8887 MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
8888 MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
8889 MUL_Nx5_5x6_LAST2COLUMNS_INIT
8890 MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
8891 MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
8892 MUL_Nx5_5x6_LAST2COLUMNS_ROW( 4 )
8898 MUL_Nx5_5x6_FIRST4COLUMNS_INIT
8899 MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
8900 MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
8901 MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
8902 MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
8903 MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
8904 MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 5 )
8905 MUL_Nx5_5x6_LAST2COLUMNS_INIT
8906 MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
8907 MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
8908 MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 2 )
8914 for ( i = 0; i < k; i++ ) {
8916 for ( j = 0; j < l; j++ ) {
8917 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
8918 m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l];
8929 dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] +
8930 m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5];
8938 #define MUL_Nx6_6x2_INIT \
8939 __asm mov esi, m2Ptr \
8940 __asm mov edi, m1Ptr \
8941 __asm mov eax, dstPtr \
8942 __asm movaps xmm0, [esi] \
8943 __asm movaps xmm1, [esi+16] \
8944 __asm movaps xmm2, [esi+32]
8946 #define MUL_Nx6_6x2_ROW2( row ) \
8947 __asm movaps xmm7, [edi+row*48+0*4] \
8948 __asm movaps xmm6, xmm7 \
8949 __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
8950 __asm mulps xmm7, xmm0 \
8951 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 2, 2, 3, 3 ) \
8952 __asm mulps xmm6, xmm1 \
8953 __asm addps xmm7, xmm6 \
8954 __asm movaps xmm6, [edi+row*48+4*4] \
8955 __asm movaps xmm5, xmm6 \
8956 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
8957 __asm mulps xmm6, xmm2 \
8958 __asm addps xmm7, xmm6 \
8959 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 ) \
8960 __asm mulps xmm5, xmm0 \
8961 __asm movaps xmm6, [edi+row*48+24+2*4] \
8962 __asm movaps xmm4, xmm6 \
8963 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
8964 __asm mulps xmm6, xmm1 \
8965 __asm addps xmm5, xmm6 \
8966 __asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 3, 3 ) \
8967 __asm mulps xmm4, xmm2 \
8968 __asm addps xmm5, xmm4 \
8969 __asm movaps xmm4, xmm5 \
8970 __asm movhlps xmm5, xmm7 \
8971 __asm movlhps xmm7, xmm4 \
8972 __asm addps xmm7, xmm5 \
8973 __asm movaps [eax+row*16], xmm7
8976 MUL_Nx6_6x2_ROW2( 0 )
8985 #define MUL_Nx6_6x3_INIT \
8986 __asm mov esi, m2Ptr \
8987 __asm mov edi, m1Ptr \
8988 __asm mov eax, dstPtr \
8989 __asm movss xmm0, [esi+ 0*4] \
8990 __asm movhps xmm0, [esi+ 1*4] \
8991 __asm movss xmm1, [esi+ 3*4] \
8992 __asm movhps xmm1, [esi+ 4*4] \
8993 __asm movss xmm2, [esi+ 6*4] \
8994 __asm movhps xmm2, [esi+ 7*4] \
8995 __asm movss xmm3, [esi+ 9*4] \
8996 __asm movhps xmm3, [esi+10*4] \
8997 __asm movss xmm4, [esi+12*4] \
8998 __asm movhps xmm4, [esi+13*4] \
8999 __asm movss xmm5, [esi+15*4] \
9000 __asm movhps xmm5, [esi+16*4]
9002 #define MUL_Nx6_6x3_ROW( row ) \
9003 __asm movss xmm7, [edi+row*24+0] \
9004 __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9005 __asm mulps xmm7, xmm0 \
9006 __asm movss xmm6, [edi+row*24+4] \
9007 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9008 __asm mulps xmm6, xmm1 \
9009 __asm addps xmm7, xmm6 \
9010 __asm movss xmm6, [edi+row*24+8] \
9011 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9012 __asm mulps xmm6, xmm2 \
9013 __asm addps xmm7, xmm6 \
9014 __asm movss xmm6, [edi+row*24+12] \
9015 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9016 __asm mulps xmm6, xmm3 \
9017 __asm addps xmm7, xmm6 \
9018 __asm movss xmm6, [edi+row*24+16] \
9019 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9020 __asm mulps xmm6, xmm4 \
9021 __asm addps xmm7, xmm6 \
9022 __asm movss xmm6, [edi+row*24+20] \
9023 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9024 __asm mulps xmm6, xmm5 \
9025 __asm addps xmm7, xmm6 \
9026 __asm movss [eax+row*12+0], xmm7 \
9027 __asm movhps [eax+row*12+4], xmm7
9030 MUL_Nx6_6x3_ROW( 0 )
9031 MUL_Nx6_6x3_ROW( 1 )
9032 MUL_Nx6_6x3_ROW( 2 )
9041 #define MUL_Nx6_6x4_INIT \
9042 __asm mov esi, m2Ptr \
9043 __asm mov edi, m1Ptr \
9044 __asm mov eax, dstPtr \
9045 __asm movaps xmm0, [esi] \
9046 __asm movaps xmm1, [esi+16] \
9047 __asm movaps xmm2, [esi+32] \
9048 __asm movaps xmm3, [esi+48] \
9049 __asm movaps xmm4, [esi+64] \
9050 __asm movaps xmm5, [esi+80]
9052 #define MUL_Nx6_6x4_ROW( row ) \
9053 __asm movss xmm7, [edi+row*24+0] \
9054 __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9055 __asm mulps xmm7, xmm0 \
9056 __asm movss xmm6, [edi+row*24+4] \
9057 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9058 __asm mulps xmm6, xmm1 \
9059 __asm addps xmm7, xmm6 \
9060 __asm movss xmm6, [edi+row*24+8] \
9061 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9062 __asm mulps xmm6, xmm2 \
9063 __asm addps xmm7, xmm6 \
9064 __asm movss xmm6, [edi+row*24+12] \
9065 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9066 __asm mulps xmm6, xmm3 \
9067 __asm addps xmm7, xmm6 \
9068 __asm movss xmm6, [edi+row*24+16] \
9069 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9070 __asm mulps xmm6, xmm4 \
9071 __asm addps xmm7, xmm6 \
9072 __asm movss xmm6, [edi+row*24+20] \
9073 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9074 __asm mulps xmm6, xmm5 \
9075 __asm addps xmm7, xmm6 \
9076 __asm movaps [eax+row*16], xmm7
9079 MUL_Nx6_6x4_ROW( 0 )
9080 MUL_Nx6_6x4_ROW( 1 )
9081 MUL_Nx6_6x4_ROW( 2 )
9082 MUL_Nx6_6x4_ROW( 3 )
9091 #define MUL_Nx6_6x5_INIT \
9092 __asm mov esi, m2Ptr \
9093 __asm mov edi, m1Ptr \
9094 __asm mov eax, dstPtr \
9095 __asm movaps xmm0, [esi] \
9096 __asm movlps xmm1, [esi+20] \
9097 __asm movhps xmm1, [esi+28] \
9098 __asm movlps xmm2, [esi+40] \
9099 __asm movhps xmm2, [esi+48] \
9100 __asm movlps xmm3, [esi+60] \
9101 __asm movhps xmm3, [esi+68] \
9102 __asm movaps xmm4, [esi+80] \
9103 __asm movlps xmm5, [esi+100] \
9104 __asm movhps xmm5, [esi+108]
9106 #define MUL_Nx6_6x5_ROW( row ) \
9107 __asm movss xmm7, [edi+row*24+0] \
9108 __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9109 __asm mulps xmm7, xmm0 \
9110 __asm fld dword ptr [edi+(row*6+0)*4] \
9111 __asm fmul dword ptr [esi+(4+0*5)*4] \
9112 __asm movss xmm6, [edi+row*24+4] \
9113 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9114 __asm mulps xmm6, xmm1 \
9115 __asm addps xmm7, xmm6 \
9116 __asm fld dword ptr [edi+(row*6+1)*4] \
9117 __asm fmul dword ptr [esi+(4+1*5)*4] \
9118 __asm faddp st(1),st \
9119 __asm movss xmm6, [edi+row*24+8] \
9120 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9121 __asm mulps xmm6, xmm2 \
9122 __asm addps xmm7, xmm6 \
9123 __asm fld dword ptr [edi+(row*6+2)*4] \
9124 __asm fmul dword ptr [esi+(4+2*5)*4] \
9125 __asm faddp st(1),st \
9126 __asm movss xmm6, [edi+row*24+12] \
9127 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9128 __asm mulps xmm6, xmm3 \
9129 __asm addps xmm7, xmm6 \
9130 __asm fld dword ptr [edi+(row*6+3)*4] \
9131 __asm fmul dword ptr [esi+(4+3*5)*4] \
9132 __asm faddp st(1),st \
9133 __asm movss xmm6, [edi+row*24+16] \
9134 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9135 __asm mulps xmm6, xmm4 \
9136 __asm addps xmm7, xmm6 \
9137 __asm fld dword ptr [edi+(row*6+4)*4] \
9138 __asm fmul dword ptr [esi+(4+4*5)*4] \
9139 __asm faddp st(1),st \
9140 __asm movss xmm6, [edi+row*24+20] \
9141 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9142 __asm mulps xmm6, xmm5 \
9143 __asm addps xmm7, xmm6 \
9144 __asm fld dword ptr [edi+(row*6+5)*4] \
9145 __asm fmul dword ptr [esi+(4+5*5)*4] \
9146 __asm faddp st(1),st \
9147 __asm fstp dword ptr [eax+(row*5+4)*4] \
9148 __asm movlps [eax+row*20], xmm7 \
9149 __asm movhps [eax+row*20+8], xmm7
9152 MUL_Nx6_6x5_ROW( 0 )
9153 MUL_Nx6_6x5_ROW( 1 )
9154 MUL_Nx6_6x5_ROW( 2 )
9155 MUL_Nx6_6x5_ROW( 3 )
9156 MUL_Nx6_6x5_ROW( 4 )
9169 movlps xmm7, qword ptr [esi]
9170 movlps xmm6, qword ptr [esi+8]
9171 shufps xmm7, xmm7, 0x44
9172 shufps xmm6, xmm6, 0x44
9173 movlps xmm0, qword ptr [edi ]
9174 movhps xmm0, qword ptr [edi+ 24]
9176 movlps xmm3, qword ptr [edi+ 8]
9177 movhps xmm3, qword ptr [edi+ 32]
9179 movlps xmm1, qword ptr [edi+ 48]
9180 movhps xmm1, qword ptr [edi+ 72]
9182 movlps xmm2, qword ptr [edi+ 96]
9183 movhps xmm2, qword ptr [edi+120]
9185 movlps xmm4, qword ptr [edi+ 56]
9186 movhps xmm4, qword ptr [edi+ 80]
9187 movlps xmm5, qword ptr [edi+104]
9188 movhps xmm5, qword ptr [edi+128]
9190 movlps xmm7, qword ptr [esi+16]
9192 shufps xmm7, xmm7, 0x44
9195 movlps xmm3, qword ptr [edi+ 16]
9196 movhps xmm3, qword ptr [edi+ 40]
9198 movlps xmm4, qword ptr [edi+ 64]
9199 movhps xmm4, qword ptr [edi+ 88]
9201 movlps xmm5, qword ptr [edi+112]
9202 movhps xmm5, qword ptr [edi+136]
9209 shufps xmm0, xmm1, 0x88
9210 shufps xmm6, xmm1, 0xDD
9212 shufps xmm7, xmm2, 0x88
9213 shufps xmm2, xmm2, 0xDD
9217 movhps [eax+8], xmm0
9218 movlps [eax+16], xmm2
9225 MUL_Nx6_6x2_ROW2( 0 )
9226 MUL_Nx6_6x2_ROW2( 1 )
9227 MUL_Nx6_6x2_ROW2( 2 )
9234 MUL_Nx6_6x3_ROW( 0 )
9235 MUL_Nx6_6x3_ROW( 1 )
9236 MUL_Nx6_6x3_ROW( 2 )
9237 MUL_Nx6_6x3_ROW( 3 )
9238 MUL_Nx6_6x3_ROW( 4 )
9239 MUL_Nx6_6x3_ROW( 5 )
9246 MUL_Nx6_6x4_ROW( 0 )
9247 MUL_Nx6_6x4_ROW( 1 )
9248 MUL_Nx6_6x4_ROW( 2 )
9249 MUL_Nx6_6x4_ROW( 3 )
9250 MUL_Nx6_6x4_ROW( 4 )
9251 MUL_Nx6_6x4_ROW( 5 )
9258 MUL_Nx6_6x5_ROW( 0 )
9259 MUL_Nx6_6x5_ROW( 1 )
9260 MUL_Nx6_6x5_ROW( 2 )
9261 MUL_Nx6_6x5_ROW( 3 )
9262 MUL_Nx6_6x5_ROW( 4 )
9263 MUL_Nx6_6x5_ROW( 5 )
9269 mov ecx, dword ptr m2Ptr
9270 movlps xmm3, qword ptr [ecx+72]
9271 mov edx, dword ptr m1Ptr
9273 movaps xmm0, xmmword ptr [ecx]
9274 movlps xmm1, qword ptr [ecx+24]
9275 movhps xmm1, qword ptr [ecx+32]
9276 movaps xmm2, xmmword ptr [ecx+48]
9277 movhps xmm3, qword ptr [ecx+80]
9279 movss xmm4, dword ptr [edx]
9280 movss xmm5, dword ptr [edx+4]
9281 mov eax, dword ptr dstPtr
9282 shufps xmm4, xmm4, 0
9283 movss xmm6, dword ptr [edx+8]
9284 shufps xmm5, xmm5, 0
9285 movss xmm7, dword ptr [edx+12]
9287 shufps xmm6, xmm6, 0
9288 shufps xmm7, xmm7, 0
9295 movaps xmmword ptr [eax], xmm7
9297 movss xmm4, dword ptr [edx+24]
9298 shufps xmm4, xmm4, 0
9300 movss xmm5, dword ptr [edx+28]
9301 shufps xmm5, xmm5, 0
9303 movss xmm6, dword ptr [edx+32]
9304 shufps xmm6, xmm6, 0
9305 movss xmm7, dword ptr [edx+36]
9306 shufps xmm7, xmm7, 0
9313 movss xmm4, dword ptr [edx+48]
9314 movss xmm5, dword ptr [edx+52]
9315 movlps qword ptr [eax+24], xmm7 ; save 2nd
9316 movhps qword ptr [eax+32], xmm7 ; row
9317 movss xmm6, dword ptr [edx+56]
9318 movss xmm7, dword ptr [edx+60]
9319 shufps xmm4, xmm4, 0
9320 shufps xmm5, xmm5, 0
9321 shufps xmm6, xmm6, 0
9322 shufps xmm7, xmm7, 0
9330 movaps xmmword ptr [eax+48], xmm7
9332 movss xmm4, dword ptr [edx+72]
9333 movss xmm5, dword ptr [edx+76]
9334 movss xmm6, dword ptr [edx+80]
9335 movss xmm7, dword ptr [edx+84]
9336 shufps xmm4, xmm4, 0
9337 shufps xmm5, xmm5, 0
9338 shufps xmm6, xmm6, 0
9339 shufps xmm7, xmm7, 0
9347 movlps qword ptr [eax+72], xmm7
9348 movhps qword ptr [eax+80], xmm7
9350 movss xmm4, dword ptr [edx+96]
9351 movss xmm5, dword ptr [edx+100]
9352 movss xmm6, dword ptr [edx+104]
9353 movss xmm7, dword ptr [edx+108]
9354 shufps xmm4, xmm4, 0
9355 shufps xmm5, xmm5, 0
9356 shufps xmm6, xmm6, 0
9357 shufps xmm7, xmm7, 0
9365 movaps xmmword ptr [eax+96], xmm7
9367 movss xmm4, dword ptr [edx+120]
9368 movss xmm5, dword ptr [edx+124]
9369 movss xmm6, dword ptr [edx+128]
9370 movss xmm7, dword ptr [edx+132]
9371 shufps xmm4, xmm4, 0
9372 shufps xmm5, xmm5, 0
9373 shufps xmm6, xmm6, 0
9374 shufps xmm7, xmm7, 0
9382 movhps qword ptr [eax+128], xmm7
9383 movlps qword ptr [eax+120], xmm7
9385 movlps xmm0, qword ptr [ecx+96]
9386 movhps xmm0, qword ptr [ecx+104]
9387 movlps xmm1, qword ptr [ecx+120]
9388 movhps xmm1, qword ptr [ecx+128]
9390 movss xmm2, dword ptr [edx+16]
9391 shufps xmm2, xmm2, 0
9392 movss xmm4, dword ptr [edx+40]
9393 movss xmm3, dword ptr [edx+20]
9394 movss xmm5, dword ptr [edx+44]
9395 movaps xmm6, xmmword ptr [eax]
9396 movlps xmm7, qword ptr [eax+24]
9397 shufps xmm3, xmm3, 0
9398 shufps xmm5, xmm5, 0
9399 movhps xmm7, qword ptr [eax+32]
9400 shufps xmm4, xmm4, 0
9409 movlps qword ptr [eax+24], xmm7
9410 movaps xmmword ptr [eax], xmm6
9411 movhps qword ptr [eax+32], xmm7
9413 movss xmm2, dword ptr [edx+64]
9414 movss xmm4, dword ptr [edx+88]
9415 movss xmm5, dword ptr [edx+92]
9416 movss xmm3, dword ptr [edx+68]
9417 movaps xmm6, xmmword ptr [eax+48]
9418 movlps xmm7, qword ptr [eax+72]
9419 movhps xmm7, qword ptr [eax+80]
9420 shufps xmm2, xmm2, 0
9421 shufps xmm4, xmm4, 0
9422 shufps xmm5, xmm5, 0
9423 shufps xmm3, xmm3, 0
9432 movlps qword ptr [eax+72], xmm7
9433 movaps xmmword ptr [eax+48], xmm6
9434 movhps qword ptr [eax+80], xmm7
9436 movss xmm2, dword ptr [edx+112]
9437 movss xmm3, dword ptr [edx+116]
9438 movaps xmm6, xmmword ptr [eax+96]
9439 shufps xmm2, xmm2, 0
9440 shufps xmm3, xmm3, 0
9445 movaps xmmword ptr [eax+96], xmm6
9447 movss xmm4, dword ptr [edx+136]
9448 movss xmm5, dword ptr [edx+140]
9449 movhps xmm7, qword ptr [eax+128]
9450 movlps xmm7, qword ptr [eax+120]
9451 shufps xmm4, xmm4, 0
9452 shufps xmm5, xmm5, 0
9458 movlps xmm0, qword ptr [ecx+16]
9459 movhps xmm0, qword ptr [ecx+40]
9460 movhps qword ptr [eax+128], xmm7
9461 movlps qword ptr [eax+120], xmm7
9462 movlps xmm2, qword ptr [ecx+64]
9463 movhps xmm2, qword ptr [ecx+88]
9465 shufps xmm3, xmm3, 4Eh
9466 movlps xmm4, qword ptr [ecx+112]
9467 movhps xmm4, qword ptr [ecx+136]
9469 shufps xmm5, xmm5, 4Eh
9470 movlps xmm6, qword ptr [edx]
9471 movhps xmm6, qword ptr [edx+24]
9473 shufps xmm7, xmm7, 0F0h
9475 shufps xmm6, xmm6, 0A5h
9477 shufps xmm1, xmm1, 4Eh
9480 movlps xmm6, qword ptr [edx+8]
9481 movhps xmm6, qword ptr [edx+32]
9483 shufps xmm1, xmm1, 0F0h
9484 shufps xmm6, xmm6, 0A5h
9489 movhps xmm6, qword ptr [edx+40]
9490 movlps xmm6, qword ptr [edx+16]
9492 shufps xmm1, xmm1, 0F0h
9493 shufps xmm6, xmm6, 0A5h
9498 movlps qword ptr [eax+16], xmm7
9499 movhps qword ptr [eax+40], xmm7
9500 movlps xmm6, qword ptr [edx+48]
9501 movhps xmm6, qword ptr [edx+72]
9503 shufps xmm7, xmm7, 0F0h
9505 shufps xmm6, xmm6, 0A5h
9507 shufps xmm1, xmm1, 4Eh
9510 movhps xmm6, qword ptr [edx+80]
9511 movlps xmm6, qword ptr [edx+56]
9513 shufps xmm1, xmm1, 0F0h
9514 shufps xmm6, xmm6, 0A5h
9519 movlps xmm6, qword ptr [edx+64]
9520 movhps xmm6, qword ptr [edx+88]
9522 shufps xmm1, xmm1, 0F0h
9523 shufps xmm6, xmm6, 0A5h
9528 movlps qword ptr [eax+64], xmm7
9529 movhps qword ptr [eax+88], xmm7
9530 movlps xmm6, qword ptr [edx+96]
9531 movhps xmm6, qword ptr [edx+120]
9533 shufps xmm7, xmm7, 0F0h
9535 shufps xmm6, xmm6, 0A5h
9537 shufps xmm1, xmm1, 4Eh
9540 movlps xmm6, qword ptr [edx+104]
9541 movhps xmm6, qword ptr [edx+128]
9543 shufps xmm1, xmm1, 0F0h
9544 shufps xmm6, xmm6, 0A5h
9549 movlps xmm6, qword ptr [edx+112]
9550 movhps xmm6, qword ptr [edx+136]
9552 shufps xmm1, xmm1, 0F0h
9553 shufps xmm6, xmm6, 0A5h
9558 movlps qword ptr [eax+112], xmm7
9559 movhps qword ptr [eax+136], xmm7
9566 for ( i = 0; i < k; i++ ) {
9568 for ( j = 0; j < l; j++ ) {
9569 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
9570 m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l];
9578 for ( i = 0; i < k; i++ ) {
9579 for ( j = 0; j < l; j++ ) {
9581 sum = m1Ptr[0] * m2Ptr[0];
9584 sum += m1Ptr[n] * m2Ptr[0];
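// Illustrative aside (not part of the original listing): the truncated scalar
// fallbacks above all unroll the same generic row-times-column multiply. Below is a
// minimal, self-contained sketch of that loop; the function and parameter names
// ( MultiplyGeneric, m1Rows, m1Cols, m2Cols ) are assumptions for illustration only,
// while the inner dot-product pattern mirrors the fragments shown above.
static void MultiplyGeneric( float *dst, const float *m1, const float *m2,
								int m1Rows, int m1Cols, int m2Cols ) {
	for ( int i = 0; i < m1Rows; i++ ) {
		const float *m1Ptr = m1 + i * m1Cols;			// row i of m1
		for ( int j = 0; j < m2Cols; j++ ) {
			const float *m2Ptr = m2 + j;				// column j of m2, stride m2Cols
			double sum = m1Ptr[0] * m2Ptr[0];
			for ( int n = 1; n < m1Cols; n++ ) {
				m2Ptr += m2Cols;
				sum += m1Ptr[n] * m2Ptr[0];
			}
			*dst++ = (float) sum;
		}
	}
}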
9610 const float *m1Ptr, *m2Ptr;
9623 if ( !((k^6)|(l^1)) ) {
9629 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
9632 mulps xmm1, [edi+16]
9634 movlps [eax+16], xmm1
9638 for ( i = 0; i < k; i++ ) {
9640 for ( j = 0; j < l; j++ ) {
9641 *dstPtr++ = m1Ptr[0] * m2Ptr[0];
9648 if ( !((k^6)|(l^2)) ) {
9649 #define MUL_2xN_2x2_INIT \
9650 __asm mov esi, m2Ptr \
9651 __asm mov edi, m1Ptr \
9652 __asm mov eax, dstPtr \
9653 __asm movlps xmm0, [esi] \
9654 __asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
9655 __asm movlps xmm1, [esi+8] \
9656 __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )
9658 #define MUL_2xN_2x2_ROW2( N, row ) \
9659 __asm movlps xmm6, [edi+(row+0*N)*4] \
9660 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9661 __asm movlps xmm7, [edi+(row+1*N)*4] \
9662 __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9663 __asm mulps xmm6, xmm0 \
9664 __asm mulps xmm7, xmm1 \
9665 __asm addps xmm6, xmm7 \
9666 __asm movaps [eax+(row*2)*4], xmm6
9669 MUL_2xN_2x2_ROW2( 6, 0 )
9670 MUL_2xN_2x2_ROW2( 6, 2 )
9671 MUL_2xN_2x2_ROW2( 6, 4 )
9675 for ( i = 0; i < k; i++ ) {
9677 for ( j = 0; j < l; j++ ) {
9678 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l];
9685 if ( !((k^6)|(l^3)) ) {
9687 #define MUL_3xN_3x3_INIT \
9688 __asm mov esi, m2Ptr \
9689 __asm mov edi, m1Ptr \
9690 __asm mov eax, dstPtr \
9691 __asm movss xmm0, [esi+(0*3+0)*4] \
9692 __asm movhps xmm0, [esi+(0*3+1)*4] \
9693 __asm movss xmm1, [esi+(1*3+0)*4] \
9694 __asm movhps xmm1, [esi+(1*3+1)*4] \
9695 __asm movss xmm2, [esi+(2*3+0)*4] \
9696 __asm movhps xmm2, [esi+(2*3+1)*4]
9698 #define MUL_3xN_3x3_INIT_ROW4 \
9699 __asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 3, 0 ) \
9700 __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 0 ) \
9701 __asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 2, 3, 0 )
9703 #define MUL_3xN_3x3_ROW4( N, row ) \
9704 __asm movlps xmm3, [edi+(row+0*N+0)*4] \
9705 __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 1 ) \
9706 __asm movlps xmm4, [edi+(row+1*N+0)*4] \
9707 __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 1 ) \
9708 __asm movlps xmm5, [edi+(row+2*N+0)*4] \
9709 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 1 ) \
9710 __asm mulps xmm3, xmm0 \
9711 __asm mulps xmm4, xmm1 \
9712 __asm mulps xmm5, xmm2 \
9713 __asm addps xmm3, xmm4 \
9714 __asm addps xmm3, xmm5 \
9715 __asm movaps [eax+(row*3+0)*4], xmm3 \
9716 __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 ) \
9717 __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 ) \
9718 __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 ) \
9719 __asm movlps xmm3, [edi+(row+0*N+1)*4] \
9720 __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9721 __asm movlps xmm4, [edi+(row+1*N+1)*4] \
9722 __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9723 __asm movlps xmm5, [edi+(row+2*N+1)*4] \
9724 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9725 __asm mulps xmm3, xmm0 \
9726 __asm mulps xmm4, xmm1 \
9727 __asm mulps xmm5, xmm2 \
9728 __asm addps xmm3, xmm4 \
9729 __asm addps xmm3, xmm5 \
9730 __asm movaps [eax+(row*3+4)*4], xmm3 \
9731 __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 ) \
9732 __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 ) \
9733 __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 ) \
9734 __asm movlps xmm3, [edi+(row+0*N+2)*4] \
9735 __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 1, 1, 1 ) \
9736 __asm movlps xmm4, [edi+(row+1*N+2)*4] \
9737 __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 1, 1, 1 ) \
9738 __asm movlps xmm5, [edi+(row+2*N+2)*4] \
9739 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 1, 1 ) \
9740 __asm mulps xmm3, xmm0 \
9741 __asm mulps xmm4, xmm1 \
9742 __asm mulps xmm5, xmm2 \
9743 __asm addps xmm3, xmm4 \
9744 __asm addps xmm3, xmm5 \
9745 __asm movaps [eax+(row*3+8)*4], xmm3
9747 #define MUL_3xN_3x3_INIT_ROW4_ROW4 \
9748 __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) \
9749 __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) \
9750 __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
9752 #define MUL_3xN_3x3_INIT_ROW4_ROW \
9753 __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 1, 2, 3 ) \
9754 __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 2, 3 ) \
9755 __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 1, 2, 3 )
9757 #define MUL_3xN_3x3_ROW( N, row ) \
9758 __asm movss xmm3, [edi+(row+0*N)*4] \
9759 __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9760 __asm movss xmm4, [edi+(row+1*N)*4] \
9761 __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9762 __asm movss xmm5, [edi+(row+2*N)*4] \
9763 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9764 __asm mulps xmm3, xmm0 \
9765 __asm mulps xmm4, xmm1 \
9766 __asm mulps xmm5, xmm2 \
9767 __asm addps xmm3, xmm4 \
9768 __asm addps xmm3, xmm5 \
9769 __asm movss [eax+(row*3+0)*4], xmm3 \
9770 __asm movhps [eax+(row*3+1)*4], xmm3
9773 MUL_3xN_3x3_INIT_ROW4
9774 MUL_3xN_3x3_ROW4( 6, 0 )
9775 MUL_3xN_3x3_INIT_ROW4_ROW
9776 MUL_3xN_3x3_ROW( 6, 4 )
9777 MUL_3xN_3x3_ROW( 6, 5 )
9781 for ( i = 0; i < k; i++ ) {
9783 for ( j = 0; j < l; j++ ) {
9784 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l];
9791 if ( !((k^6)|(l^4)) ) {
9793 #define MUL_4xN_4x4_INIT \
9794 __asm mov esi, m2Ptr \
9795 __asm mov edi, m1Ptr \
9796 __asm mov eax, dstPtr \
9797 __asm movaps xmm0, [esi] \
9798 __asm movaps xmm1, [esi+16] \
9799 __asm movaps xmm2, [esi+32] \
9800 __asm movaps xmm3, [esi+48]
9802 #define MUL_4xN_4x4_ROW( N, row ) \
9803 __asm movss xmm7, [edi+(row+0*N)*4] \
9804 __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9805 __asm mulps xmm7, xmm0 \
9806 __asm movss xmm6, [edi+(row+1*N)*4] \
9807 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9808 __asm mulps xmm6, xmm1 \
9809 __asm addps xmm7, xmm6 \
9810 __asm movss xmm6, [edi+(row+2*N)*4] \
9811 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9812 __asm mulps xmm6, xmm2 \
9813 __asm addps xmm7, xmm6 \
9814 __asm movss xmm6, [edi+(row+3*N)*4] \
9815 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9816 __asm mulps xmm6, xmm3 \
9817 __asm addps xmm7, xmm6 \
9818 __asm movaps [eax+row*16], xmm7
9821 MUL_4xN_4x4_ROW( 6, 0 )
9822 MUL_4xN_4x4_ROW( 6, 1 )
9823 MUL_4xN_4x4_ROW( 6, 2 )
9824 MUL_4xN_4x4_ROW( 6, 3 )
9825 MUL_4xN_4x4_ROW( 6, 4 )
9826 MUL_4xN_4x4_ROW( 6, 5 )
9830 for ( i = 0; i < k; i++ ) {
9832 for ( j = 0; j < l; j++ ) {
9833 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
9834 m1Ptr[3*k] * m2Ptr[3*l];
9841 if ( !((k^6)|(l^5)) ) {
9843 #define MUL_5xN_5x5_INIT \
9844 __asm mov esi, m2Ptr \
9845 __asm mov edi, m1Ptr \
9846 __asm mov eax, dstPtr \
9847 __asm movlps xmm0, [esi+ 0*4] \
9848 __asm movhps xmm0, [esi+ 2*4] \
9849 __asm movlps xmm1, [esi+ 5*4] \
9850 __asm movhps xmm1, [esi+ 7*4] \
9851 __asm movlps xmm2, [esi+10*4] \
9852 __asm movhps xmm2, [esi+12*4] \
9853 __asm movlps xmm3, [esi+15*4] \
9854 __asm movhps xmm3, [esi+17*4] \
9855 __asm movlps xmm4, [esi+20*4] \
9856 __asm movhps xmm4, [esi+22*4]
9858 #define MUL_5xN_5x5_ROW( N, row ) \
9859 __asm movss xmm6, [edi+(row+0*N)*4] \
9860 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9861 __asm mulps xmm6, xmm0 \
9862 __asm fld dword ptr [edi+(row+0*N)*4] \
9863 __asm fmul dword ptr [esi+ 4*4] \
9864 __asm movss xmm5, [edi+(row+1*N)*4] \
9865 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9866 __asm mulps xmm5, xmm1 \
9867 __asm addps xmm6, xmm5 \
9868 __asm fld dword ptr [edi+(row+1*N)*4] \
9869 __asm fmul dword ptr [esi+ 9*4] \
9870 __asm faddp st(1),st \
9871 __asm movss xmm5, [edi+(row+2*N)*4] \
9872 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9873 __asm mulps xmm5, xmm2 \
9874 __asm addps xmm6, xmm5 \
9875 __asm fld dword ptr [edi+(row+2*N)*4] \
9876 __asm fmul dword ptr [esi+14*4] \
9877 __asm faddp st(1),st \
9878 __asm movss xmm5, [edi+(row+3*N)*4] \
9879 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9880 __asm mulps xmm5, xmm3 \
9881 __asm addps xmm6, xmm5 \
9882 __asm fld dword ptr [edi+(row+3*N)*4] \
9883 __asm fmul dword ptr [esi+19*4] \
9884 __asm faddp st(1),st \
9885 __asm movss xmm5, [edi+(row+4*N)*4] \
9886 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9887 __asm mulps xmm5, xmm4 \
9888 __asm addps xmm6, xmm5 \
9889 __asm fld dword ptr [edi+(row+4*N)*4] \
9890 __asm fmul dword ptr [esi+24*4] \
9891 __asm faddp st(1),st \
9892 __asm fstp dword ptr [eax+(row*5+4)*4] \
9893 __asm movlps [eax+(row*5+0)*4], xmm6 \
9894 __asm movhps [eax+(row*5+2)*4], xmm6
9897 MUL_5xN_5x5_ROW( 6, 0 )
9898 MUL_5xN_5x5_ROW( 6, 1 )
9899 MUL_5xN_5x5_ROW( 6, 2 )
9900 MUL_5xN_5x5_ROW( 6, 3 )
9901 MUL_5xN_5x5_ROW( 6, 4 )
9902 MUL_5xN_5x5_ROW( 6, 5 )
9906 for ( i = 0; i < k; i++ ) {
9908 for ( j = 0; j < l; j++ ) {
9909 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
9910 m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l];
9920 #define MUL_6xN_6x6_FIRST4COLUMNS_INIT \
9921 __asm mov esi, m2Ptr \
9922 __asm mov edi, m1Ptr \
9923 __asm mov eax, dstPtr \
9924 __asm movlps xmm0, [esi+ 0*4] \
9925 __asm movhps xmm0, [esi+ 2*4] \
9926 __asm movlps xmm1, [esi+ 6*4] \
9927 __asm movhps xmm1, [esi+ 8*4] \
9928 __asm movlps xmm2, [esi+12*4] \
9929 __asm movhps xmm2, [esi+14*4] \
9930 __asm movlps xmm3, [esi+18*4] \
9931 __asm movhps xmm3, [esi+20*4] \
9932 __asm movlps xmm4, [esi+24*4] \
9933 __asm movhps xmm4, [esi+26*4] \
9934 __asm movlps xmm5, [esi+30*4] \
9935 __asm movhps xmm5, [esi+32*4]
9937 #define MUL_6xN_6x6_FIRST4COLUMNS_ROW( N, row ) \
9938 __asm movss xmm7, [edi+(row+0*N)*4] \
9939 __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9940 __asm mulps xmm7, xmm0 \
9941 __asm movss xmm6, [edi+(row+1*N)*4] \
9942 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9943 __asm mulps xmm6, xmm1 \
9944 __asm addps xmm7, xmm6 \
9945 __asm movss xmm6, [edi+(row+2*N)*4] \
9946 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9947 __asm mulps xmm6, xmm2 \
9948 __asm addps xmm7, xmm6 \
9949 __asm movss xmm6, [edi+(row+3*N)*4] \
9950 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9951 __asm mulps xmm6, xmm3 \
9952 __asm addps xmm7, xmm6 \
9953 __asm movss xmm6, [edi+(row+4*N)*4] \
9954 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9955 __asm mulps xmm6, xmm4 \
9956 __asm addps xmm7, xmm6 \
9957 __asm movss xmm6, [edi+(row+5*N)*4] \
9958 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
9959 __asm mulps xmm6, xmm5 \
9960 __asm addps xmm7, xmm6 \
9961 __asm movlps [eax+(row*6+0)*4], xmm7 \
9962 __asm movhps [eax+(row*6+2)*4], xmm7
9964 #define MUL_6xN_6x6_LAST2COLUMNS_INIT \
9965 __asm movlps xmm0, [esi+ 4*4] \
9966 __asm movlps xmm1, [esi+10*4] \
9967 __asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
9968 __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
9969 __asm movlps xmm2, [esi+16*4] \
9970 __asm movlps xmm3, [esi+22*4] \
9971 __asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \
9972 __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
9973 __asm movlps xmm4, [esi+28*4] \
9974 __asm movlps xmm5, [esi+34*4] \
9975 __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \
9976 __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
9978 #define MUL_6xN_6x6_LAST2COLUMNS_ROW2( N, row ) \
9979 __asm movlps xmm7, [edi+(row*2+0*N)*4] \
9980 __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9981 __asm mulps xmm7, xmm0 \
9982 __asm movlps xmm6, [edi+(row*2+1*N)*4] \
9983 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9984 __asm mulps xmm6, xmm1 \
9985 __asm addps xmm7, xmm6 \
9986 __asm movlps xmm6, [edi+(row*2+2*N)*4] \
9987 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9988 __asm mulps xmm6, xmm2 \
9989 __asm addps xmm7, xmm6 \
9990 __asm movlps xmm6, [edi+(row*2+3*N)*4] \
9991 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9992 __asm mulps xmm6, xmm3 \
9993 __asm addps xmm7, xmm6 \
9994 __asm movlps xmm6, [edi+(row*2+4*N)*4] \
9995 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
9996 __asm mulps xmm6, xmm4 \
9997 __asm addps xmm7, xmm6 \
9998 __asm movlps xmm6, [edi+(row*2+5*N)*4] \
9999 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
10000 __asm mulps xmm6, xmm5 \
10001 __asm addps xmm7, xmm6 \
10002 __asm movlps [eax+(row*12+ 4)*4], xmm7 \
10003 __asm movhps [eax+(row*12+10)*4], xmm7
10005 #define MUL_6xN_6x6_LAST2COLUMNS_ROW( N, row ) \
10006 __asm movss xmm7, [edi+(1*N-1)*4] \
10007 __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
10008 __asm mulps xmm7, xmm0 \
10009 __asm movss xmm6, [edi+(2*N-1)*4] \
10010 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
10011 __asm mulps xmm6, xmm1 \
10012 __asm addps xmm7, xmm6 \
10013 __asm movss xmm6, [edi+(3*N-1)*4] \
10014 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
10015 __asm mulps xmm6, xmm2 \
10016 __asm addps xmm7, xmm6 \
10017 __asm movss xmm6, [edi+(4*N-1)*4] \
10018 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
10019 __asm mulps xmm6, xmm3 \
10020 __asm addps xmm7, xmm6 \
10021 __asm movss xmm6, [edi+(5*N-1)*4] \
10022 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
10023 __asm mulps xmm6, xmm4 \
10024 __asm addps xmm7, xmm6 \
10025 __asm movss xmm6, [edi+(6*N-1)*4] \
10026 __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
10027 __asm mulps xmm6, xmm5 \
10028 __asm addps xmm7, xmm6 \
10029 __asm movlps [eax+(row*6+4)*4], xmm7
10031 MUL_6xN_6x6_FIRST4COLUMNS_INIT
10032 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 1, 0 )
10033 MUL_6xN_6x6_LAST2COLUMNS_INIT
10034 MUL_6xN_6x6_LAST2COLUMNS_ROW( 1, 0 )
10040 MUL_6xN_6x6_FIRST4COLUMNS_INIT
10041 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 0 )
10042 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 1 )
10043 MUL_6xN_6x6_LAST2COLUMNS_INIT
10044 MUL_6xN_6x6_LAST2COLUMNS_ROW2( 2, 0 )
10050 MUL_6xN_6x6_FIRST4COLUMNS_INIT
10051 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 0 )
10052 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 1 )
10053 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 2 )
10054 MUL_6xN_6x6_LAST2COLUMNS_INIT
10055 MUL_6xN_6x6_LAST2COLUMNS_ROW2( 3, 0 )
10056 MUL_6xN_6x6_LAST2COLUMNS_ROW( 3, 2 )
10062 MUL_6xN_6x6_FIRST4COLUMNS_INIT
10063 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 0 )
10064 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 1 )
10065 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 2 )
10066 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 3 )
10067 MUL_6xN_6x6_LAST2COLUMNS_INIT
10068 MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 0 )
10069 MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 1 )
10075 MUL_6xN_6x6_FIRST4COLUMNS_INIT
10076 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 0 )
10077 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 1 )
10078 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 2 )
10079 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 3 )
10080 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 4 )
10081 MUL_6xN_6x6_LAST2COLUMNS_INIT
10082 MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 0 )
10083 MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 1 )
10084 MUL_6xN_6x6_LAST2COLUMNS_ROW( 5, 4 )
10090 MUL_6xN_6x6_FIRST4COLUMNS_INIT
10091 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 0 )
10092 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 1 )
10093 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 2 )
10094 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 3 )
10095 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 4 )
10096 MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 5 )
10097 MUL_6xN_6x6_LAST2COLUMNS_INIT
10098 MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 0 )
10099 MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 1 )
10100 MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 2 )
10106 for ( i = 0; i < k; i++ ) {
10108 for ( j = 0; j < l; j++ ) {
10109 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
10110 m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l];
10117 for ( i = 0; i < k; i++ ) {
10118 for ( j = 0; j < l; j++ ) {
10121 sum = m1Ptr[0] * m2Ptr[0];
10125 sum += m1Ptr[0] * m2Ptr[0];
10157 #define NSKIP( n, s ) ((n<<3)|(s&7))
10158 switch( NSKIP( n, skip ) ) {
10159 case NSKIP( 1, 0 ): x[0] = b[0];
10161 case NSKIP( 2, 0 ): x[0] = b[0];
10162 case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
10164 case NSKIP( 3, 0 ): x[0] = b[0];
10165 case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
10166 case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
10168 case NSKIP( 4, 0 ): x[0] = b[0];
10169 case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
10170 case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
10171 case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
10173 case NSKIP( 5, 0 ): x[0] = b[0];
10174 case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
10175 case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
10176 case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
10177 case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
10179 case NSKIP( 6, 0 ): x[0] = b[0];
10180 case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
10181 case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
10182 case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
10183 case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
10184 case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
10186 case NSKIP( 7, 0 ): x[0] = b[0];
10187 case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
10188 case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
10189 case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
10190 case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
10191 case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
10192 case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
10200 case 0: x[0] = b[0];
10201 case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
10202 case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
10203 case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
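// Illustrative aside: NSKIP packs the solve size n and the number of already
// computed unknowns into one value, so a single switch with deliberate fall-through
// finishes the forward substitution. For example NSKIP( 3, 1 ) jumps to the x[1]
// case and falls through to x[2], leaving x[0] untouched. A hedged sketch of the
// n == 3 behaviour with assumed names ( ForwardSolveSkip3, row stride nc ):
static void ForwardSolveSkip3( float *x, const float *b, const float *lptr, int nc, int skip ) {
	switch ( skip ) {
		case 0: x[0] = b[0];										// fall through
		case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];					// fall through
		case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
	}
}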
10234 movaps xmm0, [esi+ecx]
10235 mulps xmm0, [edi+ecx]
10239 movaps xmm1, [esi+ecx-(8*4)]
10240 mulps xmm1, [edi+ecx-(8*4)]
10242 movaps xmm3, [esi+ecx-(4*4)]
10243 mulps xmm3, [edi+ecx-(4*4)]
10251 movaps xmm1, [esi+ecx-(4*4)]
10252 mulps xmm1, [edi+ecx-(4*4)]
10259 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
10268 movss xmm1, [esi-(3*4)]
10269 mulss xmm1, [edi-(3*4)]
10272 movss xmm3, [esi-(2*4)]
10273 mulss xmm3, [edi-(2*4)]
10276 movss xmm5, [esi-(1*4)]
10277 mulss xmm5, [edi-(1*4)]
10280 movss xmm1, [ebx+eax]
10297 movups xmm0, [esi+ecx]
10298 movups xmm1, [edi+ecx]
10303 movups xmm1, [esi+ecx-(8*4)]
10304 movups xmm2, [edi+ecx-(8*4)]
10307 movups xmm3, [esi+ecx-(4*4)]
10308 movups xmm4, [edi+ecx-(4*4)]
10317 movups xmm1, [esi+ecx-(4*4)]
10318 movups xmm2, [edi+ecx-(4*4)]
10326 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
10335 movss xmm1, [esi-(3*4)]
10336 movss xmm2, [edi-(3*4)]
10340 movss xmm3, [esi-(2*4)]
10341 movss xmm4, [edi-(2*4)]
10345 movss xmm5, [esi-(1*4)]
10346 movss xmm6, [edi-(1*4)]
10350 movss xmm1, [ebx+eax]
10380 lptr = L.ToFloatPtr();
10381 nc = L.GetNumColumns();
10393 x[0] = b[0] - lptr[1*nc+0] * x[1];
10397 x[1] = b[1] - lptr[2*nc+1] * x[2];
10398 x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
10402 x[2] = b[2] - lptr[3*nc+2] * x[3];
10403 x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
10404 x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
10408 x[3] = b[3] - lptr[4*nc+3] * x[4];
10409 x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
10410 x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
10411 x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
10415 x[4] = b[4] - lptr[5*nc+4] * x[5];
10416 x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
10417 x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
10418 x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
10419 x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
10423 x[5] = b[5] - lptr[6*nc+5] * x[6];
10424 x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
10425 x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
10426 x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
10427 x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
10428 x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
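// Illustrative aside: the unrolled cases above solve the transposed triangular
// system from the last unknown down to the first, subtracting the already known
// x[j] ( j > i ) weighted by column i of the rows below. A hedged generic sketch
// with an assumed name ( SolveTransposeGeneric ); lptr is assumed to address a
// unit lower triangular matrix stored with nc floats per row.
static void SolveTransposeGeneric( float *x, const float *b, const float *lptr, int nc, int n ) {
	for ( int i = n - 1; i >= 0; i-- ) {
		double s = b[i];
		for ( int j = i + 1; j < n; j++ ) {
			s -= lptr[j*nc+i] * x[j];		// contribution of the rows below row i
		}
		x[i] = (float) s;
	}
}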
10449 lptr = L.ToFloatPtr() + m * nc + m - 4;
10460 movlps xmm0, [ebx+eax*4-16]
10461 movhps xmm0, [ebx+eax*4-8]
10467 movlps xmm2, [edi+0]
10468 movhps xmm2, [edi+8]
10470 movss xmm1, [esi+4*ecx+0]
10471 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10472 movlps xmm3, [edi+0]
10473 movhps xmm3, [edi+8]
10477 movss xmm1, [esi+4*ecx+4]
10478 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10479 movlps xmm4, [edi+0]
10480 movhps xmm4, [edi+8]
10484 movss xmm1, [esi+4*ecx+8]
10485 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10486 movlps xmm5, [edi+0]
10487 movhps xmm5, [edi+8]
10491 movss xmm1, [esi+4*ecx+12]
10492 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10499 movlps xmm2, [edi+0]
10500 movhps xmm2, [edi+8]
10501 movss xmm1, [esi+4*ecx]
10502 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10512 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
10514 shufps xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
10516 shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
10518 movss [esi-4], xmm3
10521 mulss xmm3, [edi+8]
10522 mulss xmm4, [edi+4]
10523 mulss xmm5, [edi+0]
10525 movss [esi-8], xmm2
10530 mulss xmm2, [edi+4]
10531 mulss xmm6, [edi+0]
10533 movss [esi-12], xmm1
10537 mulss xmm1, [edi+0]
10539 movss [esi-16], xmm0
10551 lptr = L.ToFloatPtr() + m * nc + m - 4;
10562 movlps xmm0, [ebx+eax*4-16]
10563 movhps xmm0, [ebx+eax*4-8]
10569 movlps xmm2, [edi+0]
10570 movhps xmm2, [edi+8]
10572 movss xmm1, [esi+4*ecx+0]
10573 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10574 movlps xmm3, [edi+0]
10575 movhps xmm3, [edi+8]
10579 movss xmm1, [esi+4*ecx+4]
10580 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10581 movlps xmm4, [edi+0]
10582 movhps xmm4, [edi+8]
10586 movss xmm1, [esi+4*ecx+8]
10587 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10588 movlps xmm5, [edi+0]
10589 movhps xmm5, [edi+8]
10593 movss xmm1, [esi+4*ecx+12]
10594 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10607 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
10609 shufps xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
10611 shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
10613 movss [esi-4], xmm3
10616 mulss xmm3, [edi+8]
10617 mulss xmm4, [edi+4]
10618 mulss xmm5, [edi+0]
10620 movss [esi-8], xmm2
10625 mulss xmm2, [edi+4]
10626 mulss xmm6, [edi+0]
10628 movss [esi-12], xmm1
10632 mulss xmm1, [edi+0]
10634 movss [esi-16], xmm0
10646 for ( i = (m&3)-1; i >= 0; i-- ) {
10649 for ( j = i + 1; j < n; j++ ) {
10650 s0 -= lptr[j*nc] * x[j];
10658 double s0, s1, s2, s3, t;
10659 const float *lptr2;
10660 float *xptr, *xptr2;
10668 lptr = L.ToFloatPtr() + m * nc + m - 4;
10671 for ( i = m; i >= 4; i -= 4 ) {
10679 for ( j = 0; j < m-i; j += 4 ) {
10681 s0 -= lptr2[0] * t;
10682 s1 -= lptr2[1] * t;
10683 s2 -= lptr2[2] * t;
10684 s3 -= lptr2[3] * t;
10688 s0 -= lptr2[0] * t;
10689 s1 -= lptr2[1] * t;
10690 s2 -= lptr2[2] * t;
10691 s3 -= lptr2[3] * t;
10695 s0 -= lptr2[0] * t;
10696 s1 -= lptr2[1] * t;
10697 s2 -= lptr2[2] * t;
10698 s3 -= lptr2[3] * t;
10702 s0 -= lptr2[0] * t;
10703 s1 -= lptr2[1] * t;
10704 s2 -= lptr2[2] * t;
10705 s3 -= lptr2[3] * t;
10710 s0 -= lptr2[0] * t;
10711 s1 -= lptr2[1] * t;
10712 s2 -= lptr2[2] * t;
10713 s3 -= lptr2[3] * t;
10716 s0 -= lptr[0] * s3;
10717 s1 -= lptr[1] * s3;
10718 s2 -= lptr[2] * s3;
10720 s0 -= lptr[0] * s2;
10721 s1 -= lptr[1] * s2;
10723 s0 -= lptr[0] * s1;
10737 lptr = L.ToFloatPtr() + m * nc + m - 4;
10740 for ( i = m; i >= 4; i -= 4 ) {
10748 for ( j = 0; j < m-i; j += 4 ) {
10750 s0 -= lptr2[0] * t;
10751 s1 -= lptr2[1] * t;
10752 s2 -= lptr2[2] * t;
10753 s3 -= lptr2[3] * t;
10757 s0 -= lptr2[0] * t;
10758 s1 -= lptr2[1] * t;
10759 s2 -= lptr2[2] * t;
10760 s3 -= lptr2[3] * t;
10764 s0 -= lptr2[0] * t;
10765 s1 -= lptr2[1] * t;
10766 s2 -= lptr2[2] * t;
10767 s3 -= lptr2[3] * t;
10771 s0 -= lptr2[0] * t;
10772 s1 -= lptr2[1] * t;
10773 s2 -= lptr2[2] * t;
10774 s3 -= lptr2[3] * t;
10780 s0 -= lptr[0] * s3;
10781 s1 -= lptr[1] * s3;
10782 s2 -= lptr[2] * s3;
10784 s0 -= lptr[0] * s2;
10785 s1 -= lptr[1] * s2;
10787 s0 -= lptr[0] * s1;
10800 for ( i--; i >= 0; i-- ) {
10803 for ( j = i + 1; j < m; j++ ) {
10804 s0 -= lptr[j*nc] * x[j];
10825 float *v, *diag, *invDiagPtr, *mptr;
10826 double s0, s1, s2, sum, d;
10828 v = (float *) _alloca16( n * sizeof( float ) );
10829 diag = (float *) _alloca16( n * sizeof( float ) );
10834 assert( ( nc & 3 ) == 0 );
10844 if ( sum == 0.0f ) {
10849 invDiagPtr[0] = d = 1.0f / sum;
10856 for ( j = 1; j < n; j++ ) {
10857 mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
10862 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
10863 sum = mptr[1] - s0;
10865 if ( sum == 0.0f ) {
10871 invDiagPtr[1] = d = 1.0f / sum;
10878 for ( j = 2; j < n; j++ ) {
10879 mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
10884 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
10885 v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
10886 sum = mptr[2] - s0 - s1;
10888 if ( sum == 0.0f ) {
10894 invDiagPtr[2] = d = 1.0f / sum;
10901 for ( j = 3; j < n; j++ ) {
10902 mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
10907 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
10908 v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
10909 v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
10910 sum = mptr[3] - s0 - s1 - s2;
10912 if ( sum == 0.0f ) {
10918 invDiagPtr[3] = d = 1.0f / sum;
10925 for ( j = 4; j < n; j++ ) {
10926 mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
10929 int ncf = nc * sizeof( float );
10954 mov eax, invDiagPtr
10958 movaps xmm0, [edx+ecx]
10959 mulps xmm0, [edi+ecx]
10960 movaps [esi+ecx], xmm0
10961 mulps xmm0, [edi+ecx]
10965 movaps xmm1, [edx+ecx-(8*4)]
10966 mulps xmm1, [edi+ecx-(8*4)]
10967 movaps [esi+ecx-(8*4)], xmm1
10968 mulps xmm1, [edi+ecx-(8*4)]
10970 movaps xmm2, [edx+ecx-(4*4)]
10971 mulps xmm2, [edi+ecx-(4*4)]
10972 movaps [esi+ecx-(4*4)], xmm2
10973 mulps xmm2, [edi+ecx-(4*4)]
10980 movaps xmm1, [edx+ecx-(4*4)]
10981 mulps xmm1, [edi+ecx-(4*4)]
10982 movaps [esi+ecx-(4*4)], xmm1
10983 mulps xmm1, [edi+ecx-(4*4)]
10989 movlps xmm3, [edx+ecx-(2*4)]
10990 movlps xmm4, [edi+ecx-(2*4)]
10992 movlps [esi+ecx-(2*4)], xmm3
10999 movss xmm3, [edx+ecx-(1*4)]
11000 movss xmm4, [edi+ecx-(1*4)]
11002 movss [esi+ecx-(1*4)], xmm3
11009 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
11018 cmpeqss xmm2, SIMD_SP_zero
11019 andps xmm2, SIMD_SP_tiny
11041 movaps xmm0, [esi+ecx]
11042 mulps xmm0, [edi+ecx]
11046 movaps xmm1, [esi+ecx-(8*4)]
11047 mulps xmm1, [edi+ecx-(8*4)]
11049 movaps xmm2, [esi+ecx-(4*4)]
11050 mulps xmm2, [edi+ecx-(4*4)]
11057 movaps xmm1, [esi+ecx-(4*4)]
11058 mulps xmm1, [edi+ecx-(4*4)]
11064 movlps xmm3, [esi+ecx-(2*4)]
11065 movlps xmm4, [edi+ecx-(2*4)]
11072 movss xmm3, [esi+ecx-(1*4)]
11073 movss xmm4, [edi+ecx-(1*4)]
11080 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
11100 float *v, *diag, *mptr;
11101 double s0, s1, s2, s3, sum, d;
11103 v = (float *) _alloca16( n * sizeof( float ) );
11104 diag = (float *) _alloca16( n * sizeof( float ) );
11116 if ( sum == 0.0f ) {
11121 invDiag[0] = d = 1.0f / sum;
11128 for ( j = 1; j < n; j++ ) {
11129 mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
11134 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
11135 sum = mptr[1] - s0;
11137 if ( sum == 0.0f ) {
11143 invDiag[1] = d = 1.0f / sum;
11150 for ( j = 2; j < n; j++ ) {
11151 mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
11156 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
11157 v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
11158 sum = mptr[2] - s0 - s1;
11160 if ( sum == 0.0f ) {
11166 invDiag[2] = d = 1.0f / sum;
11173 for ( j = 3; j < n; j++ ) {
11174 mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
11179 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
11180 v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
11181 v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
11182 sum = mptr[3] - s0 - s1 - s2;
11184 if ( sum == 0.0f ) {
11190 invDiag[3] = d = 1.0f / sum;
11197 for ( j = 4; j < n; j++ ) {
11198 mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
11201 for ( i = 4; i < n; i++ ) {
11205 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
11206 v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
11207 v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
11208 v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
11209 for ( k = 4; k < i-3; k += 4 ) {
11210 v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
11211 v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
11212 v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
11213 v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
11216 case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
11217 case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
11218 case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
11224 sum = mptr[i] - sum;
11226 if ( sum == 0.0f ) {
11232 invDiag[i] = d = 1.0f / sum;
11234 if ( i + 1 >= n ) {
11239 for ( j = i+1; j < n; j++ ) {
11240 s0 = mptr[0] * v[0];
11241 s1 = mptr[1] * v[1];
11242 s2 = mptr[2] * v[2];
11243 s3 = mptr[3] * v[3];
11244 for ( k = 4; k < i-7; k += 8 ) {
11245 s0 += mptr[k+0] * v[k+0];
11246 s1 += mptr[k+1] * v[k+1];
11247 s2 += mptr[k+2] * v[k+2];
11248 s3 += mptr[k+3] * v[k+3];
11249 s0 += mptr[k+4] * v[k+4];
11250 s1 += mptr[k+5] * v[k+5];
11251 s2 += mptr[k+6] * v[k+6];
11252 s3 += mptr[k+7] * v[k+7];
11255 case 7: s0 += mptr[k+6] * v[k+6];
11256 case 6: s1 += mptr[k+5] * v[k+5];
11257 case 5: s2 += mptr[k+4] * v[k+4];
11258 case 4: s3 += mptr[k+3] * v[k+3];
11259 case 3: s0 += mptr[k+2] * v[k+2];
11260 case 2: s1 += mptr[k+1] * v[k+1];
11261 case 1: s2 += mptr[k+0] * v[k+0];
11267 mptr[i] = ( mptr[i] - sum ) * d;
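// Illustrative aside: the four-way partial sums above compute one row of an
// in-place LDL' factorization. A hedged scalar sketch of that row update with
// assumed names ( FactorRow, mat, n rows, nc floats per row ); diag holds the
// pivots d(k), invDiag their reciprocals, and v is scratch of length i.
static bool FactorRow( float *mat, float *invDiag, float *diag, float *v, int n, int nc, int i ) {
	float *mptr = mat + i * nc;
	double sum = 0.0;
	for ( int k = 0; k < i; k++ ) {
		v[k] = diag[k] * mptr[k];			// d(k) * L(i,k)
		sum += v[k] * mptr[k];
	}
	double d = mptr[i] - sum;				// new pivot d(i)
	if ( d == 0.0 ) {
		return false;						// zero pivot: factorization fails
	}
	diag[i] = (float) d;
	invDiag[i] = (float) ( d = 1.0 / d );
	for ( int j = i + 1; j < n; j++ ) {		// update column i of the rows below
		float *rptr = mat + j * nc;
		double s = rptr[i];
		for ( int k = 0; k < i; k++ ) {
			s -= v[k] * rptr[k];
		}
		rptr[i] = (float) ( s * d );
	}
	return true;
}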
11282 #define REFINE_BLENDJOINTS_RECIPROCAL
11287 if ( lerp <= 0.0f ) {
11289 } else if ( lerp >= 1.0f ) {
11290 for ( i = 0; i < numJoints; i++ ) {
11292 joints[j] = blendJoints[j];
11297 for ( i = 0; i <= numJoints - 4; i += 4 ) {
11298 ALIGN16( float jointVert0[4] );
11299 ALIGN16( float jointVert1[4] );
11300 ALIGN16( float jointVert2[4] );
11301 ALIGN16( float blendVert0[4] );
11302 ALIGN16( float blendVert1[4] );
11303 ALIGN16( float blendVert2[4] );
11304 ALIGN16( float jointQuat0[4] );
11305 ALIGN16( float jointQuat1[4] );
11306 ALIGN16( float jointQuat2[4] );
11307 ALIGN16( float jointQuat3[4] );
11308 ALIGN16( float blendQuat0[4] );
11309 ALIGN16( float blendQuat1[4] );
11310 ALIGN16( float blendQuat2[4] );
11311 ALIGN16( float blendQuat3[4] );
11313 for ( int j = 0; j < 4; j++ ) {
11314 int n = index[i+j];
11316 jointVert0[j] = joints[n].t[0];
11317 jointVert1[j] = joints[n].t[1];
11318 jointVert2[j] = joints[n].t[2];
11320 blendVert0[j] = blendJoints[n].t[0];
11321 blendVert1[j] = blendJoints[n].t[1];
11322 blendVert2[j] = blendJoints[n].t[2];
11324 jointQuat0[j] = joints[n].q[0];
11325 jointQuat1[j] = joints[n].q[1];
11326 jointQuat2[j] = joints[n].q[2];
11327 jointQuat3[j] = joints[n].q[3];
11329 blendQuat0[j] = blendJoints[n].q[0];
11330 blendQuat1[j] = blendJoints[n].q[1];
11331 blendQuat2[j] = blendJoints[n].q[2];
11332 blendQuat3[j] = blendJoints[n].q[3];
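// Note: the gather loop above transposes four joints at a time from array-of-structures
// form into the structure-of-arrays temporaries ( jointVert0..2, jointQuat0..3 and their
// blend counterparts ), so the SSE code that follows can lerp the translations and slerp
// the quaternions of four joints with each packed instruction.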
11339 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
11340 movaps xmm0, blendVert0
11341 subps xmm0, jointVert0
11343 addps xmm0, jointVert0
11344 movaps jointVert0, xmm0
11345 movaps xmm1, blendVert1
11346 subps xmm1, jointVert1
11348 addps xmm1, jointVert1
11349 movaps jointVert1, xmm1
11350 movaps xmm2, blendVert2
11351 subps xmm2, jointVert2
11353 addps xmm2, jointVert2
11354 movaps jointVert2, xmm2
11357 movaps xmm0, jointQuat0
11358 mulps xmm0, blendQuat0
11359 movaps xmm1, jointQuat1
11360 mulps xmm1, blendQuat1
11362 movaps xmm2, jointQuat2
11363 mulps xmm2, blendQuat2
11365 movaps xmm3, jointQuat3
11366 mulps xmm3, blendQuat3
11371 andps xmm1, SIMD_SP_signBitMask
11376 movaps xmm3, SIMD_SP_one
11379 andps xmm4, SIMD_SP_tiny
11380 andps xmm3, SIMD_SP_absMask
11383 #ifdef REFINE_BLENDJOINTS_RECIPROCAL
11388 subps xmm2, SIMD_SP_rsqrt_c0
11389 mulps xmm4, SIMD_SP_rsqrt_c1
11402 #ifdef REFINE_BLENDJOINTS_RECIPROCAL
11414 andps xmm3, SIMD_SP_signBitMask
11416 andps xmm4, SIMD_SP_halfPI
11419 movaps xmm5, SIMD_SP_atan_c0
11421 addps xmm5, SIMD_SP_atan_c1
11423 addps xmm5, SIMD_SP_atan_c2
11425 addps xmm5, SIMD_SP_atan_c3
11427 addps xmm5, SIMD_SP_atan_c4
11429 addps xmm5, SIMD_SP_atan_c5
11431 addps xmm5, SIMD_SP_atan_c6
11433 addps xmm5, SIMD_SP_atan_c7
11435 addps xmm5, SIMD_SP_one
11449 movaps xmm4, SIMD_SP_sin_c0
11450 movaps xmm0, SIMD_SP_sin_c0
11453 addps xmm4, SIMD_SP_sin_c1
11454 addps xmm0, SIMD_SP_sin_c1
11457 addps xmm4, SIMD_SP_sin_c2
11458 addps xmm0, SIMD_SP_sin_c2
11461 addps xmm4, SIMD_SP_sin_c3
11462 addps xmm0, SIMD_SP_sin_c3
11465 addps xmm4, SIMD_SP_sin_c4
11466 addps xmm0, SIMD_SP_sin_c4
11469 addps xmm4, SIMD_SP_one
11470 addps xmm0, SIMD_SP_one
11478 movaps xmm0, jointQuat0
11480 movaps xmm1, blendQuat0
11483 movaps jointQuat0, xmm0
11485 movaps xmm1, jointQuat1
11487 movaps xmm2, blendQuat1
11490 movaps jointQuat1, xmm1
11492 movaps xmm2, jointQuat2
11494 movaps xmm3, blendQuat2
11497 movaps jointQuat2, xmm2
11499 movaps xmm3, jointQuat3
11501 movaps xmm4, blendQuat3
11504 movaps jointQuat3, xmm3
11509 jointVert0[0] += lerp * ( blendVert0[0] - jointVert0[0] );
11510 jointVert0[1] += lerp * ( blendVert0[1] - jointVert0[1] );
11511 jointVert0[2] += lerp * ( blendVert0[2] - jointVert0[2] );
11512 jointVert0[3] += lerp * ( blendVert0[3] - jointVert0[3] );
11514 jointVert1[0] += lerp * ( blendVert1[0] - jointVert1[0] );
11515 jointVert1[1] += lerp * ( blendVert1[1] - jointVert1[1] );
11516 jointVert1[2] += lerp * ( blendVert1[2] - jointVert1[2] );
11517 jointVert1[3] += lerp * ( blendVert1[3] - jointVert1[3] );
11519 jointVert2[0] += lerp * ( blendVert2[0] - jointVert2[0] );
11520 jointVert2[1] += lerp * ( blendVert2[1] - jointVert2[1] );
11521 jointVert2[2] += lerp * ( blendVert2[2] - jointVert2[2] );
11522 jointVert2[3] += lerp * ( blendVert2[3] - jointVert2[3] );
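// generic path: slerp the four quaternion pairs without SSE (shortest-arc sign fix, omega via atan, sin-weighted blend)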
11524 ALIGN16( float cosom[4] );
11525 ALIGN16( float sinom[4] );
11526 ALIGN16( float omega0[4] );
11527 ALIGN16( float omega1[4] );
11528 ALIGN16( float scale0[4] );
11529 ALIGN16( float scale1[4] );
11530 ALIGN16( unsigned long signBit[4] );
11532 cosom[0] = jointQuat0[0] * blendQuat0[0];
11533 cosom[1] = jointQuat0[1] * blendQuat0[1];
11534 cosom[2] = jointQuat0[2] * blendQuat0[2];
11535 cosom[3] = jointQuat0[3] * blendQuat0[3];
11537 cosom[0] += jointQuat1[0] * blendQuat1[0];
11538 cosom[1] += jointQuat1[1] * blendQuat1[1];
11539 cosom[2] += jointQuat1[2] * blendQuat1[2];
11540 cosom[3] += jointQuat1[3] * blendQuat1[3];
11542 cosom[0] += jointQuat2[0] * blendQuat2[0];
11543 cosom[1] += jointQuat2[1] * blendQuat2[1];
11544 cosom[2] += jointQuat2[2] * blendQuat2[2];
11545 cosom[3] += jointQuat2[3] * blendQuat2[3];
11547 cosom[0] += jointQuat3[0] * blendQuat3[0];
11548 cosom[1] += jointQuat3[1] * blendQuat3[1];
11549 cosom[2] += jointQuat3[2] * blendQuat3[2];
11550 cosom[3] += jointQuat3[3] * blendQuat3[3];
11552 signBit[0] = (*(unsigned long *)&cosom[0]) & ( 1 << 31 );
11553 signBit[1] = (*(unsigned long *)&cosom[1]) & ( 1 << 31 );
11554 signBit[2] = (*(unsigned long *)&cosom[2]) & ( 1 << 31 );
11555 signBit[3] = (*(unsigned long *)&cosom[3]) & ( 1 << 31 );
11557 (*(unsigned long *)&cosom[0]) ^= signBit[0];
11558 (*(unsigned long *)&cosom[1]) ^= signBit[1];
11559 (*(unsigned long *)&cosom[2]) ^= signBit[2];
11560 (*(unsigned long *)&cosom[3]) ^= signBit[3];
11562 scale0[0] = 1.0f - cosom[0] * cosom[0];
11563 scale0[1] = 1.0f - cosom[1] * cosom[1];
11564 scale0[2] = 1.0f - cosom[2] * cosom[2];
11565 scale0[3] = 1.0f - cosom[3] * cosom[3];
11567 scale0[0] = ( scale0[0] <= 0.0f ) ? SIMD_SP_tiny[0] : scale0[0];
11568 scale0[1] = ( scale0[1] <= 0.0f ) ? SIMD_SP_tiny[1] : scale0[1];
11569 scale0[2] = ( scale0[2] <= 0.0f ) ? SIMD_SP_tiny[2] : scale0[2];
11570 scale0[3] = ( scale0[3] <= 0.0f ) ? SIMD_SP_tiny[3] : scale0[3];
11577 scale0[0] *= sinom[0];
11578 scale0[1] *= sinom[1];
11579 scale0[2] *= sinom[2];
11580 scale0[3] *= sinom[3];
11582 omega0[0] = SSE_ATanPositive( scale0[0], cosom[0] );
11583 omega0[1] = SSE_ATanPositive( scale0[1], cosom[1] );
11584 omega0[2] = SSE_ATanPositive( scale0[2], cosom[2] );
11585 omega0[3] = SSE_ATanPositive( scale0[3], cosom[3] );
11587 omega1[0] = lerp * omega0[0];
11588 omega1[1] = lerp * omega0[1];
11589 omega1[2] = lerp * omega0[2];
11590 omega1[3] = lerp * omega0[3];
11592 omega0[0] -= omega1[0];
11593 omega0[1] -= omega1[1];
11594 omega0[2] -= omega1[2];
11595 omega0[3] -= omega1[3];
11597 scale0[0] = SSE_SinZeroHalfPI( omega0[0] ) * sinom[0];
11598 scale0[1] = SSE_SinZeroHalfPI( omega0[1] ) * sinom[1];
11599 scale0[2] = SSE_SinZeroHalfPI( omega0[2] ) * sinom[2];
11600 scale0[3] = SSE_SinZeroHalfPI( omega0[3] ) * sinom[3];
11602 scale1[0] = SSE_SinZeroHalfPI( omega1[0] ) * sinom[0];
11603 scale1[1] = SSE_SinZeroHalfPI( omega1[1] ) * sinom[1];
11604 scale1[2] = SSE_SinZeroHalfPI( omega1[2] ) * sinom[2];
11605 scale1[3] = SSE_SinZeroHalfPI( omega1[3] ) * sinom[3];
11607 (*(unsigned long *)&scale1[0]) ^= signBit[0];
11608 (*(unsigned long *)&scale1[1]) ^= signBit[1];
11609 (*(unsigned long *)&scale1[2]) ^= signBit[2];
11610 (*(unsigned long *)&scale1[3]) ^= signBit[3];
11612 jointQuat0[0] = scale0[0] * jointQuat0[0] + scale1[0] * blendQuat0[0];
11613 jointQuat0[1] = scale0[1] * jointQuat0[1] + scale1[1] * blendQuat0[1];
11614 jointQuat0[2] = scale0[2] * jointQuat0[2] + scale1[2] * blendQuat0[2];
11615 jointQuat0[3] = scale0[3] * jointQuat0[3] + scale1[3] * blendQuat0[3];
11617 jointQuat1[0] = scale0[0] * jointQuat1[0] + scale1[0] * blendQuat1[0];
11618 jointQuat1[1] = scale0[1] * jointQuat1[1] + scale1[1] * blendQuat1[1];
11619 jointQuat1[2] = scale0[2] * jointQuat1[2] + scale1[2] * blendQuat1[2];
11620 jointQuat1[3] = scale0[3] * jointQuat1[3] + scale1[3] * blendQuat1[3];
11622 jointQuat2[0] = scale0[0] * jointQuat2[0] + scale1[0] * blendQuat2[0];
11623 jointQuat2[1] = scale0[1] * jointQuat2[1] + scale1[1] * blendQuat2[1];
11624 jointQuat2[2] = scale0[2] * jointQuat2[2] + scale1[2] * blendQuat2[2];
11625 jointQuat2[3] = scale0[3] * jointQuat2[3] + scale1[3] * blendQuat2[3];
11627 jointQuat3[0] = scale0[0] * jointQuat3[0] + scale1[0] * blendQuat3[0];
11628 jointQuat3[1] = scale0[1] * jointQuat3[1] + scale1[1] * blendQuat3[1];
11629 jointQuat3[2] = scale0[2] * jointQuat3[2] + scale1[2] * blendQuat3[2];
11630 jointQuat3[3] = scale0[3] * jointQuat3[3] + scale1[3] * blendQuat3[3];
11634 for ( int j = 0; j < 4; j++ ) {
11635 int n = index[i+j];
11637 joints[n].t[0] = jointVert0[j];
11638 joints[n].t[1] = jointVert1[j];
11639 joints[n].t[2] = jointVert2[j];
11641 joints[n].q[0] = jointQuat0[j];
11642 joints[n].q[1] = jointQuat1[j];
11643 joints[n].q[2] = jointQuat2[j];
11644 joints[n].q[3] = jointQuat3[j];
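// blend any remaining joints one at a time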
11648 for ( ; i < numJoints; i++ ) {
11652 const idVec3 &blendVert = blendJoints[n].t;
11654 jointVert[0] += lerp * ( blendVert[0] - jointVert[0] );
11655 jointVert[1] += lerp * ( blendVert[1] - jointVert[1] );
11656 jointVert[2] += lerp * ( blendVert[2] - jointVert[2] );
11659 const idQuat &blendQuat = blendJoints[n].q;
11666 unsigned long signBit;
11668 cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;
11670 signBit = (*(unsigned long *)&cosom) & ( 1 << 31 );
11672 (*(unsigned long *)&cosom) ^= signBit;
11674 scale0 = 1.0f - cosom * cosom;
11675 scale0 = ( scale0 <= 0.0f ) ? SIMD_SP_tiny[0] : scale0;
11681 (*(unsigned long *)&scale1) ^= signBit;
11683 jointQuat.x = scale0 * jointQuat.x + scale1 * blendQuat.x;
11684 jointQuat.y = scale0 * jointQuat.y + scale1 * blendQuat.y;
11685 jointQuat.z = scale0 * jointQuat.z + scale1 * blendQuat.z;
11686 jointQuat.w = scale0 * jointQuat.w + scale1 * blendQuat.w;
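// convert each joint quaternion plus translation to a 3x4 joint matrix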
11701 for ( int i = 0; i < numJoints; i++ ) {
11710 float x2 = q[0] + q[0];
11711 float y2 = q[1] + q[1];
11712 float z2 = q[2] + q[2];
11715 float xx = q[0] * x2;
11716 float yy = q[1] * y2;
11717 float zz = q[2] * z2;
11719 m[0*4+0] = 1.0f - yy - zz;
11720 m[1*4+1] = 1.0f - xx - zz;
11721 m[2*4+2] = 1.0f - xx - yy;
11725 float yz = q[1] * z2;
11726 float wx = q[3] * x2;
11728 m[2*4+1] = yz - wx;
11729 m[1*4+2] = yz + wx;
11733 float xy = q[0] * y2;
11734 float wz = q[3] * z2;
11736 m[1*4+0] = xy - wz;
11737 m[0*4+1] = xy + wz;
11741 float xz = q[0] * z2;
11742 float wy = q[3] * y2;
11744 m[0*4+2] = xz - wy;
11745 m[2*4+0] = xz + wy;
11763 ALIGN16( byte shuffle[16] );
11768 mov edi, jointQuats
11771 imul eax, JOINTMAT_SIZE
11776 movss xmm5, [esi+eax+3*JOINTMAT_SIZE+0*16+0*4]
11777 movss xmm6, [esi+eax+3*JOINTMAT_SIZE+1*16+1*4]
11778 movss xmm7, [esi+eax+3*JOINTMAT_SIZE+2*16+2*4]
11780 shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
11781 shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
11782 shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )
11784 movss xmm0, [esi+eax+2*JOINTMAT_SIZE+0*16+0*4]
11785 movss xmm1, [esi+eax+2*JOINTMAT_SIZE+1*16+1*4]
11786 movss xmm2, [esi+eax+2*JOINTMAT_SIZE+2*16+2*4]
11792 shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
11793 shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
11794 shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )
11796 movss xmm0, [esi+eax+1*JOINTMAT_SIZE+0*16+0*4]
11797 movss xmm1, [esi+eax+1*JOINTMAT_SIZE+1*16+1*4]
11798 movss xmm2, [esi+eax+1*JOINTMAT_SIZE+2*16+2*4]
11804 shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
11805 shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
11806 shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )
11808 movss xmm0, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
11809 movss xmm1, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
11810 movss xmm2, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]
11821 cmpnltps xmm0, SIMD_SP_zero
11825 cmpnltps xmm1, xmm6
11826 cmpnltps xmm2, xmm7
11830 cmpnltps xmm4, xmm7
11838 xorps xmm3, SIMD_SP_not
11840 andps xmm0, SIMD_DW_mat2quatShuffle0
11842 andps xmm4, SIMD_DW_mat2quatShuffle1
11845 andps xmm4, SIMD_DW_mat2quatShuffle2
11848 andps xmm4, SIMD_DW_mat2quatShuffle3
11851 movaps shuffle, xmm4
11858 andps xmm0, SIMD_SP_signBitMask
11859 andps xmm1, SIMD_SP_signBitMask
11860 andps xmm2, SIMD_SP_signBitMask
11866 addps xmm7, SIMD_SP_one
11873 subps xmm5, SIMD_SP_rsqrt_c0
11874 mulps xmm6, SIMD_SP_mat2quat_rsqrt_c1
11878 xorps xmm6, SIMD_SP_signBitMask
11882 add edi, 4*JOINTQUAT_SIZE
11884 movzx ecx, byte ptr shuffle[0*4+0]
11885 movss [edi+ecx*4-4*JOINTQUAT_SIZE], xmm7
11887 movzx edx, byte ptr shuffle[0*4+1]
11888 movss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
11890 subss xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
11892 movss [edi+edx*4-4*JOINTQUAT_SIZE], xmm4
11894 movzx ecx, byte ptr shuffle[0*4+2]
11895 movss xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
11897 subss xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
11899 movss [edi+ecx*4-4*JOINTQUAT_SIZE], xmm3
11901 movzx edx, byte ptr shuffle[0*4+3]
11902 movss xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
11904 subss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
11906 movss [edi+edx*4-4*JOINTQUAT_SIZE], xmm4
11908 mov ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
11909 mov [edi-4*JOINTQUAT_SIZE+16], ecx
11910 mov edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
11911 mov [edi-4*JOINTQUAT_SIZE+20], edx
11912 mov ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
11913 mov [edi-4*JOINTQUAT_SIZE+24], ecx
11915 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
11916 shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
11917 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
11918 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
11919 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
11921 movzx ecx, byte ptr shuffle[1*4+0]
11922 movss [edi+ecx*4-3*JOINTQUAT_SIZE], xmm7
11924 movzx edx, byte ptr shuffle[1*4+1]
11925 movss xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+0*4]
11927 subss xmm4, [esi+eax+1*JOINTMAT_SIZE+0*16+1*4]
11929 movss [edi+edx*4-3*JOINTQUAT_SIZE], xmm4
11931 movzx ecx, byte ptr shuffle[1*4+2]
11932 movss xmm3, [esi+eax+1*JOINTMAT_SIZE+0*16+2*4]
11934 subss xmm3, [esi+eax+1*JOINTMAT_SIZE+2*16+0*4]
11936 movss [edi+ecx*4-3*JOINTQUAT_SIZE], xmm3
11938 movzx edx, byte ptr shuffle[1*4+3]
11939 movss xmm4, [esi+eax+1*JOINTMAT_SIZE+2*16+1*4]
11941 subss xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+2*4]
11943 movss [edi+edx*4-3*JOINTQUAT_SIZE], xmm4
11945 mov ecx, [esi+eax+1*JOINTMAT_SIZE+0*16+3*4]
11946 mov [edi-3*JOINTQUAT_SIZE+16], ecx
11947 mov edx, [esi+eax+1*JOINTMAT_SIZE+1*16+3*4]
11948 mov [edi-3*JOINTQUAT_SIZE+20], edx
11949 mov ecx, [esi+eax+1*JOINTMAT_SIZE+2*16+3*4]
11950 mov [edi-3*JOINTQUAT_SIZE+24], ecx
11952 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
11953 shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
11954 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
11955 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
11956 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
11958 movzx ecx, byte ptr shuffle[2*4+0]
11959 movss [edi+ecx*4-2*JOINTQUAT_SIZE], xmm7
11961 movzx edx, byte ptr shuffle[2*4+1]
11962 movss xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+0*4]
11964 subss xmm4, [esi+eax+2*JOINTMAT_SIZE+0*16+1*4]
11966 movss [edi+edx*4-2*JOINTQUAT_SIZE], xmm4
11968 movzx ecx, byte ptr shuffle[2*4+2]
11969 movss xmm3, [esi+eax+2*JOINTMAT_SIZE+0*16+2*4]
11971 subss xmm3, [esi+eax+2*JOINTMAT_SIZE+2*16+0*4]
11973 movss [edi+ecx*4-2*JOINTQUAT_SIZE], xmm3
11975 movzx edx, byte ptr shuffle[2*4+3]
11976 movss xmm4, [esi+eax+2*JOINTMAT_SIZE+2*16+1*4]
11978 subss xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+2*4]
11980 movss [edi+edx*4-2*JOINTQUAT_SIZE], xmm4
11982 mov ecx, [esi+eax+2*JOINTMAT_SIZE+0*16+3*4]
11983 mov [edi-2*JOINTQUAT_SIZE+16], ecx
11984 mov edx, [esi+eax+2*JOINTMAT_SIZE+1*16+3*4]
11985 mov [edi-2*JOINTQUAT_SIZE+20], edx
11986 mov ecx, [esi+eax+2*JOINTMAT_SIZE+2*16+3*4]
11987 mov [edi-2*JOINTQUAT_SIZE+24], ecx
11989 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
11990 shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
11991 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
11992 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
11993 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
11995 movzx ecx, byte ptr shuffle[3*4+0]
11996 movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7
11998 movzx edx, byte ptr shuffle[3*4+1]
11999 movss xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+0*4]
12001 subss xmm4, [esi+eax+3*JOINTMAT_SIZE+0*16+1*4]
12003 movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4
12005 movzx ecx, byte ptr shuffle[3*4+2]
12006 movss xmm3, [esi+eax+3*JOINTMAT_SIZE+0*16+2*4]
12008 subss xmm3, [esi+eax+3*JOINTMAT_SIZE+2*16+0*4]
12010 movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3
12012 movzx edx, byte ptr shuffle[3*4+3]
12013 movss xmm4, [esi+eax+3*JOINTMAT_SIZE+2*16+1*4]
12015 subss xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+2*4]
12017 movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4
12019 mov ecx, [esi+eax+3*JOINTMAT_SIZE+0*16+3*4]
12020 mov [edi-1*JOINTQUAT_SIZE+16], ecx
12021 mov edx, [esi+eax+3*JOINTMAT_SIZE+1*16+3*4]
12022 mov [edi-1*JOINTQUAT_SIZE+20], edx
12023 mov ecx, [esi+eax+3*JOINTMAT_SIZE+2*16+3*4]
12024 mov [edi-1*JOINTQUAT_SIZE+24], ecx
12026 add eax, 4*JOINTMAT_SIZE
12033 imul eax, JOINTMAT_SIZE
12038 movss xmm5, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
12039 movss xmm6, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
12040 movss xmm7, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]
12047 cmpnltss xmm0, SIMD_SP_zero
12051 cmpnltss xmm1, xmm6
12052 cmpnltss xmm2, xmm7
12056 cmpnltss xmm4, xmm7
12064 xorps xmm3, SIMD_SP_not
12066 andps xmm0, SIMD_DW_mat2quatShuffle0
12068 andps xmm4, SIMD_DW_mat2quatShuffle1
12071 andps xmm4, SIMD_DW_mat2quatShuffle2
12074 andps xmm4, SIMD_DW_mat2quatShuffle3
12077 movss shuffle, xmm4
12084 andps xmm0, SIMD_SP_signBitMask
12085 andps xmm1, SIMD_SP_signBitMask
12086 andps xmm2, SIMD_SP_signBitMask
12092 addss xmm7, SIMD_SP_one
12099 subss xmm5, SIMD_SP_rsqrt_c0
12100 mulss xmm6, SIMD_SP_mat2quat_rsqrt_c1
12104 xorps xmm6, SIMD_SP_signBitMask
12108 movzx ecx, byte ptr shuffle[0]
12109 add edi, JOINTQUAT_SIZE
12110 movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7
12112 movzx edx, byte ptr shuffle[1]
12113 movss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
12115 subss xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
12117 movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4
12119 movzx ecx, byte ptr shuffle[2]
12120 movss xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
12122 subss xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
12124 movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3
12126 movzx edx, byte ptr shuffle[3]
12127 movss xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
12129 subss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
12131 movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4
12133 mov ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
12134 mov [edi-1*JOINTQUAT_SIZE+16], ecx
12135 mov edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
12136 mov [edi-1*JOINTQUAT_SIZE+20], edx
12137 mov ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
12138 mov [edi-1*JOINTQUAT_SIZE+24], ecx
12140 add eax, JOINTMAT_SIZE
12148 for ( int i = 0; i < numJoints; i++ ) {
12150 int k0, k1, k2, k3;
12155 if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) {
12165 } else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) {
12175 } else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) {
12197 float t = s0 * m[0 * 4 + 0] + s1 * m[1 * 4 + 1] + s2 * m[2 * 4 + 2] + 1.0f;
12201 q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
12202 q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
12203 q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
12205 q[4] = m[0 * 4 + 3];
12206 q[5] = m[1 * 4 + 3];
12207 q[6] = m[2 * 4 + 3];
12212 for ( int i = 0; i < numJoints; i++ ) {
12217 if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) {
12219 float t = + m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f;
12223 q[2] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
12224 q[1] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
12225 q[0] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;
12227 } else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) {
12229 float t = + m[0 * 4 + 0] - m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f;
12233 q[1] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s;
12234 q[2] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s;
12235 q[3] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;
12237 } else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) {
12239 float t = - m[0 * 4 + 0] + m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f;
12243 q[0] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s;
12244 q[3] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
12245 q[2] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s;
12249 float t = - m[0 * 4 + 0] - m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f;
12253 q[3] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
12254 q[0] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s;
12255 q[1] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s;
12259 q[4] = m[0 * 4 + 3];
12260 q[5] = m[1 * 4 + 3];
12261 q[6] = m[2 * 4 + 3];
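// transform joints: concatenate each joint matrix with its parent's matrix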
12279 mov ecx, firstJoint
12294 movaps xmm0, [esi+ecx+ 0]
12296 movaps xmm1, [esi+ecx+16]
12297 imul edx, JOINTMAT_SIZE
12298 movaps xmm2, [esi+ecx+32]
12300 movss xmm4, [esi+edx+ 0]
12301 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
12304 movss xmm5, [esi+edx+ 4]
12305 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12308 movss xmm6, [esi+edx+ 8]
12309 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12313 movss xmm5, [esi+edx+16]
12314 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12317 movss xmm7, [esi+edx+12]
12318 shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
12321 movaps [esi+ecx+ 0], xmm4
12323 movss xmm6, [esi+edx+20]
12324 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12327 movss xmm7, [esi+edx+24]
12328 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
12332 movss xmm6, [esi+edx+32]
12333 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12336 movss xmm3, [esi+edx+28]
12337 shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
12340 movaps [esi+ecx+16], xmm5
12342 movss xmm7, [esi+edx+36]
12343 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
12346 movss xmm3, [esi+edx+40]
12347 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
12351 movss xmm7, [esi+edx+44]
12352 shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
12355 movaps [esi+ecx+32], xmm6
12357 add ecx, JOINTMAT_SIZE
12367 for( i = firstJoint; i <= lastJoint; i++ ) {
12368 assert( parents[i] < i );
12369 jointMats[i] *= jointMats[parents[i]];
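// untransform joints: remove the parent transform again, walking the joints in reverse order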
12387 mov edx, firstJoint
12393 imul ecx, JOINTMAT_SIZE
12401 movaps xmm0, [esi+ecx+ 0]
12403 movaps xmm1, [esi+ecx+16]
12404 imul edx, JOINTMAT_SIZE
12405 movaps xmm2, [esi+ecx+32]
12407 movss xmm6, [esi+edx+12]
12408 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
12410 movss xmm7, [esi+edx+28]
12411 shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
12413 movss xmm3, [esi+edx+44]
12414 shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
12417 movss xmm4, [esi+edx+ 0]
12418 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
12420 movss xmm5, [esi+edx+16]
12421 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12424 movss xmm6, [esi+edx+32]
12425 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12429 movaps [esi+ecx+ 0], xmm4
12431 movss xmm5, [esi+edx+ 4]
12432 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12434 movss xmm6, [esi+edx+20]
12435 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12438 movss xmm7, [esi+edx+36]
12439 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
12443 movaps [esi+ecx+16], xmm5
12445 movss xmm6, [esi+edx+ 8]
12446 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12448 movss xmm7, [esi+edx+24]
12449 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
12452 movss xmm3, [esi+edx+40]
12453 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
12457 movaps [esi+ecx+32], xmm6
12459 sub ecx, JOINTMAT_SIZE
12469 for( i = lastJoint; i >= firstJoint; i-- ) {
12470 assert( parents[i] < i );
12471 jointMats[i] /= jointMats[parents[i]];
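// transform verts: accumulate the weighted joint matrix transforms for each vertex (one or more weights per vertex)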
12495 imul eax, DRAWVERT_SIZE
12510 add esi, JOINTWEIGHT_SIZE
12513 mulps xmm0, [edi+ebx+ 0]
12514 mulps xmm1, [edi+ebx+16]
12515 mulps xmm2, [edi+ebx+32]
12517 cmp dword ptr [edx-4], 0
12526 add esi, JOINTWEIGHT_SIZE
12529 mulps xmm3, [edi+ebx+ 0]
12530 mulps xmm4, [edi+ebx+16]
12531 mulps xmm5, [edi+ebx+32]
12533 cmp dword ptr [edx-4], 0
12542 add eax, DRAWVERT_SIZE
12545 unpcklps xmm6, xmm1
12546 unpckhps xmm0, xmm1
12554 movhps [ecx+eax-DRAWVERT_SIZE+0], xmm6
12557 shufps xmm5, xmm5, R_SHUFFLEPS( 1, 0, 2, 3 )
12560 movss [ecx+eax-DRAWVERT_SIZE+8], xmm5
12569 const byte *jointsPtr = (byte *)joints;
12571 for( j = i = 0; i < numVerts; i++ ) {
12574 v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
12575 while( index[j*2+1] == 0 ) {
12577 v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
12606 movhps xmm1, [edi+16]
12607 movlps xmm3, [edi+8]
12608 movhps xmm3, [edi+24]
12609 movlps xmm4, [edi+32]
12610 movhps xmm4, [edi+48]
12611 movlps xmm5, [edi+40]
12612 movhps xmm5, [edi+56]
12614 shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
12615 shufps xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
12617 shufps xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 )
12618 shufps xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 )
12620 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
12625 imul eax, DRAWVERT_SIZE
12630 movss xmm4, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
12631 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
12632 movss xmm5, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
12634 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12635 movss xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
12637 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12643 xorps xmm5, SIMD_SP_signBitMask
12652 add eax, DRAWVERT_SIZE
12653 mov byte ptr [edi-1], cl
12658 mov byte ptr [esi], dl
12669 for ( i = 0; i < numVerts; i++ ) {
12671 float d0, d1, d2, d3, t;
12674 d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
12675 d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
12676 d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
12677 d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
12700 cullBits[i] = bits;
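// cull against six planes: rearrange the planes into the aligned arrays p0..p7 and test two vertices per iteration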
12716 ALIGN16( float p0[4] );
12717 ALIGN16( float p1[4] );
12718 ALIGN16( float p2[4] );
12719 ALIGN16( float p3[4] );
12720 ALIGN16( float p4[4] );
12721 ALIGN16( float p5[4] );
12722 ALIGN16( float p6[4] );
12723 ALIGN16( float p7[4] );
12731 movhps xmm1, [ecx+16]
12732 movlps xmm3, [ecx+8]
12733 movhps xmm3, [ecx+24]
12734 movlps xmm4, [ecx+32]
12735 movhps xmm4, [ecx+48]
12736 movlps xmm5, [ecx+40]
12737 movhps xmm5, [ecx+56]
12739 shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
12740 shufps xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
12742 shufps xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 )
12743 shufps xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 )
12750 movlps xmm4, [ecx+64]
12751 movhps xmm4, [ecx+80]
12753 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
12754 shufps xmm5, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 )
12755 movlps xmm6, [ecx+72]
12756 movhps xmm6, [ecx+88]
12758 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 )
12759 shufps xmm7, xmm7, R_SHUFFLEPS( 1, 3, 1, 3 )
12771 imul eax, DRAWVERT_SIZE
12777 movss xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
12778 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
12781 movss xmm1, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
12782 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
12786 movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
12787 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
12792 cmpnltps xmm6, SIMD_SP_zero
12796 movss xmm3, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
12797 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
12800 movss xmm4, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
12801 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
12805 movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
12806 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12811 cmpnltps xmm6, SIMD_SP_zero
12815 shufps xmm0, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
12817 shufps xmm1, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
12820 shufps xmm2, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12825 cmpnltps xmm0, SIMD_SP_zero
12833 and edx, (3<<4)|(3<<12)
12836 add eax, 2*DRAWVERT_SIZE
12837 mov word ptr [edi-2], cx
12847 movss xmm0, [esi+DRAWVERT_XYZ_OFFSET+0]
12848 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
12851 movss xmm1, [esi+DRAWVERT_XYZ_OFFSET+4]
12852 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
12856 movss xmm2, [esi+DRAWVERT_XYZ_OFFSET+8]
12857 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
12862 cmpnltps xmm6, SIMD_SP_zero
12872 cmpnltps xmm0, SIMD_SP_zero
12879 mov byte ptr [edi], cl
12889 for ( i = 0; i < numVerts; i += 2 ) {
12890 unsigned short bits0, bits1;
12891 float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11;
12895 d0 = planes[0][0] * v0[0] + planes[0][1] * v0[1] + planes[0][2] * v0[2] + planes[0][3];
12896 d1 = planes[1][0] * v0[0] + planes[1][1] * v0[1] + planes[1][2] * v0[2] + planes[1][3];
12897 d2 = planes[2][0] * v0[0] + planes[2][1] * v0[1] + planes[2][2] * v0[2] + planes[2][3];
12898 d3 = planes[3][0] * v0[0] + planes[3][1] * v0[1] + planes[3][2] * v0[2] + planes[3][3];
12900 d4 = planes[4][0] * v0[0] + planes[4][1] * v0[1] + planes[4][2] * v0[2] + planes[4][3];
12901 d5 = planes[5][0] * v0[0] + planes[5][1] * v0[1] + planes[5][2] * v0[2] + planes[5][3];
12902 d10 = planes[4][0] * v1[0] + planes[4][1] * v1[1] + planes[4][2] * v1[2] + planes[4][3];
12903 d11 = planes[5][0] * v1[0] + planes[5][1] * v1[1] + planes[5][2] * v1[2] + planes[5][3];
12905 d6 = planes[0][0] * v1[0] + planes[0][1] * v1[1] + planes[0][2] * v1[2] + planes[0][3];
12906 d7 = planes[1][0] * v1[0] + planes[1][1] * v1[1] + planes[1][2] * v1[2] + planes[1][3];
12907 d8 = planes[2][0] * v1[0] + planes[2][1] * v1[1] + planes[2][2] * v1[2] + planes[2][3];
12908 d9 = planes[3][0] * v1[0] + planes[3][1] * v1[1] + planes[3][2] * v1[2] + planes[3][3];
12924 *(unsigned short *)(cullBits + i) = ( bits0 | bits1 ) ^ 0x3F3F;
12927 if ( numVerts & 1 ) {
12929 float d0, d1, d2, d3, d4, d5;
12930 const idVec3 &v = verts[numVerts - 1].xyz;
12932 d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
12933 d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
12934 d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
12935 d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
12937 d4 = planes[4][0] * v[0] + planes[4][1] * v[1] + planes[4][2] * v[2] + planes[4][3];
12938 d5 = planes[5][0] * v[0] + planes[5][1] * v[1] + planes[5][2] * v[2] + planes[5][3];
12948 cullBits[numVerts - 1] = bits ^ 0x3F;
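// project each vertex onto the two overlay planes to get its texture coordinates and derive the cull bits from them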
12972 movss xmm4, [ecx+ 0]
12973 movss xmm5, [ecx+16]
12974 shufps xmm4, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12975 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
12976 movss xmm5, [ecx+ 4]
12977 movss xmm6, [ecx+20]
12978 shufps xmm5, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12979 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 )
12980 movss xmm6, [ecx+ 8]
12981 movss xmm7, [ecx+24]
12982 shufps xmm6, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
12983 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 )
12984 movss xmm7, [ecx+12]
12985 movss xmm0, [ecx+28]
12986 shufps xmm7, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
12987 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 2, 0, 2 )
12995 movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
12996 movss xmm1, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
12997 shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
12999 movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
13000 movss xmm2, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
13001 shufps xmm1, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
13003 movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
13004 movss xmm3, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
13005 shufps xmm2, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
13012 movaps xmm2, SIMD_SP_one
13014 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
13015 shufps xmm1, xmm2, R_SHUFFLEPS( 2, 3, 2, 3 )
13016 add edx, 2*DRAWVERT_SIZE
13018 mov byte ptr [edi+eax+0], cl
13021 mov byte ptr [edi+eax+1], cl
13030 movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
13031 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
13033 movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
13034 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
13036 movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
13037 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
13044 movaps xmm2, SIMD_SP_one
13046 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
13048 mov byte ptr [edi], cl
13055 const idPlane &p0 = planes[0];
13056 const idPlane &p1 = planes[1];
13058 for ( int i = 0; i < numVerts - 1; i += 2 ) {
13059 unsigned short bits;
13060 float d0, d1, d2, d3;
13065 d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
13066 d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
13067 d2 = p0[0] * v1[0] + p0[1] * v1[1] + p0[2] * v1[2] + p0[3];
13068 d3 = p1[0] * v1[0] + p1[1] * v1[1] + p1[2] * v1[2] + p1[3];
13070 texCoords[i+0][0] = d0;
13071 texCoords[i+0][1] = d1;
13072 texCoords[i+1][0] = d2;
13073 texCoords[i+1][1] = d3;
13090 *(unsigned short *)(cullBits + i) = bits;
13093 if ( numVerts & 1 ) {
13097 const idPlane &p0 = planes[0];
13098 const idPlane &p1 = planes[1];
13099 const idVec3 &v0 = verts[numVerts - 1].xyz;
13101 d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
13102 d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
13104 texCoords[i][0] = d0;
13105 texCoords[i][1] = d1;
13116 cullBits[numVerts - 1] = bits;
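// derive a plane for each input triangle; the SSE path below processes four triangles per loop iteration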
13134 mov eax, numIndexes
13147 mov ebx, [edi+eax-4*12+4]
13148 imul ebx, DRAWVERT_SIZE
13149 mov ecx, [edi+eax-4*12+0]
13150 imul ecx, DRAWVERT_SIZE
13152 movss xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13153 subss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13155 movss xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13156 subss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13158 movss xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13159 subss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13161 mov ebx, [edi+eax-4*12+8]
13162 imul ebx, DRAWVERT_SIZE
13164 shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
13165 shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
13166 shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
13168 movss xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13169 subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13171 movss xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13172 subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13174 movss xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13175 subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13177 mov ebx, [edi+eax-3*12+4]
13178 imul ebx, DRAWVERT_SIZE
13179 mov ecx, [edi+eax-3*12+0]
13180 imul ecx, DRAWVERT_SIZE
13182 shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
13183 shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
13184 shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
13186 movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13187 subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13190 movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13191 subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13194 movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13195 subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13198 mov ebx, [edi+eax-3*12+8]
13199 imul ebx, DRAWVERT_SIZE
13201 shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
13202 shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
13203 shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
13205 movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13206 subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13209 movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13210 subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13213 movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13214 subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13217 mov ebx, [edi+eax-2*12+4]
13218 imul ebx, DRAWVERT_SIZE
13219 mov ecx, [edi+eax-2*12+0]
13220 imul ecx, DRAWVERT_SIZE
13222 shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
13223 shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
13224 shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
13226 movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13227 subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13230 movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13231 subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13234 movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13235 subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13238 mov ebx, [edi+eax-2*12+8]
13239 imul ebx, DRAWVERT_SIZE
13241 shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
13242 shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
13243 shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
13245 movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13246 subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13249 movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13250 subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13253 movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13254 subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13257 mov ebx, [edi+eax-1*12+4]
13258 imul ebx, DRAWVERT_SIZE
13259 mov ecx, [edi+eax-1*12+0]
13260 imul ecx, DRAWVERT_SIZE
13262 shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
13263 shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
13264 shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
13266 movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13267 subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13270 movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13271 subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13274 movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13275 subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13278 mov ebx, [edi+eax-1*12+8]
13279 imul ebx, DRAWVERT_SIZE
13281 movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13282 subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13285 movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13286 subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13289 movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13290 subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13319 mov ecx, [edi+eax-1*12+0]
13320 imul ecx, DRAWVERT_SIZE
13326 movss [edx-1*16+0], xmm0
13327 movss [edx-1*16+4], xmm1
13328 movss [edx-1*16+8], xmm2
13330 mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13331 mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13332 mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13334 xorps xmm0, SIMD_SP_singleSignBitMask
13337 movss [edx-1*16+12], xmm0
13339 mov ecx, [edi+eax-2*12+0]
13340 imul ecx, DRAWVERT_SIZE
13342 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
13343 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
13344 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
13346 movss [edx-2*16+0], xmm0
13347 movss [edx-2*16+4], xmm1
13348 movss [edx-2*16+8], xmm2
13350 mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13351 mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13352 mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13354 xorps xmm0, SIMD_SP_singleSignBitMask
13357 movss [edx-2*16+12], xmm0
13359 mov ecx, [edi+eax-3*12+0]
13360 imul ecx, DRAWVERT_SIZE
13362 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
13363 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
13364 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
13366 movss [edx-3*16+0], xmm0
13367 movss [edx-3*16+4], xmm1
13368 movss [edx-3*16+8], xmm2
13370 mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13371 mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13372 mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13374 xorps xmm0, SIMD_SP_singleSignBitMask
13377 movss [edx-3*16+12], xmm0
13379 mov ecx, [edi+eax-4*12+0]
13380 imul ecx, DRAWVERT_SIZE
13382 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
13383 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
13384 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
13386 movss [edx-4*16+0], xmm0
13387 movss [edx-4*16+4], xmm1
13388 movss [edx-4*16+8], xmm2
13390 mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13391 mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13392 mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13394 xorps xmm0, SIMD_SP_singleSignBitMask
13397 movss [edx-4*16+12], xmm0
13408 mov ebx, [edi+eax+4]
13409 imul ebx, DRAWVERT_SIZE
13410 mov ecx, [edi+eax+0]
13411 imul ecx, DRAWVERT_SIZE
13413 movss xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13414 subss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13416 movss xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13417 subss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13419 movss xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13420 subss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13422 mov ebx, [edi+eax+8]
13423 imul ebx, DRAWVERT_SIZE
13425 movss xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13426 subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13428 movss xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13429 subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13431 movss xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13432 subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13465 movss [edx-1*16+0], xmm0
13466 movss [edx-1*16+4], xmm1
13467 movss [edx-1*16+8], xmm2
13469 mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13470 mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13471 mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13473 xorps xmm0, SIMD_SP_singleSignBitMask
13476 movss [edx-1*16+12], xmm0
13488 for ( i = 0; i <= numIndexes - 12; i += 12 ) {
13489 ALIGN16( float d0[4] );
13490 ALIGN16( float d1[4] );
13491 ALIGN16( float d2[4] );
13492 ALIGN16( float d3[4] );
13493 ALIGN16( float d4[4] );
13494 ALIGN16( float d5[4] );
13495 ALIGN16( float n0[4] );
13496 ALIGN16( float n1[4] );
13497 ALIGN16( float n2[4] );
13499 for ( j = 0; j < 4; j++ ) {
13502 a = verts + indexes[i + j * 3 + 0];
13503 b = verts + indexes[i + j * 3 + 1];
13504 c = verts + indexes[i + j * 3 + 2];
13506 d0[j] = b->xyz[0] - a->xyz[0];
13507 d1[j] = b->xyz[1] - a->xyz[1];
13508 d2[j] = b->xyz[2] - a->xyz[2];
13510 d3[j] = c->xyz[0] - a->xyz[0];
13511 d4[j] = c->xyz[1] - a->xyz[1];
13512 d5[j] = c->xyz[2] - a->xyz[2];
13515 ALIGN16( float tmp[4] );
13517 n0[0] = d4[0] * d2[0];
13518 n0[1] = d4[1] * d2[1];
13519 n0[2] = d4[2] * d2[2];
13520 n0[3] = d4[3] * d2[3];
13522 n0[0] -= d5[0] * d1[0];
13523 n0[1] -= d5[1] * d1[1];
13524 n0[2] -= d5[2] * d1[2];
13525 n0[3] -= d5[3] * d1[3];
13527 n1[0] = d5[0] * d0[0];
13528 n1[1] = d5[1] * d0[1];
13529 n1[2] = d5[2] * d0[2];
13530 n1[3] = d5[3] * d0[3];
13532 n1[0] -= d3[0] * d2[0];
13533 n1[1] -= d3[1] * d2[1];
13534 n1[2] -= d3[2] * d2[2];
13535 n1[3] -= d3[3] * d2[3];
13537 n2[0] = d3[0] * d1[0];
13538 n2[1] = d3[1] * d1[1];
13539 n2[2] = d3[2] * d1[2];
13540 n2[3] = d3[3] * d1[3];
13542 n2[0] -= d4[0] * d0[0];
13543 n2[1] -= d4[1] * d0[1];
13544 n2[2] -= d4[2] * d0[2];
13545 n2[3] -= d4[3] * d0[3];
13547 tmp[0] = n0[0] * n0[0];
13548 tmp[1] = n0[1] * n0[1];
13549 tmp[2] = n0[2] * n0[2];
13550 tmp[3] = n0[3] * n0[3];
13552 tmp[0] += n1[0] * n1[0];
13553 tmp[1] += n1[1] * n1[1];
13554 tmp[2] += n1[2] * n1[2];
13555 tmp[3] += n1[3] * n1[3];
13557 tmp[0] += n2[0] * n2[0];
13558 tmp[1] += n2[1] * n2[1];
13559 tmp[2] += n2[2] * n2[2];
13560 tmp[3] += n2[3] * n2[3];
13583 for ( j = 0; j < 4; j++ ) {
13586 a = verts + indexes[i + j * 3];
13596 for ( ; i < numIndexes; i += 3 ) {
13598 float d0, d1, d2, d3, d4, d5;
13601 a = verts + indexes[i + 0];
13602 b = verts + indexes[i + 1];
13603 c = verts + indexes[i + 2];
13605 d0 = b->xyz[0] - a->xyz[0];
13606 d1 = b->xyz[1] - a->xyz[1];
13607 d2 = b->xyz[2] - a->xyz[2];
13609 d3 = c->xyz[0] - a->xyz[0];
13610 d4 = c->xyz[1] - a->xyz[1];
13611 d5 = c->xyz[2] - a->xyz[2];
13615 n0 = d4 * d2 - d5 * d1;
13616 n1 = d5 * d0 - d3 * d2;
13617 n2 = d3 * d1 - d4 * d0;
13625 planes->Normal()[0] = n0;
13626 planes->Normal()[1] = n1;
13627 planes->Normal()[2] = n2;
13641 #define FIX_DEGENERATE_TANGENT
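// FIX_DEGENERATE_TANGENT forces a tiny non-zero squared length before the reciprocal square root so degenerate triangles do not produce INF/NaN tangents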
13648 assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
13649 assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
13653 assert( numVerts >= 0 );
13655 #ifdef REFINE_TANGENT_SQUAREROOT
13657 movaps xmm6, SIMD_SP_rsqrt_c0
13658 movaps xmm7, SIMD_SP_rsqrt_c1
13662 bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
13663 memset( used, 0, numVerts * sizeof( used[0] ) );
13665 for ( i = 0; i <= numIndexes - 12; i += 12 ) {
13667 ALIGN16( unsigned long signBit[4] );
13668 ALIGN16( float d0[4] );
13669 ALIGN16( float d1[4] );
13670 ALIGN16( float d2[4] );
13671 ALIGN16( float d3[4] );
13672 ALIGN16( float d4[4] );
13673 ALIGN16( float d5[4] );
13674 ALIGN16( float d6[4] );
13675 ALIGN16( float d7[4] );
13676 ALIGN16( float d8[4] );
13677 ALIGN16( float d9[4] );
13678 ALIGN16( float n0[4] );
13679 ALIGN16( float n1[4] );
13680 ALIGN16( float n2[4] );
13681 ALIGN16( float t0[4] );
13682 ALIGN16( float t1[4] );
13683 ALIGN16( float t2[4] );
13684 ALIGN16( float t3[4] );
13685 ALIGN16( float t4[4] );
13686 ALIGN16( float t5[4] );
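// gather the position and texture-coordinate deltas of four triangles into the SoA arrays d0..d9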
13688 for ( int j = 0; j < 4; j++ ) {
13690 a = verts + indexes[i + j * 3 + 0];
13691 b = verts + indexes[i + j * 3 + 1];
13692 c = verts + indexes[i + j * 3 + 2];
13694 d0[j] = b->xyz[0] - a->xyz[0];
13695 d1[j] = b->xyz[1] - a->xyz[1];
13696 d2[j] = b->xyz[2] - a->xyz[2];
13697 d3[j] = b->st[0] - a->st[0];
13698 d4[j] = b->st[1] - a->st[1];
13700 d5[j] = c->xyz[0] - a->xyz[0];
13701 d6[j] = c->xyz[1] - a->xyz[1];
13702 d7[j] = c->xyz[2] - a->xyz[2];
13703 d8[j] = c->st[0] - a->st[0];
13704 d9[j] = c->st[1] - a->st[1];
13740 #ifdef FIX_DEGENERATE_TANGENT
13743 andps xmm4, SIMD_SP_tiny
13744 andps xmm3, SIMD_SP_absMask
13748 #ifdef REFINE_TANGENT_SQUAREROOT
13771 andps xmm0, SIMD_SP_signBitMask
13772 movaps signBit, xmm0
13804 #ifdef FIX_DEGENERATE_TANGENT
13807 andps xmm4, SIMD_SP_tiny
13808 andps xmm3, SIMD_SP_absMask
13812 #ifdef REFINE_TANGENT_SQUAREROOT
13822 xorps xmm3, signBit
13861 #ifdef FIX_DEGENERATE_TANGENT
13864 andps xmm4, SIMD_SP_tiny
13865 andps xmm3, SIMD_SP_absMask
13869 #ifdef REFINE_TANGENT_SQUAREROOT
13879 xorps xmm3, signBit
13891 ALIGN16( float tmp[4] );
13894 n0[0] = d6[0] * d2[0];
13895 n0[1] = d6[1] * d2[1];
13896 n0[2] = d6[2] * d2[2];
13897 n0[3] = d6[3] * d2[3];
13899 n0[0] -= d7[0] * d1[0];
13900 n0[1] -= d7[1] * d1[1];
13901 n0[2] -= d7[2] * d1[2];
13902 n0[3] -= d7[3] * d1[3];
13904 n1[0] = d7[0] * d0[0];
13905 n1[1] = d7[1] * d0[1];
13906 n1[2] = d7[2] * d0[2];
13907 n1[3] = d7[3] * d0[3];
13909 n1[0] -= d5[0] * d2[0];
13910 n1[1] -= d5[1] * d2[1];
13911 n1[2] -= d5[2] * d2[2];
13912 n1[3] -= d5[3] * d2[3];
13914 n2[0] = d5[0] * d1[0];
13915 n2[1] = d5[1] * d1[1];
13916 n2[2] = d5[2] * d1[2];
13917 n2[3] = d5[3] * d1[3];
13919 n2[0] -= d6[0] * d0[0];
13920 n2[1] -= d6[1] * d0[1];
13921 n2[2] -= d6[2] * d0[2];
13922 n2[3] -= d6[3] * d0[3];
13924 tmp[0] = n0[0] * n0[0];
13925 tmp[1] = n0[1] * n0[1];
13926 tmp[2] = n0[2] * n0[2];
13927 tmp[3] = n0[3] * n0[3];
13929 tmp[0] += n1[0] * n1[0];
13930 tmp[1] += n1[1] * n1[1];
13931 tmp[2] += n1[2] * n1[2];
13932 tmp[3] += n1[3] * n1[3];
13934 tmp[0] += n2[0] * n2[0];
13935 tmp[1] += n2[1] * n2[1];
13936 tmp[2] += n2[2] * n2[2];
13937 tmp[3] += n2[3] * n2[3];
13960 tmp[0] = d3[0] * d9[0];
13961 tmp[1] = d3[1] * d9[1];
13962 tmp[2] = d3[2] * d9[2];
13963 tmp[3] = d3[3] * d9[3];
13965 tmp[0] -= d4[0] * d8[0];
13966 tmp[1] -= d4[1] * d8[1];
13967 tmp[2] -= d4[2] * d8[2];
13968 tmp[3] -= d4[3] * d8[3];
13970 signBit[0] = ( *(unsigned long *)&tmp[0] ) & ( 1 << 31 );
13971 signBit[1] = ( *(unsigned long *)&tmp[1] ) & ( 1 << 31 );
13972 signBit[2] = ( *(unsigned long *)&tmp[2] ) & ( 1 << 31 );
13973 signBit[3] = ( *(unsigned long *)&tmp[3] ) & ( 1 << 31 );
13976 t0[0] = d0[0] * d9[0];
13977 t0[1] = d0[1] * d9[1];
13978 t0[2] = d0[2] * d9[2];
13979 t0[3] = d0[3] * d9[3];
13981 t0[0] -= d4[0] * d5[0];
13982 t0[1] -= d4[1] * d5[1];
13983 t0[2] -= d4[2] * d5[2];
13984 t0[3] -= d4[3] * d5[3];
13986 t1[0] = d1[0] * d9[0];
13987 t1[1] = d1[1] * d9[1];
13988 t1[2] = d1[2] * d9[2];
13989 t1[3] = d1[3] * d9[3];
13991 t1[0] -= d4[0] * d6[0];
13992 t1[1] -= d4[1] * d6[1];
13993 t1[2] -= d4[2] * d6[2];
13994 t1[3] -= d4[3] * d6[3];
13996 t2[0] = d2[0] * d9[0];
13997 t2[1] = d2[1] * d9[1];
13998 t2[2] = d2[2] * d9[2];
13999 t2[3] = d2[3] * d9[3];
14001 t2[0] -= d4[0] * d7[0];
14002 t2[1] -= d4[1] * d7[1];
14003 t2[2] -= d4[2] * d7[2];
14004 t2[3] -= d4[3] * d7[3];
14006 tmp[0] = t0[0] * t0[0];
14007 tmp[1] = t0[1] * t0[1];
14008 tmp[2] = t0[2] * t0[2];
14009 tmp[3] = t0[3] * t0[3];
14011 tmp[0] += t1[0] * t1[0];
14012 tmp[1] += t1[1] * t1[1];
14013 tmp[2] += t1[2] * t1[2];
14014 tmp[3] += t1[3] * t1[3];
14016 tmp[0] += t2[0] * t2[0];
14017 tmp[1] += t2[1] * t2[1];
14018 tmp[2] += t2[2] * t2[2];
14019 tmp[3] += t2[3] * t2[3];
14026 *(unsigned long *)&tmp[0] ^= signBit[0];
14027 *(unsigned long *)&tmp[1] ^= signBit[1];
14028 *(unsigned long *)&tmp[2] ^= signBit[2];
14029 *(unsigned long *)&tmp[3] ^= signBit[3];
14047 t3[0] = d3[0] * d5[0];
14048 t3[1] = d3[1] * d5[1];
14049 t3[2] = d3[2] * d5[2];
14050 t3[3] = d3[3] * d5[3];
14052 t3[0] -= d0[0] * d8[0];
14053 t3[1] -= d0[1] * d8[1];
14054 t3[2] -= d0[2] * d8[2];
14055 t3[3] -= d0[3] * d8[3];
14057 t4[0] = d3[0] * d6[0];
14058 t4[1] = d3[1] * d6[1];
14059 t4[2] = d3[2] * d6[2];
14060 t4[3] = d3[3] * d6[3];
14062 t4[0] -= d1[0] * d8[0];
14063 t4[1] -= d1[1] * d8[1];
14064 t4[2] -= d1[2] * d8[2];
14065 t4[3] -= d1[3] * d8[3];
14067 t5[0] = d3[0] * d7[0];
14068 t5[1] = d3[1] * d7[1];
14069 t5[2] = d3[2] * d7[2];
14070 t5[3] = d3[3] * d7[3];
14072 t5[0] -= d2[0] * d8[0];
14073 t5[1] -= d2[1] * d8[1];
14074 t5[2] -= d2[2] * d8[2];
14075 t5[3] -= d2[3] * d8[3];
14077 tmp[0] = t3[0] * t3[0];
14078 tmp[1] = t3[1] * t3[1];
14079 tmp[2] = t3[2] * t3[2];
14080 tmp[3] = t3[3] * t3[3];
14082 tmp[0] += t4[0] * t4[0];
14083 tmp[1] += t4[1] * t4[1];
14084 tmp[2] += t4[2] * t4[2];
14085 tmp[3] += t4[3] * t4[3];
14087 tmp[0] += t5[0] * t5[0];
14088 tmp[1] += t5[1] * t5[1];
14089 tmp[2] += t5[2] * t5[2];
14090 tmp[3] += t5[3] * t5[3];
14097 *(unsigned long *)&tmp[0] ^= signBit[0];
14098 *(unsigned long *)&tmp[1] ^= signBit[1];
14099 *(unsigned long *)&tmp[2] ^= signBit[2];
14100 *(unsigned long *)&tmp[3] ^= signBit[3];
14119 for ( int j = 0; j < 4; j++ ) {
14121 const int v0 = indexes[i + j * 3 + 0];
14122 const int v1 = indexes[i + j * 3 + 1];
14123 const int v2 = indexes[i + j * 3 + 2];
14221 for ( ; i < numIndexes; i += 3 ) {
14223 ALIGN16( unsigned long signBit[4] );
14224 float d0, d1, d2, d3, d4;
14225 float d5, d6, d7, d8, d9;
14230 const int v0 = indexes[i + 0];
14231 const int v1 = indexes[i + 1];
14232 const int v2 = indexes[i + 2];
14238 d0 = b->xyz[0] - a->xyz[0];
14239 d1 = b->xyz[1] - a->xyz[1];
14240 d2 = b->xyz[2] - a->xyz[2];
14241 d3 = b->st[0] - a->st[0];
14242 d4 = b->st[1] - a->st[1];
14244 d5 = c->xyz[0] - a->xyz[0];
14245 d6 = c->xyz[1] - a->xyz[1];
14246 d7 = c->xyz[2] - a->xyz[2];
14247 d8 = c->st[0] - a->st[0];
14248 d9 = c->st[1] - a->st[1];
14283 #ifdef FIX_DEGENERATE_TANGENT
14286 andps xmm4, SIMD_SP_tiny
14287 andps xmm3, SIMD_SP_absMask
14291 #ifdef REFINE_TANGENT_SQUAREROOT
14314 andps xmm0, SIMD_SP_signBitMask
14315 movaps signBit, xmm0
14347 #ifdef FIX_DEGENERATE_TANGENT
14350 andps xmm4, SIMD_SP_tiny
14351 andps xmm3, SIMD_SP_absMask
14355 #ifdef REFINE_TANGENT_SQUAREROOT
14365 xorps xmm3, signBit
14404 #ifdef FIX_DEGENERATE_TANGENT
14407 andps xmm4, SIMD_SP_tiny
14408 andps xmm3, SIMD_SP_absMask
14412 #ifdef REFINE_TANGENT_SQUAREROOT
14422 xorps xmm3, signBit
14437 n0 = d6 * d2 - d7 * d1;
14438 n1 = d7 * d0 - d5 * d2;
14439 n2 = d5 * d1 - d6 * d0;
14448 tmp = d3 * d9 - d4 * d8;
14449 signBit[0] = ( *(unsigned long *)&tmp ) & ( 1 << 31 );
14452 t0 = d0 * d9 - d4 * d5;
14453 t1 = d1 * d9 - d4 * d6;
14454 t2 = d2 * d9 - d4 * d7;
14457 *(unsigned long *)&tmp ^= signBit[0];
14464 t3 = d3 * d5 - d0 * d8;
14465 t4 = d3 * d6 - d1 * d8;
14466 t5 = d3 * d7 - d2 * d8;
14469 *(unsigned long *)&tmp ^= signBit[0];
14477 planes->Normal()[0] = n0;
14478 planes->Normal()[1] = n1;
14479 planes->Normal()[2] = n2;
14574 #define DERIVE_UNSMOOTHED_BITANGENT
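// with DERIVE_UNSMOOTHED_BITANGENT defined, the bitangent is derived from the cross product of the normal and tangent instead of directly from the texture-space deltas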
14579 for ( i = 0; i <= numVerts - 4; i += 4 ) {
14580 ALIGN16( float s0[4] );
14581 ALIGN16( float s1[4] );
14582 ALIGN16( float s2[4] );
14583 ALIGN16( float d0[4] );
14584 ALIGN16( float d1[4] );
14585 ALIGN16( float d2[4] );
14586 ALIGN16( float d3[4] );
14587 ALIGN16( float d4[4] );
14588 ALIGN16( float d5[4] );
14589 ALIGN16( float d6[4] );
14590 ALIGN16( float d7[4] );
14591 ALIGN16( float d8[4] );
14592 ALIGN16( float d9[4] );
14593 ALIGN16( float n0[4] );
14594 ALIGN16( float n1[4] );
14595 ALIGN16( float n2[4] );
14596 ALIGN16( float t0[4] );
14597 ALIGN16( float t1[4] );
14598 ALIGN16( float t2[4] );
14599 ALIGN16( float t3[4] );
14600 ALIGN16( float t4[4] );
14601 ALIGN16( float t5[4] );
14603 for ( j = 0; j < 4; j++ ) {
14616 d0[j] = b->xyz[0] - a->xyz[0];
14617 d1[j] = b->xyz[1] - a->xyz[1];
14618 d2[j] = b->xyz[2] - a->xyz[2];
14619 d3[j] = b->st[0] - a->st[0];
14620 d4[j] = b->st[1] - a->st[1];
14622 d5[j] = c->xyz[0] - a->xyz[0];
14623 d6[j] = c->xyz[1] - a->xyz[1];
14624 d7[j] = c->xyz[2] - a->xyz[2];
14625 d8[j] = c->st[0] - a->st[0];
14626 d9[j] = c->st[1] - a->st[1];
14687 #ifndef DERIVE_UNSMOOTHED_BITANGENT
14733 n0[0] = d6[0] * d2[0];
14734 n0[1] = d6[1] * d2[1];
14735 n0[2] = d6[2] * d2[2];
14736 n0[3] = d6[3] * d2[3];
14738 n1[0] = d7[0] * d0[0];
14739 n1[1] = d7[1] * d0[1];
14740 n1[2] = d7[2] * d0[2];
14741 n1[3] = d7[3] * d0[3];
14743 n2[0] = d5[0] * d1[0];
14744 n2[1] = d5[1] * d1[1];
14745 n2[2] = d5[2] * d1[2];
14746 n2[3] = d5[3] * d1[3];
14748 n0[0] -= d7[0] * d1[0];
14749 n0[1] -= d7[1] * d1[1];
14750 n0[2] -= d7[2] * d1[2];
14751 n0[3] -= d7[3] * d1[3];
14753 n1[0] -= d5[0] * d2[0];
14754 n1[1] -= d5[1] * d2[1];
14755 n1[2] -= d5[2] * d2[2];
14756 n1[3] -= d5[3] * d2[3];
14758 n2[0] -= d6[0] * d0[0];
14759 n2[1] -= d6[1] * d0[1];
14760 n2[2] -= d6[2] * d0[2];
14761 n2[3] -= d6[3] * d0[3];
14778 t0[0] = d0[0] * d9[0];
14779 t0[1] = d0[1] * d9[1];
14780 t0[2] = d0[2] * d9[2];
14781 t0[3] = d0[3] * d9[3];
14783 t1[0] = d1[0] * d9[0];
14784 t1[1] = d1[1] * d9[1];
14785 t1[2] = d1[2] * d9[2];
14786 t1[3] = d1[3] * d9[3];
14788 t2[0] = d2[0] * d9[0];
14789 t2[1] = d2[1] * d9[1];
14790 t2[2] = d2[2] * d9[2];
14791 t2[3] = d2[3] * d9[3];
14793 t0[0] -= d4[0] * d5[0];
14794 t0[1] -= d4[1] * d5[1];
14795 t0[2] -= d4[2] * d5[2];
14796 t0[3] -= d4[3] * d5[3];
14798 t1[0] -= d4[0] * d6[0];
14799 t1[1] -= d4[1] * d6[1];
14800 t1[2] -= d4[2] * d6[2];
14801 t1[3] -= d4[3] * d6[3];
14803 t2[0] -= d4[0] * d7[0];
14804 t2[1] -= d4[1] * d7[1];
14805 t2[2] -= d4[2] * d7[2];
14806 t2[3] -= d4[3] * d7[3];
14823 #ifndef DERIVE_UNSMOOTHED_BITANGENT
14824 t3[0] = d3[0] * d5[0];
14825 t3[1] = d3[1] * d5[1];
14826 t3[2] = d3[2] * d5[2];
14827 t3[3] = d3[3] * d5[3];
14829 t4[0] = d3[0] * d6[0];
14830 t4[1] = d3[1] * d6[1];
14831 t4[2] = d3[2] * d6[2];
14832 t4[3] = d3[3] * d6[3];
14834 t5[0] = d3[0] * d7[0];
14835 t5[1] = d3[1] * d7[1];
14836 t5[2] = d3[2] * d7[2];
14837 t5[3] = d3[3] * d7[3];
14839 t3[0] -= d0[0] * d8[0];
14840 t3[1] -= d0[1] * d8[1];
14841 t3[2] -= d0[2] * d8[2];
14842 t3[3] -= d0[3] * d8[3];
14844 t4[0] -= d1[0] * d8[0];
14845 t4[1] -= d1[1] * d8[1];
14846 t4[2] -= d1[2] * d8[2];
14847 t4[3] -= d1[3] * d8[3];
14849 t5[0] -= d2[0] * d8[0];
14850 t5[1] -= d2[1] * d8[1];
14851 t5[2] -= d2[2] * d8[2];
14852 t5[3] -= d2[3] * d8[3];
14854 t3[0] = n2[0] * t1[0];
14855 t3[1] = n2[1] * t1[1];
14856 t3[2] = n2[2] * t1[2];
14857 t3[3] = n2[3] * t1[3];
14859 t4[0] = n0[0] * t2[0];
14860 t4[1] = n0[1] * t2[1];
14861 t4[2] = n0[2] * t2[2];
14862 t4[3] = n0[3] * t2[3];
14864 t5[0] = n1[0] * t0[0];
14865 t5[1] = n1[1] * t0[1];
14866 t5[2] = n1[2] * t0[2];
14867 t5[3] = n1[3] * t0[3];
14869 t3[0] -= n1[0] * t2[0];
14870 t3[1] -= n1[1] * t2[1];
14871 t3[2] -= n1[2] * t2[2];
14872 t3[3] -= n1[3] * t2[3];
14874 t4[0] -= n2[0] * t0[0];
14875 t4[1] -= n2[1] * t0[1];
14876 t4[2] -= n2[2] * t0[2];
14877 t4[3] -= n2[3] * t0[3];
14879 t5[0] -= n0[0] * t1[0];
14880 t5[1] -= n0[1] * t1[1];
14881 t5[2] -= n0[2] * t1[2];
14882 t5[3] -= n0[3] * t1[3];
14901 for ( j = 0; j < 4; j++ ) {
14920 for ( ; i < numVerts; i++ ) {
14922 float d0, d1, d2, d3, d4;
14923 float d5, d6, d7, d8, d9;
14939 d0 = b->xyz[0] - a->xyz[0];
14940 d1 = b->xyz[1] - a->xyz[1];
14941 d2 = b->xyz[2] - a->xyz[2];
14942 d3 = b->st[0] - a->st[0];
14943 d4 = b->st[1] - a->st[1];
14945 d5 = c->xyz[0] - a->xyz[0];
14946 d6 = c->xyz[1] - a->xyz[1];
14947 d7 = c->xyz[2] - a->xyz[2];
14948 d8 = c->st[0] - a->st[0];
14949 d9 = c->st[1] - a->st[1];
15009 #ifndef DERIVE_UNSMOOTHED_BITANGENT
15055 n0 = s2 * ( d6 * d2 - d7 * d1 );
15056 n1 = s2 * ( d7 * d0 - d5 * d2 );
15057 n2 = s2 * ( d5 * d1 - d6 * d0 );
15059 t0 = s0 * ( d0 * d9 - d4 * d5 );
15060 t1 = s0 * ( d1 * d9 - d4 * d6 );
15061 t2 = s0 * ( d2 * d9 - d4 * d7 );
15063 #ifndef DERIVE_UNSMOOTHED_BITANGENT
15064 t3 = s1 * ( d3 * d5 - d0 * d8 );
15065 t4 = s1 * ( d3 * d6 - d1 * d8 );
15066 t5 = s1 * ( d3 * d7 - d2 * d8 );
15068 t3 = s1 * ( n2 * t1 - n1 * t2 );
15069 t4 = s1 * ( n0 * t2 - n2 * t0 );
15070 t5 = s1 * ( n1 * t0 - n0 * t1 );
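// normalize the vertex normals and tangents, projecting each tangent onto the plane orthogonal to the normal; the SSE path handles four vertices per iteration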
15095 ALIGN16( float normal[12] );
15099 assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
15100 assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
15103 assert( numVerts >= 0 );
15109 #ifdef REFINE_TANGENT_SQUAREROOT
15110 movaps xmm6, SIMD_SP_rsqrt_c0
15111 movaps xmm7, SIMD_SP_rsqrt_c1
15114 imul eax, DRAWVERT_SIZE
15117 add eax, DRAWVERT_SIZE*4
15120 sub eax, DRAWVERT_SIZE*4
15125 sub eax, DRAWVERT_SIZE*4
15129 movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0]
15130 movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0]
15131 movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8]
15132 movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4]
15133 movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0]
15134 movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0]
15135 movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8]
15136 movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4]
15140 shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
15141 shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )
15142 shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 )
15143 shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 )
15144 shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 )
15156 #ifdef REFINE_TANGENT_SQUAREROOT
15173 movaps [normal+ 0], xmm0
15174 movaps [normal+16], xmm1
15175 movaps [normal+32], xmm2
15177 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0], xmm0
15178 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4], xmm1
15179 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+8], xmm2
15181 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15182 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
15183 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
15185 movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0], xmm0
15186 movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+4], xmm1
15187 movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8], xmm2
15189 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15190 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
15191 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
15193 movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0], xmm0
15194 movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4], xmm1
15195 movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+8], xmm2
15197 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15198 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
15199 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
15201 movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0], xmm0
15202 movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+4], xmm1
15203 movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8], xmm2
15207 movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0]
15208 movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0]
15209 movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8]
15210 movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4]
15211 movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0]
15212 movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0]
15213 movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8]
15214 movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4]
15218 shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
15219 shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )
15220 shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 )
15221 shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 )
15222 shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 )
15228 mulps xmm3, [normal+ 0]
15229 mulps xmm4, [normal+16]
15230 mulps xmm5, [normal+32]
15236 mulps xmm3, [normal+ 0]
15237 mulps xmm4, [normal+16]
15238 mulps xmm5, [normal+32]
15253 #ifdef REFINE_TANGENT_SQUAREROOT
15268 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0], xmm0
15269 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4], xmm1
15270 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+8], xmm2
15272 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15273 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
15274 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
15276 movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0], xmm0
15277 movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+4], xmm1
15278 movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8], xmm2
15280 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15281 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
15282 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
15284 movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0], xmm0
15285 movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4], xmm1
15286 movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+8], xmm2
15288 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15289 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
15290 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
15292 movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0], xmm0
15293 movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+4], xmm1
15294 movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8], xmm2
15298 movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0]
15299 movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0]
15300 movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8]
15301 movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4]
15302 movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0]
15303 movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0]
15304 movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8]
15305 movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4]
15309 shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
15310 shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )
15311 shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 )
15312 shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 )
15313 shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 )
15319 mulps xmm3, [normal+ 0]
15320 mulps xmm4, [normal+16]
15321 mulps xmm5, [normal+32]
15327 mulps xmm3, [normal+ 0]
15328 mulps xmm4, [normal+16]
15329 mulps xmm5, [normal+32]
15344 #ifdef REFINE_TANGENT_SQUAREROOT
15359 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0], xmm0
15360 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4], xmm1
15361 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+8], xmm2
15363 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15364 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
15365 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
15367 movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0], xmm0
15368 movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+4], xmm1
15369 movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8], xmm2
15371 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15372 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
15373 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
15375 movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0], xmm0
15376 movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4], xmm1
15377 movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+8], xmm2
15379 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15380 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
15381 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
15383 movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0], xmm0
15384 movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+4], xmm1
15385 movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8], xmm2
15387 add eax, DRAWVERT_SIZE*8
15391 sub eax, DRAWVERT_SIZE*4
15398 movss xmm0, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
15399 movss xmm1, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
15400 movss xmm2, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
15411 #ifdef REFINE_TANGENT_SQUAREROOT
15426 movss [esi+eax+DRAWVERT_NORMAL_OFFSET+0], xmm0
15427 movss [esi+eax+DRAWVERT_NORMAL_OFFSET+4], xmm1
15428 movss [esi+eax+DRAWVERT_NORMAL_OFFSET+8], xmm2
15432 movss xmm0, [esi+eax+DRAWVERT_TANGENT0_OFFSET+0]
15433 movss xmm1, [esi+eax+DRAWVERT_TANGENT0_OFFSET+4]
15434 movss xmm2, [esi+eax+DRAWVERT_TANGENT0_OFFSET+8]
15439 mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
15440 mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
15441 mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
15447 mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
15448 mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
15449 mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
15464 #ifdef REFINE_TANGENT_SQUAREROOT
15479 movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+0], xmm0
15480 movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+4], xmm1
15481 movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+8], xmm2
15485 movss xmm0, [esi+eax+DRAWVERT_TANGENT1_OFFSET+0]
15486 movss xmm1, [esi+eax+DRAWVERT_TANGENT1_OFFSET+4]
15487 movss xmm2, [esi+eax+DRAWVERT_TANGENT1_OFFSET+8]
15492 mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
15493 mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
15494 mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
15500 mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
15501 mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
15502 mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
15517 #ifdef REFINE_TANGENT_SQUAREROOT
15532 movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+0], xmm0
15533 movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+4], xmm1
15534 movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+8], xmm2
15536 add eax, DRAWVERT_SIZE
15553 assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
15554 assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
15556 bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
15557 memset( used, 0, numVerts * sizeof( used[0] ) );
15559 for ( int i = numIndexes - 1; i >= 0; i-- ) {
15560 used[indexes[i]] = true;
15573 sub edi, DRAWVERT_SIZE
15578 mov ecx, lightOrigin
15579 movss xmm7, [ecx+0]
15580 movhps xmm7, [ecx+4]
15582 mov ecx, lightVectors
15589 add edi, DRAWVERT_SIZE
15592 cmp byte ptr [esi+eax], 0
15596 movss xmm1, [edi+DRAWVERT_XYZ_OFFSET+0]
15597 movhps xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
15604 movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
15605 movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
15608 movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
15609 movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
15613 unpcklps xmm5, xmm3
15614 unpckhps xmm2, xmm3
15616 movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
15617 movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
15622 shufps xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 )
15626 movlps [ecx+0], xmm5
15627 shufps xmm5, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
15628 movss [ecx+8], xmm5
15637 for ( int i = 0; i < numVerts; i++ ) {
15645 lightDir[0] = lightOrigin[0] - v->xyz[0];
15646 lightDir[1] = lightOrigin[1] - v->xyz[1];
15647 lightDir[2] = lightOrigin[2] - v->xyz[2];
15649 lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
15650 lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
15651 lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
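// Editor's note: the three dot products above express the light direction in the vertex's
// tangent space; together they are a 3x3 matrix-vector product whose rows are tangents[0],
// tangents[1] and normal. Compact sketch (assuming idVec3's operator* is the dot product,
// as elsewhere in idLib):
static inline void WorldDirToTangentSpace( const idDrawVert &v, const idVec3 &dir, idVec3 &out ) {
	out[0] = dir * v.tangents[0];	// component along the first tangent
	out[1] = dir * v.tangents[1];	// component along the second tangent
	out[2] = dir * v.normal;		// component along the normal
}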
15656 ALIGN16( int usedVertNums[4] );
15657 ALIGN16( float lightDir0[4] );
15658 ALIGN16( float lightDir1[4] );
15659 ALIGN16( float lightDir2[4] );
15660 ALIGN16( float normal0[4] );
15661 ALIGN16( float normal1[4] );
15662 ALIGN16( float normal2[4] );
15663 ALIGN16( float tangent0[4] );
15664 ALIGN16( float tangent1[4] );
15665 ALIGN16( float tangent2[4] );
15666 ALIGN16( float tangent3[4] );
15667 ALIGN16( float tangent4[4] );
15668 ALIGN16( float tangent5[4] );
15669 idVec3 localLightOrigin = lightOrigin;
15680 sub edi, DRAWVERT_SIZE
15689 add edi, DRAWVERT_SIZE
15691 cmp byte ptr [esi+eax], 0
15694 mov usedVertNums[ecx*4], eax
15699 movss xmm0, localLightOrigin[0]
15700 movss xmm1, localLightOrigin[4]
15701 movss xmm2, localLightOrigin[8]
15703 subss xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
15704 subss xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
15705 subss xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
15707 movss lightDir0[ecx*4-4], xmm0
15708 movss lightDir1[ecx*4-4], xmm1
15709 movss lightDir2[ecx*4-4], xmm2
15711 movss xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
15712 movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
15713 movss xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
15715 movss normal0[ecx*4-4], xmm3
15716 movss normal1[ecx*4-4], xmm4
15717 movss normal2[ecx*4-4], xmm5
15719 movss xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
15720 movss xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
15721 movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
15723 movss tangent0[ecx*4-4], xmm0
15724 movss tangent1[ecx*4-4], xmm1
15725 movss tangent2[ecx*4-4], xmm2
15727 movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
15728 movss xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
15729 movss xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
15731 movss tangent3[ecx*4-4], xmm3
15732 movss tangent4[ecx*4-4], xmm4
15733 movss tangent5[ecx*4-4], xmm5
15737 movaps xmm0, lightDir0
15738 movaps xmm1, lightDir1
15739 movaps xmm2, lightDir2
15741 movaps xmm3, tangent0
15743 movaps xmm4, tangent1
15745 movaps xmm5, tangent2
15751 movaps xmm3, tangent3
15753 movaps xmm4, tangent4
15755 movaps xmm6, tangent5
15761 mulps xmm0, normal0
15762 mulps xmm1, normal1
15763 mulps xmm2, normal2
15770 mov edx, usedVertNums[0]
15771 add ecx, lightVectors
15774 movss [ecx+edx+0], xmm5
15775 movss [ecx+edx+4], xmm6
15776 movss [ecx+edx+8], xmm0
15778 shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
15779 mov edx, usedVertNums[4]
15780 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
15782 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15784 movss [ecx+edx+0], xmm5
15785 movss [ecx+edx+4], xmm6
15786 movss [ecx+edx+8], xmm0
15788 shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
15789 mov edx, usedVertNums[8]
15790 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
15792 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15794 movss [ecx+edx+0], xmm5
15795 movss [ecx+edx+4], xmm6
15796 movss [ecx+edx+8], xmm0
15798 shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
15799 mov edx, usedVertNums[12]
15800 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
15802 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15804 movss [ecx+edx+0], xmm5
15805 movss [ecx+edx+4], xmm6
15806 movss [ecx+edx+8], xmm0
15817 add edi, lightVectors
15820 movss xmm0, lightDir0[eax*4]
15821 movss xmm1, lightDir1[eax*4]
15822 movss xmm2, lightDir2[eax*4]
15824 mov edx, usedVertNums[eax*4]
15827 movss xmm3, tangent0[eax*4]
15829 movss xmm4, tangent1[eax*4]
15831 movss xmm5, tangent2[eax*4]
15836 movss [edi+edx+0], xmm5
15838 movss xmm3, tangent3[eax*4]
15840 movss xmm4, tangent4[eax*4]
15842 movss xmm6, tangent5[eax*4]
15847 movss [edi+edx+4], xmm6
15849 mulss xmm0, normal0[eax*4]
15850 mulss xmm1, normal1[eax*4]
15851 mulss xmm2, normal2[eax*4]
15855 movss [edi+edx+8], xmm0
15866 ALIGN16( float lightVectors0[4] );
15867 ALIGN16( float lightVectors1[4] );
15868 ALIGN16( float lightVectors2[4] );
15869 int numUsedVerts = 0;
15871 for ( int i = 0; i < numVerts; i++ ) {
15878 lightDir0[numUsedVerts] = lightOrigin[0] - v->xyz[0];
15879 lightDir1[numUsedVerts] = lightOrigin[1] - v->xyz[1];
15880 lightDir2[numUsedVerts] = lightOrigin[2] - v->xyz[2];
15882 normal0[numUsedVerts] = v->normal[0];
15883 normal1[numUsedVerts] = v->normal[1];
15884 normal2[numUsedVerts] = v->normal[2];
15886 tangent0[numUsedVerts] = v->tangents[0][0];
15887 tangent1[numUsedVerts] = v->tangents[0][1];
15888 tangent2[numUsedVerts] = v->tangents[0][2];
15890 tangent3[numUsedVerts] = v->tangents[1][0];
15891 tangent4[numUsedVerts] = v->tangents[1][1];
15892 tangent5[numUsedVerts] = v->tangents[1][2];
15894 usedVertNums[numUsedVerts++] = i;
15895 if ( numUsedVerts < 4 ) {
15899 lightVectors0[0] = lightDir0[0] * tangent0[0];
15900 lightVectors0[1] = lightDir0[1] * tangent0[1];
15901 lightVectors0[2] = lightDir0[2] * tangent0[2];
15902 lightVectors0[3] = lightDir0[3] * tangent0[3];
15904 lightVectors0[0] += lightDir1[0] * tangent1[0];
15905 lightVectors0[1] += lightDir1[1] * tangent1[1];
15906 lightVectors0[2] += lightDir1[2] * tangent1[2];
15907 lightVectors0[3] += lightDir1[3] * tangent1[3];
15909 lightVectors0[0] += lightDir2[0] * tangent2[0];
15910 lightVectors0[1] += lightDir2[1] * tangent2[1];
15911 lightVectors0[2] += lightDir2[2] * tangent2[2];
15912 lightVectors0[3] += lightDir2[3] * tangent2[3];
15914 lightVectors1[0] = lightDir0[0] * tangent3[0];
15915 lightVectors1[1] = lightDir0[1] * tangent3[1];
15916 lightVectors1[2] = lightDir0[2] * tangent3[2];
15917 lightVectors1[3] = lightDir0[3] * tangent3[3];
15919 lightVectors1[0] += lightDir1[0] * tangent4[0];
15920 lightVectors1[1] += lightDir1[1] * tangent4[1];
15921 lightVectors1[2] += lightDir1[2] * tangent4[2];
15922 lightVectors1[3] += lightDir1[3] * tangent4[3];
15924 lightVectors1[0] += lightDir2[0] * tangent5[0];
15925 lightVectors1[1] += lightDir2[1] * tangent5[1];
15926 lightVectors1[2] += lightDir2[2] * tangent5[2];
15927 lightVectors1[3] += lightDir2[3] * tangent5[3];
15929 lightVectors2[0] = lightDir0[0] * normal0[0];
15930 lightVectors2[1] = lightDir0[1] * normal0[1];
15931 lightVectors2[2] = lightDir0[2] * normal0[2];
15932 lightVectors2[3] = lightDir0[3] * normal0[3];
15934 lightVectors2[0] += lightDir1[0] * normal1[0];
15935 lightVectors2[1] += lightDir1[1] * normal1[1];
15936 lightVectors2[2] += lightDir1[2] * normal1[2];
15937 lightVectors2[3] += lightDir1[3] * normal1[3];
15939 lightVectors2[0] += lightDir2[0] * normal2[0];
15940 lightVectors2[1] += lightDir2[1] * normal2[1];
15941 lightVectors2[2] += lightDir2[2] * normal2[2];
15942 lightVectors2[3] += lightDir2[3] * normal2[3];
15945 for ( int j = 0; j < 4; j++ ) {
15946 int n = usedVertNums[j];
15948 lightVectors[n][0] = lightVectors0[j];
15949 lightVectors[n][1] = lightVectors1[j];
15950 lightVectors[n][2] = lightVectors2[j];
15956 for ( int i = 0; i < numUsedVerts; i++ ) {
15958 lightVectors0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
15959 lightVectors1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
15960 lightVectors2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];
15962 int n = usedVertNums[i];
15963 lightVectors[n][0] = lightVectors0[i];
15964 lightVectors[n][1] = lightVectors1[i];
15965 lightVectors[n][2] = lightVectors2[i];
15981 assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
15982 assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
15984 bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
15985 memset( used, 0, numVerts * sizeof( used[0] ) );
15987 for ( int i = numIndexes - 1; i >= 0; i-- ) {
15988 used[indexes[i]] = true;
16001 sub edi, DRAWVERT_SIZE
16006 mov ecx, viewOrigin
16007 movss xmm6, [ecx+0]
16008 movhps xmm6, [ecx+4]
16010 mov ecx, lightOrigin
16011 movss xmm7, [ecx+0]
16012 movhps xmm7, [ecx+4]
16021 add edi, DRAWVERT_SIZE
16024 cmp byte ptr [esi+eax], 0
16029 movss xmm2, [edi+DRAWVERT_XYZ_OFFSET+0]
16030 movhps xmm2, [edi+DRAWVERT_XYZ_OFFSET+4]
16043 unpcklps xmm5, xmm4
16044 unpckhps xmm3, xmm4
16049 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
16053 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
16054 shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 1, 1 )
16060 movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
16061 movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
16064 movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
16065 movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
16068 movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
16069 movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
16073 unpcklps xmm5, xmm3
16074 unpckhps xmm2, xmm3
16078 shufps xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 )
16080 movaps xmm3, SIMD_SP_one
16084 movaps [ecx+0], xmm5
16085 movss [ecx+12], xmm3
16094 for ( int i = 0; i < numVerts; i++ ) {
16101 idVec3 lightDir = lightOrigin - v->xyz;
16106 ilength = idMath::RSqrt( lightDir[0] * lightDir[0] + lightDir[1] * lightDir[1] + lightDir[2] * lightDir[2] );
16107 lightDir[0] *= ilength;
16108 lightDir[1] *= ilength;
16109 lightDir[2] *= ilength;
16111 ilength = idMath::RSqrt( viewDir[0] * viewDir[0] + viewDir[1] * viewDir[1] + viewDir[2] * viewDir[2] );
16112 viewDir[0] *= ilength;
16113 viewDir[1] *= ilength;
16114 viewDir[2] *= ilength;
16116 lightDir += viewDir;
16120 texCoords[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
16121 texCoords[i][3] = 1.0f;
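// Editor's note: the loop above builds the specular half-angle vector: it normalizes the
// light and view directions, sums them, and projects the result onto the tangent frame,
// storing 1.0 in w for the vertex program. In compact form (illustrative sketch, assuming
// idVec3::LengthSqr and idMath::RSqrt as used elsewhere in idLib):
//	idVec3 halfAngle = lightDir * idMath::RSqrt( lightDir.LengthSqr() )
//	                 + viewDir  * idMath::RSqrt( viewDir.LengthSqr() );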
16127 ALIGN16( int usedVertNums[4] );
16128 ALIGN16( float lightDir0[4] );
16129 ALIGN16( float lightDir1[4] );
16130 ALIGN16( float lightDir2[4] );
16131 ALIGN16( float viewDir0[4] );
16132 ALIGN16( float viewDir1[4] );
16133 ALIGN16( float viewDir2[4] );
16134 ALIGN16( float normal0[4] );
16135 ALIGN16( float normal1[4] );
16136 ALIGN16( float normal2[4] );
16137 ALIGN16( float tangent0[4] );
16138 ALIGN16( float tangent1[4] );
16139 ALIGN16( float tangent2[4] );
16140 ALIGN16( float tangent3[4] );
16141 ALIGN16( float tangent4[4] );
16142 ALIGN16( float tangent5[4] );
16143 idVec3 localLightOrigin = lightOrigin;
16144 idVec3 localViewOrigin = viewOrigin;
16155 sub edi, DRAWVERT_SIZE
16164 add edi, DRAWVERT_SIZE
16166 cmp byte ptr [esi+eax], 0
16169 mov usedVertNums[ecx*4], eax
16174 movss xmm3, localLightOrigin[0]
16175 movss xmm4, localLightOrigin[4]
16176 movss xmm5, localLightOrigin[8]
16178 subss xmm3, [edi+DRAWVERT_XYZ_OFFSET+0]
16179 subss xmm4, [edi+DRAWVERT_XYZ_OFFSET+4]
16180 subss xmm5, [edi+DRAWVERT_XYZ_OFFSET+8]
16182 movss lightDir0[ecx*4-4], xmm3
16183 movss lightDir1[ecx*4-4], xmm4
16184 movss lightDir2[ecx*4-4], xmm5
16186 movss xmm0, localViewOrigin[0]
16187 movss xmm1, localViewOrigin[4]
16188 movss xmm2, localViewOrigin[8]
16190 subss xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
16191 subss xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
16192 subss xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
16194 movss viewDir0[ecx*4-4], xmm0
16195 movss viewDir1[ecx*4-4], xmm1
16196 movss viewDir2[ecx*4-4], xmm2
16198 movss xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
16199 movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
16200 movss xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
16202 movss normal0[ecx*4-4], xmm3
16203 movss normal1[ecx*4-4], xmm4
16204 movss normal2[ecx*4-4], xmm5
16206 movss xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
16207 movss xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
16208 movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
16210 movss tangent0[ecx*4-4], xmm0
16211 movss tangent1[ecx*4-4], xmm1
16212 movss tangent2[ecx*4-4], xmm2
16214 movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
16215 movss xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
16216 movss xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
16218 movss tangent3[ecx*4-4], xmm3
16219 movss tangent4[ecx*4-4], xmm4
16220 movss tangent5[ecx*4-4], xmm5
16224 movaps xmm6, lightDir0
16227 movaps xmm7, lightDir1
16231 movaps xmm5, lightDir2
16241 movaps xmm3, viewDir0
16244 movaps xmm4, viewDir1
16248 movaps xmm5, viewDir2
16261 movaps xmm3, tangent0
16263 movaps xmm4, tangent1
16266 movaps xmm5, tangent2
16270 movaps xmm3, tangent3
16272 movaps xmm4, tangent4
16275 movaps xmm6, tangent5
16279 mulps xmm0, normal0
16280 mulps xmm1, normal1
16282 mulps xmm2, normal2
16287 mov edx, usedVertNums[0]
16290 movss xmm3, SIMD_SP_one
16292 movss [ecx+edx+0], xmm5
16293 movss [ecx+edx+4], xmm6
16294 movss [ecx+edx+8], xmm0
16295 movss [ecx+edx+12], xmm3
16297 shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
16298 mov edx, usedVertNums[4]
16299 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
16301 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
16303 movss [ecx+edx+0], xmm5
16304 movss [ecx+edx+4], xmm6
16305 movss [ecx+edx+8], xmm0
16306 movss [ecx+edx+12], xmm3
16308 shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
16309 mov edx, usedVertNums[8]
16310 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
16312 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
16314 movss [ecx+edx+0], xmm5
16315 movss [ecx+edx+4], xmm6
16316 movss [ecx+edx+8], xmm0
16317 movss [ecx+edx+12], xmm3
16319 shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
16320 mov edx, usedVertNums[12]
16321 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
16323 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
16325 movss [ecx+edx+0], xmm5
16326 movss [ecx+edx+4], xmm6
16327 movss [ecx+edx+8], xmm0
16328 movss [ecx+edx+12], xmm3
16342 movss xmm6, lightDir0[eax*4]
16345 movss xmm7, lightDir1[eax*4]
16349 movss xmm5, lightDir2[eax*4]
16359 movss xmm3, viewDir0[eax*4]
16362 movss xmm4, viewDir1[eax*4]
16366 movss xmm5, viewDir2[eax*4]
16379 mov edx, usedVertNums[eax*4]
16382 movss xmm3, tangent0[eax*4]
16384 movss xmm4, tangent1[eax*4]
16387 movss xmm5, tangent2[eax*4]
16390 movss [edi+edx+0], xmm5
16392 movss xmm3, tangent3[eax*4]
16394 movss xmm4, tangent4[eax*4]
16397 movss xmm6, tangent5[eax*4]
16400 movss [edi+edx+4], xmm6
16402 mulss xmm0, normal0[eax*4]
16403 mulss xmm1, normal1[eax*4]
16405 mulss xmm2, normal2[eax*4]
16407 movss [edi+edx+8], xmm0
16409 movss xmm3, SIMD_SP_one
16410 movss [edi+edx+12], xmm3
16421 ALIGN16( int usedVertNums[4] );
16422 ALIGN16( float lightDir0[4] );
16423 ALIGN16( float lightDir1[4] );
16424 ALIGN16( float lightDir2[4] );
16425 ALIGN16( float viewDir0[4] );
16426 ALIGN16( float viewDir1[4] );
16427 ALIGN16( float viewDir2[4] );
16428 ALIGN16( float normal0[4] );
16429 ALIGN16( float normal1[4] );
16430 ALIGN16( float normal2[4] );
16431 ALIGN16( float tangent0[4] );
16432 ALIGN16( float tangent1[4] );
16433 ALIGN16( float tangent2[4] );
16434 ALIGN16( float tangent3[4] );
16435 ALIGN16( float tangent4[4] );
16436 ALIGN16( float tangent5[4] );
16437 ALIGN16( float texCoords0[4] );
16438 ALIGN16( float texCoords1[4] );
16439 ALIGN16( float texCoords2[4] );
16440 idVec3 localLightOrigin = lightOrigin;
16441 idVec3 localViewOrigin = viewOrigin;
16442 int numUsedVerts = 0;
16444 for ( int i = 0; i < numVerts; i++ ) {
16451 lightDir0[numUsedVerts] = localLightOrigin[0] - v->xyz[0];
16452 lightDir1[numUsedVerts] = localLightOrigin[1] - v->xyz[1];
16453 lightDir2[numUsedVerts] = localLightOrigin[2] - v->xyz[2];
16455 viewDir0[numUsedVerts] = localViewOrigin[0] - v->xyz[0];
16456 viewDir1[numUsedVerts] = localViewOrigin[1] - v->xyz[1];
16457 viewDir2[numUsedVerts] = localViewOrigin[2] - v->xyz[2];
16459 normal0[numUsedVerts] = v->normal[0];
16460 normal1[numUsedVerts] = v->normal[1];
16461 normal2[numUsedVerts] = v->normal[2];
16463 tangent0[numUsedVerts] = v->tangents[0][0];
16464 tangent1[numUsedVerts] = v->tangents[0][1];
16465 tangent2[numUsedVerts] = v->tangents[0][2];
16467 tangent3[numUsedVerts] = v->tangents[1][0];
16468 tangent4[numUsedVerts] = v->tangents[1][1];
16469 tangent5[numUsedVerts] = v->tangents[1][2];
16471 usedVertNums[numUsedVerts++] = i;
16472 if ( numUsedVerts < 4 ) {
16476 ALIGN16( float temp[4] );
16478 temp[0] = lightDir0[0] * lightDir0[0];
16479 temp[1] = lightDir0[1] * lightDir0[1];
16480 temp[2] = lightDir0[2] * lightDir0[2];
16481 temp[3] = lightDir0[3] * lightDir0[3];
16483 temp[0] += lightDir1[0] * lightDir1[0];
16484 temp[1] += lightDir1[1] * lightDir1[1];
16485 temp[2] += lightDir1[2] * lightDir1[2];
16486 temp[3] += lightDir1[3] * lightDir1[3];
16488 temp[0] += lightDir2[0] * lightDir2[0];
16489 temp[1] += lightDir2[1] * lightDir2[1];
16490 temp[2] += lightDir2[2] * lightDir2[2];
16491 temp[3] += lightDir2[3] * lightDir2[3];
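// Editor's note: lines omitted from this listing appear to convert temp[0..3] into
// reciprocal square roots (e.g. temp[j] = idMath::RSqrt( temp[j] )) before the multiplies
// below normalize lightDir. This is inferred from context, not shown in the source here.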
16498 lightDir0[0] *= temp[0];
16499 lightDir0[1] *= temp[1];
16500 lightDir0[2] *= temp[2];
16501 lightDir0[3] *= temp[3];
16503 lightDir1[0] *= temp[0];
16504 lightDir1[1] *= temp[1];
16505 lightDir1[2] *= temp[2];
16506 lightDir1[3] *= temp[3];
16508 lightDir2[0] *= temp[0];
16509 lightDir2[1] *= temp[1];
16510 lightDir2[2] *= temp[2];
16511 lightDir2[3] *= temp[3];
16513 temp[0] = viewDir0[0] * viewDir0[0];
16514 temp[1] = viewDir0[1] * viewDir0[1];
16515 temp[2] = viewDir0[2] * viewDir0[2];
16516 temp[3] = viewDir0[3] * viewDir0[3];
16518 temp[0] += viewDir1[0] * viewDir1[0];
16519 temp[1] += viewDir1[1] * viewDir1[1];
16520 temp[2] += viewDir1[2] * viewDir1[2];
16521 temp[3] += viewDir1[3] * viewDir1[3];
16523 temp[0] += viewDir2[0] * viewDir2[0];
16524 temp[1] += viewDir2[1] * viewDir2[1];
16525 temp[2] += viewDir2[2] * viewDir2[2];
16526 temp[3] += viewDir2[3] * viewDir2[3];
16533 viewDir0[0] *= temp[0];
16534 viewDir0[1] *= temp[1];
16535 viewDir0[2] *= temp[2];
16536 viewDir0[3] *= temp[3];
16538 viewDir1[0] *= temp[0];
16539 viewDir1[1] *= temp[1];
16540 viewDir1[2] *= temp[2];
16541 viewDir1[3] *= temp[3];
16543 viewDir2[0] *= temp[0];
16544 viewDir2[1] *= temp[1];
16545 viewDir2[2] *= temp[2];
16546 viewDir2[3] *= temp[3];
16548 lightDir0[0] += viewDir0[0];
16549 lightDir0[1] += viewDir0[1];
16550 lightDir0[2] += viewDir0[2];
16551 lightDir0[3] += viewDir0[3];
16553 lightDir1[0] += viewDir1[0];
16554 lightDir1[1] += viewDir1[1];
16555 lightDir1[2] += viewDir1[2];
16556 lightDir1[3] += viewDir1[3];
16558 lightDir2[0] += viewDir2[0];
16559 lightDir2[1] += viewDir2[1];
16560 lightDir2[2] += viewDir2[2];
16561 lightDir2[3] += viewDir2[3];
16563 texCoords0[0] = lightDir0[0] * tangent0[0];
16564 texCoords0[1] = lightDir0[1] * tangent0[1];
16565 texCoords0[2] = lightDir0[2] * tangent0[2];
16566 texCoords0[3] = lightDir0[3] * tangent0[3];
16568 texCoords0[0] += lightDir1[0] * tangent1[0];
16569 texCoords0[1] += lightDir1[1] * tangent1[1];
16570 texCoords0[2] += lightDir1[2] * tangent1[2];
16571 texCoords0[3] += lightDir1[3] * tangent1[3];
16573 texCoords0[0] += lightDir2[0] * tangent2[0];
16574 texCoords0[1] += lightDir2[1] * tangent2[1];
16575 texCoords0[2] += lightDir2[2] * tangent2[2];
16576 texCoords0[3] += lightDir2[3] * tangent2[3];
16578 texCoords1[0] = lightDir0[0] * tangent3[0];
16579 texCoords1[1] = lightDir0[1] * tangent3[1];
16580 texCoords1[2] = lightDir0[2] * tangent3[2];
16581 texCoords1[3] = lightDir0[3] * tangent3[3];
16583 texCoords1[0] += lightDir1[0] * tangent4[0];
16584 texCoords1[1] += lightDir1[1] * tangent4[1];
16585 texCoords1[2] += lightDir1[2] * tangent4[2];
16586 texCoords1[3] += lightDir1[3] * tangent4[3];
16588 texCoords1[0] += lightDir2[0] * tangent5[0];
16589 texCoords1[1] += lightDir2[1] * tangent5[1];
16590 texCoords1[2] += lightDir2[2] * tangent5[2];
16591 texCoords1[3] += lightDir2[3] * tangent5[3];
16593 texCoords2[0] = lightDir0[0] * normal0[0];
16594 texCoords2[1] = lightDir0[1] * normal0[1];
16595 texCoords2[2] = lightDir0[2] * normal0[2];
16596 texCoords2[3] = lightDir0[3] * normal0[3];
16598 texCoords2[0] += lightDir1[0] * normal1[0];
16599 texCoords2[1] += lightDir1[1] * normal1[1];
16600 texCoords2[2] += lightDir1[2] * normal1[2];
16601 texCoords2[3] += lightDir1[3] * normal1[3];
16603 texCoords2[0] += lightDir2[0] * normal2[0];
16604 texCoords2[1] += lightDir2[1] * normal2[1];
16605 texCoords2[2] += lightDir2[2] * normal2[2];
16606 texCoords2[3] += lightDir2[3] * normal2[3];
16608 for ( int j = 0; j < 4; j++ ) {
16609 int n = usedVertNums[j];
16611 texCoords[n][0] = texCoords0[j];
16612 texCoords[n][1] = texCoords1[j];
16613 texCoords[n][2] = texCoords2[j];
16614 texCoords[n][3] = 1.0f;
16620 for ( int i = 0; i < numUsedVerts; i++ ) {
16623 temp = lightDir0[i] * lightDir0[i] + lightDir1[i] * lightDir1[i] + lightDir2[i] * lightDir2[i];
16626 lightDir0[i] *= temp;
16627 lightDir1[i] *= temp;
16628 lightDir2[i] *= temp;
16630 temp = viewDir0[i] * viewDir0[i] + viewDir1[i] * viewDir1[i] + viewDir2[i] * viewDir2[i];
16633 viewDir0[i] *= temp;
16634 viewDir1[i] *= temp;
16635 viewDir2[i] *= temp;
16637 lightDir0[i] += viewDir0[i];
16638 lightDir1[i] += viewDir1[i];
16639 lightDir2[i] += viewDir2[i];
16641 texCoords0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
16642 texCoords1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
16643 texCoords2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];
16645 int n = usedVertNums[i];
16646 texCoords[n][0] = texCoords0[i];
16647 texCoords[n][1] = texCoords1[i];
16648 texCoords[n][2] = texCoords2[i];
16649 texCoords[n][3] = 1.0f;
16667 mov esi, lightOrigin
16668 movaps xmm5, SIMD_SP_lastOne
16669 movss xmm6, [esi+0]
16670 movhps xmm6, [esi+4]
16671 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 3, 1 )
16672 orps xmm6, SIMD_SP_lastOne
16680 mov edi, vertexCache
16689 prefetchnta [edx+128]
16690 prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
16692 cmp dword ptr [edx+eax+0], ebx
16695 mov dword ptr [edx+eax+0], ecx
16696 movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
16697 movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
16699 shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
16701 movaps [edi+0*16], xmm0
16703 movaps [edi+1*16], xmm0
16707 cmp dword ptr [edx+eax+4], ebx
16710 mov dword ptr [edx+eax+4], ecx
16711 movss xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
16712 movhps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
16714 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
16716 movaps [edi+0*16], xmm1
16718 movaps [edi+1*16], xmm1
16722 cmp dword ptr [edx+eax+8], ebx
16725 mov dword ptr [edx+eax+8], ecx
16726 movss xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
16727 movhps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
16729 shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 );
16731 movaps [edi+0*16], xmm2
16733 movaps [edi+1*16], xmm2
16737 cmp dword ptr [edx+eax+12], ebx
16740 mov dword ptr [edx+eax+12], ecx
16741 movss xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
16742 movhps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
16744 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
16746 movaps [edi+0*16], xmm3
16748 movaps [edi+1*16], xmm3
16752 add esi, 4*DRAWVERT_SIZE
16765 cmp dword ptr [edx+eax+0], ebx
16768 mov dword ptr [edx+eax+0], ecx
16769 movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
16770 movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
16772 shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
16774 movaps [edi+0*16], xmm0
16776 movaps [edi+1*16], xmm0
16781 add esi, DRAWVERT_SIZE
16794 for ( int i = 0; i < numVerts; i++ ) {
16795 if ( vertRemap[i] ) {
16799 vertexCache[outVerts+0][0] = v[0];
16800 vertexCache[outVerts+0][1] = v[1];
16801 vertexCache[outVerts+0][2] = v[2];
16802 vertexCache[outVerts+0][3] = 1.0f;
16807 vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
16808 vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
16809 vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
16810 vertexCache[outVerts+1][3] = 0.0f;
16811 vertRemap[i] = outVerts;
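// Editor's note: each used vertex is emitted into the shadow cache twice: once as a point
// (w = 1.0f) and once as the direction from the light (w = 0.0f), which the shadow-volume
// code extrudes to infinity. vertRemap[i] records where the pair landed; the advance of
// outVerts (presumably outVerts += 2) is in a line not shown in this listing.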
16828 movaps xmm4, SIMD_SP_lastOne
16834 mov edi, vertexCache
16843 prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
16845 movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
16846 movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
16847 shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
16848 movaps [edi+eax+1*16], xmm0
16850 movaps [edi+eax+0*16], xmm0
16852 movss xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
16853 movhps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
16854 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
16855 movaps [edi+eax+3*16], xmm1
16857 movaps [edi+eax+2*16], xmm1
16859 movss xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
16860 movhps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
16861 shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 );
16862 movaps [edi+eax+5*16], xmm2
16864 movaps [edi+eax+4*16], xmm2
16866 movss xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
16867 movhps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
16868 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
16869 movaps [edi+eax+7*16], xmm3
16871 movaps [edi+eax+6*16], xmm3
16873 add esi, 4*DRAWVERT_SIZE
16886 movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
16887 movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
16888 shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
16889 movaps [edi+eax+1*16], xmm0
16891 movaps [edi+eax+0*16], xmm0
16893 add esi, DRAWVERT_SIZE
16899 return numVerts * 2;
16903 for ( int i = 0; i < numVerts; i++ ) {
16905 vertexCache[i*2+0][0] = v[0];
16906 vertexCache[i*2+0][1] = v[1];
16907 vertexCache[i*2+0][2] = v[2];
16908 vertexCache[i*2+0][3] = 1.0f;
16910 vertexCache[i*2+1][0] = v[0];
16911 vertexCache[i*2+1][1] = v[1];
16912 vertexCache[i*2+1][2] = v[2];
16913 vertexCache[i*2+1][3] = 0.0f;
16915 return numVerts * 2;
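// Editor's note: the vertex-program variant above needs no vertRemap; every vertex is written
// twice with identical xyz, first with w = 1.0f and then with w = 0.0f, and the GPU vertex
// program performs the away-from-light projection. Hence the fixed return of numVerts * 2
// output vertices.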
16925 static void SSE_UpSample11kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
16930 mov eax, numSamples
16941 movsx ecx, word ptr [esi+eax+0]
16943 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
16944 movlps [edi-2*4*4+0], xmm0
16945 movhps [edi-2*4*4+8], xmm0
16947 movsx edx, word ptr [esi+eax+2]
16949 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
16950 movlps [edi-1*4*4+0], xmm1
16951 movhps [edi-1*4*4+8], xmm1
16957 mov eax, numSamples
16961 movsx ecx, word ptr [esi]
16963 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
16964 movlps [edi+0], xmm0
16965 movhps [edi+8], xmm0
16976 static void SSE_UpSample11kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
16981 mov eax, numSamples
16992 movsx ecx, word ptr [esi+eax+0]
16995 movsx edx, word ptr [esi+eax+2]
16998 unpcklps xmm0, xmm1
17000 movlps [edi-8*4+0], xmm0
17001 movlps [edi-8*4+8], xmm0
17002 movlps [edi-4*4+0], xmm0
17003 movlps [edi-4*4+8], xmm0
17017 static void SSE_UpSample22kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
17022 mov eax, numSamples
17033 movsx ecx, word ptr [esi+eax+0]
17036 movsx edx, word ptr [esi+eax+2]
17039 shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
17040 movlps [edi-4*4+0], xmm0
17041 movhps [edi-4*4+8], xmm0
17047 mov eax, numSamples
17051 movsx ecx, word ptr [esi]
17053 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
17065 static void SSE_UpSample22kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
17070 mov eax, numSamples
17081 movsx ecx, word ptr [esi+eax+0]
17083 movss [edi-4*4], xmm0
17084 movss [edi-2*4], xmm0
17086 movsx edx, word ptr [esi+eax+2]
17088 movss [edi-3*4], xmm1
17089 movss [edi-1*4], xmm1
17103 static void SSE_UpSample44kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
17108 mov eax, numSamples
17119 movsx ecx, word ptr [esi+eax+0]
17121 movss [edi-2*4], xmm0
17123 movsx edx, word ptr [esi+eax+2]
17125 movss [edi-1*4], xmm1
17131 mov eax, numSamples
17135 movsx ecx, word ptr [esi]
17151 if ( kHz == 11025 ) {
17152 if ( numChannels == 1 ) {
17153 SSE_UpSample11kHzMonoPCMTo44kHz( dest, src, numSamples );
17155 SSE_UpSample11kHzStereoPCMTo44kHz( dest, src, numSamples );
17157 } else if ( kHz == 22050 ) {
17158 if ( numChannels == 1 ) {
17159 SSE_UpSample22kHzMonoPCMTo44kHz( dest, src, numSamples );
17161 SSE_UpSample22kHzStereoPCMTo44kHz( dest, src, numSamples );
17163 } else if ( kHz == 44100 ) {
17164 SSE_UpSample44kHzMonoPCMTo44kHz( dest, src, numSamples );
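// Editor's note: the PCM upsamplers simply repeat samples: 11025 Hz input is written four
// times per sample, 22050 Hz twice, and 44100 Hz is only converted from short to float.
// Scalar sketch of the 11 kHz mono case (hypothetical helper with its own parameter names,
// not the engine's SSE routine):
static void UpSample11kHzMonoPCMTo44kHz_Scalar( float *dest, const short *src, int numInputSamples ) {
	for ( int i = 0; i < numInputSamples; i++ ) {
		float s = (float) src[i];
		dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = s;	// 4x duplication: 11025 -> 44100
	}
}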
17175 static void SSE_UpSample11kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
17176 float constant = 32768.0f;
17180 movss xmm7, constant
17181 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
17183 mov eax, numSamples
17194 movss xmm0, [esi+eax+0]
17196 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
17197 movlps [edi-32], xmm0
17198 movlps [edi-24], xmm0
17200 movss xmm1, [esi+eax+4]
17202 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
17203 movlps [edi-16], xmm1
17204 movlps [edi- 8], xmm1
17210 mov eax, numSamples
17216 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
17217 movlps [edi+0], xmm0
17218 movlps [edi+8], xmm0
17229 static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
17230 float constant = 32768.0f;
17236 movss xmm7, constant
17237 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
17239 mov eax, numSamples
17251 movlps xmm0, [ecx+eax]
17252 movlps xmm1, [edx+eax]
17253 unpcklps xmm0, xmm1
17255 movlps [edi-8*8], xmm0
17256 movlps [edi-7*8], xmm0
17257 movlps [edi-6*8], xmm0
17258 movlps [edi-5*8], xmm0
17259 movhps [edi-4*8], xmm0
17260 movhps [edi-3*8], xmm0
17261 movhps [edi-2*8], xmm0
17262 movhps [edi-1*8], xmm0
17268 mov eax, numSamples
17274 unpcklps xmm0, xmm1
17276 movlps [edi+0*8], xmm0
17277 movlps [edi+1*8], xmm0
17278 movlps [edi+2*8], xmm0
17279 movlps [edi+3*8], xmm0
17290 static void SSE_UpSample22kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
17291 float constant = 32768.0f;
17295 movss xmm7, constant
17296 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
17298 mov eax, numSamples
17309 movss xmm0, [esi+eax+0]
17310 movss xmm1, [esi+eax+4]
17311 shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
17313 movlps [edi-16], xmm0
17314 movhps [edi- 8], xmm0
17320 mov eax, numSamples
17326 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
17327 movlps [edi+0], xmm0
17338 static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
17339 float constant = 32768.0f;
17345 movss xmm7, constant
17346 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
17348 mov eax, numSamples
17360 movlps xmm0, [ecx+eax]
17361 movlps xmm1, [edx+eax]
17362 unpcklps xmm0, xmm1
17364 movlps [edi-4*8], xmm0
17365 movlps [edi-3*8], xmm0
17366 movhps [edi-2*8], xmm0
17367 movhps [edi-1*8], xmm0
17373 mov eax, numSamples
17379 unpcklps xmm0, xmm1
17381 movlps [edi+0*8], xmm0
17382 movlps [edi+1*8], xmm0
17393 static void SSE_UpSample44kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
17394 float constant = 32768.0f;
17395 KFLOAT_CA( mul, dest, src, constant, numSamples )
17403 static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
17404 float constant = 32768.0f;
17410 movss xmm7, constant
17411 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
17413 mov eax, numSamples
17425 movlps xmm0, [ecx+eax]
17426 movlps xmm1, [edx+eax]
17427 unpcklps xmm0, xmm1
17429 movlps [edi-2*8], xmm0
17430 movhps [edi-1*8], xmm0
17436 mov eax, numSamples
17442 unpcklps xmm0, xmm1
17444 movlps [edi+0*8], xmm0
17458 if ( kHz == 11025 ) {
17459 if ( numChannels == 1 ) {
17460 SSE_UpSample11kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
17462 SSE_UpSample11kHzStereoOGGTo44kHz( dest, ogg, numSamples );
17464 } else if ( kHz == 22050 ) {
17465 if ( numChannels == 1 ) {
17466 SSE_UpSample22kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
17468 SSE_UpSample22kHzStereoOGGTo44kHz( dest, ogg, numSamples );
17470 } else if ( kHz == 44100 ) {
17471 if ( numChannels == 1 ) {
17472 SSE_UpSample44kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
17474 SSE_UpSample44kHzStereoOGGTo44kHz( dest, ogg, numSamples );
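// Editor's note: the OGG paths receive floats in roughly [-1,1] from the decoder; the constant
// 32768.0f scales them to the 16-bit range the mixer works in, and the stereo variants
// interleave the two channel pointers. Scalar sketch of the 1:1 case implemented above with
// KFLOAT_CA( mul, ... ) (hypothetical helper, illustrative only):
static void UpSample44kHzMonoOGGTo44kHz_Scalar( float *dest, const float *src, int numSamples ) {
	for ( int i = 0; i < numSamples; i++ ) {
		dest[i] = src[i] * 32768.0f;	// decoder output scaled to 16-bit range
	}
}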
17489 ALIGN16( float incs[2] );
17508 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
17510 shufps xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
17516 movaps xmm0, [esi+eax+0*4*4]
17518 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
17520 addps xmm0, [edi-4*4*4]
17522 movaps [edi-4*4*4], xmm0
17524 shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )
17526 addps xmm1, [edi-3*4*4]
17528 movaps [edi-3*4*4], xmm1
17530 movaps xmm2, [esi+eax+1*4*4]
17532 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
17534 addps xmm2, [edi-2*4*4]
17536 movaps [edi-2*4*4], xmm2
17538 shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 )
17540 addps xmm3, [edi-1*4*4]
17542 movaps [edi-1*4*4], xmm3
17564 sL1 = lastV[0] + incL;
17565 sR1 = lastV[1] + incR;
17571 mixBuffer[i*2+0] += samples[i+0] * sL0;
17572 mixBuffer[i*2+1] += samples[i+0] * sR0;
17573 mixBuffer[i*2+2] += samples[i+1] * sL1;
17574 mixBuffer[i*2+3] += samples[i+1] * sR1;
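// Editor's note: sL0/sR0 and sL1/sR1 are consecutive points on a linear volume ramp; the
// increments advance the ramp by two samples per loop iteration, so the left/right volumes
// glide from lastV toward currentV across the buffer. Hypothetical scalar sketch of such a
// ramped mix (names and increment setup are illustrative, not the engine's exact math):
static void MixMonoRamped( float *mixBuffer, const float *samples, int numSamples,
                           float sL, float sR, float incL, float incR ) {
	for ( int i = 0; i < numSamples; i++ ) {
		mixBuffer[i*2+0] += samples[i] * sL;	// left channel
		mixBuffer[i*2+1] += samples[i] * sR;	// right channel
		sL += incL;
		sR += incR;
	}
}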
17592 ALIGN16( float incs[2] );
17611 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
17613 shufps xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
17619 movaps xmm0, [esi+eax+0*4*4]
17621 addps xmm0, [edi-4*4*4]
17623 movaps [edi-4*4*4], xmm0
17625 movaps xmm2, [esi+eax+1*4*4]
17627 addps xmm2, [edi-3*4*4]
17629 movaps [edi-3*4*4], xmm2
17631 movaps xmm3, [esi+eax+2*4*4]
17633 addps xmm3, [edi-2*4*4]
17635 movaps [edi-2*4*4], xmm3
17637 movaps xmm4, [esi+eax+3*4*4]
17639 addps xmm4, [edi-1*4*4]
17641 movaps [edi-1*4*4], xmm4
17663 sL1 = lastV[0] + incL;
17664 sR1 = lastV[1] + incR;
17670 mixBuffer[i*2+0] += samples[i*2+0] * sL0;
17671 mixBuffer[i*2+1] += samples[i*2+1] * sR0;
17672 mixBuffer[i*2+2] += samples[i*2+2] * sL1;
17673 mixBuffer[i*2+3] += samples[i*2+3] * sR1;
17691 ALIGN16( float incs[6] );
17711 movlps xmm2, [ecx+ 0]
17712 movhps xmm2, [ecx+ 8]
17713 movlps xmm3, [ecx+16]
17715 shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
17716 shufps xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
17720 movlps xmm7, incs+8
17721 movhps xmm7, incs+16
17724 shufps xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
17726 shufps xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
17734 movaps xmm0, [esi+eax]
17737 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
17739 addps xmm1, [edi-6*16]
17741 movaps [edi-6*16], xmm1
17744 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
17746 addps xmm1, [edi-5*16]
17748 movaps [edi-5*16], xmm1
17751 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
17753 addps xmm1, [edi-4*16]
17755 movaps [edi-4*16], xmm1
17758 shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 2, 2 )
17760 addps xmm1, [edi-3*16]
17762 movaps [edi-3*16], xmm1
17765 shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )
17767 addps xmm1, [edi-2*16]
17769 movaps [edi-2*16], xmm1
17771 shufps xmm0, xmm0, R_SHUFFLEPS( 3, 3, 3, 3 )
17773 addps xmm0, [edi-1*16]
17775 movaps [edi-1*16], xmm0
17785 float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
17786 float incL0, incL1, incL2, incL3, incL4, incL5;
17804 sL6 = lastV[0] + incL0;
17805 sL7 = lastV[1] + incL1;
17806 sL8 = lastV[2] + incL2;
17807 sL9 = lastV[3] + incL3;
17808 sL10 = lastV[4] + incL4;
17809 sL11 = lastV[5] + incL5;
17819 mixBuffer[i*6+ 0] += samples[i+0] * sL0;
17820 mixBuffer[i*6+ 1] += samples[i+0] * sL1;
17821 mixBuffer[i*6+ 2] += samples[i+0] * sL2;
17822 mixBuffer[i*6+ 3] += samples[i+0] * sL3;
17824 mixBuffer[i*6+ 4] += samples[i+0] * sL4;
17825 mixBuffer[i*6+ 5] += samples[i+0] * sL5;
17826 mixBuffer[i*6+ 6] += samples[i+1] * sL6;
17827 mixBuffer[i*6+ 7] += samples[i+1] * sL7;
17829 mixBuffer[i*6+ 8] += samples[i+1] * sL8;
17830 mixBuffer[i*6+ 9] += samples[i+1] * sL9;
17831 mixBuffer[i*6+10] += samples[i+1] * sL10;
17832 mixBuffer[i*6+11] += samples[i+1] * sL11;
17861 ALIGN16( float incs[6] );
17883 movlps xmm2, [ecx+ 0]
17884 movhps xmm2, [ecx+ 8]
17885 movlps xmm3, [ecx+16]
17887 shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
17888 shufps xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
17892 movlps xmm7, incs+ 8
17893 movhps xmm7, incs+16
17896 shufps xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
17898 shufps xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
17906 movaps xmm0, [esi+eax+0]
17909 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 0 )
17911 addps xmm1, [edi-3*16]
17913 movaps [edi-3*16], xmm1
17916 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 2, 3 )
17918 addps xmm1, [edi-2*16]
17920 movaps [edi-2*16], xmm1
17924 shufps xmm0, xmm0, R_SHUFFLEPS( 2, 2, 2, 3 )
17926 addps xmm0, [edi-1*16]
17928 movaps [edi-1*16], xmm0
17938 float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
17939 float incL0, incL1, incL2, incL3, incL4, incL5;
17959 sL6 = lastV[0] + incL0;
17960 sL7 = lastV[1] + incL1;
17961 sL8 = lastV[2] + incL2;
17962 sL9 = lastV[3] + incL3;
17963 sL10 = lastV[4] + incL4;
17964 sL11 = lastV[5] + incL5;
17974 mixBuffer[i*6+ 0] += samples[i*2+0+0] * sL0;
17975 mixBuffer[i*6+ 1] += samples[i*2+0+1] * sL1;
17976 mixBuffer[i*6+ 2] += samples[i*2+0+0] * sL2;
17977 mixBuffer[i*6+ 3] += samples[i*2+0+0] * sL3;
17979 mixBuffer[i*6+ 4] += samples[i*2+0+0] * sL4;
17980 mixBuffer[i*6+ 5] += samples[i*2+0+1] * sL5;
17981 mixBuffer[i*6+ 6] += samples[i*2+2+0] * sL6;
17982 mixBuffer[i*6+ 7] += samples[i*2+2+1] * sL7;
17984 mixBuffer[i*6+ 8] += samples[i*2+2+0] * sL8;
17985 mixBuffer[i*6+ 9] += samples[i*2+2+0] * sL9;
17986 mixBuffer[i*6+10] += samples[i*2+2+0] * sL10;
17987 mixBuffer[i*6+11] += samples[i*2+2+1] * sL11;
18020 mov eax, numSamples
18029 movaps xmm0, [edi+eax+0*16]
18030 movaps xmm2, [edi+eax+1*16]
18031 movaps xmm4, [edi+eax+2*16]
18032 movaps xmm6, [edi+eax+3*16]
18041 prefetchnta [edi+eax+64]
18048 prefetchnta [edi+eax+128]
18062 movq [esi-4*4*2], mm0
18063 movq [esi-3*4*2], mm2
18064 movq [esi-2*4*2], mm4
18065 movq [esi-1*4*2], mm6
18074 for ( int i = 0; i < numSamples; i++ ) {
18075 if ( mixBuffer[i] <= -32768.0f ) {
18076 samples[i] = -32768;
18077 } else if ( mixBuffer[i] >= 32767.0f ) {
18078 samples[i] = 32767;
18080 samples[i] = (short) mixBuffer[i];
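// Editor's note: the MMX/SSE path above converts four floats at a time with cvtps2pi and
// packs them with signed saturation before the movq stores; the scalar fallback clamps and
// casts one sample at a time. Intrinsics sketch of the 4-wide convert (illustrative; note
// cvtps2pi rounds with the current rounding mode, whereas the scalar cast truncates):
static inline void MixedFloatsToShorts4( short *out, const float *in ) {
	__m128 f = _mm_load_ps( in );							// four mixed samples, 16-byte aligned
	__m64 lo = _mm_cvtps_pi32( f );							// low two floats -> int32
	__m64 hi = _mm_cvtps_pi32( _mm_movehl_ps( f, f ) );	// high two floats -> int32
	*(__m64 *)out = _mm_packs_pi32( lo, hi );				// saturate to [-32768, 32767]
	_mm_empty();											// clear MMX state before FPU use
}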