doom3/doxygen/_simd___generic_8cpp_source.html

 /*

 ===========================================================================


 Doom 3 GPL Source Code

 Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.


 This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").


 Doom 3 Source Code is free software: you can redistribute it and/or modify

 it under the terms of the GNU General Public License as published by

 the Free Software Foundation, either version 3 of the License, or

 (at your option) any later version.


 Doom 3 Source Code is distributed in the hope that it will be useful,

 but WITHOUT ANY WARRANTY; without even the implied warranty of

 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

 GNU General Public License for more details.


 You should have received a copy of the GNU General Public License

 along with Doom 3 Source Code.  If not, see <http://www.gnu.org/licenses/>.


 In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code.  If not, please request a copy in writing from id Software at the address below.


 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.


 ===========================================================================

 */


 #include "../precompiled.h"

 #pragma hdrstop


 #include "Simd_Generic.h"


 //===============================================================

 //

 //      Generic implementation of idSIMDProcessor

 //

 //===============================================================


 // Loop helpers used by the element-wise routines in this file.
 // Each UNROLLn(Y) expands to a braced block that invokes the statement macro Y once,
 // in ascending order, for every index in [0, count). "count" is not a macro parameter:
 // an integer named count must be in scope at the expansion site.
 // The expansion supplies the ';' after each Y(index), so Y may be defined with or
 // without a trailing semicolon. _IX and _NM are locals of the expanded block.
 //   UNROLL1: one index per iteration.
 //   UNROLL2: two per iteration, then at most one leftover index.
 //   UNROLL4: four per iteration, then a scalar loop over the leftovers.
 //   UNROLL8: eight per iteration, then pairs, then at most one leftover index.
 #define UNROLL1(Y) { int _IX; for (_IX=0;_IX<count;_IX++) {Y(_IX);} }

 #define UNROLL2(Y) { int _IX, _NM = count&0xfffffffe; for (_IX=0;_IX<_NM;_IX+=2){Y(_IX+0);Y(_IX+1);} if (_IX < count) {Y(_IX);}}

 #define UNROLL4(Y) { int _IX, _NM = count&0xfffffffc; for (_IX=0;_IX<_NM;_IX+=4){Y(_IX+0);Y(_IX+1);Y(_IX+2);Y(_IX+3);}for(;_IX<count;_IX++){Y(_IX);}}

 #define UNROLL8(Y) { int _IX, _NM = count&0xfffffff8; for (_IX=0;_IX<_NM;_IX+=8){Y(_IX+0);Y(_IX+1);Y(_IX+2);Y(_IX+3);Y(_IX+4);Y(_IX+5);Y(_IX+6);Y(_IX+7);} _NM = count&0xfffffffe; for(;_IX<_NM;_IX+=2){Y(_IX); Y(_IX+1);} if (_IX < count) {Y(_IX);} }


 // NODEFAULT is placed inside a switch whose default case is expected never to be taken.
 //   _DEBUG builds:  expands to a default label that asserts.
 //   _WIN32 builds:  expands to a default label carrying the MSVC __assume( 0 ) hint.
 //   otherwise:      expands to nothing, so the switch simply has no default label.
 #ifdef _DEBUG

 #define NODEFAULT       default: assert( 0 )

 #elif _WIN32

 #define NODEFAULT       default: __assume( 0 )

 #else

 #define NODEFAULT

 #endif


 /*

 ============

 idSIMD_Generic::GetName

 ============

 */

 const char * idSIMD_Generic::GetName( void ) const {

         // Fixed label for this implementation; the returned pointer refers to a string literal.
         return "generic code";

 }


 /*

 ============

 idSIMD_Generic::Add


   dst[i] = constant + src[i];

 ============

 */

 void VPCALL idSIMD_Generic::Add( float *dst, const float constant, const float *src, const int count ) {
         // Adds the scalar to every input element, visiting indices in ascending order.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = src[i] + constant;
         }
 }


 /*

 ============

 idSIMD_Generic::Add


   dst[i] = src0[i] + src1[i];

 ============

 */

 void VPCALL idSIMD_Generic::Add( float *dst, const float *src0, const float *src1, const int count ) {
         // Element-wise sum of the two input arrays, in ascending index order.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = src0[i] + src1[i];
         }
 }


 /*

 ============

 idSIMD_Generic::Sub


   dst[i] = constant - src[i];

 ============

 */

 void VPCALL idSIMD_Generic::Sub( float *dst, const float constant, const float *src, const int count ) {
         // The scalar is widened to double once, so each subtraction is evaluated in
         // double before the result is narrowed back into the float destination.
         const double minuend = constant;
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = minuend - src[i];
         }
 }


 /*

 ============

 idSIMD_Generic::Sub


   dst[i] = src0[i] - src1[i];

 ============

 */

 void VPCALL idSIMD_Generic::Sub( float *dst, const float *src0, const float *src1, const int count ) {
         // Element-wise difference src0 - src1, in ascending index order.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = src0[i] - src1[i];
         }
 }


 /*

 ============

 idSIMD_Generic::Mul


   dst[i] = constant * src[i];

 ============

 */

 void VPCALL idSIMD_Generic::Mul( float *dst, const float constant, const float *src0, const int count) {
         // The scalar is widened to double once; each product is formed in double and
         // narrowed when stored into the float destination.
         const double factor = constant;
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = factor * src0[i];
         }
 }


 /*

 ============

 idSIMD_Generic::Mul


   dst[i] = src0[i] * src1[i];

 ============

 */

 void VPCALL idSIMD_Generic::Mul( float *dst, const float *src0, const float *src1, const int count ) {
         // Element-wise product of the two input arrays, in ascending index order.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = src0[i] * src1[i];
         }
 }


 /*

 ============

 idSIMD_Generic::Div


   dst[i] = constant / divisor[i];

 ============

 */

 void VPCALL idSIMD_Generic::Div( float *dst, const float constant, const float *divisor, const int count ) {
         // The scalar numerator is widened to double once; each quotient is formed in
         // double and narrowed when stored. No check is made for zero divisors.
         const double numerator = constant;
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = numerator / divisor[i];
         }
 }


 /*

 ============

 idSIMD_Generic::Div


   dst[i] = src0[i] / src1[i];

 ============

 */

 void VPCALL idSIMD_Generic::Div( float *dst, const float *src0, const float *src1, const int count ) {
         // Element-wise quotient src0 / src1. No check is made for zero divisors.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = src0[i] / src1[i];
         }
 }


 /*

 ============

 idSIMD_Generic::MulAdd


   dst[i] += constant * src[i];

 ============

 */

 void VPCALL idSIMD_Generic::MulAdd( float *dst, const float constant, const float *src, const int count ) {
         // Accumulates scale * src[i] into dst[i]. The scalar is held as a double, so
         // the product and the addition are evaluated in double before being stored.
         const double scale = constant;
         for ( int i = 0; i < count; i++ ) {
                 dst[i] += scale * src[i];
         }
 }


 /*

 ============

 idSIMD_Generic::MulAdd


   dst[i] += src0[i] * src1[i];

 ============

 */

 void VPCALL idSIMD_Generic::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
         // Accumulates the element-wise product of the inputs into dst.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] += src0[i] * src1[i];
         }
 }


 /*

 ============

 idSIMD_Generic::MulSub


   dst[i] -= constant * src[i];

 ============

 */

 void VPCALL idSIMD_Generic::MulSub( float *dst, const float constant, const float *src, const int count ) {
         // Subtracts scale * src[i] from dst[i]. The scalar is held as a double, so
         // the product and the subtraction are evaluated in double before being stored.
         const double scale = constant;
         for ( int i = 0; i < count; i++ ) {
                 dst[i] -= scale * src[i];
         }
 }


 /*

 ============

 idSIMD_Generic::MulSub


   dst[i] -= src0[i] * src1[i];

 ============

 */

 void VPCALL idSIMD_Generic::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
         // Subtracts the element-wise product of the inputs from dst.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] -= src0[i] * src1[i];
         }
 }


 /*

 ============

 idSIMD_Generic::Dot


   dst[i] = constant * src[i];

 ============

 */

 void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
         // One result per source vector: the idVec3 operator* product with the constant.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = constant * src[i];
         }
 }


 /*

 ============

 idSIMD_Generic::Dot


   dst[i] = constant * src[i].Normal() + src[i][3];

 ============

 */

 void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
         // One result per plane: constant * normal, plus the plane's fourth component.
         for ( int i = 0; i < count; i++ ) {
                 const idPlane &plane = src[i];
                 dst[i] = constant * plane.Normal() + plane[3];
         }
 }


 /*

 ============

 idSIMD_Generic::Dot


   dst[i] = constant * src[i].xyz;

 ============

 */

 void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
         // One result per vertex: the constant multiplied with the vertex's xyz member.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = constant * src[i].xyz;
         }
 }


 /*

 ============

 idSIMD_Generic::Dot


   dst[i] = constant.Normal() * src[i] + constant[3];

 ============

 */

 void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
         // One result per vector: plane normal * vector, plus the plane's fourth component.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = constant.Normal() * src[i] + constant[3];
         }
 }


 /*

 ============

 idSIMD_Generic::Dot


   dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];

 ============

 */

 void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
         // One result per plane: product of the two normals plus the product of the
         // two fourth components.
         for ( int i = 0; i < count; i++ ) {
                 const idPlane &plane = src[i];
                 dst[i] = constant.Normal() * plane.Normal() + constant[3] * plane[3];
         }
 }


 /*

 ============

 idSIMD_Generic::Dot


   dst[i] = constant.Normal() * src[i].xyz + constant[3];

 ============

 */

 void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
         // One result per vertex: plane normal * vertex xyz, plus the plane's fourth component.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = constant.Normal() * src[i].xyz + constant[3];
         }
 }


 /*

 ============

 idSIMD_Generic::Dot


   dst[i] = src0[i] * src1[i];

 ============

 */

 void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
         // Pairs up the two vector arrays and stores one operator* product per pair.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = src0[i] * src1[i];
         }
 }


 /*

 ============

 idSIMD_Generic::Dot


   dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...

 ============

 */

 void VPCALL idSIMD_Generic::Dot( float &dot, const float *src1, const float *src2, const int count ) {
         // Writes the sum of src1[i] * src2[i] over i in [0, count) to dot.
         // The #if selects between the unrolled implementation (active) and a plain
         // reference loop kept for comparison.
         // NOTE(review): count is expected to be non-negative; a negative count would
         // reach the default case below, which reads elements 0..3 unconditionally.
 #if 1

         switch( count ) {
                 // Counts 0..3 are written out directly.
                 case 0: {
                         dot = 0.0f;
                         return;
                 }
                 case 1: {
                         dot = src1[0] * src2[0];
                         return;
                 }
                 case 2: {
                         dot = src1[0] * src2[0] + src1[1] * src2[1];
                         return;
                 }
                 case 3: {
                         dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2];
                         return;
                 }
                 default: {
                         int i;
                         double s0, s1, s2, s3;

                         // Four independent double accumulators, seeded with the first four products.
                         s0 = src1[0] * src2[0];
                         s1 = src1[1] * src2[1];
                         s2 = src1[2] * src2[2];
                         s3 = src1[3] * src2[3];

                         // Main loop: eight products per pass, two per accumulator.
                         for ( i = 4; i < count-7; i += 8 ) {
                                 s0 += src1[i+0] * src2[i+0];
                                 s1 += src1[i+1] * src2[i+1];
                                 s2 += src1[i+2] * src2[i+2];
                                 s3 += src1[i+3] * src2[i+3];
                                 s0 += src1[i+4] * src2[i+4];
                                 s1 += src1[i+5] * src2[i+5];
                                 s2 += src1[i+6] * src2[i+6];
                                 s3 += src1[i+7] * src2[i+7];
                         }

                         // Between 0 and 7 elements remain. Entry is at the case matching the
                         // remainder and every case intentionally falls through to the next,
                         // so exactly the remaining products are added.
                         switch( count - i ) {
                                 NODEFAULT;
                                 case 7: s0 += src1[i+6] * src2[i+6];    // fall through
                                 case 6: s1 += src1[i+5] * src2[i+5];    // fall through
                                 case 5: s2 += src1[i+4] * src2[i+4];    // fall through
                                 case 4: s3 += src1[i+3] * src2[i+3];    // fall through
                                 case 3: s0 += src1[i+2] * src2[i+2];    // fall through
                                 case 2: s1 += src1[i+1] * src2[i+1];    // fall through
                                 case 1: s2 += src1[i+0] * src2[i+0];    // fall through
                                 case 0: break;
                         }

                         // Combine the partial sums in double, then narrow once on the store.
                         double sum;
                         sum = s3;
                         sum += s2;
                         sum += s1;
                         sum += s0;
                         dot = sum;
                 }
         }

 #else

         // Reference implementation: straightforward float accumulation.
         // The loop index is declared here so this branch compiles when enabled.
         int i;

         dot = 0.0f;
         for ( i = 0; i < count; i++ ) {
                 dot += src1[i] * src2[i];
         }

 #endif
 }


 /*

 ============

 idSIMD_Generic::CmpGT


   dst[i] = src0[i] > constant;

 ============

 */

 void VPCALL idSIMD_Generic::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
         // Each output byte becomes 1 when the input exceeds the constant, otherwise 0.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = src0[i] > constant;
         }
 }


 /*

 ============

 idSIMD_Generic::CmpGT


   dst[i] |= ( src0[i] > constant ) << bitNum;

 ============

 */

 void VPCALL idSIMD_Generic::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
         // ORs the comparison result (0 or 1) into bit bitNum of each output byte;
         // the other bits of dst are left as they were.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] |= ( src0[i] > constant ) << bitNum;
         }
 }


 /*

 ============

 idSIMD_Generic::CmpGE


   dst[i] = src0[i] >= constant;

 ============

 */

 void VPCALL idSIMD_Generic::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
         // Each output byte becomes 1 when the input is at least the constant, otherwise 0.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = src0[i] >= constant;
         }
 }


 /*

 ============

 idSIMD_Generic::CmpGE


   dst[i] |= ( src0[i] >= constant ) << bitNum;

 ============

 */

 void VPCALL idSIMD_Generic::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
         // ORs the comparison result (0 or 1) into bit bitNum of each output byte;
         // the other bits of dst are left as they were.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] |= ( src0[i] >= constant ) << bitNum;
         }
 }


 /*

 ============

 idSIMD_Generic::CmpLT


   dst[i] = src0[i] < constant;

 ============

 */

 void VPCALL idSIMD_Generic::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
         // Each output byte becomes 1 when the input is below the constant, otherwise 0.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = src0[i] < constant;
         }
 }


 /*

 ============

 idSIMD_Generic::CmpLT


   dst[i] |= ( src0[i] < constant ) << bitNum;

 ============

 */

 void VPCALL idSIMD_Generic::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
         // ORs the comparison result (0 or 1) into bit bitNum of each output byte;
         // the other bits of dst are left as they were.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] |= ( src0[i] < constant ) << bitNum;
         }
 }


 /*

 ============

 idSIMD_Generic::CmpLE


   dst[i] = src0[i] <= constant;

 ============

 */

 void VPCALL idSIMD_Generic::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
         // Each output byte becomes 1 when the input is at most the constant, otherwise 0.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = src0[i] <= constant;
         }
 }


 /*

 ============

 idSIMD_Generic::CmpLE


   dst[i] |= ( src0[i] <= constant ) << bitNum;

 ============

 */

 void VPCALL idSIMD_Generic::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
         // ORs the comparison result (0 or 1) into bit bitNum of each output byte;
         // the other bits of dst are left as they were.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] |= ( src0[i] <= constant ) << bitNum;
         }
 }


 /*

 ============

 idSIMD_Generic::MinMax

 ============

 */

 void VPCALL idSIMD_Generic::MinMax( float &min, float &max, const float *src, const int count ) {
         // Start from an inverted range so the first element replaces both bounds.
         // With count <= 0 the outputs stay at +INFINITY / -INFINITY.
         min = idMath::INFINITY;
         max = -idMath::INFINITY;

         for ( int i = 0; i < count; i++ ) {
                 const float &value = src[i];
                 if ( value < min ) {
                         min = value;
                 }
                 if ( value > max ) {
                         max = value;
                 }
         }
 }


 /*

 ============

 idSIMD_Generic::MinMax

 ============

 */

 void VPCALL idSIMD_Generic::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
         // Per-component bounds, starting from an inverted range.
         min[0] = min[1] = idMath::INFINITY;
         max[0] = max[1] = -idMath::INFINITY;

         for ( int i = 0; i < count; i++ ) {
                 const idVec2 &v = src[i];
                 // Component 0 is handled before component 1, lower bound before upper.
                 for ( int axis = 0; axis < 2; axis++ ) {
                         if ( v[axis] < min[axis] ) {
                                 min[axis] = v[axis];
                         }
                         if ( v[axis] > max[axis] ) {
                                 max[axis] = v[axis];
                         }
                 }
         }
 }


 /*

 ============

 idSIMD_Generic::MinMax

 ============

 */

 void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
         // Per-component bounds, starting from an inverted range.
         min[0] = min[1] = min[2] = idMath::INFINITY;
         max[0] = max[1] = max[2] = -idMath::INFINITY;

         for ( int i = 0; i < count; i++ ) {
                 const idVec3 &v = src[i];
                 // Components are handled in order 0, 1, 2, lower bound before upper.
                 for ( int axis = 0; axis < 3; axis++ ) {
                         if ( v[axis] < min[axis] ) {
                                 min[axis] = v[axis];
                         }
                         if ( v[axis] > max[axis] ) {
                                 max[axis] = v[axis];
                         }
                 }
         }
 }


 /*

 ============

 idSIMD_Generic::MinMax

 ============

 */

 void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
         // Per-component bounds of the vertices' xyz members, starting from an inverted range.
         min[0] = min[1] = min[2] = idMath::INFINITY;
         max[0] = max[1] = max[2] = -idMath::INFINITY;

         for ( int i = 0; i < count; i++ ) {
                 const idVec3 &v = src[i].xyz;
                 // Components are handled in order 0, 1, 2, lower bound before upper.
                 for ( int axis = 0; axis < 3; axis++ ) {
                         if ( v[axis] < min[axis] ) {
                                 min[axis] = v[axis];
                         }
                         if ( v[axis] > max[axis] ) {
                                 max[axis] = v[axis];
                         }
                 }
         }
 }


 /*

 ============

 idSIMD_Generic::MinMax

 ============

 */

 void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
         // Per-component bounds over the vertices selected by indexes[0..count-1].
         // The index values themselves are not range-checked here.
         min[0] = min[1] = min[2] = idMath::INFINITY;
         max[0] = max[1] = max[2] = -idMath::INFINITY;

         for ( int i = 0; i < count; i++ ) {
                 const idVec3 &v = src[indexes[i]].xyz;
                 // Components are handled in order 0, 1, 2, lower bound before upper.
                 for ( int axis = 0; axis < 3; axis++ ) {
                         if ( v[axis] < min[axis] ) {
                                 min[axis] = v[axis];
                         }
                         if ( v[axis] > max[axis] ) {
                                 max[axis] = v[axis];
                         }
                 }
         }
 }


 /*

 ============

 idSIMD_Generic::Clamp

 ============

 */

 void VPCALL idSIMD_Generic::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
         // The lower bound is tested first; a value that fails both comparisons is
         // copied through unchanged.
         for ( int i = 0; i < count; i++ ) {
                 const float &value = src[i];
                 if ( value < min ) {
                         dst[i] = min;
                 } else if ( value > max ) {
                         dst[i] = max;
                 } else {
                         dst[i] = value;
                 }
         }
 }


 /*

 ============

 idSIMD_Generic::ClampMin

 ============

 */

 void VPCALL idSIMD_Generic::ClampMin( float *dst, const float *src, const float min, const int count ) {
         // Values below min are replaced by min; everything else is copied through.
         for ( int i = 0; i < count; i++ ) {
                 const float &value = src[i];
                 if ( value < min ) {
                         dst[i] = min;
                 } else {
                         dst[i] = value;
                 }
         }
 }


 /*

 ============

 idSIMD_Generic::ClampMax

 ============

 */

 void VPCALL idSIMD_Generic::ClampMax( float *dst, const float *src, const float max, const int count ) {
         // Values above max are replaced by max; everything else is copied through.
         for ( int i = 0; i < count; i++ ) {
                 const float &value = src[i];
                 if ( value > max ) {
                         dst[i] = max;
                 } else {
                         dst[i] = value;
                 }
         }
 }


 /*

 ================

 idSIMD_Generic::Memcpy

 ================

 */

 void VPCALL idSIMD_Generic::Memcpy( void *dst, const void *src, const int count ) {

         // Forwards directly to memcpy; count is passed through as the byte count.
         memcpy( dst, src, count );

 }


 /*

 ================

 idSIMD_Generic::Memset

 ================

 */

 void VPCALL idSIMD_Generic::Memset( void *dst, const int val, const int count ) {

         // Forwards directly to memset; count is passed through as the byte count.
         memset( dst, val, count );

 }


 /*

 ============

 idSIMD_Generic::Zero16

 ============

 */

 void VPCALL idSIMD_Generic::Zero16( float *dst, const int count ) {

         // count is a number of floats; the byte length handed to memset is count * sizeof( float ).
         memset( dst, 0, count * sizeof( float ) );

 }


 /*

 ============

 idSIMD_Generic::Negate16

 ============

 */

 void VPCALL idSIMD_Generic::Negate16( float *dst, const int count ) {
         // Negates dst[0..count-1] in place by toggling bit 31 of each element's
         // 32-bit pattern, which is the sign bit of an IEEE 32-bit float.
         // NOTE(review): the floats are accessed through an unsigned int pointer, as
         // before; this relies on unsigned int being 32 bits wide and on the build
         // tolerating this kind of type punning.
         unsigned int *bits = reinterpret_cast<unsigned int *>(dst);

         // The mask is built from an unsigned literal: shifting a signed 1 into
         // bit 31 overflows int.
         const unsigned int signBit = 1u << 31;

         for ( int i = 0; i < count; i++ ) {
                 bits[i] ^= signBit;
         }
 }


 /*

 ============

 idSIMD_Generic::Copy16

 ============

 */

 void VPCALL idSIMD_Generic::Copy16( float *dst, const float *src, const int count ) {
         // Copies count floats one at a time, in ascending index order.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = src[i];
         }
 }


 /*

 ============

 idSIMD_Generic::Add16

 ============

 */

 void VPCALL idSIMD_Generic::Add16( float *dst, const float *src1, const float *src2, const int count ) {
         // Element-wise sum of the two inputs, in ascending index order.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = src1[i] + src2[i];
         }
 }


 /*

 ============

 idSIMD_Generic::Sub16

 ============

 */

 void VPCALL idSIMD_Generic::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
         // Element-wise difference src1 - src2, in ascending index order.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = src1[i] - src2[i];
         }
 }


 /*

 ============

 idSIMD_Generic::Mul16

 ============

 */

 void VPCALL idSIMD_Generic::Mul16( float *dst, const float *src1, const float constant, const int count ) {
         // Scales every input element by the constant, in ascending index order.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] = src1[i] * constant;
         }
 }


 /*

 ============

 idSIMD_Generic::AddAssign16

 ============

 */

 void VPCALL idSIMD_Generic::AddAssign16( float *dst, const float *src, const int count ) {
         // In-place accumulation: dst[i] grows by src[i].
         for ( int i = 0; i < count; i++ ) {
                 dst[i] += src[i];
         }
 }


 /*

 ============

 idSIMD_Generic::SubAssign16

 ============

 */

 void VPCALL idSIMD_Generic::SubAssign16( float *dst, const float *src, const int count ) {
         // In-place subtraction: dst[i] shrinks by src[i].
         for ( int i = 0; i < count; i++ ) {
                 dst[i] -= src[i];
         }
 }


 /*

 ============

 idSIMD_Generic::MulAssign16

 ============

 */

 void VPCALL idSIMD_Generic::MulAssign16( float *dst, const float constant, const int count ) {
         // In-place scaling of every element by the constant.
         for ( int i = 0; i < count; i++ ) {
                 dst[i] *= constant;
         }
 }


 /*

 ============

 idSIMD_Generic::MatX_MultiplyVecX

 ============

 */

 void VPCALL idSIMD_Generic::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
         // dst[r] = sum over c of mat(r, c) * vec[c].
         // The matrix is read as consecutive rows of colCount floats. Column counts
         // 1..6 have hand-expanded sums; any other count uses the generic inner loop.
         assert( vec.GetSize() >= mat.GetNumColumns() );
         assert( dst.GetSize() >= mat.GetNumRows() );

         const float *row = mat.ToFloatPtr();      // first element of the current row
         const float *v = vec.ToFloatPtr();
         float *out = dst.ToFloatPtr();
         const int rowCount = mat.GetNumRows();
         const int colCount = mat.GetNumColumns();
         int r, c;

         switch( colCount ) {
                 case 1: {
                         for ( r = 0; r < rowCount; r++, row += 1 ) {
                                 out[r] = row[0] * v[0];
                         }
                         break;
                 }
                 case 2: {
                         for ( r = 0; r < rowCount; r++, row += 2 ) {
                                 out[r] = row[0] * v[0] + row[1] * v[1];
                         }
                         break;
                 }
                 case 3: {
                         for ( r = 0; r < rowCount; r++, row += 3 ) {
                                 out[r] = row[0] * v[0] + row[1] * v[1] + row[2] * v[2];
                         }
                         break;
                 }
                 case 4: {
                         for ( r = 0; r < rowCount; r++, row += 4 ) {
                                 out[r] = row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
                                                 row[3] * v[3];
                         }
                         break;
                 }
                 case 5: {
                         for ( r = 0; r < rowCount; r++, row += 5 ) {
                                 out[r] = row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
                                                 row[3] * v[3] + row[4] * v[4];
                         }
                         break;
                 }
                 case 6: {
                         for ( r = 0; r < rowCount; r++, row += 6 ) {
                                 out[r] = row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
                                                 row[3] * v[3] + row[4] * v[4] + row[5] * v[5];
                         }
                         break;
                 }
                 default: {
                         // Generic width: the first product seeds the sum, so element 0 of
                         // each row and of the vector is read unconditionally.
                         for ( r = 0; r < rowCount; r++, row += colCount ) {
                                 float acc = row[0] * v[0];
                                 for ( c = 1; c < colCount; c++ ) {
                                         acc += row[c] * v[c];
                                 }
                                 out[r] = acc;
                         }
                         break;
                 }
         }
 }


 /*

 ============

 idSIMD_Generic::MatX_MultiplyAddVecX

 ============

 */

 void VPCALL idSIMD_Generic::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
         // dst[r] += sum over c of mat(r, c) * vec[c].
         // The matrix is read as consecutive rows of colCount floats. Column counts
         // 1..6 have hand-expanded sums; any other count uses the generic inner loop.
         assert( vec.GetSize() >= mat.GetNumColumns() );
         assert( dst.GetSize() >= mat.GetNumRows() );

         const float *row = mat.ToFloatPtr();      // first element of the current row
         const float *v = vec.ToFloatPtr();
         float *out = dst.ToFloatPtr();
         const int rowCount = mat.GetNumRows();
         const int colCount = mat.GetNumColumns();
         int r, c;

         switch( colCount ) {
                 case 1: {
                         for ( r = 0; r < rowCount; r++, row += 1 ) {
                                 out[r] += row[0] * v[0];
                         }
                         break;
                 }
                 case 2: {
                         for ( r = 0; r < rowCount; r++, row += 2 ) {
                                 out[r] += row[0] * v[0] + row[1] * v[1];
                         }
                         break;
                 }
                 case 3: {
                         for ( r = 0; r < rowCount; r++, row += 3 ) {
                                 out[r] += row[0] * v[0] + row[1] * v[1] + row[2] * v[2];
                         }
                         break;
                 }
                 case 4: {
                         for ( r = 0; r < rowCount; r++, row += 4 ) {
                                 out[r] += row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
                                                 row[3] * v[3];
                         }
                         break;
                 }
                 case 5: {
                         for ( r = 0; r < rowCount; r++, row += 5 ) {
                                 out[r] += row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
                                                 row[3] * v[3] + row[4] * v[4];
                         }
                         break;
                 }
                 case 6: {
                         for ( r = 0; r < rowCount; r++, row += 6 ) {
                                 out[r] += row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
                                                 row[3] * v[3] + row[4] * v[4] + row[5] * v[5];
                         }
                         break;
                 }
                 default: {
                         // Generic width: the row sum is completed in a float temporary and
                         // then added to the destination in one step.
                         for ( r = 0; r < rowCount; r++, row += colCount ) {
                                 float acc = row[0] * v[0];
                                 for ( c = 1; c < colCount; c++ ) {
                                         acc += row[c] * v[c];
                                 }
                                 out[r] += acc;
                         }
                         break;
                 }
         }
 }


 /*

 ============

 idSIMD_Generic::MatX_MultiplySubVecX

 ============

 */

 void VPCALL idSIMD_Generic::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
         // dst[r] -= sum over c of mat(r, c) * vec[c].
         // The matrix is read as consecutive rows of colCount floats. Column counts
         // 1..6 have hand-expanded sums; any other count uses the generic inner loop.
         assert( vec.GetSize() >= mat.GetNumColumns() );
         assert( dst.GetSize() >= mat.GetNumRows() );

         const float *row = mat.ToFloatPtr();      // first element of the current row
         const float *v = vec.ToFloatPtr();
         float *out = dst.ToFloatPtr();
         const int rowCount = mat.GetNumRows();
         const int colCount = mat.GetNumColumns();
         int r, c;

         switch( colCount ) {
                 case 1: {
                         for ( r = 0; r < rowCount; r++, row += 1 ) {
                                 out[r] -= row[0] * v[0];
                         }
                         break;
                 }
                 case 2: {
                         for ( r = 0; r < rowCount; r++, row += 2 ) {
                                 out[r] -= row[0] * v[0] + row[1] * v[1];
                         }
                         break;
                 }
                 case 3: {
                         for ( r = 0; r < rowCount; r++, row += 3 ) {
                                 out[r] -= row[0] * v[0] + row[1] * v[1] + row[2] * v[2];
                         }
                         break;
                 }
                 case 4: {
                         for ( r = 0; r < rowCount; r++, row += 4 ) {
                                 out[r] -= row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
                                                 row[3] * v[3];
                         }
                         break;
                 }
                 case 5: {
                         for ( r = 0; r < rowCount; r++, row += 5 ) {
                                 out[r] -= row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
                                                 row[3] * v[3] + row[4] * v[4];
                         }
                         break;
                 }
                 case 6: {
                         for ( r = 0; r < rowCount; r++, row += 6 ) {
                                 out[r] -= row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
                                                 row[3] * v[3] + row[4] * v[4] + row[5] * v[5];
                         }
                         break;
                 }
                 default: {
                         // Generic width: the row sum is completed in a float temporary and
                         // then subtracted from the destination in one step.
                         for ( r = 0; r < rowCount; r++, row += colCount ) {
                                 float acc = row[0] * v[0];
                                 for ( c = 1; c < colCount; c++ ) {
                                         acc += row[c] * v[c];
                                 }
                                 out[r] -= acc;
                         }
                         break;
                 }
         }
 }


 /*

 ============

 idSIMD_Generic::MatX_TransposeMultiplyVecX

 ============

 */

 void VPCALL idSIMD_Generic::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
         // dst[c] = sum over r of mat(r, c) * vec[r], i.e. the transposed matrix
         // times the vector. The matrix is read as consecutive rows of "stride"
         // floats, so walking down a column means stepping by stride.
         // Row counts 1..6 have hand-expanded sums; any other count uses the generic loop.
         assert( vec.GetSize() >= mat.GetNumRows() );
         assert( dst.GetSize() >= mat.GetNumColumns() );

         const float *col = mat.ToFloatPtr();      // top element of the current column
         const float *v = vec.ToFloatPtr();
         float *out = dst.ToFloatPtr();
         const int stride = mat.GetNumColumns();
         int i, k;

         switch( mat.GetNumRows() ) {
                 case 1: {
                         for ( i = 0; i < stride; i++, col++ ) {
                                 out[i] = col[0] * v[0];
                         }
                         break;
                 }
                 case 2: {
                         for ( i = 0; i < stride; i++, col++ ) {
                                 out[i] = col[0] * v[0] + col[stride] * v[1];
                         }
                         break;
                 }
                 case 3: {
                         for ( i = 0; i < stride; i++, col++ ) {
                                 out[i] = col[0] * v[0] + col[stride] * v[1] + col[2*stride] * v[2];
                         }
                         break;
                 }
                 case 4: {
                         for ( i = 0; i < stride; i++, col++ ) {
                                 out[i] = col[0] * v[0] + col[stride] * v[1] + col[2*stride] * v[2] +
                                                 col[3*stride] * v[3];
                         }
                         break;
                 }
                 case 5: {
                         for ( i = 0; i < stride; i++, col++ ) {
                                 out[i] = col[0] * v[0] + col[stride] * v[1] + col[2*stride] * v[2] +
                                                 col[3*stride] * v[3] + col[4*stride] * v[4];
                         }
                         break;
                 }
                 case 6: {
                         for ( i = 0; i < stride; i++, col++ ) {
                                 out[i] = col[0] * v[0] + col[stride] * v[1] + col[2*stride] * v[2] +
                                                 col[3*stride] * v[3] + col[4*stride] * v[4] + col[5*stride] * v[5];
                         }
                         break;
                 }
                 default: {
                         // Generic height: the first product seeds the sum, so the top
                         // element of each column and vec[0] are read unconditionally.
                         const int rowCount = mat.GetNumRows();
                         for ( i = 0; i < stride; i++, col++ ) {
                                 float acc = col[0] * v[0];
                                 for ( k = 1; k < rowCount; k++ ) {
                                         acc += col[k*stride] * v[k];
                                 }
                                 out[i] = acc;
                         }
                         break;
                 }
         }
 }


 /*
 ============
 idSIMD_Generic::MatX_TransposeMultiplyAddVecX

         dst += transpose( mat ) * vec

         For every column c of mat, the sum over all rows r of
         mat[r][c] * vec[r] is formed first and then added to dst[c].
         mat is walked as a packed row-major array with GetNumColumns()
         floats per row. Row counts 1 through 6 use written-out sums;
         every other row count takes the looped path.
 ============
 */
 void VPCALL idSIMD_Generic::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
         assert( vec.GetSize() >= mat.GetNumRows() );
         assert( dst.GetSize() >= mat.GetNumColumns() );

         // Starts at the first matrix entry; the unrolled cases advance it one column per iteration.
         const float *column = mat.ToFloatPtr();
         const float *in = vec.ToFloatPtr();
         float *out = dst.ToFloatPtr();
         const int stride = mat.GetNumColumns();         // floats between vertically adjacent entries
         const int numRows = mat.GetNumRows();
         int c;

         switch( numRows ) {
                 case 1: {
                         for ( c = 0; c < stride; c++, column++ ) {
                                 out[c] += column[0] * in[0];
                         }
                         break;
                 }
                 case 2: {
                         for ( c = 0; c < stride; c++, column++ ) {
                                 out[c] += column[0] * in[0] + column[stride] * in[1];
                         }
                         break;
                 }
                 case 3: {
                         for ( c = 0; c < stride; c++, column++ ) {
                                 out[c] += column[0] * in[0] + column[stride] * in[1] + column[2*stride] * in[2];
                         }
                         break;
                 }
                 case 4: {
                         for ( c = 0; c < stride; c++, column++ ) {
                                 out[c] += column[0] * in[0] + column[stride] * in[1] + column[2*stride] * in[2] +
                                                 column[3*stride] * in[3];
                         }
                         break;
                 }
                 case 5: {
                         for ( c = 0; c < stride; c++, column++ ) {
                                 out[c] += column[0] * in[0] + column[stride] * in[1] + column[2*stride] * in[2] +
                                                 column[3*stride] * in[3] + column[4*stride] * in[4];
                         }
                         break;
                 }
                 case 6: {
                         for ( c = 0; c < stride; c++, column++ ) {
                                 out[c] += column[0] * in[0] + column[stride] * in[1] + column[2*stride] * in[2] +
                                                 column[3*stride] * in[3] + column[4*stride] * in[4] + column[5*stride] * in[5];
                         }
                         break;
                 }
                 default: {
                         // General row count: walk down each column, accumulating in single precision.
                         for ( c = 0; c < stride; c++ ) {
                                 const float *entry = column + c;
                                 float sum = entry[0] * in[0];
                                 for ( int r = 1; r < numRows; r++ ) {
                                         entry += stride;
                                         sum += entry[0] * in[r];
                                 }
                                 out[c] += sum;
                         }
                         break;
                 }
         }
 }


 /*
 ============
 idSIMD_Generic::MatX_TransposeMultiplySubVecX

         dst -= transpose( mat ) * vec

         For every column c of mat, the sum over all rows r of
         mat[r][c] * vec[r] is formed first and then subtracted from dst[c].
         mat is walked as a packed row-major array with GetNumColumns()
         floats per row. Row counts 1 through 6 use written-out sums;
         every other row count takes the looped path.
 ============
 */
 void VPCALL idSIMD_Generic::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
         assert( vec.GetSize() >= mat.GetNumRows() );
         assert( dst.GetSize() >= mat.GetNumColumns() );

         // Starts at the first matrix entry; the unrolled cases advance it one column per iteration.
         const float *column = mat.ToFloatPtr();
         const float *in = vec.ToFloatPtr();
         float *out = dst.ToFloatPtr();
         const int stride = mat.GetNumColumns();         // floats between vertically adjacent entries
         const int numRows = mat.GetNumRows();
         int c;

         switch( numRows ) {
                 case 1: {
                         for ( c = 0; c < stride; c++, column++ ) {
                                 out[c] -= column[0] * in[0];
                         }
                         break;
                 }
                 case 2: {
                         for ( c = 0; c < stride; c++, column++ ) {
                                 out[c] -= column[0] * in[0] + column[stride] * in[1];
                         }
                         break;
                 }
                 case 3: {
                         for ( c = 0; c < stride; c++, column++ ) {
                                 out[c] -= column[0] * in[0] + column[stride] * in[1] + column[2*stride] * in[2];
                         }
                         break;
                 }
                 case 4: {
                         for ( c = 0; c < stride; c++, column++ ) {
                                 out[c] -= column[0] * in[0] + column[stride] * in[1] + column[2*stride] * in[2] +
                                                 column[3*stride] * in[3];
                         }
                         break;
                 }
                 case 5: {
                         for ( c = 0; c < stride; c++, column++ ) {
                                 out[c] -= column[0] * in[0] + column[stride] * in[1] + column[2*stride] * in[2] +
                                                 column[3*stride] * in[3] + column[4*stride] * in[4];
                         }
                         break;
                 }
                 case 6: {
                         for ( c = 0; c < stride; c++, column++ ) {
                                 out[c] -= column[0] * in[0] + column[stride] * in[1] + column[2*stride] * in[2] +
                                                 column[3*stride] * in[3] + column[4*stride] * in[4] + column[5*stride] * in[5];
                         }
                         break;
                 }
                 default: {
                         // General row count: walk down each column, accumulating in single precision.
                         for ( c = 0; c < stride; c++ ) {
                                 const float *entry = column + c;
                                 float sum = entry[0] * in[0];
                                 for ( int r = 1; r < numRows; r++ ) {
                                         entry += stride;
                                         sum += entry[0] * in[r];
                                 }
                                 out[c] -= sum;
                         }
                         break;
                 }
         }
 }


 /*
 ============
 Generic_MultiplyInner6

         Multiplies a ROWS x 6 matrix by a 6 x COLS matrix whose outer
         dimensions are compile-time constants, so the loops can be unrolled.
         All three buffers are accessed as packed row-major float arrays:
         m1Ptr holds ROWS * 6 floats, m2Ptr holds 6 * COLS floats and
         dstPtr receives ROWS * COLS floats.
 ============
 */
 template< int ROWS, int COLS >
 static void Generic_MultiplyInner6( float *dstPtr, const float *m1Ptr, const float *m2Ptr ) {
         for ( int i = 0; i < ROWS; i++ ) {
                 for ( int j = 0; j < COLS; j++ ) {
                         *dstPtr++ = m1Ptr[0] * m2Ptr[ 0 * COLS + j ]
                                         + m1Ptr[1] * m2Ptr[ 1 * COLS + j ]
                                         + m1Ptr[2] * m2Ptr[ 2 * COLS + j ]
                                         + m1Ptr[3] * m2Ptr[ 3 * COLS + j ]
                                         + m1Ptr[4] * m2Ptr[ 4 * COLS + j ]
                                         + m1Ptr[5] * m2Ptr[ 5 * COLS + j ];
                 }
                 m1Ptr += 6;
         }
 }

 /*
 ============
 idSIMD_Generic::MatX_MultiplyMatX

         dst = m1 * m2

         optimizes the following matrix multiplications:

         NxN * Nx6
         6xN * Nx6
         Nx6 * 6xN
         6x6 * 6xN

         with N in the range [1-6].

         Any other shape is still computed, just through a less specialized
         loop. The switch is on the shared inner dimension (columns of m1,
         rows of m2); k is the number of result rows and l the number of
         result columns. The result is written to dst as k * l packed
         row-major floats.
 ============
 */
 void VPCALL idSIMD_Generic::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
         int i, j, k, l, n;
         float *dstPtr;
         const float *m1Ptr, *m2Ptr;
         double sum;

         assert( m1.GetNumColumns() == m2.GetNumRows() );

         dstPtr = dst.ToFloatPtr();
         m1Ptr = m1.ToFloatPtr();
         m2Ptr = m2.ToFloatPtr();
         k = m1.GetNumRows();
         l = m2.GetNumColumns();
         const int inner = m1.GetNumColumns();

         switch( inner ) {
                 case 1: {
                         if ( l == 6 ) {
                                 for ( i = 0; i < k; i++ ) {             // Nx1 * 1x6
                                         *dstPtr++ = m1Ptr[i] * m2Ptr[0];
                                         *dstPtr++ = m1Ptr[i] * m2Ptr[1];
                                         *dstPtr++ = m1Ptr[i] * m2Ptr[2];
                                         *dstPtr++ = m1Ptr[i] * m2Ptr[3];
                                         *dstPtr++ = m1Ptr[i] * m2Ptr[4];
                                         *dstPtr++ = m1Ptr[i] * m2Ptr[5];
                                 }
                                 return;
                         }
                         for ( i = 0; i < k; i++ ) {
                                 m2Ptr = m2.ToFloatPtr();
                                 for ( j = 0; j < l; j++ ) {
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[0];
                                         m2Ptr++;
                                 }
                                 m1Ptr++;
                         }
                         break;
                 }
                 case 2: {
                         if ( l == 6 ) {
                                 for ( i = 0; i < k; i++ ) {             // Nx2 * 2x6
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11];
                                         m1Ptr += 2;
                                 }
                                 return;
                         }
                         for ( i = 0; i < k; i++ ) {
                                 m2Ptr = m2.ToFloatPtr();
                                 for ( j = 0; j < l; j++ ) {
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l];
                                         m2Ptr++;
                                 }
                                 m1Ptr += 2;
                         }
                         break;
                 }
                 case 3: {
                         if ( l == 6 ) {
                                 for ( i = 0; i < k; i++ ) {             // Nx3 * 3x6
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17];
                                         m1Ptr += 3;
                                 }
                                 return;
                         }
                         for ( i = 0; i < k; i++ ) {
                                 m2Ptr = m2.ToFloatPtr();
                                 for ( j = 0; j < l; j++ ) {
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l];
                                         m2Ptr++;
                                 }
                                 m1Ptr += 3;
                         }
                         break;
                 }
                 case 4: {
                         if ( l == 6 ) {
                                 for ( i = 0; i < k; i++ ) {             // Nx4 * 4x6
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12] + m1Ptr[3] * m2Ptr[18];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13] + m1Ptr[3] * m2Ptr[19];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14] + m1Ptr[3] * m2Ptr[20];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15] + m1Ptr[3] * m2Ptr[21];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16] + m1Ptr[3] * m2Ptr[22];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17] + m1Ptr[3] * m2Ptr[23];
                                         m1Ptr += 4;
                                 }
                                 return;
                         }
                         for ( i = 0; i < k; i++ ) {
                                 m2Ptr = m2.ToFloatPtr();
                                 for ( j = 0; j < l; j++ ) {
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
                                                                          m1Ptr[3] * m2Ptr[3*l];
                                         m2Ptr++;
                                 }
                                 m1Ptr += 4;
                         }
                         break;
                 }
                 case 5: {
                         if ( l == 6 ) {
                                 for ( i = 0; i < k; i++ ) {             // Nx5 * 5x6
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12] + m1Ptr[3] * m2Ptr[18] + m1Ptr[4] * m2Ptr[24];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13] + m1Ptr[3] * m2Ptr[19] + m1Ptr[4] * m2Ptr[25];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14] + m1Ptr[3] * m2Ptr[20] + m1Ptr[4] * m2Ptr[26];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15] + m1Ptr[3] * m2Ptr[21] + m1Ptr[4] * m2Ptr[27];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16] + m1Ptr[3] * m2Ptr[22] + m1Ptr[4] * m2Ptr[28];
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17] + m1Ptr[3] * m2Ptr[23] + m1Ptr[4] * m2Ptr[29];
                                         m1Ptr += 5;
                                 }
                                 return;
                         }
                         for ( i = 0; i < k; i++ ) {
                                 m2Ptr = m2.ToFloatPtr();
                                 for ( j = 0; j < l; j++ ) {
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
                                                                          m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l];
                                         m2Ptr++;
                                 }
                                 m1Ptr += 5;
                         }
                         break;
                 }
                 case 6: {
                         // Fixed-size kernels for square results (k == l) and for a 6-row m1.
                         // Each case must end in return or break. Falling through to a
                         // kernel sized for more rows than m1 has would read past the end
                         // of m1 and write past the end of dst.
                         switch( k ) {
                                 case 1: {
                                         if ( l == 1 ) {         // 1x6 * 6x1
                                                 Generic_MultiplyInner6<1, 1>( dstPtr, m1Ptr, m2Ptr );
                                                 return;
                                         }
                                         break;
                                 }
                                 case 2: {
                                         if ( l == 2 ) {         // 2x6 * 6x2
                                                 Generic_MultiplyInner6<2, 2>( dstPtr, m1Ptr, m2Ptr );
                                                 return;
                                         }
                                         break;
                                 }
                                 case 3: {
                                         if ( l == 3 ) {         // 3x6 * 6x3
                                                 Generic_MultiplyInner6<3, 3>( dstPtr, m1Ptr, m2Ptr );
                                                 return;
                                         }
                                         break;
                                 }
                                 case 4: {
                                         if ( l == 4 ) {         // 4x6 * 6x4
                                                 Generic_MultiplyInner6<4, 4>( dstPtr, m1Ptr, m2Ptr );
                                                 return;
                                         }
                                         break;          // 4x6 with l != 4 uses the generic loop below
                                 }
                                 case 5: {
                                         if ( l == 5 ) {         // 5x6 * 6x5
                                                 Generic_MultiplyInner6<5, 5>( dstPtr, m1Ptr, m2Ptr );
                                                 return;
                                         }
                                         break;          // 5x6 with l != 5 uses the generic loop below
                                 }
                                 case 6: {
                                         switch( l ) {
                                                 case 1: {               // 6x6 * 6x1
                                                         Generic_MultiplyInner6<6, 1>( dstPtr, m1Ptr, m2Ptr );
                                                         return;
                                                 }
                                                 case 2: {               // 6x6 * 6x2
                                                         Generic_MultiplyInner6<6, 2>( dstPtr, m1Ptr, m2Ptr );
                                                         return;
                                                 }
                                                 case 3: {               // 6x6 * 6x3
                                                         Generic_MultiplyInner6<6, 3>( dstPtr, m1Ptr, m2Ptr );
                                                         return;
                                                 }
                                                 case 4: {               // 6x6 * 6x4
                                                         Generic_MultiplyInner6<6, 4>( dstPtr, m1Ptr, m2Ptr );
                                                         return;
                                                 }
                                                 case 5: {               // 6x6 * 6x5
                                                         Generic_MultiplyInner6<6, 5>( dstPtr, m1Ptr, m2Ptr );
                                                         return;
                                                 }
                                                 case 6: {               // 6x6 * 6x6
                                                         Generic_MultiplyInner6<6, 6>( dstPtr, m1Ptr, m2Ptr );
                                                         return;
                                                 }
                                         }
                                         break;          // 6x6 * 6xl with l outside [1-6]
                                 }
                         }
                         // Nx6 * 6xl for every shape without a fixed-size kernel.
                         for ( i = 0; i < k; i++ ) {
                                 m2Ptr = m2.ToFloatPtr();
                                 for ( j = 0; j < l; j++ ) {
                                         *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
                                                                          m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l];
                                         m2Ptr++;
                                 }
                                 m1Ptr += 6;
                         }
                         break;
                 }
                 default: {
                         // Arbitrary inner dimension: accumulate each dot product in double
                         // precision and narrow to float when storing.
                         for ( i = 0; i < k; i++ ) {
                                 for ( j = 0; j < l; j++ ) {
                                         m2Ptr = m2.ToFloatPtr() + j;
                                         sum = m1Ptr[0] * m2Ptr[0];
                                         for ( n = 1; n < inner; n++ ) {
                                                 m2Ptr += l;
                                                 sum += m1Ptr[n] * m2Ptr[0];
                                         }
                                         *dstPtr++ = sum;
                                 }
                                 m1Ptr += inner;
                         }
                         break;
                 }
         }
 }


 /*
 ============
 idSIMD_Generic::MatX_TransposeMultiplyMatX

   computes dst = transpose( m1 ) * m2 for row-major float matrices

   m1 and m2 must have the same number of rows
   dst is not resized here; m1.GetNumColumns() * m2.GetNumColumns()
   floats are written consecutively through dst.ToFloatPtr()

   the following transpose matrix multiplications have
   dedicated code paths with constant strides:

        Nx6 * NxN
        6xN * 6x6

   with N in the range [1-6].
 ============
 */
 void VPCALL idSIMD_Generic::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
         assert( m1.GetNumRows() == m2.GetNumRows() );

         const float *a = m1.ToFloatPtr();               // m1 data, aCols floats per row
         const float *b = m2.ToFloatPtr();               // m2 data, bCols floats per row
         float *out = dst.ToFloatPtr();                  // next result element to write
         const int aCols = m1.GetNumColumns();           // number of result rows
         const int bCols = m2.GetNumColumns();           // number of result columns
         const int numRows = m1.GetNumRows();            // number of terms in every dot product
         int i, j, n;

         // result element (i,j) is the dot product of column i of m1 with column j of m2
         switch( numRows ) {
                 case 1:
                         if ( aCols == 6 && bCols == 1 ) {                       // 1x6 * 1x1
                                 for ( i = 0; i < 6; i++ ) {
                                         out[i] = a[i] * b[0];
                                 }
                                 return;
                         }
                         for ( i = 0; i < aCols; i++ ) {
                                 for ( j = 0; j < bCols; j++ ) {
                                         *out++ = a[i] * b[j];
                                 }
                         }
                         break;
                 case 2:
                         if ( aCols == 6 && bCols == 2 ) {                       // 2x6 * 2x2
                                 for ( i = 0; i < 6; i++ ) {
                                         out[0] = a[0*6+i] * b[0*2+0] + a[1*6+i] * b[1*2+0];
                                         out[1] = a[0*6+i] * b[0*2+1] + a[1*6+i] * b[1*2+1];
                                         out += 2;
                                 }
                                 return;
                         }
                         for ( i = 0; i < aCols; i++ ) {
                                 for ( j = 0; j < bCols; j++ ) {
                                         *out++ = a[i] * b[j] + a[aCols+i] * b[bCols+j];
                                 }
                         }
                         break;
                 case 3:
                         if ( aCols == 6 && bCols == 3 ) {                       // 3x6 * 3x3
                                 for ( i = 0; i < 6; i++ ) {
                                         out[0] = a[0*6+i] * b[0*3+0] + a[1*6+i] * b[1*3+0] + a[2*6+i] * b[2*3+0];
                                         out[1] = a[0*6+i] * b[0*3+1] + a[1*6+i] * b[1*3+1] + a[2*6+i] * b[2*3+1];
                                         out[2] = a[0*6+i] * b[0*3+2] + a[1*6+i] * b[1*3+2] + a[2*6+i] * b[2*3+2];
                                         out += 3;
                                 }
                                 return;
                         }
                         for ( i = 0; i < aCols; i++ ) {
                                 for ( j = 0; j < bCols; j++ ) {
                                         *out++ = a[i] * b[j] + a[aCols+i] * b[bCols+j] + a[2*aCols+i] * b[2*bCols+j];
                                 }
                         }
                         break;
                 case 4:
                         if ( aCols == 6 && bCols == 4 ) {                       // 4x6 * 4x4
                                 for ( i = 0; i < 6; i++ ) {
                                         out[0] = a[0*6+i] * b[0*4+0] + a[1*6+i] * b[1*4+0] + a[2*6+i] * b[2*4+0] + a[3*6+i] * b[3*4+0];
                                         out[1] = a[0*6+i] * b[0*4+1] + a[1*6+i] * b[1*4+1] + a[2*6+i] * b[2*4+1] + a[3*6+i] * b[3*4+1];
                                         out[2] = a[0*6+i] * b[0*4+2] + a[1*6+i] * b[1*4+2] + a[2*6+i] * b[2*4+2] + a[3*6+i] * b[3*4+2];
                                         out[3] = a[0*6+i] * b[0*4+3] + a[1*6+i] * b[1*4+3] + a[2*6+i] * b[2*4+3] + a[3*6+i] * b[3*4+3];
                                         out += 4;
                                 }
                                 return;
                         }
                         for ( i = 0; i < aCols; i++ ) {
                                 for ( j = 0; j < bCols; j++ ) {
                                         *out++ = a[i] * b[j] + a[aCols+i] * b[bCols+j] + a[2*aCols+i] * b[2*bCols+j] +
                                                                 a[3*aCols+i] * b[3*bCols+j];
                                 }
                         }
                         break;
                 case 5:
                         if ( aCols == 6 && bCols == 5 ) {                       // 5x6 * 5x5
                                 for ( i = 0; i < 6; i++ ) {
                                         out[0] = a[0*6+i] * b[0*5+0] + a[1*6+i] * b[1*5+0] + a[2*6+i] * b[2*5+0] + a[3*6+i] * b[3*5+0] + a[4*6+i] * b[4*5+0];
                                         out[1] = a[0*6+i] * b[0*5+1] + a[1*6+i] * b[1*5+1] + a[2*6+i] * b[2*5+1] + a[3*6+i] * b[3*5+1] + a[4*6+i] * b[4*5+1];
                                         out[2] = a[0*6+i] * b[0*5+2] + a[1*6+i] * b[1*5+2] + a[2*6+i] * b[2*5+2] + a[3*6+i] * b[3*5+2] + a[4*6+i] * b[4*5+2];
                                         out[3] = a[0*6+i] * b[0*5+3] + a[1*6+i] * b[1*5+3] + a[2*6+i] * b[2*5+3] + a[3*6+i] * b[3*5+3] + a[4*6+i] * b[4*5+3];
                                         out[4] = a[0*6+i] * b[0*5+4] + a[1*6+i] * b[1*5+4] + a[2*6+i] * b[2*5+4] + a[3*6+i] * b[3*5+4] + a[4*6+i] * b[4*5+4];
                                         out += 5;
                                 }
                                 return;
                         }
                         for ( i = 0; i < aCols; i++ ) {
                                 for ( j = 0; j < bCols; j++ ) {
                                         *out++ = a[i] * b[j] + a[aCols+i] * b[bCols+j] + a[2*aCols+i] * b[2*bCols+j] +
                                                                 a[3*aCols+i] * b[3*bCols+j] + a[4*aCols+i] * b[4*bCols+j];
                                 }
                         }
                         break;
                 case 6:
                         if ( bCols == 6 ) {
                                 // any aCols outside [1-6] leaves this switch and uses the loop below
                                 switch( aCols ) {
                                         case 1:                                         // 6x1 * 6x6
                                                 for ( j = 0; j < 6; j++ ) {
                                                         *out++ = a[0] * b[0*6+j] +
                                                                         a[1] * b[1*6+j] +
                                                                         a[2] * b[2*6+j] +
                                                                         a[3] * b[3*6+j] +
                                                                         a[4] * b[4*6+j] +
                                                                         a[5] * b[5*6+j];
                                                 }
                                                 return;
                                         case 2:                                         // 6x2 * 6x6
                                                 for ( i = 0; i < 2; i++ ) {
                                                         for ( j = 0; j < 6; j++ ) {
                                                                 *out++ = a[0*2+i] * b[0*6+j] +
                                                                                 a[1*2+i] * b[1*6+j] +
                                                                                 a[2*2+i] * b[2*6+j] +
                                                                                 a[3*2+i] * b[3*6+j] +
                                                                                 a[4*2+i] * b[4*6+j] +
                                                                                 a[5*2+i] * b[5*6+j];
                                                         }
                                                 }
                                                 return;
                                         case 3:                                         // 6x3 * 6x6
                                                 for ( i = 0; i < 3; i++ ) {
                                                         for ( j = 0; j < 6; j++ ) {
                                                                 *out++ = a[0*3+i] * b[0*6+j] +
                                                                                 a[1*3+i] * b[1*6+j] +
                                                                                 a[2*3+i] * b[2*6+j] +
                                                                                 a[3*3+i] * b[3*6+j] +
                                                                                 a[4*3+i] * b[4*6+j] +
                                                                                 a[5*3+i] * b[5*6+j];
                                                         }
                                                 }
                                                 return;
                                         case 4:                                         // 6x4 * 6x6
                                                 for ( i = 0; i < 4; i++ ) {
                                                         for ( j = 0; j < 6; j++ ) {
                                                                 *out++ = a[0*4+i] * b[0*6+j] +
                                                                                 a[1*4+i] * b[1*6+j] +
                                                                                 a[2*4+i] * b[2*6+j] +
                                                                                 a[3*4+i] * b[3*6+j] +
                                                                                 a[4*4+i] * b[4*6+j] +
                                                                                 a[5*4+i] * b[5*6+j];
                                                         }
                                                 }
                                                 return;
                                         case 5:                                         // 6x5 * 6x6
                                                 for ( i = 0; i < 5; i++ ) {
                                                         for ( j = 0; j < 6; j++ ) {
                                                                 *out++ = a[0*5+i] * b[0*6+j] +
                                                                                 a[1*5+i] * b[1*6+j] +
                                                                                 a[2*5+i] * b[2*6+j] +
                                                                                 a[3*5+i] * b[3*6+j] +
                                                                                 a[4*5+i] * b[4*6+j] +
                                                                                 a[5*5+i] * b[5*6+j];
                                                         }
                                                 }
                                                 return;
                                         case 6:                                         // 6x6 * 6x6
                                                 for ( i = 0; i < 6; i++ ) {
                                                         for ( j = 0; j < 6; j++ ) {
                                                                 *out++ = a[0*6+i] * b[0*6+j] +
                                                                                 a[1*6+i] * b[1*6+j] +
                                                                                 a[2*6+i] * b[2*6+j] +
                                                                                 a[3*6+i] * b[3*6+j] +
                                                                                 a[4*6+i] * b[4*6+j] +
                                                                                 a[5*6+i] * b[5*6+j];
                                                         }
                                                 }
                                                 return;
                                 }
                         }
                         for ( i = 0; i < aCols; i++ ) {
                                 for ( j = 0; j < bCols; j++ ) {
                                         *out++ = a[i] * b[j] + a[aCols+i] * b[bCols+j] + a[2*aCols+i] * b[2*bCols+j] +
                                                                 a[3*aCols+i] * b[3*bCols+j] + a[4*aCols+i] * b[4*bCols+j] + a[5*aCols+i] * b[5*bCols+j];
                                 }
                         }
                         break;
                 default:
                         // fully generic path, accumulates every dot product in a double
                         for ( i = 0; i < aCols; i++ ) {
                                 for ( j = 0; j < bCols; j++ ) {
                                         double sum = a[i] * b[j];
                                         for ( n = 1; n < numRows; n++ ) {
                                                 sum += a[n*aCols+i] * b[n*bCols+j];
                                         }
                                         *out++ = sum;
                                 }
                         }
                         break;
         }
 }


 /*
 ============
 idSIMD_Generic::MatX_LowerTriangularSolve

   solves x in Lx = b for the n * n sub-matrix of L
   if skip > 0 the first skip elements of x are assumed to be valid already
   L has to be a lower triangular matrix with (implicit) ones on the diagonal
   x == b is allowed

   only the elements of L below the diagonal are read
   nothing is written if skip >= n
 ============
 */
 void VPCALL idSIMD_Generic::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
 #if 1

         int nc;
         const float *lptr;

         if ( skip >= n ) {
                 return;
         }

         lptr = L.ToFloatPtr();
         nc = L.GetNumColumns();         // row stride of L in floats

         // unrolled cases for n < 8
         if ( n < 8 ) {
                 // n and skip are packed into one switch value: the switch is entered at row 'skip'
                 // and every case intentionally falls through until row n-1 has been solved
                 #define NSKIP( n, s )   ((n<<3)|(s&7))
                 switch( NSKIP( n, skip ) ) {
                         case NSKIP( 1, 0 ): x[0] = b[0];
                                 return;
                         case NSKIP( 2, 0 ): x[0] = b[0];
                         case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
                                 return;
                         case NSKIP( 3, 0 ): x[0] = b[0];
                         case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
                         case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
                                 return;
                         case NSKIP( 4, 0 ): x[0] = b[0];
                         case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
                         case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
                         case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
                                 return;
                         case NSKIP( 5, 0 ): x[0] = b[0];
                         case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
                         case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
                         case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
                         case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
                                 return;
                         case NSKIP( 6, 0 ): x[0] = b[0];
                         case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
                         case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
                         case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
                         case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
                         case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
                                 return;
                         case NSKIP( 7, 0 ): x[0] = b[0];
                         case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
                         case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
                         case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
                         case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
                         case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
                         case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
                                 return;
                 }
                 return;
         }

         // process first 4 rows
         // the cases intentionally fall through; a skip of 4 or more leaves skip untouched
         switch( skip ) {
                 case 0: x[0] = b[0];
                 case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
                 case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
                 case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
                                 skip = 4;
         }

         // continue with row 'skip', lptr is advanced by one row per iteration
         lptr = L[skip];

         int i, j;
         // four independent partial sums of the dot product of row i with x[0..i-1]
         // (no 'register' storage class: it was deprecated in C++11 and removed in C++17)
         double s0, s1, s2, s3;

         for ( i = skip; i < n; i++ ) {
                 s0 = lptr[0] * x[0];
                 s1 = lptr[1] * x[1];
                 s2 = lptr[2] * x[2];
                 s3 = lptr[3] * x[3];
                 // eight columns per iteration
                 for ( j = 4; j < i-7; j += 8 ) {
                         s0 += lptr[j+0] * x[j+0];
                         s1 += lptr[j+1] * x[j+1];
                         s2 += lptr[j+2] * x[j+2];
                         s3 += lptr[j+3] * x[j+3];
                         s0 += lptr[j+4] * x[j+4];
                         s1 += lptr[j+5] * x[j+5];
                         s2 += lptr[j+6] * x[j+6];
                         s3 += lptr[j+7] * x[j+7];
                 }
                 // the remaining i - j columns (0 to 7), the cases intentionally fall through
                 switch( i - j ) {
                         NODEFAULT;
                         case 7: s0 += lptr[j+6] * x[j+6];
                         case 6: s1 += lptr[j+5] * x[j+5];
                         case 5: s2 += lptr[j+4] * x[j+4];
                         case 4: s3 += lptr[j+3] * x[j+3];
                         case 3: s0 += lptr[j+2] * x[j+2];
                         case 2: s1 += lptr[j+1] * x[j+1];
                         case 1: s2 += lptr[j+0] * x[j+0];
                         case 0: break;
                 }
                 // x[i] = b[i] - ( s0 + s1 + s2 + s3 )
                 double sum;
                 sum = s3;
                 sum += s2;
                 sum += s1;
                 sum += s0;
                 sum -= b[i];
                 x[i] = -sum;
                 lptr += nc;
         }

 #else

         // straightforward reference version of the forward substitution

         int i, j;
         const float *lptr;
         double sum;

         for ( i = skip; i < n; i++ ) {
                 sum = b[i];
                 lptr = L[i];
                 for ( j = 0; j < i; j++ ) {
                         sum -= lptr[j] * x[j];
                 }
                 x[i] = sum;
         }

 #endif
 }


 /*
 ============
 idSIMD_Generic::MatX_LowerTriangularSolveTranspose

   solves x in L'x = b for the n * n sub-matrix of L
   L has to be a lower triangular matrix with (implicit) ones on the diagonal
   x == b is allowed

   the solution is computed from the last element of x back to the first
   only the elements of L below the diagonal are read
 ============
 */
 void VPCALL idSIMD_Generic::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
 #if 1

         int nc;
         const float *lptr;

         lptr = L.ToFloatPtr();
         nc = L.GetNumColumns();         // row stride of L in floats

         // unrolled cases for n < 8
         if ( n < 8 ) {
                 switch( n ) {
                         case 0:
                                 return;
                         case 1:
                                 x[0] = b[0];
                                 return;
                         case 2:
                                 x[1] = b[1];
                                 x[0] = b[0] - lptr[1*nc+0] * x[1];
                                 return;
                         case 3:
                                 x[2] = b[2];
                                 x[1] = b[1] - lptr[2*nc+1] * x[2];
                                 x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
                                 return;
                         case 4:
                                 x[3] = b[3];
                                 x[2] = b[2] - lptr[3*nc+2] * x[3];
                                 x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
                                 x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
                                 return;
                         case 5:
                                 x[4] = b[4];
                                 x[3] = b[3] - lptr[4*nc+3] * x[4];
                                 x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
                                 x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
                                 x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
                                 return;
                         case 6:
                                 x[5] = b[5];
                                 x[4] = b[4] - lptr[5*nc+4] * x[5];
                                 x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
                                 x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
                                 x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
                                 x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
                                 return;
                         case 7:
                                 x[6] = b[6];
                                 x[5] = b[5] - lptr[6*nc+5] * x[6];
                                 x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
                                 x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
                                 x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
                                 x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
                                 x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
                                 return;
                 }
                 // a negative n matches no case and solves nothing
                 return;
         }

         int i, j;
         // accumulators for the four unknowns solved per iteration
         // (no 'register' storage class: it was deprecated in C++11 and removed in C++17)
         double s0, s1, s2, s3;
         float *xptr;

         // inside the loop lptr addresses row i, column i-4 of L and xptr addresses x[i]
         lptr = L.ToFloatPtr() + n * nc + n - 4;
         xptr = x + n;

         // process 4 rows at a time
         for ( i = n; i >= 4; i -= 4 ) {
                 s0 = b[i-4];
                 s1 = b[i-3];
                 s2 = b[i-2];
                 s3 = b[i-1];
                 // process 4x4 blocks
                 // subtracts the contribution of the already solved x[i] .. x[n-1]
                 for ( j = 0; j < n-i; j += 4 ) {
                         s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
                         s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
                         s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
                         s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
                         s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
                         s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
                         s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
                         s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
                         s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
                         s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
                         s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
                         s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
                         s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
                         s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
                         s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
                         s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
                 }
                 // process left over of the 4 rows
                 // back substitution among the four unknowns themselves, s3 is final first
                 s0 -= lptr[0-1*nc] * s3;
                 s1 -= lptr[1-1*nc] * s3;
                 s2 -= lptr[2-1*nc] * s3;
                 s0 -= lptr[0-2*nc] * s2;
                 s1 -= lptr[1-2*nc] * s2;
                 s0 -= lptr[0-3*nc] * s1;
                 // store result
                 xptr[-4] = s0;
                 xptr[-3] = s1;
                 xptr[-2] = s2;
                 xptr[-1] = s3;
                 // update pointers for next four rows
                 lptr -= 4 + 4 * nc;
                 xptr -= 4;
         }
         // process left over rows
         // i is in [0,3] here, the remaining unknowns x[i-1] .. x[0] are solved one at a time
         for ( i--; i >= 0; i-- ) {
                 s0 = b[i];
                 lptr = L[0] + i;
                 for ( j = i + 1; j < n; j++ ) {
                         s0 -= lptr[j*nc] * x[j];
                 }
                 x[i] = s0;
         }

 #else

         // straightforward reference version of the backward substitution

         int i, j, nc;
         const float *ptr;
         double sum;

         nc = L.GetNumColumns();
         for ( i = n - 1; i >= 0; i-- ) {
                 sum = b[i];
                 ptr = L[0] + i;
                 for ( j = i + 1; j < n; j++ ) {
                         sum -= ptr[j*nc] * x[j];
                 }
                 x[i] = sum;
         }

 #endif
 }


 /*

 ============

 idSIMD_Generic::MatX_LDLTFactor


   in-place factorization LDL' of the n * n sub-matrix of mat

   the reciprocals of the diagonal elements are stored in invDiag

 ============

 */

 bool VPCALL idSIMD_Generic::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {

 #if 1


         int i, j, k, nc;

         float *v, *diag, *mptr;

         double s0, s1, s2, s3, sum, d;


         v = (float *) _alloca16( n * sizeof( float ) );

         diag = (float *) _alloca16( n * sizeof( float ) );


         nc = mat.GetNumColumns();


         if ( n <= 0 ) {

                 return true;

         }


         mptr = mat[0];


         sum = mptr[0];


         if ( sum == 0.0f ) {

                 return false;

         }


         diag[0] = sum;

         invDiag[0] = d = 1.0f / sum;


         if ( n <= 1 ) {

                 return true;

         }


         mptr = mat[0];

         for ( j = 1; j < n; j++ ) {

                 mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;

         }


         mptr = mat[1];


         v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];

         sum = mptr[1] - s0;


         if ( sum == 0.0f ) {

                 return false;

         }


         mat[1][1] = sum;

         diag[1] = sum;

         invDiag[1] = d = 1.0f / sum;


         if ( n <= 2 ) {

                 return true;

         }


         mptr = mat[0];

         for ( j = 2; j < n; j++ ) {

                 mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;

         }


         mptr = mat[2];


         v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];

         v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];

         sum = mptr[2] - s0 - s1;


         if ( sum == 0.0f ) {

                 return false;

         }


         mat[2][2] = sum;

         diag[2] = sum;

         invDiag[2] = d = 1.0f / sum;


         if ( n <= 3 ) {

                 return true;

         }


         mptr = mat[0];

         for ( j = 3; j < n; j++ ) {

                 mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;

         }


         mptr = mat[3];


         v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];

         v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];

         v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];

         sum = mptr[3] - s0 - s1 - s2;


         if ( sum == 0.0f ) {

                 return false;

         }


         mat[3][3] = sum;

         diag[3] = sum;

         invDiag[3] = d = 1.0f / sum;


         if ( n <= 4 ) {

                 return true;

         }


         mptr = mat[0];

         for ( j = 4; j < n; j++ ) {

                 mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;

         }


         for ( i = 4; i < n; i++ ) {


                 mptr = mat[i];


                 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];

                 v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];

                 v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];

                 v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];

                 for ( k = 4; k < i-3; k += 4 ) {

                         v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];

                         v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];

                         v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];

                         v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];

                 }

                 switch( i - k ) {

                         NODEFAULT;

                         case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];

                         case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];

                         case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];

                         case 0: break;

                 }

                 sum = s3;

                 sum += s2;

                 sum += s1;

                 sum += s0;

                 sum = mptr[i] - sum;


                 if ( sum == 0.0f ) {

                         return false;

                 }


                 mat[i][i] = sum;

                 diag[i] = sum;

                 invDiag[i] = d = 1.0f / sum;


                 if ( i + 1 >= n ) {

                         return true;

                 }


                 mptr = mat[i+1];

                 for ( j = i+1; j < n; j++ ) {

                         s0 = mptr[0] * v[0];

                         s1 = mptr[1] * v[1];

                         s2 = mptr[2] * v[2];

                         s3 = mptr[3] * v[3];

                         for ( k = 4; k < i-7; k += 8 ) {

                                 s0 += mptr[k+0] * v[k+0];

                                 s1 += mptr[k+1] * v[k+1];

                                 s2 += mptr[k+2] * v[k+2];

                                 s3 += mptr[k+3] * v[k+3];

                                 s0 += mptr[k+4] * v[k+4];

                                 s1 += mptr[k+5] * v[k+5];

                                 s2 += mptr[k+6] * v[k+6];

                                 s3 += mptr[k+7] * v[k+7];

                         }

                         switch( i - k ) {

                                 NODEFAULT;

                                 case 7: s0 += mptr[k+6] * v[k+6];

                                 case 6: s1 += mptr[k+5] * v[k+5];

                                 case 5: s2 += mptr[k+4] * v[k+4];

                                 case 4: s3 += mptr[k+3] * v[k+3];

                                 case 3: s0 += mptr[k+2] * v[k+2];

                                 case 2: s1 += mptr[k+1] * v[k+1];

                                 case 1: s2 += mptr[k+0] * v[k+0];

                                 case 0: break;

                         }

                         sum = s3;

                         sum += s2;

                         sum += s1;

                         sum += s0;

                         mptr[i] = ( mptr[i] - sum ) * d;

                         mptr += nc;

                 }

         }


         return true;


 #else


         int i, j, k, nc;

         float *v, *ptr, *diagPtr;

         double d, sum;


         v = (float *) _alloca16( n * sizeof( float ) );

         nc = mat.GetNumColumns();


         for ( i = 0; i < n; i++ ) {


                 ptr = mat[i];

                 diagPtr = mat[0];

                 sum = ptr[i];

                 for ( j = 0; j < i; j++ ) {

                         d = ptr[j];

                     v[j] = diagPtr[0] * d;

                     sum -= v[j] * d;

                         diagPtr += nc + 1;

                 }


                 if ( sum == 0.0f ) {

                         return false;

                 }


                 diagPtr[0] = sum;

                 invDiag[i] = d = 1.0f / sum;


                 if ( i + 1 >= n ) {

                         continue;

                 }


                 ptr = mat[i+1];

                 for ( j = i + 1; j < n; j++ ) {

                         sum = ptr[i];

                         for ( k = 0; k < i; k++ ) {

                                 sum -= ptr[k] * v[k];

                         }

                         ptr[i] = sum * d;

                         ptr += nc;

                 }

         }


         return true;


 #endif

 }


 /*
 ============
 idSIMD_Generic::BlendJoints

   Blends the joints listed in index towards the matching blendJoints entries.
   For every listed joint number the quaternion q is updated with Slerp and
   the offset t with Lerp, both using the fraction lerp. joints is modified in place.
 ============
 */
 void VPCALL idSIMD_Generic::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
         int n = 0;

         while ( n < numJoints ) {
                 const int jointNum = index[n++];
                 idJointQuat &dst = joints[jointNum];
                 const idJointQuat &src = blendJoints[jointNum];

                 dst.q.Slerp( dst.q, src.q, lerp );
                 dst.t.Lerp( dst.t, src.t, lerp );
         }
 }


 /*
 ============
 idSIMD_Generic::ConvertJointQuatsToJointMats

   Fills jointMats[0..numJoints-1] from jointQuats: the rotation is set from
   q.ToMat3() and the translation from t.
 ============
 */
 void VPCALL idSIMD_Generic::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
         for ( int n = 0; n < numJoints; n++ ) {
                 const idJointQuat &src = jointQuats[n];
                 idJointMat &dst = jointMats[n];

                 dst.SetRotation( src.q.ToMat3() );
                 dst.SetTranslation( src.t );
         }
 }


 /*
 ============
 idSIMD_Generic::ConvertJointMatsToJointQuats

   Fills jointQuats[0..numJoints-1] with the result of ToJointQuat() called
   on the joint matrix at the same position in jointMats.
 ============
 */
 void VPCALL idSIMD_Generic::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
         const idJointMat *src = jointMats;
         idJointQuat *dst = jointQuats;

         for ( int remaining = numJoints; remaining > 0; remaining-- ) {
                 *dst = src->ToJointQuat();
                 dst++;
                 src++;
         }
 }


 /*
 ============
 idSIMD_Generic::TransformJoints

   Walks the joints from firstJoint up to and including lastJoint and
   multiplies each joint matrix in place by the matrix of its parent.
   Every parent index must be smaller than the joint index, so a parent
   has already been processed when one of its children is reached.
 ============
 */
 void VPCALL idSIMD_Generic::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
         int joint = firstJoint;

         while ( joint <= lastJoint ) {
                 assert( parents[joint] < joint );
                 jointMats[joint] *= jointMats[parents[joint]];
                 joint++;
         }
 }


 /*
 ============
 idSIMD_Generic::UntransformJoints

   Walks the joints from lastJoint down to and including firstJoint and
   applies operator /= with the parent matrix to each joint matrix in place.
   Every parent index must be smaller than the joint index, so a parent is
   still unmodified when its children are processed.
 ============
 */
 void VPCALL idSIMD_Generic::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
         int joint = lastJoint;

         while ( joint >= firstJoint ) {
                 assert( parents[joint] < joint );
                 jointMats[joint] /= jointMats[parents[joint]];
                 joint--;
         }
 }


 /*
 ============
 idSIMD_Generic::TransformVerts

   Computes verts[i].xyz for numVerts vertices as a sum of joint matrix
   times weight products.
   index holds two ints per weight: a byte offset into joints selecting the
   joint matrix, followed by a flag that is zero while more weights follow
   for the same vertex. numWeights is not used.
 ============
 */
 void VPCALL idSIMD_Generic::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, int numWeights ) {
         const byte *jointBase = (byte *)joints;
         int w = 0;              // running weight number, shared by all vertices

         for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
                 // the first weight initializes the sum
                 idVec3 sum = ( *(idJointMat *) ( jointBase + index[w*2+0] ) ) * weights[w];

                 // keep adding weights until the one flagged as the last for this vertex
                 for ( ; index[w*2+1] == 0; ) {
                         w++;
                         sum += ( *(idJointMat *) ( jointBase + index[w*2+0] ) ) * weights[w];
                 }
                 w++;

                 verts[vertNum].xyz = sum;
         }
 }


 /*
 ============
 idSIMD_Generic::TracePointCull

   Writes one byte of cull bits per vertex, built from the distances d to the
   first four planes:
     bit p     ( p = 0..3 ) is set when d + radius is not negative
     bit p + 4 ( p = 0..3 ) is set when d - radius is negative
   totalOr receives the bitwise OR of all bytes written.
 ============
 */
 void VPCALL idSIMD_Generic::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
         byte accum = 0;

         for ( int n = 0; n < numVerts; n++ ) {
                 const idVec3 &point = verts[n].xyz;
                 byte mask = 0;

                 for ( int p = 0; p < 4; p++ ) {
                         float dist = planes[p].Distance( point );
                         // FLOATSIGNBITSET is applied to named floats, as in the rest of this file
                         float outer = dist + radius;
                         float inner = dist - radius;

                         mask |= FLOATSIGNBITSET( outer ) << p;
                         mask |= FLOATSIGNBITSET( inner ) << ( p + 4 );
                 }

                 mask ^= 0x0F;           // flip lower four bits

                 accum |= mask;
                 cullBits[n] = mask;
         }

         totalOr = accum;
 }


 /*
 ============
 idSIMD_Generic::DecalPointCull

   Writes one byte of cull bits per vertex: bit p ( p = 0..5 ) is set when
   the distance of the vertex to planes[p] is not negative.
 ============
 */
 void VPCALL idSIMD_Generic::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
         for ( int n = 0; n < numVerts; n++ ) {
                 const idVec3 &point = verts[n].xyz;
                 byte mask = 0;

                 // collect the sign bit of each of the six plane distances
                 for ( int p = 0; p < 6; p++ ) {
                         float dist = planes[p].Distance( point );
                         mask |= FLOATSIGNBITSET( dist ) << p;
                 }

                 cullBits[n] = mask ^ 0x3F;              // flip lower 6 bits
         }
 }


 /*
 ============
 idSIMD_Generic::OverlayPointCull

   For every vertex the distances to planes[0] and planes[1] are stored as
   texCoords[i][0] and texCoords[i][1]. The cull byte gets
     bit 0 / bit 1 set when the first / second distance is negative
     bit 2 / bit 3 set when one minus the first / second distance is negative
 ============
 */
 void VPCALL idSIMD_Generic::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
         for ( int n = 0; n < numVerts; n++ ) {
                 const idVec3 &point = verts[n].xyz;
                 idVec2 &tc = texCoords[n];

                 float s = planes[0].Distance( point );
                 tc[0] = s;
                 float t = planes[1].Distance( point );
                 tc[1] = t;

                 float sRest = 1.0f - s;
                 float tRest = 1.0f - t;

                 byte mask = FLOATSIGNBITSET( s ) << 0;
                 mask |= FLOATSIGNBITSET( t ) << 1;
                 mask |= FLOATSIGNBITSET( sRest ) << 2;
                 mask |= FLOATSIGNBITSET( tRest ) << 3;

                 cullBits[n] = mask;
         }
 }


 /*
 ============
 idSIMD_Generic::DeriveTriPlanes

         Derives a plane equation for each triangle.

         Every three consecutive indexes form one triangle and produce one
         entry in planes. The normal is the cross product of the two edges
         leaving the first vertex, scaled by idMath::RSqrt of its squared
         length, and the plane is fitted through the first vertex.
         numVerts is not used.
 ============
 */
 void VPCALL idSIMD_Generic::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
         idPlane *out = planes;

         for ( int first = 0; first < numIndexes; first += 3, out++ ) {
                 const idVec3 &p0 = verts[indexes[first + 0]].xyz;
                 const idVec3 &p1 = verts[indexes[first + 1]].xyz;
                 const idVec3 &p2 = verts[indexes[first + 2]].xyz;

                 // edges from the first vertex to the second and to the third
                 float edge0[3], edge1[3];
                 for ( int k = 0; k < 3; k++ ) {
                         edge0[k] = p1[k] - p0[k];
                         edge1[k] = p2[k] - p0[k];
                 }

                 idVec3 normal;
                 normal[0] = edge1[1] * edge0[2] - edge1[2] * edge0[1];
                 normal[1] = edge1[2] * edge0[0] - edge1[0] * edge0[2];
                 normal[2] = edge1[0] * edge0[1] - edge1[1] * edge0[0];

                 const float scale = idMath::RSqrt( normal.x * normal.x + normal.y * normal.y + normal.z * normal.z );

                 normal.x *= scale;
                 normal.y *= scale;
                 normal.z *= scale;

                 out->SetNormal( normal );
                 out->FitThroughPoint( p0 );
         }
 }


 /*
 ============
 idSIMD_Generic::DeriveTangents

         Derives the normal and orthogonal tangent vectors for the triangle vertices.
         For each vertex the normal and tangent vectors are derived from all triangles
         using the vertex which results in smooth tangents across the mesh.
         In the process the triangle planes are calculated as well.

         planes receives one plane per triangle ( three consecutive indexes ).
         The first triangle touching a vertex overwrites its normal and tangents,
         every later triangle adds to them; the sums are not re-normalized here.
         Vertices that are not referenced by indexes are left untouched.
 ============
 */
 void VPCALL idSIMD_Generic::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
         int i, k;

         // used[v] becomes true once vertex v received its first triangle contribution
         bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
         memset( used, 0, numVerts * sizeof( used[0] ) );

         idPlane *planesPtr = planes;
         for ( i = 0; i < numIndexes; i += 3 ) {
                 idDrawVert *a, *b, *c;
                 // 32 bit patterns of floats; unsigned long is avoided because it is
                 // 64 bits wide on LP64 platforms, which made the old pointer casts
                 // read and write past the 4 byte float
                 unsigned int signBit, fBits;
                 float d0[5], d1[5], f, area;
                 idVec3 n, t0, t1;

                 int v0 = indexes[i + 0];
                 int v1 = indexes[i + 1];
                 int v2 = indexes[i + 2];

                 a = verts + v0;
                 b = verts + v1;
                 c = verts + v2;

                 // position and texture coordinate deltas of the two edges leaving a
                 d0[0] = b->xyz[0] - a->xyz[0];
                 d0[1] = b->xyz[1] - a->xyz[1];
                 d0[2] = b->xyz[2] - a->xyz[2];
                 d0[3] = b->st[0] - a->st[0];
                 d0[4] = b->st[1] - a->st[1];

                 d1[0] = c->xyz[0] - a->xyz[0];
                 d1[1] = c->xyz[1] - a->xyz[1];
                 d1[2] = c->xyz[2] - a->xyz[2];
                 d1[3] = c->st[0] - a->st[0];
                 d1[4] = c->st[1] - a->st[1];

                 // normal
                 n[0] = d1[1] * d0[2] - d1[2] * d0[1];
                 n[1] = d1[2] * d0[0] - d1[0] * d0[2];
                 n[2] = d1[0] * d0[1] - d1[1] * d0[0];

                 f = idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );

                 n.x *= f;
                 n.y *= f;
                 n.z *= f;

                 planesPtr->SetNormal( n );
                 planesPtr->FitThroughPoint( a->xyz );
                 planesPtr++;

                 // area sign bit
                 area = d0[3] * d1[4] - d0[4] * d1[3];
                 memcpy( &signBit, &area, sizeof( signBit ) );
                 signBit &= 0x80000000u;

                 // first tangent
                 t0[0] = d0[0] * d1[4] - d0[4] * d1[0];
                 t0[1] = d0[1] * d1[4] - d0[4] * d1[1];
                 t0[2] = d0[2] * d1[4] - d0[4] * d1[2];

                 f = idMath::RSqrt( t0.x * t0.x + t0.y * t0.y + t0.z * t0.z );
                 // flip the sign of the scale when the texture area is negative
                 memcpy( &fBits, &f, sizeof( fBits ) );
                 fBits ^= signBit;
                 memcpy( &f, &fBits, sizeof( f ) );

                 t0.x *= f;
                 t0.y *= f;
                 t0.z *= f;

                 // second tangent
                 t1[0] = d0[3] * d1[0] - d0[0] * d1[3];
                 t1[1] = d0[3] * d1[1] - d0[1] * d1[3];
                 t1[2] = d0[3] * d1[2] - d0[2] * d1[3];

                 f = idMath::RSqrt( t1.x * t1.x + t1.y * t1.y + t1.z * t1.z );
                 memcpy( &fBits, &f, sizeof( fBits ) );
                 fBits ^= signBit;
                 memcpy( &f, &fBits, sizeof( f ) );

                 t1.x *= f;
                 t1.y *= f;
                 t1.z *= f;

                 // set or accumulate the triangle vectors on each of the three corners,
                 // in the order a, b, c so that repeated indexes behave as before
                 idDrawVert * const corners[3] = { a, b, c };
                 const int cornerNums[3] = { v0, v1, v2 };

                 for ( k = 0; k < 3; k++ ) {
                         idDrawVert *dv = corners[k];

                         if ( used[cornerNums[k]] ) {
                                 dv->normal += n;
                                 dv->tangents[0] += t0;
                                 dv->tangents[1] += t1;
                         } else {
                                 dv->normal = n;
                                 dv->tangents[0] = t0;
                                 dv->tangents[1] = t1;
                                 used[cornerNums[k]] = true;
                         }
                 }
         }
 }


 /*
 ============
 idSIMD_Generic::DeriveUnsmoothedTangents

         Derives the normal and orthogonal tangent vectors for the triangle vertices.
         For each vertex the normal and tangent vectors are derived from a single dominant triangle.

         The triangle of vertex i is formed by the vertex itself and the vertices
         dominantTris[i].v2 and dominantTris[i].v3. The three entries of
         normalizationScale scale the first tangent, the second tangent and the
         normal respectively. With DERIVE_UNSMOOTHED_BITANGENT defined the second
         tangent is built from the scaled normal and first tangent instead of
         from the texture coordinate deltas.
 ============
 */
 #define DERIVE_UNSMOOTHED_BITANGENT

 void VPCALL idSIMD_Generic::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
         for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
                 const dominantTri_s &tri = dominantTris[vertNum];

                 idDrawVert *self = verts + vertNum;
                 const idDrawVert *second = verts + tri.v2;
                 const idDrawVert *third = verts + tri.v3;

                 // edge deltas: [0..2] position, [3..4] texture coordinates
                 float e0[5], e1[5];

                 e0[0] = second->xyz[0] - self->xyz[0];
                 e0[1] = second->xyz[1] - self->xyz[1];
                 e0[2] = second->xyz[2] - self->xyz[2];
                 e0[3] = second->st[0] - self->st[0];
                 e0[4] = second->st[1] - self->st[1];

                 e1[0] = third->xyz[0] - self->xyz[0];
                 e1[1] = third->xyz[1] - self->xyz[1];
                 e1[2] = third->xyz[2] - self->xyz[2];
                 e1[3] = third->st[0] - self->st[0];
                 e1[4] = third->st[1] - self->st[1];

                 const float tangentScale = tri.normalizationScale[0];
                 const float bitangentScale = tri.normalizationScale[1];
                 const float normalScale = tri.normalizationScale[2];

                 float normal[3], tangent[3], bitangent[3];

                 normal[0] = normalScale * ( e1[1] * e0[2] - e1[2] * e0[1] );
                 normal[1] = normalScale * ( e1[2] * e0[0] - e1[0] * e0[2] );
                 normal[2] = normalScale * ( e1[0] * e0[1] - e1[1] * e0[0] );

                 tangent[0] = tangentScale * ( e0[0] * e1[4] - e0[4] * e1[0] );
                 tangent[1] = tangentScale * ( e0[1] * e1[4] - e0[4] * e1[1] );
                 tangent[2] = tangentScale * ( e0[2] * e1[4] - e0[4] * e1[2] );

 #ifndef DERIVE_UNSMOOTHED_BITANGENT
                 bitangent[0] = bitangentScale * ( e0[3] * e1[0] - e0[0] * e1[3] );
                 bitangent[1] = bitangentScale * ( e0[3] * e1[1] - e0[1] * e1[3] );
                 bitangent[2] = bitangentScale * ( e0[3] * e1[2] - e0[2] * e1[3] );
 #else
                 bitangent[0] = bitangentScale * ( normal[2] * tangent[1] - normal[1] * tangent[2] );
                 bitangent[1] = bitangentScale * ( normal[0] * tangent[2] - normal[2] * tangent[0] );
                 bitangent[2] = bitangentScale * ( normal[1] * tangent[0] - normal[0] * tangent[1] );
 #endif

                 // all reads are done, now store the results on the vertex
                 for ( int k = 0; k < 3; k++ ) {
                         self->normal[k] = normal[k];
                 }
                 for ( int k = 0; k < 3; k++ ) {
                         self->tangents[0][k] = tangent[k];
                 }
                 for ( int k = 0; k < 3; k++ ) {
                         self->tangents[1][k] = bitangent[k];
                 }
         }
 }


 /*
 ============
 idSIMD_Generic::NormalizeTangents

         Normalizes each vertex normal and projects and normalizes the
         tangent vectors onto the plane orthogonal to the vertex normal.

         All vectors are scaled with idMath::RSqrt of their squared length
         and are modified in place.
 ============
 */
 void VPCALL idSIMD_Generic::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
         for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
                 idDrawVert &dv = verts[vertNum];
                 idVec3 &normal = dv.normal;

                 float scale = idMath::RSqrt( normal.x * normal.x + normal.y * normal.y + normal.z * normal.z );
                 normal.x *= scale;
                 normal.y *= scale;
                 normal.z *= scale;

                 for ( int axis = 0; axis < 2; axis++ ) {
                         idVec3 &tangent = dv.tangents[axis];

                         // remove the component along the normal, then rescale
                         tangent -= ( tangent * normal ) * normal;
                         scale = idMath::RSqrt( tangent.x * tangent.x + tangent.y * tangent.y + tangent.z * tangent.z );
                         tangent.x *= scale;
                         tangent.y *= scale;
                         tangent.z *= scale;
                 }
         }
 }


 /*
 ============
 idSIMD_Generic::CreateTextureSpaceLightVectors

         Calculates light vectors in texture space for the given triangle vertices.
         For each vertex the direction towards the light origin is projected onto texture space.
         The light vectors are only calculated for the vertices referenced by the indexes.

         lightVectors[i] receives the dot products of the unnormalized direction
         with tangents[0], tangents[1] and the normal of vertex i.
 ============
 */
 void VPCALL idSIMD_Generic::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {

         // flag every vertex that is referenced by at least one index
         bool *referenced = (bool *)_alloca16( numVerts * sizeof( referenced[0] ) );
         memset( referenced, 0, numVerts * sizeof( referenced[0] ) );

         for ( int n = 0; n < numIndexes; n++ ) {
                 referenced[indexes[n]] = true;
         }

         for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
                 if ( referenced[vertNum] ) {
                         const idDrawVert &dv = verts[vertNum];
                         idVec3 &out = lightVectors[vertNum];

                         idVec3 toLight = lightOrigin - dv.xyz;

                         out[0] = toLight * dv.tangents[0];
                         out[1] = toLight * dv.tangents[1];
                         out[2] = toLight * dv.normal;
                 }
         }
 }


 /*
 ============
 idSIMD_Generic::CreateSpecularTextureCoords

         Calculates specular texture coordinates for the given triangle vertices.
         For each vertex the normalized direction towards the light origin is added to the
         normalized direction towards the view origin and the result is projected onto texture space.
         The texture coordinates are only calculated for the vertices referenced by the indexes.

         The fourth component of every calculated texture coordinate is set to 1.
 ============
 */
 void VPCALL idSIMD_Generic::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {

         // flag every vertex that is referenced by at least one index
         bool *referenced = (bool *)_alloca16( numVerts * sizeof( referenced[0] ) );
         memset( referenced, 0, numVerts * sizeof( referenced[0] ) );

         for ( int n = 0; n < numIndexes; n++ ) {
                 referenced[indexes[n]] = true;
         }

         for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
                 if ( referenced[vertNum] ) {
                         const idDrawVert &dv = verts[vertNum];
                         idVec4 &out = texCoords[vertNum];

                         idVec3 toLight = lightOrigin - dv.xyz;
                         idVec3 toView = viewOrigin - dv.xyz;

                         float scale = idMath::RSqrt( toLight * toLight );
                         toLight[0] *= scale;
                         toLight[1] *= scale;
                         toLight[2] *= scale;

                         scale = idMath::RSqrt( toView * toView );
                         toView[0] *= scale;
                         toView[1] *= scale;
                         toView[2] *= scale;

                         // sum of the two scaled directions
                         toLight += toView;

                         out[0] = toLight * dv.tangents[0];
                         out[1] = toLight * dv.tangents[1];
                         out[2] = toLight * dv.normal;
                         out[3] = 1.0f;
                 }
         }
 }


 /*
 ============
 idSIMD_Generic::CreateShadowCache

   Writes two idVec4 entries for every vertex whose vertRemap entry is zero:
   the vertex position with w = 1, followed by the position minus the light
   origin with w = 0. vertRemap[i] is set to the position of the first of
   the two entries in vertexCache. Vertices with a non-zero vertRemap entry
   are skipped. Returns the number of idVec4 entries written.
 ============
 */
 int VPCALL idSIMD_Generic::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
         int written = 0;

         for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
                 if ( !vertRemap[vertNum] ) {
                         const float *pos = verts[vertNum].xyz.ToFloatPtr();
                         idVec4 &atVertex = vertexCache[written+0];
                         idVec4 &fromLight = vertexCache[written+1];

                         atVertex[0] = pos[0];
                         atVertex[1] = pos[1];
                         atVertex[2] = pos[2];
                         atVertex[3] = 1.0f;

                         // R_SetupProjection() builds the projection matrix with a slight crunch
                         // for depth, which keeps this w=0 division from rasterizing right at the
                         // wrap around point and causing depth fighting with the rear caps
                         fromLight[0] = pos[0] - lightOrigin[0];
                         fromLight[1] = pos[1] - lightOrigin[1];
                         fromLight[2] = pos[2] - lightOrigin[2];
                         fromLight[3] = 0.0f;

                         vertRemap[vertNum] = written;
                         written += 2;
                 }
         }
         return written;
 }


 /*
 ============
 idSIMD_Generic::CreateVertexProgramShadowCache

   Writes two idVec4 entries per vertex, both holding the vertex position:
   the first with w = 1, the second with w = 0.
   Returns the number of idVec4 entries written ( numVerts * 2 ).
 ============
 */
 int VPCALL idSIMD_Generic::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
         for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
                 const float *pos = verts[vertNum].xyz.ToFloatPtr();
                 idVec4 &first = vertexCache[vertNum*2+0];
                 idVec4 &second = vertexCache[vertNum*2+1];

                 for ( int k = 0; k < 3; k++ ) {
                         first[k] = pos[k];
                         second[k] = pos[k];
                 }
                 first[3] = 1.0f;
                 second[3] = 0.0f;
         }
         return numVerts * 2;
 }


 /*
 ============
 idSIMD_Generic::UpSamplePCMTo44kHz

   Duplicate samples for 44kHz output.

   Converts the 16 bit samples in src to floats in dest. At 11025 Hz every
   sample ( or interleaved pair of samples when numChannels is not 1 ) is
   written four times, at 22050 Hz twice, and at 44100 Hz the samples are
   copied one to one. Any other rate triggers an assert.
 ============
 */
 void idSIMD_Generic::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
         int repeat;             // how often each input sample is written

         switch ( kHz ) {
                 case 11025: {
                         repeat = 4;
                         break;
                 }
                 case 22050: {
                         repeat = 2;
                         break;
                 }
                 case 44100: {
                         // plain conversion, independent of the channel count
                         for ( int n = 0; n < numSamples; n++ ) {
                                 dest[n] = (float) src[n];
                         }
                         return;
                 }
                 default: {
                         assert( 0 );
                         return;
                 }
         }

         if ( numChannels == 1 ) {
                 for ( int n = 0; n < numSamples; n++ ) {
                         const float sample = (float) src[n+0];
                         float *out = dest + n * repeat;

                         for ( int r = 0; r < repeat; r++ ) {
                                 out[r] = sample;
                         }
                 }
         } else {
                 // interleaved pairs: even output slots get the first sample, odd slots the second
                 for ( int n = 0; n < numSamples; n += 2 ) {
                         float *out = dest + n * repeat;

                         const float first = (float) src[n+0];
                         for ( int r = 0; r < repeat; r++ ) {
                                 out[r*2+0] = first;
                         }
                         const float second = (float) src[n+1];
                         for ( int r = 0; r < repeat; r++ ) {
                                 out[r*2+1] = second;
                         }
                 }
         }
 }


 /*
 ============
 idSIMD_Generic::UpSampleOGGTo44kHz

   Duplicate samples for 44kHz output.

   Scales the float samples in ogg by 32768 and writes them to dest.
   With one channel ogg[0] is read and every sample is written four times
   at 11025 Hz, twice at 22050 Hz and once at 44100 Hz. Otherwise
   numSamples >> 1 samples are read from both ogg[0] and ogg[1] and written
   interleaved with the same repeat counts. Any other rate triggers an assert.
 ============
 */
 void idSIMD_Generic::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
         int repeat;             // how often each input sample is written

         switch ( kHz ) {
                 case 11025: {
                         repeat = 4;
                         break;
                 }
                 case 22050: {
                         repeat = 2;
                         break;
                 }
                 case 44100: {
                         repeat = 1;
                         break;
                 }
                 default: {
                         assert( 0 );
                         return;
                 }
         }

         if ( numChannels == 1 ) {
                 for ( int n = 0; n < numSamples; n++ ) {
                         const float sample = ogg[0][n] * 32768.0f;
                         float *out = dest + n * repeat;

                         for ( int r = 0; r < repeat; r++ ) {
                                 out[r] = sample;
                         }
                 }
         } else {
                 // even output slots get the ogg[0] sample, odd slots the ogg[1] sample
                 const int numPairs = numSamples >> 1;

                 for ( int n = 0; n < numPairs; n++ ) {
                         float *out = dest + n * ( repeat * 2 );

                         const float first = ogg[0][n] * 32768.0f;
                         for ( int r = 0; r < repeat; r++ ) {
                                 out[r*2+0] = first;
                         }
                         const float second = ogg[1][n] * 32768.0f;
                         for ( int r = 0; r < repeat; r++ ) {
                                 out[r*2+1] = second;
                         }
                 }
         }
 }


 /*
 ============
 idSIMD_Generic::MixSoundTwoSpeakerMono

   Adds MIXBUFFER_SAMPLES mono samples to the two channel interleaved
   mixBuffer. The gain of each channel starts at lastV and is raised by
   ( currentV - lastV ) / MIXBUFFER_SAMPLES after every sample.
   numSamples must equal MIXBUFFER_SAMPLES.
 ============
 */
 void VPCALL idSIMD_Generic::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
         float gain[2], step[2];

         assert( numSamples == MIXBUFFER_SAMPLES );

         for ( int ch = 0; ch < 2; ch++ ) {
                 gain[ch] = lastV[ch];
                 step[ch] = ( currentV[ch] - lastV[ch] ) / MIXBUFFER_SAMPLES;
         }

         for ( int n = 0; n < MIXBUFFER_SAMPLES; n++ ) {
                 for ( int ch = 0; ch < 2; ch++ ) {
                         mixBuffer[n*2+ch] += samples[n] * gain[ch];
                         gain[ch] += step[ch];
                 }
         }
 }


 /*
 ============
 idSIMD_Generic::MixSoundTwoSpeakerStereo

   Adds MIXBUFFER_SAMPLES interleaved two channel samples to the two channel
   interleaved mixBuffer. The gain of each channel starts at lastV and is
   raised by ( currentV - lastV ) / MIXBUFFER_SAMPLES after every sample pair.
   numSamples must equal MIXBUFFER_SAMPLES.
 ============
 */
 void VPCALL idSIMD_Generic::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
         float gain[2], step[2];

         assert( numSamples == MIXBUFFER_SAMPLES );

         for ( int ch = 0; ch < 2; ch++ ) {
                 gain[ch] = lastV[ch];
                 step[ch] = ( currentV[ch] - lastV[ch] ) / MIXBUFFER_SAMPLES;
         }

         for ( int n = 0; n < MIXBUFFER_SAMPLES; n++ ) {
                 for ( int ch = 0; ch < 2; ch++ ) {
                         mixBuffer[n*2+ch] += samples[n*2+ch] * gain[ch];
                         gain[ch] += step[ch];
                 }
         }
 }


 /*
 ============
 idSIMD_Generic::MixSoundSixSpeakerMono

   Adds a single-channel sample stream into an interleaved six-channel
   mix buffer. Every output channel receives the same input sample scaled
   by that channel's gain; the gains start at lastV[] and are advanced
   after every sample by ( currentV[] - lastV[] ) / MIXBUFFER_SAMPLES.
   The loop always covers MIXBUFFER_SAMPLES frames; numSamples is only
   verified by the assert.
 ============
 */
 void VPCALL idSIMD_Generic::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
         const int SPEAKERS = 6;
         float gain[SPEAKERS];
         float step[SPEAKERS];

         for ( int c = 0; c < SPEAKERS; c++ ) {
                 gain[c] = lastV[c];
         }
         for ( int c = 0; c < SPEAKERS; c++ ) {
                 step[c] = ( currentV[c] - lastV[c] ) / MIXBUFFER_SAMPLES;
         }

         assert( numSamples == MIXBUFFER_SAMPLES );

         float *frame = mixBuffer;
         for ( int n = 0; n < MIXBUFFER_SAMPLES; n++, frame += SPEAKERS ) {
                 for ( int c = 0; c < SPEAKERS; c++ ) {
                         frame[c] += samples[n] * gain[c];
                         gain[c] += step[c];
                 }
         }
 }


 /*
 ============
 idSIMD_Generic::MixSoundSixSpeakerStereo

   Adds an interleaved two-channel sample stream into an interleaved
   six-channel mix buffer. Output channels 0, 2, 3 and 4 are fed from the
   first sample of each input pair, output channels 1 and 5 from the
   second. Each output channel has its own gain, starting at lastV[] and
   advanced after every frame by ( currentV[] - lastV[] ) / MIXBUFFER_SAMPLES.
   The loop always covers MIXBUFFER_SAMPLES frames; numSamples is only
   verified by the assert.
 ============
 */
 void VPCALL idSIMD_Generic::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
         const int SPEAKERS = 6;

         // which sample of the input pair feeds each output channel
         static const int inputOf[SPEAKERS] = { 0, 1, 0, 0, 0, 1 };

         float gain[SPEAKERS];
         float step[SPEAKERS];

         for ( int c = 0; c < SPEAKERS; c++ ) {
                 gain[c] = lastV[c];
         }
         for ( int c = 0; c < SPEAKERS; c++ ) {
                 step[c] = ( currentV[c] - lastV[c] ) / MIXBUFFER_SAMPLES;
         }

         assert( numSamples == MIXBUFFER_SAMPLES );

         for ( int n = 0; n < MIXBUFFER_SAMPLES; n++ ) {
                 float *frame = mixBuffer + n * SPEAKERS;
                 const float *pair = samples + n * 2;
                 for ( int c = 0; c < SPEAKERS; c++ ) {
                         frame[c] += pair[inputOf[c]] * gain[c];
                         gain[c] += step[c];
                 }
         }
 }


 /*
 ============
 idSIMD_Generic::MixedSoundToSamples

   Converts numSamples floats from mixBuffer into shorts in samples.
   Values at or below -32768.0f are stored as -32768, values at or above
   32767.0f are stored as 32767, and everything else is converted with a
   plain float-to-short cast.
 ============
 */
 void VPCALL idSIMD_Generic::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
         for ( int n = 0; n < numSamples; n++ ) {
                 const float value = mixBuffer[n];

                 // saturate at the bottom of the short range
                 if ( value <= -32768.0f ) {
                         samples[n] = -32768;
                         continue;
                 }

                 // saturate at the top of the short range
                 if ( value >= 32767.0f ) {
                         samples[n] = 32767;
                         continue;
                 }

                 samples[n] = static_cast<short>( value );
         }
 }

idSIMD_Generic::MatX_LowerTriangularSolve
virtual void VPCALL MatX_LowerTriangularSolve(const idMatX &L, float *x, const float *b, const int n, int skip=0)
Definition: Simd_Generic.cpp:1733

idSIMD_Generic::CreateSpecularTextureCoords
virtual void VPCALL CreateSpecularTextureCoords(idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes)
Definition: Simd_Generic.cpp:2750

q
GLdouble GLdouble GLdouble GLdouble q
Definition: glext.h:2959

idSIMD_Generic::TransformVerts
virtual void VPCALL TransformVerts(idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights)
Definition: Simd_Generic.cpp:2316

idMath::INFINITY
static const float INFINITY
Definition: Math.h:218

min
#define min(a, b)

idSIMD_Generic::AddAssign16
virtual void VPCALL AddAssign16(float *dst, const float *src, const int count)
Definition: Simd_Generic.cpp:675

assert
assert(prefInfo.fullscreenBtn)

weights
const GLbyte * weights
Definition: glext.h:3273

idMatX
Definition: Matrix.h:1786

idSIMD_Generic::Dot
virtual void VPCALL Dot(float *dst, const idVec3 &constant, const idVec3 *src, const int count)
Definition: Simd_Generic.cpp:232

idVecX::GetSize
int GetSize(void) const
Definition: Vector.h:1467

idJointMat
Definition: JointTransform.h:63

idSIMD_Generic::BlendJoints
virtual void VPCALL BlendJoints(idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints)
Definition: Simd_Generic.cpp:2246

idSIMD_Generic::Memset
virtual void VPCALL Memset(void *dst, const int val, const int count)
Definition: Simd_Generic.cpp:601

v
const GLdouble * v
Definition: glext.h:2936

idDrawVert
Definition: DrawVert.h:40

MIXBUFFER_SAMPLES
const int MIXBUFFER_SAMPLES
Definition: Simd.h:84

idVec3::ToFloatPtr
const float * ToFloatPtr(void) const
Definition: Vector.h:719

idPlane::Distance
float Distance(const idVec3 &v) const
Definition: Plane.h:324

idDrawVert::xyz
idVec3 xyz
Definition: DrawVert.h:42

idSIMD_Generic::MixedSoundToSamples
virtual void VPCALL MixedSoundToSamples(short *samples, const float *mixBuffer, const int numSamples)
Definition: Simd_Generic.cpp:3049

n
GLenum GLsizei n
Definition: glext.h:3705

idSIMD_Generic::ClampMax
virtual void VPCALL ClampMax(float *dst, const float *src, const float max, const int count)
Definition: Simd_Generic.cpp:581

idVec3::z
float z
Definition: Vector.h:320

idDrawVert::tangents
idVec3 tangents[2]
Definition: DrawVert.h:45

idSIMD_Generic::Memcpy
virtual void VPCALL Memcpy(void *dst, const void *src, const int count)
Definition: Simd_Generic.cpp:592

idJointQuat::t
idVec3 t
Definition: JointTransform.h:44

idSIMD_Generic::MatX_TransposeMultiplyAddVecX
virtual void VPCALL MatX_TransposeMultiplyAddVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
Definition: Simd_Generic.cpp:993

idVec3
Definition: Vector.h:316

float
case const float
Definition: Callbacks.cpp:62

idSIMD_Generic::TracePointCull
virtual void VPCALL TracePointCull(byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts)
Definition: Simd_Generic.cpp:2339

idSIMD_Generic::Add
virtual void VPCALL Add(float *dst, const float constant, const float *src, const int count)
Definition: Simd_Generic.cpp:71

idSIMD_Generic::CmpGE
virtual void VPCALL CmpGE(byte *dst, const float *src0, const float constant, const int count)
Definition: Simd_Generic.cpp:423

idSIMD_Generic::MixSoundTwoSpeakerStereo
virtual void VPCALL MixSoundTwoSpeakerStereo(float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2])
Definition: Simd_Generic.cpp:2952

src
GLuint src
Definition: glext.h:5390

dominantTri_s::v3
glIndex_t v3
Definition: Model.h:70

idPlane::SetNormal
void SetNormal(const idVec3 &normal)
Definition: Plane.h:233

v0
GLfloat v0
Definition: glext.h:3606

idVec3::x
float x
Definition: Vector.h:318

idSIMD_Generic::MatX_LowerTriangularSolveTranspose
virtual void VPCALL MatX_LowerTriangularSolveTranspose(const idMatX &L, float *x, const float *b, const int n)
Definition: Simd_Generic.cpp:1866

idQuat::Slerp
idQuat & Slerp(const idQuat &from, const idQuat &to, float t)
Definition: Quat.cpp:160

x
GLenum GLint x
Definition: glext.h:2849

process.i
int i
Definition: process.py:33

idJointMat::SetTranslation
void SetTranslation(const idVec3 &t)
Definition: JointTransform.h:103

UNROLL1
#define UNROLL1(Y)
Definition: Simd_Generic.cpp:41

idSIMD_Generic::Sub16
virtual void VPCALL Sub16(float *dst, const float *src1, const float *src2, const int count)
Definition: Simd_Generic.cpp:653

idSIMD_Generic::ClampMin
virtual void VPCALL ClampMin(float *dst, const float *src, const float min, const int count)
Definition: Simd_Generic.cpp:570

idJointMat::SetRotation
void SetRotation(const idMat3 &m)
Definition: JointTransform.h:90

prepare.l
list l
Definition: prepare.py:17

idSIMD_Generic::DeriveUnsmoothedTangents
virtual void VPCALL DeriveUnsmoothedTangents(idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts)
Definition: Simd_Generic.cpp:2614

idSIMD_Generic::MatX_MultiplyVecX
virtual void VPCALL MatX_MultiplyVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
Definition: Simd_Generic.cpp:708

idSIMD_Generic::MulAssign16
virtual void VPCALL MulAssign16(float *dst, const float constant, const int count)
Definition: Simd_Generic.cpp:697

idSIMD_Generic::MixSoundTwoSpeakerMono
virtual void VPCALL MixSoundTwoSpeakerMono(float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2])
Definition: Simd_Generic.cpp:2931

idDrawVert::st
idVec2 st
Definition: DrawVert.h:43

v2
GLfloat GLfloat GLfloat v2
Definition: glext.h:3608

dst
GLuint dst
Definition: glext.h:5285

dominantTri_s::normalizationScale
float normalizationScale[3]
Definition: Model.h:71

count
GLuint GLuint GLsizei count
Definition: glext.h:2845

idMatX::GetNumColumns
int GetNumColumns(void) const
Definition: Matrix.h:1822

idVec2
Definition: Vector.h:52

idVecX
Definition: Vector.h:1435

FLOATSIGNBITSET
#define FLOATSIGNBITSET(f)
Definition: Math.h:68

idSIMD_Generic::OverlayPointCull
virtual void VPCALL OverlayPointCull(byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts)
Definition: Simd_Generic.cpp:2418

idJointQuat::q
idQuat q
Definition: JointTransform.h:43

idSIMD_Generic::Negate16
virtual void VPCALL Negate16(float *dst, const int count)
Definition: Simd_Generic.cpp:619

index
GLuint index
Definition: glext.h:3476

c
const GLubyte * c
Definition: glext.h:4677

idVec4
Definition: Vector.h:808

idSIMD_Generic::DecalPointCull
virtual void VPCALL DecalPointCull(byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts)
Definition: Simd_Generic.cpp:2387

idSIMD_Generic::SubAssign16
virtual void VPCALL SubAssign16(float *dst, const float *src, const int count)
Definition: Simd_Generic.cpp:686

NSKIP
#define NSKIP(n, s)

idSIMD_Generic::MatX_MultiplyAddVecX
virtual void VPCALL MatX_MultiplyAddVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
Definition: Simd_Generic.cpp:779

idSIMD_Generic::MatX_MultiplyMatX
virtual void VPCALL MatX_MultiplyMatX(idMatX &dst, const idMatX &m1, const idMatX &m2)
Definition: Simd_Generic.cpp:1146

idSIMD_Generic::CmpLE
virtual void VPCALL CmpLE(byte *dst, const float *src0, const float constant, const int count)
Definition: Simd_Generic.cpp:475

idSIMD_Generic::MatX_TransposeMultiplyMatX
virtual void VPCALL MatX_TransposeMultiplyMatX(idMatX &dst, const idMatX &m1, const idMatX &m2)
Definition: Simd_Generic.cpp:1491

idSIMD_Generic::Div
virtual void VPCALL Div(float *dst, const float constant, const float *src, const int count)
Definition: Simd_Generic.cpp:151

idVecX::ToFloatPtr
const float * ToFloatPtr(void) const
Definition: Vector.h:1910

idVec3::y
float y
Definition: Vector.h:319

Simd_Generic.h

dominantTri_s
Definition: Model.h:69

idSIMD_Generic::ConvertJointQuatsToJointMats
virtual void VPCALL ConvertJointQuatsToJointMats(idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints)
Definition: Simd_Generic.cpp:2261

idPlane
Definition: Plane.h:71

L
Definition: eax4.h:1413

idMatX::GetNumRows
int GetNumRows(void) const
Definition: Matrix.h:1821

idSIMD_Generic::DeriveTriPlanes
virtual void VPCALL DeriveTriPlanes(idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes)
Definition: Simd_Generic.cpp:2447

divisor
INT64 INT64 divisor
Definition: wglext.h:504

idJointMat::ToJointQuat
idJointQuat ToJointQuat(void) const
Definition: JointTransform.cpp:38

idSIMD_Generic::MatX_LDLTFactor
virtual bool VPCALL MatX_LDLTFactor(idMatX &mat, idVecX &invDiag, const int n)
Definition: Simd_Generic.cpp:2011

idDrawVert::normal
idVec3 normal
Definition: DrawVert.h:44

idSIMD_Generic::MulSub
virtual void VPCALL MulSub(float *dst, const float constant, const float *src, const int count)
Definition: Simd_Generic.cpp:205

a
GLubyte GLubyte GLubyte a
Definition: glext.h:4662

idSIMD_Generic::MatX_TransposeMultiplyVecX
virtual void VPCALL MatX_TransposeMultiplyVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
Definition: Simd_Generic.cpp:921

idSIMD_Generic::MixSoundSixSpeakerStereo
virtual void VPCALL MixSoundSixSpeakerStereo(float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6])
Definition: Simd_Generic.cpp:3011

v1
GLfloat GLfloat v1
Definition: glext.h:3607

b
GLubyte GLubyte b
Definition: glext.h:4662

idSIMD_Generic::MixSoundSixSpeakerMono
virtual void VPCALL MixSoundSixSpeakerMono(float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6])
Definition: Simd_Generic.cpp:2973

idJointQuat
Definition: JointTransform.h:40

idSIMD_Generic::CreateTextureSpaceLightVectors
virtual void VPCALL CreateTextureSpaceLightVectors(idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes)
Definition: Simd_Generic.cpp:2716

bits
#define bits
Definition: Unzip.cpp:3797

dominantTri_s::v2
glIndex_t v2
Definition: Model.h:70

idSIMD_Generic::UpSamplePCMTo44kHz
virtual void VPCALL UpSamplePCMTo44kHz(float *dest, const short *pcm, const int numSamples, const int kHz, const int numChannels)
Definition: Simd_Generic.cpp:2848

idal.f
tuple f
Definition: idal.py:89

idSIMD_Generic::DeriveTangents
virtual void VPCALL DeriveTangents(idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes)
Definition: Simd_Generic.cpp:2493

idSIMD_Generic::Mul16
virtual void VPCALL Mul16(float *dst, const float *src1, const float constant, const int count)
Definition: Simd_Generic.cpp:664

byte
unsigned char byte
Definition: Lib.h:75

idSIMD_Generic::CreateShadowCache
virtual int VPCALL CreateShadowCache(idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts)
Definition: Simd_Generic.cpp:2795

NODEFAULT
#define NODEFAULT
Definition: Simd_Generic.cpp:51

idSIMD_Generic::Zero16
virtual void VPCALL Zero16(float *dst, const int count)
Definition: Simd_Generic.cpp:610

idSIMD_Generic::Mul
virtual void VPCALL Mul(float *dst, const float constant, const float *src, const int count)
Definition: Simd_Generic.cpp:124

idSIMD_Generic::NormalizeTangents
virtual void VPCALL NormalizeTangents(idDrawVert *verts, const int numVerts)
Definition: Simd_Generic.cpp:2688

vertexCache
idVertexCache vertexCache
Definition: VertexCache.cpp:41

idSIMD_Generic::Clamp
virtual void VPCALL Clamp(float *dst, const float *src, const float min, const float max, const int count)
Definition: Simd_Generic.cpp:559

VPCALL
#define VPCALL
Definition: Simd.h:63

UNROLL4
#define UNROLL4(Y)
Definition: Simd_Generic.cpp:43

idSIMD_Generic::UntransformJoints
virtual void VPCALL UntransformJoints(idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint)
Definition: Simd_Generic.cpp:2302

j
GLint j
Definition: qgl.h:264

idVec3::Lerp
void Lerp(const idVec3 &v1, const idVec3 &v2, const float l)
Definition: Vector.cpp:232

dot
float dot(float a[], float b[])
Definition: Model_lwo.cpp:3883

idSIMD_Generic::CreateVertexProgramShadowCache
virtual int VPCALL CreateVertexProgramShadowCache(idVec4 *vertexCache, const idDrawVert *verts, const int numVerts)
Definition: Simd_Generic.cpp:2826

idSIMD_Generic::ConvertJointMatsToJointQuats
virtual void VPCALL ConvertJointMatsToJointQuats(idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints)
Definition: Simd_Generic.cpp:2275

idSIMD_Generic::MulAdd
virtual void VPCALL MulAdd(float *dst, const float constant, const float *src, const int count)
Definition: Simd_Generic.cpp:178

idSIMD_Generic::MatX_MultiplySubVecX
virtual void VPCALL MatX_MultiplySubVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
Definition: Simd_Generic.cpp:850

idSIMD_Generic::MatX_TransposeMultiplySubVecX
virtual void VPCALL MatX_TransposeMultiplySubVecX(idVecX &dst, const idMatX &mat, const idVecX &vec)
Definition: Simd_Generic.cpp:1065

max
#define max(x, y)
Definition: os.h:70

idSIMD_Generic::MinMax
virtual void VPCALL MinMax(float &min, float &max, const float *src, const int count)
Definition: Simd_Generic.cpp:499

idSIMD_Generic::Copy16
virtual void VPCALL Copy16(float *dst, const float *src, const int count)
Definition: Simd_Generic.cpp:631

idSIMD_Generic::Sub
virtual void VPCALL Sub(float *dst, const float constant, const float *src, const int count)
Definition: Simd_Generic.cpp:97

OPER
#define OPER(X)

idMatX::ToFloatPtr
const float * ToFloatPtr(void) const
Definition: Matrix.h:2935

idSIMD_Generic::TransformJoints
virtual void VPCALL TransformJoints(idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint)
Definition: Simd_Generic.cpp:2288

idSIMD_Generic::CmpLT
virtual void VPCALL CmpLT(byte *dst, const float *src0, const float constant, const int count)
Definition: Simd_Generic.cpp:449

idMath::RSqrt
static float RSqrt(float x)
Definition: Math.h:241

idSIMD_Generic::Add16
virtual void VPCALL Add16(float *dst, const float *src1, const float *src2, const int count)
Definition: Simd_Generic.cpp:642

idSIMD_Generic::UpSampleOGGTo44kHz
virtual void VPCALL UpSampleOGGTo44kHz(float *dest, const float *const *ogg, const int numSamples, const int kHz, const int numChannels)
Definition: Simd_Generic.cpp:2887

idSIMD_Generic::CmpGT
virtual void VPCALL CmpGT(byte *dst, const float *src0, const float constant, const int count)
Definition: Simd_Generic.cpp:397

idSIMD_Generic::GetName
virtual const char *VPCALL GetName(void) const
Definition: Simd_Generic.cpp:60

idPlane::FitThroughPoint
void FitThroughPoint(const idVec3 &p)
Definition: Plane.h:297

t
GLdouble GLdouble t
Definition: glext.h:2943