doom3-gpl
Doom 3 GPL source release
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Simd_MMX.cpp
Go to the documentation of this file.
1 /*
2 ===========================================================================
3 
4 Doom 3 GPL Source Code
5 Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
6 
7 This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
8 
9 Doom 3 Source Code is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or
12 (at your option) any later version.
13 
14 Doom 3 Source Code is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18 
19 You should have received a copy of the GNU General Public License
20 along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
21 
22 In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
23 
24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
25 
26 ===========================================================================
27 */
28 
29 #include "../precompiled.h"
30 #pragma hdrstop
31 
32 #include "Simd_Generic.h"
33 #include "Simd_MMX.h"
34 
35 
36 //===============================================================
37 //
38 // MMX implementation of idSIMDProcessor
39 //
40 //===============================================================
41 
42 #if defined(MACOS_X) && defined(__i386__)
43 /*
44 ============
45 idSIMD_MMX::GetName
46 ============
47 */
48 const char * idSIMD_MMX::GetName( void ) const {
49  return "MMX";
50 }
51 
52 #elif defined(_WIN32)
53 
54 #define EMMS_INSTRUCTION __asm emms
55 
56 /*
57 ============
58 idSIMD_MMX::GetName
59 ============
60 */
61 const char * idSIMD_MMX::GetName( void ) const {
62  return "MMX";
63 }
64 
65 /*
66 ================
67 MMX_Memcpy8B

  Copies ( count >> 3 ) chunks of 8 bytes from src to dest. Each chunk is
  loaded with MOVQ and written with a non-temporal MOVNTQ store.
  The trailing ( count & 7 ) bytes are not copied by this function.

  The loop counter is tested at the bottom of the loop, so the body always
  runs at least once; a count below 8 would make ECX wrap around. The caller
  visible in this file (idSIMD_MMX::Memcpy) only calls this function when
  ( count & ~7 ) is non-zero, and it issues an SFENCE after the MOVNTQ stores.
68 ================
69 */
70 void MMX_Memcpy8B( void *dest, const void *src, const int count ) {
71  _asm {
72  mov esi, src // esi = read cursor
73  mov edi, dest // edi = write cursor
74  mov ecx, count
75  shr ecx, 3 // 8 bytes per iteration
76 
77 loop1:
78  movq mm1, 0[ESI] // Read in source data
79  movntq 0[EDI], mm1 // Non-temporal stores
80 
81  add esi, 8 // advance both cursors by one chunk
82  add edi, 8
83  dec ecx // one chunk done
84  jnz loop1 // repeat until the chunk counter reaches zero
85 
86  }
87  EMMS_INSTRUCTION // expands to "__asm emms" (see the #define at the top of this section)
88 }
89 
90 /*
91 ================
92 MMX_Memcpy64B
93 
94  165MB/sec

  Copies ( count >> 6 ) chunks of 64 bytes from src to dest. Each chunk is
  loaded into the eight MMX registers with MOVQ and written back with
  non-temporal MOVNTQ stores, while PREFETCHNTA is issued for the source
  bytes at offsets 64 and 96 past the current chunk.
  The trailing ( count & 63 ) bytes are not copied by this function.

  The loop counter is tested at the bottom of the loop, so the body always
  runs at least once; a count below 64 would make ECX wrap around. The caller
  visible in this file (idSIMD_MMX::Memcpy) only calls this function when
  ( count & ~63 ) is non-zero, and it issues an SFENCE after the MOVNTQ stores.
95 ================
96 */
97 void MMX_Memcpy64B( void *dest, const void *src, const int count ) {
98  _asm {
99  mov esi, src // esi = read cursor
100  mov edi, dest // edi = write cursor
101  mov ecx, count
102  shr ecx, 6 // 64 bytes per iteration
103 
104 loop1:
105  prefetchnta 64[ESI] // Prefetch next loop, non-temporal
106  prefetchnta 96[ESI]
107 
108  movq mm1, 0[ESI] // Read in source data
109  movq mm2, 8[ESI]
110  movq mm3, 16[ESI]
111  movq mm4, 24[ESI]
112  movq mm5, 32[ESI]
113  movq mm6, 40[ESI]
114  movq mm7, 48[ESI]
115  movq mm0, 56[ESI] // mm0 holds the last quadword of the chunk
116 
117  movntq 0[EDI], mm1 // Non-temporal stores
118  movntq 8[EDI], mm2
119  movntq 16[EDI], mm3
120  movntq 24[EDI], mm4
121  movntq 32[EDI], mm5
122  movntq 40[EDI], mm6
123  movntq 48[EDI], mm7
124  movntq 56[EDI], mm0
125 
126  add esi, 64 // advance both cursors by one chunk
127  add edi, 64
128  dec ecx // one chunk done
129  jnz loop1 // repeat until the chunk counter reaches zero
130  }
131  EMMS_INSTRUCTION // expands to "__asm emms" (see the #define at the top of this section)
132 }
133 
134 /*
135 ================
136 MMX_Memcpy2kB
137 
138  240MB/sec

  Copies ( count >> 11 ) blocks of 2048 bytes from src to dest in two passes
  per block:
    1. 32 iterations of 64 bytes read the source block (with PREFETCHNTA on the
       upcoming source bytes) and store it into the temporary buffer tbuf
       using regular MOVQ stores,
    2. 32 iterations of 64 bytes read tbuf back and write it to the
       destination with non-temporal MOVNTQ stores.
  The trailing ( count & 2047 ) bytes are not copied by this function.

  EDI and ESI are each saved on the stack with PUSH while they temporarily
  point at tbuf, and restored with POP afterwards.

  The block counter is tested at the bottom of the outer loop, so the body
  always runs at least once; a count below 2048 would make EBX wrap around.
  The caller visible in this file (idSIMD_MMX::Memcpy) only calls this function
  when at least 4096 bytes remain, and it issues an SFENCE after the MOVNTQ
  stores.
139 ================
140 */
141 void MMX_Memcpy2kB( void *dest, const void *src, const int count ) {
142  byte *tbuf = (byte *)_alloca16(2048); // 2kB staging buffer; _alloca16 is defined elsewhere (presumably a 16 byte aligned stack allocation - confirm)
143  __asm {
144  push ebx // ebx is used as the outer block counter below
145  mov esi, src
146  mov ebx, count
147  shr ebx, 11 // 2048 bytes at a time
148  mov edi, dest
149 
150 loop2k:
151  push edi // copy 2k into temporary buffer
152  mov edi, tbuf // edi now writes into the staging buffer
153  mov ecx, 32 // 32 iterations * 64 bytes = 2048 bytes
154 
155 loopMemToL1:
156  prefetchnta 64[ESI] // Prefetch next loop, non-temporal
157  prefetchnta 96[ESI]
158 
159  movq mm1, 0[ESI] // Read in source data
160  movq mm2, 8[ESI]
161  movq mm3, 16[ESI]
162  movq mm4, 24[ESI]
163  movq mm5, 32[ESI]
164  movq mm6, 40[ESI]
165  movq mm7, 48[ESI]
166  movq mm0, 56[ESI]
167 
168  movq 0[EDI], mm1 // Store into L1
169  movq 8[EDI], mm2
170  movq 16[EDI], mm3
171  movq 24[EDI], mm4
172  movq 32[EDI], mm5
173  movq 40[EDI], mm6
174  movq 48[EDI], mm7
175  movq 56[EDI], mm0
176  add esi, 64
177  add edi, 64
178  dec ecx
179  jnz loopMemToL1
180 
181  pop edi // Now copy from L1 to system memory
182  push esi // save the source cursor while esi reads from the staging buffer
183  mov esi, tbuf
184  mov ecx, 32 // 32 iterations * 64 bytes = 2048 bytes
185 
186 loopL1ToMem:
187  movq mm1, 0[ESI] // Read in source data from L1
188  movq mm2, 8[ESI]
189  movq mm3, 16[ESI]
190  movq mm4, 24[ESI]
191  movq mm5, 32[ESI]
192  movq mm6, 40[ESI]
193  movq mm7, 48[ESI]
194  movq mm0, 56[ESI]
195 
196  movntq 0[EDI], mm1 // Non-temporal stores
197  movntq 8[EDI], mm2
198  movntq 16[EDI], mm3
199  movntq 24[EDI], mm4
200  movntq 32[EDI], mm5
201  movntq 40[EDI], mm6
202  movntq 48[EDI], mm7
203  movntq 56[EDI], mm0
204 
205  add esi, 64
206  add edi, 64
207  dec ecx
208  jnz loopL1ToMem
209 
210  pop esi // Do next 2k block
211  dec ebx
212  jnz loop2k
213  pop ebx // restore the register saved at the top of the asm block
214  }
215  EMMS_INSTRUCTION // expands to "__asm emms" (see the #define at the top of this section)
216 }
217 
218 
219 /*
220 ================
221 idSIMD_MMX::Memcpy
222 
223  optimized memory copy routine that handles all alignment cases and block sizes efficiently
224 ================
225 */
226 void VPCALL idSIMD_MMX::Memcpy( void *dest0, const void *src0, const int count0 ) {
227  // if copying more than 16 bytes and we can copy 8 byte aligned
228  if ( count0 > 16 && !( ( (int)dest0 ^ (int)src0 ) & 7 ) ) {
229  byte *dest = (byte *)dest0;
230  byte *src = (byte *)src0;
231 
232  // copy up to the first 8 byte aligned boundary
233  int count = ((int)dest) & 7;
234  memcpy( dest, src, count );
235  dest += count;
236  src += count;
237  count = count0 - count;
238 
239  // if there are multiple blocks of 2kB
240  if ( count & ~4095 ) {
241  MMX_Memcpy2kB( dest, src, count );
242  src += (count & ~2047);
243  dest += (count & ~2047);
244  count &= 2047;
245  }
246 
247  // if there are blocks of 64 bytes
248  if ( count & ~63 ) {
249  MMX_Memcpy64B( dest, src, count );
250  src += (count & ~63);
251  dest += (count & ~63);
252  count &= 63;
253  }
254 
255  // if there are blocks of 8 bytes
256  if ( count & ~7 ) {
257  MMX_Memcpy8B( dest, src, count );
258  src += (count & ~7);
259  dest += (count & ~7);
260  count &= 7;
261  }
262 
263  // copy any remaining bytes
264  memcpy( dest, src, count );
265  } else {
266  // use the regular one if we cannot copy 8 byte aligned
267  memcpy( dest0, src0, count0 );
268  }
269 
270  // the MMX_Memcpy* functions use MOVNTQ, issue a fence operation
271  __asm {
272  sfence
273  }
274 }
275 
276 /*
277 ================
278 idSIMD_MMX::Memset

  Fills count0 bytes starting at dest0 with val (stored through a byte
  pointer, so only the low byte of val ends up in memory).

  Steps:
    1. single byte stores until dest is 8 byte aligned (or count runs out),
    2. an 8 byte fill pattern is built in the union dat,
    3. if at least 64 bytes remain: 64 bytes per iteration with MOVNTQ,
    4. if at least 8 bytes remain: 8 bytes per iteration with MOVNTQ,
    5. single byte stores for whatever is left,
    6. EMMS followed by SFENCE.
  If the alignment step already consumed the whole count the function returns
  early; no MMX instruction has been executed at that point.
279 ================
280 */
281 void VPCALL idSIMD_MMX::Memset( void* dest0, const int val, const int count0 ) {
282  union {
283  byte bytes[8];
284  word words[4];
285  dword dwords[2];
286  } dat; // 8 byte fill pattern, addressable as bytes, words or dwords
287 
288  byte *dest = (byte *)dest0;
289  int count = count0;
290 
291  while ( count > 0 && (((int)dest) & 7) ) { // byte-wise head until dest is 8 byte aligned
292  *dest = val;
293  dest++;
294  count--;
295  }
296  if ( !count ) { // everything was written by the head loop
297  return;
298  }
299 
300  dat.bytes[0] = val; // replicate the byte: 1 -> 2 bytes ...
301  dat.bytes[1] = val;
302  dat.words[1] = dat.words[0]; // ... 2 -> 4 bytes (assumes word is 2 bytes wide) ...
303  dat.dwords[1] = dat.dwords[0]; // ... 4 -> 8 bytes (assumes dword is 4 bytes wide)
304 
305  if ( count >= 64 ) {
306  __asm {
307  mov edi, dest
308  mov ecx, count
309  shr ecx, 6 // 64 bytes per iteration
310  movq mm1, dat // Read in source data
311  movq mm2, mm1 // copy the pattern into the remaining MMX registers
312  movq mm3, mm1
313  movq mm4, mm1
314  movq mm5, mm1
315  movq mm6, mm1
316  movq mm7, mm1
317  movq mm0, mm1
318 loop1:
319  movntq 0[EDI], mm1 // Non-temporal stores
320  movntq 8[EDI], mm2
321  movntq 16[EDI], mm3
322  movntq 24[EDI], mm4
323  movntq 32[EDI], mm5
324  movntq 40[EDI], mm6
325  movntq 48[EDI], mm7
326  movntq 56[EDI], mm0
327 
328  add edi, 64
329  dec ecx
330  jnz loop1
331  }
332  dest += ( count & ~63 ); // the asm advanced its own copy of the pointer; update the C++ variables to match
333  count &= 63;
334  }
335 
336  if ( count >= 8 ) {
337  __asm {
338  mov edi, dest
339  mov ecx, count
340  shr ecx, 3 // 8 bytes per iteration
341  movq mm1, dat // Read in source data
342 loop2:
343  movntq 0[EDI], mm1 // Non-temporal stores
344 
345  add edi, 8
346  dec ecx
347  jnz loop2
348  }
349  dest += (count & ~7); // skip past the bytes written by the asm loop
350  count &= 7;
351  }
352 
353  while ( count > 0 ) { // byte-wise tail
354  *dest = val;
355  dest++;
356  count--;
357  }
358 
359  EMMS_INSTRUCTION // expands to "__asm emms" (see the #define at the top of this section)
360 
361  // the fill loops above use MOVNTQ, issue a fence operation
362  __asm {
363  sfence
364  }
365 }
366 
367 #endif /* _WIN32 */
unsigned int dword
Definition: Lib.h:77
virtual void VPCALL Memset(void *dst, const int val, const int count)
case const int
Definition: Callbacks.cpp:52
virtual void VPCALL Memcpy(void *dst, const void *src, const int count)
GLuint src
Definition: glext.h:5390
GLuint GLuint GLsizei count
Definition: glext.h:2845
unsigned short word
Definition: Lib.h:76
unsigned char byte
Definition: Lib.h:75
#define VPCALL
Definition: Simd.h:63
virtual const char *VPCALL GetName(void) const