Simd_3DNow.cpp
/*
===========================================================================

Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").

Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/
28 
29 #include "../precompiled.h"
30 #pragma hdrstop
31 
32 #include "Simd_Generic.h"
33 #include "Simd_MMX.h"
34 #include "Simd_3DNow.h"
35 
36 
37 //===============================================================
38 //
39 // 3DNow! implementation of idSIMDProcessor
40 //
41 //===============================================================
42 
43 #ifdef _WIN32
44 
45 /*
46 ============
47 idSIMD_3DNow::GetName
48 ============
49 */
50 const char * idSIMD_3DNow::GetName( void ) const {
51  return "MMX & 3DNow!";
52 }
53 
54 // Very optimized memcpy() routine for all AMD Athlon and Duron family.
55 // This code uses any of FOUR different basic copy methods, depending
56 // on the transfer size.
57 // NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
58 // "Streaming Store"), and also uses the software prefetchnta instructions,
59 // be sure you're running on Athlon/Duron or other recent CPU before calling!
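// A minimal sketch of the kind of runtime check the NOTE above calls for,
// assuming the MSVC <intrin.h> __cpuid intrinsic; the helper name is
// illustrative only, and the engine performs its own CPU detection elsewhere.
#include <intrin.h>

static bool Sketch_HasMMXAnd3DNow( void ) {
	int info[4];
	__cpuid( info, 0x80000000 );				// highest supported extended CPUID function
	if ( (unsigned int)info[0] < 0x80000001 ) {
		return false;							// no extended feature flags available
	}
	__cpuid( info, 0x80000001 );				// AMD extended feature flags
	bool has3DNow = ( (unsigned int)info[3] & ( 1u << 31 ) ) != 0;	// EDX bit 31: 3DNow!
	__cpuid( info, 1 );							// standard feature flags
	bool hasMMX = ( info[3] & ( 1 << 23 ) ) != 0;					// EDX bit 23: MMX
	return hasMMX && has3DNow;
}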

#define TINY_BLOCK_COPY 64			// upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".

#define IN_CACHE_COPY 64 * 1024		// upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY 197 * 1024	// upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"

#define BLOCK_PREFETCH_COPY infinity	// no limit for movq/movntq w/block prefetch
#define CACHEBLOCK 80h					// number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.

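// A minimal C++ sketch of the size dispatch the assembly below implements,
// using the thresholds above. memcpy() stands in for the four hand-written
// copy loops, and each branch names the asm label that really handles that
// range; this is illustrative, not part of the engine.
#include <string.h>

static void Sketch_MemcpyDispatch( void *dest, const void *src, int n ) {
	if ( n < TINY_BLOCK_COPY ) {
		memcpy( dest, src, n );		// $memcpy_ic_3: unrolled movsd plus a rep movsb tail
	} else if ( n < IN_CACHE_COPY ) {
		memcpy( dest, src, n );		// $memcpy_ic_1: MMX movq copy with prefetchnta
	} else if ( n < UNCACHED_COPY ) {
		memcpy( dest, src, n );		// $memcpy_uc_1: movntq streaming stores with prefetchnta
	} else {
		memcpy( dest, src, n );		// $memcpy_bp_1: block prefetch plus movntq streaming stores
	}
}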
/*
================
idSIMD_3DNow::Memcpy

  optimized memory copy routine that handles all alignment cases and block sizes efficiently
================
*/
void VPCALL idSIMD_3DNow::Memcpy( void *dest, const void *src, const int n ) {
	__asm {

	mov		ecx, [n]				// number of bytes to copy
	mov		edi, [dest]				// destination
	mov		esi, [src]				// source
	mov		ebx, ecx				// keep a copy of count

	cld
	cmp		ecx, TINY_BLOCK_COPY
	jb		$memcpy_ic_3			// tiny? skip mmx copy

	cmp		ecx, 32*1024			// don't align between 32k-64k because
	jbe		$memcpy_do_align		// it appears to be slower
	cmp		ecx, 64*1024
	jbe		$memcpy_align_done
$memcpy_do_align:
	mov		ecx, 8					// a trick that's faster than rep movsb...
	sub		ecx, edi				// align destination to qword
	and		ecx, 111b				// get the low bits
	sub		ebx, ecx				// update copy count
	neg		ecx						// set up to jump into the array
	add		ecx, offset $memcpy_align_done
	jmp		ecx						// jump to array of movsb's

align 4
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb

$memcpy_align_done:					// destination is qword aligned
	mov		ecx, ebx				// number of bytes left to copy
	shr		ecx, 6					// get 64-byte block count
	jz		$memcpy_ic_2			// finish the last few bytes

	cmp		ecx, IN_CACHE_COPY/64	// too big for cache? use uncached copy
	jae		$memcpy_uc_test

// This is a small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1:						// 64-byte block copies, in-cache copy

	prefetchnta [esi + (200*64/34+192)]		// start reading ahead

	movq	mm0, [esi+0]			// read 64 bits
	movq	mm1, [esi+8]
	movq	[edi+0], mm0			// write 64 bits
	movq	[edi+8], mm1			// note: the normal movq writes the
	movq	mm2, [esi+16]			// data to cache; a cache line will be
	movq	mm3, [esi+24]			// allocated as needed, to store the data
	movq	[edi+16], mm2
	movq	[edi+24], mm3
	movq	mm0, [esi+32]
	movq	mm1, [esi+40]
	movq	[edi+32], mm0
	movq	[edi+40], mm1
	movq	mm2, [esi+48]
	movq	mm3, [esi+56]
	movq	[edi+48], mm2
	movq	[edi+56], mm3

	add		esi, 64					// update source pointer
	add		edi, 64					// update destination pointer
	dec		ecx						// count down
	jnz		$memcpy_ic_1			// last 64-byte block?

$memcpy_ic_2:
	mov		ecx, ebx				// has valid low 6 bits of the byte count
$memcpy_ic_3:
	shr		ecx, 2					// dword count
	and		ecx, 1111b				// only look at the "remainder" bits
	neg		ecx						// set up to jump into the array
	add		ecx, offset $memcpy_last_few
	jmp		ecx						// jump to array of movsd's

$memcpy_uc_test:
	cmp		ecx, UNCACHED_COPY/64	// big enough? use block prefetch copy
	jae		$memcpy_bp_1

$memcpy_64_test:
	or		ecx, ecx				// tail end of block prefetch will jump here
	jz		$memcpy_ic_2			// no more 64-byte blocks left

// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1:						// 64-byte blocks, uncached copy

	prefetchnta [esi + (200*64/34+192)]		// start reading ahead

	movq	mm0, [esi+0]			// read 64 bits
	add		edi, 64					// update destination pointer
	movq	mm1, [esi+8]
	add		esi, 64					// update source pointer
	movq	mm2, [esi-48]
	movntq	[edi-64], mm0			// write 64 bits, bypassing the cache
	movq	mm0, [esi-40]			// note: movntq also prevents the CPU
	movntq	[edi-56], mm1			// from READING the destination address
	movq	mm1, [esi-32]			// into the cache, only to be over-written
	movntq	[edi-48], mm2			// so that also helps performance
	movq	mm2, [esi-24]
	movntq	[edi-40], mm0
	movq	mm0, [esi-16]
	movntq	[edi-32], mm1
	movq	mm1, [esi-8]
	movntq	[edi-24], mm2
	movntq	[edi-16], mm0
	dec		ecx
	movntq	[edi-8], mm1
	jnz		$memcpy_uc_1			// last 64-byte block?

	jmp		$memcpy_ic_2			// almost done

// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch, in this case.
// The technique is great for getting maximum read bandwidth,
// especially in DDR memory systems.
$memcpy_bp_1:						// large blocks, block prefetch copy

	cmp		ecx, CACHEBLOCK			// big enough to run another prefetch loop?
	jl		$memcpy_64_test			// no, back to regular uncached copy

	mov		eax, CACHEBLOCK / 2		// block prefetch loop, unrolled 2X
	add		esi, CACHEBLOCK * 64	// move to the top of the block
align 16
$memcpy_bp_2:
	mov		edx, [esi-64]			// grab one address per cache line
	mov		edx, [esi-128]			// grab one address per cache line
	sub		esi, 128				// go reverse order
	dec		eax						// count down the cache lines
	jnz		$memcpy_bp_2			// keep grabbing more lines into cache

	mov		eax, CACHEBLOCK			// now that it's in cache, do the copy
align 16
$memcpy_bp_3:
	movq	mm0, [esi   ]			// read 64 bits
	movq	mm1, [esi+ 8]
	movq	mm2, [esi+16]
	movq	mm3, [esi+24]
	movq	mm4, [esi+32]
	movq	mm5, [esi+40]
	movq	mm6, [esi+48]
	movq	mm7, [esi+56]
	add		esi, 64					// update source pointer
	movntq	[edi   ], mm0			// write 64 bits, bypassing cache
	movntq	[edi+ 8], mm1			// note: movntq also prevents the CPU
	movntq	[edi+16], mm2			// from READING the destination address
	movntq	[edi+24], mm3			// into the cache, only to be over-written,
	movntq	[edi+32], mm4			// so that also helps performance
	movntq	[edi+40], mm5
	movntq	[edi+48], mm6
	movntq	[edi+56], mm7
	add		edi, 64					// update dest pointer

	dec		eax						// count down

	jnz		$memcpy_bp_3			// keep copying
	sub		ecx, CACHEBLOCK			// update the 64-byte block count
	jmp		$memcpy_bp_1			// keep processing chunks

// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
align 4
	movsd
	movsd							// perform last 1-15 dword copies
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd

$memcpy_last_few:					// dword aligned from before movsd's
	mov		ecx, ebx				// has valid low 2 bits of the byte count
	and		ecx, 11b				// the last few cows must come home
	jz		$memcpy_final			// no more, let's leave
	rep		movsb					// the last 1, 2, or 3 bytes

$memcpy_final:
	emms							// clean up the MMX state
	sfence							// flush the write buffer
	mov		eax, [dest]				// ret value = destination pointer

	}
}

#endif /* _WIN32 */
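// A minimal usage sketch (illustrative, not engine code). The engine normally
// reaches this routine through its SIMD processor dispatch after CPU
// detection; on a suitable CPU the class can also be exercised directly:
//
//   idSIMD_3DNow simd;
//   unsigned char src[1024], dst[1024];
//   simd.Memcpy( dst, src, sizeof( dst ) );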