/*
===========================================================================
Copyright (C) 1999-2005 Id Software, Inc.

This file is part of Quake III Arena source code.

Quake III Arena source code is free software; you can redistribute it
and/or modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.

Quake III Arena source code is distributed in the hope that it will be
useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Quake III Arena source code; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
===========================================================================
*/
/**
 * GAS syntax equivalents of the MSVC asm memory calls in common.c
 *
 * The following changes have been made to the asm:
 * 1. Registers are loaded by the inline asm arguments when possible
 * 2. Labels have been changed to local label format (0,1,etc.) to allow inlining
 *
 * HISTORY:
 *	AH - Created on 08 Dec 2000
 */

#include <stddef.h>   // AH - for size_t
#include <string.h>

// bk001207 - we need something under Linux, too. Mac?
//
// The condition below is hard-wired to 1, so the portable C implementations
// are always the ones that get compiled; the inline-assembly variants in the
// #else branch are kept for reference only.
#if 1 // defined(C_ONLY) // bk010102 - dedicated?

/**
 * Copy `count` bytes from `src` to `dest` using the C library memcpy.
 *
 * The regions must not overlap (memcpy contract). A zero-length request
 * returns immediately without touching either pointer, mirroring the
 * "count = 0" check in the assembly variant below.
 */
void Com_Memcpy (void* dest, const void* src, const size_t count) {
	if (count == 0) {
		return;
	}
	memcpy(dest, src, count);
}

/**
 * Fill `count` bytes starting at `dest` with the byte value of `val`
 * (converted to unsigned char by memset).
 *
 * A zero-length request returns immediately without touching `dest`.
 */
void Com_Memset (void* dest, const int val, const size_t count) {
	if (count == 0) {
		return;
	}
	memset(dest, val, count);
}

#else

/*
 * ---------------------------------------------------------------------------
 * Disabled x86 (32-bit) inline-assembly variants.
 *
 * NOTE(review): nothing in this branch is compiled (see the "#if 1" above).
 * Observations to confirm before ever re-enabling it:
 *  - Each asm template is one string literal assembled with backslash-newline
 *    splices, so the literal contains no '\n' between instructions and the
 *    "//" remarks are part of the text handed to the assembler.
 *  - Several templates modify registers that are declared as input-only
 *    operands ("a", "c", "d") without listing them as outputs.
 *  - Com_Memcpy reads its "m" operand (%0) after two pushl instructions;
 *    if the compiler addresses it relative to %esp the offset would be off.
 *  - Com_Memcpy masks with 0xfffffe00 where its own remark says count&~31.
 *  - The Com_Memset tail masks with 0xffffff00 although _copyDWord is asked
 *    to fill count/4 dwords; `val` is also not reduced to 8 bits before the
 *    pattern is replicated on the count >= 8 path.
 * ---------------------------------------------------------------------------
 */

typedef enum
{
	PRE_READ,		// prefetch assuming that buffer is used for reading only
	PRE_WRITE,		// prefetch assuming that buffer is used for writing only
	PRE_READ_WRITE	// prefetch assuming that buffer is used for both reading and writing
} e_prefetch;

void Com_Prefetch (const void *s, const unsigned int bytes, e_prefetch type);

/**
 * Store `constant` into `count` consecutive 32-bit words starting at `dest`.
 *
 * The value is duplicated into both halves of %mm0; one leading dword store
 * is issued when (dest & 7) != 0, then blocks of 16 dwords are written with
 * eight movq stores per iteration, followed by 8/4/2/1-dword remainders.
 * emms is executed before leaving the asm statement.
 */
void _copyDWord (unsigned int* dest, const unsigned int constant, const unsigned int count) {
	// MMX version not used on standard Pentium MMX
	// because the dword version is faster (with
	// proper destination prefetching)
		__asm__ __volatile__ (" \
			//mov			eax,constant		// eax = val \
			//mov			edx,dest			// dest \
			//mov			ecx,count \
			movd		%%eax, %%mm0 \
			punpckldq	%%mm0, %%mm0 \
\
			// ensure that destination is qword aligned \
\
			testl		$7, %%edx				// qword padding?\
			jz		0f \
			movl		%%eax, (%%edx) \
			decl		%%ecx \
			addl		$4, %%edx \
\
0:			movl		%%ecx, %%ebx \
			andl		$0xfffffff0, %%ecx \
			jz		2f \
			jmp		1f \
			.align 		16 \
\
			// funny ordering here to avoid commands \
			// that cross 32-byte boundaries (the \
			// [edx+0] version has a special 3-byte opcode... \
1:			movq		%%mm0, 8(%%edx) \
			movq		%%mm0, 16(%%edx) \
			movq		%%mm0, 24(%%edx) \
			movq		%%mm0, 32(%%edx) \
			movq		%%mm0, 40(%%edx) \
			movq		%%mm0, 48(%%edx) \
			movq		%%mm0, 56(%%edx) \
			movq		%%mm0, (%%edx)\
			addl		$64, %%edx \
			subl		$16, %%ecx \
			jnz		1b \
2: \
			movl		%%ebx, %%ecx				// ebx = cnt \
			andl		$0xfffffff0, %%ecx				// ecx = cnt&~15 \
			subl		%%ecx, %%ebx \
			jz		6f \
			cmpl		$8, %%ebx \
			jl		3f \
\
			movq		%%mm0, (%%edx) \
			movq		%%mm0, 8(%%edx) \
			movq		%%mm0, 16(%%edx) \
			movq		%%mm0, 24(%%edx) \
			addl		$32, %%edx \
			subl		$8, %%ebx \
			jz		6f \
\
3:			cmpl		$4, %%ebx \
			jl		4f \
\
			movq		%%mm0, (%%edx) \
			movq		%%mm0, 8(%%edx) \
			addl		$16, %%edx \
			subl		$4, %%ebx \
\
4:			cmpl		$2, %%ebx \
			jl		5f \
			movq		%%mm0, (%%edx) \
			addl		$8, %%edx \
			subl		$2, %%ebx \
\
5:			cmpl		$1, %%ebx \
			jl		6f \
			movl		%%eax, (%%edx) \
6: \
			emms \
	"
	: : "a" (constant), "c" (count), "d" (dest)
	: "%ebx", "%edi", "%esi", "cc", "memory");
}

// optimized memory copy routine that handles all alignment
// cases and block sizes efficiently
//
// Calls Com_Prefetch on the source first, then copies in 32-byte blocks
// (two registers alternating), followed by 16/8/4/2/1-byte remainders.
void Com_Memcpy (void* dest, const void* src, const size_t count) {

	Com_Prefetch (src, count, PRE_READ);
	__asm__ __volatile__ (" \
		pushl		%%edi \
		pushl		%%esi \
		//mov		ecx,count \
		cmpl		$0, %%ecx						// count = 0 check (just to be on the safe side) \
		je		6f \
		//mov		edx,dest \
		movl		%0, %%ebx \
		cmpl		$32, %%ecx						// padding only? \
		jl		1f \
\
		movl		%%ecx, %%edi \
		andl		$0xfffffe00, %%edi					// edi = count&~31 \
		subl		$32, %%edi \
\
		.align 16 \
0: \
		movl		(%%ebx, %%edi, 1), %%eax \
		movl		4(%%ebx, %%edi, 1), %%esi \
		movl		%%eax, (%%edx, %%edi, 1) \
		movl		%%esi, 4(%%edx, %%edi, 1) \
		movl		8(%%ebx, %%edi, 1), %%eax \
		movl		12(%%ebx, %%edi, 1), %%esi \
		movl		%%eax, 8(%%edx, %%edi, 1) \
		movl		%%esi, 12(%%edx, %%edi, 1) \
		movl		16(%%ebx, %%edi, 1), %%eax \
		movl		20(%%ebx, %%edi, 1), %%esi \
		movl		%%eax, 16(%%edx, %%edi, 1) \
		movl		%%esi, 20(%%edx, %%edi, 1) \
		movl		24(%%ebx, %%edi, 1), %%eax \
		movl		28(%%ebx, %%edi, 1), %%esi \
		movl		%%eax, 24(%%edx, %%edi, 1) \
		movl		%%esi, 28(%%edx, %%edi, 1) \
		subl		$32, %%edi \
		jge		0b \
\
		movl		%%ecx, %%edi \
		andl		$0xfffffe00, %%edi \
		addl		%%edi, %%ebx					// increase src pointer \
		addl		%%edi, %%edx					// increase dst pointer \
		andl		$31, %%ecx					// new count \
		jz		6f					// if count = 0, get outta here \
\
1: \
		cmpl		$16, %%ecx \
		jl		2f \
		movl		(%%ebx), %%eax \
		movl		%%eax, (%%edx) \
		movl		4(%%ebx), %%eax \
		movl		%%eax, 4(%%edx) \
		movl		8(%%ebx), %%eax \
		movl		%%eax, 8(%%edx) \
		movl		12(%%ebx), %%eax \
		movl		%%eax, 12(%%edx) \
		subl		$16, %%ecx \
		addl		$16, %%ebx \
		addl		$16, %%edx \
2: \
		cmpl		$8, %%ecx \
		jl		3f \
		movl		(%%ebx), %%eax \
		movl		%%eax, (%%edx) \
		movl		4(%%ebx), %%eax \
		subl		$8, %%ecx \
		movl		%%eax, 4(%%edx) \
		addl		$8, %%ebx \
		addl		$8, %%edx \
3: \
		cmpl		$4, %%ecx \
		jl		4f \
		movl		(%%ebx), %%eax	// here 4-7 bytes \
		addl		$4, %%ebx \
		subl		$4, %%ecx \
		movl		%%eax, (%%edx) \
		addl		$4, %%edx \
4:							// 0-3 remaining bytes \
		cmpl		$2, %%ecx \
		jl		5f \
		movw		(%%ebx), %%ax	// two bytes \
		cmpl		$3, %%ecx				// less than 3? \
		movw		%%ax, (%%edx) \
		jl		6f \
		movb		2(%%ebx), %%al	// last byte \
		movb		%%al, 2(%%edx) \
		jmp		6f \
5: \
		cmpl		$1, %%ecx \
		jl		6f \
		movb		(%%ebx), %%al \
		movb		%%al, (%%edx) \
6: \
		popl		%%esi \
		popl		%%edi \
	"
	: : "m" (src), "d" (dest), "c" (count)
	: "%eax", "%ebx", "%edi", "%esi", "cc", "memory");
}

/**
 * Fill `count` bytes at `dest` with the low byte of `val`.
 *
 * Requests shorter than 8 bytes are handled by a small dword/word/byte
 * sequence. Longer requests replicate the byte into a 32-bit pattern,
 * delegate count/4 dwords to _copyDWord, and then write the 0-3 trailing
 * bytes with a second asm statement.
 */
void Com_Memset (void* dest, const int val, const size_t count)
{
	unsigned int fillval;

	if (count < 8)
	{
		__asm__ __volatile__ (" \
			//mov		edx,dest \
			//mov		eax, val \
			movb		%%al, %%ah \
			movl		%%eax, %%ebx \
			andl		$0xffff, %%ebx \
			shll		$16, %%eax \
			addl		%%ebx, %%eax	// eax now contains pattern \
			//mov		ecx,count \
			cmpl		$4, %%ecx \
			jl		0f \
			movl		%%eax, (%%edx)	// copy first dword \
			addl		$4, %%edx \
			subl		$4, %%ecx \
	0:		cmpl		$2, %%ecx \
			jl		1f \
			movw		%%ax, (%%edx)	// copy 2 bytes \
			addl		$2, %%edx \
			subl		$2, %%ecx \
	1:		cmpl		$0, %%ecx \
			je		2f \
			movb		%%al, (%%edx)	// copy single byte \
	2: \
			"
			: : "d" (dest), "a" (val), "c" (count)
			: "%ebx", "%edi", "%esi", "cc", "memory");

		return;
	}

	fillval = val;

	fillval = fillval|(fillval<<8);
	fillval = fillval|(fillval<<16);		// fill dword with 8-bit pattern

	_copyDWord ((unsigned int*)(dest),fillval, count/4);

	__asm__ __volatile__ ("     		// padding of 0-3 bytes \
		//mov		ecx,count \
		movl		%%ecx, %%eax \
		andl		$3, %%ecx \
		jz		1f \
		andl		$0xffffff00, %%eax \
		//mov		ebx,dest \
		addl		%%eax, %%edx \
		movl		%0, %%eax \
		cmpl		$2, %%ecx \
		jl		0f \
		movw		%%ax, (%%edx) \
		cmpl		$2, %%ecx \
		je		1f \
		movb		%%al, 2(%%edx) \
		jmp		1f \
0: \
		cmpl		$0, %%ecx\
		je		1f\
		movb		%%al, (%%edx)\
1: \
	"
	: : "m" (fillval), "c" (count), "d" (dest)
	: "%eax", "%ebx", "%edi", "%esi", "cc", "memory");
}

/**
 * Touch the buffer at `s` one 32-byte step at a time so that it is pulled
 * into the cache before use.
 *
 * `bytes` is clamped to 4096 and rounded up to a whole number of 32-byte
 * steps. PRE_WRITE requests return without doing anything.
 */
void Com_Prefetch (const void *s, const unsigned int bytes, e_prefetch type)
{
	// write buffer prefetching is performed only if
	// the processor benefits from it. Read and read/write
	// prefetching is always performed.

	switch (type)
	{
		case PRE_WRITE : break;
		case PRE_READ:
		case PRE_READ_WRITE:

		__asm__ __volatile__ ("\
			//mov		ebx,s\
			//mov		ecx,bytes\
			cmpl		$4096, %%ecx				// clamp to 4kB\
			jle		0f\
			movl		$4096, %%ecx\
	0:\
			addl		$0x1f, %%ecx\
			shrl		$5, %%ecx					// number of cache lines\
			jz		2f\
			jmp		1f\
\
			.align 16\
	1:		testb		%%al, (%%edx)\
			addl		$32, %%edx\
			decl		%%ecx\
			jnz		1b\
	2:\
		"
		: : "d" (s), "c" (bytes)
		: "%eax", "%ebx", "%edi", "%esi", "memory", "cc");

		break;
	}
}

#endif