%include "defs.asm" ;************************* memset64.asm ************************************* ; Author: Agner Fog ; Date created: 2008-07-19 ; Last modified: 2016-11-12 (patched version with AVX512 support removed) ; Description: ; Faster version of the standard memset function: ; void * A_memset(void * dest, int c, size_t count); ; Sets 'count' bytes from 'dest' to the 8-bit value 'c' ; ; Overriding standard function memset: ; The alias ?OVR_memset is changed to _memset in the object file if ; it is desired to override the standard library function memset. ; ; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset ; extern "C" void SetMemsetCacheLimit(); // Change limit in GetMemsetCacheLimit ; ; Optimization: ; Uses XMM registers to set 16 bytes at a time, aligned. ; ; The latest version of this file is available at: ; www.agner.org/optimize/asmexamples.zip ; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses ;****************************************************************************** default rel global A_memset: function ; Function memset global EXP(memset): function ; ?OVR removed if standard function memset overridden global memsetSSE2: function ; SSE2 version global memsetAVX: function ; version for CPUs with fast 256-bit store global GetMemsetCacheLimit: function ; Data blocks bigger than this will be stored uncached by memset global SetMemsetCacheLimit: function ; Change limit in GetMemsetCacheLimit ; Imported from cachesize64.asm: extern DataCacheSize ; Get size of data cache ; Imported from unalignedisfaster64.asm: extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores ; Define prolog for this function %MACRO PROLOGM 0 %IFDEF WINDOWS %define Rdest rcx ; dest movzx eax, dl ; c mov rdx, r8 ; count %define Rcount rdx ; count %define Rdest2 r9 ; copy of dest %define Rcount2 r8 ; copy of count %ELSE ; Unix %define Rdest rdi ; dest movzx eax, sil ; c %define Rcount rdx ; count %define Rdest2 rcx ; copy of dest %define Rcount2 rsi ; copy of count mov Rcount2, Rcount ; copy count %ENDIF %ENDMACRO SECTION .text align=16 ; extern "C" void * memset(void * dest, int c, size_t count); ; Function entry: A_memset: EXP(memset): jmp [memsetDispatch] ; CPU dispatch table memsetAVX: ; AVX version. 
memsetAVX:   ; AVX version. Use ymm register
memsetAVX@:  ; local label
        PROLOGM
        imul    eax, 01010101H         ; Broadcast c into all bytes of eax
        mov     Rdest2, Rdest          ; save dest
        cmp     Rcount, 16
        ja      B100
B050:   lea     r10, [MemsetJTab]      ; SSE2 version comes in here
        jmp     qword [r10+Rcount*8]   ; jump table for small counts

; Separate code for each count from 0 to 16:
M16:    mov     [Rdest+12], eax
M12:    mov     [Rdest+8],  eax
M08:    mov     [Rdest+4],  eax
M04:    mov     [Rdest],    eax
M00:    mov     rax, Rdest2            ; return dest
        ret

M15:    mov     [Rdest+11], eax
M11:    mov     [Rdest+7],  eax
M07:    mov     [Rdest+3],  eax
M03:    mov     [Rdest+1],  ax
M01:    mov     [Rdest],    al
        mov     rax, Rdest2            ; return dest
        ret

M14:    mov     [Rdest+10], eax
M10:    mov     [Rdest+6],  eax
M06:    mov     [Rdest+2],  eax
M02:    mov     [Rdest],    ax
        mov     rax, Rdest2            ; return dest
        ret

M13:    mov     [Rdest+9],  eax
M09:    mov     [Rdest+5],  eax
M05:    mov     [Rdest+1],  eax
        mov     [Rdest],    al
        mov     rax, Rdest2            ; return dest
        ret

B100:   ; AVX version, Rcount > 16
        movd    xmm0, eax
        pshufd  xmm0, xmm0, 0          ; Broadcast c into all bytes of xmm0
        lea     rax, [Rdest+Rcount]    ; point to end

        cmp     Rcount, 20H
        jbe     K600                   ; faster to use xmm registers if small

        ; Store the first possibly unaligned 16 bytes
        ; It is faster to always write 16 bytes, possibly overlapping
        ; with the subsequent regular part, than to make possibly mispredicted
        ; branches depending on the size of the first part.
        movups  oword [Rdest], xmm0

        ; store another 16 bytes, aligned
        add     Rdest, 10H
        and     Rdest, -10H
        movaps  oword [Rdest], xmm0

        ; go to next 32 bytes boundary
        add     Rdest, 10H
        and     Rdest, -20H

        ; Check if count very big
        cmp     Rcount, [MemsetCacheLimit]
        ja      K300                   ; Use non-temporal store if count > MemsetCacheLimit

        ; find last 32 bytes boundary
        mov     Rcount, rax
        and     Rcount, -20H

        ; - size of 32-bytes blocks
        sub     Rdest, Rcount
        jnb     K200                   ; Jump if not negative

        ; extend value to 256 bits
        vinsertf128 ymm0, ymm0, xmm0, 1

align 16
K100:   ; Loop through 32-bytes blocks. Register use is swapped
        ; Rcount = end of 32-bytes blocks part
        ; Rdest = negative index from the end, counting up to zero
        vmovaps [Rcount+Rdest], ymm0
        add     Rdest, 20H
        jnz     K100
        vzeroupper

K200:   ; the last part from Rcount to rax is < 32 bytes. write last 32 bytes with overlap
        movups  [rax-20H], xmm0
        movups  [rax-10H], xmm0
        mov     rax, Rdest2            ; return dest
        ret

K300:   ; Use non-temporal moves, same code as above:
        ; find last 32 bytes boundary
        mov     Rcount, rax
        and     Rcount, -20H

        ; - size of 32-bytes blocks
        sub     Rdest, Rcount
        jnb     K500                   ; Jump if not negative

        ; extend value to 256 bits
        vinsertf128 ymm0, ymm0, xmm0, 1

align 16
K400:   ; Loop through 32-bytes blocks. Register use is swapped
        ; Rcount = end of 32-bytes blocks part
        ; Rdest = negative index from the end, counting up to zero
        vmovntps [Rcount+Rdest], ymm0
        add     Rdest, 20H
        jnz     K400
        sfence
        vzeroupper

K500:   ; the last part from Rcount to rax is < 32 bytes. write last 32 bytes with overlap
        movups  [rax-20H], xmm0
        movups  [rax-10H], xmm0
        mov     rax, Rdest2            ; return dest
        ret

K600:   ; 16 < count <= 32
        movups  [Rdest], xmm0
        movups  [rax-10H], xmm0
        mov     rax, Rdest2            ; return dest
        ret
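; Worked example of the negative-index loops above (K100/K400; the M200/M600
; loops below use the same idiom with 16-byte blocks). The numbers are
; illustrative only:
;   dest = 100DH, count = 65H  ->  rax = dest + count = 1072H
;   after the two overlapping head stores, Rdest is aligned up to 1020H
;   Rcount = rax and -20H = 1060H (end of the aligned middle part)
;   Rdest  = 1020H - 1060H = -40H (negative offset from that end)
;   the loop stores 32 bytes at [1060H-40H] = 1020H and [1060H-20H] = 1040H,
;   then Rdest reaches zero; the tail stores at [rax-20H] and [rax-10H]
;   cover the remaining bytes up to 1071H.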
memsetSSE2:   ; count > 16. Use SSE2 instruction set
memsetSSE2@:  ; local label
        PROLOGM
        imul    eax, 01010101H         ; Broadcast c into all bytes of eax
        mov     Rdest2, Rdest          ; save dest
        cmp     Rcount, 16
        jna     B050
        movd    xmm0, eax
        pshufd  xmm0, xmm0, 0          ; Broadcast c into all bytes of xmm0

        ; Store the first unaligned part.
        ; The size of this part is 1 - 16 bytes.
        ; It is faster to always write 16 bytes, possibly overlapping
        ; with the subsequent regular part, than to make possibly mispredicted
        ; branches depending on the size of the first part.
        movq    qword [Rdest], xmm0
        movq    qword [Rdest+8], xmm0

        ; Check if count very big
M150:   mov     rax, [MemsetCacheLimit]
        cmp     Rcount, rax
        ja      M500                   ; Use non-temporal store if count > MemsetCacheLimit

        ; Point to end of regular part:
        ; Round down dest+count to nearest preceding 16-bytes boundary
        lea     Rcount, [Rdest+Rcount-1]
        and     Rcount, -10H

        ; Point to start of regular part:
        ; Round up dest to next 16-bytes boundary
        add     Rdest, 10H
        and     Rdest, -10H

        ; -(size of regular part)
        sub     Rdest, Rcount
        jnb     M300                   ; Jump if not negative

align 16
M200:   ; Loop through regular part
        ; Rcount = end of regular part
        ; Rdest = negative index from the end, counting up to zero
        movdqa  [Rcount+Rdest], xmm0
        add     Rdest, 10H
        jnz     M200

M300:   ; Do the last irregular part
        ; The size of this part is 1 - 16 bytes.
        ; It is faster to always write 16 bytes, possibly overlapping
        ; with the preceding regular part, than to make possibly mispredicted
        ; branches depending on the size of the last part.
        mov     rax, Rdest2            ; dest
        movq    qword [rax+Rcount2-10H], xmm0
        movq    qword [rax+Rcount2-8], xmm0
        ret

M500:   ; Use non-temporal moves, same code as above:
        ; End of regular part:
        ; Round down dest+count to nearest preceding 16-bytes boundary
        lea     Rcount, [Rdest+Rcount-1]
        and     Rcount, -10H

        ; Start of regular part:
        ; Round up dest to next 16-bytes boundary
        add     Rdest, 10H
        and     Rdest, -10H

        ; -(size of regular part)
        sub     Rdest, Rcount
        jnb     M700                   ; Jump if not negative

align 16
M600:   ; Loop through regular part
        ; Rcount = end of regular part
        ; Rdest = negative index from the end, counting up to zero
        movntdq [Rcount+Rdest], xmm0
        add     Rdest, 10H
        jnz     M600
        sfence

M700:   ; Do the last irregular part
        ; The size of this part is 1 - 16 bytes.
        ; It is faster to always write 16 bytes, possibly overlapping
        ; with the preceding regular part, than to make possibly mispredicted
        ; branches depending on the size of the last part.
        mov     rax, Rdest2            ; dest
        movq    qword [rax+Rcount2-10H], xmm0
        movq    qword [rax+Rcount2-8], xmm0
        ret


memsetCPUDispatch:   ; CPU dispatcher, check for instruction sets and which method is fastest
        ; This part is executed only once
        push    rbx
        push    rcx
        push    rdx
        push    rsi
        push    rdi
        push    r8

        ; set CacheBypassLimit to half the size of the largest level cache
        call    GetMemsetCacheLimit@
        lea     rbx, [memsetSSE2@]
        call    Store256BitIsFaster    ; Test if 256-bit read/write is available and faster than 128-bit read/write
        test    eax, eax
        jz      Q100
        lea     rbx, [memsetAVX@]

Q100:   ; Insert appropriate pointer
        mov     [memsetDispatch], rbx
        mov     rax, rbx

        pop     r8
        pop     rdi
        pop     rsi
        pop     rdx
        pop     rcx
        pop     rbx

        ; Jump according to the replaced function pointer
        jmp     rax
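; Illustrative example of the resulting behaviour (cache size assumed, not
; measured): if DataCacheSize reports an 8 Mbyte largest-level cache,
; GetMemsetCacheLimit below returns 4 Mbytes, so an A_memset of 16 Mbytes
; takes the non-temporal K300/M500 paths while an A_memset of 1 Mbyte uses
; ordinary cached stores.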
; extern "C" size_t GetMemsetCacheLimit();  // Data blocks bigger than this will be stored uncached by memset

GetMemsetCacheLimit:
GetMemsetCacheLimit@:
        mov     rax, [MemsetCacheLimit]
        test    rax, rax
        jnz     U200

        ; Get half the size of the largest level cache
%ifdef  WINDOWS
        xor     ecx, ecx               ; 0 means largest level cache
%else
        xor     edi, edi               ; 0 means largest level cache
%endif
        call    DataCacheSize          ; get cache size
        shr     eax, 1                 ; half the size
        jnz     U100
        mov     eax, 400000H           ; cannot determine cache size. Use 4 Mbytes

U100:   mov     [MemsetCacheLimit], eax
U200:   ret


; extern "C" void SetMemsetCacheLimit(size_t);  // Change limit in GetMemsetCacheLimit

SetMemsetCacheLimit:
%ifdef  WINDOWS
        mov     rax, rcx
%else
        mov     rax, rdi
%endif
        test    rax, rax
        jnz     U400
        ; zero means default
        mov     [MemsetCacheLimit], rax
        call    GetMemsetCacheLimit@

U400:   mov     [MemsetCacheLimit], rax
        ret


SECTION .data
align 16

; Jump table for count from 0 to 16:
MemsetJTab: DQ M00, M01, M02, M03, M04, M05, M06, M07
            DQ M08, M09, M10, M11, M12, M13, M14, M15, M16

; Pointer to appropriate version.
; This initially points to memsetCPUDispatch. memsetCPUDispatch will
; change this to the appropriate version of memset, so that
; memsetCPUDispatch is only executed once:
memsetDispatch: DQ memsetCPUDispatch

; Bypass cache by using non-temporal moves if count > MemsetCacheLimit
; The optimal value of MemsetCacheLimit is difficult to estimate, but
; a reasonable value is half the size of the largest cache
MemsetCacheLimit: DQ 0
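; Illustrative only (comment, not assembled): changing the non-temporal store
; threshold from NASM under the Unix (System V) calling convention, where the
; new limit is passed in rdi (rcx under Windows); a value of 0 restores the
; default computed by GetMemsetCacheLimit:
;         mov     edi, 100000H         ; new limit = 1 Mbyte
;         call    SetMemsetCacheLimit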