%include "defs.asm"

;*************************  memset64.asm  *************************************
; Author:        Agner Fog
; Date created:  2008-07-19
; Last modified: 2016-11-12 (patched version with AVX512 support removed)
; Description:
; Faster version of the standard memset function:
; void * A_memset(void * dest, int c, size_t count);
; Sets 'count' bytes starting at 'dest' to the 8-bit value 'c'.
;
; Overriding standard function memset:
; The alias ?OVR_memset is changed to _memset in the object file if
; it is desired to override the standard library function memset.
;
; extern "C" size_t GetMemsetCacheLimit();            // Data blocks bigger than this will be stored uncached by memset
; extern "C" void SetMemsetCacheLimit(size_t limit);  // Change the limit used by GetMemsetCacheLimit
;
; Optimization:
; Uses XMM or YMM registers to set 16 or 32 aligned bytes at a time.
;
; The latest version of this file is available at:
; www.agner.org/optimize/asmexamples.zip
; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
;******************************************************************************
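; Usage sketch: a minimal C caller for the functions declared above. This is
; an illustration, not part of the library; it assumes the companion object
; files (cachesize64.asm, unalignedisfaster64.asm) are linked in, and the
; buffer name 'buf' is arbitrary.
;
;     #include <stddef.h>
;     extern void * A_memset(void * dest, int c, size_t count);
;     extern size_t GetMemsetCacheLimit(void);
;     extern void SetMemsetCacheLimit(size_t limit);
;
;     int main(void) {
;         char buf[100];
;         A_memset(buf, 0, sizeof(buf));   // first call goes through the CPU dispatcher
;         A_memset(buf, 'x', 50);          // later calls jump straight to the chosen version
;         SetMemsetCacheLimit(1 << 20);    // use non-temporal stores for blocks > 1 MB
;         return 0;
;     }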
default rel

global A_memset: function              ; Function memset
global EXP(memset): function           ; ?OVR removed if standard function memset overridden
global memsetSSE2: function            ; SSE2 version
global memsetAVX: function             ; version for CPUs with fast 256-bit store
global GetMemsetCacheLimit: function   ; Data blocks bigger than this will be stored uncached by memset
global SetMemsetCacheLimit: function   ; Change limit in GetMemsetCacheLimit

; Imported from cachesize64.asm:
extern DataCacheSize                   ; Get size of data cache

; Imported from unalignedisfaster64.asm:
extern Store256BitIsFaster             ; Tells if a 256 bit store is faster than two 128 bit stores

; Define prolog for this function
%MACRO  PROLOGM  0
%IFDEF  WINDOWS
%define Rdest    rcx                   ; dest
        movzx   eax, dl                ; c
        mov     rdx, r8                ; count
%define Rcount   rdx                   ; count
%define Rdest2   r9                    ; copy of dest
%define Rcount2  r8                    ; copy of count

%ELSE                                  ; Unix
%define Rdest    rdi                   ; dest
        movzx   eax, sil               ; c
%define Rcount   rdx                   ; count
%define Rdest2   rcx                   ; copy of dest
%define Rcount2  rsi                   ; copy of count
        mov     Rcount2, Rcount        ; copy count
%ENDIF
%ENDMACRO
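; Register picture after PROLOGM, for reference: Win64 passes dest in rcx,
; c in edx, count in r8; System V (Unix) passes dest in rdi, c in esi,
; count in rdx. Either way, eax holds the zero-extended byte value,
; Rcount holds count, and Rdest2/Rcount2 name the registers used for
; saved copies of dest and count.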
SECTION .text  align=16

; extern "C" void * memset(void * dest, int c, size_t count);
; Function entry:
A_memset:
EXP(memset):
        jmp     [memsetDispatch]       ; jump via CPU dispatch pointer (set on first call)

memsetAVX:                             ; AVX version. Use ymm register
memsetAVX@:                            ; local label
        PROLOGM
        imul    eax, 01010101H         ; Broadcast c into all bytes of eax
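                                       ; e.g. c = 5AH gives eax = 5A5A5A5AH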
        mov     Rdest2, Rdest          ; save dest
        cmp     Rcount, 16
        ja      B100
B050:   lea     r10, [MemsetJTab]      ; SSE2 version comes in here
        jmp     qword [r10+Rcount*8]   ; jump table for small counts

; Separate code for each count from 0 to 16:
M16:    mov     [Rdest+12], eax
M12:    mov     [Rdest+8],  eax
M08:    mov     [Rdest+4],  eax
M04:    mov     [Rdest],    eax
M00:    mov     rax, Rdest2            ; return dest
        ret

M15:    mov     [Rdest+11], eax
M11:    mov     [Rdest+7],  eax
M07:    mov     [Rdest+3],  eax
M03:    mov     [Rdest+1],  ax
M01:    mov     [Rdest],    al
        mov     rax, Rdest2            ; return dest
        ret

M14:    mov     [Rdest+10], eax
M10:    mov     [Rdest+6],  eax
M06:    mov     [Rdest+2],  eax
M02:    mov     [Rdest],    ax
        mov     rax, Rdest2            ; return dest
        ret

M13:    mov     [Rdest+9],  eax
M09:    mov     [Rdest+5],  eax
M05:    mov     [Rdest+1],  eax
        mov     [Rdest],    al
        mov     rax, Rdest2            ; return dest
        ret
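; How the fall-through chains cover every count: each entry point writes the
; highest bytes and falls into the next label, which writes lower (possibly
; overlapping) bytes. For example, count = 13 enters at M13 and performs
;   dword at [Rdest+9]  (bytes 9..12)
;   dword at [Rdest+5]  (bytes 5..8)
;   dword at [Rdest+1]  (bytes 1..4)
;   byte  at [Rdest]    (byte 0)
; so all 13 bytes are written with four stores and no branches.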

B100:                                  ; AVX version, Rcount > 16
        movd    xmm0, eax
        pshufd  xmm0, xmm0, 0          ; Broadcast c into all bytes of xmm0

        lea     rax, [Rdest+Rcount]    ; point to end

        cmp     Rcount, 20H
        jbe     K600                   ; faster to use xmm registers if small

        ; Store the first possibly unaligned 16 bytes.
        ; It is faster to always write 16 bytes, possibly overlapping
        ; with the subsequent regular part, than to make possibly mispredicted
        ; branches depending on the size of the first part.
        movups  oword [Rdest], xmm0

        ; store another 16 bytes, aligned
        add     Rdest, 10H
        and     Rdest, -10H
        movaps  oword [Rdest], xmm0

        ; go to next 32 bytes boundary
        add     Rdest, 10H
        and     Rdest, -20H
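        ; Worked example of the rounding (illustrative addresses): if dest =
        ; 1007H, then 'add 10H' gives 1017H and 'and -10H' rounds down to
        ; 1010H, the first 16-byte boundary after dest; adding 10H again
        ; (1020H) and masking with -20H lands on 1020H, the first 32-byte
        ; boundary at or after that. Everything below these points has
        ; already been written by the unaligned head stores.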

        ; Check if count very big
        cmp     Rcount, [MemsetCacheLimit]
        ja      K300                   ; Use non-temporal store if count > MemsetCacheLimit

        ; find last 32 bytes boundary
        mov     Rcount, rax
        and     Rcount, -20H

        ; - size of 32-bytes blocks
        sub     Rdest, Rcount
        jnb     K200                   ; Jump if not negative

        ; extend value to 256 bits
        vinsertf128 ymm0, ymm0, xmm0, 1

align 16
K100:   ; Loop through 32-bytes blocks. Register use is swapped:
        ; Rcount = end of 32-bytes blocks part
        ; Rdest  = negative index from the end, counting up to zero
        vmovaps [Rcount+Rdest], ymm0
        add     Rdest, 20H
        jnz     K100
        vzeroupper
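; The loop above uses the negative-index idiom: instead of comparing a
; pointer against an end address, the index runs from -(block size) up to
; zero, so the 'add' sets the zero flag on the final iteration and no
; separate 'cmp' is needed. A rough C equivalent of K100, with illustrative
; names (store_32_bytes_aligned is a hypothetical helper):
;
;     // end = address just past the last full 32-byte block
;     // idx = -(number of bytes in full 32-byte blocks)
;     for (ptrdiff_t idx = -(ptrdiff_t)blocks_size; idx != 0; idx += 32) {
;         store_32_bytes_aligned(end + idx, value);
;     }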

K200:   ; the last part, from Rcount to rax, is < 32 bytes:
        ; write the last 32 bytes with overlap
        movups  [rax-20H], xmm0
        movups  [rax-10H], xmm0
        mov     rax, Rdest2            ; return dest
        ret

K300:   ; Use non-temporal moves, same code as above:
        ; find last 32 bytes boundary
        mov     Rcount, rax
        and     Rcount, -20H

        ; - size of 32-bytes blocks
        sub     Rdest, Rcount
        jnb     K500                   ; Jump if not negative

        ; extend value to 256 bits
        vinsertf128 ymm0, ymm0, xmm0, 1

align 16
K400:   ; Loop through 32-bytes blocks. Register use is swapped:
        ; Rcount = end of 32-bytes blocks part
        ; Rdest  = negative index from the end, counting up to zero
        vmovntps [Rcount+Rdest], ymm0
        add     Rdest, 20H
        jnz     K400
        sfence
        vzeroupper
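        ; The sfence is needed because non-temporal stores (vmovntps) are
        ; weakly ordered: it makes all of them globally visible before any
        ; later store, so code running after memset returns cannot observe
        ; the buffer partially written.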

K500:   ; the last part, from Rcount to rax, is < 32 bytes:
        ; write the last 32 bytes with overlap
        movups  [rax-20H], xmm0
        movups  [rax-10H], xmm0
        mov     rax, Rdest2            ; return dest
        ret

K600:   ; 16 < count <= 32
        movups  [Rdest], xmm0
        movups  [rax-10H], xmm0
        mov     rax, Rdest2            ; return dest
        ret
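; K600 covers any count in (16, 32] with just two possibly overlapping
; 16-byte stores. For example, count = 20: the first store writes bytes
; 0..15 and the second writes bytes 4..19 (the last 16 bytes); the 12-byte
; overlap is harmless because both stores write the same value.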

memsetSSE2:                            ; count > 16. Use SSE2 instruction set
memsetSSE2@:                           ; local label
        PROLOGM
        imul    eax, 01010101H         ; Broadcast c into all bytes of eax
        mov     Rdest2, Rdest          ; save dest
        cmp     Rcount, 16
        jna     B050
        movd    xmm0, eax
        pshufd  xmm0, xmm0, 0          ; Broadcast c into all bytes of xmm0

        ; Store the first unaligned part.
        ; The size of this part is 1 - 16 bytes.
        ; It is faster to always write 16 bytes, possibly overlapping
        ; with the subsequent regular part, than to make possibly mispredicted
        ; branches depending on the size of the first part.
        movq    qword [Rdest], xmm0
        movq    qword [Rdest+8], xmm0

        ; Check if count very big
M150:   mov     rax, [MemsetCacheLimit]
        cmp     Rcount, rax
        ja      M500                   ; Use non-temporal store if count > MemsetCacheLimit

        ; Point to end of regular part:
        ; Round down dest+count to nearest preceding 16-bytes boundary
        lea     Rcount, [Rdest+Rcount-1]
        and     Rcount, -10H

        ; Point to start of regular part:
        ; Round up dest to next 16-bytes boundary
        add     Rdest, 10H
        and     Rdest, -10H

        ; -(size of regular part)
        sub     Rdest, Rcount
        jnb     M300                   ; Jump if not negative

align 16
M200:   ; Loop through regular part:
        ; Rcount = end of regular part
        ; Rdest  = negative index from the end, counting up to zero
        movdqa  [Rcount+Rdest], xmm0
        add     Rdest, 10H
        jnz     M200

M300:   ; Do the last irregular part.
        ; The size of this part is 1 - 16 bytes.
        ; It is faster to always write 16 bytes, possibly overlapping
        ; with the preceding regular part, than to make possibly mispredicted
        ; branches depending on the size of the last part.
        mov     rax, Rdest2            ; dest
        movq    qword [rax+Rcount2-10H], xmm0
        movq    qword [rax+Rcount2-8], xmm0
        ret

M500:   ; Use non-temporal moves, same code as above:
        ; End of regular part:
        ; Round down dest+count to nearest preceding 16-bytes boundary
        lea     Rcount, [Rdest+Rcount-1]
        and     Rcount, -10H

        ; Start of regular part:
        ; Round up dest to next 16-bytes boundary
        add     Rdest, 10H
        and     Rdest, -10H

        ; -(size of regular part)
        sub     Rdest, Rcount
        jnb     M700                   ; Jump if not negative

align 16
M600:   ; Loop through regular part:
        ; Rcount = end of regular part
        ; Rdest  = negative index from the end, counting up to zero
        movntdq [Rcount+Rdest], xmm0
        add     Rdest, 10H
        jnz     M600
        sfence

M700:   ; Do the last irregular part.
        ; The size of this part is 1 - 16 bytes.
        ; It is faster to always write 16 bytes, possibly overlapping
        ; with the preceding regular part, than to make possibly mispredicted
        ; branches depending on the size of the last part.
        mov     rax, Rdest2            ; dest
        movq    qword [rax+Rcount2-10H], xmm0
        movq    qword [rax+Rcount2-8], xmm0
        ret

memsetCPUDispatch:                     ; CPU dispatcher: check instruction sets and pick the fastest method
        ; This part is executed only once
        push    rbx
        push    rcx
        push    rdx
        push    rsi
        push    rdi
        push    r8
        ; set MemsetCacheLimit to half the size of the largest level cache
        call    GetMemsetCacheLimit@
        lea     rbx, [memsetSSE2@]
        call    Store256BitIsFaster    ; Test if 256-bit read/write is available and faster than 128-bit read/write
        test    eax, eax
        jz      Q100
        lea     rbx, [memsetAVX@]
Q100:   ; Insert appropriate pointer
        mov     [memsetDispatch], rbx
        mov     rax, rbx
        pop     r8
        pop     rdi
        pop     rsi
        pop     rdx
        pop     rcx
        pop     rbx
        ; Jump according to the replaced function pointer
        jmp     rax
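; The dispatch pattern above in rough C terms (an illustrative sketch; the
; names dispatch_ptr / dispatcher / memset_sse2 / memset_avx stand in for
; memsetDispatch, memsetCPUDispatch, memsetSSE2@ and memsetAVX@):
;
;     static void * dispatcher(void * d, int c, size_t n);
;     static void * (*dispatch_ptr)(void *, int, size_t) = dispatcher;
;
;     static void * dispatcher(void * d, int c, size_t n) {
;         dispatch_ptr = Store256BitIsFaster() ? memset_avx : memset_sse2;
;         return dispatch_ptr(d, c, n);   // the first call does the work too
;     }
;
; After the first call, every A_memset call jumps directly to the selected
; implementation with no further CPU detection.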

; extern "C" size_t GetMemsetCacheLimit();  // Data blocks bigger than this will be stored uncached by memset
GetMemsetCacheLimit:
GetMemsetCacheLimit@:
        mov     rax, [MemsetCacheLimit]
        test    rax, rax
        jnz     U200
        ; Get half the size of the largest level cache
%ifdef  WINDOWS
        xor     ecx, ecx               ; 0 means largest level cache
%else
        xor     edi, edi               ; 0 means largest level cache
%endif
        call    DataCacheSize          ; get cache size
        shr     eax, 1                 ; half the size
        jnz     U100
        mov     eax, 400000H           ; cannot determine cache size. use 4 Mbytes
U100:   mov     [MemsetCacheLimit], eax
U200:   ret

; extern "C" void SetMemsetCacheLimit(size_t limit);  // Change the limit used by GetMemsetCacheLimit
SetMemsetCacheLimit:
%ifdef  WINDOWS
        mov     rax, rcx
%else
        mov     rax, rdi
%endif
        test    rax, rax
        jnz     U400
        ; zero means default: recompute the limit from the cache size
        mov     [MemsetCacheLimit], rax
        call    GetMemsetCacheLimit@
U400:   mov     [MemsetCacheLimit], rax
        ret
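; Note the zero convention: SetMemsetCacheLimit(0) clears the stored limit
; and calls GetMemsetCacheLimit@, which recomputes the default (half the
; size of the largest cache, or 4 MB if the cache size cannot be determined)
; and returns it in rax, so the final store at U400 writes the recomputed
; default back.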


SECTION .data
align 16

; Jump table for count from 0 to 16:
MemsetJTab: DQ M00, M01, M02, M03, M04, M05, M06, M07
            DQ M08, M09, M10, M11, M12, M13, M14, M15, M16

; Pointer to the appropriate version.
; This initially points to memsetCPUDispatch. memsetCPUDispatch will
; change this to point to the appropriate version of memset, so that
; memsetCPUDispatch is only executed once:
memsetDispatch: DQ memsetCPUDispatch

; Bypass the cache by using non-temporal moves if count > MemsetCacheLimit.
; The optimal value of MemsetCacheLimit is difficult to estimate, but
; a reasonable value is half the size of the largest cache:
MemsetCacheLimit: DQ 0