%include "defs.asm" ;************************* memmove64.asm *********************************** ; Author: Agner Fog ; Date created: 2008-07-18 ; Last modified: 2016-11-16 (patched version with AVX512 support removed) ; Description: ; Faster version of the standard memmove function: ; void * A_memmove(void *dest, const void *src, size_t count); ; Moves 'count' bytes from 'src' to 'dest'. src and dest may overlap. ; ; Overriding standard function memmove: ; The alias ?OVR_memmove is changed to _memmove in the object file if ; it is desired to override the standard library function memmove. ; ; CPU dispatching included for different CPUs ; ; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses ;****************************************************************************** default rel global A_memmove: function ; Function A_memmove global EXP(memmove): function ; ?OVR removed if standard function memmove overridden global memmoveSSE2: function ; Version for processors with only SSE2 global memmoveSSSE3: function ; Version for processors with SSSE3 global memmoveU: function ; Version for processors with fast unaligned read global memmoveU256: function ; Version for processors with fast 256-bit read/write global SetMemcpyCacheLimit ; Change limit for bypassing cache ; Imported from memcpy64.asm: extern A_memcpy ; function entry extern memcpySSE2 ; CPU specific function entry extern memcpySSSE3 ; CPU specific function entry extern memcpyU ; CPU specific function entry extern memcpyU256 ; CPU specific function entry ; Imported from instrset64.asm extern InstructionSet ; Instruction set for CPU dispatcher ; Imported from unalignedisfaster64.asm: extern UnalignedIsFaster ; Tells if unaligned read is faster than PALIGNR extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores ; Imported from memcpy64.asm extern GetMemcpyCacheLimit ; Get the size limit for bypassing cache when copying with memcpy and memmove extern SetMemcpyCacheLimit1 ; Set the size limit for bypassing cache when copying with memcpy ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; Prolog macro. 
; Define prolog for this function
; Parameter 1 is forward function label
%MACRO  PROLOGM  1
%IFDEF  WINDOWS
        ; Check if dest overlaps src
        mov     rax, rcx
        sub     rax, rdx
        cmp     rax, r8
        ; We can avoid testing for dest < src by using unsigned compare:
        ; (Assume that the memory block cannot span across address 0)
        ; Must move backwards if unsigned(dest-src) < count
        jae     %1                     ; Jump to memcpy if we can move forwards
        push    rsi
        push    rdi
        mov     rdi, rcx               ; dest
        mov     r9,  rcx               ; dest
        mov     rsi, rdx               ; src
        mov     rcx, r8                ; count
%ELSE   ; Unix
        ; Check if dest overlaps src
        mov     rax, rdi
        sub     rax, rsi
        cmp     rax, rdx
        ; Must move backwards if unsigned(dest-src) < count
        jae     %1                     ; Jump to memcpy if we can move forwards
        mov     rcx, rdx               ; count
        mov     r9,  rdi               ; dest
%ENDIF
%ENDM


; Define return from this function
%MACRO  RETURNM  0
%IFDEF  WINDOWS
        pop     rdi
        pop     rsi
%ENDIF
        mov     rax, r9                ; Return value = dest
        ret
%ENDMACRO


SECTION .text  align=16

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Common entry for dispatch
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; extern "C" void * A_memmove(void * dest, const void * src, size_t count);
; Function entry:
A_memmove:
EXP(memmove):
        jmp     qword [memmoveDispatch] ; Go to appropriate version, depending on instruction set
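
; The indirect jump above implements lazy CPU dispatch: memmoveDispatch (see
; the data section at the end of this file) initially holds the address of
; memmoveCPUDispatch, so the first call runs the dispatcher, which stores the
; address of the best version back into memmoveDispatch and tail-jumps to it.
; Every later call goes straight to the selected version. A rough C sketch of
; the same pattern (declarations abbreviated, names as in this file):
;
;     static void * memmoveCPUDispatch(void *, const void *, size_t);
;     static void *(*memmoveDispatch)(void *, const void *, size_t) = memmoveCPUDispatch;
;
;     void * A_memmove(void * dest, const void * src, size_t count) {
;         return memmoveDispatch(dest, src, count);   /* indirect jump/call */
;     }
;
;     static void * memmoveCPUDispatch(void * dest, const void * src, size_t count) {
;         memmoveDispatch = chooseBestVersion();      /* runs only once     */
;         return memmoveDispatch(dest, src, count);
;     }
;
; chooseBestVersion stands for the capability checks in memmoveCPUDispatch near
; the end of this file.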

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; AVX Version for processors with fast unaligned read and fast 32 bytes write
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
memmoveU256:   ; Version for processors with fast 256-bit read/write
memmoveU256@:  ; local label
        PROLOGM memcpyU256

        cmp     rcx, 40H
        jb      A1000                  ; Use simpler code if count < 64

        ; count >= 64
        ; Note: this part will not always work if count < 64
        ; Calculate size of last block after last regular boundary of dest
        lea     edx, [rdi+rcx]         ; end of dest
        and     edx, 1FH
        jz      B4300                  ; Skip if end of dest aligned by 32
        ; edx = size of last partial block, 1 - 31 bytes
        test    dl, 3
        jz      B4210
        test    dl, 1
        jz      B4201                  ; B4200 if we haven't tested edx,3
        ; move 1 byte
        dec     rcx
        movzx   eax, byte [rsi+rcx]
        mov     [rdi+rcx], al
B4200:  test    dl, 2
        jz      B4210
B4201:  ; move 2 bytes
        sub     rcx, 2
        movzx   eax, word [rsi+rcx]
        mov     [rdi+rcx], ax
B4210:  test    dl, 4
        jz      B4220
        ; move 4 bytes
        sub     rcx, 4
        mov     eax, [rsi+rcx]
        mov     [rdi+rcx], eax
B4220:  test    dl, 8
        jz      B4230
        ; move 8 bytes
        sub     rcx, 8
        mov     rax, [rsi+rcx]
        mov     [rdi+rcx], rax
B4230:  test    dl, 16
        jz      B4300
        ; move 16 bytes
        sub     rcx, 16
        movups  xmm0, [rsi+rcx]
        movaps  [rdi+rcx], xmm0

B4300:  ; Now end of dest is aligned by 32. Any partial block has been moved
        mov     rdx, rcx
        and     ecx, 1FH               ; remaining size after 32 bytes blocks moved
        and     rdx, -20H              ; number of 32 bytes blocks
        jz      H4100
        add     rsi, rcx
        add     rdi, rcx

        ; Check if count very big
        cmp     rdx, [CacheBypassLimit]
        ja      H4800                  ; Use non-temporal store if count > CacheBypassLimit

align 16
H4000:  ; 32 bytes move loop
        vmovups ymm0, [rsi+rdx-20H]
        vmovaps [rdi+rdx-20H], ymm0
        sub     rdx, 20H
        jnz     H4000
        vzeroupper

H4090:  sub     rsi, rcx
        sub     rdi, rcx

H4100:  ; remaining 0-31 bytes
        test    ecx, ecx
        jz      H4600
        test    cl, 10H
        jz      H4200
        ; move 16 bytes
        sub     ecx, 10H
        movups  xmm0, [rsi+rcx]
        movaps  [rdi+rcx], xmm0
        jz      H4600                  ; early out if count divisible by 16
H4200:  test    cl, 8
        jz      H4300
        ; move 8 bytes
        sub     ecx, 8
        mov     rax, [rsi+rcx]
        mov     [rdi+rcx], rax
H4300:  test    cl, 4
        jz      H4400
        ; move 4 bytes
        sub     ecx, 4
        mov     eax, [rsi+rcx]
        mov     [rdi+rcx], eax
        jz      H4600                  ; early out if count divisible by 4
H4400:  test    cl, 2
        jz      H4500
        ; move 2 bytes
        sub     ecx, 2
        movzx   eax, word [rsi+rcx]
        mov     [rdi+rcx], ax
H4500:  test    cl, 1
        jz      H4600
        ; move 1 byte
        movzx   eax, byte [rsi]        ; rcx-1 = 0
        mov     [rdi], al
H4600:  ; finished
        RETURNM

align 16
H4800:  ; 32 bytes move loop, bypass cache
        vmovups ymm0, [rsi+rdx-20H]
        vmovntps [rdi+rdx-20H], ymm0
        sub     rdx, 20H
        jnz     H4800
        sfence
        vzeroupper
        jmp     H4090

A1000:  ; count < 64. Move 32-16-8-4-2-1 bytes
        test    cl, 20H
        jz      A1100
        ; move 32 bytes
        ; movups is faster on processors with SSSE3
        sub     ecx, 20H
        movups  xmm0, [rsi+rcx+10H]
        movups  xmm1, [rsi+rcx]
        movups  [rdi+rcx+10H], xmm0
        movups  [rdi+rcx], xmm1
A1100:  test    cl, 10H
        jz      A1200
        ; move 16 bytes
        sub     ecx, 10H
        movups  xmm0, [rsi+rcx]
        movups  [rdi+rcx], xmm0
A1200:  test    cl, 8
        jz      A1300
        ; move 8 bytes
        sub     ecx, 8
        mov     rax, [rsi+rcx]
        mov     [rdi+rcx], rax
A1300:  test    cl, 4
        jz      A1400
        ; move 4 bytes
        sub     ecx, 4
        mov     eax, [rsi+rcx]
        mov     [rdi+rcx], eax
        jz      A1900                  ; early out if count divisible by 4
A1400:  test    cl, 2
        jz      A1500
        ; move 2 bytes
        sub     ecx, 2
        movzx   eax, word [rsi+rcx]
        mov     [rdi+rcx], ax
A1500:  test    cl, 1
        jz      A1900
        ; move 1 byte
        movzx   eax, byte [rsi]        ; rcx-1 = 0
        mov     [rdi], al
A1900:  ; finished
        RETURNM
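
; All of the backward-copy versions in this file follow the plan illustrated by
; the AVX routine above: first move the misaligned tail so that the end of dest
; falls on a block boundary, then run the main loop backwards over whole blocks
; (with non-temporal stores when the count exceeds CacheBypassLimit), and
; finally mop up the 0-31 leftover bytes at the start. A rough C sketch of that
; structure, where copy_small_backwards and copy_block32 are illustrative
; helpers only:
;
;     char *d = dest, *s = src;
;     size_t tail = ((uintptr_t)d + count) & 31;   /* bytes past last 32B boundary of dest */
;     copy_small_backwards(d + count - tail, s + count - tail, tail);
;     count -= tail;
;     size_t blocks = count & ~(size_t)31;         /* whole 32-byte blocks                 */
;     size_t head   = count - blocks;              /* 0-31 bytes left over at the start    */
;     for (size_t i = blocks; i != 0; i -= 32)     /* highest block first                  */
;         copy_block32(d + head + i - 32, s + head + i - 32);
;     copy_small_backwards(d, s, head);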

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Version for processors with fast unaligned read and fast 16 bytes write
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
memmoveU:      ; Version for processors with fast unaligned read
memmoveU@:     ; local label
        PROLOGM memcpyU

        cmp     rcx, 40H
        jb      A1000                  ; Use simpler code if count < 64

        ; count >= 64
        ; Note: this part will not always work if count < 64
        ; Calculate size of last block after last regular boundary of dest
        lea     edx, [rdi+rcx]         ; end of dest
        and     edx, 0FH
        jz      B3300                  ; Skip if end of dest aligned by 16
        ; edx = size of last partial block, 1 - 15 bytes
        test    dl, 3
        jz      B3210
        test    dl, 1
        jz      B3201                  ; B3200 if we haven't tested edx,3
        ; move 1 byte
        dec     rcx
        movzx   eax, byte [rsi+rcx]
        mov     [rdi+rcx], al
B3200:  test    dl, 2
        jz      B3210
B3201:  ; move 2 bytes
        sub     rcx, 2
        movzx   eax, word [rsi+rcx]
        mov     [rdi+rcx], ax
B3210:  test    dl, 4
        jz      B3220
        ; move 4 bytes
        sub     rcx, 4
        mov     eax, [rsi+rcx]
        mov     [rdi+rcx], eax
B3220:  test    dl, 8
        jz      B3300
        ; move 8 bytes
        sub     rcx, 8
        mov     rax, [rsi+rcx]
        mov     [rdi+rcx], rax

B3300:  ; Now end of dest is aligned by 16. Any partial block has been moved
        mov     rdx, rcx
        and     ecx, 1FH               ; remaining size after 32 bytes blocks moved
        and     rdx, -20H              ; number of 32 bytes blocks
        jz      H1100
        add     rsi, rcx
        add     rdi, rcx

        ; Check if count very big
        cmp     rdx, [CacheBypassLimit]
        ja      H1800                  ; Use non-temporal store if count > CacheBypassLimit

align 16   ; minimize 16-bytes boundaries in H1000 loop
H1000:  ; 32 bytes move loop
        movups  xmm1, [rsi+rdx-20H]
        movups  xmm0, [rsi+rdx-10H]
        movaps  [rdi+rdx-20H], xmm1
        movaps  [rdi+rdx-10H], xmm0
        sub     rdx, 20H
        jnz     H1000

H1090:  sub     rsi, rcx
        sub     rdi, rcx

H1100:  ; remaining 0-31 bytes
        test    ecx, ecx
        jz      H1600
        test    cl, 10H
        jz      H1200
        ; move 16 bytes
        sub     ecx, 10H
        movups  xmm0, [rsi+rcx]
        movaps  [rdi+rcx], xmm0
        jz      H1600                  ; early out if count divisible by 16
H1200:  test    cl, 8
        jz      H1300
        ; move 8 bytes
        sub     ecx, 8
        mov     rax, [rsi+rcx]
        mov     [rdi+rcx], rax
H1300:  test    cl, 4
        jz      H1400
        ; move 4 bytes
        sub     ecx, 4
        mov     eax, [rsi+rcx]
        mov     [rdi+rcx], eax
        jz      H1600                  ; early out if count divisible by 4
H1400:  test    cl, 2
        jz      H1500
        ; move 2 bytes
        sub     ecx, 2
        movzx   eax, word [rsi+rcx]
        mov     [rdi+rcx], ax
H1500:  test    cl, 1
        jz      H1600
        ; move 1 byte
        movzx   eax, byte [rsi]        ; rcx-1 = 0
        mov     [rdi], al
H1600:  ; finished
        RETURNM

align 16
H1800:  ; 32 bytes move loop, bypass cache
        movups  xmm1, [rsi+rdx-20H]
        movups  xmm0, [rsi+rdx-10H]
        movntps [rdi+rdx-20H], xmm1
        movntps [rdi+rdx-10H], xmm0
        sub     rdx, 20H
        jnz     H1800
        sfence
        jmp     H1090
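
; The H1800 loop above (like H4800 and the B1400/B0400 paths below) bypasses
; the cache for very large blocks: writing a block bigger than the cache
; through the cache would only evict useful data, so movntps/vmovntps store
; around it and an sfence orders the stores afterwards. The threshold is the
; CacheBypassLimit variable at the end of this file; it is set through
; SetMemcpyCacheLimit, where an argument of 0 selects the default of half the
; size of the largest cache level (see the CPU dispatcher below). For
; illustration, assuming the asmlib.h prototype takes the new limit in bytes:
;
;     /* assumed prototype: size_t SetMemcpyCacheLimit(size_t bytes); */
;     SetMemcpyCacheLimit(8 * 1024 * 1024);   /* bypass cache only above 8 MB */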

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Version for processors with SSSE3. Aligned read + shift + aligned write
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
memmoveSSSE3:  ; SSSE3 version begins here
memmoveSSSE3@: ; local label
        PROLOGM memcpySSSE3

        ; Cannot use memcpy. Must move backwards because of overlap between src and dest
        cmp     rcx, 40H
        jb      A1000                  ; Use simpler code if count < 64

        ; count >= 64
        ; Note: this part will not always work if count < 64
        ; Calculate size of last block after last regular boundary of dest
        lea     edx, [rdi+rcx]         ; end of dest
        and     edx, 0FH
        jz      B1300                  ; Skip if end of dest aligned by 16
        ; edx = size of last partial block, 1 - 15 bytes
        test    dl, 3
        jz      B1210
        test    dl, 1
        jz      B1201                  ; B1200 if we haven't tested edx,3
        ; move 1 byte
        dec     rcx
        movzx   eax, byte [rsi+rcx]
        mov     [rdi+rcx], al
B1200:  test    dl, 2
        jz      B1210
B1201:  ; move 2 bytes
        sub     rcx, 2
        movzx   eax, word [rsi+rcx]
        mov     [rdi+rcx], ax
B1210:  test    dl, 4
        jz      B1220
        ; move 4 bytes
        sub     rcx, 4
        mov     eax, [rsi+rcx]
        mov     [rdi+rcx], eax
B1220:  test    dl, 8
        jz      B1300
        ; move 8 bytes
        sub     rcx, 8
        mov     rax, [rsi+rcx]
        mov     [rdi+rcx], rax

B1300:  ; Now end of dest is aligned by 16. Any partial block has been moved
        ; Find alignment of end of src modulo 16 at this point:
        lea     eax, [rsi+rcx]
        and     eax, 0FH

        ; Set up for loop moving 32 bytes per iteration:
        mov     edx, ecx               ; Save count
        and     rcx, -20H              ; Round down to nearest multiple of 32
        sub     edx, ecx               ; Remaining data after loop
        sub     rsi, rax               ; Nearest preceding aligned block of src

        ; Add the same to rsi and rdi as we have subtracted from rcx
        add     rsi, rdx
        add     rdi, rdx

        ; Check if count very big
        cmp     rcx, [CacheBypassLimit]
        ja      B1400                  ; Use non-temporal store if count > CacheBypassLimit

        ; Dispatch to different codes depending on src alignment
        lea     r8, [MAlignmentDispatchSSSE3]
        jmp     near [r8+rax*8]

B1400:  ; Dispatch to different codes depending on src alignment
        lea     r8, [MAlignmentDispatchNT]
        jmp     near [r8+rax*8]


align 16
C100:   ; Code for aligned src. SSE2 and later CPUs
        ; The nice case, src and dest have same alignment.
        ; Loop. rcx has positive index from the beginning, counting down to zero
        movaps  xmm0, [rsi+rcx-10H]
        movaps  xmm1, [rsi+rcx-20H]
        movaps  [rdi+rcx-10H], xmm0
        movaps  [rdi+rcx-20H], xmm1
        sub     rcx, 20H
        jnz     C100

        ; Move the remaining edx bytes (0 - 31):
        ; move 16-8-4-2-1 bytes, aligned
        test    edx, edx
        jz      C500                   ; Early out if no more data
        test    dl, 10H
        jz      C200
        ; move 16 bytes
        sub     rcx, 10H
        movaps  xmm0, [rsi+rcx]
        movaps  [rdi+rcx], xmm0

C200:   ; Other branches come in here, rcx may contain arbitrary offset
        test    edx, edx
        jz      C500                   ; Early out if no more data
        test    dl, 8
        jz      C210
        ; move 8 bytes
        sub     rcx, 8
        mov     rax, [rsi+rcx]
        mov     [rdi+rcx], rax
C210:   test    dl, 4
        jz      C220
        ; move 4 bytes
        sub     rcx, 4
        mov     eax, [rsi+rcx]
        mov     [rdi+rcx], eax
        jz      C500                   ; Early out if count divisible by 4
C220:   test    dl, 2
        jz      C230
        ; move 2 bytes
        sub     rcx, 2
        movzx   eax, word [rsi+rcx]
        mov     [rdi+rcx], ax
C230:   test    dl, 1
        jz      C500
        ; move 1 byte
        movzx   eax, byte [rsi+rcx-1]  ; rcx-1 is not always 0 here
        mov     [rdi+rcx-1], al
C500:   ; finished
        RETURNM
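
; For a misaligned source, the SSSE3 and SSE2 versions never read or write an
; unaligned 16-byte block. Each aligned destination store is instead assembled
; from two adjacent aligned source reads. Treating a 16-byte block as a 128-bit
; little-endian integer, with u = (src - dest) mod 16, the combination is
; roughly:
;
;     /* block_hi is the aligned source block at the higher address,  */
;     /* block_lo the one just below it; u is in the range 1..15      */
;     dst_block = (block_hi << (16 - u) * 8) | (block_lo >> u * 8);
;
; SSSE3 does this in one instruction (palignr); SSE2 needs pslldq, psrldq and
; por. Because those shift counts must be immediates, the file generates a
; separate code instance for every u and selects one through a 16-entry jump
; table (MAlignmentDispatchSSE2, MAlignmentDispatchSSSE3 or
; MAlignmentDispatchNT) indexed as [r8+rax*8] in memmoveSSSE3 above and
; memmoveSSE2 below.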

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Version for processors with SSE2. Aligned read + shift + aligned write
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

memmoveSSE2:   ; SSE2 version begins here
memmoveSSE2@:  ; local label
        PROLOGM memcpySSE2

        ; Cannot use memcpy. Must move backwards because of overlap between src and dest
        cmp     rcx, 40H
        jae     B0100                  ; Use simpler code if count < 64

        ; count < 64. Move 32-16-8-4-2-1 bytes
        test    cl, 20H
        jz      A100
        ; move 32 bytes
        ; mov is faster than movdqu on SSE2 processors,
        ; movdqu is faster on later processors
        sub     ecx, 20H
        mov     rax, [rsi+rcx+18H]
        mov     rdx, [rsi+rcx+10H]
        mov     [rdi+rcx+18H], rax
        mov     [rdi+rcx+10H], rdx
        mov     rax, [rsi+rcx+8]
        mov     rdx, [rsi+rcx]
        mov     [rdi+rcx+8], rax
        mov     [rdi+rcx], rdx
A100:   test    cl, 10H
        jz      A200
        ; move 16 bytes
        sub     ecx, 10H
        mov     rax, [rsi+rcx+8]
        mov     rdx, [rsi+rcx]
        mov     [rdi+rcx+8], rax
        mov     [rdi+rcx], rdx
A200:   test    cl, 8
        jz      A300
        ; move 8 bytes
        sub     ecx, 8
        mov     rax, [rsi+rcx]
        mov     [rdi+rcx], rax
A300:   test    cl, 4
        jz      A400
        ; move 4 bytes
        sub     ecx, 4
        mov     eax, [rsi+rcx]
        mov     [rdi+rcx], eax
        jz      A900                   ; early out if count divisible by 4
A400:   test    cl, 2
        jz      A500
        ; move 2 bytes
        sub     ecx, 2
        movzx   eax, word [rsi+rcx]
        mov     [rdi+rcx], ax
A500:   test    cl, 1
        jz      A900
        ; move 1 byte
        movzx   eax, byte [rsi]        ; rcx-1 = 0
        mov     [rdi], al
A900:   ; finished
        RETURNM

B0100:  ; count >= 64
        ; Note: this part will not always work if count < 64
        ; Calculate size of last block after last regular boundary of dest
        lea     edx, [rdi+rcx]         ; end of dest
        and     edx, 0FH
        jz      B0300                  ; Skip if end of dest aligned by 16
        ; edx = size of last partial block, 1 - 15 bytes
        test    dl, 3
        jz      B0210
        test    dl, 1
        jz      B0201                  ; B0200 if we haven't tested edx,3
        ; move 1 byte
        dec     rcx
        movzx   eax, byte [rsi+rcx]
        mov     [rdi+rcx], al
B0200:  test    dl, 2
        jz      B0210
B0201:  ; move 2 bytes
        sub     rcx, 2
        movzx   eax, word [rsi+rcx]
        mov     [rdi+rcx], ax
B0210:  test    dl, 4
        jz      B0220
        ; move 4 bytes
        sub     rcx, 4
        mov     eax, [rsi+rcx]
        mov     [rdi+rcx], eax
B0220:  test    dl, 8
        jz      B0300
        ; move 8 bytes
        sub     rcx, 8
        mov     rax, [rsi+rcx]
        mov     [rdi+rcx], rax

B0300:  ; Now end of dest is aligned by 16. Any partial block has been moved
        ; Find alignment of end of src modulo 16 at this point:
        lea     eax, [rsi+rcx]
        and     eax, 0FH

        ; Set up for loop moving 32 bytes per iteration:
        mov     edx, ecx               ; Save count
        and     rcx, -20H              ; Round down to nearest multiple of 32
        sub     edx, ecx               ; Remaining data after loop
        sub     rsi, rax               ; Nearest preceding aligned block of src

        ; Add the same to rsi and rdi as we have subtracted from rcx
        add     rsi, rdx
        add     rdi, rdx

        ; Check if count very big
        cmp     rcx, [CacheBypassLimit]
        ja      B0400                  ; Use non-temporal store if count > CacheBypassLimit

        ; Dispatch to different codes depending on src alignment
        lea     r8, [MAlignmentDispatchSSE2]
        jmp     near [r8+rax*8]

B0400:  ; Dispatch to different codes depending on src alignment
        lea     r8, [MAlignmentDispatchNT]
        jmp     near [r8+rax*8]


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Macros and alignment jump tables
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Macros for each src alignment, SSE2 instruction set:
; Make separate code for each alignment u because the shift instructions
; have the shift count as a constant:

%MACRO MOVE_REVERSE_UNALIGNED_SSE2  2  ; u, nt
; Move rcx + rdx bytes of data
; Source is misaligned. (src-dest) modulo 16 = %1
; %2 = 1 if non-temporal store desired
; eax = %1
; rsi = src - %1 = nearest preceding 16-bytes boundary
; rdi = dest (aligned)
; rcx = count rounded down to nearest divisible by 32
; edx = remaining bytes to move after loop
        movdqa  xmm0, [rsi+rcx]        ; Read from nearest following 16B boundary
%%L1:   ; Loop. rcx has positive index from the beginning, counting down to zero
        sub     rcx, 20H
        movdqa  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
        movdqa  xmm2, [rsi+rcx]
        movdqa  xmm3, xmm1             ; Copy because used twice
        pslldq  xmm0, 16-%1            ; shift left
        psrldq  xmm1, %1               ; shift right
        por     xmm0, xmm1             ; combine blocks
%IF %2 == 0
        movdqa  [rdi+rcx+10H], xmm0    ; Save aligned
%ELSE
        movntdq [rdi+rcx+10H], xmm0    ; Non-temporal save
%ENDIF
        movdqa  xmm0, xmm2             ; Save for next iteration
        pslldq  xmm3, 16-%1            ; shift left
        psrldq  xmm2, %1               ; shift right
        por     xmm3, xmm2             ; combine blocks
%IF %2 == 0
        movdqa  [rdi+rcx], xmm3        ; Save aligned
%ELSE
        movntdq [rdi+rcx], xmm3        ; Non-temporal save
%ENDIF
        jnz     %%L1

        ; Move edx remaining bytes
        test    dl, 10H
        jz      %%L2
        ; One more 16-bytes block to move
        sub     rcx, 10H
        movdqa  xmm1, [rsi+rcx]
        pslldq  xmm0, 16-%1            ; shift left
        psrldq  xmm1, %1               ; shift right
        por     xmm0, xmm1             ; combine blocks
%IF %2 == 0
        movdqa  [rdi+rcx], xmm0        ; Save aligned
%ELSE
        movntdq [rdi+rcx], xmm0        ; Non-temporal save
%ENDIF
%%L2:   ; Get src pointer back to misaligned state
        add     rsi, rax
%IF %2 == 1
        sfence
%ENDIF
        ; Move remaining 0 - 15 bytes, unaligned
        jmp     C200
%ENDMACRO
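
; The generic macro above handles any byte offset u with pslldq/psrldq/por.
; The three macros below special-case u = 4, 8 and 12: when the misalignment is
; a whole number of dwords, the two source blocks can be combined with shufps
; rotations plus movss/movsd merges, which typically needs fewer instructions
; per 16-byte block than the generic shift-and-OR sequence.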

%MACRO MOVE_REVERSE_UNALIGNED_SSE2_4  1  ; nt
; Special case: u = 4
; %1 = 1 if non-temporal store desired
        movaps  xmm0, [rsi+rcx]        ; Read from nearest following 16B boundary
%%L1:   ; Loop. rcx has positive index from the beginning, counting down to zero
        sub     rcx, 20H
        movaps  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
        movaps  xmm2, [rsi+rcx]
        movaps  xmm3, xmm0
        movaps  xmm0, xmm2
        movss   xmm2, xmm1
        shufps  xmm2, xmm2, 00111001B  ; Rotate right
        movss   xmm1, xmm3
        shufps  xmm1, xmm1, 00111001B  ; Rotate right
%IF %1 == 0
        movaps  [rdi+rcx+10H], xmm1    ; Save aligned
        movaps  [rdi+rcx], xmm2        ; Save aligned
%ELSE
        movntps [rdi+rcx+10H], xmm1    ; Non-temporal save
        movntps [rdi+rcx], xmm2        ; Non-temporal save
%ENDIF
        jnz     %%L1

        ; Move edx remaining bytes
        test    dl, 10H
        jz      %%L2
        ; One more 16-bytes block to move
        sub     rcx, 10H
        movaps  xmm1, [rsi+rcx]
        movss   xmm1, xmm0
        shufps  xmm1, xmm1, 00111001B  ; Rotate right
%IF %1 == 0
        movaps  [rdi+rcx], xmm1        ; Save aligned
%ELSE
        movntps [rdi+rcx], xmm1        ; Non-temporal save
%ENDIF
%%L2:   ; Get src pointer back to misaligned state
        add     rsi, rax
%IF %1 == 1
        sfence
%ENDIF
        ; Move remaining 0 - 15 bytes, unaligned
        jmp     C200
%ENDMACRO


%MACRO MOVE_REVERSE_UNALIGNED_SSE2_8  1  ; nt
; Special case: u = 8
; %1 = 1 if non-temporal store desired
        movaps  xmm0, [rsi+rcx]        ; Read from nearest following 16B boundary
        shufps  xmm0, xmm0, 01001110B  ; Rotate
%%L1:   ; Loop. rcx has positive index from the beginning, counting down to zero
        sub     rcx, 20H
        movaps  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
        shufps  xmm1, xmm1, 01001110B  ; Rotate
        movsd   xmm0, xmm1
%IF %1 == 0
        movaps  [rdi+rcx+10H], xmm0    ; Save aligned
%ELSE
        movntps [rdi+rcx+10H], xmm0    ; Non-temporal save
%ENDIF
        movaps  xmm0, [rsi+rcx]
        shufps  xmm0, xmm0, 01001110B  ; Rotate
        movsd   xmm1, xmm0
%IF %1 == 0
        movaps  [rdi+rcx], xmm1        ; Save aligned
%ELSE
        movntps [rdi+rcx], xmm1        ; Non-temporal save
%ENDIF
        jnz     %%L1

        ; Move edx remaining bytes
        test    dl, 10H
        jz      %%L2
        ; One more 16-bytes block to move
        sub     rcx, 10H
        movaps  xmm1, [rsi+rcx]
        shufps  xmm1, xmm1, 01001110B  ; Rotate
        movsd   xmm0, xmm1
%IF %1 == 0
        movaps  [rdi+rcx], xmm0        ; Save aligned
%ELSE
        movntps [rdi+rcx], xmm0        ; Non-temporal save
%ENDIF
%%L2:   ; Get src pointer back to misaligned state
        add     rsi, rax
%IF %1 == 1
        sfence
%ENDIF
        ; Move remaining 0 - 15 bytes, unaligned
        jmp     C200
%ENDMACRO


%MACRO MOVE_REVERSE_UNALIGNED_SSE2_12  1  ; nt
; Special case: u = 12
; %1 = 1 if non-temporal store desired
        movaps  xmm0, [rsi+rcx]        ; Read from nearest following 16B boundary
        shufps  xmm0, xmm0, 10010011B  ; Rotate left
%%L1:   ; Loop. rcx has positive index from the beginning, counting down to zero
        sub     rcx, 20H
        movaps  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
        shufps  xmm1, xmm1, 10010011B  ; Rotate left
        movss   xmm0, xmm1
%IF %1 == 0
        movaps  [rdi+rcx+10H], xmm0    ; Save aligned
%ELSE
        movntps [rdi+rcx+10H], xmm0    ; Non-temporal save
%ENDIF
        movaps  xmm0, [rsi+rcx]
        shufps  xmm0, xmm0, 10010011B  ; Rotate left
        movss   xmm1, xmm0
%IF %1 == 0
        movaps  [rdi+rcx], xmm1        ; Save aligned
%ELSE
        movntps [rdi+rcx], xmm1        ; Non-temporal save
%ENDIF
        jnz     %%L1

        ; Move edx remaining bytes
        test    dl, 10H
        jz      %%L2
        ; One more 16-bytes block to move
        sub     rcx, 10H
        movaps  xmm1, [rsi+rcx]
        shufps  xmm1, xmm1, 10010011B  ; Rotate left
        movss   xmm0, xmm1
%IF %1 == 0
        movaps  [rdi+rcx], xmm0        ; Save aligned
%ELSE
        movntps [rdi+rcx], xmm0        ; Non-temporal save
%ENDIF
%%L2:   ; Get src pointer back to misaligned state
        add     rsi, rax
%IF %1 == 1
        sfence
%ENDIF
        ; Move remaining 0 - 15 bytes, unaligned
        jmp     C200
%ENDMACRO


; Macros for each src alignment, Suppl.SSE3 instruction set:
; Code for unaligned src, Suppl.SSE3 instruction set.
; Make separate code for each alignment u because the palignr instruction
; has the shift count as a constant:

%MACRO MOVE_REVERSE_UNALIGNED_SSSE3  1  ; u
; Move rcx + rdx bytes of data
; Source is misaligned. (src-dest) modulo 16 = %1
; eax = %1
; rsi = src - %1 = nearest preceding 16-bytes boundary
; rdi = dest (aligned)
; rcx = count rounded down to nearest divisible by 32
; edx = remaining bytes to move after loop
        movdqa  xmm0, [rsi+rcx]        ; Read from nearest following 16B boundary
%%L1:   ; Loop. rcx has positive index from the beginning, counting down to zero
        movdqa  xmm1, [rsi+rcx-10H]    ; Read next two blocks
        palignr xmm0, xmm1, %1         ; Combine parts into aligned block
        movdqa  [rdi+rcx-10H], xmm0    ; Save aligned
        movdqa  xmm0, [rsi+rcx-20H]
        palignr xmm1, xmm0, %1         ; Combine parts into aligned block
        movdqa  [rdi+rcx-20H], xmm1    ; Save aligned
        sub     rcx, 20H
        jnz     %%L1

        ; Set up for edx remaining bytes
        test    dl, 10H
        jz      %%L2
        ; One more 16-bytes block to move
        sub     rcx, 10H
        movdqa  xmm1, [rsi+rcx]        ; Read next two blocks
        palignr xmm0, xmm1, %1         ; Combine parts into aligned block
        movdqa  [rdi+rcx], xmm0        ; Save aligned
%%L2:   ; Get src pointer back to misaligned state
        add     rsi, rax
        ; Move remaining 0 - 15 bytes
        jmp     C200
%ENDMACRO


; Make 15 instances of SSE2 macro for each value of the alignment u.
; These are pointed to by the jump table MAlignmentDispatchSSE2 below
; (aligns and fillers are inserted manually to minimize the
; number of 16-bytes boundaries inside loops)

align 16
D104:   MOVE_REVERSE_UNALIGNED_SSE2_4    0
D108:   MOVE_REVERSE_UNALIGNED_SSE2_8    0
D10C:   MOVE_REVERSE_UNALIGNED_SSE2_12   0
D101:   MOVE_REVERSE_UNALIGNED_SSE2 1,   0
D102:   MOVE_REVERSE_UNALIGNED_SSE2 2,   0
D103:   MOVE_REVERSE_UNALIGNED_SSE2 3,   0
D105:   MOVE_REVERSE_UNALIGNED_SSE2 5,   0
D106:   MOVE_REVERSE_UNALIGNED_SSE2 6,   0
D107:   MOVE_REVERSE_UNALIGNED_SSE2 7,   0
D109:   MOVE_REVERSE_UNALIGNED_SSE2 9,   0
D10A:   MOVE_REVERSE_UNALIGNED_SSE2 0AH, 0
D10B:   MOVE_REVERSE_UNALIGNED_SSE2 0BH, 0
D10D:   MOVE_REVERSE_UNALIGNED_SSE2 0DH, 0
D10E:   MOVE_REVERSE_UNALIGNED_SSE2 0EH, 0
D10F:   MOVE_REVERSE_UNALIGNED_SSE2 0FH, 0

; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
; These are pointed to by the jump table MAlignmentDispatchSSSE3 below

align 16
E104:   MOVE_REVERSE_UNALIGNED_SSSE3 4
E108:   MOVE_REVERSE_UNALIGNED_SSSE3 8
E10C:   MOVE_REVERSE_UNALIGNED_SSSE3 0CH
E101:   MOVE_REVERSE_UNALIGNED_SSSE3 1
E102:   MOVE_REVERSE_UNALIGNED_SSSE3 2
E103:   MOVE_REVERSE_UNALIGNED_SSSE3 3
E105:   MOVE_REVERSE_UNALIGNED_SSSE3 5
E106:   MOVE_REVERSE_UNALIGNED_SSSE3 6
E107:   MOVE_REVERSE_UNALIGNED_SSSE3 7
E109:   MOVE_REVERSE_UNALIGNED_SSSE3 9
E10A:   MOVE_REVERSE_UNALIGNED_SSSE3 0AH
E10B:   MOVE_REVERSE_UNALIGNED_SSSE3 0BH
E10D:   MOVE_REVERSE_UNALIGNED_SSSE3 0DH
E10E:   MOVE_REVERSE_UNALIGNED_SSSE3 0EH
E10F:   MOVE_REVERSE_UNALIGNED_SSSE3 0FH

align 16
F100:   ; Non-temporal move, src and dest have same alignment.
        ; Loop. rcx has positive index from the beginning, counting down to zero
        sub     rcx, 20H
        movaps  xmm0, [rsi+rcx+10H]
        movaps  xmm1, [rsi+rcx]
        movntps [rdi+rcx+10H], xmm0
        movntps [rdi+rcx], xmm1
        jnz     F100

        ; Move the remaining edx bytes (0 - 31):
        ; move 16-8-4-2-1 bytes, aligned
        test    dl, 10H
        jz      C200
        ; move 16 bytes
        sub     rcx, 10H
        movaps  xmm0, [rsi+rcx]
        movntps [rdi+rcx], xmm0
        sfence
        ; move the remaining 0 - 15 bytes
        jmp     C200

; Non-temporal move, src and dest have different alignment.
; Make 15 instances of SSE2 macro for each value of the alignment u.
; These are pointed to by the jump table MAlignmentDispatchNT below

align 16
F101:   MOVE_REVERSE_UNALIGNED_SSE2 1,   1
F102:   MOVE_REVERSE_UNALIGNED_SSE2 2,   1
F103:   MOVE_REVERSE_UNALIGNED_SSE2 3,   1
F104:   MOVE_REVERSE_UNALIGNED_SSE2_4    1
F105:   MOVE_REVERSE_UNALIGNED_SSE2 5,   1
F106:   MOVE_REVERSE_UNALIGNED_SSE2 6,   1
F107:   MOVE_REVERSE_UNALIGNED_SSE2 7,   1
F108:   MOVE_REVERSE_UNALIGNED_SSE2_8    1
F109:   MOVE_REVERSE_UNALIGNED_SSE2 9,   1
F10A:   MOVE_REVERSE_UNALIGNED_SSE2 0AH, 1
F10B:   MOVE_REVERSE_UNALIGNED_SSE2 0BH, 1
F10C:   MOVE_REVERSE_UNALIGNED_SSE2_12   1
F10D:   MOVE_REVERSE_UNALIGNED_SSE2 0DH, 1
F10E:   MOVE_REVERSE_UNALIGNED_SSE2 0EH, 1
F10F:   MOVE_REVERSE_UNALIGNED_SSE2 0FH, 1
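
; The CPU dispatcher in the next section runs only on the first call. It picks
; the best of the four implementations by testing capabilities in increasing
; order of preference: the SSSE3 cpuid feature bit, then UnalignedIsFaster and
; Store256BitIsFaster (both imported at the top of this file), and stores the
; winner in memmoveDispatch. A rough C sketch of the selection, where
; cpu_has_ssse3 stands for the cpuid test:
;
;     void *(*best)(void *, const void *, size_t) = memmoveSSE2;
;     if (cpu_has_ssse3()) {
;         best = memmoveSSSE3;
;         if (UnalignedIsFaster()) {
;             best = memmoveU;
;             if (Store256BitIsFaster())
;                 best = memmoveU256;
;         }
;     }
;     memmoveDispatch = best;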

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; CPU dispatcher
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

memmoveCPUDispatch:   ; CPU dispatcher, check for Suppl-SSE3 instruction set
        ; This part is executed only once
        push    rbx
        push    rcx
        push    rdx
        push    rsi
        push    rdi
        push    r8

        ; set CacheBypassLimit to half the size of the largest level cache
%ifdef  WINDOWS
        xor     ecx, ecx               ; 0 means default
%else
        xor     edi, edi
%endif
        call    SetMemcpyCacheLimit@

        mov     eax, 1
        cpuid                          ; Get feature flags
        lea     rbx, [memmoveSSE2@]
        bt      ecx, 9                 ; Test bit for Suppl-SSE3
        jnc     Q100
        lea     rbx, [memmoveSSSE3@]
        call    UnalignedIsFaster
        test    eax, eax
        jz      Q100
        lea     rbx, [memmoveU@]
        call    Store256BitIsFaster
        test    eax, eax
        jz      Q100
        lea     rbx, [memmoveU256@]

Q100:   ; Insert appropriate pointer
        mov     [memmoveDispatch], rbx
        mov     rax, rbx
        pop     r8
        pop     rdi
        pop     rsi
        pop     rdx
        pop     rcx
        pop     rbx
        ; Jump according to the replaced function pointer
        jmp     rax


; Note: Must call SetMemcpyCacheLimit1 defined in memcpy64.asm
SetMemcpyCacheLimit:
SetMemcpyCacheLimit@:                  ; local label
        call    SetMemcpyCacheLimit1
        mov     [CacheBypassLimit], rax
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; data section. jump tables, dispatch function pointer, cache size
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Data segment must be included in function namespace
SECTION .data
align 16

; Jump tables for alignments 0 - 15:
; The memmoveSSE2 code jumps into MAlignmentDispatchSSE2 and the memmoveSSSE3
; code into MAlignmentDispatchSSSE3; both use MAlignmentDispatchNT when the
; count is large enough to bypass the cache.

; Code pointer for each alignment for SSE2 instruction set
MAlignmentDispatchSSE2:
DQ C100, D101, D102, D103, D104, D105, D106, D107
DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F

; Code pointer for each alignment for Suppl-SSE3 instruction set
MAlignmentDispatchSSSE3:
DQ C100, E101, E102, E103, E104, E105, E106, E107
DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F

; Code pointer for each alignment for non-temporal store
MAlignmentDispatchNT:
DQ F100, F101, F102, F103, F104, F105, F106, F107
DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F

memmoveDispatch: DQ memmoveCPUDispatch

; Bypass cache by using non-temporal moves if count > CacheBypassLimit
; The optimal value of CacheBypassLimit is difficult to estimate, but
; a reasonable value is half the size of the largest cache:
CacheBypassLimit: DQ 0                 ; Must be a qword: read and written with 64-bit instructions above
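
; Usage note (illustrative): from C or C++ the function is declared as in the
; header comment at the top of this file,
;     void * A_memmove(void *dest, const void *src, size_t count);
; and behaves like the standard memmove, including for overlapping blocks:
;     char buf[16] = "0123456789";
;     A_memmove(buf + 2, buf, 10);     /* shifts the 10 bytes up by 2 */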