%include "defs.asm"
;************************* memmove64.asm ***********************************
; Author: Agner Fog
; Date created: 2008-07-18
; Last modified: 2016-11-16 (patched version with AVX512 support removed)
; Description:
; Faster version of the standard memmove function:
; void * A_memmove(void *dest, const void *src, size_t count);
; Moves 'count' bytes from 'src' to 'dest'. src and dest may overlap.
;
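; Example call (C, for illustration only):
;   char buf[100];
;   A_memmove(buf + 1, buf, 99); /* overlapping regions are handled correctly */
;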
; Overriding standard function memmove:
; The alias ?OVR_memmove is changed to _memmove in the object file if
; it is desired to override the standard library function memmove.
;
; CPU dispatching included for different CPUs
;
; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
;******************************************************************************
default rel
global A_memmove: function ; Function A_memmove
global EXP(memmove): function ; ?OVR removed if standard function memmove overridden
global memmoveSSE2: function ; Version for processors with only SSE2
global memmoveSSSE3: function ; Version for processors with SSSE3
global memmoveU: function ; Version for processors with fast unaligned read
global memmoveU256: function ; Version for processors with fast 256-bit read/write
global SetMemcpyCacheLimit ; Change limit for bypassing cache
; Imported from memcpy64.asm:
extern A_memcpy ; function entry
extern memcpySSE2 ; CPU specific function entry
extern memcpySSSE3 ; CPU specific function entry
extern memcpyU ; CPU specific function entry
extern memcpyU256 ; CPU specific function entry
; Imported from instrset64.asm
extern InstructionSet ; Instruction set for CPU dispatcher
; Imported from unalignedisfaster64.asm:
extern UnalignedIsFaster ; Tells if unaligned read is faster than PALIGNR
extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
; Imported from memcpy64.asm
extern GetMemcpyCacheLimit ; Get the size limit for bypassing cache when copying with memcpy and memmove
extern SetMemcpyCacheLimit1 ; Set the size limit for bypassing cache when copying with memcpy
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Prolog macro. Determine if we should move forwards or backwards
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Define prolog for this function
; Parameter 1 is forward function label
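; Register use on entry (Windows ABI): rcx = dest, rdx = src, r8 = count
; Register use on entry (System V ABI): rdi = dest, rsi = src, rdx = count
; When falling through (backwards move needed): rdi = dest, rsi = src, rcx = count,
; and r9 = dest is kept for the return value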
%MACRO PROLOGM 1
%IFDEF WINDOWS
; Check if dest overlaps src
mov rax, rcx
sub rax, rdx
cmp rax, r8
; We can avoid testing for dest < src by using unsigned compare:
; (Assume that the memory block cannot span across address 0)
; Must move backwards if unsigned(dest-src) < count
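; Example: dest = src + 8, count = 100: unsigned(dest-src) = 8 < 100, so a forward
; copy would overwrite source bytes before they are read; fall through and move
; backwards instead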
jae %1 ; Jump to memcpy if we can move forwards
push rsi
push rdi
mov rdi, rcx ; dest
mov r9, rcx ; dest
mov rsi, rdx ; src
mov rcx, r8 ; count
%ELSE ; Unix
; Check if dest overlaps src
mov rax, rdi
sub rax, rsi
cmp rax, rdx
; Must move backwards if unsigned(dest-src) < count
jae %1 ; Jump to memcpy if we can move forwards
mov rcx, rdx ; count
mov r9, rdi ; dest
%ENDIF
%ENDM
; Define return from this function
%MACRO RETURNM 0
%IFDEF WINDOWS
pop rdi
pop rsi
%ENDIF
mov rax, r9 ; Return value = dest
ret
%ENDMACRO
SECTION .text align=16
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Common entry for dispatch
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; extern "C" void * A_memmove(void * dest, const void * src, size_t count);
; Function entry:
A_memmove:
EXP(memmove):
jmp qword [memmoveDispatch] ; Go to appropriate version, depending on instruction set
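; memmoveDispatch initially points to memmoveCPUDispatch (see the data section below).
; The first call therefore runs the CPU dispatcher, which stores a pointer to the best
; version for this CPU back into memmoveDispatch, so later calls jump there directly.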
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; AVX Version for processors with fast unaligned read and fast 32 bytes write
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
memmoveU256: ; Version for processors with fast 256-bit read/write
memmoveU256@: ; local label
PROLOGM memcpyU256
cmp rcx, 40H
jb A1000 ; Use simpler code if count < 64
; count >= 64
; Note: this part will not always work if count < 64
; Calculate size of last block after last regular boundary of dest
lea edx, [rdi+rcx] ; end of dest
and edx, 1FH
jz B4300 ; Skip if end of dest aligned by 32
; edx = size of last partial block, 1 - 31 bytes
test dl, 3
jz B4210
test dl, 1
jz B4201 ; B4200 if we haven't tested edx,3
; move 1 byte
dec rcx
movzx eax, byte [rsi+rcx]
mov [rdi+rcx], al
B4200: test dl, 2
jz B4210
B4201: ; move 2 bytes
sub rcx, 2
movzx eax, word [rsi+rcx]
mov [rdi+rcx], ax
B4210: test dl, 4
jz B4220
; move 4 bytes
sub rcx, 4
mov eax, [rsi+rcx]
mov [rdi+rcx], eax
B4220: test dl, 8
jz B4230
; move 8 bytes
sub rcx, 8
mov rax, [rsi+rcx]
mov [rdi+rcx], rax
B4230: test dl, 16
jz B4300
; move 16 bytes
sub rcx, 16
movups xmm0, [rsi+rcx]
movaps [rdi+rcx], xmm0
B4300: ; Now end of dest is aligned by 32. Any partial block has been moved
mov rdx, rcx
and ecx, 1FH ; remaining size after 32 bytes blocks moved
and rdx, -20H ; number of 32 bytes blocks
jz H4100
add rsi, rcx
add rdi, rcx
; Check if count very big
cmp rdx, [CacheBypassLimit]
ja H4800 ; Use non-temporal store if count > _CacheBypassLimit
align 16
H4000: ; 32 bytes move loop
vmovups ymm0, [rsi+rdx-20H]
vmovaps [rdi+rdx-20H], ymm0
sub rdx, 20H
jnz H4000
vzeroupper
H4090: sub rsi, rcx
sub rdi, rcx
H4100: ; remaining 0-31 bytes
test ecx, ecx
jz H4600
test cl, 10H
jz H4200
; move 16 bytes
sub ecx, 10H
movups xmm0, [rsi+rcx]
movaps [rdi+rcx], xmm0
jz H4600 ; early out if count divisible by 16
H4200: test cl, 8
jz H4300
; move 8 bytes
sub ecx, 8
mov rax, [rsi+rcx]
mov [rdi+rcx], rax
H4300: test cl, 4
jz H4400
; move 4 bytes
sub ecx, 4
mov eax, [rsi+rcx]
mov [rdi+rcx], eax
jz H4600 ; early out if count divisible by 4
H4400: test cl, 2
jz H4500
; move 2 bytes
sub ecx, 2
movzx eax, word [rsi+rcx]
mov [rdi+rcx], ax
H4500: test cl, 1
jz H4600
; move 1 byte
movzx eax, byte [rsi] ; rcx-1 = 0
mov [rdi], al
H4600: ; finished
RETURNM
align 16
H4800: ; 32 bytes move loop, bypass cache
vmovups ymm0, [rsi+rdx-20H]
vmovntps [rdi+rdx-20H], ymm0
sub rdx, 20H
jnz H4800
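; sfence: complete the weakly-ordered non-temporal stores above before any later stores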
sfence
vzeroupper
jmp H4090
A1000: ; count < 64. Move 32-16-8-4-2-1 bytes
test cl, 20H
jz A1100
; move 32 bytes
; movups is faster on processors with SSSE3
sub ecx, 20H
movups xmm0, [rsi+rcx+10H]
movups xmm1, [rsi+rcx]
movups [rdi+rcx+10H], xmm0
movups [rdi+rcx], xmm1
A1100: test cl, 10H
jz A1200
; move 16 bytes
sub ecx, 10H
movups xmm0, [rsi+rcx]
movups [rdi+rcx], xmm0
A1200: test cl, 8
jz A1300
; move 8 bytes
sub ecx, 8
mov rax, [rsi+rcx]
mov [rdi+rcx], rax
A1300: test cl, 4
jz A1400
; move 4 bytes
sub ecx, 4
mov eax, [rsi+rcx]
mov [rdi+rcx], eax
jz A1900 ; early out if count divisible by 4
A1400: test cl, 2
jz A1500
; move 2 bytes
sub ecx, 2
movzx eax, word [rsi+rcx]
mov [rdi+rcx], ax
A1500: test cl, 1
jz A1900
; move 1 byte
movzx eax, byte [rsi] ; rcx-1 = 0
mov [rdi], al
A1900: ; finished
RETURNM
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Version for processors with fast unaligned read and fast 16 bytes write
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
memmoveU: ; Version for processors with fast unaligned read
memmoveU@: ; local label
PROLOGM memcpyU
cmp rcx, 40H
jb A1000 ; Use simpler code if count < 64
; count >= 64
; Note: this part will not always work if count < 64
; Calculate size of last block after last regular boundary of dest
lea edx, [rdi+rcx] ; end of dest
and edx, 0FH
jz B3300 ; Skip if end of dest aligned by 16
; edx = size of last partial block, 1 - 15 bytes
test dl, 3
jz B3210
test dl, 1
jz B3201 ; B3200 if we haven't tested edx,3
; move 1 byte
dec rcx
movzx eax, byte [rsi+rcx]
mov [rdi+rcx], al
B3200: test dl, 2
jz B3210
B3201: ; move 2 bytes
sub rcx, 2
movzx eax, word [rsi+rcx]
mov [rdi+rcx], ax
B3210: test dl, 4
jz B3220
; move 4 bytes
sub rcx, 4
mov eax, [rsi+rcx]
mov [rdi+rcx], eax
B3220: test dl, 8
jz B3300
; move 8 bytes
sub rcx, 8
mov rax, [rsi+rcx]
mov [rdi+rcx], rax
B3300: ; Now end of dest is aligned by 16. Any partial block has been moved
mov rdx, rcx
and ecx, 1FH ; remaining size after 32 bytes blocks moved
and rdx, -20H ; number of 32 bytes blocks
jz H1100
add rsi, rcx
add rdi, rcx
; Check if count very big
cmp rdx, [CacheBypassLimit]
ja H1800 ; Use non-temporal store if count > _CacheBypassLimit
align 16 ; minimize 16-bytes boundaries in H1000 loop
H1000: ; 32 bytes move loop
movups xmm1, [rsi+rdx-20H]
movups xmm0, [rsi+rdx-10H]
movaps [rdi+rdx-20H], xmm1
movaps [rdi+rdx-10H], xmm0
sub rdx, 20H
jnz H1000
H1090: sub rsi, rcx
sub rdi, rcx
H1100: ; remaining 0-31 bytes
test ecx, ecx
jz H1600
test cl, 10H
jz H1200
; move 16 bytes
sub ecx, 10H
movups xmm0, [rsi+rcx]
movaps [rdi+rcx], xmm0
jz H1600 ; early out if count divisible by 16
H1200: test cl, 8
jz H1300
; move 8 bytes
sub ecx, 8
mov rax, [rsi+rcx]
mov [rdi+rcx], rax
H1300: test cl, 4
jz H1400
; move 4 bytes
sub ecx, 4
mov eax, [rsi+rcx]
mov [rdi+rcx], eax
jz H1600 ; early out if count divisible by 4
H1400: test cl, 2
jz H1500
; move 2 bytes
sub ecx, 2
movzx eax, word [rsi+rcx]
mov [rdi+rcx], ax
H1500: test cl, 1
jz H1600
; move 1 byte
movzx eax, byte [rsi] ; rcx-1 = 0
mov [rdi], al
H1600: ; finished
RETURNM
align 16
H1800: ; 32 bytes move loop, bypass cache
movups xmm1, [rsi+rdx-20H]
movups xmm0, [rsi+rdx-10H]
movntps [rdi+rdx-20H], xmm1
movntps [rdi+rdx-10H], xmm0
sub rdx, 20H
jnz H1800
sfence
jmp H1090
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Version for processors with SSSE3. Aligned read + shift + aligned write
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
memmoveSSSE3: ; SSSE3 version begins here
memmoveSSSE3@: ; local label
PROLOGM memcpySSSE3
; Cannot use memcpy. Must move backwards because of overlap between src and dest
cmp rcx, 40H
jb A1000 ; Use simpler code if count < 64
; count >= 64
; Note: this part will not always work if count < 64
; Calculate size of last block after last regular boundary of dest
lea edx, [rdi+rcx] ; end of dest
and edx, 0FH
jz B1300 ; Skip if end of dest aligned by 16
; edx = size of last partial block, 1 - 15 bytes
test dl, 3
jz B1210
test dl, 1
jz B1201 ; B1200 if we haven't tested edx,3
; move 1 byte
dec rcx
movzx eax, byte [rsi+rcx]
mov [rdi+rcx], al
B1200: test dl, 2
jz B1210
B1201: ; move 2 bytes
sub rcx, 2
movzx eax, word [rsi+rcx]
mov [rdi+rcx], ax
B1210: test dl, 4
jz B1220
; move 4 bytes
sub rcx, 4
mov eax, [rsi+rcx]
mov [rdi+rcx], eax
B1220: test dl, 8
jz B1300
; move 8 bytes
sub rcx, 8
mov rax, [rsi+rcx]
mov [rdi+rcx], rax
B1300: ; Now end of dest is aligned by 16. Any partial block has been moved
; Find alignment of end of src modulo 16 at this point:
lea eax, [rsi+rcx]
and eax, 0FH
; Set up for loop moving 32 bytes per iteration:
mov edx, ecx ; Save count
and rcx, -20H ; Round down to nearest multiple of 32
sub edx, ecx ; Remaining data after loop
sub rsi, rax ; Nearest preceding aligned block of src
; Add the same to rsi and rdi as we have subtracted from rcx
add rsi, rdx
add rdi, rdx
; Check if count very big
cmp rcx, [CacheBypassLimit]
ja B1400 ; Use non-temporal store if count > CacheBypassLimit
; Dispatch to different codes depending on src alignment
lea r8, [MAlignmentDispatchSSSE3]
jmp near [r8+rax*8]
B1400: ; Dispatch to different codes depending on src alignment
lea r8, [MAlignmentDispatchNT]
jmp near [r8+rax*8]
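; rax = alignment of the end of src modulo 16 (set at B1300). Each jump table entry
; points to a loop specialized for that alignment, so the byte shift count can be an
; immediate operand of pslldq/psrldq/palignr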
align 16
C100: ; Code for aligned src. SSE2 and later CPUs
; The nice case, src and dest have same alignment.
; Loop. rcx has positive index from the beginning, counting down to zero
movaps xmm0, [rsi+rcx-10H]
movaps xmm1, [rsi+rcx-20H]
movaps [rdi+rcx-10H], xmm0
movaps [rdi+rcx-20H], xmm1
sub rcx, 20H
jnz C100
; Move the remaining edx bytes (0 - 31):
; move 16-8-4-2-1 bytes, aligned
test edx, edx
jz C500 ; Early out if no more data
test dl, 10H
jz C200
; move 16 bytes
sub rcx, 10H
movaps xmm0, [rsi+rcx]
movaps [rdi+rcx], xmm0
C200: ; Other branches come in here, rcx may contain arbitrary offset
test edx, edx
jz C500 ; Early out if no more data
test dl, 8
jz C210
; move 8 bytes
sub rcx, 8
mov rax, [rsi+rcx]
mov [rdi+rcx], rax
C210: test dl, 4
jz C220
; move 4 bytes
sub rcx, 4
mov eax, [rsi+rcx]
mov [rdi+rcx], eax
jz C500 ; Early out if count divisible by 4
C220: test dl, 2
jz C230
; move 2 bytes
sub rcx, 2
movzx eax, word [rsi+rcx]
mov [rdi+rcx], ax
C230: test dl, 1
jz C500
; move 1 byte
movzx eax, byte [rsi+rcx-1] ; rcx-1 is not always 0 here
mov [rdi+rcx-1], al
C500: ; finished
RETURNM
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Version for processors with SSE2. Aligned read + shift + aligned write
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
memmoveSSE2: ; SSE2 version begins here
memmoveSSE2@: ; local label
PROLOGM memcpySSE2
; Cannot use memcpy. Must move backwards because of overlap between src and dest
cmp rcx, 40H
jae B0100 ; Use main code at B0100 if count >= 64; fall through to simpler code if count < 64
; count < 64. Move 32-16-8-4-2-1 bytes
test cl, 20H
jz A100
; move 32 bytes
; mov is faster than movdqu on SSE2 processors,
; movdqu is faster on later processors
sub ecx, 20H
mov rax, [rsi+rcx+18H]
mov rdx, [rsi+rcx+10H]
mov [rdi+rcx+18H], rax
mov [rdi+rcx+10H], rdx
mov rax, [rsi+rcx+8]
mov rdx, [rsi+rcx]
mov [rdi+rcx+8], rax
mov [rdi+rcx], rdx
A100: test cl, 10H
jz A200
; move 16 bytes
sub ecx, 10H
mov rax, [rsi+rcx+8]
mov rdx, [rsi+rcx]
mov [rdi+rcx+8], rax
mov [rdi+rcx], rdx
A200: test cl, 8
jz A300
; move 8 bytes
sub ecx, 8
mov rax, [rsi+rcx]
mov [rdi+rcx], rax
A300: test cl, 4
jz A400
; move 4 bytes
sub ecx, 4
mov eax, [rsi+rcx]
mov [rdi+rcx], eax
jz A900 ; early out if count divisible by 4
A400: test cl, 2
jz A500
; move 2 bytes
sub ecx, 2
movzx eax, word [rsi+rcx]
mov [rdi+rcx], ax
A500: test cl, 1
jz A900
; move 1 byte
movzx eax, byte [rsi] ; rcx-1 = 0
mov [rdi], al
A900: ; finished
RETURNM
B0100: ; count >= 64
; Note: this part will not always work if count < 64
; Calculate size of last block after last regular boundary of dest
lea edx, [rdi+rcx] ; end of dest
and edx, 0FH
jz B0300 ; Skip if end of dest aligned by 16
; edx = size of last partial block, 1 - 15 bytes
test dl, 3
jz B0210
test dl, 1
jz B0201 ; B0200 if we haven't tested edx,3
; move 1 byte
dec rcx
movzx eax, byte [rsi+rcx]
mov [rdi+rcx], al
B0200: test dl, 2
jz B0210
B0201: ; move 2 bytes
sub rcx, 2
movzx eax, word [rsi+rcx]
mov [rdi+rcx], ax
B0210: test dl, 4
jz B0220
; move 4 bytes
sub rcx, 4
mov eax, [rsi+rcx]
mov [rdi+rcx], eax
B0220: test dl, 8
jz B0300
; move 8 bytes
sub rcx, 8
mov rax, [rsi+rcx]
mov [rdi+rcx], rax
B0300: ; Now end of dest is aligned by 16. Any partial block has been moved
; Find alignment of end of src modulo 16 at this point:
lea eax, [rsi+rcx]
and eax, 0FH
; Set up for loop moving 32 bytes per iteration:
mov edx, ecx ; Save count
and rcx, -20H ; Round down to nearest multiple of 32
sub edx, ecx ; Remaining data after loop
sub rsi, rax ; Nearest preceding aligned block of src
; Add the same to rsi and rdi as we have subtracted from rcx
add rsi, rdx
add rdi, rdx
; Check if count very big
cmp rcx, [CacheBypassLimit]
ja B0400 ; Use non-temporal store if count > CacheBypassLimit
; Dispatch to different codes depending on src alignment
lea r8, [MAlignmentDispatchSSE2]
jmp near [r8+rax*8]
B0400: ; Dispatch to different codes depending on src alignment
lea r8, [MAlignmentDispatchNT]
jmp near [r8+rax*8]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Macros and alignment jump tables
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Macros for each src alignment, SSE2 instruction set:
; Make separate code for each alignment u because the shift instructions
; have the shift count as a constant:
%MACRO MOVE_REVERSE_UNALIGNED_SSE2 2 ; u, nt
; Move rcx + rdx bytes of data
; Source is misaligned. (src-dest) modulo 16 = %1
; %2 = 1 if non-temporal store desired
; eax = %1
; rsi = src - %1 = nearest preceding 16-bytes boundary
; rdi = dest (aligned)
; rcx = count rounded down to the nearest multiple of 32
; edx = remaining bytes to move after loop
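; Example, u = 5: psrldq xmm1, 5 keeps the upper 11 bytes of the current aligned source
; block in the low positions, pslldq xmm0, 11 moves the low 5 bytes of the block above
; into the top positions, and por merges them into one aligned 16-byte destination block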
movdqa xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
sub rcx, 20H
movdqa xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
movdqa xmm2, [rsi+rcx]
movdqa xmm3, xmm1 ; Copy because used twice
pslldq xmm0, 16-%1 ; shift left
psrldq xmm1, %1 ; shift right
por xmm0, xmm1 ; combine blocks
%IF %2 == 0
movdqa [rdi+rcx+10H], xmm0 ; Save aligned
%ELSE
movntdq [rdi+rcx+10H], xmm0 ; Save aligned
%ENDIF
movdqa xmm0, xmm2 ; Save for next iteration
pslldq xmm3, 16-%1 ; shift left
psrldq xmm2, %1 ; shift right
por xmm3, xmm2 ; combine blocks
%IF %2 == 0
movdqa [rdi+rcx], xmm3 ; Save aligned
%ELSE
movntdq [rdi+rcx], xmm3 ; Save aligned
%ENDIF
jnz %%L1
; Move edx remaining bytes
test dl, 10H
jz %%L2
; One more 16-bytes block to move
sub rcx, 10H
movdqa xmm1, [rsi+rcx]
pslldq xmm0, 16-%1 ; shift left
psrldq xmm1, %1 ; shift right
por xmm0, xmm1 ; combine blocks
%IF %2 == 0
movdqa [rdi+rcx], xmm0 ; Save aligned
%ELSE
movntdq [rdi+rcx], xmm0 ; Save aligned
%ENDIF
%%L2: ; Get src pointer back to misaligned state
add rsi, rax
%IF %2 == 1
sfence
%ENDIF
; Move remaining 0 - 15 bytes, unaligned
jmp C200
%ENDMACRO
%MACRO MOVE_REVERSE_UNALIGNED_SSE2_4 1 ; nt
; Special case: u = 4
; %1 = 1 if non-temporal store desired
movaps xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
sub rcx, 20H
movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
movaps xmm2, [rsi+rcx]
movaps xmm3, xmm0
movaps xmm0, xmm2
movss xmm2, xmm1
shufps xmm2, xmm2, 00111001B ; Rotate right
movss xmm1, xmm3
shufps xmm1, xmm1, 00111001B ; Rotate right
%IF %1 == 0
movaps [rdi+rcx+10H], xmm1 ; Save aligned
movaps [rdi+rcx], xmm2 ; Save aligned
%ELSE
movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
movntps [rdi+rcx], xmm2 ; Non-temporal save
%ENDIF
jnz %%L1
; Move edx remaining bytes
test dl, 10H
jz %%L2
; One more 16-bytes block to move
sub rcx, 10H
movaps xmm1, [rsi+rcx]
movss xmm1, xmm0
shufps xmm1, xmm1, 00111001B ; Rotate right
%IF %1 == 0
movaps [rdi+rcx], xmm1 ; Save aligned
%ELSE
movntps [rdi+rcx], xmm1 ; Non-temporal save
%ENDIF
%%L2: ; Get src pointer back to misaligned state
add rsi, rax
%IF %1 == 1
sfence
%ENDIF
; Move remaining 0 - 15 bytes, unaligned
jmp C200
%ENDMACRO
%MACRO MOVE_REVERSE_UNALIGNED_SSE2_8 1 ; nt
; Special case: u = 8
; %1 = 1 if non-temporal store desired
movaps xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
shufps xmm0, xmm0, 01001110B ; Rotate
%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
sub rcx, 20H
movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
shufps xmm1, xmm1, 01001110B ; Rotate
movsd xmm0, xmm1
%IF %1 == 0
movaps [rdi+rcx+10H], xmm0 ; Save aligned
%ELSE
movntps [rdi+rcx+10H], xmm0 ; Non-temporal save
%ENDIF
movaps xmm0, [rsi+rcx]
shufps xmm0, xmm0, 01001110B ; Rotate
movsd xmm1, xmm0
%IF %1 == 0
movaps [rdi+rcx], xmm1 ; Save aligned
%ELSE
movntps [rdi+rcx], xmm1 ; Non-temporal save
%ENDIF
jnz %%L1
; Move edx remaining bytes
test dl, 10H
jz %%L2
; One more 16-bytes block to move
sub rcx, 10H
movaps xmm1, [rsi+rcx]
shufps xmm1, xmm1, 01001110B ; Rotate
movsd xmm0, xmm1
%IF %1 == 0
movaps [rdi+rcx], xmm0 ; Save aligned
%ELSE
movntps [rdi+rcx], xmm0 ; Non-temporal save
%ENDIF
%%L2: ; Get src pointer back to misaligned state
add rsi, rax
%IF %1 == 1
sfence
%ENDIF
; Move remaining 0 - 15 bytes, unaligned
jmp C200
%ENDMACRO
%MACRO MOVE_REVERSE_UNALIGNED_SSE2_12 1 ; nt
; Special case: u = 12
; %1 = 1 if non-temporal store desired
movaps xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
shufps xmm0, xmm0, 10010011B ; Rotate left
%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
sub rcx, 20H
movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
shufps xmm1, xmm1, 10010011B ; Rotate left
movss xmm0, xmm1
%IF %1 == 0
movaps [rdi+rcx+10H], xmm0 ; Save aligned
%ELSE
movntps [rdi+rcx+10H], xmm0 ; Non-temporal save
%ENDIF
movaps xmm0, [rsi+rcx]
shufps xmm0, xmm0, 10010011B ; Rotate left
movss xmm1, xmm0
%IF %1 == 0
movaps [rdi+rcx], xmm1 ; Save aligned
%ELSE
movntps [rdi+rcx], xmm1 ; Non-temporal save
%ENDIF
jnz %%L1
; Move edx remaining bytes
test dl, 10H
jz %%L2
; One more 16-bytes block to move
sub rcx, 10H
movaps xmm1, [rsi+rcx]
shufps xmm1, xmm1, 10010011B ; Rotate left
movss xmm0, xmm1
%IF %1 == 0
movaps [rdi+rcx], xmm0 ; Save aligned
%ELSE
movntps [rdi+rcx], xmm0 ; Non-temporal save
%ENDIF
%%L2: ; Get src pointer back to misaligned state
add rsi, rax
%IF %1 == 1
sfence
%ENDIF
; Move remaining 0 - 15 bytes, unaligned
jmp C200
%ENDMACRO
; Macros for each src alignment, Suppl.SSE3 instruction set:
; Code for unaligned src, Suppl.SSE3 instruction set.
; Make separate code for each alignment u because the palignr instruction
; has the shift count as a constant:
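; palignr xmm0, xmm1, u concatenates xmm0:xmm1 into a 32-byte value (xmm0 in the high
; half), shifts it right by u bytes and keeps the low 16 bytes, which is exactly the
; next aligned destination block when the source is offset by u bytes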
%MACRO MOVE_REVERSE_UNALIGNED_SSSE3 1; u
; Move rcx + rdx bytes of data
; Source is misaligned. (src-dest) modulo 16 = %1
; eax = %1
; rsi = src - %1 = nearest preceding 16-bytes boundary
; rdi = dest (aligned)
; rcx = count rounded down to the nearest multiple of 32
; edx = remaining bytes to move after loop
movdqa xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
movdqa xmm1, [rsi+rcx-10H] ; Read next two blocks
palignr xmm0, xmm1, %1 ; Combine parts into aligned block
movdqa [rdi+rcx-10H], xmm0 ; Save aligned
movdqa xmm0, [rsi+rcx-20H]
palignr xmm1, xmm0, %1 ; Combine parts into aligned block
movdqa [rdi+rcx-20H], xmm1 ; Save aligned
sub rcx, 20H
jnz %%L1
; Set up for edx remaining bytes
test dl, 10H
jz %%L2
; One more 16-bytes block to move
sub rcx, 10H
movdqa xmm1, [rsi+rcx] ; Read next two blocks
palignr xmm0, xmm1, %1 ; Combine parts into aligned block
movdqa [rdi+rcx], xmm0 ; Save aligned
%%L2: ; Get src pointer back to misaligned state
add rsi, rax
; Move remaining 0 - 15 bytes
jmp C200
%ENDMACRO
; Make 15 instances of SSE2 macro for each value of the alignment u.
; These are pointed to by the jump table MAlignmentDispatchSSE2 below
; (aligns and fillers are inserted manually to minimize the
; number of 16-bytes boundaries inside loops)
align 16
D104: MOVE_REVERSE_UNALIGNED_SSE2_4 0
D108: MOVE_REVERSE_UNALIGNED_SSE2_8 0
D10C: MOVE_REVERSE_UNALIGNED_SSE2_12 0
D101: MOVE_REVERSE_UNALIGNED_SSE2 1, 0
D102: MOVE_REVERSE_UNALIGNED_SSE2 2, 0
D103: MOVE_REVERSE_UNALIGNED_SSE2 3, 0
D105: MOVE_REVERSE_UNALIGNED_SSE2 5, 0
D106: MOVE_REVERSE_UNALIGNED_SSE2 6, 0
D107: MOVE_REVERSE_UNALIGNED_SSE2 7, 0
D109: MOVE_REVERSE_UNALIGNED_SSE2 9, 0
D10A: MOVE_REVERSE_UNALIGNED_SSE2 0AH, 0
D10B: MOVE_REVERSE_UNALIGNED_SSE2 0BH, 0
D10D: MOVE_REVERSE_UNALIGNED_SSE2 0DH, 0
D10E: MOVE_REVERSE_UNALIGNED_SSE2 0EH, 0
D10F: MOVE_REVERSE_UNALIGNED_SSE2 0FH, 0
; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
; These are pointed to by the jump table MAlignmentDispatchSSSE3 below
align 16
E104: MOVE_REVERSE_UNALIGNED_SSSE3 4
E108: MOVE_REVERSE_UNALIGNED_SSSE3 8
E10C: MOVE_REVERSE_UNALIGNED_SSSE3 0CH
E101: MOVE_REVERSE_UNALIGNED_SSSE3 1
E102: MOVE_REVERSE_UNALIGNED_SSSE3 2
E103: MOVE_REVERSE_UNALIGNED_SSSE3 3
E105: MOVE_REVERSE_UNALIGNED_SSSE3 5
E106: MOVE_REVERSE_UNALIGNED_SSSE3 6
E107: MOVE_REVERSE_UNALIGNED_SSSE3 7
E109: MOVE_REVERSE_UNALIGNED_SSSE3 9
E10A: MOVE_REVERSE_UNALIGNED_SSSE3 0AH
E10B: MOVE_REVERSE_UNALIGNED_SSSE3 0BH
E10D: MOVE_REVERSE_UNALIGNED_SSSE3 0DH
E10E: MOVE_REVERSE_UNALIGNED_SSSE3 0EH
E10F: MOVE_REVERSE_UNALIGNED_SSSE3 0FH
align 16
F100: ; Non-temporal move, src and dest have same alignment.
; Loop. rcx has positive index from the beginning, counting down to zero
sub rcx, 20H
movaps xmm0, [rsi+rcx+10H]
movaps xmm1, [rsi+rcx]
movntps [rdi+rcx+10H], xmm0
movntps [rdi+rcx], xmm1
jnz F100
; Move the remaining edx bytes (0 - 31):
; move 16-8-4-2-1 bytes, aligned
test dl, 10H
jz C200
; move 16 bytes
sub rcx, 10H
movaps xmm0, [rsi+rcx]
movntps [rdi+rcx], xmm0
sfence
; move the remaining 0 - 15 bytes
jmp C200
; Non-temporal move, src and dest have different alignment.
; Make 15 instances of SSE2 macro for each value of the alignment u.
; These are pointed to by the jump table MAlignmentDispatchNT below
align 16
F101: MOVE_REVERSE_UNALIGNED_SSE2 1, 1
F102: MOVE_REVERSE_UNALIGNED_SSE2 2, 1
F103: MOVE_REVERSE_UNALIGNED_SSE2 3, 1
F104: MOVE_REVERSE_UNALIGNED_SSE2_4 1
F105: MOVE_REVERSE_UNALIGNED_SSE2 5, 1
F106: MOVE_REVERSE_UNALIGNED_SSE2 6, 1
F107: MOVE_REVERSE_UNALIGNED_SSE2 7, 1
F108: MOVE_REVERSE_UNALIGNED_SSE2_8 1
F109: MOVE_REVERSE_UNALIGNED_SSE2 9, 1
F10A: MOVE_REVERSE_UNALIGNED_SSE2 0AH, 1
F10B: MOVE_REVERSE_UNALIGNED_SSE2 0BH, 1
F10C: MOVE_REVERSE_UNALIGNED_SSE2_12 1
F10D: MOVE_REVERSE_UNALIGNED_SSE2 0DH, 1
F10E: MOVE_REVERSE_UNALIGNED_SSE2 0EH, 1
F10F: MOVE_REVERSE_UNALIGNED_SSE2 0FH, 1
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; CPU dispatcher
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
memmoveCPUDispatch: ; CPU dispatcher, check for Suppl-SSE3 instruction set
; This part is executed only once
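; Selection order: default to memmoveSSE2; choose memmoveSSSE3 if CPUID.1:ECX bit 9
; (SSSE3) is set; choose memmoveU if UnalignedIsFaster reports that unaligned reads
; beat PALIGNR; choose memmoveU256 if Store256BitIsFaster reports fast 256-bit stores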
push rbx
push rcx
push rdx
push rsi
push rdi
push r8
; set CacheBypassLimit to half the size of the largest level cache
%ifdef WINDOWS
xor ecx, ecx ; 0 means default
%else
xor edi, edi
%endif
call SetMemcpyCacheLimit@
mov eax, 1
cpuid ; Get feature flags
lea rbx, [memmoveSSE2@]
bt ecx, 9 ; Test bit for SupplSSE3
jnc Q100
lea rbx, [memmoveSSSE3@]
call UnalignedIsFaster
test eax, eax
jz Q100
lea rbx, [memmoveU@]
call Store256BitIsFaster
test eax, eax
jz Q100
lea rbx, [memmoveU256@]
Q100: ; Insert appropriate pointer
mov [memmoveDispatch], rbx
mov rax, rbx
pop r8
pop rdi
pop rsi
pop rdx
pop rcx
pop rbx
; Jump according to the replaced function pointer
jmp rax
; Note: Must call SetMemcpyCacheLimit1 defined in memcpy64.asm
SetMemcpyCacheLimit:
SetMemcpyCacheLimit@:
call SetMemcpyCacheLimit1
mov [CacheBypassLimit], rax
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; data section. jump tables, dispatch function pointer, cache size
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Data segment must be included in function namespace
SECTION .data
align 16
; Jump tables for alignments 0 - 15:
; The SSE2 and SSSE3 versions dispatch into MAlignmentDispatchSSE2,
; MAlignmentDispatchSSSE3 or MAlignmentDispatchNT, indexed by the source
; misalignment (src-dest) modulo 16 (see B0300, B1300 and B1400 above).
; Code pointer for each alignment for SSE2 instruction set
MAlignmentDispatchSSE2:
DQ C100, D101, D102, D103, D104, D105, D106, D107
DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
; Code pointer for each alignment for Suppl-SSE3 instruction set
MAlignmentDispatchSSSE3:
DQ C100, E101, E102, E103, E104, E105, E106, E107
DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
; Code pointer for each alignment for non-temporal store
MAlignmentDispatchNT:
DQ F100, F101, F102, F103, F104, F105, F106, F107
DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
memmoveDispatch: DQ memmoveCPUDispatch
; Bypass cache by using non-temporal moves if count > CacheBypassLimit
; The optimal value of CacheBypassLimit is difficult to estimate, but
; a reasonable value is half the size of the largest cache:
CacheBypassLimit: DQ 0 ; accessed as a qword above, so reserve 8 bytes