memset64.asm 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. %include "defs.asm"
  2. ;************************* memset64.asm *************************************
  3. ; Author: Agner Fog
  4. ; Date created: 2008-07-19
  5. ; Last modified: 2016-11-12 (patched version with AVX512 support removed)
  6. ; Description:
  7. ; Faster version of the standard memset function:
  8. ; void * A_memset(void * dest, int c, size_t count);
  9. ; Sets 'count' bytes from 'dest' to the 8-bit value 'c'
  10. ;
  11. ; Overriding standard function memset:
  12. ; The alias ?OVR_memset is changed to _memset in the object file if
  13. ; it is desired to override the standard library function memset.
  14. ;
  15. ; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
  16. ; extern "C" void SetMemsetCacheLimit(size_t limit); // Change limit in GetMemsetCacheLimit (0 = restore the default)
  17. ;
  18. ; Optimization:
  19. ; Uses XMM registers to set 16 bytes at a time, aligned, or YMM registers to set 32 bytes at a time when 256-bit stores are faster.
  20. ;
  21. ; The latest version of this file is available at:
  22. ; www.agner.org/optimize/asmexamples.zip
  23. ; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
  24. ;******************************************************************************
  25. default rel
  26. global A_memset: function ; Function memset
  27. global EXP(memset): function ; ?OVR removed if standard function memset overridden
  28. global memsetSSE2: function ; SSE2 version
  29. global memsetAVX: function ; version for CPUs with fast 256-bit store
  30. global GetMemsetCacheLimit: function ; Data blocks bigger than this will be stored uncached by memset
  31. global SetMemsetCacheLimit: function ; Change limit in GetMemsetCacheLimit
  32. ; Imported from cachesize64.asm:
  33. extern DataCacheSize ; Get size of data cache
  34. ; Imported from unalignedisfaster64.asm:
  35. extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
  ;-----------------------------------------------------------------------
  ; PROLOGM - shared entry sequence for memsetAVX and memsetSSE2
  ;
  ; Binds symbolic register names to the arguments of
  ;     memset(dest, c, count)
  ; for the ABI selected at assembly time (WINDOWS defined or not).
  ;
  ; After expansion:
  ;     eax      = c, zero-extended from its low 8 bits
  ;     Rdest    = dest
  ;     Rcount   = count (rdx under both ABIs)
  ;     Rcount2  = a second copy of count
  ;     Rdest2   = name of a scratch register for a copy of dest
  ;                (only named here; the caller of the macro loads it)
  ;-----------------------------------------------------------------------
  %macro PROLOGM 0
  %ifdef WINDOWS
          %define Rdest    rcx            ; arg 1: dest
          %define Rcount   rdx            ; count, moved here from r8 below
          %define Rdest2   r9             ; scratch: copy of dest
          %define Rcount2  r8             ; arg 3 stays in r8 as the count copy
          movzx   eax, dl                 ; c - must be read before rdx is overwritten
          mov     rdx, r8                 ; count
  %else                                   ; Unix
          %define Rdest    rdi            ; arg 1: dest
          %define Rcount   rdx            ; arg 3: count
          %define Rdest2   rcx            ; scratch: copy of dest
          %define Rcount2  rsi            ; count copy (reuses the register of c)
          movzx   eax, sil                ; c - must be read before rsi is overwritten
          mov     Rcount2, Rcount         ; copy count
  %endif
  %endmacro
  section .text align=16

  ;-----------------------------------------------------------------------
  ; extern "C" void * memset(void * dest, int c, size_t count);
  ;
  ; Public entry points. Both names share one instruction: an indirect jump
  ; through memsetDispatch, so the arguments reach the selected
  ; implementation untouched in their ABI registers.
  ;-----------------------------------------------------------------------
  A_memset:
  EXP(memset):
          jmp     qword [memsetDispatch]  ; go to whatever the dispatch pointer holds
  ;-----------------------------------------------------------------------
  ; memsetAVX - version that uses 256-bit (ymm) stores for the bulk part
  ;
  ; Counts 0..16 go through the jump table; larger counts continue at B100.
  ;-----------------------------------------------------------------------
  memsetAVX:
  memsetAVX@:                             ; alias used by the CPU dispatcher
          PROLOGM
          mov     Rdest2, Rdest           ; keep the original dest for the return value
          imul    eax, 0x01010101         ; replicate c into all four bytes of eax
          cmp     Rcount, 16
          ja      B100                    ; more than 16 bytes: vector path

  ; Small counts (0..16). memsetSSE2 also jumps here.
  B050:
          lea     r10, [MemsetJTab]
          jmp     qword [r10+Rcount*8]    ; one table entry per count
  ;-----------------------------------------------------------------------
  ; Jump-table targets for count = 0..16 (entered via MemsetJTab).
  ;
  ; On entry: eax = c replicated in every byte, Rdest = dest,
  ;           Rdest2 = copy of dest (returned in rax).
  ; Each chain is entered at the label matching the count and falls through
  ; 4 bytes at a time; every chain ends by returning dest.
  ;-----------------------------------------------------------------------

  ; count = 16, 12, 8, 4, 0
  M16:
          mov     [Rdest+12], eax
  M12:
          mov     [Rdest+8], eax
  M08:
          mov     [Rdest+4], eax
  M04:
          mov     [Rdest], eax
  M00:
          mov     rax, Rdest2             ; return dest
          ret

  ; count = 15, 11, 7, 3, 1
  M15:
          mov     [Rdest+11], eax
  M11:
          mov     [Rdest+7], eax
  M07:
          mov     [Rdest+3], eax
  M03:
          mov     [Rdest+1], ax
  M01:
          mov     [Rdest], al
          mov     rax, Rdest2             ; return dest
          ret

  ; count = 14, 10, 6, 2
  M14:
          mov     [Rdest+10], eax
  M10:
          mov     [Rdest+6], eax
  M06:
          mov     [Rdest+2], eax
  M02:
          mov     [Rdest], ax
          mov     rax, Rdest2             ; return dest
          ret

  ; count = 13, 9, 5
  M13:
          mov     [Rdest+9], eax
  M09:
          mov     [Rdest+5], eax
  M05:
          mov     [Rdest+1], eax
          mov     [Rdest], al
          mov     rax, Rdest2             ; return dest
          ret
  ;-----------------------------------------------------------------------
  ; B100 - AVX path for count > 16
  ;
  ; On entry: eax = c replicated in every byte, Rdest = dest,
  ;           Rcount = count, Rdest2 = copy of dest.
  ; Inside:   xmm0/ymm0 = fill pattern, rax = dest + count (one past the end).
  ; Returns dest in rax.
  ;-----------------------------------------------------------------------
  B100:
          movd    xmm0, eax
          pshufd  xmm0, xmm0, 0           ; replicate c into all 16 bytes of xmm0
          lea     rax, [Rdest+Rcount]     ; rax = end of the buffer
          cmp     Rcount, 32
          jbe     .upto32                 ; 17..32 bytes: two xmm stores are enough

          ; Head: one unaligned 16-byte store, then one aligned 16-byte store.
          ; Overlapping writes are used instead of branching on the size of
          ; the unaligned head, to avoid possibly mispredicted branches.
          movups  [Rdest], xmm0
          add     Rdest, 16
          and     Rdest, -16              ; next 16-byte boundary
          movaps  [Rdest], xmm0
          add     Rdest, 16
          and     Rdest, -32              ; first 32-byte boundary for the main loop

          cmp     Rcount, [MemsetCacheLimit]
          ja      .nt_setup               ; count > MemsetCacheLimit: non-temporal stores

          ; ---- cached stores ----
          mov     Rcount, rax
          and     Rcount, -32             ; Rcount = last 32-byte boundary before the end
          sub     Rdest, Rcount           ; Rdest = -(bytes in the 32-byte block part)
          jnb     .tail                   ; nothing to do in the block part
          vinsertf128 ymm0, ymm0, xmm0, 1 ; widen the pattern to 256 bits
          align   16
  .fill32:
          ; Rcount = end of the block part, Rdest = negative offset counting up to 0
          vmovaps [Rcount+Rdest], ymm0
          add     Rdest, 32
          jnz     .fill32
          vzeroupper
  .tail:
          ; Fewer than 32 bytes remain before rax: rewrite the last 32 bytes, overlapping
          movups  [rax-32], xmm0
          movups  [rax-16], xmm0
          mov     rax, Rdest2             ; return dest
          ret

          ; ---- non-temporal stores: same structure as the cached case ----
  .nt_setup:
          mov     Rcount, rax
          and     Rcount, -32             ; Rcount = last 32-byte boundary before the end
          sub     Rdest, Rcount           ; Rdest = -(bytes in the 32-byte block part)
          jnb     .nt_tail                ; nothing to do in the block part
          vinsertf128 ymm0, ymm0, xmm0, 1 ; widen the pattern to 256 bits
          align   16
  .nt_fill32:
          ; Rcount = end of the block part, Rdest = negative offset counting up to 0
          vmovntps [Rcount+Rdest], ymm0
          add     Rdest, 32
          jnz     .nt_fill32
          sfence
          vzeroupper
  .nt_tail:
          ; Fewer than 32 bytes remain before rax: rewrite the last 32 bytes, overlapping
          movups  [rax-32], xmm0
          movups  [rax-16], xmm0
          mov     rax, Rdest2             ; return dest
          ret

          ; ---- 16 < count <= 32: first and last 16 bytes, possibly overlapping ----
  .upto32:
          movups  [Rdest], xmm0
          movups  [rax-16], xmm0
          mov     rax, Rdest2             ; return dest
          ret
  ;-----------------------------------------------------------------------
  ; memsetSSE2 - version that uses 128-bit (xmm) stores for the bulk part
  ;
  ; Counts 0..16 are handed to the shared jump-table code at B050.
  ; For larger counts:
  ;     xmm0    = fill pattern
  ;     Rdest2  = original dest (returned in rax)
  ;     Rcount2 = original count (used to address the tail)
  ;-----------------------------------------------------------------------
  memsetSSE2:
  memsetSSE2@:                            ; alias used by the CPU dispatcher
          PROLOGM
          mov     Rdest2, Rdest           ; keep the original dest for the return value
          imul    eax, 0x01010101         ; replicate c into all four bytes of eax
          cmp     Rcount, 16
          jna     B050                    ; 0..16 bytes: jump-table code

          movd    xmm0, eax
          pshufd  xmm0, xmm0, 0           ; replicate c into all 16 bytes of xmm0

          ; Head: always write the first 16 bytes (as two 8-byte stores), even
          ; though they may overlap the aligned part. This avoids possibly
          ; mispredicted branches on the size of the unaligned head.
          movq    [Rdest], xmm0
          movq    [Rdest+8], xmm0

  .check_limit:
          mov     rax, [MemsetCacheLimit]
          cmp     Rcount, rax
          ja      .nt_setup               ; count > MemsetCacheLimit: non-temporal stores

          ; ---- cached stores ----
          lea     Rcount, [Rdest+Rcount-1]
          and     Rcount, -16             ; end of aligned part: (dest+count-1) rounded down to 16
          add     Rdest, 16
          and     Rdest, -16              ; start of aligned part: next 16-byte boundary after dest
          sub     Rdest, Rcount           ; Rdest = -(size of aligned part)
          jnb     .tail                   ; aligned part is empty
          align   16
  .fill16:
          ; Rcount = end of aligned part, Rdest = negative offset counting up to 0
          movdqa  [Rcount+Rdest], xmm0
          add     Rdest, 16
          jnz     .fill16
  .tail:
          ; Tail: always write the last 16 bytes, possibly overlapping the aligned part
          mov     rax, Rdest2             ; return dest
          movq    [rax+Rcount2-16], xmm0
          movq    [rax+Rcount2-8], xmm0
          ret

          ; ---- non-temporal stores: same structure as the cached case ----
  .nt_setup:
          lea     Rcount, [Rdest+Rcount-1]
          and     Rcount, -16             ; end of aligned part: (dest+count-1) rounded down to 16
          add     Rdest, 16
          and     Rdest, -16              ; start of aligned part: next 16-byte boundary after dest
          sub     Rdest, Rcount           ; Rdest = -(size of aligned part)
          jnb     .nt_tail                ; aligned part is empty
          align   16
  .nt_fill16:
          ; Rcount = end of aligned part, Rdest = negative offset counting up to 0
          movntdq [Rcount+Rdest], xmm0
          add     Rdest, 16
          jnz     .nt_fill16
          sfence
  .nt_tail:
          ; Tail: always write the last 16 bytes, possibly overlapping the aligned part
          mov     rax, Rdest2             ; return dest
          movq    [rax+Rcount2-16], xmm0
          movq    [rax+Rcount2-8], xmm0
          ret
  ;-----------------------------------------------------------------------
  ; memsetCPUDispatch - one-time CPU dispatcher
  ;
  ; memsetDispatch initially points here, so the first memset call arrives
  ; with the memset arguments still in their ABI registers. The dispatcher:
  ;   1. initializes MemsetCacheLimit via GetMemsetCacheLimit,
  ;   2. selects memsetAVX if Store256BitIsFaster returns nonzero,
  ;      otherwise memsetSSE2,
  ;   3. stores the selection in memsetDispatch,
  ;   4. restores the argument registers and jumps to the selected version.
  ;
  ; Stack: this code is reached by a jmp from a called function, so
  ; rsp = 8 (mod 16) on entry and still 8 (mod 16) after the six pushes.
  ; 40 bytes are reserved before the calls: 32 bytes of Win64 shadow space
  ; (so a callee's home area cannot overlap the saved registers) plus 8 bytes
  ; that bring rsp to a 16-byte boundary, as both ABIs require at a call.
  ; The reservation is harmless on Unix.
  ;-----------------------------------------------------------------------
  memsetCPUDispatch:
          ; Save rbx (callee-saved, used below) and every register that can
          ; carry a memset argument under either ABI.
          push    rbx
          push    rcx
          push    rdx
          push    rsi
          push    rdi
          push    r8
          sub     rsp, 28H                ; 32 bytes shadow space + 8 bytes alignment

          ; Make sure MemsetCacheLimit is set before the first real memset runs
          call    GetMemsetCacheLimit@

          lea     rbx, [memsetSSE2@]      ; default choice
          call    Store256BitIsFaster     ; nonzero if 256-bit stores are available and faster
          test    eax, eax
          jz      .install
          lea     rbx, [memsetAVX@]
  .install:
          mov     [memsetDispatch], rbx   ; later calls go straight to the chosen version
          mov     rax, rbx

          add     rsp, 28H
          pop     r8
          pop     rdi
          pop     rsi
          pop     rdx
          pop     rcx
          pop     rbx
          jmp     rax                     ; continue with the original arguments
  ;-----------------------------------------------------------------------
  ; extern "C" size_t GetMemsetCacheLimit();
  ;
  ; Returns MemsetCacheLimit: data blocks bigger than this are stored
  ; uncached (non-temporal) by memset. If the stored value is zero it is
  ; first initialized to half of what DataCacheSize(0) returns, or to
  ; 4 Mbytes if that half is zero.
  ;
  ; Out:  rax = limit
  ; Stack: DataCacheSize is called with rsp aligned to 16 and with 32 bytes
  ;        of Win64 shadow space. The alignment is done dynamically because
  ;        this function is reached both from external callers and from
  ;        internal call sites whose stack alignment differs.
  ; NOTE(review): only the low 32 bits of the DataCacheSize result are used,
  ;        as in the original code - confirm this is intended.
  ;-----------------------------------------------------------------------
  GetMemsetCacheLimit:
  GetMemsetCacheLimit@:
          mov     rax, [MemsetCacheLimit]
          test    rax, rax
          jnz     .done                   ; already initialized

          push    rbx                     ; callee-saved: holds the original rsp across the call
          mov     rbx, rsp
          sub     rsp, 20H                ; Win64 shadow space (harmless on Unix)
          and     rsp, -10H               ; 16-byte alignment at the call
  %ifdef WINDOWS
          xor     ecx, ecx                ; 0 means largest level cache
  %else
          xor     edi, edi                ; 0 means largest level cache
  %endif
          call    DataCacheSize           ; get cache size
          mov     rsp, rbx                ; undo alignment and shadow space
          pop     rbx

          shr     eax, 1                  ; half the size
          jnz     .save
          mov     eax, 400000H            ; cannot determine cache size: use 4 Mbytes
  .save:
          mov     [MemsetCacheLimit], eax ; upper half of the qword is already zero here
  .done:
          ret
  ;-----------------------------------------------------------------------
  ; extern "C" void SetMemsetCacheLimit(size_t limit);
  ;
  ; Changes the limit reported by GetMemsetCacheLimit.
  ; In:   first integer argument (rcx on Windows, rdi on Unix) = new limit.
  ;       Zero means "default": the stored limit is cleared and recomputed
  ;       by GetMemsetCacheLimit.
  ; Out:  rax holds the value that was stored.
  ;-----------------------------------------------------------------------
  SetMemsetCacheLimit:
  %ifdef WINDOWS
          mov     rax, rcx
  %else
          mov     rax, rdi
  %endif
          test    rax, rax
          jnz     .store                  ; explicit limit given

          ; Zero requested: clear the variable so GetMemsetCacheLimit recomputes it
          mov     [MemsetCacheLimit], rax
          call    GetMemsetCacheLimit@
  .store:
          mov     [MemsetCacheLimit], rax
          ret
  section .data
          align   16

  ; Jump table indexed by count, for count = 0..16 (17 entries)
  MemsetJTab:
          dq      M00, M01, M02, M03
          dq      M04, M05, M06, M07
          dq      M08, M09, M10, M11
          dq      M12, M13, M14, M15
          dq      M16

  ; Pointer to the memset version in use.
  ; It starts out pointing to memsetCPUDispatch, which replaces it with the
  ; appropriate version, so that memsetCPUDispatch is executed only once.
  memsetDispatch:
          dq      memsetCPUDispatch

  ; Non-temporal (cache-bypassing) stores are used when count > MemsetCacheLimit.
  ; The optimal value is difficult to estimate; a reasonable choice is half
  ; the size of the largest cache. Zero means "not yet determined".
  MemsetCacheLimit:
          dq      0