;;
;; Copyright (c) 2023, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;; * Redistributions of source code must retain the above copyright notice,
;; this list of conditions and the following disclaimer.
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;; * Neither the name of Intel Corporation nor the names of its contributors
;; may be used to endorse or promote products derived from this software
;; without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;
%ifndef __MEMCPY_INC__
%define __MEMCPY_INC__

%include "reg_sizes.asm"

; This section defines a series of macros to copy small to medium amounts
; of data from memory to memory, where the size is variable but limited.
;
; The macros are all called as:
; memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3
; with the parameters defined as:
;    DST   : register: pointer to dst (not modified)
;    SRC   : register: pointer to src (not modified)
;    SIZE  : register: length in bytes (not modified)
;    TMP0  : 64-bit temp GPR (clobbered)
;    TMP1  : 64-bit temp GPR (clobbered)
;    XTMP0 : temp XMM (clobbered)
;    XTMP1 : temp XMM (clobbered)
;    XTMP2 : temp XMM (clobbered)
;    XTMP3 : temp XMM (clobbered)
;
; The name indicates the options. The name is of the form:
; memcpy_<VEC>_<SZ><ZERO><RET>
; where:
; <VEC>  is either "sse" or "avx" or "avx2"
; <SZ>   is either "64" or "128" and defines largest value of SIZE
; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
; <RET>  is blank or "_ret". If blank, the code falls through. If "_ret",
;        it does a "ret" at the end
;
; For the avx2 versions, the temp XMM registers need to be YMM registers.
; If the SZ is 64, then only two YMM temps are needed, i.e. it is called as:
; memcpy_avx2_64  DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1
; memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3
;
; For example:
; memcpy_sse_64        : SSE,  0 <= size < 64, falls through
; memcpy_avx_64_1      : AVX1, 1 <= size < 64, falls through
; memcpy_sse_128_ret   : SSE,  0 <= size < 128, ends with ret
; memcpy_avx_128_1_ret : AVX1, 1 <= size < 128, ends with ret
;
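; Example invocation (illustrative; the register choice is arbitrary - here
; rdi = dst, rsi = src, rdx = size, with rax, r10 and xmm0-xmm3 as temps):
;       memcpy_sse_64 rdi, rsi, rdx, rax, r10, xmm0, xmm1, xmm2, xmm3
;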
%macro memcpy_sse_64 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0
%endm
%macro memcpy_sse_64_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0
%endm
%macro memcpy_sse_128 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0
%endm
%macro memcpy_sse_128_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0
%endm
%macro memcpy_sse_64_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0
%endm
%macro memcpy_sse_64_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0
%endm
%macro memcpy_sse_128_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0
%endm
%macro memcpy_sse_128_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0
%endm
%macro memcpy_sse_16 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0
%endm
%macro memcpy_sse_16_1 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro memcpy_avx_64 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1
%endm
%macro memcpy_avx_64_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1
%endm
%macro memcpy_avx_128 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1
%endm
%macro memcpy_avx_128_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1
%endm
%macro memcpy_avx_64_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1
%endm
%macro memcpy_avx_64_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1
%endm
%macro memcpy_avx_128_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1
%endm
%macro memcpy_avx_128_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1
%endm
%macro memcpy_avx_16 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1
%endm
%macro memcpy_avx_16_1 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro memcpy_avx2_64 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2
%endm
%macro memcpy_avx2_64_1 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2
%endm
%macro memcpy_avx2_128 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 2
%endm
%macro memcpy_avx2_128_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 2
%endm
%macro memcpy_avx2_64_ret 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2
%endm
%macro memcpy_avx2_64_1_ret 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2
%endm
%macro memcpy_avx2_128_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 2
%endm
%macro memcpy_avx2_128_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 2
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro __memcpy_int 13
%define %%DST     %1  ; register: pointer to dst (not modified)
%define %%SRC     %2  ; register: pointer to src (not modified)
%define %%SIZE    %3  ; register: length in bytes (not modified)
%define %%TMP0    %4  ; 64-bit temp GPR (clobbered)
%define %%TMP1    %5  ; 64-bit temp GPR (clobbered)
%define %%XTMP0   %6  ; temp XMM (clobbered)
%define %%XTMP1   %7  ; temp XMM (clobbered)
%define %%XTMP2   %8  ; temp XMM (clobbered)
%define %%XTMP3   %9  ; temp XMM (clobbered)
%define %%NOT0    %10 ; if not 0, then assume size cannot be zero
%define %%MAXSIZE %11 ; 128, 64, etc
%define %%USERET  %12 ; if not 0, use "ret" at end
%define %%USEAVX  %13 ; 0 = SSE, 1 = AVX1, 2 = AVX2

%if (%%USERET != 0)
 %define %%DONE ret
%else
 %define %%DONE jmp %%end
%endif

%if (%%USEAVX != 0)
 %define %%MOVDQU vmovdqu
%else
 %define %%MOVDQU movdqu
%endif
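
; Copy strategy: each power-of-two width is tested as a bit of SIZE,
; largest first. If the bit for width W is set, SIZE is in [W, 2*W), so
; copying W bytes from the start plus W bytes ending at SRC + SIZE covers
; the whole range (the two runs may overlap), and the copy completes
; without any looping.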
%if (%%MAXSIZE >= 128)
        test    %%SIZE, 64
        jz      %%lt64
 %if (%%USEAVX >= 2)
        %%MOVDQU %%XTMP0, [%%SRC + 0*32]
        %%MOVDQU %%XTMP1, [%%SRC + 1*32]
        %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*32]
        %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*32]

        %%MOVDQU [%%DST + 0*32], %%XTMP0
        %%MOVDQU [%%DST + 1*32], %%XTMP1
        %%MOVDQU [%%DST + %%SIZE - 2*32], %%XTMP2
        %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP3
 %else
        %%MOVDQU %%XTMP0, [%%SRC + 0*16]
        %%MOVDQU %%XTMP1, [%%SRC + 1*16]
        %%MOVDQU %%XTMP2, [%%SRC + 2*16]
        %%MOVDQU %%XTMP3, [%%SRC + 3*16]

        %%MOVDQU [%%DST + 0*16], %%XTMP0
        %%MOVDQU [%%DST + 1*16], %%XTMP1
        %%MOVDQU [%%DST + 2*16], %%XTMP2
        %%MOVDQU [%%DST + 3*16], %%XTMP3

        %%MOVDQU %%XTMP0, [%%SRC + %%SIZE - 4*16]
        %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 3*16]
        %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
        %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]

        %%MOVDQU [%%DST + %%SIZE - 4*16], %%XTMP0
        %%MOVDQU [%%DST + %%SIZE - 3*16], %%XTMP1
        %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
        %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
 %endif
        %%DONE
%endif

%if (%%MAXSIZE >= 64)
%%lt64:
        test    %%SIZE, 32
        jz      %%lt32
 %if (%%USEAVX >= 2)
        %%MOVDQU %%XTMP0, [%%SRC + 0*32]
        %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*32]
        %%MOVDQU [%%DST + 0*32], %%XTMP0
        %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP1
 %else
        %%MOVDQU %%XTMP0, [%%SRC + 0*16]
        %%MOVDQU %%XTMP1, [%%SRC + 1*16]
        %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
        %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]

        %%MOVDQU [%%DST + 0*16], %%XTMP0
        %%MOVDQU [%%DST + 1*16], %%XTMP1
        %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
        %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
 %endif
        %%DONE
%endif

%if (%%MAXSIZE >= 32)
%%lt32:
        test    %%SIZE, 16
        jz      %%lt16
 %if (%%USEAVX >= 2)
        %%MOVDQU XWORD(%%XTMP0), [%%SRC + 0*16]
        %%MOVDQU XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16]
        %%MOVDQU [%%DST + 0*16], XWORD(%%XTMP0)
        %%MOVDQU [%%DST + %%SIZE - 1*16], XWORD(%%XTMP1)
 %else
        %%MOVDQU %%XTMP0, [%%SRC + 0*16]
        %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*16]
        %%MOVDQU [%%DST + 0*16], %%XTMP0
        %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP1
 %endif
        %%DONE
%endif

%if (%%MAXSIZE >= 16)
        test    %%SIZE, 16
        jz      %%lt16
        mov     %%TMP0, [%%SRC]
        mov     %%TMP1, [%%SRC + 8]
        mov     [%%DST], %%TMP0
        mov     [%%DST + 8], %%TMP1
%%lt16:
        test    %%SIZE, 8
        jz      %%lt8
        mov     %%TMP0, [%%SRC]
        mov     %%TMP1, [%%SRC + %%SIZE - 8]
        mov     [%%DST], %%TMP0
        mov     [%%DST + %%SIZE - 8], %%TMP1
        %%DONE
%endif

%if (%%MAXSIZE >= 8)
%%lt8:
        test    %%SIZE, 4
        jz      %%lt4
        mov     DWORD(%%TMP0), [%%SRC]
        mov     DWORD(%%TMP1), [%%SRC + %%SIZE - 4]
        mov     [%%DST], DWORD(%%TMP0)
        mov     [%%DST + %%SIZE - 4], DWORD(%%TMP1)
        %%DONE
%endif

%if (%%MAXSIZE >= 4)
%%lt4:
        test    %%SIZE, 2
        jz      %%lt2
        movzx   DWORD(%%TMP0), word [%%SRC]
        movzx   DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1]
        mov     [%%DST], WORD(%%TMP0)
        mov     [%%DST + %%SIZE - 1], BYTE(%%TMP1)
        %%DONE
%endif

%%lt2:
%if (%%NOT0 == 0)
        test    %%SIZE, 1
        jz      %%end
%endif
        movzx   DWORD(%%TMP0), byte [%%SRC]
        mov     [%%DST], BYTE(%%TMP0)
%%end:
%if (%%USERET != 0)
        ret
%endif
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Utility macro to assist with SIMD shifting
%macro _PSRLDQ 3
%define %%VEC %1
%define %%REG %2
%define %%IMM %3
%ifidn %%VEC, SSE
        psrldq  %%REG, %%IMM
%else
        vpsrldq %%REG, %%REG, %%IMM
%endif
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; This section defines a series of macros to store small to medium amounts
; of data from SIMD registers to memory, where the size is variable but limited.
;
; The macros are all called as:
; simd_store DST, SRC, SIZE, TMP, IDX [, OFFSET]
; with the parameters defined as:
;    DST    : register: pointer to dst (not modified)
;    SRC    : register: src data (clobbered)
;    SIZE   : register: length in bytes (not modified)
;    TMP    : 64-bit temp GPR (clobbered)
;    IDX    : 64-bit GPR to store dst index/offset (clobbered)
;    OFFSET : offset to be applied to destination pointer (optional)
;
; The name indicates the options. The name is of the form:
; simd_store_<VEC>
; where <VEC> is the SIMD instruction type e.g. "sse" or "avx"
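;
; Example invocation (illustrative; assumes xmm0 holds the data and
; rdi = dst, rdx = size, with rax and r10 as temps):
;       simd_store_sse rdi, xmm0, rdx, rax, r10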
%macro simd_store_sse 5-6
%if %0 == 6
        __simd_store %1,%2,%3,%4,%5,SSE,16,%6
%else
        __simd_store %1,%2,%3,%4,%5,SSE,16
%endif
%endm

%macro simd_store_avx 5-6
%if %0 == 6
        __simd_store %1,%2,%3,%4,%5,AVX,16,%6
%else
        __simd_store %1,%2,%3,%4,%5,AVX,16
%endif
%endm

%macro simd_store_sse_15 5-6
%if %0 == 6
        __simd_store %1,%2,%3,%4,%5,SSE,15,%6
%else
        __simd_store %1,%2,%3,%4,%5,SSE,15
%endif
%endm

%macro simd_store_avx_15 5-6
%if %0 == 6
        __simd_store %1,%2,%3,%4,%5,AVX,15,%6
%else
        __simd_store %1,%2,%3,%4,%5,AVX,15
%endif
%endm
%macro __simd_store 7-8
%define %%DST      %1 ; register: pointer to dst (not modified)
%define %%SRC      %2 ; register: src data (clobbered)
%define %%SIZE     %3 ; register: length in bytes (not modified)
%define %%TMP      %4 ; 64-bit temp GPR (clobbered)
%define %%IDX      %5 ; 64-bit temp GPR to store dst idx (clobbered)
%define %%SIMDTYPE %6 ; "SSE" or "AVX"
%define %%MAX_LEN  %7 ; maximum length to be stored
%define %%OFFSET   %8 ; offset to be applied to destination pointer

%define %%PSRLDQ _PSRLDQ %%SIMDTYPE,

%ifidn %%SIMDTYPE, SSE
 %define %%MOVDQU movdqu
 %define %%MOVQ movq
%else
 %define %%MOVDQU vmovdqu
 %define %%MOVQ vmovq
%endif

;; determine max byte size for store operation
%assign max_length_to_store %%MAX_LEN

%if max_length_to_store > 16
%error "__simd_store macro invoked with MAX_LEN bigger than 16!"
%endif

%if %0 == 8
        mov     %%IDX, %%OFFSET
%else
        xor     %%IDX, %%IDX ; zero idx
%endif

%if max_length_to_store == 16
        test    %%SIZE, 16
        jz      %%lt16
        %%MOVDQU [%%DST + %%IDX], %%SRC
        jmp     %%end
%%lt16:
%endif

%if max_length_to_store >= 8
        test    %%SIZE, 8
        jz      %%lt8
        %%MOVQ  [%%DST + %%IDX], %%SRC
        %%PSRLDQ %%SRC, 8
        add     %%IDX, 8
%%lt8:
%endif

        %%MOVQ  %%TMP, %%SRC ; use GPR from now on

%if max_length_to_store >= 4
        test    %%SIZE, 4
        jz      %%lt4
        mov     [%%DST + %%IDX], DWORD(%%TMP)
        shr     %%TMP, 32
        add     %%IDX, 4
%%lt4:
%endif

        test    %%SIZE, 2
        jz      %%lt2
        mov     [%%DST + %%IDX], WORD(%%TMP)
        shr     %%TMP, 16
        add     %%IDX, 2
%%lt2:
        test    %%SIZE, 1
        jz      %%end
        mov     [%%DST + %%IDX], BYTE(%%TMP)
%%end:
%endm
; This section defines a series of macros to load small to medium amounts
; (from 0 to 16 bytes) of data from memory to SIMD registers,
; where the size is variable but limited.
;
; The macros are all called as:
; simd_load DST, SRC, SIZE
; with the parameters defined as:
;    DST  : register: destination XMM register
;    SRC  : register: pointer to src data (not modified)
;    SIZE : register: length in bytes (not modified)
;
; The name indicates the options. The name is of the form:
; simd_load_<VEC>_<SZ><ZERO>
; where:
; <VEC>  is either "sse" or "avx"
; <SZ>   is either "15" or "16" and defines largest value of SIZE
; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
;
; For example:
; simd_load_sse_16   : SSE, 0 <= size <= 16
; simd_load_avx_15_1 : AVX, 1 <= size <= 15
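;
; Example invocation (illustrative; assumes rsi = src and rdx = size):
;       simd_load_sse_16 xmm0, rsi, rdx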
%macro simd_load_sse_15_1 3
        __simd_load %1,%2,%3,0,0,SSE
%endm
%macro simd_load_sse_15 3
        __simd_load %1,%2,%3,1,0,SSE
%endm
%macro simd_load_sse_16_1 3
        __simd_load %1,%2,%3,0,1,SSE
%endm
%macro simd_load_sse_16 3
        __simd_load %1,%2,%3,1,1,SSE
%endm
%macro simd_load_avx_15_1 3
        __simd_load %1,%2,%3,0,0,AVX
%endm
%macro simd_load_avx_15 3
        __simd_load %1,%2,%3,1,0,AVX
%endm
%macro simd_load_avx_16_1 3
        __simd_load %1,%2,%3,0,1,AVX
%endm
%macro simd_load_avx_16 3
        __simd_load %1,%2,%3,1,1,AVX
%endm
%macro __simd_load 6
%define %%DST       %1 ; [out] destination XMM register
%define %%SRC       %2 ; [in] pointer to src data
%define %%SIZE      %3 ; [in] length in bytes (0-16 bytes)
%define %%ACCEPT_0  %4 ; 0 = min length = 1, 1 = min length = 0
%define %%ACCEPT_16 %5 ; 0 = max length = 15, 1 = max length = 16
%define %%SIMDTYPE  %6 ; "SSE" or "AVX"

%ifidn %%SIMDTYPE, SSE
 %define %%MOVDQU movdqu
 %define %%PINSRB pinsrb
 %define %%PINSRQ pinsrq
 %define %%PXOR pxor
%else
 %define %%MOVDQU vmovdqu
 %define %%PINSRB vpinsrb
 %define %%PINSRQ vpinsrq
 %define %%PXOR vpxor
%endif

%if (%%ACCEPT_16 != 0)
        test    %%SIZE, 16
        jz      %%_skip_16
        %%MOVDQU %%DST, [%%SRC]
        jmp     %%end_load
%%_skip_16:
%endif
        %%PXOR  %%DST, %%DST ; clear XMM register
%if (%%ACCEPT_0 != 0)
        or      %%SIZE, %%SIZE
        je      %%end_load
%endif
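        ; Dispatch on the exact byte count: each cmp settles two cases
        ; (jb for the odd count below, je for an exact match), and the
        ; targets fall through from the highest byte offset down, so every
        ; remaining byte (or the final qword) is inserted exactly once.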
        cmp     %%SIZE, 2
        jb      %%_size_1
        je      %%_size_2
        cmp     %%SIZE, 4
        jb      %%_size_3
        je      %%_size_4
        cmp     %%SIZE, 6
        jb      %%_size_5
        je      %%_size_6
        cmp     %%SIZE, 8
        jb      %%_size_7
        je      %%_size_8
        cmp     %%SIZE, 10
        jb      %%_size_9
        je      %%_size_10
        cmp     %%SIZE, 12
        jb      %%_size_11
        je      %%_size_12
        cmp     %%SIZE, 14
        jb      %%_size_13
        je      %%_size_14
%%_size_15:
        %%PINSRB %%DST, [%%SRC + 14], 14
%%_size_14:
        %%PINSRB %%DST, [%%SRC + 13], 13
%%_size_13:
        %%PINSRB %%DST, [%%SRC + 12], 12
%%_size_12:
        %%PINSRB %%DST, [%%SRC + 11], 11
%%_size_11:
        %%PINSRB %%DST, [%%SRC + 10], 10
%%_size_10:
        %%PINSRB %%DST, [%%SRC + 9], 9
%%_size_9:
        %%PINSRB %%DST, [%%SRC + 8], 8
%%_size_8:
        %%PINSRQ %%DST, [%%SRC], 0
        jmp     %%end_load
%%_size_7:
        %%PINSRB %%DST, [%%SRC + 6], 6
%%_size_6:
        %%PINSRB %%DST, [%%SRC + 5], 5
%%_size_5:
        %%PINSRB %%DST, [%%SRC + 4], 4
%%_size_4:
        %%PINSRB %%DST, [%%SRC + 3], 3
%%_size_3:
        %%PINSRB %%DST, [%%SRC + 2], 2
%%_size_2:
        %%PINSRB %%DST, [%%SRC + 1], 1
%%_size_1:
        %%PINSRB %%DST, [%%SRC + 0], 0
%%end_load:
%endm
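
; simd_load_avx2 : load 0 to 32 bytes from memory into a YMM register.
; Called as:
; simd_load_avx2 DST, SRC, SIZE, IDX, TMP
; For example (illustrative; assumes rsi = src, rdx = size, with rax and
; r10 as temps):
;       simd_load_avx2 ymm0, rsi, rdx, rax, r10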
%macro simd_load_avx2 5
%define %%DST  %1 ; [out] destination YMM register
%define %%SRC  %2 ; [in] pointer to src data
%define %%SIZE %3 ; [in] length in bytes (0-32 bytes)
%define %%IDX  %4 ; [clobbered] temp GP register to store src idx
%define %%TMP  %5 ; [clobbered] temp GP register

        test    %%SIZE, 32
        jz      %%_skip_32
        vmovdqu %%DST, [%%SRC]
        jmp     %%end_load
%%_skip_32:
        vpxor   %%DST, %%DST ; clear YMM register
        or      %%SIZE, %%SIZE
        je      %%end_load

        lea     %%IDX, [%%SRC]
        mov     %%TMP, %%SIZE
        cmp     %%SIZE, 16
        jle     %%_check_size
        add     %%IDX, 16
        sub     %%TMP, 16
%%_check_size:
        cmp     %%TMP, 2
        jb      %%_size_1
        je      %%_size_2
        cmp     %%TMP, 4
        jb      %%_size_3
        je      %%_size_4
        cmp     %%TMP, 6
        jb      %%_size_5
        je      %%_size_6
        cmp     %%TMP, 8
        jb      %%_size_7
        je      %%_size_8
        cmp     %%TMP, 10
        jb      %%_size_9
        je      %%_size_10
        cmp     %%TMP, 12
        jb      %%_size_11
        je      %%_size_12
        cmp     %%TMP, 14
        jb      %%_size_13
        je      %%_size_14
        cmp     %%TMP, 15
        je      %%_size_15
%%_size_16:
        vmovdqu XWORD(%%DST), [%%IDX]
        jmp     %%end_load
%%_size_15:
        vpinsrb XWORD(%%DST), [%%IDX + 14], 14
%%_size_14:
        vpinsrb XWORD(%%DST), [%%IDX + 13], 13
%%_size_13:
        vpinsrb XWORD(%%DST), [%%IDX + 12], 12
%%_size_12:
        vpinsrb XWORD(%%DST), [%%IDX + 11], 11
%%_size_11:
        vpinsrb XWORD(%%DST), [%%IDX + 10], 10
%%_size_10:
        vpinsrb XWORD(%%DST), [%%IDX + 9], 9
%%_size_9:
        vpinsrb XWORD(%%DST), [%%IDX + 8], 8
%%_size_8:
        vpinsrq XWORD(%%DST), [%%IDX], 0
        jmp     %%_check_higher_16
%%_size_7:
        vpinsrb XWORD(%%DST), [%%IDX + 6], 6
%%_size_6:
        vpinsrb XWORD(%%DST), [%%IDX + 5], 5
%%_size_5:
        vpinsrb XWORD(%%DST), [%%IDX + 4], 4
%%_size_4:
        vpinsrb XWORD(%%DST), [%%IDX + 3], 3
%%_size_3:
        vpinsrb XWORD(%%DST), [%%IDX + 2], 2
%%_size_2:
        vpinsrb XWORD(%%DST), [%%IDX + 1], 1
%%_size_1:
        vpinsrb XWORD(%%DST), [%%IDX + 0], 0
%%_check_higher_16:
        test    %%SIZE, 16
        jz      %%end_load
        ; Move last bytes loaded to upper half and load 16 bytes in lower half
        vinserti128 %%DST, XWORD(%%DST), 1
        vinserti128 %%DST, [%%SRC], 0
%%end_load:
%endm
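
; simd_store_avx2 : store 0 to 32 bytes from a YMM register to memory.
; Called as:
; simd_store_avx2 DST, SRC, SIZE, TMP, IDX
; For example (illustrative; assumes rdi = dst, rdx = size, with rax and
; r10 as temps):
;       simd_store_avx2 rdi, ymm0, rdx, rax, r10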
%macro simd_store_avx2 5
%define %%DST  %1 ; register: pointer to dst (not modified)
%define %%SRC  %2 ; register: src data (clobbered)
%define %%SIZE %3 ; register: length in bytes (not modified)
%define %%TMP  %4 ; 64-bit temp GPR (clobbered)
%define %%IDX  %5 ; 64-bit temp GPR to store dst idx (clobbered)

        xor     %%IDX, %%IDX ; zero idx
        test    %%SIZE, 32
        jz      %%lt32
        vmovdqu [%%DST], %%SRC
        jmp     %%end
%%lt32:
        test    %%SIZE, 16
        jz      %%lt16
        vmovdqu [%%DST], XWORD(%%SRC)
        ; Move upper half to lower half for further stores
        vperm2i128 %%SRC, %%SRC, %%SRC, 0x81
        add     %%IDX, 16
%%lt16:
        test    %%SIZE, 8
        jz      %%lt8
        vmovq   [%%DST + %%IDX], XWORD(%%SRC)
        vpsrldq XWORD(%%SRC), 8
        add     %%IDX, 8
%%lt8:
        vmovq   %%TMP, XWORD(%%SRC) ; use GPR from now on
        test    %%SIZE, 4
        jz      %%lt4
        mov     [%%DST + %%IDX], DWORD(%%TMP)
        shr     %%TMP, 32
        add     %%IDX, 4
%%lt4:
        test    %%SIZE, 2
        jz      %%lt2
        mov     [%%DST + %%IDX], WORD(%%TMP)
        shr     %%TMP, 16
        add     %%IDX, 2
%%lt2:
        test    %%SIZE, 1
        jz      %%end
        mov     [%%DST + %%IDX], BYTE(%%TMP)
%%end:
%endm

%endif ; ifndef __MEMCPY_INC__