huf_decompress_amd64.S 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602
  1. /*
  2. * Copyright (c) Meta Platforms, Inc. and affiliates.
  3. * All rights reserved.
  4. *
  5. * This source code is licensed under both the BSD-style license (found in the
  6. * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  7. * in the COPYING file in the root directory of this source tree).
  8. * You may select, at your option, one of the above-listed licenses.
  9. */
  10. #include "../common/portability_macros.h"
  11. #if defined(__ELF__) && defined(__GNUC__)
  12. /* Stack marking
  13. * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
  14. */
  15. .section .note.GNU-stack,"",%progbits
  16. #if defined(__aarch64__)
  17. /* Mark that this assembly supports BTI & PAC, because it is empty for aarch64.
  18. * See: https://github.com/facebook/zstd/issues/3841
  19. * See: https://gcc.godbolt.org/z/sqr5T4ffK
  20. * See: https://lore.kernel.org/linux-arm-kernel/20200429211641.9279-8-broonie@kernel.org/
  21. * See: https://reviews.llvm.org/D62609
  22. */
  23. .pushsection .note.gnu.property, "a"
  24. .p2align 3
  25. .long 4 /* size of the name - "GNU\0" */
  26. .long 0x10 /* size of descriptor */
  27. .long 0x5 /* NT_GNU_PROPERTY_TYPE_0 */
  28. .asciz "GNU"
  29. .long 0xc0000000 /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */
  30. .long 4 /* pr_datasz - 4 bytes */
  31. .long 3 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */
  32. .p2align 3 /* pr_padding - bring everything to 8 byte alignment */
  33. .popsection
  34. #endif
  35. #endif
  36. #if ZSTD_ENABLE_ASM_X86_64_BMI2
  37. /* Calling convention:
  38. *
  39. * %rdi (or %rcx on Windows) contains the first argument: HUF_DecompressAsmArgs*.
  40. * %rbp isn't maintained (no frame pointer).
  41. * %rsp contains the stack pointer that grows down.
  42. * No red-zone is assumed, only addresses >= %rsp are used.
  43. * All register contents are preserved.
  44. */
  45. ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
  46. ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
  47. ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
  48. ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
  49. .global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
  50. .global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
  51. .global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
  52. .global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
  53. .text
  54. /* Sets up register mappings for clarity.
  55. * op[], bits[], dtable & ip[0] each get their own register.
  56. * ip[1,2,3] & olimit alias var[].
  57. * %rax is a scratch register.
  58. */
  59. #define op0 rsi
  60. #define op1 rbx
  61. #define op2 rcx
  62. #define op3 rdi
  63. #define ip0 r8
  64. #define ip1 r9
  65. #define ip2 r10
  66. #define ip3 r11
  67. #define bits0 rbp
  68. #define bits1 rdx
  69. #define bits2 r12
  70. #define bits3 r13
  71. #define dtable r14
  72. #define olimit r15
  73. /* var[] aliases ip[1,2,3] & olimit
  74. * ip[1,2,3] are saved every iteration.
  75. * olimit is only used in compute_olimit.
  76. */
  77. #define var0 r15
  78. #define var1 r9
  79. #define var2 r10
  80. #define var3 r11
  81. /* 32-bit var registers */
  82. #define vard0 r15d
  83. #define vard1 r9d
  84. #define vard2 r10d
  85. #define vard3 r11d
  86. /* Calls X(N) for each stream 0, 1, 2, 3. */
  87. #define FOR_EACH_STREAM(X) \
  88. X(0); \
  89. X(1); \
  90. X(2); \
  91. X(3)
  92. /* Calls X(N, idx) for each stream 0, 1, 2, 3. */
  93. #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
  94. X(0, idx); \
  95. X(1, idx); \
  96. X(2, idx); \
  97. X(3, idx)
  98. /* Define both _HUF_* & HUF_* symbols because MacOS
  99. * C symbols are prefixed with '_' & Linux symbols aren't.
  100. */
  101. _HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
  102. HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
  103. ZSTD_CET_ENDBRANCH
  104. /* Save all registers - even if they are callee saved for simplicity. */
  105. push %rax
  106. push %rbx
  107. push %rcx
  108. push %rdx
  109. push %rbp
  110. push %rsi
  111. push %rdi
  112. push %r8
  113. push %r9
  114. push %r10
  115. push %r11
  116. push %r12
  117. push %r13
  118. push %r14
  119. push %r15
  120. /* Read HUF_DecompressAsmArgs* args from %rax */
  121. #if defined(_WIN32)
  122. movq %rcx, %rax
  123. #else
  124. movq %rdi, %rax
  125. #endif
  126. movq 0(%rax), %ip0
  127. movq 8(%rax), %ip1
  128. movq 16(%rax), %ip2
  129. movq 24(%rax), %ip3
  130. movq 32(%rax), %op0
  131. movq 40(%rax), %op1
  132. movq 48(%rax), %op2
  133. movq 56(%rax), %op3
  134. movq 64(%rax), %bits0
  135. movq 72(%rax), %bits1
  136. movq 80(%rax), %bits2
  137. movq 88(%rax), %bits3
  138. movq 96(%rax), %dtable
  139. push %rax /* argument */
  140. push 104(%rax) /* ilowest */
  141. push 112(%rax) /* oend */
  142. push %olimit /* olimit space */
  143. subq $24, %rsp
  144. .L_4X1_compute_olimit:
  145. /* Computes how many iterations we can do safely
  146. * %r15, %rax may be clobbered
  147. * rbx, rdx must be saved
  148. * op3 & ip0 mustn't be clobbered
  149. */
  150. movq %rbx, 0(%rsp)
  151. movq %rdx, 8(%rsp)
  152. movq 32(%rsp), %rax /* rax = oend */
  153. subq %op3, %rax /* rax = oend - op3 */
  154. /* r15 = (oend - op3) / 5 */
  155. movabsq $-3689348814741910323, %rdx
  156. mulq %rdx
  157. movq %rdx, %r15
  158. shrq $2, %r15
  159. movq %ip0, %rax /* rax = ip0 */
  160. movq 40(%rsp), %rdx /* rdx = ilowest */
  161. subq %rdx, %rax /* rax = ip0 - ilowest */
  162. movq %rax, %rbx /* rbx = ip0 - ilowest */
  163. /* rdx = (ip0 - ilowest) / 7 */
  164. movabsq $2635249153387078803, %rdx
  165. mulq %rdx
  166. subq %rdx, %rbx
  167. shrq %rbx
  168. addq %rbx, %rdx
  169. shrq $2, %rdx
  170. /* r15 = min(%rdx, %r15) */
  171. cmpq %rdx, %r15
  172. cmova %rdx, %r15
  173. /* r15 = r15 * 5 */
  174. leaq (%r15, %r15, 4), %r15
  175. /* olimit = op3 + r15 */
  176. addq %op3, %olimit
  177. movq 8(%rsp), %rdx
  178. movq 0(%rsp), %rbx
  179. /* If (op3 + 20 > olimit) */
  180. movq %op3, %rax /* rax = op3 */
  181. cmpq %rax, %olimit /* op3 == olimit */
  182. je .L_4X1_exit
  183. /* If (ip1 < ip0) go to exit */
  184. cmpq %ip0, %ip1
  185. jb .L_4X1_exit
  186. /* If (ip2 < ip1) go to exit */
  187. cmpq %ip1, %ip2
  188. jb .L_4X1_exit
  189. /* If (ip3 < ip2) go to exit */
  190. cmpq %ip2, %ip3
  191. jb .L_4X1_exit
  192. /* Reads top 11 bits from bits[n]
  193. * Loads dt[bits[n]] into var[n]
  194. */
  195. #define GET_NEXT_DELT(n) \
  196. movq $53, %var##n; \
  197. shrxq %var##n, %bits##n, %var##n; \
  198. movzwl (%dtable,%var##n,2),%vard##n
  199. /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
  200. * Moves var[n] to %rax
  201. * bits[n] <<= var[n] & 63
  202. * op[n][idx] = %rax >> 8
  203. * %ah is a way to access bits [8, 16) of %rax
  204. */
  205. #define DECODE_FROM_DELT(n, idx) \
  206. movq %var##n, %rax; \
  207. shlxq %var##n, %bits##n, %bits##n; \
  208. movb %ah, idx(%op##n)
  209. /* Assumes GET_NEXT_DELT has been called.
  210. * Calls DECODE_FROM_DELT then GET_NEXT_DELT
  211. */
  212. #define DECODE_AND_GET_NEXT(n, idx) \
  213. DECODE_FROM_DELT(n, idx); \
  214. GET_NEXT_DELT(n) \
  215. /* // ctz & nbBytes is stored in bits[n]
  216. * // nbBits is stored in %rax
  217. * ctz = CTZ[bits[n]]
  218. * nbBits = ctz & 7
  219. * nbBytes = ctz >> 3
  220. * op[n] += 5
  221. * ip[n] -= nbBytes
  222. * // Note: x86-64 is little-endian ==> no bswap
  223. * bits[n] = MEM_readST(ip[n]) | 1
  224. * bits[n] <<= nbBits
  225. */
  226. #define RELOAD_BITS(n) \
  227. bsfq %bits##n, %bits##n; \
  228. movq %bits##n, %rax; \
  229. andq $7, %rax; \
  230. shrq $3, %bits##n; \
  231. leaq 5(%op##n), %op##n; \
  232. subq %bits##n, %ip##n; \
  233. movq (%ip##n), %bits##n; \
  234. orq $1, %bits##n; \
  235. shlx %rax, %bits##n, %bits##n
  236. /* Store clobbered variables on the stack */
  237. movq %olimit, 24(%rsp)
  238. movq %ip1, 0(%rsp)
  239. movq %ip2, 8(%rsp)
  240. movq %ip3, 16(%rsp)
  241. /* Call GET_NEXT_DELT for each stream */
  242. FOR_EACH_STREAM(GET_NEXT_DELT)
  243. .p2align 6
  244. .L_4X1_loop_body:
  245. /* Decode 5 symbols in each of the 4 streams (20 total)
  246. * Must have called GET_NEXT_DELT for each stream
  247. */
  248. FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
  249. FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
  250. FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
  251. FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
  252. FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
  253. /* Load ip[1,2,3] from stack (var[] aliases them)
  254. * ip[] is needed for RELOAD_BITS
  255. * Each will be stored back to the stack after RELOAD
  256. */
  257. movq 0(%rsp), %ip1
  258. movq 8(%rsp), %ip2
  259. movq 16(%rsp), %ip3
  260. /* Reload each stream & fetch the next table entry
  261. * to prepare for the next iteration
  262. */
  263. RELOAD_BITS(0)
  264. GET_NEXT_DELT(0)
  265. RELOAD_BITS(1)
  266. movq %ip1, 0(%rsp)
  267. GET_NEXT_DELT(1)
  268. RELOAD_BITS(2)
  269. movq %ip2, 8(%rsp)
  270. GET_NEXT_DELT(2)
  271. RELOAD_BITS(3)
  272. movq %ip3, 16(%rsp)
  273. GET_NEXT_DELT(3)
  274. /* If op3 < olimit: continue the loop */
  275. cmp %op3, 24(%rsp)
  276. ja .L_4X1_loop_body
  277. /* Reload ip[1,2,3] from stack */
  278. movq 0(%rsp), %ip1
  279. movq 8(%rsp), %ip2
  280. movq 16(%rsp), %ip3
  281. /* Re-compute olimit */
  282. jmp .L_4X1_compute_olimit
  283. #undef GET_NEXT_DELT
  284. #undef DECODE_FROM_DELT
  285. #undef DECODE
  286. #undef RELOAD_BITS
  287. .L_4X1_exit:
  288. addq $24, %rsp
  289. /* Restore stack (oend & olimit) */
  290. pop %rax /* olimit */
  291. pop %rax /* oend */
  292. pop %rax /* ilowest */
  293. pop %rax /* arg */
  294. /* Save ip / op / bits */
  295. movq %ip0, 0(%rax)
  296. movq %ip1, 8(%rax)
  297. movq %ip2, 16(%rax)
  298. movq %ip3, 24(%rax)
  299. movq %op0, 32(%rax)
  300. movq %op1, 40(%rax)
  301. movq %op2, 48(%rax)
  302. movq %op3, 56(%rax)
  303. movq %bits0, 64(%rax)
  304. movq %bits1, 72(%rax)
  305. movq %bits2, 80(%rax)
  306. movq %bits3, 88(%rax)
  307. /* Restore registers */
  308. pop %r15
  309. pop %r14
  310. pop %r13
  311. pop %r12
  312. pop %r11
  313. pop %r10
  314. pop %r9
  315. pop %r8
  316. pop %rdi
  317. pop %rsi
  318. pop %rbp
  319. pop %rdx
  320. pop %rcx
  321. pop %rbx
  322. pop %rax
  323. ret
  324. _HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
  325. HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
  326. ZSTD_CET_ENDBRANCH
  327. /* Save all registers - even if they are callee saved for simplicity. */
  328. push %rax
  329. push %rbx
  330. push %rcx
  331. push %rdx
  332. push %rbp
  333. push %rsi
  334. push %rdi
  335. push %r8
  336. push %r9
  337. push %r10
  338. push %r11
  339. push %r12
  340. push %r13
  341. push %r14
  342. push %r15
  343. /* Read HUF_DecompressAsmArgs* args from %rax */
  344. #if defined(_WIN32)
  345. movq %rcx, %rax
  346. #else
  347. movq %rdi, %rax
  348. #endif
  349. movq 0(%rax), %ip0
  350. movq 8(%rax), %ip1
  351. movq 16(%rax), %ip2
  352. movq 24(%rax), %ip3
  353. movq 32(%rax), %op0
  354. movq 40(%rax), %op1
  355. movq 48(%rax), %op2
  356. movq 56(%rax), %op3
  357. movq 64(%rax), %bits0
  358. movq 72(%rax), %bits1
  359. movq 80(%rax), %bits2
  360. movq 88(%rax), %bits3
  361. movq 96(%rax), %dtable
  362. push %rax /* argument */
  363. push %rax /* olimit */
  364. push 104(%rax) /* ilowest */
  365. movq 112(%rax), %rax
  366. push %rax /* oend3 */
  367. movq %op3, %rax
  368. push %rax /* oend2 */
  369. movq %op2, %rax
  370. push %rax /* oend1 */
  371. movq %op1, %rax
  372. push %rax /* oend0 */
  373. /* Scratch space */
  374. subq $8, %rsp
  375. .L_4X2_compute_olimit:
  376. /* Computes how many iterations we can do safely
  377. * %r15, %rax may be clobbered
  378. * rdx must be saved
  379. * op[1,2,3,4] & ip0 mustn't be clobbered
  380. */
  381. movq %rdx, 0(%rsp)
  382. /* We can consume up to 7 input bytes each iteration. */
  383. movq %ip0, %rax /* rax = ip0 */
  384. movq 40(%rsp), %rdx /* rdx = ilowest */
  385. subq %rdx, %rax /* rax = ip0 - ilowest */
  386. movq %rax, %r15 /* r15 = ip0 - ilowest */
  387. /* rdx = rax / 7 */
  388. movabsq $2635249153387078803, %rdx
  389. mulq %rdx
  390. subq %rdx, %r15
  391. shrq %r15
  392. addq %r15, %rdx
  393. shrq $2, %rdx
  394. /* r15 = (ip0 - ilowest) / 7 */
  395. movq %rdx, %r15
  396. /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
  397. movq 8(%rsp), %rax /* rax = oend0 */
  398. subq %op0, %rax /* rax = oend0 - op0 */
  399. movq 16(%rsp), %rdx /* rdx = oend1 */
  400. subq %op1, %rdx /* rdx = oend1 - op1 */
  401. cmpq %rax, %rdx
  402. cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
  403. movq 24(%rsp), %rax /* rax = oend2 */
  404. subq %op2, %rax /* rax = oend2 - op2 */
  405. cmpq %rax, %rdx
  406. cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
  407. movq 32(%rsp), %rax /* rax = oend3 */
  408. subq %op3, %rax /* rax = oend3 - op3 */
  409. cmpq %rax, %rdx
  410. cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
  411. movabsq $-3689348814741910323, %rax
  412. mulq %rdx
  413. shrq $3, %rdx /* rdx = rdx / 10 */
  414. /* r15 = min(%rdx, %r15) */
  415. cmpq %rdx, %r15
  416. cmova %rdx, %r15
  417. /* olimit = op3 + 5 * r15 */
  418. movq %r15, %rax
  419. leaq (%op3, %rax, 4), %olimit
  420. addq %rax, %olimit
  421. movq 0(%rsp), %rdx
  422. /* If (op3 + 10 > olimit) */
  423. movq %op3, %rax /* rax = op3 */
  424. cmpq %rax, %olimit /* op3 == olimit */
  425. je .L_4X2_exit
  426. /* If (ip1 < ip0) go to exit */
  427. cmpq %ip0, %ip1
  428. jb .L_4X2_exit
  429. /* If (ip2 < ip1) go to exit */
  430. cmpq %ip1, %ip2
  431. jb .L_4X2_exit
  432. /* If (ip3 < ip2) go to exit */
  433. cmpq %ip2, %ip3
  434. jb .L_4X2_exit
  435. #define DECODE(n, idx) \
  436. movq %bits##n, %rax; \
  437. shrq $53, %rax; \
  438. movzwl 0(%dtable,%rax,4),%r8d; \
  439. movzbl 2(%dtable,%rax,4),%r15d; \
  440. movzbl 3(%dtable,%rax,4),%eax; \
  441. movw %r8w, (%op##n); \
  442. shlxq %r15, %bits##n, %bits##n; \
  443. addq %rax, %op##n
  444. #define RELOAD_BITS(n) \
  445. bsfq %bits##n, %bits##n; \
  446. movq %bits##n, %rax; \
  447. shrq $3, %bits##n; \
  448. andq $7, %rax; \
  449. subq %bits##n, %ip##n; \
  450. movq (%ip##n), %bits##n; \
  451. orq $1, %bits##n; \
  452. shlxq %rax, %bits##n, %bits##n
  453. movq %olimit, 48(%rsp)
  454. .p2align 6
  455. .L_4X2_loop_body:
  456. /* We clobber r8, so store it on the stack */
  457. movq %r8, 0(%rsp)
  458. /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
  459. FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
  460. FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
  461. FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
  462. FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
  463. FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
  464. /* Reload r8 */
  465. movq 0(%rsp), %r8
  466. FOR_EACH_STREAM(RELOAD_BITS)
  467. cmp %op3, 48(%rsp)
  468. ja .L_4X2_loop_body
  469. jmp .L_4X2_compute_olimit
  470. #undef DECODE
  471. #undef RELOAD_BITS
  472. .L_4X2_exit:
  473. addq $8, %rsp
  474. /* Restore stack (oend & olimit) */
  475. pop %rax /* oend0 */
  476. pop %rax /* oend1 */
  477. pop %rax /* oend2 */
  478. pop %rax /* oend3 */
  479. pop %rax /* ilowest */
  480. pop %rax /* olimit */
  481. pop %rax /* arg */
  482. /* Save ip / op / bits */
  483. movq %ip0, 0(%rax)
  484. movq %ip1, 8(%rax)
  485. movq %ip2, 16(%rax)
  486. movq %ip3, 24(%rax)
  487. movq %op0, 32(%rax)
  488. movq %op1, 40(%rax)
  489. movq %op2, 48(%rax)
  490. movq %op3, 56(%rax)
  491. movq %bits0, 64(%rax)
  492. movq %bits1, 72(%rax)
  493. movq %bits2, 80(%rax)
  494. movq %bits3, 88(%rax)
  495. /* Restore registers */
  496. pop %r15
  497. pop %r14
  498. pop %r13
  499. pop %r12
  500. pop %r11
  501. pop %r10
  502. pop %r9
  503. pop %r8
  504. pop %rdi
  505. pop %rsi
  506. pop %rbp
  507. pop %rdx
  508. pop %rcx
  509. pop %rbx
  510. pop %rax
  511. ret
  512. #endif