huf_decompress_amd64.S 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595
  1. /*
  2. * Copyright (c) Meta Platforms, Inc. and affiliates.
  3. * All rights reserved.
  4. *
  5. * This source code is licensed under both the BSD-style license (found in the
  6. * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  7. * in the COPYING file in the root directory of this source tree).
  8. * You may select, at your option, one of the above-listed licenses.
  9. */
  10. #include "../common/portability_macros.h"
  11. #if defined(__ELF__) && defined(__GNUC__)
  12. /* Stack marking
  13. * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
  14. */
  15. .section .note.GNU-stack,"",%progbits
  16. #if defined(__aarch64__)
  17. /* Mark that this assembly supports BTI & PAC, because it is empty for aarch64.
  18. * See: https://github.com/facebook/zstd/issues/3841
  19. * See: https://gcc.godbolt.org/z/sqr5T4ffK
  20. * See: https://lore.kernel.org/linux-arm-kernel/20200429211641.9279-8-broonie@kernel.org/
  21. * See: https://reviews.llvm.org/D62609
  22. */
  23. .pushsection .note.gnu.property, "a"
  24. .p2align 3
  25. .long 4 /* size of the name - "GNU\0" */
  26. .long 0x10 /* size of descriptor */
  27. .long 0x5 /* NT_GNU_PROPERTY_TYPE_0 */
  28. .asciz "GNU"
  29. .long 0xc0000000 /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */
  30. .long 4 /* pr_datasz - 4 bytes */
  31. .long 3 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */
  32. .p2align 3 /* pr_padding - bring everything to 8 byte alignment */
  33. .popsection
  34. #endif
  35. #endif
  36. #if ZSTD_ENABLE_ASM_X86_64_BMI2
  37. /* Calling convention:
  38. *
  39. * %rdi contains the first argument: HUF_DecompressAsmArgs*.
  40. * %rbp isn't maintained (no frame pointer).
  41. * %rsp contains the stack pointer that grows down.
  42. * No red-zone is assumed, only addresses >= %rsp are used.
  43. * All register contents are preserved.
  44. *
  45. * TODO: Support Windows calling convention.
  46. */
  47. ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
  48. ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
  49. ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
  50. ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
  51. .global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
  52. .global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
  53. .global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
  54. .global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
  55. .text
  56. /* Sets up register mappings for clarity.
  57. * op[], bits[], dtable & ip[0] each get their own register.
  58. * ip[1,2,3] & olimit alias var[].
  59. * %rax is a scratch register.
  60. */
  61. #define op0 rsi
  62. #define op1 rbx
  63. #define op2 rcx
  64. #define op3 rdi
  65. #define ip0 r8
  66. #define ip1 r9
  67. #define ip2 r10
  68. #define ip3 r11
  69. #define bits0 rbp
  70. #define bits1 rdx
  71. #define bits2 r12
  72. #define bits3 r13
  73. #define dtable r14
  74. #define olimit r15
  75. /* var[] aliases ip[1,2,3] & olimit
  76. * ip[1,2,3] are saved every iteration.
  77. * olimit is only used in compute_olimit.
  78. */
  79. #define var0 r15
  80. #define var1 r9
  81. #define var2 r10
  82. #define var3 r11
  83. /* 32-bit var registers */
  84. #define vard0 r15d
  85. #define vard1 r9d
  86. #define vard2 r10d
  87. #define vard3 r11d
  88. /* Calls X(N) for each stream 0, 1, 2, 3. */
  89. #define FOR_EACH_STREAM(X) \
  90. X(0); \
  91. X(1); \
  92. X(2); \
  93. X(3)
  94. /* Calls X(N, idx) for each stream 0, 1, 2, 3. */
  95. #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
  96. X(0, idx); \
  97. X(1, idx); \
  98. X(2, idx); \
  99. X(3, idx)
  100. /* Define both _HUF_* & HUF_* symbols because MacOS
  101. * C symbols are prefixed with '_' & Linux symbols aren't.
  102. */
  103. _HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
  104. HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
  105. ZSTD_CET_ENDBRANCH
  106. /* Save all registers - even if they are callee saved for simplicity. */
  107. push %rax
  108. push %rbx
  109. push %rcx
  110. push %rdx
  111. push %rbp
  112. push %rsi
  113. push %rdi
  114. push %r8
  115. push %r9
  116. push %r10
  117. push %r11
  118. push %r12
  119. push %r13
  120. push %r14
  121. push %r15
  122. /* Read HUF_DecompressAsmArgs* args from %rax */
  123. movq %rdi, %rax
  124. movq 0(%rax), %ip0
  125. movq 8(%rax), %ip1
  126. movq 16(%rax), %ip2
  127. movq 24(%rax), %ip3
  128. movq 32(%rax), %op0
  129. movq 40(%rax), %op1
  130. movq 48(%rax), %op2
  131. movq 56(%rax), %op3
  132. movq 64(%rax), %bits0
  133. movq 72(%rax), %bits1
  134. movq 80(%rax), %bits2
  135. movq 88(%rax), %bits3
  136. movq 96(%rax), %dtable
  137. push %rax /* argument */
  138. push 104(%rax) /* ilowest */
  139. push 112(%rax) /* oend */
  140. push %olimit /* olimit space */
  141. subq $24, %rsp
  142. .L_4X1_compute_olimit:
  143. /* Computes how many iterations we can do safely
  144. * %r15, %rax may be clobbered
  145. * rbx, rdx must be saved
  146. * op3 & ip0 mustn't be clobbered
  147. */
  148. movq %rbx, 0(%rsp)
  149. movq %rdx, 8(%rsp)
  150. movq 32(%rsp), %rax /* rax = oend */
  151. subq %op3, %rax /* rax = oend - op3 */
  152. /* r15 = (oend - op3) / 5 */
  153. movabsq $-3689348814741910323, %rdx
  154. mulq %rdx
  155. movq %rdx, %r15
  156. shrq $2, %r15
  157. movq %ip0, %rax /* rax = ip0 */
  158. movq 40(%rsp), %rdx /* rdx = ilowest */
  159. subq %rdx, %rax /* rax = ip0 - ilowest */
  160. movq %rax, %rbx /* rbx = ip0 - ilowest */
  161. /* rdx = (ip0 - ilowest) / 7 */
  162. movabsq $2635249153387078803, %rdx
  163. mulq %rdx
  164. subq %rdx, %rbx
  165. shrq %rbx
  166. addq %rbx, %rdx
  167. shrq $2, %rdx
  168. /* r15 = min(%rdx, %r15) */
  169. cmpq %rdx, %r15
  170. cmova %rdx, %r15
  171. /* r15 = r15 * 5 */
  172. leaq (%r15, %r15, 4), %r15
  173. /* olimit = op3 + r15 */
  174. addq %op3, %olimit
  175. movq 8(%rsp), %rdx
  176. movq 0(%rsp), %rbx
  177. /* If (op3 + 20 > olimit) */
  178. movq %op3, %rax /* rax = op3 */
  179. cmpq %rax, %olimit /* op3 == olimit */
  180. je .L_4X1_exit
  181. /* If (ip1 < ip0) go to exit */
  182. cmpq %ip0, %ip1
  183. jb .L_4X1_exit
  184. /* If (ip2 < ip1) go to exit */
  185. cmpq %ip1, %ip2
  186. jb .L_4X1_exit
  187. /* If (ip3 < ip2) go to exit */
  188. cmpq %ip2, %ip3
  189. jb .L_4X1_exit
  190. /* Reads top 11 bits from bits[n]
  191. * Loads dt[bits[n]] into var[n]
  192. */
  193. #define GET_NEXT_DELT(n) \
  194. movq $53, %var##n; \
  195. shrxq %var##n, %bits##n, %var##n; \
  196. movzwl (%dtable,%var##n,2),%vard##n
  197. /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
  198. * Moves var[n] to %rax
  199. * bits[n] <<= var[n] & 63
  200. * op[n][idx] = %rax >> 8
  201. * %ah is a way to access bits [8, 16) of %rax
  202. */
  203. #define DECODE_FROM_DELT(n, idx) \
  204. movq %var##n, %rax; \
  205. shlxq %var##n, %bits##n, %bits##n; \
  206. movb %ah, idx(%op##n)
  207. /* Assumes GET_NEXT_DELT has been called.
  208. * Calls DECODE_FROM_DELT then GET_NEXT_DELT
  209. */
  210. #define DECODE_AND_GET_NEXT(n, idx) \
  211. DECODE_FROM_DELT(n, idx); \
  212. GET_NEXT_DELT(n) \
  213. /* // ctz & nbBytes is stored in bits[n]
  214. * // nbBits is stored in %rax
  215. * ctz = CTZ[bits[n]]
  216. * nbBits = ctz & 7
  217. * nbBytes = ctz >> 3
  218. * op[n] += 5
  219. * ip[n] -= nbBytes
  220. * // Note: x86-64 is little-endian ==> no bswap
  221. * bits[n] = MEM_readST(ip[n]) | 1
  222. * bits[n] <<= nbBits
  223. */
  224. #define RELOAD_BITS(n) \
  225. bsfq %bits##n, %bits##n; \
  226. movq %bits##n, %rax; \
  227. andq $7, %rax; \
  228. shrq $3, %bits##n; \
  229. leaq 5(%op##n), %op##n; \
  230. subq %bits##n, %ip##n; \
  231. movq (%ip##n), %bits##n; \
  232. orq $1, %bits##n; \
  233. shlx %rax, %bits##n, %bits##n
  234. /* Store clobbered variables on the stack */
  235. movq %olimit, 24(%rsp)
  236. movq %ip1, 0(%rsp)
  237. movq %ip2, 8(%rsp)
  238. movq %ip3, 16(%rsp)
  239. /* Call GET_NEXT_DELT for each stream */
  240. FOR_EACH_STREAM(GET_NEXT_DELT)
  241. .p2align 6
  242. .L_4X1_loop_body:
  243. /* Decode 5 symbols in each of the 4 streams (20 total)
  244. * Must have called GET_NEXT_DELT for each stream
  245. */
  246. FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
  247. FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
  248. FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
  249. FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
  250. FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
  251. /* Load ip[1,2,3] from stack (var[] aliases them)
  252. * ip[] is needed for RELOAD_BITS
  253. * Each will be stored back to the stack after RELOAD
  254. */
  255. movq 0(%rsp), %ip1
  256. movq 8(%rsp), %ip2
  257. movq 16(%rsp), %ip3
  258. /* Reload each stream & fetch the next table entry
  259. * to prepare for the next iteration
  260. */
  261. RELOAD_BITS(0)
  262. GET_NEXT_DELT(0)
  263. RELOAD_BITS(1)
  264. movq %ip1, 0(%rsp)
  265. GET_NEXT_DELT(1)
  266. RELOAD_BITS(2)
  267. movq %ip2, 8(%rsp)
  268. GET_NEXT_DELT(2)
  269. RELOAD_BITS(3)
  270. movq %ip3, 16(%rsp)
  271. GET_NEXT_DELT(3)
  272. /* If op3 < olimit: continue the loop */
  273. cmp %op3, 24(%rsp)
  274. ja .L_4X1_loop_body
  275. /* Reload ip[1,2,3] from stack */
  276. movq 0(%rsp), %ip1
  277. movq 8(%rsp), %ip2
  278. movq 16(%rsp), %ip3
  279. /* Re-compute olimit */
  280. jmp .L_4X1_compute_olimit
  281. #undef GET_NEXT_DELT
  282. #undef DECODE_FROM_DELT
  283. #undef DECODE
  284. #undef RELOAD_BITS
  285. .L_4X1_exit:
  286. addq $24, %rsp
  287. /* Restore stack (oend & olimit) */
  288. pop %rax /* olimit */
  289. pop %rax /* oend */
  290. pop %rax /* ilowest */
  291. pop %rax /* arg */
  292. /* Save ip / op / bits */
  293. movq %ip0, 0(%rax)
  294. movq %ip1, 8(%rax)
  295. movq %ip2, 16(%rax)
  296. movq %ip3, 24(%rax)
  297. movq %op0, 32(%rax)
  298. movq %op1, 40(%rax)
  299. movq %op2, 48(%rax)
  300. movq %op3, 56(%rax)
  301. movq %bits0, 64(%rax)
  302. movq %bits1, 72(%rax)
  303. movq %bits2, 80(%rax)
  304. movq %bits3, 88(%rax)
  305. /* Restore registers */
  306. pop %r15
  307. pop %r14
  308. pop %r13
  309. pop %r12
  310. pop %r11
  311. pop %r10
  312. pop %r9
  313. pop %r8
  314. pop %rdi
  315. pop %rsi
  316. pop %rbp
  317. pop %rdx
  318. pop %rcx
  319. pop %rbx
  320. pop %rax
  321. ret
  322. _HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
  323. HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
  324. ZSTD_CET_ENDBRANCH
  325. /* Save all registers - even if they are callee saved for simplicity. */
  326. push %rax
  327. push %rbx
  328. push %rcx
  329. push %rdx
  330. push %rbp
  331. push %rsi
  332. push %rdi
  333. push %r8
  334. push %r9
  335. push %r10
  336. push %r11
  337. push %r12
  338. push %r13
  339. push %r14
  340. push %r15
  341. movq %rdi, %rax
  342. movq 0(%rax), %ip0
  343. movq 8(%rax), %ip1
  344. movq 16(%rax), %ip2
  345. movq 24(%rax), %ip3
  346. movq 32(%rax), %op0
  347. movq 40(%rax), %op1
  348. movq 48(%rax), %op2
  349. movq 56(%rax), %op3
  350. movq 64(%rax), %bits0
  351. movq 72(%rax), %bits1
  352. movq 80(%rax), %bits2
  353. movq 88(%rax), %bits3
  354. movq 96(%rax), %dtable
  355. push %rax /* argument */
  356. push %rax /* olimit */
  357. push 104(%rax) /* ilowest */
  358. movq 112(%rax), %rax
  359. push %rax /* oend3 */
  360. movq %op3, %rax
  361. push %rax /* oend2 */
  362. movq %op2, %rax
  363. push %rax /* oend1 */
  364. movq %op1, %rax
  365. push %rax /* oend0 */
  366. /* Scratch space */
  367. subq $8, %rsp
  368. .L_4X2_compute_olimit:
  369. /* Computes how many iterations we can do safely
  370. * %r15, %rax may be clobbered
  371. * rdx must be saved
  372. * op[1,2,3,4] & ip0 mustn't be clobbered
  373. */
  374. movq %rdx, 0(%rsp)
  375. /* We can consume up to 7 input bytes each iteration. */
  376. movq %ip0, %rax /* rax = ip0 */
  377. movq 40(%rsp), %rdx /* rdx = ilowest */
  378. subq %rdx, %rax /* rax = ip0 - ilowest */
  379. movq %rax, %r15 /* r15 = ip0 - ilowest */
  380. /* rdx = rax / 7 */
  381. movabsq $2635249153387078803, %rdx
  382. mulq %rdx
  383. subq %rdx, %r15
  384. shrq %r15
  385. addq %r15, %rdx
  386. shrq $2, %rdx
  387. /* r15 = (ip0 - ilowest) / 7 */
  388. movq %rdx, %r15
  389. /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
  390. movq 8(%rsp), %rax /* rax = oend0 */
  391. subq %op0, %rax /* rax = oend0 - op0 */
  392. movq 16(%rsp), %rdx /* rdx = oend1 */
  393. subq %op1, %rdx /* rdx = oend1 - op1 */
  394. cmpq %rax, %rdx
  395. cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
  396. movq 24(%rsp), %rax /* rax = oend2 */
  397. subq %op2, %rax /* rax = oend2 - op2 */
  398. cmpq %rax, %rdx
  399. cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
  400. movq 32(%rsp), %rax /* rax = oend3 */
  401. subq %op3, %rax /* rax = oend3 - op3 */
  402. cmpq %rax, %rdx
  403. cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
  404. movabsq $-3689348814741910323, %rax
  405. mulq %rdx
  406. shrq $3, %rdx /* rdx = rdx / 10 */
  407. /* r15 = min(%rdx, %r15) */
  408. cmpq %rdx, %r15
  409. cmova %rdx, %r15
  410. /* olimit = op3 + 5 * r15 */
  411. movq %r15, %rax
  412. leaq (%op3, %rax, 4), %olimit
  413. addq %rax, %olimit
  414. movq 0(%rsp), %rdx
  415. /* If (op3 + 10 > olimit) */
  416. movq %op3, %rax /* rax = op3 */
  417. cmpq %rax, %olimit /* op3 == olimit */
  418. je .L_4X2_exit
  419. /* If (ip1 < ip0) go to exit */
  420. cmpq %ip0, %ip1
  421. jb .L_4X2_exit
  422. /* If (ip2 < ip1) go to exit */
  423. cmpq %ip1, %ip2
  424. jb .L_4X2_exit
  425. /* If (ip3 < ip2) go to exit */
  426. cmpq %ip2, %ip3
  427. jb .L_4X2_exit
  428. #define DECODE(n, idx) \
  429. movq %bits##n, %rax; \
  430. shrq $53, %rax; \
  431. movzwl 0(%dtable,%rax,4),%r8d; \
  432. movzbl 2(%dtable,%rax,4),%r15d; \
  433. movzbl 3(%dtable,%rax,4),%eax; \
  434. movw %r8w, (%op##n); \
  435. shlxq %r15, %bits##n, %bits##n; \
  436. addq %rax, %op##n
  437. #define RELOAD_BITS(n) \
  438. bsfq %bits##n, %bits##n; \
  439. movq %bits##n, %rax; \
  440. shrq $3, %bits##n; \
  441. andq $7, %rax; \
  442. subq %bits##n, %ip##n; \
  443. movq (%ip##n), %bits##n; \
  444. orq $1, %bits##n; \
  445. shlxq %rax, %bits##n, %bits##n
  446. movq %olimit, 48(%rsp)
  447. .p2align 6
  448. .L_4X2_loop_body:
  449. /* We clobber r8, so store it on the stack */
  450. movq %r8, 0(%rsp)
  451. /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
  452. FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
  453. FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
  454. FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
  455. FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
  456. FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
  457. /* Reload r8 */
  458. movq 0(%rsp), %r8
  459. FOR_EACH_STREAM(RELOAD_BITS)
  460. cmp %op3, 48(%rsp)
  461. ja .L_4X2_loop_body
  462. jmp .L_4X2_compute_olimit
  463. #undef DECODE
  464. #undef RELOAD_BITS
  465. .L_4X2_exit:
  466. addq $8, %rsp
  467. /* Restore stack (oend & olimit) */
  468. pop %rax /* oend0 */
  469. pop %rax /* oend1 */
  470. pop %rax /* oend2 */
  471. pop %rax /* oend3 */
  472. pop %rax /* ilowest */
  473. pop %rax /* olimit */
  474. pop %rax /* arg */
  475. /* Save ip / op / bits */
  476. movq %ip0, 0(%rax)
  477. movq %ip1, 8(%rax)
  478. movq %ip2, 16(%rax)
  479. movq %ip3, 24(%rax)
  480. movq %op0, 32(%rax)
  481. movq %op1, 40(%rax)
  482. movq %op2, 48(%rax)
  483. movq %op3, 56(%rax)
  484. movq %bits0, 64(%rax)
  485. movq %bits1, 72(%rax)
  486. movq %bits2, 80(%rax)
  487. movq %bits3, 88(%rax)
  488. /* Restore registers */
  489. pop %r15
  490. pop %r14
  491. pop %r13
  492. pop %r12
  493. pop %r11
  494. pop %r10
  495. pop %r9
  496. pop %r8
  497. pop %rdi
  498. pop %rsi
  499. pop %rbp
  500. pop %rdx
  501. pop %rcx
  502. pop %rbx
  503. pop %rax
  504. ret
  505. #endif