huf_decompress_amd64.S 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576
  1. /*
  2. * Copyright (c) Meta Platforms, Inc. and affiliates.
  3. * All rights reserved.
  4. *
  5. * This source code is licensed under both the BSD-style license (found in the
  6. * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  7. * in the COPYING file in the root directory of this source tree).
  8. * You may select, at your option, one of the above-listed licenses.
  9. */
  10. #include "../common/portability_macros.h"
  11. /* Stack marking
  12. * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
  13. */
  14. #if defined(__ELF__) && defined(__GNUC__)
  15. .section .note.GNU-stack,"",%progbits
  16. #endif
  17. #if ZSTD_ENABLE_ASM_X86_64_BMI2
  18. /* Calling convention:
  19. *
  20. * %rdi contains the first argument: HUF_DecompressAsmArgs*.
  21. * %rbp isn't maintained (no frame pointer).
  22. * %rsp contains the stack pointer that grows down.
  23. * No red-zone is assumed, only addresses >= %rsp are used.
  24. * All register contents are preserved.
  25. *
  26. * TODO: Support Windows calling convention.
  27. */
  28. ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
  29. ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
  30. ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
  31. ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
  32. .global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
  33. .global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
  34. .global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
  35. .global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
  36. .text
  37. /* Sets up register mappings for clarity.
  38. * op[], bits[], dtable & ip[0] each get their own register.
  39. * ip[1,2,3] & olimit alias var[].
  40. * %rax is a scratch register.
  41. */
  42. #define op0 rsi
  43. #define op1 rbx
  44. #define op2 rcx
  45. #define op3 rdi
  46. #define ip0 r8
  47. #define ip1 r9
  48. #define ip2 r10
  49. #define ip3 r11
  50. #define bits0 rbp
  51. #define bits1 rdx
  52. #define bits2 r12
  53. #define bits3 r13
  54. #define dtable r14
  55. #define olimit r15
  56. /* var[] aliases ip[1,2,3] & olimit
  57. * ip[1,2,3] are saved every iteration.
  58. * olimit is only used in compute_olimit.
  59. */
  60. #define var0 r15
  61. #define var1 r9
  62. #define var2 r10
  63. #define var3 r11
  64. /* 32-bit var registers */
  65. #define vard0 r15d
  66. #define vard1 r9d
  67. #define vard2 r10d
  68. #define vard3 r11d
  69. /* Calls X(N) for each stream 0, 1, 2, 3. */
  70. #define FOR_EACH_STREAM(X) \
  71. X(0); \
  72. X(1); \
  73. X(2); \
  74. X(3)
  75. /* Calls X(N, idx) for each stream 0, 1, 2, 3. */
  76. #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
  77. X(0, idx); \
  78. X(1, idx); \
  79. X(2, idx); \
  80. X(3, idx)
  81. /* Define both _HUF_* & HUF_* symbols because MacOS
  82. * C symbols are prefixed with '_' & Linux symbols aren't.
  83. */
  84. _HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
  85. HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
  86. ZSTD_CET_ENDBRANCH
  87. /* Save all registers - even if they are callee saved for simplicity. */
  88. push %rax
  89. push %rbx
  90. push %rcx
  91. push %rdx
  92. push %rbp
  93. push %rsi
  94. push %rdi
  95. push %r8
  96. push %r9
  97. push %r10
  98. push %r11
  99. push %r12
  100. push %r13
  101. push %r14
  102. push %r15
  103. /* Read HUF_DecompressAsmArgs* args from %rax */
  104. movq %rdi, %rax
  105. movq 0(%rax), %ip0
  106. movq 8(%rax), %ip1
  107. movq 16(%rax), %ip2
  108. movq 24(%rax), %ip3
  109. movq 32(%rax), %op0
  110. movq 40(%rax), %op1
  111. movq 48(%rax), %op2
  112. movq 56(%rax), %op3
  113. movq 64(%rax), %bits0
  114. movq 72(%rax), %bits1
  115. movq 80(%rax), %bits2
  116. movq 88(%rax), %bits3
  117. movq 96(%rax), %dtable
  118. push %rax /* argument */
  119. push 104(%rax) /* ilimit */
  120. push 112(%rax) /* oend */
  121. push %olimit /* olimit space */
  122. subq $24, %rsp
  123. .L_4X1_compute_olimit:
  124. /* Computes how many iterations we can do safely
  125. * %r15, %rax may be clobbered
  126. * rbx, rdx must be saved
  127. * op3 & ip0 mustn't be clobbered
  128. */
  129. movq %rbx, 0(%rsp)
  130. movq %rdx, 8(%rsp)
  131. movq 32(%rsp), %rax /* rax = oend */
  132. subq %op3, %rax /* rax = oend - op3 */
  133. /* r15 = (oend - op3) / 5 */
  134. movabsq $-3689348814741910323, %rdx
  135. mulq %rdx
  136. movq %rdx, %r15
  137. shrq $2, %r15
  138. movq %ip0, %rax /* rax = ip0 */
  139. movq 40(%rsp), %rdx /* rdx = ilimit */
  140. subq %rdx, %rax /* rax = ip0 - ilimit */
  141. movq %rax, %rbx /* rbx = ip0 - ilimit */
  142. /* rdx = (ip0 - ilimit) / 7 */
  143. movabsq $2635249153387078803, %rdx
  144. mulq %rdx
  145. subq %rdx, %rbx
  146. shrq %rbx
  147. addq %rbx, %rdx
  148. shrq $2, %rdx
  149. /* r15 = min(%rdx, %r15) */
  150. cmpq %rdx, %r15
  151. cmova %rdx, %r15
  152. /* r15 = r15 * 5 */
  153. leaq (%r15, %r15, 4), %r15
  154. /* olimit = op3 + r15 */
  155. addq %op3, %olimit
  156. movq 8(%rsp), %rdx
  157. movq 0(%rsp), %rbx
  158. /* If (op3 + 20 > olimit) */
  159. movq %op3, %rax /* rax = op3 */
  160. addq $20, %rax /* rax = op3 + 20 */
  161. cmpq %rax, %olimit /* op3 + 20 > olimit */
  162. jb .L_4X1_exit
  163. /* If (ip1 < ip0) go to exit */
  164. cmpq %ip0, %ip1
  165. jb .L_4X1_exit
  166. /* If (ip2 < ip1) go to exit */
  167. cmpq %ip1, %ip2
  168. jb .L_4X1_exit
  169. /* If (ip3 < ip2) go to exit */
  170. cmpq %ip2, %ip3
  171. jb .L_4X1_exit
  172. /* Reads top 11 bits from bits[n]
  173. * Loads dt[bits[n]] into var[n]
  174. */
  175. #define GET_NEXT_DELT(n) \
  176. movq $53, %var##n; \
  177. shrxq %var##n, %bits##n, %var##n; \
  178. movzwl (%dtable,%var##n,2),%vard##n
  179. /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
  180. * Moves var[n] to %rax
  181. * bits[n] <<= var[n] & 63
  182. * op[n][idx] = %rax >> 8
  183. * %ah is a way to access bits [8, 16) of %rax
  184. */
  185. #define DECODE_FROM_DELT(n, idx) \
  186. movq %var##n, %rax; \
  187. shlxq %var##n, %bits##n, %bits##n; \
  188. movb %ah, idx(%op##n)
  189. /* Assumes GET_NEXT_DELT has been called.
  190. * Calls DECODE_FROM_DELT then GET_NEXT_DELT
  191. */
  192. #define DECODE_AND_GET_NEXT(n, idx) \
  193. DECODE_FROM_DELT(n, idx); \
  194. GET_NEXT_DELT(n) \
  195. /* // ctz & nbBytes is stored in bits[n]
  196. * // nbBits is stored in %rax
  197. * ctz = CTZ[bits[n]]
  198. * nbBits = ctz & 7
  199. * nbBytes = ctz >> 3
  200. * op[n] += 5
  201. * ip[n] -= nbBytes
  202. * // Note: x86-64 is little-endian ==> no bswap
  203. * bits[n] = MEM_readST(ip[n]) | 1
  204. * bits[n] <<= nbBits
  205. */
  206. #define RELOAD_BITS(n) \
  207. bsfq %bits##n, %bits##n; \
  208. movq %bits##n, %rax; \
  209. andq $7, %rax; \
  210. shrq $3, %bits##n; \
  211. leaq 5(%op##n), %op##n; \
  212. subq %bits##n, %ip##n; \
  213. movq (%ip##n), %bits##n; \
  214. orq $1, %bits##n; \
  215. shlx %rax, %bits##n, %bits##n
  216. /* Store clobbered variables on the stack */
  217. movq %olimit, 24(%rsp)
  218. movq %ip1, 0(%rsp)
  219. movq %ip2, 8(%rsp)
  220. movq %ip3, 16(%rsp)
  221. /* Call GET_NEXT_DELT for each stream */
  222. FOR_EACH_STREAM(GET_NEXT_DELT)
  223. .p2align 6
  224. .L_4X1_loop_body:
  225. /* Decode 5 symbols in each of the 4 streams (20 total)
  226. * Must have called GET_NEXT_DELT for each stream
  227. */
  228. FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
  229. FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
  230. FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
  231. FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
  232. FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
  233. /* Load ip[1,2,3] from stack (var[] aliases them)
  234. * ip[] is needed for RELOAD_BITS
  235. * Each will be stored back to the stack after RELOAD
  236. */
  237. movq 0(%rsp), %ip1
  238. movq 8(%rsp), %ip2
  239. movq 16(%rsp), %ip3
  240. /* Reload each stream & fetch the next table entry
  241. * to prepare for the next iteration
  242. */
  243. RELOAD_BITS(0)
  244. GET_NEXT_DELT(0)
  245. RELOAD_BITS(1)
  246. movq %ip1, 0(%rsp)
  247. GET_NEXT_DELT(1)
  248. RELOAD_BITS(2)
  249. movq %ip2, 8(%rsp)
  250. GET_NEXT_DELT(2)
  251. RELOAD_BITS(3)
  252. movq %ip3, 16(%rsp)
  253. GET_NEXT_DELT(3)
  254. /* If op3 < olimit: continue the loop */
  255. cmp %op3, 24(%rsp)
  256. ja .L_4X1_loop_body
  257. /* Reload ip[1,2,3] from stack */
  258. movq 0(%rsp), %ip1
  259. movq 8(%rsp), %ip2
  260. movq 16(%rsp), %ip3
  261. /* Re-compute olimit */
  262. jmp .L_4X1_compute_olimit
  263. #undef GET_NEXT_DELT
  264. #undef DECODE_FROM_DELT
  265. #undef DECODE
  266. #undef RELOAD_BITS
  267. .L_4X1_exit:
  268. addq $24, %rsp
  269. /* Restore stack (oend & olimit) */
  270. pop %rax /* olimit */
  271. pop %rax /* oend */
  272. pop %rax /* ilimit */
  273. pop %rax /* arg */
  274. /* Save ip / op / bits */
  275. movq %ip0, 0(%rax)
  276. movq %ip1, 8(%rax)
  277. movq %ip2, 16(%rax)
  278. movq %ip3, 24(%rax)
  279. movq %op0, 32(%rax)
  280. movq %op1, 40(%rax)
  281. movq %op2, 48(%rax)
  282. movq %op3, 56(%rax)
  283. movq %bits0, 64(%rax)
  284. movq %bits1, 72(%rax)
  285. movq %bits2, 80(%rax)
  286. movq %bits3, 88(%rax)
  287. /* Restore registers */
  288. pop %r15
  289. pop %r14
  290. pop %r13
  291. pop %r12
  292. pop %r11
  293. pop %r10
  294. pop %r9
  295. pop %r8
  296. pop %rdi
  297. pop %rsi
  298. pop %rbp
  299. pop %rdx
  300. pop %rcx
  301. pop %rbx
  302. pop %rax
  303. ret
  304. _HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
  305. HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
  306. ZSTD_CET_ENDBRANCH
  307. /* Save all registers - even if they are callee saved for simplicity. */
  308. push %rax
  309. push %rbx
  310. push %rcx
  311. push %rdx
  312. push %rbp
  313. push %rsi
  314. push %rdi
  315. push %r8
  316. push %r9
  317. push %r10
  318. push %r11
  319. push %r12
  320. push %r13
  321. push %r14
  322. push %r15
  323. movq %rdi, %rax
  324. movq 0(%rax), %ip0
  325. movq 8(%rax), %ip1
  326. movq 16(%rax), %ip2
  327. movq 24(%rax), %ip3
  328. movq 32(%rax), %op0
  329. movq 40(%rax), %op1
  330. movq 48(%rax), %op2
  331. movq 56(%rax), %op3
  332. movq 64(%rax), %bits0
  333. movq 72(%rax), %bits1
  334. movq 80(%rax), %bits2
  335. movq 88(%rax), %bits3
  336. movq 96(%rax), %dtable
  337. push %rax /* argument */
  338. push %rax /* olimit */
  339. push 104(%rax) /* ilimit */
  340. movq 112(%rax), %rax
  341. push %rax /* oend3 */
  342. movq %op3, %rax
  343. push %rax /* oend2 */
  344. movq %op2, %rax
  345. push %rax /* oend1 */
  346. movq %op1, %rax
  347. push %rax /* oend0 */
  348. /* Scratch space */
  349. subq $8, %rsp
  350. .L_4X2_compute_olimit:
  351. /* Computes how many iterations we can do safely
  352. * %r15, %rax may be clobbered
  353. * rdx must be saved
  354. * op[1,2,3,4] & ip0 mustn't be clobbered
  355. */
  356. movq %rdx, 0(%rsp)
  357. /* We can consume up to 7 input bytes each iteration. */
  358. movq %ip0, %rax /* rax = ip0 */
  359. movq 40(%rsp), %rdx /* rdx = ilimit */
  360. subq %rdx, %rax /* rax = ip0 - ilimit */
  361. movq %rax, %r15 /* r15 = ip0 - ilimit */
  362. /* rdx = rax / 7 */
  363. movabsq $2635249153387078803, %rdx
  364. mulq %rdx
  365. subq %rdx, %r15
  366. shrq %r15
  367. addq %r15, %rdx
  368. shrq $2, %rdx
  369. /* r15 = (ip0 - ilimit) / 7 */
  370. movq %rdx, %r15
  371. /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
  372. movq 8(%rsp), %rax /* rax = oend0 */
  373. subq %op0, %rax /* rax = oend0 - op0 */
  374. movq 16(%rsp), %rdx /* rdx = oend1 */
  375. subq %op1, %rdx /* rdx = oend1 - op1 */
  376. cmpq %rax, %rdx
  377. cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
  378. movq 24(%rsp), %rax /* rax = oend2 */
  379. subq %op2, %rax /* rax = oend2 - op2 */
  380. cmpq %rax, %rdx
  381. cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
  382. movq 32(%rsp), %rax /* rax = oend3 */
  383. subq %op3, %rax /* rax = oend3 - op3 */
  384. cmpq %rax, %rdx
  385. cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
  386. movabsq $-3689348814741910323, %rax
  387. mulq %rdx
  388. shrq $3, %rdx /* rdx = rdx / 10 */
  389. /* r15 = min(%rdx, %r15) */
  390. cmpq %rdx, %r15
  391. cmova %rdx, %r15
  392. /* olimit = op3 + 5 * r15 */
  393. movq %r15, %rax
  394. leaq (%op3, %rax, 4), %olimit
  395. addq %rax, %olimit
  396. movq 0(%rsp), %rdx
  397. /* If (op3 + 10 > olimit) */
  398. movq %op3, %rax /* rax = op3 */
  399. addq $10, %rax /* rax = op3 + 10 */
  400. cmpq %rax, %olimit /* op3 + 10 > olimit */
  401. jb .L_4X2_exit
  402. /* If (ip1 < ip0) go to exit */
  403. cmpq %ip0, %ip1
  404. jb .L_4X2_exit
  405. /* If (ip2 < ip1) go to exit */
  406. cmpq %ip1, %ip2
  407. jb .L_4X2_exit
  408. /* If (ip3 < ip2) go to exit */
  409. cmpq %ip2, %ip3
  410. jb .L_4X2_exit
  411. #define DECODE(n, idx) \
  412. movq %bits##n, %rax; \
  413. shrq $53, %rax; \
  414. movzwl 0(%dtable,%rax,4),%r8d; \
  415. movzbl 2(%dtable,%rax,4),%r15d; \
  416. movzbl 3(%dtable,%rax,4),%eax; \
  417. movw %r8w, (%op##n); \
  418. shlxq %r15, %bits##n, %bits##n; \
  419. addq %rax, %op##n
  420. #define RELOAD_BITS(n) \
  421. bsfq %bits##n, %bits##n; \
  422. movq %bits##n, %rax; \
  423. shrq $3, %bits##n; \
  424. andq $7, %rax; \
  425. subq %bits##n, %ip##n; \
  426. movq (%ip##n), %bits##n; \
  427. orq $1, %bits##n; \
  428. shlxq %rax, %bits##n, %bits##n
  429. movq %olimit, 48(%rsp)
  430. .p2align 6
  431. .L_4X2_loop_body:
  432. /* We clobber r8, so store it on the stack */
  433. movq %r8, 0(%rsp)
  434. /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
  435. FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
  436. FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
  437. FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
  438. FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
  439. FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
  440. /* Reload r8 */
  441. movq 0(%rsp), %r8
  442. FOR_EACH_STREAM(RELOAD_BITS)
  443. cmp %op3, 48(%rsp)
  444. ja .L_4X2_loop_body
  445. jmp .L_4X2_compute_olimit
  446. #undef DECODE
  447. #undef RELOAD_BITS
  448. .L_4X2_exit:
  449. addq $8, %rsp
  450. /* Restore stack (oend & olimit) */
  451. pop %rax /* oend0 */
  452. pop %rax /* oend1 */
  453. pop %rax /* oend2 */
  454. pop %rax /* oend3 */
  455. pop %rax /* ilimit */
  456. pop %rax /* olimit */
  457. pop %rax /* arg */
  458. /* Save ip / op / bits */
  459. movq %ip0, 0(%rax)
  460. movq %ip1, 8(%rax)
  461. movq %ip2, 16(%rax)
  462. movq %ip3, 24(%rax)
  463. movq %op0, 32(%rax)
  464. movq %op1, 40(%rax)
  465. movq %op2, 48(%rax)
  466. movq %op3, 56(%rax)
  467. movq %bits0, 64(%rax)
  468. movq %bits1, 72(%rax)
  469. movq %bits2, 80(%rax)
  470. movq %bits3, 88(%rax)
  471. /* Restore registers */
  472. pop %r15
  473. pop %r14
  474. pop %r13
  475. pop %r12
  476. pop %r11
  477. pop %r10
  478. pop %r9
  479. pop %r8
  480. pop %rdi
  481. pop %rsi
  482. pop %rbp
  483. pop %rdx
  484. pop %rcx
  485. pop %rbx
  486. pop %rax
  487. ret
  488. #endif