decompress_amd64.s 15 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830
  1. // Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
  2. //go:build amd64 && !appengine && !noasm && gc
  3. // func decompress4x_main_loop_amd64(ctx *decompress4xContext)
  //
  // Decodes two table symbols from each of four bit readers per iteration and
  // stores each pair as one 16-bit write to four output positions that are
  // R8 bytes apart.
  //
  // Register roles inside the loop:
  //   DI  = byte loaded from 8(ctx), used as the shift count for every table peek
  //   BX  = output cursor, initialised from 16(ctx), advanced by 2 per iteration
  //   SI  = value loaded from 48(ctx); DL is set once BX >= SI
  //   R8  = value loaded from 24(ctx); byte distance between the four outputs
  //   R9  = value loaded from 32(ctx); base of the table of 16-bit entries
  //   R10 = pointer loaded from 0(ctx); base of four 48-byte bit reader records
  //   R11 = bit buffer of the reader being processed
  //   R12 = consumed-bit counter of the reader being processed
  //   DL  = "exhausted" flag; the loop repeats only while it is zero
  //
  // Reader k lives at R10 + 48*k and is accessed at +0 (input base pointer),
  // +24 (read offset), +32 (bit buffer) and +40 (consumed-bit counter, one byte).
  //
  // A table entry is loaded into CX: CL (low byte) is the number of bits to
  // consume, CH (high byte) is the decoded output byte.
  //
  // On return 40(ctx) holds 4 * (BX - 16(ctx)).
  4. TEXT ·decompress4x_main_loop_amd64(SB), $0-8
  5. // Preload values
  6. MOVQ ctx+0(FP), AX
  7. MOVBQZX 8(AX), DI
  8. MOVQ 16(AX), BX
  9. MOVQ 48(AX), SI
  10. MOVQ 24(AX), R8
  11. MOVQ 32(AX), R9
  12. MOVQ (AX), R10
  13. // Main loop
  14. main_loop:
  // DL = 1 when the output cursor has reached the limit in SI (signed compare)
  15. XORL DX, DX
  16. CMPQ BX, SI
  17. SETGE DL
  18. // br0.fillFast32()
  19. MOVQ 32(R10), R11
  20. MOVBQZX 40(R10), R12
  // Refill only when more than 32 bits have been consumed
  21. CMPQ R12, $0x20
  22. JBE skip_fill0
  23. MOVQ 24(R10), AX
  24. SUBQ $0x20, R12
  25. SUBQ $0x04, AX
  26. MOVQ (R10), R13
  27. // b.value |= uint64(low) << (b.bitsRead & 63)
  28. MOVL (AX)(R13*1), R13
  29. MOVQ R12, CX
  30. SHLQ CL, R13
  31. MOVQ AX, 24(R10)
  32. ORQ R13, R11
  33. // exhausted += (br0.off < 4)
  // CMPQ sets the carry flag when AX < 4 (unsigned); ADCB adds that carry to DL
  34. CMPQ AX, $0x04
  35. ADCB $+0, DL
  36. skip_fill0:
  37. // val0 := br0.peekTopBits(peekBits)
  38. MOVQ R11, R13
  39. MOVQ DI, CX
  40. SHRQ CL, R13
  41. // v0 := table[val0&mask]
  42. MOVW (R9)(R13*2), CX
  43. // br0.advance(uint8(v0.entry))
  // CH (decoded byte) goes to AL; CL (bit count) shifts the buffer and bumps the counter
  44. MOVB CH, AL
  45. SHLQ CL, R11
  46. ADDB CL, R12
  47. // val1 := br0.peekTopBits(peekBits)
  48. MOVQ DI, CX
  49. MOVQ R11, R13
  50. SHRQ CL, R13
  51. // v1 := table[val1&mask]
  52. MOVW (R9)(R13*2), CX
  53. // br0.advance(uint8(v1.entry))
  54. MOVB CH, AH
  55. SHLQ CL, R11
  56. ADDB CL, R12
  57. // these two writes get coalesced
  58. // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
  59. // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
  60. MOVW AX, (BX)
  61. // update the bitreader structure
  62. MOVQ R11, 32(R10)
  63. MOVB R12, 40(R10)
  64. // br1.fillFast32()
  65. MOVQ 80(R10), R11
  66. MOVBQZX 88(R10), R12
  67. CMPQ R12, $0x20
  68. JBE skip_fill1
  69. MOVQ 72(R10), AX
  70. SUBQ $0x20, R12
  71. SUBQ $0x04, AX
  72. MOVQ 48(R10), R13
  73. // b.value |= uint64(low) << (b.bitsRead & 63)
  74. MOVL (AX)(R13*1), R13
  75. MOVQ R12, CX
  76. SHLQ CL, R13
  77. MOVQ AX, 72(R10)
  78. ORQ R13, R11
  79. // exhausted += (br1.off < 4)
  80. CMPQ AX, $0x04
  81. ADCB $+0, DL
  82. skip_fill1:
  83. // val0 := br1.peekTopBits(peekBits)
  84. MOVQ R11, R13
  85. MOVQ DI, CX
  86. SHRQ CL, R13
  87. // v0 := table[val0&mask]
  88. MOVW (R9)(R13*2), CX
  89. // br1.advance(uint8(v0.entry))
  90. MOVB CH, AL
  91. SHLQ CL, R11
  92. ADDB CL, R12
  93. // val1 := br1.peekTopBits(peekBits)
  94. MOVQ DI, CX
  95. MOVQ R11, R13
  96. SHRQ CL, R13
  97. // v1 := table[val1&mask]
  98. MOVW (R9)(R13*2), CX
  99. // br1.advance(uint8(v1.entry))
  100. MOVB CH, AH
  101. SHLQ CL, R11
  102. ADDB CL, R12
  103. // these two writes get coalesced
  104. // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
  105. // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
  106. MOVW AX, (BX)(R8*1)
  107. // update the bitreader structure
  108. MOVQ R11, 80(R10)
  109. MOVB R12, 88(R10)
  110. // br2.fillFast32()
  111. MOVQ 128(R10), R11
  112. MOVBQZX 136(R10), R12
  113. CMPQ R12, $0x20
  114. JBE skip_fill2
  115. MOVQ 120(R10), AX
  116. SUBQ $0x20, R12
  117. SUBQ $0x04, AX
  118. MOVQ 96(R10), R13
  119. // b.value |= uint64(low) << (b.bitsRead & 63)
  120. MOVL (AX)(R13*1), R13
  121. MOVQ R12, CX
  122. SHLQ CL, R13
  123. MOVQ AX, 120(R10)
  124. ORQ R13, R11
  125. // exhausted += (br2.off < 4)
  126. CMPQ AX, $0x04
  127. ADCB $+0, DL
  128. skip_fill2:
  129. // val0 := br2.peekTopBits(peekBits)
  130. MOVQ R11, R13
  131. MOVQ DI, CX
  132. SHRQ CL, R13
  133. // v0 := table[val0&mask]
  134. MOVW (R9)(R13*2), CX
  135. // br2.advance(uint8(v0.entry))
  136. MOVB CH, AL
  137. SHLQ CL, R11
  138. ADDB CL, R12
  139. // val1 := br2.peekTopBits(peekBits)
  140. MOVQ DI, CX
  141. MOVQ R11, R13
  142. SHRQ CL, R13
  143. // v1 := table[val1&mask]
  144. MOVW (R9)(R13*2), CX
  145. // br2.advance(uint8(v1.entry))
  146. MOVB CH, AH
  147. SHLQ CL, R11
  148. ADDB CL, R12
  149. // these two writes get coalesced
  150. // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
  151. // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
  152. MOVW AX, (BX)(R8*2)
  153. // update the bitreader structure
  154. MOVQ R11, 128(R10)
  155. MOVB R12, 136(R10)
  156. // br3.fillFast32()
  157. MOVQ 176(R10), R11
  158. MOVBQZX 184(R10), R12
  159. CMPQ R12, $0x20
  160. JBE skip_fill3
  161. MOVQ 168(R10), AX
  162. SUBQ $0x20, R12
  163. SUBQ $0x04, AX
  164. MOVQ 144(R10), R13
  165. // b.value |= uint64(low) << (b.bitsRead & 63)
  166. MOVL (AX)(R13*1), R13
  167. MOVQ R12, CX
  168. SHLQ CL, R13
  169. MOVQ AX, 168(R10)
  170. ORQ R13, R11
  171. // exhausted += (br3.off < 4)
  172. CMPQ AX, $0x04
  173. ADCB $+0, DL
  174. skip_fill3:
  175. // val0 := br3.peekTopBits(peekBits)
  176. MOVQ R11, R13
  177. MOVQ DI, CX
  178. SHRQ CL, R13
  179. // v0 := table[val0&mask]
  180. MOVW (R9)(R13*2), CX
  181. // br3.advance(uint8(v0.entry))
  182. MOVB CH, AL
  183. SHLQ CL, R11
  184. ADDB CL, R12
  185. // val1 := br3.peekTopBits(peekBits)
  186. MOVQ DI, CX
  187. MOVQ R11, R13
  188. SHRQ CL, R13
  189. // v1 := table[val1&mask]
  190. MOVW (R9)(R13*2), CX
  191. // br3.advance(uint8(v1.entry))
  192. MOVB CH, AH
  193. SHLQ CL, R11
  194. ADDB CL, R12
  195. // these two writes get coalesced
  196. // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
  197. // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
  // CX = 3 * R8, since the scaled index form only offers factors 1, 2, 4 and 8
  198. LEAQ (R8)(R8*2), CX
  199. MOVW AX, (BX)(CX*1)
  200. // update the bitreader structure
  201. MOVQ R11, 176(R10)
  202. MOVB R12, 184(R10)
  // Two bytes were written per stream; repeat while nothing raised the exhausted flag
  203. ADDQ $0x02, BX
  204. TESTB DL, DL
  205. JZ main_loop
  // Store 4 * (BX - 16(ctx)) into 40(ctx): bytes written per stream times four streams
  206. MOVQ ctx+0(FP), AX
  207. SUBQ 16(AX), BX
  208. SHLQ $0x02, BX
  209. MOVQ BX, 40(AX)
  210. RET
  211. // func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
  //
  // Decodes four table symbols from each of four bit readers per iteration and
  // stores each group as one 32-bit write to four output positions that are
  // R8 bytes apart.
  //
  // Register roles inside the loop:
  //   DI  = byte loaded from 8(ctx), used as the shift count for every table peek
  //   BX  = output cursor, initialised from 16(ctx), advanced by 4 per iteration
  //   SI  = value loaded from 48(ctx); DL is set once BX >= SI
  //   R8  = value loaded from 24(ctx); byte distance between the four outputs
  //   R9  = value loaded from 32(ctx); base of the table of 16-bit entries
  //   R10 = pointer loaded from 0(ctx); base of four 48-byte bit reader records
  //   R11 = bit buffer of the reader being processed
  //   R12 = consumed-bit counter of the reader being processed
  //   AX  = accumulator for the four decoded bytes of the current reader
  //   DL  = "exhausted" flag; the loop repeats only while it is zero
  //
  // Reader k lives at R10 + 48*k and is accessed at +0 (input base pointer),
  // +24 (read offset), +32 (bit buffer) and +40 (consumed-bit counter, one byte).
  //
  // A table entry is loaded into CX: CL (low byte) is the number of bits to
  // consume, CH (high byte) is the decoded output byte.
  //
  // On return 40(ctx) holds 4 * (BX - 16(ctx)).
  212. TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
  213. // Preload values
  214. MOVQ ctx+0(FP), CX
  215. MOVBQZX 8(CX), DI
  216. MOVQ 16(CX), BX
  217. MOVQ 48(CX), SI
  218. MOVQ 24(CX), R8
  219. MOVQ 32(CX), R9
  220. MOVQ (CX), R10
  221. // Main loop
  222. main_loop:
  // DL = 1 when the output cursor has reached the limit in SI (signed compare)
  223. XORL DX, DX
  224. CMPQ BX, SI
  225. SETGE DL
  226. // br0.fillFast32()
  227. MOVQ 32(R10), R11
  228. MOVBQZX 40(R10), R12
  // Refill only when more than 32 bits have been consumed
  229. CMPQ R12, $0x20
  230. JBE skip_fill0
  231. MOVQ 24(R10), R13
  232. SUBQ $0x20, R12
  233. SUBQ $0x04, R13
  234. MOVQ (R10), R14
  235. // b.value |= uint64(low) << (b.bitsRead & 63)
  236. MOVL (R13)(R14*1), R14
  237. MOVQ R12, CX
  238. SHLQ CL, R14
  239. MOVQ R13, 24(R10)
  240. ORQ R14, R11
  241. // exhausted += (br0.off < 4)
  // CMPQ sets the carry flag when R13 < 4 (unsigned); ADCB adds that carry to DL
  242. CMPQ R13, $0x04
  243. ADCB $+0, DL
  244. skip_fill0:
  245. // val0 := br0.peekTopBits(peekBits)
  246. MOVQ R11, R13
  247. MOVQ DI, CX
  248. SHRQ CL, R13
  249. // v0 := table[val0&mask]
  250. MOVW (R9)(R13*2), CX
  251. // br0.advance(uint8(v0.entry))
  252. MOVB CH, AL
  253. SHLQ CL, R11
  254. ADDB CL, R12
  255. // val1 := br0.peekTopBits(peekBits)
  256. MOVQ R11, R13
  257. MOVQ DI, CX
  258. SHRQ CL, R13
  259. // v1 := table[val1&mask]
  260. MOVW (R9)(R13*2), CX
  261. // br0.advance(uint8(v1.entry))
  262. MOVB CH, AH
  263. SHLQ CL, R11
  264. ADDB CL, R12
  // Byte-swap so v0/v1 move to the top two bytes of EAX, freeing AH/AL for v2/v3
  265. BSWAPL AX
  266. // val2 := br0.peekTopBits(peekBits)
  267. MOVQ R11, R13
  268. MOVQ DI, CX
  269. SHRQ CL, R13
  270. // v2 := table[val2&mask]
  271. MOVW (R9)(R13*2), CX
  272. // br0.advance(uint8(v2.entry))
  273. MOVB CH, AH
  274. SHLQ CL, R11
  275. ADDB CL, R12
  276. // val3 := br0.peekTopBits(peekBits)
  277. MOVQ R11, R13
  278. MOVQ DI, CX
  279. SHRQ CL, R13
  280. // v3 := table[val3&mask]
  281. MOVW (R9)(R13*2), CX
  282. // br0.advance(uint8(v3.entry))
  283. MOVB CH, AL
  284. SHLQ CL, R11
  285. ADDB CL, R12
  // Second byte-swap leaves v0, v1, v2, v3 in bytes 0..3 of EAX
  286. BSWAPL AX
  287. // these four writes get coalesced
  288. // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
  289. // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
  290. // out[id * dstEvery + 2] = uint8(v2.entry >> 8)
  291. // out[id * dstEvery + 3] = uint8(v3.entry >> 8)
  292. MOVL AX, (BX)
  293. // update the bitreader structure
  294. MOVQ R11, 32(R10)
  295. MOVB R12, 40(R10)
  296. // br1.fillFast32()
  297. MOVQ 80(R10), R11
  298. MOVBQZX 88(R10), R12
  299. CMPQ R12, $0x20
  300. JBE skip_fill1
  301. MOVQ 72(R10), R13
  302. SUBQ $0x20, R12
  303. SUBQ $0x04, R13
  304. MOVQ 48(R10), R14
  305. // b.value |= uint64(low) << (b.bitsRead & 63)
  306. MOVL (R13)(R14*1), R14
  307. MOVQ R12, CX
  308. SHLQ CL, R14
  309. MOVQ R13, 72(R10)
  310. ORQ R14, R11
  311. // exhausted += (br1.off < 4)
  312. CMPQ R13, $0x04
  313. ADCB $+0, DL
  314. skip_fill1:
  315. // val0 := br1.peekTopBits(peekBits)
  316. MOVQ R11, R13
  317. MOVQ DI, CX
  318. SHRQ CL, R13
  319. // v0 := table[val0&mask]
  320. MOVW (R9)(R13*2), CX
  321. // br1.advance(uint8(v0.entry))
  322. MOVB CH, AL
  323. SHLQ CL, R11
  324. ADDB CL, R12
  325. // val1 := br1.peekTopBits(peekBits)
  326. MOVQ R11, R13
  327. MOVQ DI, CX
  328. SHRQ CL, R13
  329. // v1 := table[val1&mask]
  330. MOVW (R9)(R13*2), CX
  331. // br1.advance(uint8(v1.entry))
  332. MOVB CH, AH
  333. SHLQ CL, R11
  334. ADDB CL, R12
  335. BSWAPL AX
  336. // val2 := br1.peekTopBits(peekBits)
  337. MOVQ R11, R13
  338. MOVQ DI, CX
  339. SHRQ CL, R13
  340. // v2 := table[val2&mask]
  341. MOVW (R9)(R13*2), CX
  342. // br1.advance(uint8(v2.entry))
  343. MOVB CH, AH
  344. SHLQ CL, R11
  345. ADDB CL, R12
  346. // val3 := br1.peekTopBits(peekBits)
  347. MOVQ R11, R13
  348. MOVQ DI, CX
  349. SHRQ CL, R13
  350. // v3 := table[val3&mask]
  351. MOVW (R9)(R13*2), CX
  352. // br1.advance(uint8(v3.entry))
  353. MOVB CH, AL
  354. SHLQ CL, R11
  355. ADDB CL, R12
  356. BSWAPL AX
  357. // these four writes get coalesced
  358. // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
  359. // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
  360. // out[id * dstEvery + 2] = uint8(v2.entry >> 8)
  361. // out[id * dstEvery + 3] = uint8(v3.entry >> 8)
  362. MOVL AX, (BX)(R8*1)
  363. // update the bitreader structure
  364. MOVQ R11, 80(R10)
  365. MOVB R12, 88(R10)
  366. // br2.fillFast32()
  367. MOVQ 128(R10), R11
  368. MOVBQZX 136(R10), R12
  369. CMPQ R12, $0x20
  370. JBE skip_fill2
  371. MOVQ 120(R10), R13
  372. SUBQ $0x20, R12
  373. SUBQ $0x04, R13
  374. MOVQ 96(R10), R14
  375. // b.value |= uint64(low) << (b.bitsRead & 63)
  376. MOVL (R13)(R14*1), R14
  377. MOVQ R12, CX
  378. SHLQ CL, R14
  379. MOVQ R13, 120(R10)
  380. ORQ R14, R11
  381. // exhausted += (br2.off < 4)
  382. CMPQ R13, $0x04
  383. ADCB $+0, DL
  384. skip_fill2:
  385. // val0 := br2.peekTopBits(peekBits)
  386. MOVQ R11, R13
  387. MOVQ DI, CX
  388. SHRQ CL, R13
  389. // v0 := table[val0&mask]
  390. MOVW (R9)(R13*2), CX
  391. // br2.advance(uint8(v0.entry))
  392. MOVB CH, AL
  393. SHLQ CL, R11
  394. ADDB CL, R12
  395. // val1 := br2.peekTopBits(peekBits)
  396. MOVQ R11, R13
  397. MOVQ DI, CX
  398. SHRQ CL, R13
  399. // v1 := table[val1&mask]
  400. MOVW (R9)(R13*2), CX
  401. // br2.advance(uint8(v1.entry))
  402. MOVB CH, AH
  403. SHLQ CL, R11
  404. ADDB CL, R12
  405. BSWAPL AX
  406. // val2 := br2.peekTopBits(peekBits)
  407. MOVQ R11, R13
  408. MOVQ DI, CX
  409. SHRQ CL, R13
  410. // v2 := table[val2&mask]
  411. MOVW (R9)(R13*2), CX
  412. // br2.advance(uint8(v2.entry))
  413. MOVB CH, AH
  414. SHLQ CL, R11
  415. ADDB CL, R12
  416. // val3 := br2.peekTopBits(peekBits)
  417. MOVQ R11, R13
  418. MOVQ DI, CX
  419. SHRQ CL, R13
  420. // v3 := table[val3&mask]
  421. MOVW (R9)(R13*2), CX
  422. // br2.advance(uint8(v3.entry))
  423. MOVB CH, AL
  424. SHLQ CL, R11
  425. ADDB CL, R12
  426. BSWAPL AX
  427. // these four writes get coalesced
  428. // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
  429. // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
  430. // out[id * dstEvery + 2] = uint8(v2.entry >> 8)
  431. // out[id * dstEvery + 3] = uint8(v3.entry >> 8)
  432. MOVL AX, (BX)(R8*2)
  433. // update the bitreader structure
  434. MOVQ R11, 128(R10)
  435. MOVB R12, 136(R10)
  436. // br3.fillFast32()
  437. MOVQ 176(R10), R11
  438. MOVBQZX 184(R10), R12
  439. CMPQ R12, $0x20
  440. JBE skip_fill3
  441. MOVQ 168(R10), R13
  442. SUBQ $0x20, R12
  443. SUBQ $0x04, R13
  444. MOVQ 144(R10), R14
  445. // b.value |= uint64(low) << (b.bitsRead & 63)
  446. MOVL (R13)(R14*1), R14
  447. MOVQ R12, CX
  448. SHLQ CL, R14
  449. MOVQ R13, 168(R10)
  450. ORQ R14, R11
  451. // exhausted += (br3.off < 4)
  452. CMPQ R13, $0x04
  453. ADCB $+0, DL
  454. skip_fill3:
  455. // val0 := br3.peekTopBits(peekBits)
  456. MOVQ R11, R13
  457. MOVQ DI, CX
  458. SHRQ CL, R13
  459. // v0 := table[val0&mask]
  460. MOVW (R9)(R13*2), CX
  461. // br3.advance(uint8(v0.entry))
  462. MOVB CH, AL
  463. SHLQ CL, R11
  464. ADDB CL, R12
  465. // val1 := br3.peekTopBits(peekBits)
  466. MOVQ R11, R13
  467. MOVQ DI, CX
  468. SHRQ CL, R13
  469. // v1 := table[val1&mask]
  470. MOVW (R9)(R13*2), CX
  471. // br3.advance(uint8(v1.entry))
  472. MOVB CH, AH
  473. SHLQ CL, R11
  474. ADDB CL, R12
  475. BSWAPL AX
  476. // val2 := br3.peekTopBits(peekBits)
  477. MOVQ R11, R13
  478. MOVQ DI, CX
  479. SHRQ CL, R13
  480. // v2 := table[val2&mask]
  481. MOVW (R9)(R13*2), CX
  482. // br3.advance(uint8(v2.entry))
  483. MOVB CH, AH
  484. SHLQ CL, R11
  485. ADDB CL, R12
  486. // val3 := br3.peekTopBits(peekBits)
  487. MOVQ R11, R13
  488. MOVQ DI, CX
  489. SHRQ CL, R13
  490. // v3 := table[val3&mask]
  491. MOVW (R9)(R13*2), CX
  492. // br3.advance(uint8(v3.entry))
  493. MOVB CH, AL
  494. SHLQ CL, R11
  495. ADDB CL, R12
  496. BSWAPL AX
  497. // these four writes get coalesced
  498. // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
  499. // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
  500. // out[id * dstEvery + 2] = uint8(v2.entry >> 8)
  501. // out[id * dstEvery + 3] = uint8(v3.entry >> 8)
  // CX = 3 * R8, since the scaled index form only offers factors 1, 2, 4 and 8
  502. LEAQ (R8)(R8*2), CX
  503. MOVL AX, (BX)(CX*1)
  504. // update the bitreader structure
  505. MOVQ R11, 176(R10)
  506. MOVB R12, 184(R10)
  // Four bytes were written per stream; repeat while nothing raised the exhausted flag
  507. ADDQ $0x04, BX
  508. TESTB DL, DL
  509. JZ main_loop
  // Store 4 * (BX - 16(ctx)) into 40(ctx): bytes written per stream times four streams
  510. MOVQ ctx+0(FP), AX
  511. SUBQ 16(AX), BX
  512. SHLQ $0x02, BX
  513. MOVQ BX, 40(AX)
  514. RET
  515. // func decompress1x_main_loop_amd64(ctx *decompress1xContext)
  //
  // Decodes four table symbols per iteration from a single bit reader and
  // stores them as one 32-bit write at the output cursor.
  //
  // Register roles:
  //   DX  = output cursor, initialised from 16(ctx), advanced by 4 per iteration
  //   BX  = 16(ctx) + 24(ctx), the bound the output cursor is checked against
  //   R8  = input base pointer, loaded from +0 of the record that 0(ctx) points to
  //   R9  = read offset, loaded from +24 of that record
  //   R10 = bit buffer, loaded from +32 of that record
  //   R11 = consumed-bit counter, loaded (one byte) from +40 of that record
  //   SI  = value loaded from 32(ctx); base of the table of 16-bit entries
  //   DI  = byte loaded from 8(ctx), used as the shift count for every table peek
  //   AX  = accumulator for the four decoded bytes
  //
  // A table entry is loaded into CX: CL (low byte) is the number of bits to
  // consume, CH (high byte) is the decoded output byte.
  //
  // Normal exit: 40(ctx) = DX - 16(ctx), and R9, R10, R11 are written back to
  // the reader record. Error exit: 40(ctx) = -1 and the record is not updated.
  516. TEXT ·decompress1x_main_loop_amd64(SB), $0-8
  517. MOVQ ctx+0(FP), CX
  518. MOVQ 16(CX), DX
  519. MOVQ 24(CX), BX
  // Fail immediately when 24(ctx) is below 4 (unsigned compare)
  520. CMPQ BX, $0x04
  521. JB error_max_decoded_size_exceeded
  522. LEAQ (DX)(BX*1), BX
  523. MOVQ (CX), SI
  524. MOVQ (SI), R8
  525. MOVQ 24(SI), R9
  526. MOVQ 32(SI), R10
  527. MOVBQZX 40(SI), R11
  528. MOVQ 32(CX), SI
  529. MOVBQZX 8(CX), DI
  530. JMP loop_condition
  531. main_loop:
  532. // Check if we have room for 4 bytes in the output buffer
  // NOTE(review): JGE (signed) also branches when DX + 4 == BX, i.e. when exactly
  // four bytes remain before the bound; confirm against the generator that this is intended.
  533. LEAQ 4(DX), CX
  534. CMPQ CX, BX
  535. JGE error_max_decoded_size_exceeded
  536. // Decode 4 values
  // Refill once at least 32 bits have been consumed: step the read offset back
  // by 4 and OR the loaded 32 bits into the buffer, shifted left by the reduced counter
  537. CMPQ R11, $0x20
  538. JL bitReader_fillFast_1_end
  539. SUBQ $0x20, R11
  540. SUBQ $0x04, R9
  541. MOVL (R8)(R9*1), R12
  542. MOVQ R11, CX
  543. SHLQ CL, R12
  544. ORQ R12, R10
  545. bitReader_fillFast_1_end:
  // Symbol 0: peek the table index, decoded byte -> AL, then consume CL bits
  546. MOVQ DI, CX
  547. MOVQ R10, R12
  548. SHRQ CL, R12
  549. MOVW (SI)(R12*2), CX
  550. MOVB CH, AL
  551. MOVBQZX CL, CX
  552. ADDQ CX, R11
  553. SHLQ CL, R10
  // Symbol 1: decoded byte -> AH
  554. MOVQ DI, CX
  555. MOVQ R10, R12
  556. SHRQ CL, R12
  557. MOVW (SI)(R12*2), CX
  558. MOVB CH, AH
  559. MOVBQZX CL, CX
  560. ADDQ CX, R11
  561. SHLQ CL, R10
  // Byte-swap so symbols 0/1 move to the top two bytes of EAX, freeing AH/AL
  562. BSWAPL AX
  563. CMPQ R11, $0x20
  564. JL bitReader_fillFast_2_end
  565. SUBQ $0x20, R11
  566. SUBQ $0x04, R9
  567. MOVL (R8)(R9*1), R12
  568. MOVQ R11, CX
  569. SHLQ CL, R12
  570. ORQ R12, R10
  571. bitReader_fillFast_2_end:
  // Symbol 2: decoded byte -> AH
  572. MOVQ DI, CX
  573. MOVQ R10, R12
  574. SHRQ CL, R12
  575. MOVW (SI)(R12*2), CX
  576. MOVB CH, AH
  577. MOVBQZX CL, CX
  578. ADDQ CX, R11
  579. SHLQ CL, R10
  // Symbol 3: decoded byte -> AL
  580. MOVQ DI, CX
  581. MOVQ R10, R12
  582. SHRQ CL, R12
  583. MOVW (SI)(R12*2), CX
  584. MOVB CH, AL
  585. MOVBQZX CL, CX
  586. ADDQ CX, R11
  587. SHLQ CL, R10
  // Second byte-swap leaves symbols 0..3 in bytes 0..3 of EAX
  588. BSWAPL AX
  589. // Store the decoded values
  590. MOVL AX, (DX)
  591. ADDQ $0x04, DX
  592. loop_condition:
  // Keep looping while the read offset is at least 8 (signed compare)
  593. CMPQ R9, $0x08
  594. JGE main_loop
  595. // Update ctx structure
  // 40(ctx) = bytes written; then write offset, bit buffer and counter back to the reader record
  596. MOVQ ctx+0(FP), AX
  597. SUBQ 16(AX), DX
  598. MOVQ DX, 40(AX)
  599. MOVQ (AX), AX
  600. MOVQ R9, 24(AX)
  601. MOVQ R10, 32(AX)
  602. MOVB R11, 40(AX)
  603. RET
  604. // Report error
  // Stores -1 in 40(ctx); the reader record is left as it was on entry
  605. error_max_decoded_size_exceeded:
  606. MOVQ ctx+0(FP), AX
  607. MOVQ $-1, CX
  608. MOVQ CX, 40(AX)
  609. RET
  610. // func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
  611. // Requires: BMI2
  //
  // Decodes four table symbols per iteration from a single bit reader and
  // stores them as one 32-bit write at the output cursor. Shifts use the
  // three-operand SHLXQ/SHRXQ forms (count, source, destination), so the shift
  // count does not have to be moved into CL.
  //
  // Register roles:
  //   DX  = output cursor, initialised from 16(ctx), advanced by 4 per iteration
  //   BX  = 16(ctx) + 24(ctx), the bound the output cursor is checked against
  //   R8  = input base pointer, loaded from +0 of the record that 0(ctx) points to
  //   R9  = read offset, loaded from +24 of that record
  //   R10 = bit buffer, loaded from +32 of that record
  //   R11 = consumed-bit counter, loaded (one byte) from +40 of that record
  //   SI  = value loaded from 32(ctx); base of the table of 16-bit entries
  //   DI  = byte loaded from 8(ctx), used as the shift count for every table peek
  //   AX  = accumulator for the four decoded bytes
  //
  // A table entry is loaded into CX: CL (low byte) is the number of bits to
  // consume, CH (high byte) is the decoded output byte.
  //
  // Normal exit: 40(ctx) = DX - 16(ctx), and R9, R10, R11 are written back to
  // the reader record. Error exit: 40(ctx) = -1 and the record is not updated.
  612. TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
  613. MOVQ ctx+0(FP), CX
  614. MOVQ 16(CX), DX
  615. MOVQ 24(CX), BX
  // Fail immediately when 24(ctx) is below 4 (unsigned compare)
  616. CMPQ BX, $0x04
  617. JB error_max_decoded_size_exceeded
  618. LEAQ (DX)(BX*1), BX
  619. MOVQ (CX), SI
  620. MOVQ (SI), R8
  621. MOVQ 24(SI), R9
  622. MOVQ 32(SI), R10
  623. MOVBQZX 40(SI), R11
  624. MOVQ 32(CX), SI
  625. MOVBQZX 8(CX), DI
  626. JMP loop_condition
  627. main_loop:
  628. // Check if we have room for 4 bytes in the output buffer
  // NOTE(review): JGE (signed) also branches when DX + 4 == BX, i.e. when exactly
  // four bytes remain before the bound; confirm against the generator that this is intended.
  629. LEAQ 4(DX), CX
  630. CMPQ CX, BX
  631. JGE error_max_decoded_size_exceeded
  632. // Decode 4 values
  // Refill once at least 32 bits have been consumed: step the read offset back
  // by 4 and OR the loaded 32 bits into the buffer, shifted left by the reduced counter
  633. CMPQ R11, $0x20
  634. JL bitReader_fillFast_1_end
  635. SUBQ $0x20, R11
  636. SUBQ $0x04, R9
  637. MOVL (R8)(R9*1), CX
  638. SHLXQ R11, CX, CX
  639. ORQ CX, R10
  640. bitReader_fillFast_1_end:
  // Symbol 0: CX = R10 >> DI is the table index; decoded byte -> AL, then consume CL bits
  641. SHRXQ DI, R10, CX
  642. MOVW (SI)(CX*2), CX
  643. MOVB CH, AL
  644. MOVBQZX CL, CX
  645. ADDQ CX, R11
  646. SHLXQ CX, R10, R10
  // Symbol 1: decoded byte -> AH
  647. SHRXQ DI, R10, CX
  648. MOVW (SI)(CX*2), CX
  649. MOVB CH, AH
  650. MOVBQZX CL, CX
  651. ADDQ CX, R11
  652. SHLXQ CX, R10, R10
  // Byte-swap so symbols 0/1 move to the top two bytes of EAX, freeing AH/AL
  653. BSWAPL AX
  654. CMPQ R11, $0x20
  655. JL bitReader_fillFast_2_end
  656. SUBQ $0x20, R11
  657. SUBQ $0x04, R9
  658. MOVL (R8)(R9*1), CX
  659. SHLXQ R11, CX, CX
  660. ORQ CX, R10
  661. bitReader_fillFast_2_end:
  // Symbol 2: decoded byte -> AH
  662. SHRXQ DI, R10, CX
  663. MOVW (SI)(CX*2), CX
  664. MOVB CH, AH
  665. MOVBQZX CL, CX
  666. ADDQ CX, R11
  667. SHLXQ CX, R10, R10
  // Symbol 3: decoded byte -> AL
  668. SHRXQ DI, R10, CX
  669. MOVW (SI)(CX*2), CX
  670. MOVB CH, AL
  671. MOVBQZX CL, CX
  672. ADDQ CX, R11
  673. SHLXQ CX, R10, R10
  // Second byte-swap leaves symbols 0..3 in bytes 0..3 of EAX
  674. BSWAPL AX
  675. // Store the decoded values
  676. MOVL AX, (DX)
  677. ADDQ $0x04, DX
  678. loop_condition:
  // Keep looping while the read offset is at least 8 (signed compare)
  679. CMPQ R9, $0x08
  680. JGE main_loop
  681. // Update ctx structure
  // 40(ctx) = bytes written; then write offset, bit buffer and counter back to the reader record
  682. MOVQ ctx+0(FP), AX
  683. SUBQ 16(AX), DX
  684. MOVQ DX, 40(AX)
  685. MOVQ (AX), AX
  686. MOVQ R9, 24(AX)
  687. MOVQ R10, 32(AX)
  688. MOVB R11, 40(AX)
  689. RET
  690. // Report error
  // Stores -1 in 40(ctx); the reader record is left as it was on entry
  691. error_max_decoded_size_exceeded:
  692. MOVQ ctx+0(FP), AX
  693. MOVQ $-1, CX
  694. MOVQ CX, 40(AX)
  695. RET