jchuff-sse2.asm 31 KB


  1. ;
  2. ; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
  3. ;
  4. ; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander.
  5. ; Copyright (C) 2015, Matthieu Darbois.
  6. ; Copyright (C) 2018, Matthias Räncker.
  7. ;
  8. ; Based on the x86 SIMD extension for IJG JPEG library
  9. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  10. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  11. ;
  12. ; This file should be assembled with NASM (Netwide Assembler),
  13. ; can *not* be assembled with Microsoft's MASM or any compatible
  14. ; assembler (including Borland's Turbo Assembler).
  15. ; NASM is available from http://nasm.sourceforge.net/ or
  16. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  17. ;
  18. ; This file contains an SSE2 implementation for Huffman coding of one block.
  19. ; The following code is based on jchuff.c; see jchuff.c for more details.
  20. %include "jsimdext.inc"
  ; Field offsets of the encoder's working_state structure.  The code in this
  ; file loads .cur.free_bits and .cur.put_buffer.simd on entry to the encoder
  ; function and stores both back before returning; the remaining members are
  ; declared only so that those offsets come out right.
  ; NOTE(review): resp is not defined in this file (presumably a pointer-sized
  ; reserve from jsimdext.inc) -- confirm.
  21. struc working_state
  22. .next_output_byte: resp 1 ; => next byte to write in buffer
  23. .free_in_buffer: resp 1 ; # of byte spaces remaining in buffer
  24. .cur.put_buffer.simd resq 1 ; current bit accumulation buffer (64 bits)
  25. .cur.free_bits resd 1 ; # of bits available in it
  26. .cur.last_dc_val resd 4 ; last DC coef for each component
  27. .cinfo: resp 1 ; dump_buffer needs access to this
  28. endstruc
  ; Field offsets of a derived Huffman table.  The encoder indexes .ehufco with
  ; a symbol scaled by 4 (dword entries) and .ehufsi with the unscaled symbol
  ; (byte entries), through the dctbl and actbl pointers.
  29. struc c_derived_tbl
  30. .ehufco: resd 256 ; code for each symbol (one dword per symbol)
  31. .ehufsi: resb 256 ; length of code for each symbol (one byte per symbol)
  32. ; If no code has been allocated for a symbol S, ehufsi[S] contains 0
  33. endstruc
  34. ; --------------------------------------------------------------------------
  ; Read-only lookup tables used by the encoder.
  ;
  ; jpeg_mask_bits[n] = (1 << n) - 1 for n = 0..15 (one dword each).
  ;
  ; jpeg_nbits_table is addressed with a signed index.  The bytes that follow
  ; the label give, for index v = 0..65535, the number of bits needed to
  ; represent v (0 -> 0, 1 -> 1, 2..3 -> 2, ...).  The bytes that precede the
  ; label mirror them for index -1..-32768, so that table[i] == table[~i] for
  ; i < 0.  The order and the repeat counts of the "times" lines below are
  ; therefore significant and must not be changed.
  35. SECTION SEG_CONST
  36. alignz 32
  37. GLOBAL_DATA(jconst_huff_encode_one_block)
  38. EXTN(jconst_huff_encode_one_block):
  39. jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
  40. dd 0x000f, 0x001f, 0x003f, 0x007f
  41. dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
  42. dd 0x0fff, 0x1fff, 0x3fff, 0x7fff
  43. alignz 32
  ; Negative half of the table (index -32768 first, index -1 last).
  44. times 1 << 14 db 15
  45. times 1 << 13 db 14
  46. times 1 << 12 db 13
  47. times 1 << 11 db 12
  48. times 1 << 10 db 11
  49. times 1 << 9 db 10
  50. times 1 << 8 db 9
  51. times 1 << 7 db 8
  52. times 1 << 6 db 7
  53. times 1 << 5 db 6
  54. times 1 << 4 db 5
  55. times 1 << 3 db 4
  56. times 1 << 2 db 3
  57. times 1 << 1 db 2
  58. times 1 << 0 db 1
  59. times 1 db 0
  ; Non-negative half of the table (index 0 first, index 65535 last).
  60. jpeg_nbits_table:
  61. times 1 db 0
  62. times 1 << 0 db 1
  63. times 1 << 1 db 2
  64. times 1 << 2 db 3
  65. times 1 << 3 db 4
  66. times 1 << 4 db 5
  67. times 1 << 5 db 6
  68. times 1 << 6 db 7
  69. times 1 << 7 db 8
  70. times 1 << 8 db 9
  71. times 1 << 9 db 10
  72. times 1 << 10 db 11
  73. times 1 << 11 db 12
  74. times 1 << 12 db 13
  75. times 1 << 13 db 14
  76. times 1 << 14 db 15
  77. times 1 << 15 db 16
  78. alignz 32
  ; NBITS(x) is the address of jpeg_nbits_table[x]; it relies on the nbits_base
  ; register holding the address of jpeg_nbits_table.  MASK_BITS(x) reaches
  ; jpeg_mask_bits[x] through the same base register by adding the constant
  ; distance between the two labels, so no second base register is needed.
  79. %define NBITS(x) nbits_base + x
  80. %define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)
  81. ; --------------------------------------------------------------------------
  82. SECTION SEG_TEXT
  83. BITS 64
  84. ; Shorthand used to describe SIMD operations:
  85. ; wN: xmmN treated as eight signed 16-bit values
  86. ; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7
  87. ; bN: xmmN treated as 16 unsigned 8-bit values
  88. ; bN[i]: perform the same operation on all 16 unsigned 8-bit values, i=0..15
  89. ; Contents of SIMD registers are shown in memory order.
  90. ; Fill the bit buffer to capacity with the leading bits from code, then output
  91. ; the bit buffer and put the remaining bits from code into the bit buffer.
  92. ;
  93. ; Usage:
  94. ; code - contains the bits to shift into the bit buffer (LSB-aligned)
  95. ; %1 - the label to which to jump when the macro completes
  96. ; %2 (optional) - extra instructions to execute once the macro no longer
  ;   needs nbits (the only user passes { mov nbits, code_temp })
  97. ;
  ; On entry, nbits is the length of code and free_bits has already had nbits
  ; subtracted from it, so -free_bits is the number of bits of code that do not
  ; fit.  xmm1 must contain 0xFF in every byte.
  ;
  98. ; Upon completion, free_bits will be 64 minus the number of remaining bits
  99. ; from code, and put_buffer will contain code (its low-order bits are the
  100. ; remaining bits). temp, code, nbits, and xmm0 will be clobbered.
  101. ;
  102. ; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
  103. ; macro in jchuff.c.
  104. %macro EMIT_QWORD 1-2
  105. add nbitsb, free_bitsb ; nbits += free_bits; (# of bits that still fit)
  106. neg free_bitsb ; free_bits = -free_bits;
  107. mov tempd, code ; temp = code;
  108. shl put_buffer, nbitsb ; put_buffer <<= nbits;
  109. mov nbitsb, free_bitsb ; nbits = free_bits; (# of bits left over)
  110. neg free_bitsb ; free_bits = -free_bits;
  111. shr tempd, nbitsb ; temp >>= nbits;
  112. or tempq, put_buffer ; temp |= put_buffer;
  113. movq xmm0, tempq ; xmm0.u64 = { temp, 0 };
  114. bswap tempq ; temp = 64-bit byte swap of temp (first output
  ; byte ends up in the lowest-addressed byte)
  115. mov put_buffer, codeq ; put_buffer = code;
  116. pcmpeqb xmm0, xmm1 ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
  117. %2
  118. pmovmskb code, xmm0 ; code = 0; code |= ((b0[i] >> 7) << i);
  119. mov qword [buffer], tempq ; memcpy(buffer, &temp, 8);
  120. ; (speculative; will be overwritten if
  121. ; code contains any 0xFF bytes)
  122. add free_bitsb, 64 ; free_bits += 64;
  123. add bufferp, 8 ; buffer += 8;
  124. test code, code ; if (code == 0) /* No 0xFF bytes */
  125. jz %1 ; return;
  126. ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
  127. ; bytes in the qword.
  ; Each step stores one byte followed by a zero byte and then advances buffer
  ; by 1 if the byte was not 0xFF (the zero is overwritten by the next step) or
  ; by 2 if it was 0xFF (the zero is kept).  The advance is computed without a
  ; branch: cmp sets CF when the byte is below 0xFF and sbb subtracts it.
  128. cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
  129. mov byte [buffer-7], 0 ; buffer[-7] = 0;
  130. sbb bufferp, 6 ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
  131. mov byte [buffer], temph ; buffer[0] = temp[1];
  132. cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
  133. mov byte [buffer+1], 0 ; buffer[1] = 0;
  134. sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
  135. shr tempq, 16 ; temp >>= 16;
  136. mov byte [buffer], tempb ; buffer[0] = temp[0];
  137. cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
  138. mov byte [buffer+1], 0 ; buffer[1] = 0;
  139. sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
  140. mov byte [buffer], temph ; buffer[0] = temp[1];
  141. cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
  142. mov byte [buffer+1], 0 ; buffer[1] = 0;
  143. sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
  144. shr tempq, 16 ; temp >>= 16;
  145. mov byte [buffer], tempb ; buffer[0] = temp[0];
  146. cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
  147. mov byte [buffer+1], 0 ; buffer[1] = 0;
  148. sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
  149. mov byte [buffer], temph ; buffer[0] = temp[1];
  150. cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
  151. mov byte [buffer+1], 0 ; buffer[1] = 0;
  152. sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
  153. shr tempd, 16 ; temp >>= 16; (only 32 bits remain, so the
  ; 32-bit form is sufficient here)
  154. mov byte [buffer], tempb ; buffer[0] = temp[0];
  155. cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
  156. mov byte [buffer+1], 0 ; buffer[1] = 0;
  157. sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
  158. mov byte [buffer], temph ; buffer[0] = temp[1];
  159. cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
  160. mov byte [buffer+1], 0 ; buffer[1] = 0;
  161. sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
  162. jmp %1 ; return;
  163. %endmacro
  164. ;
  165. ; Encode a single block's worth of coefficients.
  166. ;
  167. ; GLOBAL(JOCTET *)
  168. ; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
  169. ; JCOEFPTR block, int last_dc_val,
  170. ; c_derived_tbl *dctbl, c_derived_tbl *actbl)
  171. ;
  172. ; NOTES:
  173. ; When shuffling data, we try to avoid pinsrw as much as possible, since it is
  174. ; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on
  175. ; modern CPUs, so chains of pinsrw instructions (even with different outputs)
  176. ; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and
  177. ; requires 2 µops (with memory operand) on Intel. In either case, only one
  178. ; pinsrw instruction can be decoded per cycle (and nothing else if they are
  179. ; back-to-back), so out-of-order execution cannot be used to work around long
  180. ; pinsrw chains (though for Sandy Bridge and later, this may be less of a
  181. ; problem if the code runs from the µop cache.)
  182. ;
  183. ; We use tzcnt instead of bsf without checking for support. The instruction is
  184. ; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
  185. ; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is
  186. ; an input dependency (although the behavior is not formally defined, Intel
  187. ; CPUs usually leave the destination unmodified if the source is zero.) This
  188. ; can prevent out-of-order execution, so we clear the destination before
  189. ; invoking tzcnt.
  190. ;
  191. ; Initial register allocation
  192. ; rax - buffer
  193. ; rbx - temp
  194. ; rcx - nbits
  195. ; rdx - block --> free_bits
  196. ; rsi - nbits_base
  197. ; rdi - t
  198. ; rbp - code
  199. ; r8 - dctbl --> code_temp
  200. ; r9 - actbl
  201. ; r10 - state
  202. ; r11 - index
  203. ; r12 - put_buffer
  ; Symbolic names for the registers used by the encoder function and by the
  ; EMIT_QWORD macro.  A q/d/b/h suffix selects the 64-bit, 32-bit, low-byte or
  ; high-byte view of the same register.  temp must live in a register that has
  ; a high-byte view (bh), because EMIT_QWORD reads bytes through temph.
  ; Two names are re-assigned part-way through the function: block (rdx) is
  ; %undef'd and rdx becomes free_bits, and dctbl (r8) is %undef'd and r8d
  ; becomes code_temp.
  204. %define buffer rax
  ; bufferp is the name used when buffer is the destination of pointer
  ; arithmetic.  NOTE(review): raxp is not defined in this file (presumably
  ; supplied by jsimdext.inc) -- confirm.
  205. %ifdef WIN64
  206. %define bufferp rax
  207. %else
  208. %define bufferp raxp
  209. %endif
  210. %define tempq rbx
  211. %define tempd ebx
  212. %define tempb bl
  213. %define temph bh
  ; nbits must be rcx so that cl can serve as the variable shift count.
  214. %define nbitsq rcx
  215. %define nbits ecx
  216. %define nbitsb cl
  217. %define block rdx
  218. %define nbits_base rsi
  219. %define t rdi
  220. %define td edi
  221. %define codeq rbp
  222. %define code ebp
  223. %define dctbl r8
  224. %define actbl r9
  225. %define state r10
  226. %define index r11
  227. %define indexd r11d
  228. %define put_buffer r12
  229. %define put_bufferd r12d
  230. ; Step 1: Re-arrange input data according to jpeg_natural_order
  231. ; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
  232. ; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05
  233. ; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34
  234. ; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28
  235. ; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36
  236. ; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51
  237. ; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46
  238. ; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63
  ; jsimd_huff_encode_one_block_sse2: Huffman-encode one block of coefficients.
  ;
  ; Outline:
  ;  1. Copy block[1..63] into the scratch array t in jpeg_natural_order
  ;     (eight rows of eight words, tagged A..H in the comments), subtracting 1
  ;     from every negative value.  t is written with movaps, so it must be
  ;     16-byte aligned.
  ;  2. Build a 64-bit map (index) with one bit per entry of t; after the final
  ;     "not", a set bit means that the entry is non-zero.
  ;  3. Interleaved with 1 and 2 (lines tagged Z), compute the code for the DC
  ;     difference block[0] - last_dc_val from dctbl.
  ;  4. Walk the set bits of index with tzcnt and emit one actbl code per
  ;     non-zero entry; actbl symbol 0xf0 is emitted for every 16 skipped
  ;     entries, and actbl symbol 0 is emitted if the last entry is zero.
  ; The updated bit buffer and free-bit count are stored back into *state, and
  ; the advanced buffer pointer is left in rax.
  239. align 32
  240. GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
  241. EXTN(jsimd_huff_encode_one_block_sse2):
  242. %ifdef WIN64
  243. ; rcx = working_state *state
  244. ; rdx = JOCTET *buffer
  245. ; r8 = JCOEFPTR block
  246. ; r9 = int last_dc_val
  247. ; [rsp+40] = c_derived_tbl *dctbl (offset at entry, before the pushes)
  248. ; [rsp+48] = c_derived_tbl *actbl (offset at entry, before the pushes)
  249. ;X: X = code stream
  250. mov buffer, rdx
  251. mov block, r8
  252. movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
  253. push rbx
  254. push rbp
  255. movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
  256. push rsi
  257. push rdi
  258. push r12
  259. movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
  260. mov state, rcx
  261. movsx code, word [block] ;Z: code = block[0];
  262. pxor xmm4, xmm4 ;A: w4[i] = 0;
  263. sub code, r9d ;Z: code -= last_dc_val;
  ; Five registers have been pushed, so the return address is at [rsp+5*8] and
  ; the first argument home slot at [rsp+6*8]; arguments 5 and 6 follow the
  ; four home slots.
  264. mov dctbl, POINTER [rsp+6*8+4*8]
  265. mov actbl, POINTER [rsp+6*8+5*8]
  266. punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
  267. lea nbits_base, [rel jpeg_nbits_table]
  ; Reserve DCTSIZE2 words of stack for t (released again before the pops at
  ; the end of the function).
  268. add rsp, -DCTSIZE2 * SIZEOF_WORD
  269. mov t, rsp
  270. %else
  271. ; rdi = working_state *state
  272. ; rsi = JOCTET *buffer
  273. ; rdx = JCOEFPTR block
  274. ; rcx = int last_dc_val
  275. ; r8 = c_derived_tbl *dctbl
  276. ; r9 = c_derived_tbl *actbl
  277. ;X: X = code stream
  278. movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
  279. push rbx
  280. push rbp
  281. movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
  282. push r12
  283. mov state, rdi
  284. mov buffer, rsi
  285. movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
  286. movsx codeq, word [block] ;Z: code = block[0];
  287. lea nbits_base, [rel jpeg_nbits_table]
  288. pxor xmm4, xmm4 ;A: w4[i] = 0;
  ; Only the low 32 bits of the difference are used below (code is
  ; sign-extended again by the movsxd further down).
  289. sub codeq, rcx ;Z: code -= last_dc_val;
  290. punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
  291. lea t, [rsp - DCTSIZE2 * SIZEOF_WORD] ; use red zone for t_
  292. %endif
  ; Rows A..D: shuffle into zigzag order, subtract 1 from negative values,
  ; store to t, and reduce each row to a per-word "is zero" mask.
  293. pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
  294. pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
  295. punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
  296. punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13
  297. pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
  298. ;A: (Row 0, offset 1)
  299. pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
  300. paddw xmm0, xmm4 ;A: w0[i] += w4[i];
  301. movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i];
  302. movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- --
  303. pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- --
  304. pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12
  305. movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
  306. movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12
  307. punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51
  308. pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
  309. pxor xmm4, xmm4 ;A: w4[i] = 0;
  310. psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- --
  311. pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
  312. pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
  313. ; (Row 1, offset 1)
  314. pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
  315. paddw xmm1, xmm4 ;B: w1[i] += w4[i];
  316. movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i];
  317. pxor xmm4, xmm4 ;B: w4[i] = 0;
  318. pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
  319. packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
  320. ; w/ signed saturation
  321. pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
  322. pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
  323. pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
  324. pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
  325. ; (Row 3, offset 1)
  326. pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
  327. paddw xmm3, xmm4 ;D: w3[i] += w4[i];
  328. movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i];
  329. pxor xmm4, xmm4 ;D: w4[i] = 0;
  330. pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
  331. pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
  332. cmp code, 1 << 31 ;Z: Set CF if code < 0x80000000 (unsigned),
  333. ;Z: i.e. if code is non-negative
  334. pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
  335. pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
  ; pinsrw does not modify the flags, so CF from the cmp above is still valid.
  336. adc code, -1 ;Z: code += -1 + (code >= 0 ? 1 : 0);
  337. pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
  338. pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
  339. movsxd codeq, code ;Z: sign extend code
  340. pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
  341. ; (Row 2, offset 1)
  342. pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
  343. paddw xmm2, xmm4 ;C: w2[i] += w4[i];
  344. movaps XMMWORD [t + 16 * SIZEOF_WORD], xmm2 ;C: t[i+16] = w2[i];
  345. pxor xmm4, xmm4 ;C: w4[i] = 0;
  346. pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
  347. packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
  348. ; w/ signed saturation
  ; codeq may be negative here; the lookup then lands in the mirrored part of
  ; the table that precedes jpeg_nbits_table.
  349. movzx nbitsq, byte [NBITS(codeq)] ;Z: nbits = JPEG_NBITS(code);
  350. movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55
  ; put_buffer is used as scratch for the zero map of rows A..D until the real
  ; bit buffer is loaded from *state further down.
  351. pmovmskb tempd, xmm2 ;Z: temp = 0; temp |= ((b2[i] >> 7) << i);
  352. pmovmskb put_bufferd, xmm0 ;Z: put_buffer = 0; put_buffer |= ((b0[i] >> 7) << i);
  353. movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
  354. punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63
  355. shl tempd, 16 ;Z: temp <<= 16;
  356. psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 --
  357. pxor xmm2, xmm2 ;H: w2[i] = 0;
  358. or put_bufferd, tempd ;Z: put_buffer |= temp;
  359. pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 --
  360. movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- --
  361. unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59
  362. pxor xmm0, xmm0 ;H: w0[i] = 0;
  363. pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
  364. ; (Row 7, offset 1)
  ; The last word of row H was zero-filled by the psrldq above, so the 64th
  ; entry of t is 0 and is never reported as non-zero in index.
  365. pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
  366. paddw xmm3, xmm2 ;H: w3[i] += w2[i];
  367. movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i];
  368. movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- --
  369. pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
  370. punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47
  371. mov tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
  372. ;Z: temp = dctbl->ehufco[nbits];
  373. movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47
  374. psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 --
  375. shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59
  376. and code, dword [MASK_BITS(nbitsq)] ;Z: code &= (1 << nbits) - 1;
  377. pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 --
  378. pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58
  379. shl tempq, nbitsb ;Z: temp <<= nbits;
  380. pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
  381. pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58
  382. pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
  383. or code, tempd ;Z: code |= temp;
  384. movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58
  385. pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
  386. pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58
  387. pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
  388. ; (Row 6, offset 1)
  389. pxor xmm2, xmm2 ;G: w2[i] = 0;
  390. pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
  391. pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
  392. paddw xmm4, xmm0 ;G: w4[i] += w0[i];
  393. movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i];
  394. pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
  395. ; (Row 5, offset 1)
  396. pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
  397. pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59
  398. packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
  399. ; w/ signed saturation
  400. pxor xmm0, xmm0 ;F: w0[i] = 0;
  401. pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
  402. pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
  403. pmovmskb tempd, xmm4 ;Z: temp = 0; temp |= ((b4[i] >> 7) << i);
  404. pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
  405. paddw xmm1, xmm2 ;F: w1[i] += w2[i];
  406. movaps XMMWORD [t + 40 * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i];
  407. pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
  408. ; (Row 4, offset 1)
  ; block is not read past this point, so rdx is re-used for free_bits.
  409. %undef block
  410. %define free_bitsq rdx
  411. %define free_bitsd edx
  412. %define free_bitsb dl
  413. pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
  414. shl tempq, 48 ;Z: temp <<= 48;
  415. pxor xmm2, xmm2 ;E: w2[i] = 0;
  416. pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
  417. paddw xmm5, xmm0 ;E: w5[i] += w0[i];
  418. or tempq, put_buffer ;Z: temp |= put_buffer;
  419. movaps XMMWORD [t + 32 * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i];
  ; t is biased by one word because every loop iteration advances it by
  ; (number of skipped entries + 1) before reading.
  420. lea t, [dword t - 2] ;Z: t = &t[-1];
  421. pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
  422. packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
  423. ; w/ signed saturation
  424. add nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
  425. ;Z: nbits += dctbl->ehufsi[nbits];
  ; dctbl is not read past this point, so r8 is re-used for code_temp.
  426. %undef dctbl
  427. %define code_temp r8d
  428. pmovmskb indexd, xmm5 ;Z: index = 0; index |= ((b5[i] >> 7) << i);
  429. mov free_bitsd, [state+working_state.cur.free_bits]
  430. ;Z: free_bits = state->cur.free_bits;
  ; xmm1 stays all-ones from here on; EMIT_QWORD compares against it to find
  ; 0xFF bytes.
  431. pcmpeqw xmm1, xmm1 ;Z: b1[i] = 0xFF;
  432. shl index, 32 ;Z: index <<= 32;
  433. mov put_buffer, [state+working_state.cur.put_buffer.simd]
  434. ;Z: put_buffer = state->cur.put_buffer.simd;
  435. or index, tempq ;Z: index |= temp;
  ; Up to here a set bit meant "entry is zero"; invert so that a set bit means
  ; "entry is non-zero".
  436. not index ;Z: index = ~index;
  437. sub free_bitsb, nbitsb ;Z: if ((free_bits -= nbits) >= 0)
  438. jnl .ENTRY_SKIP_EMIT_CODE ;Z: goto .ENTRY_SKIP_EMIT_CODE;
  439. align 16
  440. .EMIT_CODE: ;Z: .EMIT_CODE:
  441. EMIT_QWORD .BLOOP_COND ;Z: insert code, flush buffer, goto .BLOOP_COND
  442. ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ; Reached when more than 15 entries were skipped (nbits > 16): emit actbl
  ; symbol 0xf0 and reduce the pending count by 16, repeating until at most 16
  ; remain.
  443. align 16
  444. .BRLOOP: ; do {
  445. lea code_temp, [nbitsq - 16] ; code_temp = nbits - 16;
  446. movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
  447. ; nbits = actbl->ehufsi[0xf0];
  448. mov code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
  449. ; code = actbl->ehufco[0xf0];
  450. sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
  451. jle .EMIT_BRLOOP_CODE ; goto .EMIT_BRLOOP_CODE;
  452. shl put_buffer, nbitsb ; put_buffer <<= nbits;
  453. mov nbits, code_temp ; nbits = code_temp;
  454. or put_buffer, codeq ; put_buffer |= code;
  455. cmp nbits, 16 ; if (nbits <= 16)
  456. jle .ERLOOP ; break;
  457. jmp .BRLOOP ; } while (1);
  458. ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  459. align 16
  460. times 5 nop
  461. .ENTRY_SKIP_EMIT_CODE: ; .ENTRY_SKIP_EMIT_CODE:
  462. shl put_buffer, nbitsb ; put_buffer <<= nbits;
  463. or put_buffer, codeq ; put_buffer |= code;
  464. .BLOOP_COND: ; .BLOOP_COND:
  465. test index, index ; if (index != 0)
  466. jz .ELOOP ; {
  467. .BLOOP: ; do {
  468. xor nbits, nbits ; nbits = 0; /* kill tzcnt input dependency */
  469. tzcnt nbitsq, index ; nbits = # of trailing 0 bits in index
  470. inc nbits ; ++nbits;
  471. lea t, [t + nbitsq * 2] ; t = &t[nbits];
  472. shr index, nbitsb ; index >>= nbits;
  473. .EMIT_BRLOOP_CODE_END: ; .EMIT_BRLOOP_CODE_END:
  474. cmp nbits, 16 ; if (nbits > 16)
  475. jg .BRLOOP ; goto .BRLOOP;
  ; Here nbits = (number of skipped entries + 1) in 1..16.  The symbol looked
  ; up below is nbits * 16 + size - 16, i.e. ((nbits - 1) << 4) | size, where
  ; size = JPEG_NBITS(code).
  476. .ERLOOP: ; .ERLOOP:
  477. movsx codeq, word [t] ; code = *t;
  478. lea tempd, [nbitsq * 2] ; temp = nbits * 2;
  479. movzx nbits, byte [NBITS(codeq)] ; nbits = JPEG_NBITS(code);
  480. lea tempd, [nbitsq + tempq * 8] ; temp = temp * 8 + nbits;
  481. mov code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
  482. ; code_temp = actbl->ehufco[temp-16];
  483. shl code_temp, nbitsb ; code_temp <<= nbits;
  484. and code, dword [MASK_BITS(nbitsq)] ; code &= (1 << nbits) - 1;
  485. add nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
  486. ; nbits += actbl->ehufsi[temp-16];
  487. or code, code_temp ; code |= code_temp;
  488. sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
  489. jle .EMIT_CODE ; goto .EMIT_CODE;
  490. shl put_buffer, nbitsb ; put_buffer <<= nbits;
  491. or put_buffer, codeq ; put_buffer |= code;
  492. test index, index
  493. jnz .BLOOP ; } while (index != 0);
  ; All non-zero entries have been emitted.  If t does not point at the last
  ; coefficient, the block ends in zeros and actbl symbol 0 is emitted.
  494. .ELOOP: ; } /* index != 0 */
  495. sub td, esp ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
  496. %ifdef WIN64
  497. cmp td, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (t != 62)
  498. %else
  499. cmp td, -2 * SIZEOF_WORD ; if (t != -2)
  500. %endif
  501. je .EFN ; {
  502. movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
  503. ; nbits = actbl->ehufsi[0];
  504. mov code, [actbl + c_derived_tbl.ehufco + 0] ; code = actbl->ehufco[0];
  505. sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
  506. jg .EFN_SKIP_EMIT_CODE ; {
  507. EMIT_QWORD .EFN ; insert code, flush buffer
  508. align 16
  509. .EFN_SKIP_EMIT_CODE: ; } else {
  510. shl put_buffer, nbitsb ; put_buffer <<= nbits;
  511. or put_buffer, codeq ; put_buffer |= code;
  512. .EFN: ; } }
  513. mov [state + working_state.cur.put_buffer.simd], put_buffer
  514. ; state->cur.put_buffer.simd = put_buffer;
  ; Only the low byte of free_bits is written back; the field is read as a
  ; dword on entry.
  515. mov byte [state + working_state.cur.free_bits], free_bitsb
  516. ; state->cur.free_bits = free_bits;
  ; Restore the saved registers in the reverse order of the pushes; rax still
  ; holds buffer and is the return value.
  517. %ifdef WIN64
  518. sub rsp, -DCTSIZE2 * SIZEOF_WORD
  519. pop r12
  520. pop rdi
  521. pop rsi
  522. pop rbp
  523. pop rbx
  524. %else
  525. pop r12
  526. pop rbp
  527. pop rbx
  528. %endif
  529. ret
  530. ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ; Out-of-line flush for .BRLOOP: the extra macro argument restores nbits from
  ; code_temp before control returns to the nbits > 16 test.
  531. align 16
  532. .EMIT_BRLOOP_CODE:
  533. EMIT_QWORD .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp }
  534. ; insert code, flush buffer,
  535. ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END
  536. ; For some reason, the OS X linker does not honor the request to align the
  537. ; segment unless we do this.
  538. align 32