
;
; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64

;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
; r10 = JSAMPARRAY sample_data
; r11d = JDIMENSION start_col
; r12 = DCTELEM *workspace
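;
; Rough C-equivalent of this routine, for reference only (an illustrative
; sketch for 8-bit samples, not the actual libjpeg-turbo C code; the loop
; variables are made up):
;
;   for (row = 0; row < DCTSIZE; row++)
;     for (col = 0; col < DCTSIZE; col++)
;       workspace[row * DCTSIZE + col] =
;         (DCTELEM)sample_data[row][start_col + col] - CENTERJSAMPLE;
;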
    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 3
    push        rbx

    pxor        xmm6, xmm6              ; xmm6=(all 0's)
    pcmpeqw     xmm7, xmm7
    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
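    ; 0xFF80 is -128 (-CENTERJSAMPLE) as a signed 16-bit word, so adding xmm7
    ; below converts the zero-extended unsigned samples to signed values.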
    mov         rsi, r10
    mov         eax, r11d
    mov         rdi, r12
    mov         rcx, DCTSIZE/4
.convloop:
    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)

    mov         rbxp, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdxp, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
    movq        xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)

    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7
    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

    add         rsi, byte 4*SIZEOF_JSAMPROW
    add         rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
    dec         rcx
    jnz         short .convloop

    pop         rbx
    uncollect_args 3
    pop         rbp
    ret
; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
; "How to optimize for the Pentium family of microprocessors"
; (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;

%define RECIPROCAL(m, n, b) \
  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

; r10 = JCOEFPTR coef_block
; r11 = DCTELEM *divisors
; r12 = DCTELEM *workspace
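;
; Rough C-equivalent of the per-coefficient math, for reference only (an
; illustrative sketch, not the actual libjpeg-turbo C code; the layout of the
; reciprocal/correction/scale tables is defined by the encoder setup code):
;
;   sign = (x < 0) ? -1 : 0;
;   t = (unsigned)(abs(x) + correction[i]);
;   t = (t * (unsigned)reciprocal[i]) >> 16;   /* pmulhuw */
;   t = (t * (unsigned)scale[i]) >> 16;        /* pmulhuw */
;   coef_block[i] = (JCOEF)((t ^ sign) - sign);
;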
    align       32
    GLOBAL_FUNCTION(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 3

    mov         rsi, r12
    mov         rdx, r11
    mov         rdi, r10
    mov         rax, DCTSIZE2/32
.quantloop:
    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm0, xmm4
    movdqa      xmm1, xmm5
    movdqa      xmm2, xmm6
    movdqa      xmm3, xmm7
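    ; Arithmetic right shift by 15 turns each word into its sign mask:
    ; 0x0000 for non-negative values, 0xFFFF (-1) for negative values.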
    psraw       xmm4, (WORD_BIT-1)
    psraw       xmm5, (WORD_BIT-1)
    psraw       xmm6, (WORD_BIT-1)
    psraw       xmm7, (WORD_BIT-1)
    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;

    paddw       xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
    paddw       xmm1, XMMWORD [CORRECTION(1,0,rdx)]
    paddw       xmm2, XMMWORD [CORRECTION(2,0,rdx)]
    paddw       xmm3, XMMWORD [CORRECTION(3,0,rdx)]
    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
    pmulhuw     xmm0, XMMWORD [SCALE(0,0,rdx)]  ; scale
    pmulhuw     xmm1, XMMWORD [SCALE(1,0,rdx)]
    pmulhuw     xmm2, XMMWORD [SCALE(2,0,rdx)]
    pmulhuw     xmm3, XMMWORD [SCALE(3,0,rdx)]
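    ; Re-apply the saved signs: (t ^ mask) - mask negates t where mask is -1
    ; and leaves it unchanged where mask is 0.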
    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm0, xmm4
    psubw       xmm1, xmm5
    psubw       xmm2, xmm6
    psubw       xmm3, xmm7

    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

    add         rsi, byte 32*SIZEOF_DCTELEM
    add         rdx, byte 32*SIZEOF_DCTELEM
    add         rdi, byte 32*SIZEOF_JCOEF
    dec         rax
    jnz         near .quantloop

    uncollect_args 3
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32