; jcsample-avx2.asm
;
; jcsample.asm - downsampling (64-bit AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
  %include "jsimdext.inc"               ; project macros/constants used below

; --------------------------------------------------------------------------
; Everything that follows is 64-bit code placed in the text segment named by
; SEG_TEXT (defined outside this file).
  SECTION     SEG_TEXT
  BITS        64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; r10d = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
  align       32
  GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)

; --------------------------------------------------------------------------
; jsimd_h2v1_downsample_avx2 -- 2:1 horizontal, 1:1 vertical downsampling.
;
; Phase 1 (right-edge padding)
;   For each of the first max_v_samp_factor rows of input_data, the sample in
;   column image_width-1 is replicated into the columns that follow it until
;   the row holds 2 * output_cols samples (output_cols = width_in_blocks * 8).
;   The phase is skipped when 2 * output_cols <= image_width.
;
; Phase 2 (averaging)
;   For each of the first v_samp_factor rows:
;       out[i] = (in[2*i] + in[2*i+1] + bias) >> 1
;   where bias is 0 for even i and 1 for odd i.
;
; Argument registers (filled by the collect_args macro; its definition is in
; jsimdext.inc and not visible here, so this mapping is taken from the
; listing that accompanies this routine):
;   r10d = image_width       r11  = max_v_samp_factor
;   r12d = v_samp_factor     r13d = width_in_blocks
;   r14  = input_data        r15  = output_data
;
; Working registers:
;   rcx  = output_cols / output samples still to produce in the current row
;   rax  = row counter (both phases)
;   rdx  = image_width during phase 1
;   rsi  = input row-pointer cursor, or input sample cursor inside a row
;   rdi  = output row-pointer cursor, or output sample cursor inside a row
;   ymm6 = word mask 0x00FF, ymm7 = bias words {0,1,0,1,...}
;   ymm0-ymm3 = scratch
;
; NOTE(review): the tail paths below compare against the literals 24 and 16,
; which assumes SIZEOF_YMMWORD is 32 -- confirm in jsimdext.inc.
; NOTE(review): a whole YMMWORD is stored even when fewer output samples
; remain in the row, so output rows presumably have slack for it -- confirm
; against the buffer allocation.
; --------------------------------------------------------------------------
EXTN(jsimd_h2v1_downsample_avx2):
  push        rbp
  mov         rbp, rsp
  mov         rax, rsp                  ; NOTE(review): not read again by this
                                        ; routine; presumably consumed by
                                        ; collect_args -- confirm
  collect_args 6

  mov         ecx, r13d                 ; width_in_blocks (zero-extended)
  shl         rcx, 3                    ; rcx = output_cols
  jz          near .done                ; zero columns: nothing to do

  mov         edx, r10d                 ; rdx = image_width

  ; ---- Phase 1: replicate the last real sample to the right ---------------

  push        rcx                       ; output_cols is needed again later
  add         rcx, rcx                  ; rcx = output_cols * 2
  sub         rcx, rdx                  ; rcx = samples to append per row
  jle         short .pad_done           ; rows are already wide enough

  mov         rax, r11                  ; rax = rows to pad
  test        rax, rax
  jle         short .pad_done

  mov         rsi, r14                  ; rsi -> current input row pointer
  cld                                   ; stosb must walk upwards
.pad_row:
  push        rax                       ; al is about to carry the fill value
  push        rcx                       ; rep stosb consumes rcx

  mov         rdip, JSAMPROW [rsi]      ; rdi = start of this row
  add         rdi, rdx                  ; rdi = first column past the image
  mov         al, JSAMPLE [rdi-1]       ; al = last real sample of the row

  rep stosb                             ; append rcx copies of al

  pop         rcx
  pop         rax

  add         rsi, byte SIZEOF_JSAMPROW ; next row pointer
  dec         rax
  jg          short .pad_row

.pad_done:
  pop         rcx                       ; rcx = output_cols

  ; ---- Phase 2: average horizontal pairs ----------------------------------

  mov         eax, r12d                 ; rax = rows to produce
  test        eax, eax
  jle         near .done

  vpcmpeqw    ymm6, ymm6, ymm6          ; ymm6 = all ones
  vpsrlw      ymm6, ymm6, BYTE_BIT      ; ymm6 = {0xFF 0x00 0xFF 0x00 ..}

  mov         edx, 0x00010000           ; one dword = bias words 0 then 1
  vmovd       xmm7, edx
  vpshufd     xmm7, xmm7, 0x00          ; xmm7 = {0, 1, 0, 1, 0, 1, 0, 1}
  vperm2i128  ymm7, ymm7, ymm7, 0       ; same pattern in both 128-bit lanes

  mov         rdi, r15                  ; rdi -> current output row pointer
  mov         rsi, r14                  ; rsi -> current input row pointer
.next_row:
  push        rcx                       ; saved: the column loop consumes these
  push        rdi
  push        rsi

  mov         rdip, JSAMPROW [rdi]      ; rdi = output sample cursor
  mov         rsip, JSAMPROW [rsi]      ; rsi = input sample cursor

  cmp         rcx, byte SIZEOF_YMMWORD
  jae         short .full_chunk

  ; Fewer than SIZEOF_YMMWORD output samples remain.  Only the input that
  ; backs them is loaded; the rest of ymm0/ymm1 is zero (explicitly, or
  ; because a VEX xmm load clears the upper lane).  rcx is then forced to one
  ; full chunk so that the shared code below finishes the row.
.tail_24:
  cmp         rcx, 24
  jne         .tail_16
  mov         rcx, SIZEOF_YMMWORD
  vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
  vmovdqu     xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]
  jmp         short .average

.tail_16:
  cmp         rcx, 16
  jne         .tail_8
  mov         rcx, SIZEOF_YMMWORD
  vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
  vpxor       ymm1, ymm1, ymm1
  jmp         short .average

.tail_8:
  mov         rcx, SIZEOF_YMMWORD
  vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]
  vpxor       ymm1, ymm1, ymm1
  jmp         short .average

.full_chunk:
  vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
  vmovdqu     ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]

.average:
  ; Each 16-bit word holds one input pair: low byte = even column,
  ; high byte = odd column.
  vpsrlw      ymm2, ymm0, BYTE_BIT      ; odd-column samples as words
  vpsrlw      ymm3, ymm1, BYTE_BIT
  vpand       ymm0, ymm0, ymm6          ; even-column samples as words
  vpand       ymm1, ymm1, ymm6

  vpaddw      ymm0, ymm0, ymm2          ; even + odd
  vpaddw      ymm0, ymm0, ymm7          ; + bias
  vpsrlw      ymm0, ymm0, 1             ; / 2

  vpaddw      ymm1, ymm1, ymm3
  vpaddw      ymm1, ymm1, ymm7
  vpsrlw      ymm1, ymm1, 1

  vpackuswb   ymm0, ymm0, ymm1          ; words -> bytes, packed per lane
  vpermq      ymm0, ymm0, 0xd8          ; qwords 0,2,1,3: undo lane interleave

  vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

  add         rsi, byte 2*SIZEOF_YMMWORD ; input cursor
  add         rdi, byte 1*SIZEOF_YMMWORD ; output cursor
  sub         rcx, byte SIZEOF_YMMWORD  ; output samples left in this row
  cmp         rcx, byte SIZEOF_YMMWORD
  jae         short .full_chunk
  test        rcx, rcx
  jnz         near .tail_24             ; partial chunk left over

  pop         rsi
  pop         rdi
  pop         rcx

  add         rdi, byte SIZEOF_JSAMPROW ; next output row pointer
  add         rsi, byte SIZEOF_JSAMPROW ; next input row pointer
  dec         rax                       ; one row done
  jg          near .next_row

.done:
  vzeroupper                            ; leave no dirty upper ymm state
  uncollect_args 6
  pop         rbp
  ret
; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; r10d = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
  align       32
  GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)

; --------------------------------------------------------------------------
; jsimd_h2v2_downsample_avx2 -- 2:1 horizontal, 2:1 vertical downsampling.
;
; Phase 1 (right-edge padding)
;   For each of the first max_v_samp_factor rows of input_data, the sample in
;   column image_width-1 is replicated into the columns that follow it until
;   the row holds 2 * output_cols samples (output_cols = width_in_blocks * 8).
;   The phase is skipped when 2 * output_cols <= image_width.
;
; Phase 2 (averaging)
;   Each of the v_samp_factor output rows is built from two consecutive
;   input rows r0 and r1:
;       out[i] = (r0[2*i] + r0[2*i+1] + r1[2*i] + r1[2*i+1] + bias) >> 2
;   where bias is 1 for even i and 2 for odd i.
;
; Argument registers (filled by the collect_args macro; its definition is in
; jsimdext.inc and not visible here, so this mapping is taken from the
; listing that accompanies this routine):
;   r10d = image_width       r11  = max_v_samp_factor
;   r12d = v_samp_factor     r13d = width_in_blocks
;   r14  = input_data        r15  = output_data
;
; Working registers:
;   rcx  = output_cols / output samples still to produce in the current row
;   rax  = row counter (both phases)
;   rdx  = image_width during phase 1; upper-row sample cursor in phase 2
;   rsi  = input row-pointer cursor, or lower-row sample cursor inside a row
;   rdi  = output row-pointer cursor, or output sample cursor inside a row
;   ymm6 = word mask 0x00FF, ymm7 = bias words {1,2,1,2,...}
;   ymm0-ymm5 = scratch
;
; NOTE(review): the tail paths below compare against the literals 24 and 16,
; which assumes SIZEOF_YMMWORD is 32 -- confirm in jsimdext.inc.
; NOTE(review): a whole YMMWORD is stored even when fewer output samples
; remain in the row, so output rows presumably have slack for it -- confirm
; against the buffer allocation.
; --------------------------------------------------------------------------
EXTN(jsimd_h2v2_downsample_avx2):
  push        rbp
  mov         rbp, rsp
  mov         rax, rsp                  ; NOTE(review): not read again by this
                                        ; routine; presumably consumed by
                                        ; collect_args -- confirm
  collect_args 6

  mov         ecx, r13d                 ; width_in_blocks (zero-extended)
  shl         rcx, 3                    ; rcx = output_cols
  jz          near .done                ; zero columns: nothing to do

  mov         edx, r10d                 ; rdx = image_width

  ; ---- Phase 1: replicate the last real sample to the right ---------------

  push        rcx                       ; output_cols is needed again later
  add         rcx, rcx                  ; rcx = output_cols * 2
  sub         rcx, rdx                  ; rcx = samples to append per row
  jle         short .pad_done           ; rows are already wide enough

  mov         rax, r11                  ; rax = rows to pad
  test        rax, rax
  jle         short .pad_done

  mov         rsi, r14                  ; rsi -> current input row pointer
  cld                                   ; stosb must walk upwards
.pad_row:
  push        rax                       ; al is about to carry the fill value
  push        rcx                       ; rep stosb consumes rcx

  mov         rdip, JSAMPROW [rsi]      ; rdi = start of this row
  add         rdi, rdx                  ; rdi = first column past the image
  mov         al, JSAMPLE [rdi-1]       ; al = last real sample of the row

  rep stosb                             ; append rcx copies of al

  pop         rcx
  pop         rax

  add         rsi, byte SIZEOF_JSAMPROW ; next row pointer
  dec         rax
  jg          short .pad_row

.pad_done:
  pop         rcx                       ; rcx = output_cols

  ; ---- Phase 2: average 2x2 neighbourhoods --------------------------------

  mov         eax, r12d                 ; rax = output rows to produce
  test        rax, rax
  jle         near .done

  mov         edx, 0x00020001           ; one dword = bias words 1 then 2
  vmovd       xmm7, edx
  vpshufd     xmm7, xmm7, 0x00          ; xmm7 = {1, 2, 1, 2, 1, 2, 1, 2}
  vperm2i128  ymm7, ymm7, ymm7, 0       ; same pattern in both 128-bit lanes

  vpcmpeqw    ymm6, ymm6, ymm6          ; ymm6 = all ones
  vpsrlw      ymm6, ymm6, BYTE_BIT      ; ymm6 = {0xFF 0x00 0xFF 0x00 ..}

  mov         rdi, r15                  ; rdi -> current output row pointer
  mov         rsi, r14                  ; rsi -> current input row-pointer pair
.next_row:
  push        rcx                       ; saved: the column loop consumes these
  push        rdi
  push        rsi

  mov         rdip, JSAMPROW [rdi]                    ; output sample cursor
  mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; upper input row
  mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; lower input row
                                                      ; (loaded last: it
                                                      ; overwrites rsi)

  cmp         rcx, byte SIZEOF_YMMWORD
  jae         short .full_chunk

  ; Fewer than SIZEOF_YMMWORD output samples remain.  Only the input that
  ; backs them is loaded; the rest of ymm0-ymm3 is zero (explicitly, or
  ; because a VEX xmm load clears the upper lane).  rcx is then forced to one
  ; full chunk so that the shared code below finishes the row.
.tail_24:
  cmp         rcx, 24
  jne         .tail_16
  mov         rcx, SIZEOF_YMMWORD
  vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
  vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
  vmovdqu     xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]
  vmovdqu     xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
  jmp         short .average

.tail_16:
  cmp         rcx, 16
  jne         .tail_8
  mov         rcx, SIZEOF_YMMWORD
  vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
  vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
  vpxor       ymm2, ymm2, ymm2
  vpxor       ymm3, ymm3, ymm3
  jmp         short .average

.tail_8:
  mov         rcx, SIZEOF_YMMWORD
  vmovdqu     xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
  vmovdqu     xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
  vpxor       ymm2, ymm2, ymm2
  vpxor       ymm3, ymm3, ymm3
  jmp         short .average

.full_chunk:
  vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
  vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
  vmovdqu     ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
  vmovdqu     ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]

.average:
  ; Each 16-bit word holds one horizontal pair: low byte = even column,
  ; high byte = odd column.  ymm0/ymm2 come from the upper row, ymm1/ymm3
  ; from the lower row.

  ; first half of the chunk (ymm0, ymm1 -> ymm0)
  vpand       ymm4, ymm0, ymm6          ; upper row, even columns
  vpand       ymm5, ymm1, ymm6          ; lower row, even columns
  vpsrlw      ymm0, ymm0, BYTE_BIT      ; upper row, odd columns
  vpsrlw      ymm1, ymm1, BYTE_BIT      ; lower row, odd columns
  vpaddw      ymm0, ymm0, ymm4          ; upper-row pair sums
  vpaddw      ymm1, ymm1, ymm5          ; lower-row pair sums
  vpaddw      ymm0, ymm0, ymm1          ; 2x2 sums
  vpaddw      ymm0, ymm0, ymm7          ; + bias
  vpsrlw      ymm0, ymm0, 2             ; / 4

  ; second half of the chunk (ymm2, ymm3 -> ymm2)
  vpand       ymm4, ymm2, ymm6
  vpand       ymm5, ymm3, ymm6
  vpsrlw      ymm2, ymm2, BYTE_BIT
  vpsrlw      ymm3, ymm3, BYTE_BIT
  vpaddw      ymm2, ymm2, ymm4
  vpaddw      ymm3, ymm3, ymm5
  vpaddw      ymm2, ymm2, ymm3
  vpaddw      ymm2, ymm2, ymm7
  vpsrlw      ymm2, ymm2, 2

  vpackuswb   ymm0, ymm0, ymm2          ; words -> bytes, packed per lane
  vpermq      ymm0, ymm0, 0xd8          ; qwords 0,2,1,3: undo lane interleave

  vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

  add         rdx, byte 2*SIZEOF_YMMWORD ; upper-row cursor
  add         rsi, byte 2*SIZEOF_YMMWORD ; lower-row cursor
  add         rdi, byte 1*SIZEOF_YMMWORD ; output cursor
  sub         rcx, byte SIZEOF_YMMWORD  ; output samples left in this row
  cmp         rcx, byte SIZEOF_YMMWORD
  jae         near .full_chunk
  test        rcx, rcx
  jnz         near .tail_24             ; partial chunk left over

  pop         rsi
  pop         rdi
  pop         rcx

  add         rdi, byte 1*SIZEOF_JSAMPROW ; next output row pointer
  add         rsi, byte 2*SIZEOF_JSAMPROW ; skip the two input rows just used
  dec         rax                       ; one output row done
  jg          near .next_row

.done:
  vzeroupper                            ; leave no dirty upper ymm state
  uncollect_args 6
  pop         rbp
  ret
  ; Trailing pad: per the original note, the OS X linker does not honor the
  ; segment alignment request unless the segment also ends with an alignment
  ; directive.
  align       32