jidctfst-neon.c

/*
 * jidctfst-neon.c - fast integer IDCT (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"

#include <arm_neon.h>

/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
 * inverse DCT (Discrete Cosine Transform) on one block of coefficients. It
 * uses the same calculations and produces exactly the same output as IJG's
 * original jpeg_idct_ifast() function, which can be found in jidctfst.c.
 *
 * Scaled integer constants are used to avoid floating-point arithmetic:
 *    0.082392200 =  2688 * 2^-15
 *    0.414213562 = 13568 * 2^-15
 *    0.847759065 = 27776 * 2^-15
 *    0.613125930 = 20096 * 2^-15
 *
 * See jidctfst.c for further details of the IDCT algorithm. Where possible,
 * the variable names and comments here in jsimd_idct_ifast_neon() match up
 * with those in jpeg_idct_ifast().
 */

#define PASS1_BITS  2

#define F_0_082  2688
#define F_0_414  13568
#define F_0_847  27776
#define F_0_613  20096

ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = {
  F_0_082, F_0_414, F_0_847, F_0_613
};
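
/* Note on the fixed-point multiplies used below: vqdmulh_lane_s16(x, consts,
 * n) returns the high half of the doubling product, i.e. (2 * x * consts[n])
 * >> 16, which is approximately x * consts[n] * 2^-15.  A multiplication by a
 * constant greater than 1 is therefore split into a vqdmulh by the fractional
 * part plus one or two additions of x; for example, 1.414213562 * x is
 * computed as vqdmulh_lane_s16(x, consts, 1) + x.
 */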

void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
                           JSAMPARRAY output_buf, JDIMENSION output_col)
{
  IFAST_MULT_TYPE *quantptr = dct_table;

  /* Load DCT coefficients. */
  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
  int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
  int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
  int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
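
  /* Each rowN vector holds the eight coefficients of block row N, so each
   * vector lane corresponds to one column of the block.  Pass 1 below is the
   * 1-D IDCT down the columns, performed independently in every lane.
   */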

  /* Load quantization table values for DC coefficients. */
  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
  /* Dequantize DC coefficients. */
  row0 = vmulq_s16(row0, quant_row0);

  /* Construct bitmap to test if all AC coefficients are 0. */
  int16x8_t bitmap = vorrq_s16(row1, row2);
  bitmap = vorrq_s16(bitmap, row3);
  bitmap = vorrq_s16(bitmap, row4);
  bitmap = vorrq_s16(bitmap, row5);
  bitmap = vorrq_s16(bitmap, row6);
  bitmap = vorrq_s16(bitmap, row7);

  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
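
  /* Each 64-bit half of the bitmap covers four lanes, i.e. four columns.  If
   * a half is zero, every AC coefficient in columns 0-3 (left) or 4-7 (right)
   * is zero, and the column pass for those columns can be skipped below.
   */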

  /* Load IDCT conversion constants. */
  const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);

  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
    /* All AC coefficients are zero.
     * Compute DC values and duplicate into vectors.
     */
    int16x8_t dcval = row0;
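
    /* (The 1-D IDCT of a column with no AC energy is constant, so every pass
     * 1 output row is simply that column's dequantized DC value.)
     */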
    row1 = dcval;
    row2 = dcval;
    row3 = dcval;
    row4 = dcval;
    row5 = dcval;
    row6 = dcval;
    row7 = dcval;
  } else if (left_ac_bitmap == 0) {
    /* AC coefficients are zero for columns 0, 1, 2, and 3.
     * Use DC values for these columns.
     */
    int16x4_t dcval = vget_low_s16(row0);

    /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */

    /* Load quantization table. */
    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);

    /* Even part: dequantize DCT coefficients. */
    int16x4_t tmp0 = vget_high_s16(row0);
    int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
    int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
    int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);

    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);

    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
    tmp12 = vsub_s16(tmp12, tmp13);
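
    /* tmp12 = 1.414213562 * (tmp1 - tmp3) - tmp13, i.e. the FIX_1_414213562
     * multiply in jpeg_idct_ifast(), using the vqdmulh-plus-add idiom
     * described above.
     */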

    tmp0 = vadd_s16(tmp10, tmp13);            /* phase 2 */
    tmp3 = vsub_s16(tmp10, tmp13);
    tmp1 = vadd_s16(tmp11, tmp12);
    tmp2 = vsub_s16(tmp11, tmp12);

    /* Odd part: dequantize DCT coefficients. */
    int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
    int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
    int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
    int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);

    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
    int16x4_t z11 = vadd_s16(tmp4, tmp7);
    int16x4_t z12 = vsub_s16(tmp4, tmp7);

    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
    tmp11 = vadd_s16(tmp11, z11_sub_z13);

    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
    z5 = vadd_s16(z5, z10_add_z12);
    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
    tmp10 = vadd_s16(tmp10, z12);
    tmp10 = vsub_s16(tmp10, z5);
    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
    tmp12 = vadd_s16(tmp12, z5);
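
    /* The rotations above are, in fixed-point terms:
     *   tmp11 = 1.414213562 * (z11 - z13)
     *   z5    = 1.847759065 * (z10 + z12)     (z10 + z12 == z12 - neg_z10)
     *   tmp10 = 1.082392200 * z12 - z5
     *   tmp12 = -2.613125930 * z10 + z5, computed as
     *           0.613125930 * neg_z10 + 2 * neg_z10 + z5 with neg_z10 = -z10.
     */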

    tmp6 = vsub_s16(tmp12, tmp7);             /* phase 2 */
    tmp5 = vsub_s16(tmp11, tmp6);
    tmp4 = vadd_s16(tmp10, tmp5);

    row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
    row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
    row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
    row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
    row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
    row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
    row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
    row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
  } else if (right_ac_bitmap == 0) {
    /* AC coefficients are zero for columns 4, 5, 6, and 7.
     * Use DC values for these columns.
     */
    int16x4_t dcval = vget_high_s16(row0);

    /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */

    /* Load quantization table. */
    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);

    /* Even part: dequantize DCT coefficients. */
    int16x4_t tmp0 = vget_low_s16(row0);
    int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
    int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
    int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);

    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);

    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
    tmp12 = vsub_s16(tmp12, tmp13);

    tmp0 = vadd_s16(tmp10, tmp13);            /* phase 2 */
    tmp3 = vsub_s16(tmp10, tmp13);
    tmp1 = vadd_s16(tmp11, tmp12);
    tmp2 = vsub_s16(tmp11, tmp12);

    /* Odd part: dequantize DCT coefficients. */
    int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
    int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
    int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
    int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);

    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
    int16x4_t z11 = vadd_s16(tmp4, tmp7);
    int16x4_t z12 = vsub_s16(tmp4, tmp7);

    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
    tmp11 = vadd_s16(tmp11, z11_sub_z13);

    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
    z5 = vadd_s16(z5, z10_add_z12);
    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
    tmp10 = vadd_s16(tmp10, z12);
    tmp10 = vsub_s16(tmp10, z5);
    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
    tmp12 = vadd_s16(tmp12, z5);

    tmp6 = vsub_s16(tmp12, tmp7);             /* phase 2 */
    tmp5 = vsub_s16(tmp11, tmp6);
    tmp4 = vadd_s16(tmp10, tmp5);

    row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
    row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
    row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
    row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
    row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
    row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
    row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
    row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
  } else {
    /* Some AC coefficients are non-zero; full IDCT calculation required. */

    /* Load quantization table. */
    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
    int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);

    /* Even part: dequantize DCT coefficients. */
    int16x8_t tmp0 = row0;
    int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
    int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
    int16x8_t tmp3 = vmulq_s16(row6, quant_row6);

    int16x8_t tmp10 = vaddq_s16(tmp0, tmp2);  /* phase 3 */
    int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);

    int16x8_t tmp13 = vaddq_s16(tmp1, tmp3);  /* phases 5-3 */
    int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
    int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
    tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
    tmp12 = vsubq_s16(tmp12, tmp13);

    tmp0 = vaddq_s16(tmp10, tmp13);           /* phase 2 */
    tmp3 = vsubq_s16(tmp10, tmp13);
    tmp1 = vaddq_s16(tmp11, tmp12);
    tmp2 = vsubq_s16(tmp11, tmp12);

    /* Odd part: dequantize DCT coefficients. */
    int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
    int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
    int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
    int16x8_t tmp7 = vmulq_s16(row7, quant_row7);

    int16x8_t z13 = vaddq_s16(tmp6, tmp5);    /* phase 6 */
    int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
    int16x8_t z11 = vaddq_s16(tmp4, tmp7);
    int16x8_t z12 = vsubq_s16(tmp4, tmp7);

    tmp7 = vaddq_s16(z11, z13);               /* phase 5 */
    int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
    tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
    tmp11 = vaddq_s16(tmp11, z11_sub_z13);

    int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
    int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
    z5 = vaddq_s16(z5, z10_add_z12);
    tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
    tmp10 = vaddq_s16(tmp10, z12);
    tmp10 = vsubq_s16(tmp10, z5);
    tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
    tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
    tmp12 = vaddq_s16(tmp12, z5);

    tmp6 = vsubq_s16(tmp12, tmp7);            /* phase 2 */
    tmp5 = vsubq_s16(tmp11, tmp6);
    tmp4 = vaddq_s16(tmp10, tmp5);

    row0 = vaddq_s16(tmp0, tmp7);
    row7 = vsubq_s16(tmp0, tmp7);
    row1 = vaddq_s16(tmp1, tmp6);
    row6 = vsubq_s16(tmp1, tmp6);
    row2 = vaddq_s16(tmp2, tmp5);
    row5 = vsubq_s16(tmp2, tmp5);
    row4 = vaddq_s16(tmp3, tmp4);
    row3 = vsubq_s16(tmp3, tmp4);
  }

  /* Transpose rows to work on columns in pass 2. */
  int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
  int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
  int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
  int16x8x2_t rows_67 = vtrnq_s16(row6, row7);

  int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
                                      vreinterpretq_s32_s16(rows_45.val[0]));
  int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
                                      vreinterpretq_s32_s16(rows_45.val[1]));
  int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
                                      vreinterpretq_s32_s16(rows_67.val[0]));
  int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
                                      vreinterpretq_s32_s16(rows_67.val[1]));

  int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
  int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
  int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
  int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);

  int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
  int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
  int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
  int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
  int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
  int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
  int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
  int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
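
  /* The 16-bit trn, 32-bit trn, and 32-bit zip operations above together form
   * a full 8x8 transpose of the 16-bit block, so pass 2 can apply the same
   * per-lane butterflies along the other dimension of the block.
   */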

  /* 1-D IDCT, pass 2 */

  /* Even part */
  int16x8_t tmp10 = vaddq_s16(col0, col4);
  int16x8_t tmp11 = vsubq_s16(col0, col4);

  int16x8_t tmp13 = vaddq_s16(col2, col6);
  int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
  int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
  tmp12 = vaddq_s16(tmp12, col2_sub_col6);
  tmp12 = vsubq_s16(tmp12, tmp13);

  int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
  int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
  int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
  int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);

  /* Odd part */
  int16x8_t z13 = vaddq_s16(col5, col3);
  int16x8_t neg_z10 = vsubq_s16(col3, col5);
  int16x8_t z11 = vaddq_s16(col1, col7);
  int16x8_t z12 = vsubq_s16(col1, col7);

  int16x8_t tmp7 = vaddq_s16(z11, z13);       /* phase 5 */
  int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
  tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
  tmp11 = vaddq_s16(tmp11, z11_sub_z13);

  int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
  int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
  z5 = vaddq_s16(z5, z10_add_z12);
  tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
  tmp10 = vaddq_s16(tmp10, z12);
  tmp10 = vsubq_s16(tmp10, z5);
  tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
  tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
  tmp12 = vaddq_s16(tmp12, z5);

  int16x8_t tmp6 = vsubq_s16(tmp12, tmp7);    /* phase 2 */
  int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
  int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);

  col0 = vaddq_s16(tmp0, tmp7);
  col7 = vsubq_s16(tmp0, tmp7);
  col1 = vaddq_s16(tmp1, tmp6);
  col6 = vsubq_s16(tmp1, tmp6);
  col2 = vaddq_s16(tmp2, tmp5);
  col5 = vsubq_s16(tmp2, tmp5);
  col4 = vaddq_s16(tmp3, tmp4);
  col3 = vsubq_s16(tmp3, tmp4);

  /* Scale down by a factor of 8, narrowing to 8-bit. */
  int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
                                     vqshrn_n_s16(col1, PASS1_BITS + 3));
  int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
                                     vqshrn_n_s16(col5, PASS1_BITS + 3));
  int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
                                     vqshrn_n_s16(col3, PASS1_BITS + 3));
  int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
                                     vqshrn_n_s16(col7, PASS1_BITS + 3));
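
  /* vqshrn_n_s16() performs a saturating narrowing shift right by
   * PASS1_BITS + 3 = 5 bits, the same descale that jpeg_idct_ifast() applies
   * at the end of its second pass, and clamps each result to [-128, 127].
   */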

  /* Clamp to range [0-255]. */
  uint8x16_t cols_01 =
    vreinterpretq_u8_s8
      (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
  uint8x16_t cols_45 =
    vreinterpretq_u8_s8
      (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
  uint8x16_t cols_23 =
    vreinterpretq_u8_s8
      (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
  uint8x16_t cols_67 =
    vreinterpretq_u8_s8
      (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
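
  /* Since the values were already saturated to [-128, 127], adding
   * CENTERJSAMPLE (128) with 8-bit wraparound and reinterpreting the result
   * as unsigned maps them onto [0, 255]: the level shift and range limit in
   * one step.
   */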

  /* Transpose block to prepare for store. */
  uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
                                     vreinterpretq_u32_u8(cols_45));
  uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
                                     vreinterpretq_u32_u8(cols_67));
  uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
                                    vreinterpretq_u8_u32(cols_0415.val[1]));
  uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
                                    vreinterpretq_u8_u32(cols_2637.val[1]));
  uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
                                     vreinterpretq_u16_u8(cols_2367.val[0]));
  uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
                                     vreinterpretq_u16_u8(cols_2367.val[1]));

  uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
  uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
  uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
  uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
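
  /* rows_04 now holds output row 0 in its low 64 bits and row 4 in its high
   * 64 bits (and likewise rows_15, rows_26, and rows_37), which is why the
   * stores below write lane 0 to rows 0-3 and lane 1 to rows 4-7.
   */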

  JSAMPROW outptr0 = output_buf[0] + output_col;
  JSAMPROW outptr1 = output_buf[1] + output_col;
  JSAMPROW outptr2 = output_buf[2] + output_col;
  JSAMPROW outptr3 = output_buf[3] + output_col;
  JSAMPROW outptr4 = output_buf[4] + output_col;
  JSAMPROW outptr5 = output_buf[5] + output_col;
  JSAMPROW outptr6 = output_buf[6] + output_col;
  JSAMPROW outptr7 = output_buf[7] + output_col;

  /* Store DCT block to memory. */
  vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
  vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
  vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
  vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
  vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
  vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
  vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
  vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
}