/*
 * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
 *
 * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "jconfigint.h"
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "neon-compat.h"

#include <arm_neon.h>
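
/* FALLTHROUGH and the BUILTIN_CLZ()/BUILTIN_CLZLL() macros used below are
 * assumed to come from the compatibility headers included above
 * (jconfigint.h and neon-compat.h.)
 */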

/* Data preparation for encode_mcu_AC_first().
 *
 * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
 * found in jcphuff.c.
 */

void jsimd_encode_mcu_AC_first_prepare_neon
  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
   JCOEF *values, size_t *zerobits)
{
  JCOEF *values_ptr = values;
  JCOEF *diff_values_ptr = values + DCTSIZE2;

  /* Rows of coefficients to zero (since they haven't been processed) */
  int i, rows_to_zero = 8;

  for (i = 0; i < Sl / 16; i++) {
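    /* Load 16 coefficients per iteration.  The loads are done one lane at a
     * time because jpeg_natural_order_start[] indexes the coefficients in
     * zigzag scan order, so consecutive scan positions are not contiguous in
     * memory.
     */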
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);

    /* Isolate sign of coefficients. */
    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));

    /* Compute diff values. */
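    /* The sign mask is all ones for a negative coefficient and all zeros
     * otherwise, so the XOR below leaves non-negative values unchanged and
     * produces the one's complement of negative values, mirroring the scalar
     * encode_mcu_AC_first_prepare() in jcphuff.c.
     */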
    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);

    /* Store transformed coefficients and diff values. */
    vst1q_s16(values_ptr, coefs1);
    vst1q_s16(values_ptr + DCTSIZE, coefs2);
    vst1q_s16(diff_values_ptr, diff1);
    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
    values_ptr += 16;
    diff_values_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;
  }

  /* Same operation but for remaining partial vector */
  int remaining_coefs = Sl % 16;
  if (remaining_coefs > 8) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vdupq_n_s16(0);
    switch (remaining_coefs) {
    case 15:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 14:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 13:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 12:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 11:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 10:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 9:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }
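    /* Lanes of coefs2 that were not loaded above remain zero and are thus
     * treated as zero coefficients when the zerobits bitmap is constructed
     * below.
     */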

    /* Isolate sign of coefficients. */
    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));

    /* Compute diff values. */
    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);

    /* Store transformed coefficients and diff values. */
    vst1q_s16(values_ptr, coefs1);
    vst1q_s16(values_ptr + DCTSIZE, coefs2);
    vst1q_s16(diff_values_ptr, diff1);
    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
    values_ptr += 16;
    diff_values_ptr += 16;
    rows_to_zero -= 2;
  } else if (remaining_coefs > 0) {
    int16x8_t coefs = vdupq_n_s16(0);
    switch (remaining_coefs) {
    case 8:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
      FALLTHROUGH /*FALLTHROUGH*/
    case 7:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 6:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 5:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 4:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 3:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 2:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 1:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }

    /* Isolate sign of coefficients. */
    int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs = vabsq_s16(coefs);
    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));

    /* Compute diff values. */
    int16x8_t diff = veorq_s16(coefs, sign_coefs);

    /* Store transformed coefficients and diff values. */
    vst1q_s16(values_ptr, coefs);
    vst1q_s16(diff_values_ptr, diff);
    values_ptr += 8;
    diff_values_ptr += 8;
    rows_to_zero--;
  }

  /* Zero remaining memory in the values and diff_values blocks. */
  for (i = 0; i < rows_to_zero; i++) {
    vst1q_s16(values_ptr, vdupq_n_s16(0));
    vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
    values_ptr += 8;
    diff_values_ptr += 8;
  }

  /* Construct zerobits bitmap. A set bit means that the corresponding
   * coefficient != 0.
   */
  int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
  int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
  int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
  int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
  int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
  int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
  int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
  int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);

  uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
  uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
  uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
  uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
  uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
  uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
  uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
  uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));

  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
  const uint8x8_t bitmap_mask =
    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));

  row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
  row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
  row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
  row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
  row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
  row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
  row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
  row7_eq0 = vand_u8(row7_eq0, bitmap_mask);

  uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
  uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
  uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
  uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
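  /* The three rounds of pairwise addition above collapse the masked bytes of
   * each row into a single bitmap byte per row, so (on a little-endian
   * target) bit k of the resulting 64-bit value corresponds to values[k].
   * The bits are set for zero coefficients at this point, hence the
   * inversion before the bitmap is stored.
   */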
#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store zerobits bitmap. */
  *zerobits = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store zerobits bitmap. */
  zerobits[0] = ~bitmap0;
  zerobits[1] = ~bitmap1;
#endif
}

/* Data preparation for encode_mcu_AC_refine().
 *
 * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
 * found in jcphuff.c.
 */

int jsimd_encode_mcu_AC_refine_prepare_neon
  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
   JCOEF *absvalues, size_t *bits)
{
  /* Temporary storage buffers for data used to compute the signbits bitmap
   * and the end-of-block (EOB) position
   */
  uint8_t coef_sign_bits[64];
  uint8_t coef_eq1_bits[64];

  JCOEF *absvalues_ptr = absvalues;
  uint8_t *coef_sign_bits_ptr = coef_sign_bits;
  uint8_t *eq1_bits_ptr = coef_eq1_bits;

  /* Rows of coefficients to zero (since they haven't been processed) */
  int i, rows_to_zero = 8;

  for (i = 0; i < Sl / 16; i++) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);

    /* Compute and store data for signbits bitmap. */
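    /* The arithmetic shift right by 15 yields all ones for a negative
     * coefficient and all zeros otherwise; the result is narrowed to one byte
     * per coefficient for the bitmap reduction at the end of this function.
     */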
    uint8x8_t sign_coefs1 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
    uint8x8_t sign_coefs2 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);

    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
    vst1q_s16(absvalues_ptr, coefs1);
    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
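    /* In a refinement scan, a point-transformed magnitude of exactly 1 marks
     * a coefficient that first becomes nonzero in this scan; the EOB position
     * returned at the end of this function is derived from the last such
     * coefficient.
     */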
    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq11);
    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);

    absvalues_ptr += 16;
    coef_sign_bits_ptr += 16;
    eq1_bits_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;
  }

  /* Same operation but for remaining partial vector */
  int remaining_coefs = Sl % 16;
  if (remaining_coefs > 8) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vdupq_n_s16(0);
    switch (remaining_coefs) {
    case 15:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 14:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 13:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 12:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 11:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 10:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 9:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }

    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs1 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
    uint8x8_t sign_coefs2 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);

    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
    vst1q_s16(absvalues_ptr, coefs1);
    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq11);
    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);

    absvalues_ptr += 16;
    coef_sign_bits_ptr += 16;
    eq1_bits_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;
  } else if (remaining_coefs > 0) {
    int16x8_t coefs = vdupq_n_s16(0);
    switch (remaining_coefs) {
    case 8:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
      FALLTHROUGH /*FALLTHROUGH*/
    case 7:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 6:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 5:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 4:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 3:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 2:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 1:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }

    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs);

    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs = vabsq_s16(coefs);
    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
    vst1q_s16(absvalues_ptr, coefs);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq1);

    absvalues_ptr += 8;
    coef_sign_bits_ptr += 8;
    eq1_bits_ptr += 8;
    rows_to_zero--;
  }

  /* Zero remaining memory in blocks. */
  for (i = 0; i < rows_to_zero; i++) {
    vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
    vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
    vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
    absvalues_ptr += 8;
    coef_sign_bits_ptr += 8;
    eq1_bits_ptr += 8;
  }

  /* Construct zerobits bitmap. */
  int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
  int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
  int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
  int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
  int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
  int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
  int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
  int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);

  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));

  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
  const uint8x8_t bitmap_mask =
    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));

  abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
  abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
  abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
  abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
  abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
  abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
  abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
  abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);

  uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
  uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
  uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
  uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
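  /* Same pairwise reduction as in jsimd_encode_mcu_AC_first_prepare_neon()
   * above: one bitmap byte per row, with bit k of the 64-bit result
   * corresponding to absvalues[k].
   */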
#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store zerobits bitmap. */
  bits[0] = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store zerobits bitmap. */
  bits[0] = ~bitmap0;
  bits[1] = ~bitmap1;
#endif

  /* Construct signbits bitmap. */
  uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
  uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
  uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
  uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
  uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
  uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
  uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
  uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);

  signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
  signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
  signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
  signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
  signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
  signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
  signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
  signbits_row7 = vand_u8(signbits_row7, bitmap_mask);

  bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
  bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
  bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
  bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
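  /* The sign bytes are all ones for negative coefficients, so after the
   * inversion below a set bit in the stored signbits bitmap corresponds to a
   * non-negative coefficient.
   */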
#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store signbits bitmap. */
  bits[1] = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store signbits bitmap. */
  bits[2] = ~bitmap0;
  bits[3] = ~bitmap1;
#endif

  /* Construct bitmap to find EOB position (the index of the last coefficient
   * equal to 1.)
   */
  uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
  uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
  uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
  uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
  uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
  uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
  uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
  uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);

  row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
  row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
  row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
  row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
  row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
  row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
  row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
  row7_eq1 = vand_u8(row7_eq1, bitmap_mask);

  bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
  bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
  bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
  bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
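  /* The index of the highest set bit in the bitmap gives the position of the
   * last coefficient whose point-transformed magnitude is 1; the CLZ-based
   * expressions below compute that index.
   */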
#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Return EOB position. */
  if (bitmap == 0) {
    /* EOB position is defined to be 0 if all coefficients != 1. */
    return 0;
  } else {
    return 63 - BUILTIN_CLZLL(bitmap);
  }
#else
  /* Move bitmap to two 32-bit scalar registers. */
  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Return EOB position. */
  if (bitmap0 == 0 && bitmap1 == 0) {
    return 0;
  } else if (bitmap1 != 0) {
    return 63 - BUILTIN_CLZ(bitmap1);
  } else {
    return 31 - BUILTIN_CLZ(bitmap0);
  }
#endif
}