/*
 * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
 * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software.  If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jdmerge-neon.c. */


/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
 * chroma upsampling and YCbCr -> RGB color conversion into a single function.
 *
 * As with the standalone functions, YCbCr -> RGB conversion is defined by the
 * following equations:
 *    R = Y                        + 1.40200 * (Cr - 128)
 *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *    B = Y + 1.77200 * (Cb - 128)
 *
 * Scaled integer constants are used to avoid floating-point arithmetic:
 *    0.3441467 = 11277 * 2^-15
 *    0.7141418 = 23401 * 2^-15
 *    1.4020386 = 22971 * 2^-14
 *    1.7720337 = 29033 * 2^-14
 * These constants are defined in jdmerge-neon.c.
 *
 * To ensure correct results, rounding is used when descaling.
 */
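
/* For illustration only: a minimal scalar sketch of the same fixed-point
 * arithmetic, assuming 8-bit samples.  The helper name ycc_to_rgb_scalar is
 * hypothetical and not part of the library; it simply mirrors the vector
 * code below, where each product is descaled with rounding by adding half of
 * the scale factor before the arithmetic right shift.
 *
 *   static inline void ycc_to_rgb_scalar(int y, int cb, int cr,
 *                                        int *r, int *g, int *b)
 *   {
 *     int cb_128 = cb - 128, cr_128 = cr - 128;
 *     int r_sub_y = (22971 * cr_128 + (1 << 13)) >> 14;    // 1.40200 * 2^14
 *     int g_sub_y = (-11277 * cb_128 - 23401 * cr_128 +
 *                    (1 << 14)) >> 15;                      // 2^15-scaled
 *     int b_sub_y = (29033 * cb_128 + (1 << 13)) >> 14;    // 1.77200 * 2^14
 *     *r = y + r_sub_y;          // results still need clamping to [0, 255],
 *     *g = y + g_sub_y;          //   which the vector code performs with
 *     *b = y + b_sub_y;          //   vqmovun_s16
 *   }
 */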

/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
 * routines:
 *
 * Input memory buffers can be safely overread up to the next multiple of
 * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
 * jmemmgr.c.
 *
 * The output buffer cannot safely be written beyond output_width, since
 * output_buf points to a possibly unpadded row in the decompressed image
 * buffer allocated by the calling program.
 */


/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
 */

void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
                                     JSAMPIMAGE input_buf,
                                     JDIMENSION in_row_group_ctr,
                                     JSAMPARRAY output_buf)
{
  JSAMPROW outptr;
  /* Pointers to Y, Cb, and Cr data */
  JSAMPROW inptr0, inptr1, inptr2;

  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
  const int16x8_t neg_128 = vdupq_n_s16(-128);

  inptr0 = input_buf[0][in_row_group_ctr];
  inptr1 = input_buf[1][in_row_group_ctr];
  inptr2 = input_buf[2][in_row_group_ctr];
  outptr = output_buf[0];

  int cols_remaining = output_width;
  for (; cols_remaining >= 16; cols_remaining -= 16) {
    /* De-interleave Y component values into two separate vectors, one
     * containing the component values with even-numbered indices and one
     * containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y = vld2_u8(inptr0);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
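    /* Note: vaddw_u8() widens each unsigned Cb/Cr byte to 16 bits and adds it
     * to the splatted -128, so the vreinterpretq casts are bitwise no-ops and
     * the result is the signed value (C - 128) without a separate
     * widen-then-subtract step.
     */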
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
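    /* Note: VQRDMULH computes (2 * a * b + 2^15) >> 16 per lane, so doubling
     * the chroma difference first yields ((C - 128) * const + 2^13) >> 14,
     * i.e. a rounded descale of the 2^14-scaled constant in a single
     * instruction.
     */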
    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
     * "odd" Y component values.  This effectively upsamples the chroma
     * components horizontally.
     */
    int16x8_t g_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[0]));
    int16x8_t r_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[0]));
    int16x8_t b_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[0]));
    int16x8_t g_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[1]));
    int16x8_t r_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[1]));
    int16x8_t b_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));

#ifdef RGB_ALPHA
    uint8x16x4_t rgba;
    rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
    rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
    rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
    /* Set alpha channel to opaque (0xFF). */
    rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    vst4q_u8(outptr, rgba);
#else
    uint8x16x3_t rgb;
    rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
    rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
    rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
    /* Store RGB pixel data to memory. */
    vst3q_u8(outptr, rgb);
#endif

    /* Increment pointers. */
    inptr0 += 16;
    inptr1 += 8;
    inptr2 += 8;
    outptr += (RGB_PIXELSIZE * 16);
  }
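
  /* Handle the tail of the row: up to 15 pixels remain.  The vectors are
   * computed as above, but the results are stored one pixel (lane) at a time
   * so that no bytes are written beyond output_width.
   */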
  if (cols_remaining > 0) {
    /* De-interleave Y component values into two separate vectors, one
     * containing the component values with even-numbered indices and one
     * containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y = vld2_u8(inptr0);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
     * "odd" Y component values.  This effectively upsamples the chroma
     * components horizontally.
     */
    int16x8_t g_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[0]));
    int16x8_t r_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[0]));
    int16x8_t b_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[0]));
    int16x8_t g_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[1]));
    int16x8_t r_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[1]));
    int16x8_t b_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));

#ifdef RGB_ALPHA
    uint8x8x4_t rgba_h;
    rgba_h.val[RGB_RED] = r.val[1];
    rgba_h.val[RGB_GREEN] = g.val[1];
    rgba_h.val[RGB_BLUE] = b.val[1];
    /* Set alpha channel to opaque (0xFF). */
    rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    uint8x8x4_t rgba_l;
    rgba_l.val[RGB_RED] = r.val[0];
    rgba_l.val[RGB_GREEN] = g.val[0];
    rgba_l.val[RGB_BLUE] = b.val[0];
    /* Set alpha channel to opaque (0xFF). */
    rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    switch (cols_remaining) {
    case 15:
      vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
      FALLTHROUGH
    case 14:
      vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
      FALLTHROUGH
    case 13:
      vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
      FALLTHROUGH
    case 12:
      vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
      FALLTHROUGH
    case 11:
      vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
      FALLTHROUGH
    case 10:
      vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
      FALLTHROUGH
    case 9:
      vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
      FALLTHROUGH
    case 8:
      vst4_u8(outptr, rgba_l);
      break;
    case 7:
      vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
      FALLTHROUGH
    case 6:
      vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
      FALLTHROUGH
    case 5:
      vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
      FALLTHROUGH
    case 4:
      vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
      FALLTHROUGH
    case 3:
      vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
      FALLTHROUGH
    case 2:
      vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
      FALLTHROUGH
    case 1:
      vst4_lane_u8(outptr, rgba_l, 0);
      FALLTHROUGH
    default:
      break;
    }
#else
    uint8x8x3_t rgb_h;
    rgb_h.val[RGB_RED] = r.val[1];
    rgb_h.val[RGB_GREEN] = g.val[1];
    rgb_h.val[RGB_BLUE] = b.val[1];
    uint8x8x3_t rgb_l;
    rgb_l.val[RGB_RED] = r.val[0];
    rgb_l.val[RGB_GREEN] = g.val[0];
    rgb_l.val[RGB_BLUE] = b.val[0];
    /* Store RGB pixel data to memory. */
    switch (cols_remaining) {
    case 15:
      vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
      FALLTHROUGH
    case 14:
      vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
      FALLTHROUGH
    case 13:
      vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
      FALLTHROUGH
    case 12:
      vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
      FALLTHROUGH
    case 11:
      vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
      FALLTHROUGH
    case 10:
      vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
      FALLTHROUGH
    case 9:
      vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
      FALLTHROUGH
    case 8:
      vst3_u8(outptr, rgb_l);
      break;
    case 7:
      vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
      FALLTHROUGH
    case 6:
      vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
      FALLTHROUGH
    case 5:
      vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
      FALLTHROUGH
    case 4:
      vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
      FALLTHROUGH
    case 3:
      vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
      FALLTHROUGH
    case 2:
      vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
      FALLTHROUGH
    case 1:
      vst3_lane_u8(outptr, rgb_l, 0);
      FALLTHROUGH
    default:
      break;
    }
#endif
  }
}


/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
 *
 * See comments above for details regarding color conversion and safe memory
 * access.
 */

void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
                                     JSAMPIMAGE input_buf,
                                     JDIMENSION in_row_group_ctr,
                                     JSAMPARRAY output_buf)
{
  JSAMPROW outptr0, outptr1;
  /* Pointers to Y (both rows), Cb, and Cr data */
  JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;

  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
  const int16x8_t neg_128 = vdupq_n_s16(-128);

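  /* With h2v2 (4:2:0) sampling, each input row group contains two luma rows
   * for every chroma row, so the two Y row pointers index rows
   * 2 * in_row_group_ctr and 2 * in_row_group_ctr + 1.
   */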
  inptr0_0 = input_buf[0][in_row_group_ctr * 2];
  inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
  inptr1 = input_buf[1][in_row_group_ctr];
  inptr2 = input_buf[2][in_row_group_ctr];
  outptr0 = output_buf[0];
  outptr1 = output_buf[1];

  int cols_remaining = output_width;
  for (; cols_remaining >= 16; cols_remaining -= 16) {
    /* For each row, de-interleave Y component values into two separate
     * vectors, one containing the component values with even-numbered indices
     * and one containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y0 = vld2_u8(inptr0_0);
    uint8x8x2_t y1 = vld2_u8(inptr0_1);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
     * the "even" and "odd" Y component values.  This effectively upsamples the
     * chroma components both horizontally and vertically.
     */
    int16x8_t g0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[0]));
    int16x8_t r0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[0]));
    int16x8_t b0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[0]));
    int16x8_t g0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[1]));
    int16x8_t r0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[1]));
    int16x8_t b0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[1]));
    int16x8_t g1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[0]));
    int16x8_t r1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[0]));
    int16x8_t b1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[0]));
    int16x8_t g1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[1]));
    int16x8_t r1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[1]));
    int16x8_t b1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));

#ifdef RGB_ALPHA
    uint8x16x4_t rgba0, rgba1;
    rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
    rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
    rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
    rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
    rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
    rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
    /* Set alpha channel to opaque (0xFF). */
    rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    vst4q_u8(outptr0, rgba0);
    vst4q_u8(outptr1, rgba1);
#else
    uint8x16x3_t rgb0, rgb1;
    rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
    rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
    rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
    rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
    rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
    rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
    /* Store RGB pixel data to memory. */
    vst3q_u8(outptr0, rgb0);
    vst3q_u8(outptr1, rgb1);
#endif

    /* Increment pointers. */
    inptr0_0 += 16;
    inptr0_1 += 16;
    inptr1 += 8;
    inptr2 += 8;
    outptr0 += (RGB_PIXELSIZE * 16);
    outptr1 += (RGB_PIXELSIZE * 16);
  }
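
  /* As in the h2v1 routine, handle the last 1-15 pixels of both output rows
   * with lane-wise stores so that no bytes are written beyond output_width.
   */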
  if (cols_remaining > 0) {
    /* For each row, de-interleave Y component values into two separate
     * vectors, one containing the component values with even-numbered indices
     * and one containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y0 = vld2_u8(inptr0_0);
    uint8x8x2_t y1 = vld2_u8(inptr0_1);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
     * the "even" and "odd" Y component values.  This effectively upsamples the
     * chroma components both horizontally and vertically.
     */
    int16x8_t g0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[0]));
    int16x8_t r0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[0]));
    int16x8_t b0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[0]));
    int16x8_t g0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[1]));
    int16x8_t r0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[1]));
    int16x8_t b0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[1]));
    int16x8_t g1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[0]));
    int16x8_t r1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[0]));
    int16x8_t b1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[0]));
    int16x8_t g1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[1]));
    int16x8_t r1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[1]));
    int16x8_t b1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));

#ifdef RGB_ALPHA
    uint8x8x4_t rgba0_h, rgba1_h;
    rgba0_h.val[RGB_RED] = r0.val[1];
    rgba1_h.val[RGB_RED] = r1.val[1];
    rgba0_h.val[RGB_GREEN] = g0.val[1];
    rgba1_h.val[RGB_GREEN] = g1.val[1];
    rgba0_h.val[RGB_BLUE] = b0.val[1];
    rgba1_h.val[RGB_BLUE] = b1.val[1];
    /* Set alpha channel to opaque (0xFF). */
    rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    uint8x8x4_t rgba0_l, rgba1_l;
    rgba0_l.val[RGB_RED] = r0.val[0];
    rgba1_l.val[RGB_RED] = r1.val[0];
    rgba0_l.val[RGB_GREEN] = g0.val[0];
    rgba1_l.val[RGB_GREEN] = g1.val[0];
    rgba0_l.val[RGB_BLUE] = b0.val[0];
    rgba1_l.val[RGB_BLUE] = b1.val[0];
    /* Set alpha channel to opaque (0xFF). */
    rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    switch (cols_remaining) {
    case 15:
      vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
      vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
      FALLTHROUGH
    case 14:
      vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
      vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
      FALLTHROUGH
    case 13:
      vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
      vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
      FALLTHROUGH
    case 12:
      vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
      vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
      FALLTHROUGH
    case 11:
      vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
      vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
      FALLTHROUGH
    case 10:
      vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
      vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
      FALLTHROUGH
    case 9:
      vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
      vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
      FALLTHROUGH
    case 8:
      vst4_u8(outptr0, rgba0_l);
      vst4_u8(outptr1, rgba1_l);
      break;
    case 7:
      vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
      vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
      FALLTHROUGH
    case 6:
      vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
      vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
      FALLTHROUGH
    case 5:
      vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
      vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
      FALLTHROUGH
    case 4:
      vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
      vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
      FALLTHROUGH
    case 3:
      vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
      vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
      FALLTHROUGH
    case 2:
      vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
      vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
      FALLTHROUGH
    case 1:
      vst4_lane_u8(outptr0, rgba0_l, 0);
      vst4_lane_u8(outptr1, rgba1_l, 0);
      FALLTHROUGH
    default:
      break;
    }
#else
    uint8x8x3_t rgb0_h, rgb1_h;
    rgb0_h.val[RGB_RED] = r0.val[1];
    rgb1_h.val[RGB_RED] = r1.val[1];
    rgb0_h.val[RGB_GREEN] = g0.val[1];
    rgb1_h.val[RGB_GREEN] = g1.val[1];
    rgb0_h.val[RGB_BLUE] = b0.val[1];
    rgb1_h.val[RGB_BLUE] = b1.val[1];
    uint8x8x3_t rgb0_l, rgb1_l;
    rgb0_l.val[RGB_RED] = r0.val[0];
    rgb1_l.val[RGB_RED] = r1.val[0];
    rgb0_l.val[RGB_GREEN] = g0.val[0];
    rgb1_l.val[RGB_GREEN] = g1.val[0];
    rgb0_l.val[RGB_BLUE] = b0.val[0];
    rgb1_l.val[RGB_BLUE] = b1.val[0];
    /* Store RGB pixel data to memory. */
    switch (cols_remaining) {
    case 15:
      vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
      vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
      FALLTHROUGH
    case 14:
      vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
      vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
      FALLTHROUGH
    case 13:
      vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
      vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
      FALLTHROUGH
    case 12:
      vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
      vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
      FALLTHROUGH
    case 11:
      vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
      vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
      FALLTHROUGH
    case 10:
      vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
      vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
      FALLTHROUGH
    case 9:
      vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
      vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
      FALLTHROUGH
    case 8:
      vst3_u8(outptr0, rgb0_l);
      vst3_u8(outptr1, rgb1_l);
      break;
    case 7:
      vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
      vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
      FALLTHROUGH
    case 6:
      vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
      vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
      FALLTHROUGH
    case 5:
      vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
      vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
      FALLTHROUGH
    case 4:
      vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
      vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
      FALLTHROUGH
    case 3:
      vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
      vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
      FALLTHROUGH
    case 2:
      vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
      vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
      FALLTHROUGH
    case 1:
      vst3_lane_u8(outptr0, rgb0_l, 0);
      vst3_lane_u8(outptr1, rgb1_l, 0);
      FALLTHROUGH
    default:
      break;
    }
#endif
  }
}