/*
 * jdcolext-neon.c - colorspace conversion (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
 * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jdcolor-neon.c. */

/* YCbCr -> RGB conversion is defined by the following equations:
 *    R = Y                        + 1.40200 * (Cr - 128)
 *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *    B = Y + 1.77200 * (Cb - 128)
 *
 * Scaled integer constants are used to avoid floating-point arithmetic:
 *    0.3441467 = 11277 * 2^-15
 *    0.7141418 = 23401 * 2^-15
 *    1.4020386 = 22971 * 2^-14
 *    1.7720337 = 29033 * 2^-14
 * These constants are defined in jdcolor-neon.c.
 *
 * To ensure correct results, rounding is used when descaling.
 */
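
/* Illustrative note (not part of the original sources): with rounding, the
 * scaled-integer path reproduces the floating-point result.  For example, for
 * Cr = 255 the red offset is
 *    R - Y = 1.40200 * (255 - 128) = 178.05,
 * and the fixed-point computation used below yields
 *    (127 * 22971 + 2^13) >> 14 = 178.
 */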

/* Notes on safe memory access for YCbCr -> RGB conversion routines:
 *
 * Input memory buffers can be safely overread up to the next multiple of
 * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
 * jmemmgr.c.
 *
 * The output buffer cannot safely be written beyond output_width, since
 * output_buf points to a possibly unpadded row in the decompressed image
 * buffer allocated by the calling program.
 */

void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
                                JDIMENSION input_row, JSAMPARRAY output_buf,
                                int num_rows)
{
  JSAMPROW outptr;
  /* Pointers to Y, Cb, and Cr data */
  JSAMPROW inptr0, inptr1, inptr2;

  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
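  /* Note: given the equations above, lane 0 of consts is expected to hold the
   * 0.34414 constant negated, i.e. { -11277, 23401, 22971, 29033 }, so that
   * the Cb term of G-Y can be formed with a plain multiply (vmull) and the Cr
   * term with a multiply-subtract (vmlsl).  See jdcolor-neon.c.
   */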
  const int16x8_t neg_128 = vdupq_n_s16(-128);

  while (--num_rows >= 0) {
    inptr0 = input_buf[0][input_row];
    inptr1 = input_buf[1][input_row];
    inptr2 = input_buf[2][input_row];
    input_row++;
    outptr = *output_buf++;

    int cols_remaining = output_width;
    for (; cols_remaining >= 16; cols_remaining -= 16) {
      uint8x16_t y  = vld1q_u8(inptr0);
      uint8x16_t cb = vld1q_u8(inptr1);
      uint8x16_t cr = vld1q_u8(inptr2);
      /* Subtract 128 from Cb and Cr. */
      int16x8_t cr_128_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
                                       vget_low_u8(cr)));
      int16x8_t cr_128_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
                                       vget_high_u8(cr)));
      int16x8_t cb_128_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
                                       vget_low_u8(cb)));
      int16x8_t cb_128_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
                                       vget_high_u8(cb)));
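      /* (The widening add of -128 above is equivalent to the subtraction: the
       * uint8 samples are widened and added to 0xFF80 modulo 2^16, and the
       * result, reinterpreted as int16, is exactly Cb - 128 / Cr - 128.)
       */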
      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
      int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
      int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
                                            consts, 0);
      int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
      int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
                                            consts, 0);
      g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
                                  consts, 1);
      g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
                                  consts, 1);
      g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
                                  consts, 1);
      g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
                                  consts, 1);
      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
      int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
                                         vrshrn_n_s32(g_sub_y_lh, 15));
      int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
                                         vrshrn_n_s32(g_sub_y_hh, 15));
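      /* (Both products contribute to G-Y, so they are accumulated at full
       * 32-bit precision and a single rounding shift is applied, rather than
       * rounding each product separately.)
       */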
      /* Compute R-Y: 1.40200 * (Cr - 128) */
      int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
                                               consts, 2);
      int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
                                               consts, 2);
      /* Compute B-Y: 1.77200 * (Cb - 128) */
      int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
                                               consts, 3);
      int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
                                               consts, 3);
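      /* (vqrdmulh computes round(a * b / 2^15), so the Cr/Cb differences are
       * doubled first: for R-Y, round(2 * (Cr - 128) * 22971 / 2^15) is the
       * desired (Cr - 128) * 22971 * 2^-14.  The doubling cannot overflow,
       * since |Cr - 128| <= 128.)
       */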
      /* Add Y. */
      int16x8_t r_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
                                       vget_low_u8(y)));
      int16x8_t r_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
                                       vget_high_u8(y)));
      int16x8_t b_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
                                       vget_low_u8(y)));
      int16x8_t b_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
                                       vget_high_u8(y)));
      int16x8_t g_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
                                       vget_low_u8(y)));
      int16x8_t g_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
                                       vget_high_u8(y)));
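      /* (At this point r/g/b may lie outside [0, 255]; the saturating
       * conversions below (vqmovun / vqshlu) perform the final clamp to the
       * representable range.)
       */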
#if RGB_PIXELSIZE == 4
      uint8x16x4_t rgba;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
      rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
      rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
      /* Set alpha channel to opaque (0xFF). */
      rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
      /* Store RGBA pixel data to memory. */
      vst4q_u8(outptr, rgba);
#elif RGB_PIXELSIZE == 3
      uint8x16x3_t rgb;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
      rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
      rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
      /* Store RGB pixel data to memory. */
      vst3q_u8(outptr, rgb);
#else
      /* Pack R, G, and B values in ratio 5:6:5. */
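      /* (vqshlu shifts each component into the upper byte, saturating
       * negative values to 0 and values above 255 to 0xFFFF; vsri, shift
       * right and insert, then merges the top 5 bits of R, top 6 bits of G,
       * and top 5 bits of B into one RGB565 word.)
       */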
      uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
      uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
      /* Store RGB pixel data to memory. */
      vst1q_u16((uint16_t *)outptr, rgb565_l);
      vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
#endif
      /* Increment pointers. */
      inptr0 += 16;
      inptr1 += 16;
      inptr2 += 16;
      outptr += (RGB_PIXELSIZE * 16);
    }
    if (cols_remaining >= 8) {
      uint8x8_t y = vld1_u8(inptr0);
      uint8x8_t cb = vld1_u8(inptr1);
      uint8x8_t cr = vld1_u8(inptr2);
      /* Subtract 128 from Cb and Cr. */
      int16x8_t cr_128 =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
      int16x8_t cb_128 =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                       vrshrn_n_s32(g_sub_y_h, 15));
      /* Compute R-Y: 1.40200 * (Cr - 128) */
      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
                                             consts, 2);
      /* Compute B-Y: 1.77200 * (Cb - 128) */
      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
                                             consts, 3);
      /* Add Y. */
      int16x8_t r =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
      int16x8_t b =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
      int16x8_t g =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
#if RGB_PIXELSIZE == 4
      uint8x8x4_t rgba;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgba.val[RGB_RED] = vqmovun_s16(r);
      rgba.val[RGB_GREEN] = vqmovun_s16(g);
      rgba.val[RGB_BLUE] = vqmovun_s16(b);
      /* Set alpha channel to opaque (0xFF). */
      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
      /* Store RGBA pixel data to memory. */
      vst4_u8(outptr, rgba);
#elif RGB_PIXELSIZE == 3
      uint8x8x3_t rgb;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgb.val[RGB_RED] = vqmovun_s16(r);
      rgb.val[RGB_GREEN] = vqmovun_s16(g);
      rgb.val[RGB_BLUE] = vqmovun_s16(b);
      /* Store RGB pixel data to memory. */
      vst3_u8(outptr, rgb);
#else
      /* Pack R, G, and B values in ratio 5:6:5. */
      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
      /* Store RGB pixel data to memory. */
      vst1q_u16((uint16_t *)outptr, rgb565);
#endif
      /* Increment pointers. */
      inptr0 += 8;
      inptr1 += 8;
      inptr2 += 8;
      outptr += (RGB_PIXELSIZE * 8);
      cols_remaining -= 8;
    }

    /* Handle the tail elements. */
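    /* The loads below read a full 8 bytes per component even when fewer
     * columns remain; this overread is safe per the buffer notes at the top
     * of this file.  The stores, however, are performed lane by lane through
     * the descending fall-through switches, so nothing is written beyond
     * output_width.
     */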
    if (cols_remaining > 0) {
      uint8x8_t y = vld1_u8(inptr0);
      uint8x8_t cb = vld1_u8(inptr1);
      uint8x8_t cr = vld1_u8(inptr2);
      /* Subtract 128 from Cb and Cr. */
      int16x8_t cr_128 =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
      int16x8_t cb_128 =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                       vrshrn_n_s32(g_sub_y_h, 15));
      /* Compute R-Y: 1.40200 * (Cr - 128) */
      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
                                             consts, 2);
      /* Compute B-Y: 1.77200 * (Cb - 128) */
      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
                                             consts, 3);
      /* Add Y. */
      int16x8_t r =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
      int16x8_t b =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
      int16x8_t g =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
#if RGB_PIXELSIZE == 4
      uint8x8x4_t rgba;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgba.val[RGB_RED] = vqmovun_s16(r);
      rgba.val[RGB_GREEN] = vqmovun_s16(g);
      rgba.val[RGB_BLUE] = vqmovun_s16(b);
      /* Set alpha channel to opaque (0xFF). */
      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
      /* Store RGBA pixel data to memory. */
      switch (cols_remaining) {
      case 7:
        vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 6:
        vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 5:
        vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 4:
        vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 3:
        vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 2:
        vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 1:
        vst4_lane_u8(outptr, rgba, 0);
        FALLTHROUGH             /*FALLTHROUGH*/
      default:
        break;
      }
#elif RGB_PIXELSIZE == 3
      uint8x8x3_t rgb;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgb.val[RGB_RED] = vqmovun_s16(r);
      rgb.val[RGB_GREEN] = vqmovun_s16(g);
      rgb.val[RGB_BLUE] = vqmovun_s16(b);
      /* Store RGB pixel data to memory. */
      switch (cols_remaining) {
      case 7:
        vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 6:
        vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 5:
        vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 4:
        vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 3:
        vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 2:
        vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 1:
        vst3_lane_u8(outptr, rgb, 0);
        FALLTHROUGH             /*FALLTHROUGH*/
      default:
        break;
      }
#else
      /* Pack R, G, and B values in ratio 5:6:5. */
      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
      /* Store RGB565 pixel data to memory. */
      switch (cols_remaining) {
      case 7:
        vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 6:
        vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 5:
        vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 4:
        vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 3:
        vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 2:
        vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
        FALLTHROUGH             /*FALLTHROUGH*/
      case 1:
        vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
        FALLTHROUGH             /*FALLTHROUGH*/
      default:
        break;
      }
#endif
    }
  }
}