jquanti-neon.c 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. /*
  2. * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
  3. *
  4. * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
  5. *
  6. * This software is provided 'as-is', without any express or implied
  7. * warranty. In no event will the authors be held liable for any damages
  8. * arising from the use of this software.
  9. *
  10. * Permission is granted to anyone to use this software for any purpose,
  11. * including commercial applications, and to alter it and redistribute it
  12. * freely, subject to the following restrictions:
  13. *
  14. * 1. The origin of this software must not be misrepresented; you must not
  15. * claim that you wrote the original software. If you use this software
  16. * in a product, an acknowledgment in the product documentation would be
  17. * appreciated but is not required.
  18. * 2. Altered source versions must be plainly marked as such, and must not be
  19. * misrepresented as being the original software.
  20. * 3. This notice may not be removed or altered from any source distribution.
  21. */
  22. #define JPEG_INTERNALS
  23. #include "../../jinclude.h"
  24. #include "../../jpeglib.h"
  25. #include "../../jsimd.h"
  26. #include "../../jdct.h"
  27. #include "../../jsimddct.h"
  28. #include "../jsimd.h"
  29. #include <arm_neon.h>
  30. /* After downsampling, the resulting sample values are in the range [0, 255],
  31. * but the Discrete Cosine Transform (DCT) operates on values centered around
  32. * 0.
  33. *
  34. * To prepare sample values for the DCT, load samples into a DCT workspace,
  35. * subtracting CENTERJSAMPLE (128). The samples, now in the range [-128, 127],
  36. * are also widened from 8- to 16-bit.
  37. *
  38. * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
  39. */
  40. void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
  41. DCTELEM *workspace)
  42. {
  43. uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
  44. uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
  45. uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
  46. uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
  47. uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
  48. uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
  49. uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
  50. uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
  51. int16x8_t row0 =
  52. vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
  53. int16x8_t row1 =
  54. vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
  55. int16x8_t row2 =
  56. vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
  57. int16x8_t row3 =
  58. vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
  59. int16x8_t row4 =
  60. vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
  61. int16x8_t row5 =
  62. vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
  63. int16x8_t row6 =
  64. vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
  65. int16x8_t row7 =
  66. vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
  67. vst1q_s16(workspace + 0 * DCTSIZE, row0);
  68. vst1q_s16(workspace + 1 * DCTSIZE, row1);
  69. vst1q_s16(workspace + 2 * DCTSIZE, row2);
  70. vst1q_s16(workspace + 3 * DCTSIZE, row3);
  71. vst1q_s16(workspace + 4 * DCTSIZE, row4);
  72. vst1q_s16(workspace + 5 * DCTSIZE, row5);
  73. vst1q_s16(workspace + 6 * DCTSIZE, row6);
  74. vst1q_s16(workspace + 7 * DCTSIZE, row7);
  75. }
  76. /* After the DCT, the resulting array of coefficient values needs to be divided
  77. * by an array of quantization values.
  78. *
  79. * To avoid a slow division operation, the DCT coefficients are multiplied by
  80. * the (scaled) reciprocals of the quantization values and then right-shifted.
  81. *
  82. * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
  83. */
  84. void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
  85. DCTELEM *workspace)
  86. {
  87. JCOEFPTR out_ptr = coef_block;
  88. UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
  89. UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
  90. DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
  91. int i;
  92. #if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
  93. #pragma unroll
  94. #endif
  95. for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
  96. /* Load DCT coefficients. */
  97. int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
  98. int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
  99. int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
  100. int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
  101. /* Load reciprocals of quantization values. */
  102. uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
  103. uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
  104. uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
  105. uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
  106. uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
  107. uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
  108. uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
  109. uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
  110. int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
  111. int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
  112. int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
  113. int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
  114. /* Extract sign from coefficients. */
  115. int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
  116. int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
  117. int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
  118. int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
  119. /* Get absolute value of DCT coefficients. */
  120. uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
  121. uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
  122. uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
  123. uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
  124. /* Add correction. */
  125. abs_row0 = vaddq_u16(abs_row0, corr0);
  126. abs_row1 = vaddq_u16(abs_row1, corr1);
  127. abs_row2 = vaddq_u16(abs_row2, corr2);
  128. abs_row3 = vaddq_u16(abs_row3, corr3);
  129. /* Multiply DCT coefficients by quantization reciprocals. */
  130. int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
  131. vget_low_u16(recip0)));
  132. int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
  133. vget_high_u16(recip0)));
  134. int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
  135. vget_low_u16(recip1)));
  136. int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
  137. vget_high_u16(recip1)));
  138. int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
  139. vget_low_u16(recip2)));
  140. int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
  141. vget_high_u16(recip2)));
  142. int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
  143. vget_low_u16(recip3)));
  144. int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
  145. vget_high_u16(recip3)));
  146. /* Narrow back to 16-bit. */
  147. row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
  148. row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
  149. row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
  150. row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
  151. /* Since VSHR only supports an immediate as its second argument, negate the
  152. * shift value and shift left.
  153. */
  154. row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
  155. vnegq_s16(shift0)));
  156. row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
  157. vnegq_s16(shift1)));
  158. row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
  159. vnegq_s16(shift2)));
  160. row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
  161. vnegq_s16(shift3)));
  162. /* Restore sign to original product. */
  163. row0 = veorq_s16(row0, sign_row0);
  164. row0 = vsubq_s16(row0, sign_row0);
  165. row1 = veorq_s16(row1, sign_row1);
  166. row1 = vsubq_s16(row1, sign_row1);
  167. row2 = veorq_s16(row2, sign_row2);
  168. row2 = vsubq_s16(row2, sign_row2);
  169. row3 = veorq_s16(row3, sign_row3);
  170. row3 = vsubq_s16(row3, sign_row3);
  171. /* Store quantized coefficients to memory. */
  172. vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
  173. vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
  174. vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
  175. vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
  176. }
  177. }