rescaler_msa.c
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MSA version of rescaling functions
//
// Author: Prashant Patil (prashant.patil@imgtec.com)

#include "./dsp.h"

#if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)

#include <assert.h>

#include "../utils/rescaler_utils.h"
#include "./msa_macro.h"

#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
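
// The CALC_MULT_FIX* macros below are vector versions of the MULT_FIX
// operation: the 32-bit inputs are widened into 64-bit products (DOTP),
// rounded and shifted right by WEBP_RESCALER_RFIX (SRAR), then packed back
// down to bytes or 32-bit words.  The _16/_4 suffix is the number of values
// handled per invocation.  CALC_MULT_FIX_* and CALC_MULT_FIX1_* expect a
// 'zero' vector to be declared in the enclosing scope; CALC_MULT_FIX2_* first
// blends two rows with the A/B weights packed in 'mult' before the final
// scaling.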
#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do { \
  v4u32 tmp0, tmp1, tmp2, tmp3; \
  v16u8 t0, t1, t2, t3, t4, t5; \
  v2u64 out0, out1, out2, out3; \
  ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
  ILVRL_W2_UW(zero, in1, tmp2, tmp3); \
  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
  DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \
  SRAR_D4_UD(out0, out1, out2, out3, shift); \
  PCKEV_B2_UB(out1, out0, out3, out2, t0, t1); \
  ILVRL_W2_UW(zero, in2, tmp0, tmp1); \
  ILVRL_W2_UW(zero, in3, tmp2, tmp3); \
  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
  DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \
  SRAR_D4_UD(out0, out1, out2, out3, shift); \
  PCKEV_B2_UB(out1, out0, out3, out2, t2, t3); \
  PCKEV_B2_UB(t1, t0, t3, t2, t4, t5); \
  dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4); \
} while (0)

#define CALC_MULT_FIX_4(in0, scale, shift, dst) do { \
  v4u32 tmp0, tmp1; \
  v16i8 t0, t1; \
  v2u64 out0, out1; \
  ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
  SRAR_D2_UD(out0, out1, shift); \
  t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \
  t1 = __msa_pckev_b(t0, t0); \
  t0 = __msa_pckev_b(t1, t1); \
  dst = __msa_copy_s_w((v4i32)t0, 0); \
} while (0)

#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift, \
                          dst0, dst1, dst2, dst3) do { \
  v4u32 tmp0, tmp1, tmp2, tmp3; \
  v2u64 out0, out1, out2, out3; \
  ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
  ILVRL_W2_UW(zero, in1, tmp2, tmp3); \
  DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \
  DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \
  SRAR_D4_UD(out0, out1, out2, out3, shift); \
  PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1); \
  ILVRL_W2_UW(zero, in2, tmp0, tmp1); \
  ILVRL_W2_UW(zero, in3, tmp2, tmp3); \
  DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \
  DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \
  SRAR_D4_UD(out0, out1, out2, out3, shift); \
  PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3); \
} while (0)

#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do { \
  v4u32 tmp0, tmp1; \
  v2u64 out0, out1; \
  ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
  SRAR_D2_UD(out0, out1, shift); \
  dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0); \
} while (0)

#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift, \
                          dst0, dst1) do { \
  v4u32 tmp0, tmp1, tmp2, tmp3; \
  v2u64 out0, out1, out2, out3; \
  ILVRL_W2_UW(in0, in2, tmp0, tmp1); \
  ILVRL_W2_UW(in1, in3, tmp2, tmp3); \
  DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \
  DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3); \
  SRAR_D4_UD(out0, out1, out2, out3, shift); \
  DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \
  DOTP_UW2_UD(out2, out3, scale, scale, out2, out3); \
  SRAR_D4_UD(out0, out1, out2, out3, shift); \
  PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1); \
} while (0)

#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do { \
  v4u32 tmp0, tmp1; \
  v2u64 out0, out1; \
  v16i8 t0, t1; \
  ILVRL_W2_UW(in0, in1, tmp0, tmp1); \
  DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \
  SRAR_D2_UD(out0, out1, shift); \
  DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \
  SRAR_D2_UD(out0, out1, shift); \
  t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \
  t1 = __msa_pckev_b(t0, t0); \
  t0 = __msa_pckev_b(t1, t1); \
  dst = __msa_copy_s_w((v4i32)t0, 0); \
} while (0)
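
// Expand-export for the case y_accum == 0 (the output row coincides with a
// source row): every value of 'frow' is just rescaled by wrk->fy_scale.
// 16 pixels are processed per vector iteration, followed by a 12/8/4-pixel
// vector tail and finally a scalar loop for what remains.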
static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
                                          int length,
                                          WebPRescaler* const wrk) {
  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
  const v4i32 zero = { 0 };
  while (length >= 16) {
    v4u32 src0, src1, src2, src3;
    v16u8 out;
    LD_UW4(frow, 4, src0, src1, src2, src3);
    CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
    ST_UB(out, dst);
    length -= 16;
    frow += 16;
    dst += 16;
  }
  if (length > 0) {
    int x_out;
    if (length >= 12) {
      uint32_t val0_m, val1_m, val2_m;
      v4u32 src0, src1, src2;
      LD_UW3(frow, 4, src0, src1, src2);
      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
      CALC_MULT_FIX_4(src2, scale, shift, val2_m);
      SW3(val0_m, val1_m, val2_m, dst, 4);
      length -= 12;
      frow += 12;
      dst += 12;
    } else if (length >= 8) {
      uint32_t val0_m, val1_m;
      v4u32 src0, src1;
      LD_UW2(frow, 4, src0, src1);
      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
      SW2(val0_m, val1_m, dst, 4);
      length -= 8;
      frow += 8;
      dst += 8;
    } else if (length >= 4) {
      uint32_t val0_m;
      const v4u32 src0 = LD_UW(frow);
      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
      SW(val0_m, dst);
      length -= 4;
      frow += 4;
      dst += 4;
    }
    for (x_out = 0; x_out < length; ++x_out) {
      const uint32_t J = frow[x_out];
      const int v = (int)MULT_FIX(J, wrk->fy_scale);
      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
    }
  }
}
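
// Expand-export for the interpolating case (y_accum != 0): the output row is
// the weighted blend A * frow + B * irow (with A + B == WEBP_RESCALER_ONE),
// renormalized by WEBP_RESCALER_RFIX and then rescaled by wrk->fy_scale,
// exactly as in the scalar loop at the end of the function.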
static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
                                          uint8_t* dst, int length,
                                          WebPRescaler* const wrk) {
  const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
  const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
  const v4i32 B1 = __msa_fill_w(B);
  const v4i32 A1 = __msa_fill_w(A);
  const v4i32 AB = __msa_ilvr_w(A1, B1);
  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
  while (length >= 16) {
    v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
    v16u8 t0, t1, t2, t3, t4, t5;
    LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
    LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
    CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
    CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
    PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
    t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
    ST_UB(t0, dst);
    frow += 16;
    irow += 16;
    dst += 16;
    length -= 16;
  }
  if (length > 0) {
    int x_out;
    if (length >= 12) {
      uint32_t val0_m, val1_m, val2_m;
      v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
      LD_UW3(frow, 4, frow0, frow1, frow2);
      LD_UW3(irow, 4, irow0, irow1, irow2);
      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
      CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
      CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
      SW3(val0_m, val1_m, val2_m, dst, 4);
      frow += 12;
      irow += 12;
      dst += 12;
      length -= 12;
    } else if (length >= 8) {
      uint32_t val0_m, val1_m;
      v4u32 frow0, frow1, irow0, irow1;
      LD_UW2(frow, 4, frow0, frow1);
      LD_UW2(irow, 4, irow0, irow1);
      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
      CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
      SW2(val0_m, val1_m, dst, 4);
      frow += 8;
      irow += 8;
      dst += 8;
      length -= 8;
    } else if (length >= 4) {
      uint32_t val0_m;
      const v4u32 frow0 = LD_UW(frow + 0);
      const v4u32 irow0 = LD_UW(irow + 0);
      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
      SW(val0_m, dst);
      frow += 4;
      irow += 4;
      dst += 4;
      length -= 4;
    }
    for (x_out = 0; x_out < length; ++x_out) {
      const uint64_t I = (uint64_t)A * frow[x_out]
                       + (uint64_t)B * irow[x_out];
      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
      const int v = (int)MULT_FIX(J, wrk->fy_scale);
      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
    }
  }
}
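
// Vertical expansion entry point: dispatch to the plain-scaling or the
// interpolating export depending on whether y_accum is zero.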
static void RescalerExportRowExpand_MSA(WebPRescaler* const wrk) {
  uint8_t* dst = wrk->dst;
  rescaler_t* irow = wrk->irow;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const rescaler_t* frow = wrk->frow;
  assert(!WebPRescalerOutputDone(wrk));
  assert(wrk->y_accum <= 0);
  assert(wrk->y_expand);
  assert(wrk->y_sub != 0);
  if (wrk->y_accum == 0) {
    ExportRowExpand_0(frow, dst, x_out_max, wrk);
  } else {
    ExportRowExpand_1(frow, irow, dst, x_out_max, wrk);
  }
}

#if 0  // disabled for now. TODO(skal): make match the C-code
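
// Shrink-export when a fraction of the current source row still belongs to
// the next output row (yscale != 0): that fraction of 'frow' is computed,
// subtracted from the accumulator 'irow' before scaling by fxy_scale for
// output, and stored back into 'irow' as the carry for the next row.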
static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
                                          uint8_t* dst, int length,
                                          const uint32_t yscale,
                                          WebPRescaler* const wrk) {
  const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
  const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
  const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
  const v4i32 zero = { 0 };
  while (length >= 16) {
    v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
    v16u8 out;
    LD_UW4(frow, 4, src0, src1, src2, src3);
    CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
                      frac0, frac1, frac2, frac3);
    LD_UW4(irow, 4, src0, src1, src2, src3);
    SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
         src0, src1, src2, src3);
    CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
    ST_UB(out, dst);
    ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
    frow += 16;
    irow += 16;
    dst += 16;
    length -= 16;
  }
  if (length > 0) {
    int x_out;
    if (length >= 12) {
      uint32_t val0_m, val1_m, val2_m;
      v4u32 src0, src1, src2, frac0, frac1, frac2;
      LD_UW3(frow, 4, src0, src1, src2);
      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
      CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
      CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
      LD_UW3(irow, 4, src0, src1, src2);
      SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
      CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
      CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
      SW3(val0_m, val1_m, val2_m, dst, 4);
      ST_UW3(frac0, frac1, frac2, irow, 4);
      frow += 12;
      irow += 12;
      dst += 12;
      length -= 12;
    } else if (length >= 8) {
      uint32_t val0_m, val1_m;
      v4u32 src0, src1, frac0, frac1;
      LD_UW2(frow, 4, src0, src1);
      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
      CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
      LD_UW2(irow, 4, src0, src1);
      SUB2(src0, frac0, src1, frac1, src0, src1);
      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
      CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
      SW2(val0_m, val1_m, dst, 4);
      ST_UW2(frac0, frac1, irow, 4);
      frow += 8;
      irow += 8;
      dst += 8;
      length -= 8;
    } else if (length >= 4) {
      uint32_t val0_m;
      v4u32 frac0;
      v4u32 src0 = LD_UW(frow);
      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
      src0 = LD_UW(irow);
      src0 = src0 - frac0;
      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
      SW(val0_m, dst);
      ST_UW(frac0, irow);
      frow += 4;
      irow += 4;
      dst += 4;
      length -= 4;
    }
    for (x_out = 0; x_out < length; ++x_out) {
      const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
      const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
      irow[x_out] = frac;
    }
  }
}
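
// Shrink-export when no fraction needs to be carried over (yscale == 0): the
// accumulator 'irow' already holds the complete sum, so it is only rescaled
// by fxy_scale and then cleared for the next accumulation cycle.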
static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
                                          int length,
                                          WebPRescaler* const wrk) {
  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
  const v4i32 zero = { 0 };
  while (length >= 16) {
    v4u32 src0, src1, src2, src3;
    v16u8 dst0;
    LD_UW4(irow, 4, src0, src1, src2, src3);
    CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
    ST_UB(dst0, dst);
    ST_SW4(zero, zero, zero, zero, irow, 4);
    length -= 16;
    irow += 16;
    dst += 16;
  }
  if (length > 0) {
    int x_out;
    if (length >= 12) {
      uint32_t val0_m, val1_m, val2_m;
      v4u32 src0, src1, src2;
      LD_UW3(irow, 4, src0, src1, src2);
      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
      CALC_MULT_FIX_4(src2, scale, shift, val2_m);
      SW3(val0_m, val1_m, val2_m, dst, 4);
      ST_SW3(zero, zero, zero, irow, 4);
      length -= 12;
      irow += 12;
      dst += 12;
    } else if (length >= 8) {
      uint32_t val0_m, val1_m;
      v4u32 src0, src1;
      LD_UW2(irow, 4, src0, src1);
      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
      SW2(val0_m, val1_m, dst, 4);
      ST_SW2(zero, zero, irow, 4);
      length -= 8;
      irow += 8;
      dst += 8;
    } else if (length >= 4) {
      uint32_t val0_m;
      const v4u32 src0 = LD_UW(irow + 0);
      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
      SW(val0_m, dst);
      ST_SW(zero, irow);
      length -= 4;
      irow += 4;
      dst += 4;
    }
    for (x_out = 0; x_out < length; ++x_out) {
      const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
      irow[x_out] = 0;
    }
  }
}
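
// Vertical shrink entry point: use the carry-over path when a fraction of the
// current source row still belongs to the next output row (yscale != 0),
// otherwise just flush the accumulator.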
static void RescalerExportRowShrink_MSA(WebPRescaler* const wrk) {
  uint8_t* dst = wrk->dst;
  rescaler_t* irow = wrk->irow;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const rescaler_t* frow = wrk->frow;
  const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
  assert(!WebPRescalerOutputDone(wrk));
  assert(wrk->y_accum <= 0);
  assert(!wrk->y_expand);
  if (yscale) {
    ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk);
  } else {
    ExportRowShrink_1(irow, dst, x_out_max, wrk);
  }
}
#endif  // 0

//------------------------------------------------------------------------------
// Entry point

extern void WebPRescalerDspInitMSA(void);

WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
  WebPRescalerExportRowExpand = RescalerExportRowExpand_MSA;
  // WebPRescalerExportRowShrink = RescalerExportRowShrink_MSA;
}

#else  // !WEBP_USE_MSA

WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)

#endif  // WEBP_USE_MSA