rescaler_mips_dsp_r2.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314
  1. // Copyright 2014 Google Inc. All Rights Reserved.
  2. //
  3. // Use of this source code is governed by a BSD-style license
  4. // that can be found in the COPYING file in the root of the source
  5. // tree. An additional intellectual property rights grant can be found
  6. // in the file PATENTS. All contributing project authors may
  7. // be found in the AUTHORS file in the root of the source tree.
  8. // -----------------------------------------------------------------------------
  9. //
  10. // MIPS version of rescaling functions
  11. //
  12. // Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
  13. #include "./dsp.h"
  14. #if defined(WEBP_USE_MIPS_DSP_R2) && !defined(WEBP_REDUCE_SIZE)
  15. #include <assert.h>
  16. #include "../utils/rescaler_utils.h"
  // Fixed-point helpers for the export functions in this file; all three are
  // #undef'd again after those functions.
  // ROUNDER is half of WEBP_RESCALER_ONE and is the bias MULT_FIX adds before
  // shifting.
  17. #define ROUNDER (WEBP_RESCALER_ONE >> 1)
  // 64-bit product of x and y, biased by ROUNDER, then shifted right by
  // WEBP_RESCALER_RFIX bits.
  18. #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
  // Same 64-bit product and shift, but without the ROUNDER bias.
  19. #define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
  20. //------------------------------------------------------------------------------
  21. // Row export
  22. #if 0 // disabled for now. TODO(skal): make match the C-code
  // Row export for the shrinking case. Currently compiled out by the
  // enclosing '#if 0'.
  // Handles x_out_max = dst_width * num_channels samples: each one produces a
  // byte written through wrk->dst and a new value stored back into wrk->irow.
  // Groups of four samples go through the inline assembly; the remaining
  // (x_out_max & 3) samples go through the C loop that follows each asm block.
  23. static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
  24. int i;
  25. const int x_out_max = wrk->dst_width * wrk->num_channels;
  26. uint8_t* dst = wrk->dst;
  27. rescaler_t* irow = wrk->irow;
  28. const rescaler_t* frow = wrk->frow;
  // fy_scale multiplied by the negated y_accum, held as an int.
  29. const int yscale = wrk->fy_scale * (-wrk->y_accum);
  30. int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
  // temp7: fxy_scale as an int, the multiplier used for the final output.
  // temp6: byte offset at which the asm loops stop, i.e. the sample count
  // rounded down to a multiple of 4, times 4 (the asm walks the rows in
  // 4-byte words).
  31. const int temp7 = (int)wrk->fxy_scale;
  32. const int temp6 = (x_out_max & ~0x3) << 2;
  // Preconditions: output not finished, y_accum not positive, y_expand unset,
  // and a non-zero fxy_scale.
  33. assert(!WebPRescalerOutputDone(wrk));
  34. assert(wrk->y_accum <= 0);
  35. assert(!wrk->y_expand);
  36. assert(wrk->fxy_scale != 0);
  37. if (yscale) {
  // x_out_max >= 4 guarantees temp6 >= 16, so the asm loop body below runs at
  // least once before its 'bne' end test.
  38. if (x_out_max >= 4) {
  39. int temp8, temp9, temp10, temp11;
  // Every 'mult' preloads an accumulator with 0x10000 * 0x8000 (0x80000000)
  // before 'maddu' adds the real product; 'mfhi' then reads the upper word of
  // the sum. Presumably this preload equals ROUNDER, i.e. WEBP_RESCALER_RFIX
  // is 32 -- TODO confirm.
  // First pass:  frac = hi(preload + frow[x] * yscale).
  // Second pass: hi(preload + (irow[x] - frac) * temp7) is stored as a byte
  // to dst, and frac is written back to irow[x].
  // NOTE(review): the 'sb' stores are not preceded by any clamp to 255,
  // unlike the C tail loop below.
  // NOTE(review): frac gets the preload here, whereas the C tail loop below
  // computes it with MULT_FIX_FLOOR; this may be the mismatch the TODO on the
  // '#if 0' line refers to -- confirm.
  40. __asm__ volatile (
  41. "li %[temp3], 0x10000 \n\t"
  42. "li %[temp4], 0x8000 \n\t"
  43. "addu %[loop_end], %[frow], %[temp6] \n\t"
  44. "1: \n\t"
  45. "lw %[temp0], 0(%[frow]) \n\t"
  46. "lw %[temp1], 4(%[frow]) \n\t"
  47. "lw %[temp2], 8(%[frow]) \n\t"
  48. "lw %[temp5], 12(%[frow]) \n\t"
  49. "mult $ac0, %[temp3], %[temp4] \n\t"
  50. "maddu $ac0, %[temp0], %[yscale] \n\t"
  51. "mult $ac1, %[temp3], %[temp4] \n\t"
  52. "maddu $ac1, %[temp1], %[yscale] \n\t"
  53. "mult $ac2, %[temp3], %[temp4] \n\t"
  54. "maddu $ac2, %[temp2], %[yscale] \n\t"
  55. "mult $ac3, %[temp3], %[temp4] \n\t"
  56. "maddu $ac3, %[temp5], %[yscale] \n\t"
  57. "addiu %[frow], %[frow], 16 \n\t"
  58. "mfhi %[temp0], $ac0 \n\t"
  59. "mfhi %[temp1], $ac1 \n\t"
  60. "mfhi %[temp2], $ac2 \n\t"
  61. "mfhi %[temp5], $ac3 \n\t"
  62. "lw %[temp8], 0(%[irow]) \n\t"
  63. "lw %[temp9], 4(%[irow]) \n\t"
  64. "lw %[temp10], 8(%[irow]) \n\t"
  65. "lw %[temp11], 12(%[irow]) \n\t"
  66. "addiu %[dst], %[dst], 4 \n\t"
  67. "addiu %[irow], %[irow], 16 \n\t"
  68. "subu %[temp8], %[temp8], %[temp0] \n\t"
  69. "subu %[temp9], %[temp9], %[temp1] \n\t"
  70. "subu %[temp10], %[temp10], %[temp2] \n\t"
  71. "subu %[temp11], %[temp11], %[temp5] \n\t"
  72. "mult $ac0, %[temp3], %[temp4] \n\t"
  73. "maddu $ac0, %[temp8], %[temp7] \n\t"
  74. "mult $ac1, %[temp3], %[temp4] \n\t"
  75. "maddu $ac1, %[temp9], %[temp7] \n\t"
  76. "mult $ac2, %[temp3], %[temp4] \n\t"
  77. "maddu $ac2, %[temp10], %[temp7] \n\t"
  78. "mult $ac3, %[temp3], %[temp4] \n\t"
  79. "maddu $ac3, %[temp11], %[temp7] \n\t"
  80. "mfhi %[temp8], $ac0 \n\t"
  81. "mfhi %[temp9], $ac1 \n\t"
  82. "mfhi %[temp10], $ac2 \n\t"
  83. "mfhi %[temp11], $ac3 \n\t"
  84. "sw %[temp0], -16(%[irow]) \n\t"
  85. "sw %[temp1], -12(%[irow]) \n\t"
  86. "sw %[temp2], -8(%[irow]) \n\t"
  87. "sw %[temp5], -4(%[irow]) \n\t"
  88. "sb %[temp8], -4(%[dst]) \n\t"
  89. "sb %[temp9], -3(%[dst]) \n\t"
  90. "sb %[temp10], -2(%[dst]) \n\t"
  91. "sb %[temp11], -1(%[dst]) \n\t"
  92. "bne %[frow], %[loop_end], 1b \n\t"
  93. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
  94. [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
  95. [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
  96. [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
  97. [temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
  98. : [temp7]"r"(temp7), [yscale]"r"(yscale), [temp6]"r"(temp6)
  99. : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
  100. "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
  101. );
  102. }
  // Scalar tail: the last (x_out_max & 3) samples, continuing from where the
  // asm left frow, irow and dst. The output is clamped to 255 here.
  103. for (i = 0; i < (x_out_max & 0x3); ++i) {
  104. const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(*frow++, yscale);
  105. const int v = (int)MULT_FIX(*irow - frac, wrk->fxy_scale);
  106. *dst++ = (v > 255) ? 255u : (uint8_t)v;
  107. *irow++ = frac; // new fractional start
  108. }
  109. } else {
  // yscale == 0: frow is not read; each output comes from irow[x] alone and
  // irow[x] is then reset to zero.
  110. if (x_out_max >= 4) {
  // Four samples per iteration: hi(preload + irow[x] * temp7) is stored as a
  // byte to dst and zero is stored back to irow[x]. The preload is the same
  // 0x10000 * 0x8000 'mult' as in the branch above.
  // NOTE(review): this path applies the preload and has no clamp, whereas the
  // C tail loop below uses MULT_FIX_FLOOR and clamps to 255.
  111. __asm__ volatile (
  112. "li %[temp3], 0x10000 \n\t"
  113. "li %[temp4], 0x8000 \n\t"
  114. "addu %[loop_end], %[irow], %[temp6] \n\t"
  115. "1: \n\t"
  116. "lw %[temp0], 0(%[irow]) \n\t"
  117. "lw %[temp1], 4(%[irow]) \n\t"
  118. "lw %[temp2], 8(%[irow]) \n\t"
  119. "lw %[temp5], 12(%[irow]) \n\t"
  120. "addiu %[dst], %[dst], 4 \n\t"
  121. "addiu %[irow], %[irow], 16 \n\t"
  122. "mult $ac0, %[temp3], %[temp4] \n\t"
  123. "maddu $ac0, %[temp0], %[temp7] \n\t"
  124. "mult $ac1, %[temp3], %[temp4] \n\t"
  125. "maddu $ac1, %[temp1], %[temp7] \n\t"
  126. "mult $ac2, %[temp3], %[temp4] \n\t"
  127. "maddu $ac2, %[temp2], %[temp7] \n\t"
  128. "mult $ac3, %[temp3], %[temp4] \n\t"
  129. "maddu $ac3, %[temp5], %[temp7] \n\t"
  130. "mfhi %[temp0], $ac0 \n\t"
  131. "mfhi %[temp1], $ac1 \n\t"
  132. "mfhi %[temp2], $ac2 \n\t"
  133. "mfhi %[temp5], $ac3 \n\t"
  134. "sw $zero, -16(%[irow]) \n\t"
  135. "sw $zero, -12(%[irow]) \n\t"
  136. "sw $zero, -8(%[irow]) \n\t"
  137. "sw $zero, -4(%[irow]) \n\t"
  138. "sb %[temp0], -4(%[dst]) \n\t"
  139. "sb %[temp1], -3(%[dst]) \n\t"
  140. "sb %[temp2], -2(%[dst]) \n\t"
  141. "sb %[temp5], -1(%[dst]) \n\t"
  142. "bne %[irow], %[loop_end], 1b \n\t"
  143. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
  144. [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
  145. [dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
  146. : [temp7]"r"(temp7), [temp6]"r"(temp6)
  147. : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
  148. "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
  149. );
  150. }
  // Scalar tail for the last (x_out_max & 3) samples.
  151. for (i = 0; i < (x_out_max & 0x3); ++i) {
  152. const int v = (int)MULT_FIX_FLOOR(*irow, wrk->fxy_scale);
  153. *dst++ = (v > 255) ? 255u : (uint8_t)v;
  154. *irow++ = 0;
  155. }
  156. }
  157. }
  158. #endif // 0
  // Row export for the expanding case.
  // Writes x_out_max = dst_width * num_channels bytes through wrk->dst.
  // When y_accum is zero each byte comes from frow[x] scaled by fy_scale;
  // otherwise frow[x] and irow[x] are first blended with the weights A and B
  // computed below, and the blend is then scaled by fy_scale.
  // Groups of four samples go through the inline assembly; the remaining
  // (x_out_max & 3) samples go through the C loop that follows each asm block.
  // Only dst is stored to: the asm uses 'lw' on the rows and 'sb' on dst, and
  // the C loops only read frow and irow.
  159. static void ExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
  160. int i;
  161. uint8_t* dst = wrk->dst;
  162. rescaler_t* irow = wrk->irow;
  163. const int x_out_max = wrk->dst_width * wrk->num_channels;
  164. const rescaler_t* frow = wrk->frow;
  165. int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
  // temp6: byte offset at which the asm loops stop, i.e. the sample count
  // rounded down to a multiple of 4, times 4 (the asm walks the rows in
  // 4-byte words).
  // temp7: fy_scale as an int, the multiplier used for the final output.
  166. const int temp6 = (x_out_max & ~0x3) << 2;
  167. const int temp7 = (int)wrk->fy_scale;
  // Preconditions: output not finished, y_accum not positive, y_expand set,
  // and a non-zero y_sub.
  168. assert(!WebPRescalerOutputDone(wrk));
  169. assert(wrk->y_accum <= 0);
  170. assert(wrk->y_expand);
  171. assert(wrk->y_sub != 0);
  172. if (wrk->y_accum == 0) {
  // x_out_max >= 4 guarantees temp6 >= 16, so the asm loop body below runs at
  // least once before its 'bne' end test.
  173. if (x_out_max >= 4) {
  // Every 'mult' preloads an accumulator with 0x10000 * 0x8000 (0x80000000)
  // before 'maddu' adds frow[x] * temp7; 'mfhi' reads the upper word of the
  // sum, which is stored as a byte to dst. Presumably the preload equals
  // ROUNDER, i.e. WEBP_RESCALER_RFIX is 32 -- TODO confirm.
  // NOTE(review): the 'sb' stores are not preceded by any clamp to 255,
  // unlike the C tail loop below.
  174. __asm__ volatile (
  175. "li %[temp4], 0x10000 \n\t"
  176. "li %[temp5], 0x8000 \n\t"
  177. "addu %[loop_end], %[frow], %[temp6] \n\t"
  178. "1: \n\t"
  179. "lw %[temp0], 0(%[frow]) \n\t"
  180. "lw %[temp1], 4(%[frow]) \n\t"
  181. "lw %[temp2], 8(%[frow]) \n\t"
  182. "lw %[temp3], 12(%[frow]) \n\t"
  183. "addiu %[dst], %[dst], 4 \n\t"
  184. "addiu %[frow], %[frow], 16 \n\t"
  185. "mult $ac0, %[temp4], %[temp5] \n\t"
  186. "maddu $ac0, %[temp0], %[temp7] \n\t"
  187. "mult $ac1, %[temp4], %[temp5] \n\t"
  188. "maddu $ac1, %[temp1], %[temp7] \n\t"
  189. "mult $ac2, %[temp4], %[temp5] \n\t"
  190. "maddu $ac2, %[temp2], %[temp7] \n\t"
  191. "mult $ac3, %[temp4], %[temp5] \n\t"
  192. "maddu $ac3, %[temp3], %[temp7] \n\t"
  193. "mfhi %[temp0], $ac0 \n\t"
  194. "mfhi %[temp1], $ac1 \n\t"
  195. "mfhi %[temp2], $ac2 \n\t"
  196. "mfhi %[temp3], $ac3 \n\t"
  197. "sb %[temp0], -4(%[dst]) \n\t"
  198. "sb %[temp1], -3(%[dst]) \n\t"
  199. "sb %[temp2], -2(%[dst]) \n\t"
  200. "sb %[temp3], -1(%[dst]) \n\t"
  201. "bne %[frow], %[loop_end], 1b \n\t"
  202. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
  203. [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
  204. [dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
  205. : [temp7]"r"(temp7), [temp6]"r"(temp6)
  206. : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
  207. "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
  208. );
  209. }
  // Scalar tail: the last (x_out_max & 3) samples, continuing from where the
  // asm left frow and dst. The output is clamped to 255 here.
  210. for (i = 0; i < (x_out_max & 0x3); ++i) {
  211. const uint32_t J = *frow++;
  212. const int v = (int)MULT_FIX(J, wrk->fy_scale);
  213. *dst++ = (v > 255) ? 255u : (uint8_t)v;
  214. }
  215. } else {
  // Blend weights: B comes from WEBP_RESCALER_FRAC(-y_accum, y_sub) and
  // weights irow; A = WEBP_RESCALER_ONE - B and weights frow.
  216. const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
  217. const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
  218. if (x_out_max >= 4) {
  219. int temp8, temp9, temp10, temp11;
  // Two multiply stages per sample, each on an accumulator preloaded with
  // 0x10000 * 0x8000 by 'mult':
  //   J   = hi(preload + A * frow[x] + B * irow[x])
  //   out = hi(preload + J * temp7), stored as a byte to dst.
  // NOTE(review): as in the branch above, there is no clamp to 255 before the
  // 'sb' stores, unlike the C tail loop below.
  220. __asm__ volatile (
  221. "li %[temp8], 0x10000 \n\t"
  222. "li %[temp9], 0x8000 \n\t"
  223. "addu %[loop_end], %[frow], %[temp6] \n\t"
  224. "1: \n\t"
  225. "lw %[temp0], 0(%[frow]) \n\t"
  226. "lw %[temp1], 4(%[frow]) \n\t"
  227. "lw %[temp2], 8(%[frow]) \n\t"
  228. "lw %[temp3], 12(%[frow]) \n\t"
  229. "lw %[temp4], 0(%[irow]) \n\t"
  230. "lw %[temp5], 4(%[irow]) \n\t"
  231. "lw %[temp10], 8(%[irow]) \n\t"
  232. "lw %[temp11], 12(%[irow]) \n\t"
  233. "addiu %[dst], %[dst], 4 \n\t"
  234. "mult $ac0, %[temp8], %[temp9] \n\t"
  235. "maddu $ac0, %[A], %[temp0] \n\t"
  236. "maddu $ac0, %[B], %[temp4] \n\t"
  237. "mult $ac1, %[temp8], %[temp9] \n\t"
  238. "maddu $ac1, %[A], %[temp1] \n\t"
  239. "maddu $ac1, %[B], %[temp5] \n\t"
  240. "mult $ac2, %[temp8], %[temp9] \n\t"
  241. "maddu $ac2, %[A], %[temp2] \n\t"
  242. "maddu $ac2, %[B], %[temp10] \n\t"
  243. "mult $ac3, %[temp8], %[temp9] \n\t"
  244. "maddu $ac3, %[A], %[temp3] \n\t"
  245. "maddu $ac3, %[B], %[temp11] \n\t"
  246. "addiu %[frow], %[frow], 16 \n\t"
  247. "addiu %[irow], %[irow], 16 \n\t"
  248. "mfhi %[temp0], $ac0 \n\t"
  249. "mfhi %[temp1], $ac1 \n\t"
  250. "mfhi %[temp2], $ac2 \n\t"
  251. "mfhi %[temp3], $ac3 \n\t"
  252. "mult $ac0, %[temp8], %[temp9] \n\t"
  253. "maddu $ac0, %[temp0], %[temp7] \n\t"
  254. "mult $ac1, %[temp8], %[temp9] \n\t"
  255. "maddu $ac1, %[temp1], %[temp7] \n\t"
  256. "mult $ac2, %[temp8], %[temp9] \n\t"
  257. "maddu $ac2, %[temp2], %[temp7] \n\t"
  258. "mult $ac3, %[temp8], %[temp9] \n\t"
  259. "maddu $ac3, %[temp3], %[temp7] \n\t"
  260. "mfhi %[temp0], $ac0 \n\t"
  261. "mfhi %[temp1], $ac1 \n\t"
  262. "mfhi %[temp2], $ac2 \n\t"
  263. "mfhi %[temp3], $ac3 \n\t"
  264. "sb %[temp0], -4(%[dst]) \n\t"
  265. "sb %[temp1], -3(%[dst]) \n\t"
  266. "sb %[temp2], -2(%[dst]) \n\t"
  267. "sb %[temp3], -1(%[dst]) \n\t"
  268. "bne %[frow], %[loop_end], 1b \n\t"
  269. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
  270. [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
  271. [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
  272. [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
  273. [temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
  274. : [temp7]"r"(temp7), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
  275. : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
  276. "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
  277. );
  278. }
  // Scalar tail for the last (x_out_max & 3) samples: 64-bit blend I, biased
  // by ROUNDER and shifted down to J, then scaled by fy_scale and clamped to
  // 255.
  279. for (i = 0; i < (x_out_max & 0x3); ++i) {
  280. const uint64_t I = (uint64_t)A * *frow++
  281. + (uint64_t)B * *irow++;
  282. const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
  283. const int v = (int)MULT_FIX(J, wrk->fy_scale);
  284. *dst++ = (v > 255) ? 255u : (uint8_t)v;
  285. }
  286. }
  287. }
  288. #undef MULT_FIX_FLOOR
  289. #undef MULT_FIX
  290. #undef ROUNDER
  291. //------------------------------------------------------------------------------
  292. // Entry point
  // Init hook for this file: assigns ExportRowExpand_MIPSdspR2 to
  // WebPRescalerExportRowExpand (declared elsewhere). Takes no arguments and
  // returns nothing.
  293. extern void WebPRescalerDspInitMIPSdspR2(void);
  294. WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) {
  295. WebPRescalerExportRowExpand = ExportRowExpand_MIPSdspR2;
  // The shrink hook is left commented out: ExportRowShrink_MIPSdspR2 is
  // compiled out by the '#if 0' around its definition in this file.
  296. // WebPRescalerExportRowShrink = ExportRowShrink_MIPSdspR2;
  297. }
  298. #else // !WEBP_USE_MIPS_DSP_R2
  // Taken whenever the '#if' near the top of the file is false, i.e. when
  // WEBP_USE_MIPS_DSP_R2 is not defined or WEBP_REDUCE_SIZE is defined.
  // WEBP_DSP_INIT_STUB is defined elsewhere; presumably it provides an empty
  // WebPRescalerDspInitMIPSdspR2 so the symbol still exists -- TODO confirm.
  299. WEBP_DSP_INIT_STUB(WebPRescalerDspInitMIPSdspR2)
  300. #endif // WEBP_USE_MIPS_DSP_R2