lossless_enc_mips32.c

// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of lossless functions
//
// Author(s): Djordje Pesut    (djordje.pesut@imgtec.com)
//            Jovan Zelincevic (jovan.zelincevic@imgtec.com)

#include "./dsp.h"
#include "./lossless.h"
#include "./lossless_common.h"

#if defined(WEBP_USE_MIPS32)

#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

static float FastSLog2Slow_MIPS32(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y, correction;
    const int c24 = 24;
    const float v_f = (float)v;
    uint32_t temp;

    // Xf = 256 = 2^8
    // log_cnt is index of leading one in upper 24 bits
    __asm__ volatile(
      "clz   %[log_cnt], %[v]                   \n\t"
      "addiu %[y],       $zero,      1          \n\t"
      "subu  %[log_cnt], %[c24],     %[log_cnt] \n\t"
      "sllv  %[y],       %[y],       %[log_cnt] \n\t"
      "srlv  %[temp],    %[v],       %[log_cnt] \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
    // Xf = floor(Xf) * (1 + (v % y) / v)
    // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
    // The correction factor: log(1 + d) ~ d; for very small d values, so
    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
    // LOG_2_RECIPROCAL ~ 23/16
    // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
    correction = (23 * (v & (y - 1))) >> 4;
    return v_f * (kLog2Table[temp] + log_cnt) + correction;
  } else {
    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
  }
}

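// For reference (added sketch, not part of the original file): the inline
// assembly above, which is reused by FastLog2Slow_MIPS32 below, is just a
// leading-one decomposition of 'v'.  A portable equivalent, assuming a
// compiler with __builtin_clz and the kLog2Table / LOG_2_RECIPROCAL
// definitions from lossless_common.h, could look like this:
#if 0
static float FastSLog2Slow_PortableSketch(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    const int log_cnt = 24 - __builtin_clz(v);   // so that (v >> log_cnt) < 256
    const uint32_t y = 1u << log_cnt;            // y = 2^log_cnt
    const uint32_t temp = v >> log_cnt;          // table index in [128, 255]
    // 23 / 16 ~ 1 / log(2); (v & (y - 1)) == v % y
    const uint32_t correction = (23 * (v & (y - 1))) >> 4;
    return (float)v * (kLog2Table[temp] + log_cnt) + correction;
  } else {
    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
  }
}
#endif
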
static float FastLog2Slow_MIPS32(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y;
    const int c24 = 24;
    double log_2;
    uint32_t temp;

    __asm__ volatile(
      "clz   %[log_cnt], %[v]                   \n\t"
      "addiu %[y],       $zero,      1          \n\t"
      "subu  %[log_cnt], %[c24],     %[log_cnt] \n\t"
      "sllv  %[y],       %[y],       %[log_cnt] \n\t"
      "srlv  %[temp],    %[v],       %[log_cnt] \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    log_2 = kLog2Table[temp] + log_cnt;
    if (v >= APPROX_LOG_MAX) {
      // Since the division is still expensive, add this correction factor
      // only for large values of 'v'.
      const uint32_t correction = (23 * (v & (y - 1))) >> 4;
      log_2 += (double)correction / v;
    }
    return (float)log_2;
  } else {
    return (float)(LOG_2_RECIPROCAL * log((double)v));
  }
}

// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pop = &population[4];
//   const uint32_t* LoopEnd = &population[length];
//   while (pop != LoopEnd) {
//     ++i;
//     cost += i * *pop;
//     cost += i * *(pop + 1);
//     pop += 2;
//   }
//   return (double)cost;
static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
  int i, temp0, temp1;
  const uint32_t* pop = &population[4];
  const uint32_t* const LoopEnd = &population[length];

  __asm__ volatile(
    "mult  $zero,    $zero                  \n\t"
    "xor   %[i],     %[i],       %[i]       \n\t"
    "beq   %[pop],   %[LoopEnd], 2f         \n\t"
  "1:                                       \n\t"
    "lw    %[temp0], 0(%[pop])              \n\t"
    "lw    %[temp1], 4(%[pop])              \n\t"
    "addiu %[i],     %[i],       1          \n\t"
    "addiu %[pop],   %[pop],     8          \n\t"
    "madd  %[i],     %[temp0]               \n\t"
    "madd  %[i],     %[temp1]               \n\t"
    "bne   %[pop],   %[LoopEnd], 1b         \n\t"
  "2:                                       \n\t"
    "mfhi  %[temp0]                         \n\t"
    "mflo  %[temp1]                         \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [i]"=&r"(i), [pop]"+r"(pop)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );
  return (double)((int64_t)temp0 << 32 | temp1);
}

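// Note (added for clarity, not part of the original file): 'madd' accumulates
// each i * pop[k] product into the 64-bit HI/LO register pair, so after the
// loop 'mfhi'/'mflo' hold the high and low 32 bits of the running cost.  The
// return statement above (and the one in ExtraCostCombined_MIPS32 below)
// rebuilds the 64-bit value as ((int64_t)hi << 32) | lo before converting it
// to double.
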
// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pX = &X[4];
//   const uint32_t* pY = &Y[4];
//   const uint32_t* LoopEnd = &X[length];
//   while (pX != LoopEnd) {
//     const uint32_t xy0 = *pX + *pY;
//     const uint32_t xy1 = *(pX + 1) + *(pY + 1);
//     ++i;
//     cost += i * xy0;
//     cost += i * xy1;
//     pX += 2;
//     pY += 2;
//   }
//   return (double)cost;
static double ExtraCostCombined_MIPS32(const uint32_t* const X,
                                       const uint32_t* const Y, int length) {
  int i, temp0, temp1, temp2, temp3;
  const uint32_t* pX = &X[4];
  const uint32_t* pY = &Y[4];
  const uint32_t* const LoopEnd = &X[length];

  __asm__ volatile(
    "mult  $zero,    $zero                  \n\t"
    "xor   %[i],     %[i],       %[i]       \n\t"
    "beq   %[pX],    %[LoopEnd], 2f         \n\t"
  "1:                                       \n\t"
    "lw    %[temp0], 0(%[pX])               \n\t"
    "lw    %[temp1], 0(%[pY])               \n\t"
    "lw    %[temp2], 4(%[pX])               \n\t"
    "lw    %[temp3], 4(%[pY])               \n\t"
    "addiu %[i],     %[i],       1          \n\t"
    "addu  %[temp0], %[temp0],   %[temp1]   \n\t"
    "addu  %[temp2], %[temp2],   %[temp3]   \n\t"
    "addiu %[pX],    %[pX],      8          \n\t"
    "addiu %[pY],    %[pY],      8          \n\t"
    "madd  %[i],     %[temp0]               \n\t"
    "madd  %[i],     %[temp2]               \n\t"
    "bne   %[pX],    %[LoopEnd], 1b         \n\t"
  "2:                                       \n\t"
    "mfhi  %[temp0]                         \n\t"
    "mflo  %[temp1]                         \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
      [i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );
  return (double)((int64_t)temp0 << 32 | temp1);
}

#define HUFFMAN_COST_PASS                                   \
  __asm__ volatile(                                         \
    "sll   %[temp1],  %[temp0],    3           \n\t"        \
    "addiu %[temp3],  %[streak],   -3          \n\t"        \
    "addu  %[temp2],  %[pstreaks], %[temp1]    \n\t"        \
    "blez  %[temp3],  1f                       \n\t"        \
    "srl   %[temp1],  %[temp1],    1           \n\t"        \
    "addu  %[temp3],  %[pcnts],    %[temp1]    \n\t"        \
    "lw    %[temp0],  4(%[temp2])              \n\t"        \
    "lw    %[temp1],  0(%[temp3])              \n\t"        \
    "addu  %[temp0],  %[temp0],    %[streak]   \n\t"        \
    "addiu %[temp1],  %[temp1],    1           \n\t"        \
    "sw    %[temp0],  4(%[temp2])              \n\t"        \
    "sw    %[temp1],  0(%[temp3])              \n\t"        \
    "b     2f                                  \n\t"        \
  "1:                                          \n\t"        \
    "lw    %[temp0],  0(%[temp2])              \n\t"        \
    "addu  %[temp0],  %[temp0],    %[streak]   \n\t"        \
    "sw    %[temp0],  0(%[temp2])              \n\t"        \
  "2:                                          \n\t"        \
    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),             \
      [temp3]"=&r"(temp3), [temp0]"+r"(temp0)               \
    : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts),           \
      [streak]"r"(streak)                                   \
    : "memory"                                              \
  );

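// For reference (added sketch, not in the original file): assuming the usual
// VP8LStreaks layout (int counts[2]; int streaks[2][2];) and with
// b == (*val_prev != 0) passed in via temp0, the macro above is equivalent to:
//   if (streak > 3) {
//     stats->streaks[b][1] += streak;   // long streak (length > 3)
//     stats->counts[b] += 1;
//   } else {
//     stats->streaks[b][0] += streak;   // short streak
//   }
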
// Returns the various RLE counts
static WEBP_INLINE void GetEntropyUnrefinedHelper(
    uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
  int* const pstreaks = &stats->streaks[0][0];
  int* const pcnts = &stats->counts[0];
  int temp0, temp1, temp2, temp3;
  const int streak = i - *i_prev;

  // Gather info for the bit entropy.
  if (*val_prev != 0) {
    bit_entropy->sum += (*val_prev) * streak;
    bit_entropy->nonzeros += streak;
    bit_entropy->nonzero_code = *i_prev;
    bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
    if (bit_entropy->max_val < *val_prev) {
      bit_entropy->max_val = *val_prev;
    }
  }

  // Gather info for the Huffman cost.
  temp0 = (*val_prev != 0);
  HUFFMAN_COST_PASS

  *val_prev = val;
  *i_prev = i;
}

static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
                                       VP8LBitEntropy* const bit_entropy,
                                       VP8LStreaks* const stats) {
  int i;
  int i_prev = 0;
  uint32_t x_prev = X[0];

  memset(stats, 0, sizeof(*stats));
  VP8LBitEntropyInit(bit_entropy);

  for (i = 1; i < length; ++i) {
    const uint32_t x = X[i];
    if (x != x_prev) {
      GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
    }
  }
  GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);

  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}

static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
                                               const uint32_t Y[],
                                               int length,
                                               VP8LBitEntropy* const entropy,
                                               VP8LStreaks* const stats) {
  int i = 1;
  int i_prev = 0;
  uint32_t xy_prev = X[0] + Y[0];

  memset(stats, 0, sizeof(*stats));
  VP8LBitEntropyInit(entropy);

  for (i = 1; i < length; ++i) {
    const uint32_t xy = X[i] + Y[i];
    if (xy != xy_prev) {
      GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats);
    }
  }
  GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);

  entropy->entropy += VP8LFastSLog2(entropy->sum);
}

#define ASM_START                                           \
  __asm__ volatile(                                         \
    ".set push                                 \n\t"        \
    ".set at                                   \n\t"        \
    ".set macro                                \n\t"        \
  "1:                                          \n\t"

// P2 = P0 + P1
// A..D - offsets
// E - temp variable to tell macro
//     if pointer should be incremented
// literal_ and successive histograms could be unaligned
// so we must use ulw and usw
#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2)               \
    "ulw   %[temp0],   " #A "(%[" #P0 "])      \n\t"        \
    "ulw   %[temp1],   " #B "(%[" #P0 "])      \n\t"        \
    "ulw   %[temp2],   " #C "(%[" #P0 "])      \n\t"        \
    "ulw   %[temp3],   " #D "(%[" #P0 "])      \n\t"        \
    "ulw   %[temp4],   " #A "(%[" #P1 "])      \n\t"        \
    "ulw   %[temp5],   " #B "(%[" #P1 "])      \n\t"        \
    "ulw   %[temp6],   " #C "(%[" #P1 "])      \n\t"        \
    "ulw   %[temp7],   " #D "(%[" #P1 "])      \n\t"        \
    "addu  %[temp4],   %[temp4],   %[temp0]    \n\t"        \
    "addu  %[temp5],   %[temp5],   %[temp1]    \n\t"        \
    "addu  %[temp6],   %[temp6],   %[temp2]    \n\t"        \
    "addu  %[temp7],   %[temp7],   %[temp3]    \n\t"        \
    "addiu %[" #P0 "], %[" #P0 "], 16          \n\t"        \
  ".if " #E " == 1                             \n\t"        \
    "addiu %[" #P1 "], %[" #P1 "], 16          \n\t"        \
  ".endif                                      \n\t"        \
    "usw   %[temp4],   " #A "(%[" #P2 "])      \n\t"        \
    "usw   %[temp5],   " #B "(%[" #P2 "])      \n\t"        \
    "usw   %[temp6],   " #C "(%[" #P2 "])      \n\t"        \
    "usw   %[temp7],   " #D "(%[" #P2 "])      \n\t"        \
    "addiu %[" #P2 "], %[" #P2 "], 16          \n\t"        \
    "bne   %[" #P0 "], %[LoopEnd], 1b          \n\t"        \
    ".set pop                                  \n\t"

#define ASM_END_COMMON_0                                    \
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),             \
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),             \
      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),             \
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),             \
      [pa]"+r"(pa), [pout]"+r"(pout)

#define ASM_END_COMMON_1                                    \
    : [LoopEnd]"r"(LoopEnd)                                 \
    : "memory", "at"                                        \
  );

#define ASM_END_0                                           \
    ASM_END_COMMON_0                                        \
    , [pb]"+r"(pb)                                          \
    ASM_END_COMMON_1

#define ASM_END_1                                           \
    ASM_END_COMMON_0                                        \
    ASM_END_COMMON_1

static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb,
                             uint32_t* pout, int size) {
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  const uint32_t end = ((size) / 4) * 4;
  const uint32_t* const LoopEnd = pa + end;
  int i;
  ASM_START
  ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)
  ASM_END_0
  for (i = end; i < size; ++i) pout[i] = pa[i] + pb[i];
}

static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) {
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  const uint32_t end = ((size) / 4) * 4;
  const uint32_t* const LoopEnd = pa + end;
  int i;
  ASM_START
  ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)
  ASM_END_1
  for (i = end; i < size; ++i) pout[i] += pa[i];
}

#undef ASM_END_1
#undef ASM_END_0
#undef ASM_END_COMMON_1
#undef ASM_END_COMMON_0
#undef ADD_TO_OUT
#undef ASM_START

//------------------------------------------------------------------------------
// Entry point

extern void VP8LEncDspInitMIPS32(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
  VP8LFastSLog2Slow = FastSLog2Slow_MIPS32;
  VP8LFastLog2Slow = FastLog2Slow_MIPS32;
  VP8LExtraCost = ExtraCost_MIPS32;
  VP8LExtraCostCombined = ExtraCostCombined_MIPS32;
  VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32;
  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32;
  VP8LAddVector = AddVector_MIPS32;
  VP8LAddVectorEq = AddVectorEq_MIPS32;
}

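// Typical call site (sketch, not part of this file): the generic encoder dsp
// init is expected to install these overrides only when MIPS32 support is
// detected at runtime, e.g. something along the lines of:
//   #if defined(WEBP_USE_MIPS32)
//     if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kMIPS32)) {
//       VP8LEncDspInitMIPS32();
//     }
//   #endif
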
#else  // !WEBP_USE_MIPS32

WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32)

#endif  // WEBP_USE_MIPS32