123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443 |
- // Copyright 2016 Google Inc. All Rights Reserved.
- //
- // Use of this source code is governed by a BSD-style license
- // that can be found in the COPYING file in the root of the source
- // tree. An additional intellectual property rights grant can be found
- // in the file PATENTS. All contributing project authors may
- // be found in the AUTHORS file in the root of the source tree.
- // -----------------------------------------------------------------------------
- //
- // MSA version of rescaling functions
- //
- // Author: Prashant Patil (prashant.patil@imgtec.com)
- #include "./dsp.h"
- #if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
- #include <assert.h>
- #include "../utils/rescaler_utils.h"
- #include "./msa_macro.h"
- #define ROUNDER (WEBP_RESCALER_ONE >> 1)
- #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
- #define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
- #define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do { \
- v4u32 tmp0, tmp1, tmp2, tmp3; \
- v16u8 t0, t1, t2, t3, t4, t5; \
- v2u64 out0, out1, out2, out3; \
- ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
- ILVRL_W2_UW(zero, in1, tmp2, tmp3); \
- DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
- DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \
- SRAR_D4_UD(out0, out1, out2, out3, shift); \
- PCKEV_B2_UB(out1, out0, out3, out2, t0, t1); \
- ILVRL_W2_UW(zero, in2, tmp0, tmp1); \
- ILVRL_W2_UW(zero, in3, tmp2, tmp3); \
- DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
- DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \
- SRAR_D4_UD(out0, out1, out2, out3, shift); \
- PCKEV_B2_UB(out1, out0, out3, out2, t2, t3); \
- PCKEV_B2_UB(t1, t0, t3, t2, t4, t5); \
- dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4); \
- } while (0)
- #define CALC_MULT_FIX_4(in0, scale, shift, dst) do { \
- v4u32 tmp0, tmp1; \
- v16i8 t0, t1; \
- v2u64 out0, out1; \
- ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
- DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
- SRAR_D2_UD(out0, out1, shift); \
- t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \
- t1 = __msa_pckev_b(t0, t0); \
- t0 = __msa_pckev_b(t1, t1); \
- dst = __msa_copy_s_w((v4i32)t0, 0); \
- } while (0)
- #define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift, \
- dst0, dst1, dst2, dst3) do { \
- v4u32 tmp0, tmp1, tmp2, tmp3; \
- v2u64 out0, out1, out2, out3; \
- ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
- ILVRL_W2_UW(zero, in1, tmp2, tmp3); \
- DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \
- DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \
- SRAR_D4_UD(out0, out1, out2, out3, shift); \
- PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1); \
- ILVRL_W2_UW(zero, in2, tmp0, tmp1); \
- ILVRL_W2_UW(zero, in3, tmp2, tmp3); \
- DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \
- DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \
- SRAR_D4_UD(out0, out1, out2, out3, shift); \
- PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3); \
- } while (0)
- #define CALC_MULT_FIX1_4(in0, scale, shift, dst) do { \
- v4u32 tmp0, tmp1; \
- v2u64 out0, out1; \
- ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
- DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
- SRAR_D2_UD(out0, out1, shift); \
- dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0); \
- } while (0)
- #define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift, \
- dst0, dst1) do { \
- v4u32 tmp0, tmp1, tmp2, tmp3; \
- v2u64 out0, out1, out2, out3; \
- ILVRL_W2_UW(in0, in2, tmp0, tmp1); \
- ILVRL_W2_UW(in1, in3, tmp2, tmp3); \
- DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \
- DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3); \
- SRAR_D4_UD(out0, out1, out2, out3, shift); \
- DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \
- DOTP_UW2_UD(out2, out3, scale, scale, out2, out3); \
- SRAR_D4_UD(out0, out1, out2, out3, shift); \
- PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1); \
- } while (0)
- #define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do { \
- v4u32 tmp0, tmp1; \
- v2u64 out0, out1; \
- v16i8 t0, t1; \
- ILVRL_W2_UW(in0, in1, tmp0, tmp1); \
- DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \
- SRAR_D2_UD(out0, out1, shift); \
- DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \
- SRAR_D2_UD(out0, out1, shift); \
- t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \
- t1 = __msa_pckev_b(t0, t0); \
- t0 = __msa_pckev_b(t1, t1); \
- dst = __msa_copy_s_w((v4i32)t0, 0); \
- } while (0)
- static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
- int length,
- WebPRescaler* const wrk) {
- const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
- const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
- const v4i32 zero = { 0 };
- while (length >= 16) {
- v4u32 src0, src1, src2, src3;
- v16u8 out;
- LD_UW4(frow, 4, src0, src1, src2, src3);
- CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
- ST_UB(out, dst);
- length -= 16;
- frow += 16;
- dst += 16;
- }
- if (length > 0) {
- int x_out;
- if (length >= 12) {
- uint32_t val0_m, val1_m, val2_m;
- v4u32 src0, src1, src2;
- LD_UW3(frow, 4, src0, src1, src2);
- CALC_MULT_FIX_4(src0, scale, shift, val0_m);
- CALC_MULT_FIX_4(src1, scale, shift, val1_m);
- CALC_MULT_FIX_4(src2, scale, shift, val2_m);
- SW3(val0_m, val1_m, val2_m, dst, 4);
- length -= 12;
- frow += 12;
- dst += 12;
- } else if (length >= 8) {
- uint32_t val0_m, val1_m;
- v4u32 src0, src1;
- LD_UW2(frow, 4, src0, src1);
- CALC_MULT_FIX_4(src0, scale, shift, val0_m);
- CALC_MULT_FIX_4(src1, scale, shift, val1_m);
- SW2(val0_m, val1_m, dst, 4);
- length -= 8;
- frow += 8;
- dst += 8;
- } else if (length >= 4) {
- uint32_t val0_m;
- const v4u32 src0 = LD_UW(frow);
- CALC_MULT_FIX_4(src0, scale, shift, val0_m);
- SW(val0_m, dst);
- length -= 4;
- frow += 4;
- dst += 4;
- }
- for (x_out = 0; x_out < length; ++x_out) {
- const uint32_t J = frow[x_out];
- const int v = (int)MULT_FIX(J, wrk->fy_scale);
- dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
- }
- }
- }
- static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
- uint8_t* dst, int length,
- WebPRescaler* const wrk) {
- const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
- const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
- const v4i32 B1 = __msa_fill_w(B);
- const v4i32 A1 = __msa_fill_w(A);
- const v4i32 AB = __msa_ilvr_w(A1, B1);
- const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
- const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
- while (length >= 16) {
- v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
- v16u8 t0, t1, t2, t3, t4, t5;
- LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
- LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
- CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
- CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
- PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
- t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
- ST_UB(t0, dst);
- frow += 16;
- irow += 16;
- dst += 16;
- length -= 16;
- }
- if (length > 0) {
- int x_out;
- if (length >= 12) {
- uint32_t val0_m, val1_m, val2_m;
- v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
- LD_UW3(frow, 4, frow0, frow1, frow2);
- LD_UW3(irow, 4, irow0, irow1, irow2);
- CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
- CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
- CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
- SW3(val0_m, val1_m, val2_m, dst, 4);
- frow += 12;
- irow += 12;
- dst += 12;
- length -= 12;
- } else if (length >= 8) {
- uint32_t val0_m, val1_m;
- v4u32 frow0, frow1, irow0, irow1;
- LD_UW2(frow, 4, frow0, frow1);
- LD_UW2(irow, 4, irow0, irow1);
- CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
- CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
- SW2(val0_m, val1_m, dst, 4);
- frow += 4;
- irow += 4;
- dst += 4;
- length -= 4;
- } else if (length >= 4) {
- uint32_t val0_m;
- const v4u32 frow0 = LD_UW(frow + 0);
- const v4u32 irow0 = LD_UW(irow + 0);
- CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
- SW(val0_m, dst);
- frow += 4;
- irow += 4;
- dst += 4;
- length -= 4;
- }
- for (x_out = 0; x_out < length; ++x_out) {
- const uint64_t I = (uint64_t)A * frow[x_out]
- + (uint64_t)B * irow[x_out];
- const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
- const int v = (int)MULT_FIX(J, wrk->fy_scale);
- dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
- }
- }
- }
- static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
- uint8_t* dst = wrk->dst;
- rescaler_t* irow = wrk->irow;
- const int x_out_max = wrk->dst_width * wrk->num_channels;
- const rescaler_t* frow = wrk->frow;
- assert(!WebPRescalerOutputDone(wrk));
- assert(wrk->y_accum <= 0);
- assert(wrk->y_expand);
- assert(wrk->y_sub != 0);
- if (wrk->y_accum == 0) {
- ExportRowExpand_0(frow, dst, x_out_max, wrk);
- } else {
- ExportRowExpand_1(frow, irow, dst, x_out_max, wrk);
- }
- }
- #if 0 // disabled for now. TODO(skal): make match the C-code
- static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
- uint8_t* dst, int length,
- const uint32_t yscale,
- WebPRescaler* const wrk) {
- const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
- const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
- const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
- const v4i32 zero = { 0 };
- while (length >= 16) {
- v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
- v16u8 out;
- LD_UW4(frow, 4, src0, src1, src2, src3);
- CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
- frac0, frac1, frac2, frac3);
- LD_UW4(irow, 4, src0, src1, src2, src3);
- SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
- src0, src1, src2, src3);
- CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
- ST_UB(out, dst);
- ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
- frow += 16;
- irow += 16;
- dst += 16;
- length -= 16;
- }
- if (length > 0) {
- int x_out;
- if (length >= 12) {
- uint32_t val0_m, val1_m, val2_m;
- v4u32 src0, src1, src2, frac0, frac1, frac2;
- LD_UW3(frow, 4, src0, src1, src2);
- CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
- CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
- CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
- LD_UW3(irow, 4, src0, src1, src2);
- SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
- CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
- CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
- CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
- SW3(val0_m, val1_m, val2_m, dst, 4);
- ST_UW3(frac0, frac1, frac2, irow, 4);
- frow += 12;
- irow += 12;
- dst += 12;
- length -= 12;
- } else if (length >= 8) {
- uint32_t val0_m, val1_m;
- v4u32 src0, src1, frac0, frac1;
- LD_UW2(frow, 4, src0, src1);
- CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
- CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
- LD_UW2(irow, 4, src0, src1);
- SUB2(src0, frac0, src1, frac1, src0, src1);
- CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
- CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
- SW2(val0_m, val1_m, dst, 4);
- ST_UW2(frac0, frac1, irow, 4);
- frow += 8;
- irow += 8;
- dst += 8;
- length -= 8;
- } else if (length >= 4) {
- uint32_t val0_m;
- v4u32 frac0;
- v4u32 src0 = LD_UW(frow);
- CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
- src0 = LD_UW(irow);
- src0 = src0 - frac0;
- CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
- SW(val0_m, dst);
- ST_UW(frac0, irow);
- frow += 4;
- irow += 4;
- dst += 4;
- length -= 4;
- }
- for (x_out = 0; x_out < length; ++x_out) {
- const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
- const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
- dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
- irow[x_out] = frac;
- }
- }
- }
- static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
- int length,
- WebPRescaler* const wrk) {
- const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
- const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
- const v4i32 zero = { 0 };
- while (length >= 16) {
- v4u32 src0, src1, src2, src3;
- v16u8 dst0;
- LD_UW4(irow, 4, src0, src1, src2, src3);
- CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
- ST_UB(dst0, dst);
- ST_SW4(zero, zero, zero, zero, irow, 4);
- length -= 16;
- irow += 16;
- dst += 16;
- }
- if (length > 0) {
- int x_out;
- if (length >= 12) {
- uint32_t val0_m, val1_m, val2_m;
- v4u32 src0, src1, src2;
- LD_UW3(irow, 4, src0, src1, src2);
- CALC_MULT_FIX_4(src0, scale, shift, val0_m);
- CALC_MULT_FIX_4(src1, scale, shift, val1_m);
- CALC_MULT_FIX_4(src2, scale, shift, val2_m);
- SW3(val0_m, val1_m, val2_m, dst, 4);
- ST_SW3(zero, zero, zero, irow, 4);
- length -= 12;
- irow += 12;
- dst += 12;
- } else if (length >= 8) {
- uint32_t val0_m, val1_m;
- v4u32 src0, src1;
- LD_UW2(irow, 4, src0, src1);
- CALC_MULT_FIX_4(src0, scale, shift, val0_m);
- CALC_MULT_FIX_4(src1, scale, shift, val1_m);
- SW2(val0_m, val1_m, dst, 4);
- ST_SW2(zero, zero, irow, 4);
- length -= 8;
- irow += 8;
- dst += 8;
- } else if (length >= 4) {
- uint32_t val0_m;
- const v4u32 src0 = LD_UW(irow + 0);
- CALC_MULT_FIX_4(src0, scale, shift, val0_m);
- SW(val0_m, dst);
- ST_SW(zero, irow);
- length -= 4;
- irow += 4;
- dst += 4;
- }
- for (x_out = 0; x_out < length; ++x_out) {
- const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
- dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
- irow[x_out] = 0;
- }
- }
- }
- static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
- uint8_t* dst = wrk->dst;
- rescaler_t* irow = wrk->irow;
- const int x_out_max = wrk->dst_width * wrk->num_channels;
- const rescaler_t* frow = wrk->frow;
- const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
- assert(!WebPRescalerOutputDone(wrk));
- assert(wrk->y_accum <= 0);
- assert(!wrk->y_expand);
- if (yscale) {
- ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk);
- } else {
- ExportRowShrink_1(irow, dst, x_out_max, wrk);
- }
- }
- #endif // 0
- //------------------------------------------------------------------------------
- // Entry point
- extern void WebPRescalerDspInitMSA(void);
- WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
- WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
- // WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
- }
- #else // !WEBP_USE_MSA
- WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
- #endif // WEBP_USE_MSA
|