/*
 * Loongson LSX optimized swscale
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Lu Wang
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/loongarch/loongson_asm.S"

/* void ff_hscale_8_to_15_lsx(SwsInternal *c, int16_t *dst, int dstW,
 *                            const uint8_t *src, const int16_t *filter,
 *                            const int32_t *filterPos, int filterSize)
 */
function ff_hscale_8_to_15_lsx
    addi.d          sp, sp, -72
    st.d            s0, sp, 0
    st.d            s1, sp, 8
    st.d            s2, sp, 16
    st.d            s3, sp, 24
    st.d            s4, sp, 32
    st.d            s5, sp, 40
    st.d            s6, sp, 48
    st.d            s7, sp, 56
    st.d            s8, sp, 64
    li.w            t0, 32767
    li.w            t8, 8
    li.w            t7, 4
    vldi            vr0, 0
    vreplgr2vr.w    vr20, t0
    beq             a6, t7, .LOOP_DSTW4
    beq             a6, t8, .LOOP_DSTW8
    blt             t8, a6, .LOOP_START
    b               .END_DSTW4
.LOOP_START:
    li.w            t1, 0
    li.w            s1, 0
    li.w            s2, 0
    li.w            s3, 0
    li.w            s4, 0
    li.w            s5, 0
    vldi            vr22, 0
    addi.w          s0, a6, -7
    slli.w          s7, a6, 1
    slli.w          s8, a6, 2
    add.w           t6, s7, s8
.LOOP_DSTW:
    ld.w            t2, a5, 0
    ld.w            t3, a5, 4
    ld.w            t4, a5, 8
    ld.w            t5, a5, 12
    fldx.d          f1, a3, t2
    fldx.d          f2, a3, t3
    fldx.d          f3, a3, t4
    fldx.d          f4, a3, t5
    vld             vr9, a4, 0
    vldx            vr10, a4, s7
    vldx            vr11, a4, s8
    vldx            vr12, a4, t6
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr2, vr0, vr2
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr4, vr0, vr4
    vdp2.w.h        vr17, vr1, vr9
    vdp2.w.h        vr18, vr2, vr10
    vdp2.w.h        vr19, vr3, vr11
    vdp2.w.h        vr21, vr4, vr12
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.d         vr1, vr3, vr1
    vadd.w          vr22, vr22, vr1
    addi.w          s1, s1, 8
    addi.d          a3, a3, 8
    addi.d          a4, a4, 16
    blt             s1, s0, .LOOP_DSTW
    blt             s1, a6, .DSTWA
    b               .END_FILTER
.DSTWA:
    ld.w            t2, a5, 0
    li.w            t3, 0
    move            s6, s1
.FILTERSIZEA:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s2, s2, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERSIZEA
    ld.w            t2, a5, 4
    li.w            t3, 0
    move            s6, s1
    addi.w          t1, t1, 1
.FILTERSIZEB:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s3, s3, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERSIZEB
    ld.w            t2, a5, 8
    addi.w          t1, t1, 1
    li.w            t3, 0
    move            s6, s1
.FILTERSIZEC:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s4, s4, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERSIZEC
    ld.w            t2, a5, 12
    addi.w          t1, t1, 1
    move            s6, s1
    li.w            t3, 0
.FILTERSIZED:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s5, s5, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERSIZED
.END_FILTER:
    vpickve2gr.w    t1, vr22, 0
    vpickve2gr.w    t2, vr22, 1
    vpickve2gr.w    t3, vr22, 2
    vpickve2gr.w    t4, vr22, 3
    add.w           s2, s2, t1
    add.w           s3, s3, t2
    add.w           s4, s4, t3
    add.w           s5, s5, t4
    srai.w          s2, s2, 7
    srai.w          s3, s3, 7
    srai.w          s4, s4, 7
    srai.w          s5, s5, 7
    slt             t1, s2, t0
    slt             t2, s3, t0
    slt             t3, s4, t0
    slt             t4, s5, t0
    maskeqz         s2, s2, t1
    maskeqz         s3, s3, t2
    maskeqz         s4, s4, t3
    maskeqz         s5, s5, t4
    masknez         t1, t0, t1
    masknez         t2, t0, t2
    masknez         t3, t0, t3
    masknez         t4, t0, t4
    or              s2, s2, t1
    or              s3, s3, t2
    or              s4, s4, t3
    or              s5, s5, t4
    st.h            s2, a1, 0
    st.h            s3, a1, 2
    st.h            s4, a1, 4
    st.h            s5, a1, 6
    addi.d          a1, a1, 8
    sub.d           a3, a3, s1
    addi.d          a5, a5, 16
    slli.d          t3, a6, 3
    add.d           a4, a4, t3
    sub.d           a4, a4, s1
    sub.d           a4, a4, s1
    addi.d          a2, a2, -4
    bge             a2, t7, .LOOP_START
    blt             zero, a2, .RES
    b               .END_LOOP
.RES:
    li.w            t1, 0
.DSTW:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTERSIZE:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTERSIZE
    srai.w          t8, t8, 7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DSTW
    b               .END_LOOP
.LOOP_DSTW8:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    fldx.d          f1, a3, t1
    fldx.d          f2, a3, t2
    fldx.d          f3, a3, t3
    fldx.d          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    fldx.d          f5, a3, t1
    fldx.d          f6, a3, t2
    fldx.d          f7, a3, t3
    fldx.d          f8, a3, t4
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vld             vr13, a4, 64
    vld             vr14, a4, 80
    vld             vr15, a4, 96
    vld             vr16, a4, 112
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr2, vr0, vr2
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr4, vr0, vr4
    vilvl.b         vr5, vr0, vr5
    vilvl.b         vr6, vr0, vr6
    vilvl.b         vr7, vr0, vr7
    vilvl.b         vr8, vr0, vr8
    vdp2.w.h        vr17, vr1, vr9
    vdp2.w.h        vr18, vr2, vr10
    vdp2.w.h        vr19, vr3, vr11
    vdp2.w.h        vr21, vr4, vr12
    vdp2.w.h        vr1, vr5, vr13
    vdp2.w.h        vr2, vr6, vr14
    vdp2.w.h        vr3, vr7, vr15
    vdp2.w.h        vr4, vr8, vr16
    vhaddw.d.w      vr5, vr1, vr1
    vhaddw.d.w      vr6, vr2, vr2
    vhaddw.d.w      vr7, vr3, vr3
    vhaddw.d.w      vr8, vr4, vr4
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vhaddw.q.d      vr5, vr5, vr5
    vhaddw.q.d      vr6, vr6, vr6
    vhaddw.q.d      vr7, vr7, vr7
    vhaddw.q.d      vr8, vr8, vr8
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.d         vr1, vr3, vr1
    vilvl.d         vr5, vr7, vr5
    vsrai.w         vr1, vr1, 7
    vsrai.w         vr5, vr5, 7
    vmin.w          vr1, vr1, vr20
    vmin.w          vr5, vr5, vr20
    vpickev.h       vr1, vr5, vr1
    vst             vr1, a1, 0
    addi.d          a1, a1, 16
    addi.d          a5, a5, 32
    addi.d          a4, a4, 128
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_DSTW8
    blt             zero, a2, .RES8
    b               .END_LOOP
.RES8:
    li.w            t1, 0
.DSTW8:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTERSIZE8:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTERSIZE8
    srai.w          t8, t8, 7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DSTW8
    b               .END_LOOP
.LOOP_DSTW4:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    fldx.s          f1, a3, t1
    fldx.s          f2, a3, t2
    fldx.s          f3, a3, t3
    fldx.s          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    fldx.s          f5, a3, t1
    fldx.s          f6, a3, t2
    fldx.s          f7, a3, t3
    fldx.s          f8, a3, t4
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr5, vr0, vr5
    vilvl.b         vr7, vr0, vr7
    vdp2.w.h        vr13, vr1, vr9
    vdp2.w.h        vr14, vr3, vr10
    vdp2.w.h        vr15, vr5, vr11
    vdp2.w.h        vr16, vr7, vr12
    vhaddw.d.w      vr13, vr13, vr13
    vhaddw.d.w      vr14, vr14, vr14
    vhaddw.d.w      vr15, vr15, vr15
    vhaddw.d.w      vr16, vr16, vr16
    vpickev.w       vr13, vr14, vr13
    vpickev.w       vr15, vr16, vr15
    vsrai.w         vr13, vr13, 7
    vsrai.w         vr15, vr15, 7
    vmin.w          vr13, vr13, vr20
    vmin.w          vr15, vr15, vr20
    vpickev.h       vr13, vr15, vr13
    vst             vr13, a1, 0
    addi.d          a1, a1, 16
    addi.d          a5, a5, 32
    addi.d          a4, a4, 64
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_DSTW4
    blt             zero, a2, .RES4
    b               .END_LOOP
.RES4:
    li.w            t1, 0
.DSTW4:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTERSIZE4:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTERSIZE4
    srai.w          t8, t8, 7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DSTW4
    b               .END_LOOP
.END_DSTW4:
    li.w            t1, 0
.LOOP_DSTW1:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTERSIZE1:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTERSIZE1
    srai.w          t8, t8, 7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .LOOP_DSTW1
    b               .END_LOOP
.END_LOOP:
    ld.d            s0, sp, 0
    ld.d            s1, sp, 8
    ld.d            s2, sp, 16
    ld.d            s3, sp, 24
    ld.d            s4, sp, 32
    ld.d            s5, sp, 40
    ld.d            s6, sp, 48
    ld.d            s7, sp, 56
    ld.d            s8, sp, 64
    addi.d          sp, sp, 72
endfunc

/* void ff_hscale_8_to_19_lsx(SwsInternal *c, int16_t *dst, int dstW,
 *                            const uint8_t *src, const int16_t *filter,
 *                            const int32_t *filterPos, int filterSize)
 */
function ff_hscale_8_to_19_lsx
    addi.d          sp, sp, -72
    st.d            s0, sp, 0
    st.d            s1, sp, 8
    st.d            s2, sp, 16
    st.d            s3, sp, 24
    st.d            s4, sp, 32
    st.d            s5, sp, 40
    st.d            s6, sp, 48
    st.d            s7, sp, 56
    st.d            s8, sp, 64
    li.w            t0, 524287
    li.w            t8, 8
    li.w            t7, 4
    vldi            vr0, 0
    vreplgr2vr.w    vr20, t0
    beq             a6, t7, .LOOP_DST4
    beq             a6, t8, .LOOP_DST8
    blt             t8, a6, .LOOP
    b               .END_DST4
.LOOP:
    li.w            t1, 0
    li.w            s1, 0
    li.w            s2, 0
    li.w            s3, 0
    li.w            s4, 0
    li.w            s5, 0
    vldi            vr22, 0
    addi.w          s0, a6, -7
    slli.w          s7, a6, 1
    slli.w          s8, a6, 2
    add.w           t6, s7, s8
.LOOP_DST:
    ld.w            t2, a5, 0
    ld.w            t3, a5, 4
    ld.w            t4, a5, 8
    ld.w            t5, a5, 12
    fldx.d          f1, a3, t2
    fldx.d          f2, a3, t3
    fldx.d          f3, a3, t4
    fldx.d          f4, a3, t5
    vld             vr9, a4, 0
    vldx            vr10, a4, s7
    vldx            vr11, a4, s8
    vldx            vr12, a4, t6
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr2, vr0, vr2
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr4, vr0, vr4
    vdp2.w.h        vr17, vr1, vr9
    vdp2.w.h        vr18, vr2, vr10
    vdp2.w.h        vr19, vr3, vr11
    vdp2.w.h        vr21, vr4, vr12
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.d         vr1, vr3, vr1
    vadd.w          vr22, vr22, vr1
    addi.w          s1, s1, 8
    addi.d          a3, a3, 8
    addi.d          a4, a4, 16
    blt             s1, s0, .LOOP_DST
    blt             s1, a6, .DSTA
    b               .END_FILTERA
.DSTA:
    ld.w            t2, a5, 0
    li.w            t3, 0
    move            s6, s1
.FILTERA:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s2, s2, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERA
    ld.w            t2, a5, 4
    li.w            t3, 0
    move            s6, s1
    addi.w          t1, t1, 1
.FILTERB:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
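    /* t6 is now the byte offset of the current coefficient halfword;
     * load it, multiply with the source byte in t5, and accumulate the
     * product into s3 (the second of the four scalar accumulators). */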
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s3, s3, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERB
    ld.w            t2, a5, 8
    addi.w          t1, t1, 1
    li.w            t3, 0
    move            s6, s1
.FILTERC:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s4, s4, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERC
    ld.w            t2, a5, 12
    addi.w          t1, t1, 1
    move            s6, s1
    li.w            t3, 0
.FILTERD:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s5, s5, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERD
.END_FILTERA:
    vpickve2gr.w    t1, vr22, 0
    vpickve2gr.w    t2, vr22, 1
    vpickve2gr.w    t3, vr22, 2
    vpickve2gr.w    t4, vr22, 3
    add.w           s2, s2, t1
    add.w           s3, s3, t2
    add.w           s4, s4, t3
    add.w           s5, s5, t4
    srai.w          s2, s2, 3
    srai.w          s3, s3, 3
    srai.w          s4, s4, 3
    srai.w          s5, s5, 3
    slt             t1, s2, t0
    slt             t2, s3, t0
    slt             t3, s4, t0
    slt             t4, s5, t0
    maskeqz         s2, s2, t1
    maskeqz         s3, s3, t2
    maskeqz         s4, s4, t3
    maskeqz         s5, s5, t4
    masknez         t1, t0, t1
    masknez         t2, t0, t2
    masknez         t3, t0, t3
    masknez         t4, t0, t4
    or              s2, s2, t1
    or              s3, s3, t2
    or              s4, s4, t3
    or              s5, s5, t4
    st.w            s2, a1, 0
    st.w            s3, a1, 4
    st.w            s4, a1, 8
    st.w            s5, a1, 12
    addi.d          a1, a1, 16
    sub.d           a3, a3, s1
    addi.d          a5, a5, 16
    slli.d          t3, a6, 3
    add.d           a4, a4, t3
    sub.d           a4, a4, s1
    sub.d           a4, a4, s1
    addi.d          a2, a2, -4
    bge             a2, t7, .LOOP
    blt             zero, a2, .RESA
    b               .END
.RESA:
    li.w            t1, 0
.DST:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTER:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTER
    srai.w          t8, t8, 3
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DST
    b               .END
.LOOP_DST8:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    fldx.d          f1, a3, t1
    fldx.d          f2, a3, t2
    fldx.d          f3, a3, t3
    fldx.d          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    fldx.d          f5, a3, t1
    fldx.d          f6, a3, t2
    fldx.d          f7, a3, t3
    fldx.d          f8, a3, t4
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vld             vr13, a4, 64
    vld             vr14, a4, 80
    vld             vr15, a4, 96
    vld             vr16, a4, 112
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr2, vr0, vr2
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr4, vr0, vr4
    vilvl.b         vr5, vr0, vr5
    vilvl.b         vr6, vr0, vr6
    vilvl.b         vr7, vr0, vr7
    vilvl.b         vr8, vr0, vr8
    vdp2.w.h        vr17, vr1, vr9
    vdp2.w.h        vr18, vr2, vr10
    vdp2.w.h        vr19, vr3, vr11
    vdp2.w.h        vr21, vr4, vr12
    vdp2.w.h        vr1, vr5, vr13
    vdp2.w.h        vr2, vr6, vr14
    vdp2.w.h        vr3, vr7, vr15
    vdp2.w.h        vr4, vr8, vr16
    vhaddw.d.w      vr5, vr1, vr1
    vhaddw.d.w      vr6, vr2, vr2
    vhaddw.d.w      vr7, vr3, vr3
    vhaddw.d.w      vr8, vr4, vr4
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vhaddw.q.d      vr5, vr5, vr5
    vhaddw.q.d      vr6, vr6, vr6
    vhaddw.q.d      vr7, vr7, vr7
    vhaddw.q.d      vr8, vr8, vr8
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.d         vr1, vr3, vr1
    vilvl.d         vr5, vr7, vr5
    vsrai.w         vr1, vr1, 3
    vsrai.w         vr5, vr5, 3
    vmin.w          vr1, vr1, vr20
    vmin.w          vr5, vr5, vr20
    vst             vr1, a1, 0
    vst             vr5, a1, 16
    addi.d          a1, a1, 32
    addi.d          a5, a5, 32
    addi.d          a4, a4, 128
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_DST8
    blt             zero, a2, .REST8
    b               .END
.REST8:
    li.w            t1, 0
.DST8:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTER8:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
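    /* Scalar remainder of the filterSize == 8 path: fetch the coefficient
     * halfword at byte offset t7 and accumulate coeff * src byte (t5)
     * into t8. */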
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTER8
    srai.w          t8, t8, 3
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DST8
    b               .END
.LOOP_DST4:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    fldx.s          f1, a3, t1
    fldx.s          f2, a3, t2
    fldx.s          f3, a3, t3
    fldx.s          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    fldx.s          f5, a3, t1
    fldx.s          f6, a3, t2
    fldx.s          f7, a3, t3
    fldx.s          f8, a3, t4
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr5, vr0, vr5
    vilvl.b         vr7, vr0, vr7
    vdp2.w.h        vr13, vr1, vr9
    vdp2.w.h        vr14, vr3, vr10
    vdp2.w.h        vr15, vr5, vr11
    vdp2.w.h        vr16, vr7, vr12
    vhaddw.d.w      vr13, vr13, vr13
    vhaddw.d.w      vr14, vr14, vr14
    vhaddw.d.w      vr15, vr15, vr15
    vhaddw.d.w      vr16, vr16, vr16
    vpickev.w       vr13, vr14, vr13
    vpickev.w       vr15, vr16, vr15
    vsrai.w         vr13, vr13, 3
    vsrai.w         vr15, vr15, 3
    vmin.w          vr13, vr13, vr20
    vmin.w          vr15, vr15, vr20
    vst             vr13, a1, 0
    vst             vr15, a1, 16
    addi.d          a1, a1, 32
    addi.d          a5, a5, 32
    addi.d          a4, a4, 64
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_DST4
    blt             zero, a2, .REST4
    b               .END
.REST4:
    li.w            t1, 0
.DST4:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTER4:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTER4
    srai.w          t8, t8, 3
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DST4
    b               .END
.END_DST4:
    li.w            t1, 0
.LOOP_DST1:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTER1:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTER1
    srai.w          t8, t8, 3
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .LOOP_DST1
    b               .END
.END:
    ld.d            s0, sp, 0
    ld.d            s1, sp, 8
    ld.d            s2, sp, 16
    ld.d            s3, sp, 24
    ld.d            s4, sp, 32
    ld.d            s5, sp, 40
    ld.d            s6, sp, 48
    ld.d            s7, sp, 56
    ld.d            s8, sp, 64
    addi.d          sp, sp, 72
endfunc

/* void ff_hscale_16_to_15_sub_lsx(SwsInternal *c, int16_t *dst, int dstW,
 *                                 const uint8_t *src, const int16_t *filter,
 *                                 const int32_t *filterPos, int filterSize, int sh)
 */
function ff_hscale_16_to_15_sub_lsx
    addi.d          sp, sp, -72
    st.d            s0, sp, 0
    st.d            s1, sp, 8
    st.d            s2, sp, 16
    st.d            s3, sp, 24
    st.d            s4, sp, 32
    st.d            s5, sp, 40
    st.d            s6, sp, 48
    st.d            s7, sp, 56
    st.d            s8, sp, 64
    li.w            t0, 32767
    li.w            t8, 8
    li.w            t7, 4
    vreplgr2vr.w    vr20, t0
    vreplgr2vr.w    vr0, a7
    beq             a6, t7, .LOOP_HS15_DST4
    beq             a6, t8, .LOOP_HS15_DST8
    blt             t8, a6, .LOOP_HS15
    b               .END_HS15_DST4
.LOOP_HS15:
    li.w            t1, 0
    li.w            s1, 0
    li.w            s2, 0
    li.w            s3, 0
    li.w            s4, 0
    li.w            s5, 0
    vldi            vr22, 0
    addi.w          s0, a6, -7
    slli.w          s7, a6, 1
    slli.w          s8, a6, 2
    add.w           t6, s7, s8
.LOOP_HS15_DST:
    ld.w            t2, a5, 0
    ld.w            t3, a5, 4
    ld.w            t4, a5, 8
    ld.w            t5, a5, 12
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    slli.w          t5, t5, 1
    vldx            vr1, a3, t2
    vldx            vr2, a3, t3
    vldx            vr3, a3, t4
    vldx            vr4, a3, t5
    vld             vr9, a4, 0
    vldx            vr10, a4, s7
    vldx            vr11, a4, s8
    vldx            vr12, a4, t6
    vmulwev.w.hu.h  vr17, vr1, vr9
    vmulwev.w.hu.h  vr18, vr2, vr10
    vmulwev.w.hu.h  vr19, vr3, vr11
    vmulwev.w.hu.h  vr21, vr4, vr12
    vmaddwod.w.hu.h vr17, vr1, vr9
    vmaddwod.w.hu.h vr18, vr2, vr10
    vmaddwod.w.hu.h vr19, vr3, vr11
    vmaddwod.w.hu.h vr21, vr4, vr12
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.d         vr1, vr3, vr1
    vadd.w          vr22, vr22, vr1
    addi.w          s1, s1, 8
    addi.d          a3, a3, 16
    addi.d          a4, a4, 16
    blt             s1, s0, .LOOP_HS15_DST
    blt             s1, a6, .HS15_DSTA
    b               .END_HS15_FILTERA
.HS15_DSTA:
    ld.w            t2, a5, 0
    li.w            t3, 0
    move            s6, s1
.HS15_FILTERA:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s2, s2, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .HS15_FILTERA
    ld.w            t2, a5, 4
    li.w            t3, 0
    move            s6, s1
    addi.w          t1, t1, 1
.HS15_FILTERB:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s3, s3, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .HS15_FILTERB
    ld.w            t2, a5, 8
    addi.w          t1, t1, 1
    li.w            t3, 0
    move            s6, s1
.HS15_FILTERC:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s4, s4, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .HS15_FILTERC
    ld.w            t2, a5, 12
    addi.w          t1, t1, 1
    move            s6, s1
    li.w            t3, 0
.HS15_FILTERD:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s5, s5, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .HS15_FILTERD
.END_HS15_FILTERA:
    vpickve2gr.w    t1, vr22, 0
    vpickve2gr.w    t2, vr22, 1
    vpickve2gr.w    t3, vr22, 2
    vpickve2gr.w    t4, vr22, 3
    add.w           s2, s2, t1
    add.w           s3, s3, t2
    add.w           s4, s4, t3
    add.w           s5, s5, t4
    sra.w           s2, s2, a7
    sra.w           s3, s3, a7
    sra.w           s4, s4, a7
    sra.w           s5, s5, a7
    slt             t1, s2, t0
    slt             t2, s3, t0
    slt             t3, s4, t0
    slt             t4, s5, t0
    maskeqz         s2, s2, t1
    maskeqz         s3, s3, t2
    maskeqz         s4, s4, t3
    maskeqz         s5, s5, t4
    masknez         t1, t0, t1
    masknez         t2, t0, t2
    masknez         t3, t0, t3
    masknez         t4, t0, t4
    or              s2, s2, t1
    or              s3, s3, t2
    or              s4, s4, t3
    or              s5, s5, t4
    st.h            s2, a1, 0
    st.h            s3, a1, 2
    st.h            s4, a1, 4
    st.h            s5, a1, 6
    addi.d          a1, a1, 8
    sub.d           a3, a3, s1
    sub.d           a3, a3, s1
    addi.d          a5, a5, 16
    slli.d          t3, a6, 3
    add.d           a4, a4, t3
    sub.d           a4, a4, s1
    sub.d           a4, a4, s1
    addi.d          a2, a2, -4
    bge             a2, t7, .LOOP_HS15
    blt             zero, a2, .HS15_RESA
    b               .HS15_END
.HS15_RESA:
    li.w            t1, 0
.HS15_DST:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.HS15_FILTER:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .HS15_FILTER
    sra.w           t8, t8, a7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .HS15_DST
    b               .HS15_END
.LOOP_HS15_DST8:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    slli.w          t1, t1, 1
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    vldx            vr1, a3, t1
    vldx            vr2, a3, t2
    vldx            vr3, a3, t3
    vldx            vr4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    slli.w          t1, t1, 1
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    vldx            vr5, a3, t1
    vldx            vr6, a3, t2
    vldx            vr7, a3, t3
    vldx            vr8, a3, t4
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vld             vr13, a4, 64
    vld             vr14, a4, 80
    vld             vr15, a4, 96
    vld             vr16, a4, 112
    vmulwev.w.hu.h  vr17, vr1, vr9
    vmulwev.w.hu.h  vr18, vr2, vr10
    vmulwev.w.hu.h  vr19, vr3, vr11
    vmulwev.w.hu.h  vr21, vr4, vr12
    vmaddwod.w.hu.h vr17, vr1, vr9
    vmaddwod.w.hu.h vr18, vr2, vr10
    vmaddwod.w.hu.h vr19, vr3, vr11
    vmaddwod.w.hu.h vr21, vr4, vr12
    vmulwev.w.hu.h  vr1, vr5, vr13
    vmulwev.w.hu.h  vr2, vr6, vr14
    vmulwev.w.hu.h  vr3, vr7, vr15
    vmulwev.w.hu.h  vr4, vr8, vr16
    vmaddwod.w.hu.h vr1, vr5, vr13
    vmaddwod.w.hu.h vr2, vr6, vr14
    vmaddwod.w.hu.h vr3, vr7, vr15
    vmaddwod.w.hu.h vr4, vr8, vr16
    vhaddw.d.w      vr5, vr1, vr1
    vhaddw.d.w      vr6, vr2, vr2
    vhaddw.d.w      vr7, vr3, vr3
    vhaddw.d.w      vr8, vr4, vr4
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vhaddw.q.d      vr5, vr5, vr5
    vhaddw.q.d      vr6, vr6, vr6
    vhaddw.q.d      vr7, vr7, vr7
    vhaddw.q.d      vr8, vr8, vr8
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.d         vr1, vr3, vr1
    vilvl.d         vr5, vr7, vr5
    vsra.w          vr1, vr1, vr0
    vsra.w          vr5, vr5, vr0
    vmin.w          vr1, vr1, vr20
    vmin.w          vr5, vr5, vr20
    vpickev.h       vr1, vr5, vr1
    vst             vr1, a1, 0
    addi.d          a1, a1, 16
    addi.d          a5, a5, 32
    addi.d          a4, a4, 128
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_HS15_DST8
    blt             zero, a2, .HS15_REST8
    b               .HS15_END
.HS15_REST8:
    li.w            t1, 0
.HS15_DST8:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.HS15_FILTER8:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .HS15_FILTER8
    sra.w           t8, t8, a7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .HS15_DST8
    b               .HS15_END
.LOOP_HS15_DST4:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    slli.w          t1, t1, 1
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    fldx.d          f1, a3, t1
    fldx.d          f2, a3, t2
    fldx.d          f3, a3, t3
    fldx.d          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    slli.w          t1, t1, 1
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    fldx.d          f5, a3, t1
    fldx.d          f6, a3, t2
    fldx.d          f7, a3, t3
    fldx.d          f8, a3, t4
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vilvl.d         vr1, vr2, vr1
    vilvl.d         vr3, vr4, vr3
    vilvl.d         vr5, vr6, vr5
    vilvl.d         vr7, vr8, vr7
    vmulwev.w.hu.h  vr13, vr1, vr9
    vmulwev.w.hu.h  vr14, vr3, vr10
    vmulwev.w.hu.h  vr15, vr5, vr11
    vmulwev.w.hu.h  vr16, vr7, vr12
    vmaddwod.w.hu.h vr13, vr1, vr9
    vmaddwod.w.hu.h vr14, vr3, vr10
    vmaddwod.w.hu.h vr15, vr5, vr11
    vmaddwod.w.hu.h vr16, vr7, vr12
    vhaddw.d.w      vr13, vr13, vr13
    vhaddw.d.w      vr14, vr14, vr14
    vhaddw.d.w      vr15, vr15, vr15
    vhaddw.d.w      vr16, vr16, vr16
    vpickev.w       vr13, vr14, vr13
    vpickev.w       vr15, vr16, vr15
    vsra.w          vr13, vr13, vr0
    vsra.w          vr15, vr15, vr0
    vmin.w          vr13, vr13, vr20
    vmin.w          vr15, vr15, vr20
    vpickev.h       vr13, vr15, vr13
    vst             vr13, a1, 0
    addi.d          a1, a1, 16
    addi.d          a5, a5, 32
    addi.d          a4, a4, 64
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_HS15_DST4
    blt             zero, a2, .HS15_REST4
    b               .HS15_END
.HS15_REST4:
    li.w            t1, 0
.HS15_DST4:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.HS15_FILTER4:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .HS15_FILTER4
    sra.w           t8, t8, a7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .HS15_DST4
    b               .HS15_END
.END_HS15_DST4:
    li.w            t1, 0
.LOOP_HS15_DST1:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.HS15_FILTER1:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .HS15_FILTER1
    sra.w           t8, t8, a7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .LOOP_HS15_DST1
    b               .HS15_END
.HS15_END:
    ld.d            s0, sp, 0
    ld.d            s1, sp, 8
    ld.d            s2, sp, 16
    ld.d            s3, sp, 24
    ld.d            s4, sp, 32
    ld.d            s5, sp, 40
    ld.d            s6, sp, 48
    ld.d            s7, sp, 56
    ld.d            s8, sp, 64
    addi.d          sp, sp, 72
endfunc

/* void ff_hscale_16_to_19_sub_lsx(SwsInternal *c, int16_t *dst, int dstW,
 *                                 const uint8_t *src, const int16_t *filter,
 *                                 const int32_t *filterPos, int filterSize, int sh)
 */
function ff_hscale_16_to_19_sub_lsx
    addi.d          sp, sp, -72
    st.d            s0, sp, 0
    st.d            s1, sp, 8
    st.d            s2, sp, 16
    st.d            s3, sp, 24
    st.d            s4, sp, 32
    st.d            s5, sp, 40
    st.d            s6, sp, 48
    st.d            s7, sp, 56
    st.d            s8, sp, 64
    li.w            t0, 524287
    li.w            t8, 8
    li.w            t7, 4
    vreplgr2vr.w    vr20, t0
    vreplgr2vr.w    vr0, a7
    beq             a6, t7, .LOOP_HS19_DST4
    beq             a6, t8, .LOOP_HS19_DST8
    blt             t8, a6, .LOOP_HS19
    b               .END_HS19_DST4
.LOOP_HS19:
    li.w            t1, 0
    li.w            s1, 0
    li.w            s2, 0
    li.w            s3, 0
    li.w            s4, 0
    li.w            s5, 0
    vldi            vr22, 0
    addi.w          s0, a6, -7
    slli.w          s7, a6, 1
    slli.w          s8, a6, 2
    add.w           t6, s7, s8
.LOOP_HS19_DST:
    ld.w            t2, a5, 0
    ld.w            t3, a5, 4
    ld.w            t4, a5, 8
    ld.w            t5, a5, 12
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    slli.w          t5, t5, 1
    vldx            vr1, a3, t2
    vldx            vr2, a3, t3
    vldx            vr3, a3, t4
    vldx            vr4, a3, t5
    vld             vr9, a4, 0
    vldx            vr10, a4, s7
    vldx            vr11, a4, s8
    vldx            vr12, a4, t6
    vmulwev.w.hu.h  vr17, vr1, vr9
    vmulwev.w.hu.h  vr18, vr2, vr10
    vmulwev.w.hu.h  vr19, vr3, vr11
    vmulwev.w.hu.h  vr21, vr4, vr12
    vmaddwod.w.hu.h vr17, vr1, vr9
    vmaddwod.w.hu.h vr18, vr2, vr10
    vmaddwod.w.hu.h vr19, vr3, vr11
    vmaddwod.w.hu.h vr21, vr4, vr12
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.d         vr1, vr3, vr1
    vadd.w          vr22, vr22, vr1
    addi.w          s1, s1, 8
    addi.d          a3, a3, 16
    addi.d          a4, a4, 16
    blt             s1, s0, .LOOP_HS19_DST
    blt             s1, a6, .HS19_DSTA
    b               .END_HS19_FILTERA
.HS19_DSTA:
    ld.w            t2, a5, 0
    li.w            t3, 0
    move            s6, s1
.HS19_FILTERA:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s2, s2, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .HS19_FILTERA
    ld.w            t2, a5, 4
    li.w            t3, 0
    move            s6, s1
    addi.w          t1, t1, 1
.HS19_FILTERB:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s3, s3, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .HS19_FILTERB
    ld.w            t2, a5, 8
    addi.w          t1, t1, 1
    li.w            t3, 0
    move            s6, s1
.HS19_FILTERC:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s4, s4, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .HS19_FILTERC
    ld.w            t2, a5, 12
    addi.w          t1, t1, 1
    move            s6, s1
    li.w            t3, 0
.HS19_FILTERD:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s5, s5, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .HS19_FILTERD
.END_HS19_FILTERA:
    vpickve2gr.w    t1, vr22, 0
    vpickve2gr.w    t2, vr22, 1
    vpickve2gr.w    t3, vr22, 2
    vpickve2gr.w    t4, vr22, 3
    add.w           s2, s2, t1
    add.w           s3, s3, t2
    add.w           s4, s4, t3
    add.w           s5, s5, t4
    sra.w           s2, s2, a7
    sra.w           s3, s3, a7
    sra.w           s4, s4, a7
    sra.w           s5, s5, a7
    slt             t1, s2, t0
    slt             t2, s3, t0
    slt             t3, s4, t0
    slt             t4, s5, t0
    maskeqz         s2, s2, t1
    maskeqz         s3, s3, t2
    maskeqz         s4, s4, t3
    maskeqz         s5, s5, t4
    masknez         t1, t0, t1
    masknez         t2, t0, t2
    masknez         t3, t0, t3
    masknez         t4, t0, t4
    or              s2, s2, t1
    or              s3, s3, t2
    or              s4, s4, t3
    or              s5, s5, t4
    st.w            s2, a1, 0
    st.w            s3, a1, 4
    st.w            s4, a1, 8
    st.w            s5, a1, 12
    addi.d          a1, a1, 16
    sub.d           a3, a3, s1
    sub.d           a3, a3, s1
    addi.d          a5, a5, 16
    slli.d          t3, a6, 3
    add.d           a4, a4, t3
    sub.d           a4, a4, s1
    sub.d           a4, a4, s1
    addi.d          a2, a2, -4
    bge             a2, t7, .LOOP_HS19
    blt             zero, a2, .HS19_RESA
    b               .HS19_END
.HS19_RESA:
    li.w            t1, 0
.HS19_DST:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.HS19_FILTER:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .HS19_FILTER
    sra.w           t8, t8, a7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .HS19_DST
    b               .HS19_END
.LOOP_HS19_DST8:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    slli.w          t1, t1, 1
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    vldx            vr1, a3, t1
    vldx            vr2, a3, t2
    vldx            vr3, a3, t3
    vldx            vr4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    slli.w          t1, t1, 1
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    vldx            vr5, a3, t1
    vldx            vr6, a3, t2
    vldx            vr7, a3, t3
    vldx            vr8, a3, t4
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vld             vr13, a4, 64
    vld             vr14, a4, 80
    vld             vr15, a4, 96
    vld             vr16, a4, 112
    vmulwev.w.hu.h  vr17, vr1, vr9
    vmulwev.w.hu.h  vr18, vr2, vr10
    vmulwev.w.hu.h  vr19, vr3, vr11
    vmulwev.w.hu.h  vr21, vr4, vr12
    vmaddwod.w.hu.h vr17, vr1, vr9
    vmaddwod.w.hu.h vr18, vr2, vr10
    vmaddwod.w.hu.h vr19, vr3, vr11
    vmaddwod.w.hu.h vr21, vr4, vr12
    vmulwev.w.hu.h  vr1, vr5, vr13
    vmulwev.w.hu.h  vr2, vr6, vr14
    vmulwev.w.hu.h  vr3, vr7, vr15
    vmulwev.w.hu.h  vr4, vr8, vr16
    vmaddwod.w.hu.h vr1, vr5, vr13
    vmaddwod.w.hu.h vr2, vr6, vr14
    vmaddwod.w.hu.h vr3, vr7, vr15
    vmaddwod.w.hu.h vr4, vr8, vr16
    vhaddw.d.w      vr5, vr1, vr1
    vhaddw.d.w      vr6, vr2, vr2
    vhaddw.d.w      vr7, vr3, vr3
    vhaddw.d.w      vr8, vr4, vr4
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vhaddw.q.d      vr5, vr5, vr5
    vhaddw.q.d      vr6, vr6, vr6
    vhaddw.q.d      vr7, vr7, vr7
    vhaddw.q.d      vr8, vr8, vr8
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.d         vr1, vr3, vr1
    vilvl.d         vr5, vr7, vr5
    vsra.w          vr1, vr1, vr0
    vsra.w          vr5, vr5, vr0
    vmin.w          vr1, vr1, vr20
    vmin.w          vr5, vr5, vr20
    vst             vr1, a1, 0
    vst             vr5, a1, 16
    addi.d          a1, a1, 32
    addi.d          a5, a5, 32
    addi.d          a4, a4, 128
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_HS19_DST8
    blt             zero, a2, .HS19_REST8
    b               .HS19_END
.HS19_REST8:
    li.w            t1, 0
.HS19_DST8:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.HS19_FILTER8:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .HS19_FILTER8
    sra.w           t8, t8, a7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .HS19_DST8
    b               .HS19_END
.LOOP_HS19_DST4:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    slli.w          t1, t1, 1
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    fldx.d          f1, a3, t1
    fldx.d          f2, a3, t2
    fldx.d          f3, a3, t3
    fldx.d          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    slli.w          t1, t1, 1
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    fldx.d          f5, a3, t1
    fldx.d          f6, a3, t2
    fldx.d          f7, a3, t3
    fldx.d          f8, a3, t4
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vilvl.d         vr1, vr2, vr1
    vilvl.d         vr3, vr4, vr3
    vilvl.d         vr5, vr6, vr5
    vilvl.d         vr7, vr8, vr7
    vmulwev.w.hu.h  vr13, vr1, vr9
    vmulwev.w.hu.h  vr14, vr3, vr10
    vmulwev.w.hu.h  vr15, vr5, vr11
    vmulwev.w.hu.h  vr16, vr7, vr12
    vmaddwod.w.hu.h vr13, vr1, vr9
    vmaddwod.w.hu.h vr14, vr3, vr10
    vmaddwod.w.hu.h vr15, vr5, vr11
    vmaddwod.w.hu.h vr16, vr7, vr12
    vhaddw.d.w      vr13, vr13, vr13
    vhaddw.d.w      vr14, vr14, vr14
    vhaddw.d.w      vr15, vr15, vr15
    vhaddw.d.w      vr16, vr16, vr16
    vpickev.w       vr13, vr14, vr13
    vpickev.w       vr15, vr16, vr15
    vsra.w          vr13, vr13, vr0
    vsra.w          vr15, vr15, vr0
    vmin.w          vr13, vr13, vr20
    vmin.w          vr15, vr15, vr20
    vst             vr13, a1, 0
    vst             vr15, a1, 16
    addi.d          a1, a1, 32
    addi.d          a5, a5, 32
    addi.d          a4, a4, 64
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_HS19_DST4
    blt             zero, a2, .HS19_REST4
    b               .HS19_END
.HS19_REST4:
    li.w            t1, 0
.HS19_DST4:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.HS19_FILTER4:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .HS19_FILTER4
    sra.w           t8, t8, a7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .HS19_DST4
    b               .HS19_END
.END_HS19_DST4:
    li.w            t1, 0
.LOOP_HS19_DST1:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.HS19_FILTER1:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .HS19_FILTER1
    sra.w           t8, t8, a7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .LOOP_HS19_DST1
    b               .HS19_END
.HS19_END:
    ld.d            s0, sp, 0
    ld.d            s1, sp, 8
    ld.d            s2, sp, 16
    ld.d            s3, sp, 24
    ld.d            s4, sp, 32
    ld.d            s5, sp, 40
    ld.d            s6, sp, 48
    ld.d            s7, sp, 56
    ld.d            s8, sp, 64
    addi.d          sp, sp, 72
endfunc

function lumRangeFromJpeg_lsx
    li.w            t0, 14071
    li.w            t1, 33561947
    vreplgr2vr.h    vr0, t0
    srli.w          t2, a1, 3
    andi            t3, a1, 7
    beqz            t2, 2f
1:
    vld             vr1, a0, 0
    vreplgr2vr.w    vr2, t1
    vreplgr2vr.w    vr3, t1
    vmaddwev.w.h    vr2, vr0, vr1
    vmaddwod.w.h    vr3, vr0, vr1
    vsrai.w         vr2, vr2, 14
    vsrai.w         vr3, vr3, 14
    vpackev.h       vr1, vr3, vr2
    vst             vr1, a0, 0
    addi.d          a0, a0, 16
    addi.d          t2, t2, -1
    bnez            t2, 1b
2:
    beqz            t3, 4f
3:
    ld.h            t4, a0, 0
    mul.w           t4, t4, t0
    add.w           t4, t4, t1
    srai.w          t4, t4, 14
    st.h            t4, a0, 0
    addi.d          a0, a0, 2
    addi.d          t3, t3, -1
    bnez            t3, 3b
4:
endfunc

function lumRangeFromJpeg_lasx
    li.w            t0, 14071
    li.w            t1, 33561947
    xvreplgr2vr.h   xr0, t0
    srli.w          t2, a1, 4
    andi            t3, a1, 15
    beqz            t2, 2f
1:
    xvld            xr1, a0, 0
    xvreplgr2vr.w   xr2, t1
    xvreplgr2vr.w   xr3, t1
    xvmaddwev.w.h   xr2, xr0, xr1
    xvmaddwod.w.h   xr3, xr0, xr1
    xvsrai.w        xr2, xr2, 14
    xvsrai.w        xr3, xr3, 14
    xvpackev.h      xr1, xr3, xr2
    xvst            xr1, a0, 0
    addi.d          a0, a0, 32
    addi.d          t2, t2, -1
    bnez            t2, 1b
2:
    beqz            t3, 4f
3:
    ld.h            t4, a0, 0
    mul.w           t4, t4, t0
    add.w           t4, t4, t1
    srai.w          t4, t4, 14
    st.h            t4, a0, 0
    addi.d          a0, a0, 2
    addi.d          t3, t3, -1
    bnez            t3, 3b
4:
endfunc

function lumRangeToJpeg_lsx
    li.w            t0, 19077
    li.w            t1, -39057361
    li.w            t2, 30189
    vreplgr2vr.h    vr0, t0
    vreplgr2vr.h    vr4, t2
    srli.w          t2, a1, 3
    andi            t3, a1, 7
    beqz            t2, 2f
1:
    vld             vr1, a0, 0
    vreplgr2vr.w    vr2, t1
    vreplgr2vr.w    vr3, t1
    vmin.h          vr1, vr1, vr4
    vmaddwev.w.h    vr2, vr0, vr1
    vmaddwod.w.h    vr3, vr0, vr1
    vsrai.w         vr2, vr2, 14
    vsrai.w         vr3, vr3, 14
    vpackev.h       vr1, vr3, vr2
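    /* Pack the shifted even/odd products back into eight int16 luma
     * samples and store them in place. */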
    vst             vr1, a0, 0
    addi.d          a0, a0, 16
    addi.d          t2, t2, -1
    bnez            t2, 1b
2:
    beqz            t3, 4f
3:
    ld.h            t4, a0, 0
    vreplgr2vr.h    vr1, t4
    vmin.h          vr1, vr1, vr4
    vpickve2gr.h    t4, vr1, 0
    mul.w           t4, t4, t0
    add.w           t4, t4, t1
    srai.w          t4, t4, 14
    st.h            t4, a0, 0
    addi.d          a0, a0, 2
    addi.d          t3, t3, -1
    bnez            t3, 3b
4:
endfunc

function lumRangeToJpeg_lasx
    li.w            t0, 19077
    li.w            t1, -39057361
    li.w            t2, 30189
    xvreplgr2vr.h   xr0, t0
    xvreplgr2vr.h   xr4, t2
    srli.w          t2, a1, 4
    andi            t3, a1, 15
    beqz            t2, 2f
1:
    xvld            xr1, a0, 0
    xvreplgr2vr.w   xr2, t1
    xvreplgr2vr.w   xr3, t1
    xvmin.h         xr1, xr1, xr4
    xvmaddwev.w.h   xr2, xr0, xr1
    xvmaddwod.w.h   xr3, xr0, xr1
    xvsrai.w        xr2, xr2, 14
    xvsrai.w        xr3, xr3, 14
    xvpackev.h      xr1, xr3, xr2
    xvst            xr1, a0, 0
    addi.d          a0, a0, 32
    addi.d          t2, t2, -1
    bnez            t2, 1b
2:
    beqz            t3, 4f
3:
    ld.h            t4, a0, 0
    vreplgr2vr.h    vr1, t4
    vmin.h          vr1, vr1, vr4
    vpickve2gr.h    t4, vr1, 0
    mul.w           t4, t4, t0
    add.w           t4, t4, t1
    srai.w          t4, t4, 14
    st.h            t4, a0, 0
    addi.d          a0, a0, 2
    addi.d          t3, t3, -1
    bnez            t3, 3b
4:
endfunc

function chrRangeFromJpeg_lsx
    li.w            t0, 1799
    li.w            t1, 4081085
    vreplgr2vr.h    vr0, t0
    srli.w          t2, a2, 3
    andi            t3, a2, 7
    beqz            t2, 2f
1:
    vld             vr1, a0, 0
    vld             vr2, a1, 0
    vreplgr2vr.w    vr3, t1
    vreplgr2vr.w    vr4, t1
    vreplgr2vr.w    vr5, t1
    vreplgr2vr.w    vr6, t1
    vmaddwev.w.h    vr3, vr0, vr1
    vmaddwod.w.h    vr4, vr0, vr1
    vmaddwev.w.h    vr5, vr0, vr2
    vmaddwod.w.h    vr6, vr0, vr2
    vsrai.w         vr3, vr3, 11
    vsrai.w         vr4, vr4, 11
    vsrai.w         vr5, vr5, 11
    vsrai.w         vr6, vr6, 11
    vpackev.h       vr1, vr4, vr3
    vpackev.h       vr2, vr6, vr5
    vst             vr1, a0, 0
    vst             vr2, a1, 0
    addi.d          a0, a0, 16
    addi.d          a1, a1, 16
    addi.d          t2, t2, -1
    bnez            t2, 1b
2:
    beqz            t3, 4f
3:
    ld.h            t4, a0, 0
    ld.h            t5, a1, 0
    mul.w           t4, t4, t0
    mul.w           t5, t5, t0
    add.w           t4, t4, t1
    add.w           t5, t5, t1
    srai.w          t4, t4, 11
    srai.w          t5, t5, 11
    st.h            t4, a0, 0
    st.h            t5, a1, 0
    addi.d          a0, a0, 2
    addi.d          a1, a1, 2
    addi.d          t3, t3, -1
    bnez            t3, 3b
4:
endfunc

function chrRangeFromJpeg_lasx
    li.w            t0, 1799
    li.w            t1, 4081085
    xvreplgr2vr.h   xr0, t0
    srli.w          t2, a2, 4
    andi            t3, a2, 15
    beqz            t2, 2f
1:
    xvld            xr1, a0, 0
    xvld            xr2, a1, 0
    xvreplgr2vr.w   xr3, t1
    xvreplgr2vr.w   xr4, t1
    xvreplgr2vr.w   xr5, t1
    xvreplgr2vr.w   xr6, t1
    xvmaddwev.w.h   xr3, xr0, xr1
    xvmaddwod.w.h   xr4, xr0, xr1
    xvmaddwev.w.h   xr5, xr0, xr2
    xvmaddwod.w.h   xr6, xr0, xr2
    xvsrai.w        xr3, xr3, 11
    xvsrai.w        xr4, xr4, 11
    xvsrai.w        xr5, xr5, 11
    xvsrai.w        xr6, xr6, 11
    xvpackev.h      xr1, xr4, xr3
    xvpackev.h      xr2, xr6, xr5
    xvst            xr1, a0, 0
    xvst            xr2, a1, 0
    addi.d          a0, a0, 32
    addi.d          a1, a1, 32
    addi.d          t2, t2, -1
    bnez            t2, 1b
2:
    beqz            t3, 4f
3:
    ld.h            t4, a0, 0
    ld.h            t5, a1, 0
    mul.w           t4, t4, t0
    mul.w           t5, t5, t0
    add.w           t4, t4, t1
    add.w           t5, t5, t1
    srai.w          t4, t4, 11
    srai.w          t5, t5, 11
    st.h            t4, a0, 0
    st.h            t5, a1, 0
    addi.d          a0, a0, 2
    addi.d          a1, a1, 2
    addi.d          t3, t3, -1
    bnez            t3, 3b
4:
endfunc

function chrRangeToJpeg_lsx
    li.w            t0, 4663
    li.w            t1, -9289992
    li.w            t2, 30775
    vreplgr2vr.h    vr0, t0
    vreplgr2vr.h    vr7, t2
    srli.w          t2, a2, 3
    andi            t3, a2, 7
    beqz            t2, 2f
1:
    vld             vr1, a0, 0
    vld             vr2, a1, 0
    vreplgr2vr.w    vr3, t1
    vreplgr2vr.w    vr4, t1
    vreplgr2vr.w    vr5, t1
    vreplgr2vr.w    vr6, t1
    vmin.h          vr1, vr1, vr7
    vmin.h          vr2, vr2, vr7
    vmaddwev.w.h    vr3, vr0, vr1
    vmaddwod.w.h    vr4, vr0, vr1
    vmaddwev.w.h    vr5, vr0, vr2
    vmaddwod.w.h    vr6, vr0, vr2
    vsrai.w         vr3, vr3, 12
    vsrai.w         vr4, vr4, 12
    vsrai.w         vr5, vr5, 12
    vsrai.w         vr6, vr6, 12
    vpackev.h       vr1, vr4, vr3
    vpackev.h       vr2, vr6, vr5
    vst             vr1, a0, 0
    vst             vr2, a1, 0
    addi.d          a0, a0, 16
    addi.d          a1, a1, 16
    addi.d          t2, t2, -1
    bnez            t2, 1b
2:
    beqz            t3, 4f
3:
    ld.h            t4, a0, 0
    ld.h            t5, a1, 0
    vreplgr2vr.h    vr1, t4
    vreplgr2vr.h    vr2, t5
    vmin.h          vr1, vr1, vr7
    vmin.h          vr2, vr2, vr7
    vpickve2gr.h    t4, vr1, 0
    vpickve2gr.h    t5, vr2, 0
    mul.w           t4, t4, t0
    mul.w           t5, t5, t0
    add.w           t4, t4, t1
    add.w           t5, t5, t1
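    /* Tail of the fixed-point multiply-add for the leftover chroma pair:
     * shift the sums back down and store one halfword to each plane. */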
    srai.w          t4, t4, 12
    srai.w          t5, t5, 12
    st.h            t4, a0, 0
    st.h            t5, a1, 0
    addi.d          a0, a0, 2
    addi.d          a1, a1, 2
    addi.d          t3, t3, -1
    bnez            t3, 3b
4:
endfunc

function chrRangeToJpeg_lasx
    li.w            t0, 4663
    li.w            t1, -9289992
    li.w            t2, 30775
    xvreplgr2vr.h   xr0, t0
    xvreplgr2vr.h   xr7, t2
    srli.w          t2, a2, 4
    andi            t3, a2, 15
    beqz            t2, 2f
1:
    xvld            xr1, a0, 0
    xvld            xr2, a1, 0
    xvreplgr2vr.w   xr3, t1
    xvreplgr2vr.w   xr4, t1
    xvreplgr2vr.w   xr5, t1
    xvreplgr2vr.w   xr6, t1
    xvmin.h         xr1, xr1, xr7
    xvmin.h         xr2, xr2, xr7
    xvmaddwev.w.h   xr3, xr0, xr1
    xvmaddwod.w.h   xr4, xr0, xr1
    xvmaddwev.w.h   xr5, xr0, xr2
    xvmaddwod.w.h   xr6, xr0, xr2
    xvsrai.w        xr3, xr3, 12
    xvsrai.w        xr4, xr4, 12
    xvsrai.w        xr5, xr5, 12
    xvsrai.w        xr6, xr6, 12
    xvpackev.h      xr1, xr4, xr3
    xvpackev.h      xr2, xr6, xr5
    xvst            xr1, a0, 0
    xvst            xr2, a1, 0
    addi.d          a0, a0, 32
    addi.d          a1, a1, 32
    addi.d          t2, t2, -1
    bnez            t2, 1b
2:
    beqz            t3, 4f
3:
    ld.h            t4, a0, 0
    ld.h            t5, a1, 0
    vreplgr2vr.h    vr1, t4
    vreplgr2vr.h    vr2, t5
    vmin.h          vr1, vr1, vr7
    vmin.h          vr2, vr2, vr7
    vpickve2gr.h    t4, vr1, 0
    vpickve2gr.h    t5, vr2, 0
    mul.w           t4, t4, t0
    mul.w           t5, t5, t0
    add.w           t4, t4, t1
    add.w           t5, t5, t1
    srai.w          t4, t4, 12
    srai.w          t5, t5, 12
    st.h            t4, a0, 0
    st.h            t5, a1, 0
    addi.d          a0, a0, 2
    addi.d          a1, a1, 2
    addi.d          t3, t3, -1
    bnez            t3, 3b
4:
endfunc
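
/* Reference for the range-conversion kernels above, as a scalar C sketch
 * (illustrative only, not the exact FFmpeg C implementation): every sample
 * is remapped in place with one fixed-point multiply-add, e.g. for
 * lumRangeFromJpeg
 *
 *     for (i = 0; i < width; i++)
 *         dst[i] = (dst[i] * 14071 + 33561947) >> 14;
 *
 * The *ToJpeg variants first clamp the input (vmin.h/xvmin.h against the
 * broadcast constant, 30189 for luma and 30775 for chroma) so the widened
 * 32-bit product cannot overflow.
 */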