/*
 * Loongson LSX optimized swscale
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
  23. #include "libavcodec/loongarch/loongson_asm.S"
  24. /* static void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
  25. * const int16_t **src, uint8_t *dest, int dstW,
  26. * const uint8_t *dither, int offset)
  27. */
  28. function ff_yuv2planeX_8_lsx
  29. addi.w t1, a6, 1
  30. addi.w t2, a6, 2
  31. addi.w t3, a6, 3
  32. addi.w t4, a6, 4
  33. addi.w t5, a6, 5
  34. addi.w t6, a6, 6
  35. addi.w t7, a6, 7
  36. andi t0, a6, 7
  37. andi t1, t1, 7
  38. andi t2, t2, 7
  39. andi t3, t3, 7
  40. andi t4, t4, 7
  41. andi t5, t5, 7
  42. andi t6, t6, 7
  43. andi t7, t7, 7
  44. ldx.bu t0, a5, t0
  45. ldx.bu t1, a5, t1
  46. ldx.bu t2, a5, t2
  47. ldx.bu t3, a5, t3
  48. ldx.bu t4, a5, t4
  49. ldx.bu t5, a5, t5
  50. ldx.bu t6, a5, t6
  51. ldx.bu t7, a5, t7
  52. vreplgr2vr.w vr0, t0
  53. vreplgr2vr.w vr1, t1
  54. vreplgr2vr.w vr2, t2
  55. vreplgr2vr.w vr3, t3
  56. vreplgr2vr.w vr4, t4
  57. vreplgr2vr.w vr5, t5
  58. vreplgr2vr.w vr6, t6
  59. vreplgr2vr.w vr7, t7
  60. vilvl.w vr0, vr2, vr0
  61. vilvl.w vr4, vr6, vr4
  62. vilvl.w vr1, vr3, vr1
  63. vilvl.w vr5, vr7, vr5
  64. vilvl.d vr12, vr4, vr0
  65. vilvl.d vr13, vr5, vr1
  66. li.w t5, 0
  67. li.w t8, 8
  68. bge a4, t8, .WIDTH8
  69. blt zero, a4, .WIDTH
  70. b .END
  71. .WIDTH8:
  72. li.d t1, 0
  73. li.d t4, 0
  74. vslli.w vr2, vr12, 12
  75. vslli.w vr3, vr13, 12
  76. move t3, a0
  77. .FILTERSIZE8:
  78. ldx.d t2, a2, t1
  79. vldx vr4, t2, t5
  80. vldrepl.h vr5, t3, 0
  81. vmaddwev.w.h vr2, vr4, vr5
  82. vmaddwod.w.h vr3, vr4, vr5
  83. addi.d t1, t1, 8
  84. addi.d t3, t3, 2
  85. addi.d t4, t4, 1
  86. blt t4, a1, .FILTERSIZE8
  87. vsrai.w vr2, vr2, 19
  88. vsrai.w vr3, vr3, 19
  89. vclip255.w vr2, vr2
  90. vclip255.w vr3, vr3
  91. vpickev.h vr2, vr3, vr2
  92. vpickev.b vr2, vr2, vr2
  93. vbsrl.v vr3, vr2, 4
  94. vilvl.b vr2, vr3, vr2
  95. fst.d f2, a3, 0
  96. addi.d t5, t5, 16
  97. addi.d a4, a4, -8
  98. addi.d a3, a3, 8
  99. bge a4, t8, .WIDTH8
  100. blt zero, a4, .WIDTH
  101. b .END
  102. .WIDTH:
  103. li.d t1, 0
  104. li.d t4, 0
  105. vslli.w vr2, vr12, 12
  106. vslli.w vr3, vr13, 12
  107. .FILTERSIZE:
  108. ldx.d t2, a2, t1
  109. vldx vr4, t2, t5
  110. vldrepl.h vr5, a0, 0
  111. vmaddwev.w.h vr2, vr4, vr5
  112. vmaddwod.w.h vr3, vr4, vr5
  113. addi.d t1, t1, 8
  114. addi.d a0, a0, 2
  115. addi.d t4, t4, 1
  116. blt t4, a1, .FILTERSIZE
  117. vsrai.w vr2, vr2, 19
  118. vsrai.w vr3, vr3, 19
  119. vclip255.w vr2, vr2
  120. vclip255.w vr3, vr3
  121. vpickev.h vr2, vr3, vr2
  122. vpickev.b vr2, vr2, vr2
  123. vbsrl.v vr3, vr2, 4
  124. vilvl.b vr2, vr3, vr2
  125. .DEST:
  126. vstelm.b vr2, a3, 0, 0
  127. vbsrl.v vr2, vr2, 1
  128. addi.d a4, a4, -1
  129. addi.d a3, a3, 1
  130. blt zero, a4, .DEST
  131. .END:
  132. endfunc