input.S

/*
 * Loongson LSX optimized swscale
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/loongarch/loongson_asm.S"

/* void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4],
 *                          int width, int32_t *rgb2yuv)
 */
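/* Reference sketch (comment only, not assembled): assuming the plane order
 * used by libswscale's C planar RGB input routines, where src[0] is the g
 * plane, src[1] the b plane and src[2] the r plane, each output sample is
 * expected to match
 *
 *     dst[i] = (ry * r[i] + gy * g[i] + by * b[i] + 524544) >> 9;
 *
 * with 524544 = (32 << 14) + (1 << 8) as the rounding bias loaded into t5
 * below and 9 as the final shift loaded into t4.
 */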
function planar_rgb_to_y_lsx
    ld.d          a5, a1, 0        // src[0]
    ld.d          a6, a1, 8        // src[1]
    ld.d          a7, a1, 16       // src[2]
    ld.w          t1, a3, 0        // ry
    ld.w          t2, a3, 4        // gy
    ld.w          t3, a3, 8        // by
    li.w          t4, 9            // final shift
    li.w          t5, 524544       // rounding bias
    li.w          t7, 4            // 4-pixel threshold
    li.w          t8, 8            // 8-pixel threshold
    vldi          vr7, 0
    vreplgr2vr.w  vr1, t1
    vreplgr2vr.w  vr2, t2
    vreplgr2vr.w  vr3, t3
    vreplgr2vr.w  vr4, t4
    vreplgr2vr.w  vr5, t5
    bge           a2, t8, .WIDTH8
    bge           a2, t7, .WIDTH4
    blt           zero, a2, .WIDTH
    b             .END
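/* Three paths: an 8-pixel vector loop, a 4-pixel half-vector loop and a
 * one-pixel scalar tail for whatever width remains. */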
.WIDTH8:
    vld           vr8, a5, 0
    vld           vr9, a6, 0
    vld           vr10, a7, 0
    // zero-extend 8 bytes from each plane to 32-bit lanes
    vilvl.b       vr11, vr7, vr8
    vilvl.b       vr12, vr7, vr9
    vilvl.b       vr13, vr7, vr10
    vilvl.h       vr14, vr7, vr11
    vilvl.h       vr15, vr7, vr12
    vilvl.h       vr16, vr7, vr13
    vilvh.h       vr17, vr7, vr11
    vilvh.h       vr18, vr7, vr12
    vilvh.h       vr19, vr7, vr13
    // weighted sum of the three planes, pixels 0-3 in vr20, 4-7 in vr21
    vmul.w        vr20, vr1, vr16
    vmul.w        vr21, vr1, vr19
    vmadd.w       vr20, vr2, vr14
    vmadd.w       vr20, vr3, vr15
    vmadd.w       vr21, vr2, vr17
    vmadd.w       vr21, vr3, vr18
    // round, shift and pack down to eight 16-bit samples
    vadd.w        vr20, vr20, vr5
    vadd.w        vr21, vr21, vr5
    vsra.w        vr20, vr20, vr4
    vsra.w        vr21, vr21, vr4
    vpickev.h     vr20, vr21, vr20
    vst           vr20, a0, 0
    addi.d        a2, a2, -8
    addi.d        a5, a5, 8
    addi.d        a6, a6, 8
    addi.d        a7, a7, 8
    addi.d        a0, a0, 16
    bge           a2, t8, .WIDTH8
    bge           a2, t7, .WIDTH4
    blt           zero, a2, .WIDTH
    b             .END
.WIDTH4:
    vld           vr8, a5, 0
    vld           vr9, a6, 0
    vld           vr10, a7, 0
    vilvl.b       vr11, vr7, vr8
    vilvl.b       vr12, vr7, vr9
    vilvl.b       vr13, vr7, vr10
    vilvl.h       vr14, vr7, vr11
    vilvl.h       vr15, vr7, vr12
    vilvl.h       vr16, vr7, vr13
    vmul.w        vr17, vr1, vr16
    vmadd.w       vr17, vr2, vr14
    vmadd.w       vr17, vr3, vr15
    vadd.w        vr17, vr17, vr5
    vsra.w        vr17, vr17, vr4
    vpickev.h     vr17, vr17, vr17
    vstelm.d      vr17, a0, 0, 0
    addi.d        a2, a2, -4
    addi.d        a5, a5, 4
    addi.d        a6, a6, 4
    addi.d        a7, a7, 4
    addi.d        a0, a0, 8
    bge           a2, t7, .WIDTH4
    blt           zero, a2, .WIDTH
    b             .END
.WIDTH:
    // scalar tail: one pixel per iteration
    ld.bu         t0, a5, 0
    ld.bu         t4, a6, 0
    ld.bu         t6, a7, 0
    mul.w         t8, t6, t1
    mul.w         t7, t0, t2
    add.w         t8, t8, t7
    mul.w         t7, t4, t3
    add.w         t8, t8, t7
    add.w         t8, t8, t5
    srai.w        t8, t8, 9
    st.h          t8, a0, 0
    addi.d        a2, a2, -1
    addi.d        a5, a5, 1
    addi.d        a6, a6, 1
    addi.d        a7, a7, 1
    addi.d        a0, a0, 2
    blt           zero, a2, .WIDTH
.END:
endfunc

/* void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
 *                           int width, int32_t *rgb2yuv)
 */
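/* Reference sketch (comment only, not assembled): under the same assumed
 * plane order (src[0] = g, src[1] = b, src[2] = r), each pixel is expected
 * to produce
 *
 *     dstU[i] = (ru * r[i] + gu * g[i] + bu * b[i] + 4194560) >> 9;
 *     dstV[i] = (rv * r[i] + gv * g[i] + bv * b[i] + 4194560) >> 9;
 *
 * with 4194560 = (256 << 14) + (1 << 8) as the rounding bias loaded into t5.
 */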
function planar_rgb_to_uv_lsx
    addi.d        sp, sp, -24
    st.d          s1, sp, 0
    st.d          s2, sp, 8
    st.d          s3, sp, 16
    ld.d          a5, a2, 0        // src[0]
    ld.d          a6, a2, 8        // src[1]
    ld.d          a7, a2, 16       // src[2]
    ld.w          t1, a4, 12       // ru
    ld.w          t2, a4, 16       // gu
    ld.w          t3, a4, 20       // bu
    ld.w          s1, a4, 24       // rv
    ld.w          s2, a4, 28       // gv
    ld.w          s3, a4, 32       // bv
    li.w          t4, 9            // final shift
    li.w          t5, 4194560      // rounding bias
    li.w          t7, 4            // 4-pixel threshold
    li.w          t8, 8            // 8-pixel threshold
    vldi          vr0, 0
    vreplgr2vr.w  vr1, t1
    vreplgr2vr.w  vr2, t2
    vreplgr2vr.w  vr3, t3
    vreplgr2vr.w  vr4, s1
    vreplgr2vr.w  vr5, s2
    vreplgr2vr.w  vr6, s3
    vreplgr2vr.w  vr7, t4
    vreplgr2vr.w  vr8, t5
    bge           a3, t8, .LOOP_WIDTH8   // a3 holds the width
    bge           a3, t7, .LOOP_WIDTH4
    blt           zero, a3, .LOOP_WIDTH
    b             .LOOP_END
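/* Same layout as planar_rgb_to_y_lsx: 8-pixel vector loop, 4-pixel loop and
 * scalar tail; U and V are computed in parallel from the shared loads. */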
.LOOP_WIDTH8:
    vld           vr9, a5, 0
    vld           vr10, a6, 0
    vld           vr11, a7, 0
    // zero-extend 8 bytes from each plane to 32-bit lanes
    vilvl.b       vr9, vr0, vr9
    vilvl.b       vr10, vr0, vr10
    vilvl.b       vr11, vr0, vr11
    vilvl.h       vr12, vr0, vr9
    vilvl.h       vr13, vr0, vr10
    vilvl.h       vr14, vr0, vr11
    vilvh.h       vr15, vr0, vr9
    vilvh.h       vr16, vr0, vr10
    vilvh.h       vr17, vr0, vr11
    // U sums in vr18/vr19, V sums in vr20/vr21
    vmul.w        vr18, vr1, vr14
    vmul.w        vr19, vr1, vr17
    vmul.w        vr20, vr4, vr14
    vmul.w        vr21, vr4, vr17
    vmadd.w       vr18, vr2, vr12
    vmadd.w       vr18, vr3, vr13
    vmadd.w       vr19, vr2, vr15
    vmadd.w       vr19, vr3, vr16
    vmadd.w       vr20, vr5, vr12
    vmadd.w       vr20, vr6, vr13
    vmadd.w       vr21, vr5, vr15
    vmadd.w       vr21, vr6, vr16
    // round, shift and pack down to 16-bit samples
    vadd.w        vr18, vr18, vr8
    vadd.w        vr19, vr19, vr8
    vadd.w        vr20, vr20, vr8
    vadd.w        vr21, vr21, vr8
    vsra.w        vr18, vr18, vr7
    vsra.w        vr19, vr19, vr7
    vsra.w        vr20, vr20, vr7
    vsra.w        vr21, vr21, vr7
    vpickev.h     vr18, vr19, vr18
    vpickev.h     vr20, vr21, vr20
    vst           vr18, a0, 0
    vst           vr20, a1, 0
    addi.d        a3, a3, -8
    addi.d        a5, a5, 8
    addi.d        a6, a6, 8
    addi.d        a7, a7, 8
    addi.d        a0, a0, 16
    addi.d        a1, a1, 16
    bge           a3, t8, .LOOP_WIDTH8
    bge           a3, t7, .LOOP_WIDTH4
    blt           zero, a3, .LOOP_WIDTH
    b             .LOOP_END
.LOOP_WIDTH4:
    vld           vr9, a5, 0
    vld           vr10, a6, 0
    vld           vr11, a7, 0
    vilvl.b       vr9, vr0, vr9
    vilvl.b       vr10, vr0, vr10
    vilvl.b       vr11, vr0, vr11
    vilvl.h       vr12, vr0, vr9
    vilvl.h       vr13, vr0, vr10
    vilvl.h       vr14, vr0, vr11
    vmul.w        vr18, vr1, vr14
    vmul.w        vr19, vr4, vr14
    vmadd.w       vr18, vr2, vr12
    vmadd.w       vr18, vr3, vr13
    vmadd.w       vr19, vr5, vr12
    vmadd.w       vr19, vr6, vr13
    vadd.w        vr18, vr18, vr8
    vadd.w        vr19, vr19, vr8
    vsra.w        vr18, vr18, vr7
    vsra.w        vr19, vr19, vr7
    vpickev.h     vr18, vr18, vr18
    vpickev.h     vr19, vr19, vr19
    vstelm.d      vr18, a0, 0, 0
    vstelm.d      vr19, a1, 0, 0
    addi.d        a3, a3, -4
    addi.d        a5, a5, 4
    addi.d        a6, a6, 4
    addi.d        a7, a7, 4
    addi.d        a0, a0, 8
    addi.d        a1, a1, 8
    bge           a3, t7, .LOOP_WIDTH4
    blt           zero, a3, .LOOP_WIDTH
    b             .LOOP_END
.LOOP_WIDTH:
    // scalar tail: one pixel per iteration, U then V
    ld.bu         t0, a5, 0
    ld.bu         t4, a6, 0
    ld.bu         t6, a7, 0
    mul.w         t8, t6, t1
    mul.w         t7, t0, t2
    add.w         t8, t8, t7
    mul.w         t7, t4, t3
    add.w         t8, t8, t7
    add.w         t8, t8, t5
    srai.w        t8, t8, 9
    st.h          t8, a0, 0
    mul.w         t8, t6, s1
    mul.w         t7, t0, s2
    add.w         t8, t8, t7
    mul.w         t7, t4, s3
    add.w         t8, t8, t7
    add.w         t8, t8, t5
    srai.w        t8, t8, 9
    st.h          t8, a1, 0
    addi.d        a3, a3, -1
    addi.d        a5, a5, 1
    addi.d        a6, a6, 1
    addi.d        a7, a7, 1
    addi.d        a0, a0, 2
    addi.d        a1, a1, 2
    blt           zero, a3, .LOOP_WIDTH
.LOOP_END:
    // restore saved registers and release the stack frame
    ld.d          s1, sp, 0
    ld.d          s2, sp, 8
    ld.d          s3, sp, 16
    addi.d        sp, sp, 24
endfunc