/*
 * Loongson LSX optimized swscale
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/loongarch/loongson_asm.S"

/* static void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
 *                              const int16_t **src, uint8_t *dest, int dstW,
 *                              const uint8_t *dither, int offset)
 */
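/* For readability, a scalar C sketch of what this routine computes --
 * inferred from the vector code below, not part of the original file;
 * av_clip_uint8() clamps to [0, 255]:
 *
 *     for (i = 0; i < dstW; i++) {
 *         int val = dither[(i + offset) & 7] << 12;
 *         for (j = 0; j < filterSize; j++)
 *             val += src[j][i] * filter[j];
 *         dest[i] = av_clip_uint8(val >> 19);
 *     }
 */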
function yuv2planeX_8_lsx
    /* t0..t7 = dither[(offset + i) & 7] for i = 0..7 */
    addi.w          t1,     a6,     1
    addi.w          t2,     a6,     2
    addi.w          t3,     a6,     3
    addi.w          t4,     a6,     4
    addi.w          t5,     a6,     5
    addi.w          t6,     a6,     6
    addi.w          t7,     a6,     7
    andi            t0,     a6,     7
    andi            t1,     t1,     7
    andi            t2,     t2,     7
    andi            t3,     t3,     7
    andi            t4,     t4,     7
    andi            t5,     t5,     7
    andi            t6,     t6,     7
    andi            t7,     t7,     7
    ldx.bu          t0,     a5,     t0
    ldx.bu          t1,     a5,     t1
    ldx.bu          t2,     a5,     t2
    ldx.bu          t3,     a5,     t3
    ldx.bu          t4,     a5,     t4
    ldx.bu          t5,     a5,     t5
    ldx.bu          t6,     a5,     t6
    ldx.bu          t7,     a5,     t7
    /* vr12 = dither words for the even pixels {d0, d2, d4, d6},
     * vr13 = dither words for the odd pixels  {d1, d3, d5, d7} */
    vreplgr2vr.w    vr0,    t0
    vreplgr2vr.w    vr1,    t1
    vreplgr2vr.w    vr2,    t2
    vreplgr2vr.w    vr3,    t3
    vreplgr2vr.w    vr4,    t4
    vreplgr2vr.w    vr5,    t5
    vreplgr2vr.w    vr6,    t6
    vreplgr2vr.w    vr7,    t7
    vilvl.w         vr0,    vr2,    vr0
    vilvl.w         vr4,    vr6,    vr4
    vilvl.w         vr1,    vr3,    vr1
    vilvl.w         vr5,    vr7,    vr5
    vilvl.d         vr12,   vr4,    vr0
    vilvl.d         vr13,   vr5,    vr1
    li.w            t5,     0               /* t5 = byte offset into each src row */
    li.w            t8,     8
    bge             a4,     t8,     .WIDTH8
    blt             zero,   a4,     .WIDTH
    b               .END
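/* Main loop: 8 output pixels per iteration.  vr2/vr3 accumulate the even/odd
 * pixels as 32-bit sums seeded with dither << 12; every filter tap adds
 * src[j][i] * filter[j] via widening multiply-accumulates. */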
.WIDTH8:
    li.d            t1,     0
    li.d            t4,     0
    vslli.w         vr2,    vr12,   12
    vslli.w         vr3,    vr13,   12
    move            t3,     a0
.FILTERSIZE8:
    ldx.d           t2,     a2,     t1      /* t2 = src[j] */
    vldx            vr4,    t2,     t5      /* 8 input pixels */
    vldrepl.h       vr5,    t3,     0       /* filter[j] */
    vmaddwev.w.h    vr2,    vr4,    vr5
    vmaddwod.w.h    vr3,    vr4,    vr5
    addi.d          t1,     t1,     8
    addi.d          t3,     t3,     2
    addi.d          t4,     t4,     1
    blt             t4,     a1,     .FILTERSIZE8
    /* >> 19, clip to [0, 255], re-interleave even/odd, store 8 bytes */
    vsrai.w         vr2,    vr2,    19
    vsrai.w         vr3,    vr3,    19
    vclip255.w      vr2,    vr2
    vclip255.w      vr3,    vr3
    vpickev.h       vr2,    vr3,    vr2
    vpickev.b       vr2,    vr2,    vr2
    vbsrl.v         vr3,    vr2,    4
    vilvl.b         vr2,    vr3,    vr2
    fst.d           f2,     a3,     0
    addi.d          t5,     t5,     16
    addi.d          a4,     a4,     -8
    addi.d          a3,     a3,     8
    bge             a4,     t8,     .WIDTH8
    blt             zero,   a4,     .WIDTH
    b               .END
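/* Tail: the remaining 1..7 pixels.  The vector loads still fetch a full
 * 8-pixel group; only the surviving bytes are stored, one at a time. */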
.WIDTH:
    li.d            t1,     0
    li.d            t4,     0
    vslli.w         vr2,    vr12,   12
    vslli.w         vr3,    vr13,   12
.FILTERSIZE:
    ldx.d           t2,     a2,     t1
    vldx            vr4,    t2,     t5
    vldrepl.h       vr5,    a0,     0
    vmaddwev.w.h    vr2,    vr4,    vr5
    vmaddwod.w.h    vr3,    vr4,    vr5
    addi.d          t1,     t1,     8
    addi.d          a0,     a0,     2
    addi.d          t4,     t4,     1
    blt             t4,     a1,     .FILTERSIZE
    vsrai.w         vr2,    vr2,    19
    vsrai.w         vr3,    vr3,    19
    vclip255.w      vr2,    vr2
    vclip255.w      vr3,    vr3
    vpickev.h       vr2,    vr3,    vr2
    vpickev.b       vr2,    vr2,    vr2
    vbsrl.v         vr3,    vr2,    4
    vilvl.b         vr2,    vr3,    vr2
.DEST:
    vstelm.b        vr2,    a3,     0,      0
    vbsrl.v         vr2,    vr2,    1
    addi.d          a4,     a4,     -1
    addi.d          a3,     a3,     1
    blt             zero,   a4,     .DEST
.END:
endfunc

/* void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
 *                       const uint8_t *dither, int offset)
 */
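/* Scalar C sketch of this routine, inferred from the vector code below
 * (not part of the original file):
 *
 *     for (i = 0; i < dstW; i++)
 *         dest[i] = av_clip_uint8((src[i] + dither[(i + offset) & 7]) >> 7);
 */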
function yuv2plane1_8_lsx
    /* vr1 = dither[(offset + i) & 7] for i = 0..7, as halfwords */
    addi.w          t1,     a4,     1
    addi.w          t2,     a4,     2
    addi.w          t3,     a4,     3
    addi.w          t4,     a4,     4
    addi.w          t5,     a4,     5
    addi.w          t6,     a4,     6
    addi.w          t7,     a4,     7
    andi            t0,     a4,     7
    andi            t1,     t1,     7
    andi            t2,     t2,     7
    andi            t3,     t3,     7
    andi            t4,     t4,     7
    andi            t5,     t5,     7
    andi            t6,     t6,     7
    andi            t7,     t7,     7
    ldx.bu          t0,     a3,     t0
    ldx.bu          t1,     a3,     t1
    ldx.bu          t2,     a3,     t2
    ldx.bu          t3,     a3,     t3
    ldx.bu          t4,     a3,     t4
    ldx.bu          t5,     a3,     t5
    ldx.bu          t6,     a3,     t6
    ldx.bu          t7,     a3,     t7
    vinsgr2vr.h     vr1,    t0,     0
    vinsgr2vr.h     vr1,    t1,     1
    vinsgr2vr.h     vr1,    t2,     2
    vinsgr2vr.h     vr1,    t3,     3
    vinsgr2vr.h     vr1,    t4,     4
    vinsgr2vr.h     vr1,    t5,     5
    vinsgr2vr.h     vr1,    t6,     6
    vinsgr2vr.h     vr1,    t7,     7
    vsub.h          vr0,    vr0,    vr0     /* vr0 = 0 */
    vilvl.h         vr2,    vr0,    vr1     /* dither widened to words */
    vilvh.h         vr3,    vr0,    vr1
    andi            t8,     a2,     7       /* t8 = dstW % 8 */
    srli.d          a2,     a2,     3       /* a2 = dstW / 8 */
    beqz            a2,     2f
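/* Main loop: 8 pixels per iteration -- widen src to 32 bits, add the dither
 * words, >> 7, clip to [0, 255], pack and store 8 bytes. */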
1:
    vld             vr1,    a0,     0
    addi.d          a0,     a0,     16
    vshuf4i.d       vr0,    vr1,    8
    vexth.w.h       vr4,    vr0             /* pixels 0..3 */
    vexth.w.h       vr5,    vr1             /* pixels 4..7 */
    vadd.w          vr4,    vr2,    vr4
    vadd.w          vr5,    vr3,    vr5
    vsrai.w         vr4,    vr4,    7
    vsrai.w         vr5,    vr5,    7
    vclip255.w      vr4,    vr4
    vclip255.w      vr5,    vr5
    vpickev.h       vr1,    vr5,    vr4
    vpickev.b       vr1,    vr1,    vr1
    fst.d           f1,     a1,     0
    addi.d          a1,     a1,     8
    addi.d          a2,     a2,     -1
    bnez            a2,     1b
2:
    beqz            t8,     4f
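/* Tail: redo the last 8 pixels at dest + dstW - 8 (overlapping the previous
 * store) with the dither phase advanced by dstW % 8; this relies on
 * dstW >= 8 whenever a tail exists. */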
3:
    add.w           a4,     a4,     t8      /* advance the dither phase */
    addi.w          t1,     a4,     1
    addi.w          t2,     a4,     2
    addi.w          t3,     a4,     3
    addi.w          t4,     a4,     4
    addi.w          t5,     a4,     5
    addi.w          t6,     a4,     6
    addi.w          t7,     a4,     7
    andi            t0,     a4,     7
    andi            t1,     t1,     7
    andi            t2,     t2,     7
    andi            t3,     t3,     7
    andi            t4,     t4,     7
    andi            t5,     t5,     7
    andi            t6,     t6,     7
    andi            t7,     t7,     7
    ldx.bu          t0,     a3,     t0
    ldx.bu          t1,     a3,     t1
    ldx.bu          t2,     a3,     t2
    ldx.bu          t3,     a3,     t3
    ldx.bu          t4,     a3,     t4
    ldx.bu          t5,     a3,     t5
    ldx.bu          t6,     a3,     t6
    ldx.bu          t7,     a3,     t7
    vinsgr2vr.h     vr1,    t0,     0
    vinsgr2vr.h     vr1,    t1,     1
    vinsgr2vr.h     vr1,    t2,     2
    vinsgr2vr.h     vr1,    t3,     3
    vinsgr2vr.h     vr1,    t4,     4
    vinsgr2vr.h     vr1,    t5,     5
    vinsgr2vr.h     vr1,    t6,     6
    vinsgr2vr.h     vr1,    t7,     7
    vsub.h          vr0,    vr0,    vr0
    vilvl.h         vr2,    vr0,    vr1
    vilvh.h         vr3,    vr0,    vr1
    /* rewind src and dest to the last 8 pixels */
    addi.d          a0,     a0,     -16
    add.d           a0,     a0,     t8
    add.d           a0,     a0,     t8
    addi.d          a1,     a1,     -8
    add.d           a1,     a1,     t8
    vld             vr1,    a0,     0
    vshuf4i.d       vr0,    vr1,    8
    vexth.w.h       vr4,    vr0
    vexth.w.h       vr5,    vr1
    vadd.w          vr4,    vr2,    vr4
    vadd.w          vr5,    vr3,    vr5
    vsrai.w         vr4,    vr4,    7
    vsrai.w         vr5,    vr5,    7
    vclip255.w      vr4,    vr4
    vclip255.w      vr5,    vr5
    vpickev.h       vr1,    vr5,    vr4
    vpickev.b       vr1,    vr1,    vr1
    fst.d           f1,     a1,     0
4:
endfunc
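
/* void yuv2plane1_8_lasx(const int16_t *src, uint8_t *dest, int dstW,
 *                        const uint8_t *dither, int offset)
 * (signature inferred from the LSX variant above)
 */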
function yuv2plane1_8_lasx
    /* xr2/xr3 = dither widened to words, both 128-bit lanes identical
     * (the 8-entry dither pattern repeats every 8 pixels) */
    addi.w          t1,     a4,     1
    addi.w          t2,     a4,     2
    addi.w          t3,     a4,     3
    addi.w          t4,     a4,     4
    addi.w          t5,     a4,     5
    addi.w          t6,     a4,     6
    addi.w          t7,     a4,     7
    andi            t0,     a4,     7
    andi            t1,     t1,     7
    andi            t2,     t2,     7
    andi            t3,     t3,     7
    andi            t4,     t4,     7
    andi            t5,     t5,     7
    andi            t6,     t6,     7
    andi            t7,     t7,     7
    ldx.bu          t0,     a3,     t0
    ldx.bu          t1,     a3,     t1
    ldx.bu          t2,     a3,     t2
    ldx.bu          t3,     a3,     t3
    ldx.bu          t4,     a3,     t4
    ldx.bu          t5,     a3,     t5
    ldx.bu          t6,     a3,     t6
    ldx.bu          t7,     a3,     t7
    vinsgr2vr.h     vr1,    t0,     0
    vinsgr2vr.h     vr1,    t1,     1
    vinsgr2vr.h     vr1,    t2,     2
    vinsgr2vr.h     vr1,    t3,     3
    vinsgr2vr.h     vr1,    t4,     4
    vinsgr2vr.h     vr1,    t5,     5
    vinsgr2vr.h     vr1,    t6,     6
    vinsgr2vr.h     vr1,    t7,     7
    xvpermi.q       xr1,    xr1,    0       /* broadcast to the high lane */
    xvsub.h         xr0,    xr0,    xr0
    xvilvl.h        xr2,    xr0,    xr1
    xvilvh.h        xr3,    xr0,    xr1
    andi            t8,     a2,     15      /* t8 = dstW % 16 */
    srli.d          a2,     a2,     4       /* a2 = dstW / 16 */
    beqz            a2,     2f
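/* Main loop: 16 pixels per iteration, same arithmetic as the LSX version;
 * the two lanes of the packed result are written with two 64-bit stores. */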
1:
    xvld            xr1,    a0,     0
    addi.d          a0,     a0,     32
    xvpermi.d       xr0,    xr1,    0xa0
    xvexth.w.h      xr4,    xr0
    xvexth.w.h      xr5,    xr1
    xvadd.w         xr4,    xr2,    xr4
    xvadd.w         xr5,    xr3,    xr5
    xvsrai.w        xr4,    xr4,    7
    xvsrai.w        xr5,    xr5,    7
    xvclip255.w     xr4,    xr4
    xvclip255.w     xr5,    xr5
    xvpickev.h      xr1,    xr5,    xr4
    xvpickev.b      xr0,    xr1,    xr1
    xvpermi.q       xr1,    xr0,    1
    fst.d           f0,     a1,     0
    fst.d           f1,     a1,     8
    addi.d          a1,     a1,     16
    addi.d          a2,     a2,     -1
    bnez            a2,     1b
2:
    beqz            t8,     4f
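/* Tail: redo the last 16 pixels at dest + dstW - 16 with the dither phase
 * advanced by dstW % 16 (equivalent mod 8); this relies on dstW >= 16
 * whenever a tail exists. */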
3:
    add.w           a4,     a4,     t8
    addi.w          t1,     a4,     1
    addi.w          t2,     a4,     2
    addi.w          t3,     a4,     3
    addi.w          t4,     a4,     4
    addi.w          t5,     a4,     5
    addi.w          t6,     a4,     6
    addi.w          t7,     a4,     7
    andi            t0,     a4,     7
    andi            t1,     t1,     7
    andi            t2,     t2,     7
    andi            t3,     t3,     7
    andi            t4,     t4,     7
    andi            t5,     t5,     7
    andi            t6,     t6,     7
    andi            t7,     t7,     7
    ldx.bu          t0,     a3,     t0
    ldx.bu          t1,     a3,     t1
    ldx.bu          t2,     a3,     t2
    ldx.bu          t3,     a3,     t3
    ldx.bu          t4,     a3,     t4
    ldx.bu          t5,     a3,     t5
    ldx.bu          t6,     a3,     t6
    ldx.bu          t7,     a3,     t7
    vinsgr2vr.h     vr1,    t0,     0
    vinsgr2vr.h     vr1,    t1,     1
    vinsgr2vr.h     vr1,    t2,     2
    vinsgr2vr.h     vr1,    t3,     3
    vinsgr2vr.h     vr1,    t4,     4
    vinsgr2vr.h     vr1,    t5,     5
    vinsgr2vr.h     vr1,    t6,     6
    vinsgr2vr.h     vr1,    t7,     7
    xvpermi.q       xr1,    xr1,    0
    xvsub.h         xr0,    xr0,    xr0
    xvilvl.h        xr2,    xr0,    xr1
    xvilvh.h        xr3,    xr0,    xr1
    addi.d          a0,     a0,     -32
    add.d           a0,     a0,     t8
    add.d           a0,     a0,     t8
    addi.d          a1,     a1,     -16
    add.d           a1,     a1,     t8
    xvld            xr1,    a0,     0
    xvpermi.d       xr0,    xr1,    0xa0
    xvexth.w.h      xr4,    xr0
    xvexth.w.h      xr5,    xr1
    xvadd.w         xr4,    xr2,    xr4
    xvadd.w         xr5,    xr3,    xr5
    xvsrai.w        xr4,    xr4,    7
    xvsrai.w        xr5,    xr5,    7
    xvclip255.w     xr4,    xr4
    xvclip255.w     xr5,    xr5
    xvpickev.h      xr1,    xr5,    xr4
    xvpickev.b      xr0,    xr1,    xr1
    xvpermi.q       xr1,    xr0,    1
    fst.d           f0,     a1,     0
    fst.d           f1,     a1,     8
4:
endfunc