/*
 * Loongson LSX optimized swscale
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/loongarch/loongson_asm.S"

/* void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4],
 *                          int width, int32_t *rgb2yuv)
 */
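
/* A scalar sketch of what the vector loop below computes (an assumption for
 * illustration, not the exact C reference): src[0]/src[1]/src[2] are the
 * g/b/r planes, the output is 16-bit luma, and 0x80100 (524544) and the
 * shift of 9 are the rounding bias and shift loaded into t5 and t4 below.
 *
 *     #include <stdint.h>
 *
 *     static void planar_rgb_to_y_ref(uint8_t *_dst, const uint8_t *src[4],
 *                                     int width, int32_t *rgb2yuv)
 *     {
 *         uint16_t *dst = (uint16_t *)_dst;
 *         int32_t ry = rgb2yuv[0], gy = rgb2yuv[1], by = rgb2yuv[2];
 *         for (int i = 0; i < width; i++) {
 *             int g = src[0][i], b = src[1][i], r = src[2][i];
 *             dst[i] = (ry * r + gy * g + by * b + 0x80100) >> 9;
 *         }
 *     }
 */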

function planar_rgb_to_y_lsx
    ld.d            a5,     a1,     0
    ld.d            a6,     a1,     8
    ld.d            a7,     a1,     16
    ld.w            t1,     a3,     0       // ry
    ld.w            t2,     a3,     4       // gy
    ld.w            t3,     a3,     8       // by
    li.w            t4,     9               // shift
    li.w            t5,     524544          // rounding bias 0x80100
    li.w            t7,     4
    li.w            t8,     8
    vldi            vr7,    0
    vreplgr2vr.w    vr1,    t1
    vreplgr2vr.w    vr2,    t2
    vreplgr2vr.w    vr3,    t3
    vreplgr2vr.w    vr4,    t4
    vreplgr2vr.w    vr5,    t5
    bge             a2,     t8,     .WIDTH8
    bge             a2,     t7,     .WIDTH4
    blt             zero,   a2,     .WIDTH
    b               .END
.WIDTH8:
    vld             vr8,    a5,     0
    vld             vr9,    a6,     0
    vld             vr10,   a7,     0
    vilvl.b         vr11,   vr7,    vr8
    vilvl.b         vr12,   vr7,    vr9
    vilvl.b         vr13,   vr7,    vr10
    vilvl.h         vr14,   vr7,    vr11
    vilvl.h         vr15,   vr7,    vr12
    vilvl.h         vr16,   vr7,    vr13
    vilvh.h         vr17,   vr7,    vr11
    vilvh.h         vr18,   vr7,    vr12
    vilvh.h         vr19,   vr7,    vr13
    vmul.w          vr20,   vr1,    vr16
    vmul.w          vr21,   vr1,    vr19
    vmadd.w         vr20,   vr2,    vr14
    vmadd.w         vr20,   vr3,    vr15
    vmadd.w         vr21,   vr2,    vr17
    vmadd.w         vr21,   vr3,    vr18
    vadd.w          vr20,   vr20,   vr5
    vadd.w          vr21,   vr21,   vr5
    vsra.w          vr20,   vr20,   vr4
    vsra.w          vr21,   vr21,   vr4
    vpickev.h       vr20,   vr21,   vr20
    vst             vr20,   a0,     0
    addi.d          a2,     a2,     -8
    addi.d          a5,     a5,     8
    addi.d          a6,     a6,     8
    addi.d          a7,     a7,     8
    addi.d          a0,     a0,     16
    bge             a2,     t8,     .WIDTH8
    bge             a2,     t7,     .WIDTH4
    blt             zero,   a2,     .WIDTH
    b               .END
.WIDTH4:
    vld             vr8,    a5,     0
    vld             vr9,    a6,     0
    vld             vr10,   a7,     0
    vilvl.b         vr11,   vr7,    vr8
    vilvl.b         vr12,   vr7,    vr9
    vilvl.b         vr13,   vr7,    vr10
    vilvl.h         vr14,   vr7,    vr11
    vilvl.h         vr15,   vr7,    vr12
    vilvl.h         vr16,   vr7,    vr13
    vmul.w          vr17,   vr1,    vr16
    vmadd.w         vr17,   vr2,    vr14
    vmadd.w         vr17,   vr3,    vr15
    vadd.w          vr17,   vr17,   vr5
    vsra.w          vr17,   vr17,   vr4
    vpickev.h       vr17,   vr17,   vr17
    vstelm.d        vr17,   a0,     0,      0
    addi.d          a2,     a2,     -4
    addi.d          a5,     a5,     4
    addi.d          a6,     a6,     4
    addi.d          a7,     a7,     4
    addi.d          a0,     a0,     8
    bge             a2,     t7,     .WIDTH4
    blt             zero,   a2,     .WIDTH
    b               .END
.WIDTH:
    ld.bu           t0,     a5,     0
    ld.bu           t4,     a6,     0
    ld.bu           t6,     a7,     0
    mul.w           t8,     t6,     t1
    mul.w           t7,     t0,     t2
    add.w           t8,     t8,     t7
    mul.w           t7,     t4,     t3
    add.w           t8,     t8,     t7
    add.w           t8,     t8,     t5
    srai.w          t8,     t8,     9
    st.h            t8,     a0,     0
    addi.d          a2,     a2,     -1
    addi.d          a5,     a5,     1
    addi.d          a6,     a6,     1
    addi.d          a7,     a7,     1
    addi.d          a0,     a0,     2
    blt             zero,   a2,     .WIDTH
.END:
endfunc

/* void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
 *                           int width, int32_t *rgb2yuv)
 */
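
/* A scalar sketch of the U/V computation below (an assumption for
 * illustration, not the exact C reference): the chroma coefficients sit at
 * rgb2yuv[3..8] (byte offsets 12..32 in the loads below), 0x400100 (4194560)
 * is the rounding bias loaded into t5, and the results are 16-bit samples.
 *
 *     #include <stdint.h>
 *
 *     static void planar_rgb_to_uv_ref(uint8_t *_dstU, uint8_t *_dstV,
 *                                      const uint8_t *src[4], int width,
 *                                      int32_t *rgb2yuv)
 *     {
 *         uint16_t *dstU = (uint16_t *)_dstU;
 *         uint16_t *dstV = (uint16_t *)_dstV;
 *         int32_t ru = rgb2yuv[3], gu = rgb2yuv[4], bu = rgb2yuv[5];
 *         int32_t rv = rgb2yuv[6], gv = rgb2yuv[7], bv = rgb2yuv[8];
 *         for (int i = 0; i < width; i++) {
 *             int g = src[0][i], b = src[1][i], r = src[2][i];
 *             dstU[i] = (ru * r + gu * g + bu * b + 0x400100) >> 9;
 *             dstV[i] = (rv * r + gv * g + bv * b + 0x400100) >> 9;
 *         }
 *     }
 */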

function planar_rgb_to_uv_lsx
    addi.d          sp,     sp,     -24
    st.d            s1,     sp,     0
    st.d            s2,     sp,     8
    st.d            s3,     sp,     16
    ld.d            a5,     a2,     0
    ld.d            a6,     a2,     8
    ld.d            a7,     a2,     16
    ld.w            t1,     a4,     12      // ru
    ld.w            t2,     a4,     16      // gu
    ld.w            t3,     a4,     20      // bu
    ld.w            s1,     a4,     24      // rv
    ld.w            s2,     a4,     28      // gv
    ld.w            s3,     a4,     32      // bv
    li.w            t4,     9               // shift
    li.w            t5,     4194560         // rounding bias 0x400100
    li.w            t7,     4
    li.w            t8,     8
    vldi            vr0,    0
    vreplgr2vr.w    vr1,    t1
    vreplgr2vr.w    vr2,    t2
    vreplgr2vr.w    vr3,    t3
    vreplgr2vr.w    vr4,    s1
    vreplgr2vr.w    vr5,    s2
    vreplgr2vr.w    vr6,    s3
    vreplgr2vr.w    vr7,    t4
    vreplgr2vr.w    vr8,    t5
    bge             a3,     t8,     .LOOP_WIDTH8    // a3 holds the width argument
    bge             a3,     t7,     .LOOP_WIDTH4
    blt             zero,   a3,     .LOOP_WIDTH
    b               .LOOP_END
.LOOP_WIDTH8:
    vld             vr9,    a5,     0
    vld             vr10,   a6,     0
    vld             vr11,   a7,     0
    vilvl.b         vr9,    vr0,    vr9
    vilvl.b         vr10,   vr0,    vr10
    vilvl.b         vr11,   vr0,    vr11
    vilvl.h         vr12,   vr0,    vr9
    vilvl.h         vr13,   vr0,    vr10
    vilvl.h         vr14,   vr0,    vr11
    vilvh.h         vr15,   vr0,    vr9
    vilvh.h         vr16,   vr0,    vr10
    vilvh.h         vr17,   vr0,    vr11
    vmul.w          vr18,   vr1,    vr14
    vmul.w          vr19,   vr1,    vr17
    vmul.w          vr20,   vr4,    vr14
    vmul.w          vr21,   vr4,    vr17
    vmadd.w         vr18,   vr2,    vr12
    vmadd.w         vr18,   vr3,    vr13
    vmadd.w         vr19,   vr2,    vr15
    vmadd.w         vr19,   vr3,    vr16
    vmadd.w         vr20,   vr5,    vr12
    vmadd.w         vr20,   vr6,    vr13
    vmadd.w         vr21,   vr5,    vr15
    vmadd.w         vr21,   vr6,    vr16
    vadd.w          vr18,   vr18,   vr8
    vadd.w          vr19,   vr19,   vr8
    vadd.w          vr20,   vr20,   vr8
    vadd.w          vr21,   vr21,   vr8
    vsra.w          vr18,   vr18,   vr7
    vsra.w          vr19,   vr19,   vr7
    vsra.w          vr20,   vr20,   vr7
    vsra.w          vr21,   vr21,   vr7
    vpickev.h       vr18,   vr19,   vr18
    vpickev.h       vr20,   vr21,   vr20
    vst             vr18,   a0,     0
    vst             vr20,   a1,     0
    addi.d          a3,     a3,     -8
    addi.d          a5,     a5,     8
    addi.d          a6,     a6,     8
    addi.d          a7,     a7,     8
    addi.d          a0,     a0,     16
    addi.d          a1,     a1,     16
    bge             a3,     t8,     .LOOP_WIDTH8
    bge             a3,     t7,     .LOOP_WIDTH4
    blt             zero,   a3,     .LOOP_WIDTH
    b               .LOOP_END
.LOOP_WIDTH4:
    vld             vr9,    a5,     0
    vld             vr10,   a6,     0
    vld             vr11,   a7,     0
    vilvl.b         vr9,    vr0,    vr9
    vilvl.b         vr10,   vr0,    vr10
    vilvl.b         vr11,   vr0,    vr11
    vilvl.h         vr12,   vr0,    vr9
    vilvl.h         vr13,   vr0,    vr10
    vilvl.h         vr14,   vr0,    vr11
    vmul.w          vr18,   vr1,    vr14
    vmul.w          vr19,   vr4,    vr14
    vmadd.w         vr18,   vr2,    vr12
    vmadd.w         vr18,   vr3,    vr13
    vmadd.w         vr19,   vr5,    vr12
    vmadd.w         vr19,   vr6,    vr13
    vadd.w          vr18,   vr18,   vr8
    vadd.w          vr19,   vr19,   vr8
    vsra.w          vr18,   vr18,   vr7
    vsra.w          vr19,   vr19,   vr7
    vpickev.h       vr18,   vr18,   vr18
    vpickev.h       vr19,   vr19,   vr19
    vstelm.d        vr18,   a0,     0,      0
    vstelm.d        vr19,   a1,     0,      0
    addi.d          a3,     a3,     -4
    addi.d          a5,     a5,     4
    addi.d          a6,     a6,     4
    addi.d          a7,     a7,     4
    addi.d          a0,     a0,     8
    addi.d          a1,     a1,     8
    bge             a3,     t7,     .LOOP_WIDTH4
    blt             zero,   a3,     .LOOP_WIDTH
    b               .LOOP_END
.LOOP_WIDTH:
    ld.bu           t0,     a5,     0
    ld.bu           t4,     a6,     0
    ld.bu           t6,     a7,     0
    mul.w           t8,     t6,     t1
    mul.w           t7,     t0,     t2
    add.w           t8,     t8,     t7
    mul.w           t7,     t4,     t3
    add.w           t8,     t8,     t7
    add.w           t8,     t8,     t5
    srai.w          t8,     t8,     9
    st.h            t8,     a0,     0
    mul.w           t8,     t6,     s1
    mul.w           t7,     t0,     s2
    add.w           t8,     t8,     t7
    mul.w           t7,     t4,     s3
    add.w           t8,     t8,     t7
    add.w           t8,     t8,     t5
    srai.w          t8,     t8,     9
    st.h            t8,     a1,     0
    addi.d          a3,     a3,     -1
    addi.d          a5,     a5,     1
    addi.d          a6,     a6,     1
    addi.d          a7,     a7,     1
    addi.d          a0,     a0,     2
    addi.d          a1,     a1,     2
    blt             zero,   a3,     .LOOP_WIDTH
.LOOP_END:
    ld.d            s1,     sp,     0
    ld.d            s2,     sp,     8
    ld.d            s3,     sp,     16
    addi.d          sp,     sp,     24
endfunc

/*
 * void yuy2ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
 *                   const uint8_t *src2, int width, uint32_t *unused, void *opq)
 */
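
/* A scalar sketch of the YUYV deinterleave below (an assumption for
 * illustration, not the exact C reference): U sits at byte 4*i+1 and V at
 * byte 4*i+3 of src1, which is why the vector loads start at byte offset 1.
 *
 *     #include <stdint.h>
 *
 *     static void yuy2ToUV_ref(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0,
 *                              const uint8_t *src1, const uint8_t *src2, int width,
 *                              uint32_t *unused, void *opq)
 *     {
 *         for (int i = 0; i < width; i++) {
 *             dstU[i] = src1[4 * i + 1];
 *             dstV[i] = src1[4 * i + 3];
 *         }
 *     }
 */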

function yuy2ToUV_lsx
    andi            t0,     a5,     7
    srli.d          a5,     a5,     3
    beqz            a5,     2f
1:
    vld             vr0,    a3,     1
    vld             vr1,    a3,     17
    addi.d          a5,     a5,     -1
    addi.d          a3,     a3,     32
    vpickev.b       vr2,    vr1,    vr0
    vpickev.b       vr0,    vr2,    vr2
    vpickod.b       vr1,    vr2,    vr2
    fst.d           f0,     a0,     0
    fst.d           f1,     a1,     0
    addi.d          a0,     a0,     8
    addi.d          a1,     a1,     8
    bnez            a5,     1b
2:
    beqz            t0,     4f
3:
    ld.b            t1,     a3,     1
    ld.b            t2,     a3,     3
    addi.d          a3,     a3,     4
    addi.d          t0,     t0,     -1
    st.b            t1,     a0,     0
    st.b            t2,     a1,     0
    addi.d          a0,     a0,     1
    addi.d          a1,     a1,     1
    bnez            t0,     3b
4:
endfunc

function yuy2ToUV_lasx
    andi            t0,     a5,     15
    srli.d          a5,     a5,     4
    beqz            a5,     2f
1:
    xvld            xr0,    a3,     1
    xvld            xr1,    a3,     33
    addi.d          a5,     a5,     -1
    addi.d          a3,     a3,     64
    xvpickev.b      xr2,    xr1,    xr0
    xvpermi.d       xr2,    xr2,    0xd8
    xvpickev.b      xr0,    xr2,    xr2
    xvpermi.d       xr0,    xr0,    0xd8
    xvpickod.b      xr1,    xr2,    xr2
    xvpermi.d       xr1,    xr1,    0xd8
    vst             vr0,    a0,     0
    vst             vr1,    a1,     0
    addi.d          a0,     a0,     16
    addi.d          a1,     a1,     16
    bnez            a5,     1b
2:
    beqz            t0,     4f
3:
    ld.b            t1,     a3,     1
    ld.b            t2,     a3,     3
    addi.d          a3,     a3,     4
    addi.d          t0,     t0,     -1
    st.b            t1,     a0,     0
    st.b            t2,     a1,     0
    addi.d          a0,     a0,     1
    addi.d          a1,     a1,     1
    bnez            t0,     3b
4:
endfunc

/*
 * void yvy2ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
 *                   const uint8_t *src2, int width, uint32_t *unused, void *opq)
 */
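
/* Same deinterleave as yuy2ToUV above, but for YVYU input the byte at 4*i+1
 * is V and the byte at 4*i+3 is U, so the stores are swapped. A scalar sketch
 * of the per-pixel step (an assumption, not the exact C reference):
 *
 *     dstV[i] = src1[4 * i + 1];
 *     dstU[i] = src1[4 * i + 3];
 */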

function yvy2ToUV_lsx
    andi            t0,     a5,     7
    srli.d          a5,     a5,     3
    beqz            a5,     2f
1:
    vld             vr0,    a3,     1
    vld             vr1,    a3,     17
    addi.d          a5,     a5,     -1
    addi.d          a3,     a3,     32
    vpickev.b       vr2,    vr1,    vr0
    vpickev.b       vr0,    vr2,    vr2
    vpickod.b       vr1,    vr2,    vr2
    fst.d           f0,     a1,     0
    fst.d           f1,     a0,     0
    addi.d          a0,     a0,     8
    addi.d          a1,     a1,     8
    bnez            a5,     1b
2:
    beqz            t0,     4f
3:
    ld.b            t1,     a3,     1
    ld.b            t2,     a3,     3
    addi.d          a3,     a3,     4
    addi.d          t0,     t0,     -1
    st.b            t1,     a1,     0
    st.b            t2,     a0,     0
    addi.d          a0,     a0,     1
    addi.d          a1,     a1,     1
    bnez            t0,     3b
4:
endfunc

function yvy2ToUV_lasx
    andi            t0,     a5,     15
    srli.d          a5,     a5,     4
    beqz            a5,     2f
1:
    xvld            xr0,    a3,     1
    xvld            xr1,    a3,     33
    addi.d          a5,     a5,     -1
    addi.d          a3,     a3,     64
    xvpickev.b      xr2,    xr1,    xr0
    xvpermi.d       xr2,    xr2,    0xd8
    xvpickev.b      xr0,    xr2,    xr2
    xvpermi.d       xr0,    xr0,    0xd8
    xvpickod.b      xr1,    xr2,    xr2
    xvpermi.d       xr1,    xr1,    0xd8
    vst             vr0,    a1,     0
    vst             vr1,    a0,     0
    addi.d          a0,     a0,     16
    addi.d          a1,     a1,     16
    bnez            a5,     1b
2:
    beqz            t0,     4f
3:
    ld.b            t1,     a3,     1
    ld.b            t2,     a3,     3
    addi.d          a3,     a3,     4
    addi.d          t0,     t0,     -1
    st.b            t1,     a1,     0
    st.b            t2,     a0,     0
    addi.d          a0,     a0,     1
    addi.d          a1,     a1,     1
    bnez            t0,     3b
4:
endfunc

/*
 * void uyvyToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
 *                   const uint8_t *src2, int width, uint32_t *unused, void *opq)
 */
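
/* For UYVY the chroma bytes are at 4*i+0 (U) and 4*i+2 (V), which is why the
 * vector loads below start at offset 0. A scalar sketch of the per-pixel step
 * (an assumption, not the exact C reference):
 *
 *     dstU[i] = src1[4 * i + 0];
 *     dstV[i] = src1[4 * i + 2];
 */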

function uyvyToUV_lsx
    andi            t0,     a5,     7
    srli.d          a5,     a5,     3
    beqz            a5,     2f
1:
    vld             vr0,    a3,     0
    vld             vr1,    a3,     16
    addi.d          a5,     a5,     -1
    addi.d          a3,     a3,     32
    vpickev.b       vr2,    vr1,    vr0
    vpickev.b       vr0,    vr2,    vr2
    vpickod.b       vr1,    vr2,    vr2
    fst.d           f0,     a0,     0
    fst.d           f1,     a1,     0
    addi.d          a0,     a0,     8
    addi.d          a1,     a1,     8
    bnez            a5,     1b
2:
    beqz            t0,     4f
3:
    ld.b            t1,     a3,     0       // U at byte 0 of each UYVY group
    ld.b            t2,     a3,     2       // V at byte 2
    addi.d          a3,     a3,     4
    addi.d          t0,     t0,     -1
    st.b            t1,     a0,     0
    st.b            t2,     a1,     0
    addi.d          a0,     a0,     1
    addi.d          a1,     a1,     1
    bnez            t0,     3b
4:
endfunc

function uyvyToUV_lasx
    andi            t0,     a5,     15
    srli.d          a5,     a5,     4
    beqz            a5,     2f
1:
    xvld            xr0,    a3,     0
    xvld            xr1,    a3,     32
    addi.d          a5,     a5,     -1
    addi.d          a3,     a3,     64
    xvpickev.b      xr2,    xr1,    xr0
    xvpermi.d       xr2,    xr2,    0xd8
    xvpickev.b      xr0,    xr2,    xr2
    xvpermi.d       xr0,    xr0,    0xd8
    xvpickod.b      xr1,    xr2,    xr2
    xvpermi.d       xr1,    xr1,    0xd8
    vst             vr0,    a0,     0
    vst             vr1,    a1,     0
    addi.d          a0,     a0,     16
    addi.d          a1,     a1,     16
    bnez            a5,     1b
2:
    beqz            t0,     4f
3:
    ld.b            t1,     a3,     0       // U at byte 0 of each UYVY group
    ld.b            t2,     a3,     2       // V at byte 2
    addi.d          a3,     a3,     4
    addi.d          t0,     t0,     -1
    st.b            t1,     a0,     0
    st.b            t2,     a1,     0
    addi.d          a0,     a0,     1
    addi.d          a1,     a1,     1
    bnez            t0,     3b
4:
endfunc

/*
 * void nv12ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
 *                   const uint8_t *src2, int width, uint32_t *unused, void *opq)
 */
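
/* A scalar sketch of the NV12 deinterleave below (an assumption for
 * illustration, not the exact C reference): src1 holds interleaved UV pairs,
 * even bytes go to dstU and odd bytes to dstV.
 *
 *     #include <stdint.h>
 *
 *     static void nv12ToUV_ref(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0,
 *                              const uint8_t *src1, const uint8_t *src2, int width,
 *                              uint32_t *unused, void *opq)
 *     {
 *         for (int i = 0; i < width; i++) {
 *             dstU[i] = src1[2 * i + 0];
 *             dstV[i] = src1[2 * i + 1];
 *         }
 *     }
 */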

function nv12ToUV_lsx
    andi            t0,     a5,     15
    srli.d          a5,     a5,     4
    beqz            a5,     2f
1:
    vld             vr0,    a3,     0
    vld             vr1,    a3,     16
    addi.d          a5,     a5,     -1
    addi.d          a3,     a3,     32
    vpickev.b       vr2,    vr1,    vr0
    vpickod.b       vr3,    vr1,    vr0
    vst             vr2,    a0,     0
    vst             vr3,    a1,     0
    addi.d          a0,     a0,     16
    addi.d          a1,     a1,     16
    bnez            a5,     1b
2:
    beqz            t0,     4f
3:
    ld.b            t1,     a3,     0
    ld.b            t2,     a3,     1
    addi.d          a3,     a3,     2
    addi.d          t0,     t0,     -1
    st.b            t1,     a0,     0
    st.b            t2,     a1,     0
    addi.d          a0,     a0,     1
    addi.d          a1,     a1,     1
    bnez            t0,     3b
4:
endfunc

function nv12ToUV_lasx
    andi            t0,     a5,     31
    srli.d          a5,     a5,     5
    beqz            a5,     2f
1:
    xvld            xr0,    a3,     0
    xvld            xr1,    a3,     32
    addi.d          a5,     a5,     -1
    addi.d          a3,     a3,     64
    xvpickev.b      xr2,    xr1,    xr0
    xvpickod.b      xr3,    xr1,    xr0
    xvpermi.d       xr2,    xr2,    0xd8
    xvpermi.d       xr3,    xr3,    0xd8
    xvst            xr2,    a0,     0
    xvst            xr3,    a1,     0
    addi.d          a0,     a0,     32
    addi.d          a1,     a1,     32
    bnez            a5,     1b
2:
    beqz            t0,     4f
3:
    ld.b            t1,     a3,     0
    ld.b            t2,     a3,     1
    addi.d          a3,     a3,     2
    addi.d          t0,     t0,     -1
    st.b            t1,     a0,     0
    st.b            t2,     a1,     0
    addi.d          a0,     a0,     1
    addi.d          a1,     a1,     1
    bnez            t0,     3b
4:
endfunc

/*
 * void nv21ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
 *                   const uint8_t *src2, int width, uint32_t *unused, void *opq)
 */
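
/* NV21 is the same layout with U and V swapped, so the stores below go to the
 * opposite destinations. A scalar sketch of the per-sample step (an
 * assumption, not the exact C reference):
 *
 *     dstV[i] = src1[2 * i + 0];
 *     dstU[i] = src1[2 * i + 1];
 */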

function nv21ToUV_lsx
    andi            t0,     a5,     15
    srli.d          a5,     a5,     4
    beqz            a5,     2f
1:
    vld             vr0,    a3,     0
    vld             vr1,    a3,     16
    addi.d          a5,     a5,     -1
    addi.d          a3,     a3,     32
    vpickev.b       vr2,    vr1,    vr0
    vpickod.b       vr3,    vr1,    vr0
    vst             vr2,    a1,     0
    vst             vr3,    a0,     0
    addi.d          a0,     a0,     16
    addi.d          a1,     a1,     16
    bnez            a5,     1b
2:
    beqz            t0,     4f
3:
    ld.b            t1,     a3,     0
    ld.b            t2,     a3,     1
    addi.d          a3,     a3,     2
    addi.d          t0,     t0,     -1
    st.b            t1,     a1,     0
    st.b            t2,     a0,     0
    addi.d          a0,     a0,     1
    addi.d          a1,     a1,     1
    bnez            t0,     3b
4:
endfunc

function nv21ToUV_lasx
    andi            t0,     a5,     31
    srli.d          a5,     a5,     5
    beqz            a5,     2f
1:
    xvld            xr0,    a3,     0
    xvld            xr1,    a3,     32
    addi.d          a5,     a5,     -1
    addi.d          a3,     a3,     64
    xvpickev.b      xr2,    xr1,    xr0
    xvpickod.b      xr3,    xr1,    xr0
    xvpermi.d       xr2,    xr2,    0xd8
    xvpermi.d       xr3,    xr3,    0xd8
    xvst            xr2,    a1,     0
    xvst            xr3,    a0,     0
    addi.d          a0,     a0,     32
    addi.d          a1,     a1,     32
    bnez            a5,     1b
2:
    beqz            t0,     4f
3:
    ld.b            t1,     a3,     0
    ld.b            t2,     a3,     1
    addi.d          a3,     a3,     2
    addi.d          t0,     t0,     -1
    st.b            t1,     a1,     0
    st.b            t2,     a0,     0
    addi.d          a0,     a0,     1
    addi.d          a1,     a1,     1
    bnez            t0,     3b
4:
endfunc

/*
 * void abgrToA_lsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
 *                  const uint8_t *unused2, int width, uint32_t *unused, void *opq)
 */
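
/* A scalar sketch of the alpha extraction below (an assumption for
 * illustration, not the exact C reference): for ABGR the alpha byte is at
 * 4*i+0, and each 8-bit value is spread to 14 bits with (a << 6) | (a >> 2)
 * before being stored as a 16-bit sample.
 *
 *     #include <stdint.h>
 *
 *     static void abgrToA_ref(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
 *                             const uint8_t *unused2, int width, uint32_t *unused,
 *                             void *opq)
 *     {
 *         int16_t *dst = (int16_t *)_dst;
 *         for (int i = 0; i < width; i++) {
 *             int a = src[4 * i];
 *             dst[i] = (a << 6) | (a >> 2);
 *         }
 *     }
 */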

function abgrToA_lsx
    andi            t0,     a4,     7
    srli.d          a4,     a4,     3
    vxor.v          vr0,    vr0,    vr0
    beqz            a4,     2f
1:
    vld             vr1,    a1,     0
    vld             vr2,    a1,     16
    addi.d          a4,     a4,     -1
    addi.d          a1,     a1,     32
    vpickev.b       vr3,    vr2,    vr1
    vpackev.b       vr3,    vr0,    vr3
    vslli.h         vr1,    vr3,    6
    vsrli.h         vr2,    vr3,    2
    vor.v           vr3,    vr2,    vr1
    vst             vr3,    a0,     0
    addi.d          a0,     a0,     16
    bnez            a4,     1b
2:
    beqz            t0,     4f
3:
    ld.b            t1,     a1,     0       // alpha is byte 0 of an ABGR pixel
    addi.d          t0,     t0,     -1
    addi.d          a1,     a1,     4
    andi            t1,     t1,     0xff
    slli.w          t2,     t1,     6
    srli.w          t3,     t1,     2
    or              t1,     t2,     t3
    st.h            t1,     a0,     0
    addi.d          a0,     a0,     2
    bnez            t0,     3b
4:
endfunc

function abgrToA_lasx
    andi            t0,     a4,     15
    srli.d          a4,     a4,     4
    xvxor.v         xr0,    xr0,    xr0
    beqz            a4,     2f
1:
    xvld            xr1,    a1,     0
    xvld            xr2,    a1,     32
    addi.d          a4,     a4,     -1
    addi.d          a1,     a1,     64
    xvpickev.b      xr3,    xr2,    xr1
    xvpermi.d       xr3,    xr3,    0xd8
    xvpackev.b      xr3,    xr0,    xr3
    xvslli.h        xr1,    xr3,    6
    xvsrli.h        xr2,    xr3,    2
    xvor.v          xr3,    xr2,    xr1
    xvst            xr3,    a0,     0
    addi.d          a0,     a0,     32
    bnez            a4,     1b
2:
    beqz            t0,     4f
3:
    ld.b            t1,     a1,     0       // alpha is byte 0 of an ABGR pixel
    addi.d          t0,     t0,     -1
    addi.d          a1,     a1,     4
    andi            t1,     t1,     0xff
    slli.w          t2,     t1,     6
    srli.w          t3,     t1,     2
    or              t1,     t2,     t3
    st.h            t1,     a0,     0
    addi.d          a0,     a0,     2
    bnez            t0,     3b
4:
endfunc

/*
 * void rgbaToA_lsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
 *                  const uint8_t *unused2, int width, uint32_t *unused, void *opq)
 */
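
/* Same spreading as abgrToA above, but for RGBA the alpha byte is at 4*i+3,
 * which is why the vector loads below start at offsets 3 and 19 (3 and 35 in
 * the LASX version). A scalar sketch of the per-pixel step (an assumption,
 * not the exact C reference):
 *
 *     int a = src[4 * i + 3];
 *     dst[i] = (a << 6) | (a >> 2);
 */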

function rgbaToA_lsx
    andi            t0,     a4,     7
    srli.d          a4,     a4,     3
    vxor.v          vr0,    vr0,    vr0
    beqz            a4,     2f
1:
    vld             vr1,    a1,     3
    vld             vr2,    a1,     19
    addi.d          a4,     a4,     -1
    addi.d          a1,     a1,     32
    vpickev.b       vr3,    vr2,    vr1
    vpackev.b       vr3,    vr0,    vr3
    vslli.h         vr1,    vr3,    6
    vsrli.h         vr2,    vr3,    2
    vor.v           vr3,    vr2,    vr1
    vst             vr3,    a0,     0
    addi.d          a0,     a0,     16
    bnez            a4,     1b
2:
    beqz            t0,     4f
3:
    ld.b            t1,     a1,     3
    addi.d          t0,     t0,     -1
    addi.d          a1,     a1,     4
    andi            t1,     t1,     0xff
    slli.w          t2,     t1,     6
    srli.w          t3,     t1,     2
    or              t1,     t2,     t3
    st.h            t1,     a0,     0
    addi.d          a0,     a0,     2
    bnez            t0,     3b
4:
endfunc

function rgbaToA_lasx
    andi            t0,     a4,     15
    srli.d          a4,     a4,     4
    xvxor.v         xr0,    xr0,    xr0
    beqz            a4,     2f
1:
    xvld            xr1,    a1,     3
    xvld            xr2,    a1,     35
    addi.d          a4,     a4,     -1
    addi.d          a1,     a1,     64
    xvpickev.b      xr3,    xr2,    xr1
    xvpermi.d       xr3,    xr3,    0xd8
    xvpackev.b      xr3,    xr0,    xr3
    xvslli.h        xr1,    xr3,    6
    xvsrli.h        xr2,    xr3,    2
    xvor.v          xr3,    xr2,    xr1
    xvst            xr3,    a0,     0
    addi.d          a0,     a0,     32
    bnez            a4,     1b
2:
    beqz            t0,     4f
3:
    ld.b            t1,     a1,     3
    addi.d          t0,     t0,     -1
    addi.d          a1,     a1,     4
    andi            t1,     t1,     0xff
    slli.w          t2,     t1,     6
    srli.w          t3,     t1,     2
    or              t1,     t2,     t3
    st.h            t1,     a0,     0
    addi.d          a0,     a0,     2
    bnez            t0,     3b
4:
endfunc