swscale.S 81 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236
  1. /*
  2. * Loongson LSX optimized swscale
  3. *
  4. * Copyright (c) 2023 Loongson Technology Corporation Limited
  5. * Contributed by Lu Wang <wanglu@loongson.cn>
  6. *
  7. * This file is part of FFmpeg.
  8. *
  9. * FFmpeg is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation; either
  12. * version 2.1 of the License, or (at your option) any later version.
  13. *
  14. * FFmpeg is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with FFmpeg; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. */
  23. #include "libavcodec/loongarch/loongson_asm.S"
  /*
   * void ff_hscale_8_to_15_lsx(SwsInternal *c, int16_t *dst, int dstW,
   *                            const uint8_t *src, const int16_t *filter,
   *                            const int32_t *filterPos, int filterSize)
   *
   * Horizontal scaler, 8-bit input samples to 15-bit output samples.
   * For every i in [0, dstW):
   *     acc    = sum over j in [0, filterSize) of
   *              src[filterPos[i] + j] * filter[i * filterSize + j]
   *     dst[i] = min(acc >> 7, 32767)          (stored as 16-bit)
   *
   * Register use: a1 = dst, a2 = outputs still to produce (starts as dstW),
   * a3 = src, a4 = filter, a5 = filterPos, a6 = filterSize; a0 (c) is unused.
   * Constants: t0 = 32767 (clamp), t7 = 4, t8 = 8, vr0 = 0,
   * vr20 = t0 replicated into every 32-bit lane.
   *
   * Code paths:
   *   filterSize == 4 : .LOOP_DSTW4, 8 outputs per iteration
   *   filterSize == 8 : .LOOP_DSTW8, 8 outputs per iteration
   *   filterSize  > 8 : .LOOP_START, 4 outputs per iteration
   *   any other filterSize, and all leftover outputs: scalar loop at .RES
   *
   * Every vector loop is entered through its bottom-of-loop test, so it only
   * runs while a full batch of outputs remains; nothing beyond
   * dst[dstW - 1] / filterPos[dstW - 1] is touched, and dstW <= 0 is a no-op.
   *
   * vdp2.w.h is not defined in this file; it is presumably a macro from the
   * included loongson_asm.S that multiplies 16-bit lanes and adds adjacent
   * products into 32-bit lanes -- confirm there.
   */
  function ff_hscale_8_to_15_lsx
      /* 9 callee-saved registers need 72 bytes; the frame is rounded up to
       * 80 so that sp stays 16-byte aligned. */
      addi.d          sp, sp, -80
      st.d            s0, sp, 0
      st.d            s1, sp, 8
      st.d            s2, sp, 16
      st.d            s3, sp, 24
      st.d            s4, sp, 32
      st.d            s5, sp, 40
      st.d            s6, sp, 48
      st.d            s7, sp, 56
      st.d            s8, sp, 64

      li.w            t0, 32767
      li.w            t8, 8
      li.w            t7, 4
      vldi            vr0, 0
      vreplgr2vr.w    vr20, t0

      /* Dispatch on filterSize; each target first checks the output count. */
      beq             a6, t7, .HS8TO15_CHECK4
      beq             a6, t8, .HS8TO15_CHECK8
      blt             t8, a6, .HS8TO15_CHECK
      b               .END_DSTW4

      /* ---- filterSize > 8: four outputs per pass ---------------------- */
  .LOOP_START:
      li.w            t1, 0               /* output index within the group */
      li.w            s1, 0               /* taps consumed by the vector loop */
      li.w            s2, 0               /* scalar accumulators, outputs 0..3 */
      li.w            s3, 0
      li.w            s4, 0
      li.w            s5, 0
      vldi            vr22, 0             /* vector accumulators, outputs 0..3 */
      addi.w          s0, a6, -7          /* vector loop runs while s1 < filterSize - 7 */
      slli.w          s7, a6, 1           /* byte offset of filter row 1 */
      slli.w          s8, a6, 2           /* byte offset of filter row 2 */
      add.w           t6, s7, s8          /* byte offset of filter row 3 */
  .LOOP_DSTW:
      /* Eight taps of each of the four outputs. */
      ld.w            t2, a5, 0
      ld.w            t3, a5, 4
      ld.w            t4, a5, 8
      ld.w            t5, a5, 12
      fldx.d          f1, a3, t2
      fldx.d          f2, a3, t3
      fldx.d          f3, a3, t4
      fldx.d          f4, a3, t5
      vld             vr9, a4, 0
      vldx            vr10, a4, s7
      vldx            vr11, a4, s8
      vldx            vr12, a4, t6
      /* Interleave with zero: widen the 8 source bytes to 16-bit lanes. */
      vilvl.b         vr1, vr0, vr1
      vilvl.b         vr2, vr0, vr2
      vilvl.b         vr3, vr0, vr3
      vilvl.b         vr4, vr0, vr4
      vdp2.w.h        vr17, vr1, vr9
      vdp2.w.h        vr18, vr2, vr10
      vdp2.w.h        vr19, vr3, vr11
      vdp2.w.h        vr21, vr4, vr12
      /* Horizontal add down to one sum per output. */
      vhaddw.d.w      vr1, vr17, vr17
      vhaddw.d.w      vr2, vr18, vr18
      vhaddw.d.w      vr3, vr19, vr19
      vhaddw.d.w      vr4, vr21, vr21
      vhaddw.q.d      vr1, vr1, vr1
      vhaddw.q.d      vr2, vr2, vr2
      vhaddw.q.d      vr3, vr3, vr3
      vhaddw.q.d      vr4, vr4, vr4
      /* Gather the four sums into the four 32-bit lanes of vr1. */
      vilvl.w         vr1, vr2, vr1
      vilvl.w         vr3, vr4, vr3
      vilvl.d         vr1, vr3, vr1
      vadd.w          vr22, vr22, vr1
      addi.w          s1, s1, 8
      addi.d          a3, a3, 8           /* src    += 8 samples */
      addi.d          a4, a4, 16          /* filter += 8 coefficients */
      blt             s1, s0, .LOOP_DSTW
      blt             s1, a6, .DSTWA
      b               .END_FILTER

      /* Remaining (filterSize - s1) taps, one output at a time. a3 and a4
       * are already advanced by s1 elements, so t3 restarts at 0 while s6
       * counts from s1 up to filterSize. */
  .DSTWA:
      ld.w            t2, a5, 0
      li.w            t3, 0
      move            s6, s1
  .FILTERSIZEA:
      add.w           t4, t2, t3
      ldx.bu          t5, a3, t4
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t6, t6, 1
      ldx.h           t6, a4, t6
      mul.w           t6, t5, t6
      add.w           s2, s2, t6
      addi.w          t3, t3, 1
      addi.w          s6, s6, 1
      blt             s6, a6, .FILTERSIZEA

      ld.w            t2, a5, 4
      li.w            t3, 0
      move            s6, s1
      addi.w          t1, t1, 1
  .FILTERSIZEB:
      add.w           t4, t2, t3
      ldx.bu          t5, a3, t4
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t6, t6, 1
      ldx.h           t6, a4, t6
      mul.w           t6, t5, t6
      add.w           s3, s3, t6
      addi.w          t3, t3, 1
      addi.w          s6, s6, 1
      blt             s6, a6, .FILTERSIZEB

      ld.w            t2, a5, 8
      addi.w          t1, t1, 1
      li.w            t3, 0
      move            s6, s1
  .FILTERSIZEC:
      add.w           t4, t2, t3
      ldx.bu          t5, a3, t4
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t6, t6, 1
      ldx.h           t6, a4, t6
      mul.w           t6, t5, t6
      add.w           s4, s4, t6
      addi.w          t3, t3, 1
      addi.w          s6, s6, 1
      blt             s6, a6, .FILTERSIZEC

      ld.w            t2, a5, 12
      addi.w          t1, t1, 1
      move            s6, s1
      li.w            t3, 0
  .FILTERSIZED:
      add.w           t4, t2, t3
      ldx.bu          t5, a3, t4
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t6, t6, 1
      ldx.h           t6, a4, t6
      mul.w           t6, t5, t6
      add.w           s5, s5, t6
      addi.w          t3, t3, 1
      addi.w          s6, s6, 1
      blt             s6, a6, .FILTERSIZED

  .END_FILTER:
      /* Combine vector and scalar partial sums, shift, clamp, store. */
      vpickve2gr.w    t1, vr22, 0
      vpickve2gr.w    t2, vr22, 1
      vpickve2gr.w    t3, vr22, 2
      vpickve2gr.w    t4, vr22, 3
      add.w           s2, s2, t1
      add.w           s3, s3, t2
      add.w           s4, s4, t3
      add.w           s5, s5, t4
      srai.w          s2, s2, 7
      srai.w          s3, s3, 7
      srai.w          s4, s4, 7
      srai.w          s5, s5, 7
      /* sN = (sN < t0) ? sN : t0 */
      slt             t1, s2, t0
      slt             t2, s3, t0
      slt             t3, s4, t0
      slt             t4, s5, t0
      maskeqz         s2, s2, t1
      maskeqz         s3, s3, t2
      maskeqz         s4, s4, t3
      maskeqz         s5, s5, t4
      masknez         t1, t0, t1
      masknez         t2, t0, t2
      masknez         t3, t0, t3
      masknez         t4, t0, t4
      or              s2, s2, t1
      or              s3, s3, t2
      or              s4, s4, t3
      or              s5, s5, t4
      st.h            s2, a1, 0
      st.h            s3, a1, 2
      st.h            s4, a1, 4
      st.h            s5, a1, 6
      addi.d          a1, a1, 8           /* dst       += 4 */
      sub.d           a3, a3, s1          /* undo the src advance of the vector loop */
      addi.d          a5, a5, 16          /* filterPos += 4 */
      /* filter += 4 * filterSize coefficients, net of the 2 * s1 bytes the
       * vector loop already added. */
      slli.d          t3, a6, 3
      add.d           a4, a4, t3
      sub.d           a4, a4, s1
      sub.d           a4, a4, s1
      addi.d          a2, a2, -4
  .HS8TO15_CHECK:
      bge             a2, t7, .LOOP_START /* at least 4 outputs left */
      b               .END_DSTW4

      /* ---- filterSize == 8: eight outputs per pass -------------------- */
  .LOOP_DSTW8:
      ld.w            t1, a5, 0
      ld.w            t2, a5, 4
      ld.w            t3, a5, 8
      ld.w            t4, a5, 12
      fldx.d          f1, a3, t1
      fldx.d          f2, a3, t2
      fldx.d          f3, a3, t3
      fldx.d          f4, a3, t4
      ld.w            t1, a5, 16
      ld.w            t2, a5, 20
      ld.w            t3, a5, 24
      ld.w            t4, a5, 28
      fldx.d          f5, a3, t1
      fldx.d          f6, a3, t2
      fldx.d          f7, a3, t3
      fldx.d          f8, a3, t4
      vld             vr9, a4, 0
      vld             vr10, a4, 16
      vld             vr11, a4, 32
      vld             vr12, a4, 48
      vld             vr13, a4, 64
      vld             vr14, a4, 80
      vld             vr15, a4, 96
      vld             vr16, a4, 112
      vilvl.b         vr1, vr0, vr1
      vilvl.b         vr2, vr0, vr2
      vilvl.b         vr3, vr0, vr3
      vilvl.b         vr4, vr0, vr4
      vilvl.b         vr5, vr0, vr5
      vilvl.b         vr6, vr0, vr6
      vilvl.b         vr7, vr0, vr7
      vilvl.b         vr8, vr0, vr8
      vdp2.w.h        vr17, vr1, vr9
      vdp2.w.h        vr18, vr2, vr10
      vdp2.w.h        vr19, vr3, vr11
      vdp2.w.h        vr21, vr4, vr12
      vdp2.w.h        vr1, vr5, vr13
      vdp2.w.h        vr2, vr6, vr14
      vdp2.w.h        vr3, vr7, vr15
      vdp2.w.h        vr4, vr8, vr16
      vhaddw.d.w      vr5, vr1, vr1
      vhaddw.d.w      vr6, vr2, vr2
      vhaddw.d.w      vr7, vr3, vr3
      vhaddw.d.w      vr8, vr4, vr4
      vhaddw.d.w      vr1, vr17, vr17
      vhaddw.d.w      vr2, vr18, vr18
      vhaddw.d.w      vr3, vr19, vr19
      vhaddw.d.w      vr4, vr21, vr21
      vhaddw.q.d      vr1, vr1, vr1
      vhaddw.q.d      vr2, vr2, vr2
      vhaddw.q.d      vr3, vr3, vr3
      vhaddw.q.d      vr4, vr4, vr4
      vhaddw.q.d      vr5, vr5, vr5
      vhaddw.q.d      vr6, vr6, vr6
      vhaddw.q.d      vr7, vr7, vr7
      vhaddw.q.d      vr8, vr8, vr8
      /* vr1 = sums of outputs 0..3, vr5 = sums of outputs 4..7. */
      vilvl.w         vr1, vr2, vr1
      vilvl.w         vr3, vr4, vr3
      vilvl.w         vr5, vr6, vr5
      vilvl.w         vr7, vr8, vr7
      vilvl.d         vr1, vr3, vr1
      vilvl.d         vr5, vr7, vr5
      vsrai.w         vr1, vr1, 7
      vsrai.w         vr5, vr5, 7
      vmin.w          vr1, vr1, vr20
      vmin.w          vr5, vr5, vr20
      vpickev.h       vr1, vr5, vr1       /* keep the low 16 bits of each lane */
      vst             vr1, a1, 0
      addi.d          a1, a1, 16          /* dst       += 8 */
      addi.d          a5, a5, 32          /* filterPos += 8 */
      addi.d          a4, a4, 128         /* filter    += 8 rows of 8 */
      addi.d          a2, a2, -8
  .HS8TO15_CHECK8:
      bge             a2, t8, .LOOP_DSTW8 /* at least 8 outputs left */
      b               .END_DSTW4

      /* ---- filterSize == 4: eight outputs per pass -------------------- */
  .LOOP_DSTW4:
      ld.w            t1, a5, 0
      ld.w            t2, a5, 4
      ld.w            t3, a5, 8
      ld.w            t4, a5, 12
      fldx.s          f1, a3, t1
      fldx.s          f2, a3, t2
      fldx.s          f3, a3, t3
      fldx.s          f4, a3, t4
      ld.w            t1, a5, 16
      ld.w            t2, a5, 20
      ld.w            t3, a5, 24
      ld.w            t4, a5, 28
      fldx.s          f5, a3, t1
      fldx.s          f6, a3, t2
      fldx.s          f7, a3, t3
      fldx.s          f8, a3, t4
      vld             vr9, a4, 0
      vld             vr10, a4, 16
      vld             vr11, a4, 32
      vld             vr12, a4, 48
      /* Pair up the 4-byte source groups so each vector holds two outputs. */
      vilvl.w         vr1, vr2, vr1
      vilvl.w         vr3, vr4, vr3
      vilvl.w         vr5, vr6, vr5
      vilvl.w         vr7, vr8, vr7
      vilvl.b         vr1, vr0, vr1
      vilvl.b         vr3, vr0, vr3
      vilvl.b         vr5, vr0, vr5
      vilvl.b         vr7, vr0, vr7
      vdp2.w.h        vr13, vr1, vr9
      vdp2.w.h        vr14, vr3, vr10
      vdp2.w.h        vr15, vr5, vr11
      vdp2.w.h        vr16, vr7, vr12
      vhaddw.d.w      vr13, vr13, vr13
      vhaddw.d.w      vr14, vr14, vr14
      vhaddw.d.w      vr15, vr15, vr15
      vhaddw.d.w      vr16, vr16, vr16
      vpickev.w       vr13, vr14, vr13
      vpickev.w       vr15, vr16, vr15
      vsrai.w         vr13, vr13, 7
      vsrai.w         vr15, vr15, 7
      vmin.w          vr13, vr13, vr20
      vmin.w          vr15, vr15, vr20
      vpickev.h       vr13, vr15, vr13
      vst             vr13, a1, 0
      addi.d          a1, a1, 16          /* dst       += 8 */
      addi.d          a5, a5, 32          /* filterPos += 8 */
      addi.d          a4, a4, 64          /* filter    += 8 rows of 4 */
      addi.d          a2, a2, -8
  .HS8TO15_CHECK4:
      bge             a2, t8, .LOOP_DSTW4 /* at least 8 outputs left */
      /* fall through to the scalar loop */

      /* ---- scalar loop: leftover outputs, and filterSize < 4 or 5..7 -- */
  .END_DSTW4:
      bge             zero, a2, .END_LOOP /* nothing left to produce */
  .RES:
      li.w            t1, 0               /* i */
  .DSTW:
      slli.w          t2, t1, 2
      ldx.w           t2, a5, t2          /* t2 = filterPos[i] */
      li.w            t3, 0               /* j */
      li.w            t8, 0               /* acc */
  .FILTERSIZE:
      add.w           t4, t2, t3
      ldx.bu          t5, a3, t4          /* src[filterPos[i] + j] */
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t7, t6, 1
      ldx.h           t7, a4, t7          /* filter[i * filterSize + j] */
      mul.w           t7, t5, t7
      add.w           t8, t8, t7
      addi.w          t3, t3, 1
      blt             t3, a6, .FILTERSIZE
      srai.w          t8, t8, 7
      /* t8 = (t8 < t0) ? t8 : t0 */
      slt             t5, t8, t0
      maskeqz         t8, t8, t5
      masknez         t5, t0, t5
      or              t8, t8, t5
      slli.w          t4, t1, 1
      stx.h           t8, a1, t4          /* dst[i] */
      addi.w          t1, t1, 1
      blt             t1, a2, .DSTW

  .END_LOOP:
      ld.d            s0, sp, 0
      ld.d            s1, sp, 8
      ld.d            s2, sp, 16
      ld.d            s3, sp, 24
      ld.d            s4, sp, 32
      ld.d            s5, sp, 40
      ld.d            s6, sp, 48
      ld.d            s7, sp, 56
      ld.d            s8, sp, 64
      addi.d          sp, sp, 80
  endfunc
  /*
   * void ff_hscale_8_to_19_lsx(SwsInternal *c, int16_t *dst, int dstW,
   *                            const uint8_t *src, const int16_t *filter,
   *                            const int32_t *filterPos, int filterSize)
   *
   * Horizontal scaler, 8-bit input samples to 19-bit output samples.
   * For every i in [0, dstW):
   *     acc    = sum over j in [0, filterSize) of
   *              src[filterPos[i] + j] * filter[i * filterSize + j]
   *     dst[i] = min(acc >> 3, 524287)
   * Although the prototype declares dst as int16_t *, every result is
   * stored as a 32-bit word (st.w / stx.w / full-vector vst of 32-bit lanes).
   *
   * Register use: a1 = dst, a2 = outputs still to produce (starts as dstW),
   * a3 = src, a4 = filter, a5 = filterPos, a6 = filterSize; a0 (c) is unused.
   * Constants: t0 = 524287 (clamp), t7 = 4, t8 = 8, vr0 = 0,
   * vr20 = t0 replicated into every 32-bit lane.
   *
   * Code paths:
   *   filterSize == 4 : .LOOP_DST4, 8 outputs per iteration
   *   filterSize == 8 : .LOOP_DST8, 8 outputs per iteration
   *   filterSize  > 8 : .LOOP, 4 outputs per iteration
   *   any other filterSize, and all leftover outputs: scalar loop at .RESA
   *
   * Every vector loop is entered through its bottom-of-loop test, so it only
   * runs while a full batch of outputs remains; nothing beyond
   * dst[dstW - 1] / filterPos[dstW - 1] is touched, and dstW <= 0 is a no-op.
   *
   * vdp2.w.h is not defined in this file; it is presumably a macro from the
   * included loongson_asm.S that multiplies 16-bit lanes and adds adjacent
   * products into 32-bit lanes -- confirm there.
   */
  function ff_hscale_8_to_19_lsx
      /* 9 callee-saved registers need 72 bytes; the frame is rounded up to
       * 80 so that sp stays 16-byte aligned. */
      addi.d          sp, sp, -80
      st.d            s0, sp, 0
      st.d            s1, sp, 8
      st.d            s2, sp, 16
      st.d            s3, sp, 24
      st.d            s4, sp, 32
      st.d            s5, sp, 40
      st.d            s6, sp, 48
      st.d            s7, sp, 56
      st.d            s8, sp, 64

      li.w            t0, 524287
      li.w            t8, 8
      li.w            t7, 4
      vldi            vr0, 0
      vreplgr2vr.w    vr20, t0

      /* Dispatch on filterSize; each target first checks the output count. */
      beq             a6, t7, .HS8TO19_CHECK4
      beq             a6, t8, .HS8TO19_CHECK8
      blt             t8, a6, .HS8TO19_CHECK
      b               .END_DST4

      /* ---- filterSize > 8: four outputs per pass ---------------------- */
  .LOOP:
      li.w            t1, 0               /* output index within the group */
      li.w            s1, 0               /* taps consumed by the vector loop */
      li.w            s2, 0               /* scalar accumulators, outputs 0..3 */
      li.w            s3, 0
      li.w            s4, 0
      li.w            s5, 0
      vldi            vr22, 0             /* vector accumulators, outputs 0..3 */
      addi.w          s0, a6, -7          /* vector loop runs while s1 < filterSize - 7 */
      slli.w          s7, a6, 1           /* byte offset of filter row 1 */
      slli.w          s8, a6, 2           /* byte offset of filter row 2 */
      add.w           t6, s7, s8          /* byte offset of filter row 3 */
  .LOOP_DST:
      /* Eight taps of each of the four outputs. */
      ld.w            t2, a5, 0
      ld.w            t3, a5, 4
      ld.w            t4, a5, 8
      ld.w            t5, a5, 12
      fldx.d          f1, a3, t2
      fldx.d          f2, a3, t3
      fldx.d          f3, a3, t4
      fldx.d          f4, a3, t5
      vld             vr9, a4, 0
      vldx            vr10, a4, s7
      vldx            vr11, a4, s8
      vldx            vr12, a4, t6
      /* Interleave with zero: widen the 8 source bytes to 16-bit lanes. */
      vilvl.b         vr1, vr0, vr1
      vilvl.b         vr2, vr0, vr2
      vilvl.b         vr3, vr0, vr3
      vilvl.b         vr4, vr0, vr4
      vdp2.w.h        vr17, vr1, vr9
      vdp2.w.h        vr18, vr2, vr10
      vdp2.w.h        vr19, vr3, vr11
      vdp2.w.h        vr21, vr4, vr12
      /* Horizontal add down to one sum per output. */
      vhaddw.d.w      vr1, vr17, vr17
      vhaddw.d.w      vr2, vr18, vr18
      vhaddw.d.w      vr3, vr19, vr19
      vhaddw.d.w      vr4, vr21, vr21
      vhaddw.q.d      vr1, vr1, vr1
      vhaddw.q.d      vr2, vr2, vr2
      vhaddw.q.d      vr3, vr3, vr3
      vhaddw.q.d      vr4, vr4, vr4
      /* Gather the four sums into the four 32-bit lanes of vr1. */
      vilvl.w         vr1, vr2, vr1
      vilvl.w         vr3, vr4, vr3
      vilvl.d         vr1, vr3, vr1
      vadd.w          vr22, vr22, vr1
      addi.w          s1, s1, 8
      addi.d          a3, a3, 8           /* src    += 8 samples */
      addi.d          a4, a4, 16          /* filter += 8 coefficients */
      blt             s1, s0, .LOOP_DST
      blt             s1, a6, .DSTA
      b               .END_FILTERA

      /* Remaining (filterSize - s1) taps, one output at a time. a3 and a4
       * are already advanced by s1 elements, so t3 restarts at 0 while s6
       * counts from s1 up to filterSize. */
  .DSTA:
      ld.w            t2, a5, 0
      li.w            t3, 0
      move            s6, s1
  .FILTERA:
      add.w           t4, t2, t3
      ldx.bu          t5, a3, t4
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t6, t6, 1
      ldx.h           t6, a4, t6
      mul.w           t6, t5, t6
      add.w           s2, s2, t6
      addi.w          t3, t3, 1
      addi.w          s6, s6, 1
      blt             s6, a6, .FILTERA

      ld.w            t2, a5, 4
      li.w            t3, 0
      move            s6, s1
      addi.w          t1, t1, 1
  .FILTERB:
      add.w           t4, t2, t3
      ldx.bu          t5, a3, t4
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t6, t6, 1
      ldx.h           t6, a4, t6
      mul.w           t6, t5, t6
      add.w           s3, s3, t6
      addi.w          t3, t3, 1
      addi.w          s6, s6, 1
      blt             s6, a6, .FILTERB

      ld.w            t2, a5, 8
      addi.w          t1, t1, 1
      li.w            t3, 0
      move            s6, s1
  .FILTERC:
      add.w           t4, t2, t3
      ldx.bu          t5, a3, t4
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t6, t6, 1
      ldx.h           t6, a4, t6
      mul.w           t6, t5, t6
      add.w           s4, s4, t6
      addi.w          t3, t3, 1
      addi.w          s6, s6, 1
      blt             s6, a6, .FILTERC

      ld.w            t2, a5, 12
      addi.w          t1, t1, 1
      move            s6, s1
      li.w            t3, 0
  .FILTERD:
      add.w           t4, t2, t3
      ldx.bu          t5, a3, t4
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t6, t6, 1
      ldx.h           t6, a4, t6
      mul.w           t6, t5, t6
      add.w           s5, s5, t6
      addi.w          t3, t3, 1
      addi.w          s6, s6, 1
      blt             s6, a6, .FILTERD

  .END_FILTERA:
      /* Combine vector and scalar partial sums, shift, clamp, store. */
      vpickve2gr.w    t1, vr22, 0
      vpickve2gr.w    t2, vr22, 1
      vpickve2gr.w    t3, vr22, 2
      vpickve2gr.w    t4, vr22, 3
      add.w           s2, s2, t1
      add.w           s3, s3, t2
      add.w           s4, s4, t3
      add.w           s5, s5, t4
      srai.w          s2, s2, 3
      srai.w          s3, s3, 3
      srai.w          s4, s4, 3
      srai.w          s5, s5, 3
      /* sN = (sN < t0) ? sN : t0 */
      slt             t1, s2, t0
      slt             t2, s3, t0
      slt             t3, s4, t0
      slt             t4, s5, t0
      maskeqz         s2, s2, t1
      maskeqz         s3, s3, t2
      maskeqz         s4, s4, t3
      maskeqz         s5, s5, t4
      masknez         t1, t0, t1
      masknez         t2, t0, t2
      masknez         t3, t0, t3
      masknez         t4, t0, t4
      or              s2, s2, t1
      or              s3, s3, t2
      or              s4, s4, t3
      or              s5, s5, t4
      st.w            s2, a1, 0
      st.w            s3, a1, 4
      st.w            s4, a1, 8
      st.w            s5, a1, 12
      addi.d          a1, a1, 16          /* dst       += 4 (32-bit elements) */
      sub.d           a3, a3, s1          /* undo the src advance of the vector loop */
      addi.d          a5, a5, 16          /* filterPos += 4 */
      /* filter += 4 * filterSize coefficients, net of the 2 * s1 bytes the
       * vector loop already added. */
      slli.d          t3, a6, 3
      add.d           a4, a4, t3
      sub.d           a4, a4, s1
      sub.d           a4, a4, s1
      addi.d          a2, a2, -4
  .HS8TO19_CHECK:
      bge             a2, t7, .LOOP       /* at least 4 outputs left */
      b               .END_DST4

      /* ---- filterSize == 8: eight outputs per pass -------------------- */
  .LOOP_DST8:
      ld.w            t1, a5, 0
      ld.w            t2, a5, 4
      ld.w            t3, a5, 8
      ld.w            t4, a5, 12
      fldx.d          f1, a3, t1
      fldx.d          f2, a3, t2
      fldx.d          f3, a3, t3
      fldx.d          f4, a3, t4
      ld.w            t1, a5, 16
      ld.w            t2, a5, 20
      ld.w            t3, a5, 24
      ld.w            t4, a5, 28
      fldx.d          f5, a3, t1
      fldx.d          f6, a3, t2
      fldx.d          f7, a3, t3
      fldx.d          f8, a3, t4
      vld             vr9, a4, 0
      vld             vr10, a4, 16
      vld             vr11, a4, 32
      vld             vr12, a4, 48
      vld             vr13, a4, 64
      vld             vr14, a4, 80
      vld             vr15, a4, 96
      vld             vr16, a4, 112
      vilvl.b         vr1, vr0, vr1
      vilvl.b         vr2, vr0, vr2
      vilvl.b         vr3, vr0, vr3
      vilvl.b         vr4, vr0, vr4
      vilvl.b         vr5, vr0, vr5
      vilvl.b         vr6, vr0, vr6
      vilvl.b         vr7, vr0, vr7
      vilvl.b         vr8, vr0, vr8
      vdp2.w.h        vr17, vr1, vr9
      vdp2.w.h        vr18, vr2, vr10
      vdp2.w.h        vr19, vr3, vr11
      vdp2.w.h        vr21, vr4, vr12
      vdp2.w.h        vr1, vr5, vr13
      vdp2.w.h        vr2, vr6, vr14
      vdp2.w.h        vr3, vr7, vr15
      vdp2.w.h        vr4, vr8, vr16
      vhaddw.d.w      vr5, vr1, vr1
      vhaddw.d.w      vr6, vr2, vr2
      vhaddw.d.w      vr7, vr3, vr3
      vhaddw.d.w      vr8, vr4, vr4
      vhaddw.d.w      vr1, vr17, vr17
      vhaddw.d.w      vr2, vr18, vr18
      vhaddw.d.w      vr3, vr19, vr19
      vhaddw.d.w      vr4, vr21, vr21
      vhaddw.q.d      vr1, vr1, vr1
      vhaddw.q.d      vr2, vr2, vr2
      vhaddw.q.d      vr3, vr3, vr3
      vhaddw.q.d      vr4, vr4, vr4
      vhaddw.q.d      vr5, vr5, vr5
      vhaddw.q.d      vr6, vr6, vr6
      vhaddw.q.d      vr7, vr7, vr7
      vhaddw.q.d      vr8, vr8, vr8
      /* vr1 = sums of outputs 0..3, vr5 = sums of outputs 4..7. */
      vilvl.w         vr1, vr2, vr1
      vilvl.w         vr3, vr4, vr3
      vilvl.w         vr5, vr6, vr5
      vilvl.w         vr7, vr8, vr7
      vilvl.d         vr1, vr3, vr1
      vilvl.d         vr5, vr7, vr5
      vsrai.w         vr1, vr1, 3
      vsrai.w         vr5, vr5, 3
      vmin.w          vr1, vr1, vr20
      vmin.w          vr5, vr5, vr20
      vst             vr1, a1, 0
      vst             vr5, a1, 16
      addi.d          a1, a1, 32          /* dst       += 8 (32-bit elements) */
      addi.d          a5, a5, 32          /* filterPos += 8 */
      addi.d          a4, a4, 128         /* filter    += 8 rows of 8 */
      addi.d          a2, a2, -8
  .HS8TO19_CHECK8:
      bge             a2, t8, .LOOP_DST8  /* at least 8 outputs left */
      b               .END_DST4

      /* ---- filterSize == 4: eight outputs per pass -------------------- */
  .LOOP_DST4:
      ld.w            t1, a5, 0
      ld.w            t2, a5, 4
      ld.w            t3, a5, 8
      ld.w            t4, a5, 12
      fldx.s          f1, a3, t1
      fldx.s          f2, a3, t2
      fldx.s          f3, a3, t3
      fldx.s          f4, a3, t4
      ld.w            t1, a5, 16
      ld.w            t2, a5, 20
      ld.w            t3, a5, 24
      ld.w            t4, a5, 28
      fldx.s          f5, a3, t1
      fldx.s          f6, a3, t2
      fldx.s          f7, a3, t3
      fldx.s          f8, a3, t4
      vld             vr9, a4, 0
      vld             vr10, a4, 16
      vld             vr11, a4, 32
      vld             vr12, a4, 48
      /* Pair up the 4-byte source groups so each vector holds two outputs. */
      vilvl.w         vr1, vr2, vr1
      vilvl.w         vr3, vr4, vr3
      vilvl.w         vr5, vr6, vr5
      vilvl.w         vr7, vr8, vr7
      vilvl.b         vr1, vr0, vr1
      vilvl.b         vr3, vr0, vr3
      vilvl.b         vr5, vr0, vr5
      vilvl.b         vr7, vr0, vr7
      vdp2.w.h        vr13, vr1, vr9
      vdp2.w.h        vr14, vr3, vr10
      vdp2.w.h        vr15, vr5, vr11
      vdp2.w.h        vr16, vr7, vr12
      vhaddw.d.w      vr13, vr13, vr13
      vhaddw.d.w      vr14, vr14, vr14
      vhaddw.d.w      vr15, vr15, vr15
      vhaddw.d.w      vr16, vr16, vr16
      vpickev.w       vr13, vr14, vr13
      vpickev.w       vr15, vr16, vr15
      vsrai.w         vr13, vr13, 3
      vsrai.w         vr15, vr15, 3
      vmin.w          vr13, vr13, vr20
      vmin.w          vr15, vr15, vr20
      vst             vr13, a1, 0
      vst             vr15, a1, 16
      addi.d          a1, a1, 32          /* dst       += 8 (32-bit elements) */
      addi.d          a5, a5, 32          /* filterPos += 8 */
      addi.d          a4, a4, 64          /* filter    += 8 rows of 4 */
      addi.d          a2, a2, -8
  .HS8TO19_CHECK4:
      bge             a2, t8, .LOOP_DST4  /* at least 8 outputs left */
      /* fall through to the scalar loop */

      /* ---- scalar loop: leftover outputs, and filterSize < 4 or 5..7 -- */
  .END_DST4:
      bge             zero, a2, .END      /* nothing left to produce */
  .RESA:
      li.w            t1, 0               /* i */
  .DST:
      slli.w          t2, t1, 2
      ldx.w           t2, a5, t2          /* t2 = filterPos[i] */
      li.w            t3, 0               /* j */
      li.w            t8, 0               /* acc */
  .FILTER:
      add.w           t4, t2, t3
      ldx.bu          t5, a3, t4          /* src[filterPos[i] + j] */
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t7, t6, 1
      ldx.h           t7, a4, t7          /* filter[i * filterSize + j] */
      mul.w           t7, t5, t7
      add.w           t8, t8, t7
      addi.w          t3, t3, 1
      blt             t3, a6, .FILTER
      srai.w          t8, t8, 3
      /* t8 = (t8 < t0) ? t8 : t0 */
      slt             t5, t8, t0
      maskeqz         t8, t8, t5
      masknez         t5, t0, t5
      or              t8, t8, t5
      slli.w          t4, t1, 2
      stx.w           t8, a1, t4          /* dst[i], 32-bit store */
      addi.w          t1, t1, 1
      blt             t1, a2, .DST

  .END:
      ld.d            s0, sp, 0
      ld.d            s1, sp, 8
      ld.d            s2, sp, 16
      ld.d            s3, sp, 24
      ld.d            s4, sp, 32
      ld.d            s5, sp, 40
      ld.d            s6, sp, 48
      ld.d            s7, sp, 56
      ld.d            s8, sp, 64
      addi.d          sp, sp, 80
  endfunc
  /*
   * void ff_hscale_16_to_15_sub_lsx(SwsInternal *c, int16_t *dst, int dstW,
   *                                 const uint8_t *src, const int16_t *filter,
   *                                 const int32_t *filterPos, int filterSize, int sh)
   *
   * Horizontal FIR scaler.  For every i in [0, dstW):
   *     acc    = sum over j < filterSize of
   *              (unsigned 16-bit) src16[filterPos[i] + j] * filter[filterSize * i + j]
   *     dst[i] = min(acc >> sh, 32767)      (32-bit accumulate, arithmetic shift)
   * There is no lower clamp.
   *
   * Arguments: a1 = dst, a2 = dstW, a3 = src, a4 = filter, a5 = filterPos,
   *            a6 = filterSize, a7 = sh.  a0 (c) is never read.
   *
   * Paths, selected by filterSize:
   *     == 4 : LSX, 8 outputs per iteration
   *     == 8 : LSX, 8 outputs per iteration
   *     >  8 : LSX, 4 outputs per iteration; taps are consumed 8 at a time and
   *            the taps left over are accumulated with scalar code
   *     else : scalar only
   * A vector batch is started only while a whole batch of outputs remains, so
   * nothing is read or written past dstW.  Leftover outputs of every path go
   * through the single scalar loop at the end.
   *
   * s0-s8 are saved and restored.  The frame is 80 bytes (72 used) so that sp
   * stays 16-byte aligned.  `endfunc` is a macro defined outside this chunk;
   * presumably it emits the return.
   */
  function ff_hscale_16_to_15_sub_lsx
      addi.d          sp, sp, -80
      st.d            s0, sp, 0
      st.d            s1, sp, 8
      st.d            s2, sp, 16
      st.d            s3, sp, 24
      st.d            s4, sp, 32
      st.d            s5, sp, 40
      st.d            s6, sp, 48
      st.d            s7, sp, 56
      st.d            s8, sp, 64

      li.w            t0, 32767                   /* output ceiling */
      li.w            t8, 8
      li.w            t7, 4
      vreplgr2vr.w    vr20, t0                    /* ceiling in every 32-bit lane */
      vreplgr2vr.w    vr0, a7                     /* shift count in every 32-bit lane */
      beq             a6, t7, .Lhs15_w4_entry
      beq             a6, t8, .Lhs15_w8_entry
      blt             t8, a6, .Lhs15_gen_entry
      b               .Lhs15_tail_check           /* remaining sizes: scalar only */

  /* ---- filterSize > 8: four outputs per pass ---- */
  .Lhs15_gen_entry:
      blt             a2, t7, .Lhs15_tail_check   /* fewer than 4 outputs requested */
  .Lhs15_gen4:
      li.w            t1, 0                       /* filter row used by the scalar tap loops */
      li.w            s1, 0                       /* taps consumed by the vector loop */
      li.w            s2, 0                       /* scalar partial sums, outputs 0..3 */
      li.w            s3, 0
      li.w            s4, 0
      li.w            s5, 0
      vldi            vr22, 0                     /* vector partial sums, outputs 0..3 */
      addi.w          s0, a6, -7                  /* vector loop runs while s1 < filterSize - 7 */
      slli.w          s7, a6, 1                   /* byte size of one filter row */
      slli.w          s8, a6, 2                   /* two rows */
      add.w           t6, s7, s8                  /* three rows (t6 is reused as scratch below) */
  .Lhs15_gen_taps8:
      ld.w            t2, a5, 0
      ld.w            t3, a5, 4
      ld.w            t4, a5, 8
      ld.w            t5, a5, 12
      slli.w          t2, t2, 1                   /* sample index -> byte offset */
      slli.w          t3, t3, 1
      slli.w          t4, t4, 1
      slli.w          t5, t5, 1
      vldx            vr1, a3, t2                 /* 8 source samples per output */
      vldx            vr2, a3, t3
      vldx            vr3, a3, t4
      vldx            vr4, a3, t5
      vld             vr9, a4, 0                  /* 8 coefficients from each of 4 rows */
      vldx            vr10, a4, s7
      vldx            vr11, a4, s8
      vldx            vr12, a4, t6
      vmulwev.w.hu.h  vr17, vr1, vr9              /* even taps: unsigned sample * signed coeff */
      vmulwev.w.hu.h  vr18, vr2, vr10
      vmulwev.w.hu.h  vr19, vr3, vr11
      vmulwev.w.hu.h  vr21, vr4, vr12
      vmaddwod.w.hu.h vr17, vr1, vr9              /* add odd taps */
      vmaddwod.w.hu.h vr18, vr2, vr10
      vmaddwod.w.hu.h vr19, vr3, vr11
      vmaddwod.w.hu.h vr21, vr4, vr12
      vhaddw.d.w      vr1, vr17, vr17             /* horizontal reduction to one sum each */
      vhaddw.d.w      vr2, vr18, vr18
      vhaddw.d.w      vr3, vr19, vr19
      vhaddw.d.w      vr4, vr21, vr21
      vhaddw.q.d      vr1, vr1, vr1
      vhaddw.q.d      vr2, vr2, vr2
      vhaddw.q.d      vr3, vr3, vr3
      vhaddw.q.d      vr4, vr4, vr4
      vilvl.w         vr1, vr2, vr1               /* gather the four sums into one vector */
      vilvl.w         vr3, vr4, vr3
      vilvl.d         vr1, vr3, vr1
      vadd.w          vr22, vr22, vr1
      addi.w          s1, s1, 8
      addi.d          a3, a3, 16                  /* src and filter move on by 8 taps */
      addi.d          a4, a4, 16
      blt             s1, s0, .Lhs15_gen_taps8
      bge             s1, a6, .Lhs15_gen_store    /* no taps left over */

      /* Left-over taps of output 0.  a3/a4 already point past the taps done above. */
      ld.w            t2, a5, 0
      li.w            t3, 0
      move            s6, s1
  .Lhs15_gen_tapA:
      add.w           t4, t2, t3
      slli.w          t4, t4, 1
      ldx.hu          t5, a3, t4
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t6, t6, 1
      ldx.h           t6, a4, t6
      mul.w           t6, t5, t6
      add.w           s2, s2, t6
      addi.w          t3, t3, 1
      addi.w          s6, s6, 1
      blt             s6, a6, .Lhs15_gen_tapA

      /* Left-over taps of output 1. */
      ld.w            t2, a5, 4
      li.w            t3, 0
      move            s6, s1
      addi.w          t1, t1, 1
  .Lhs15_gen_tapB:
      add.w           t4, t2, t3
      slli.w          t4, t4, 1
      ldx.hu          t5, a3, t4
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t6, t6, 1
      ldx.h           t6, a4, t6
      mul.w           t6, t5, t6
      add.w           s3, s3, t6
      addi.w          t3, t3, 1
      addi.w          s6, s6, 1
      blt             s6, a6, .Lhs15_gen_tapB

      /* Left-over taps of output 2. */
      ld.w            t2, a5, 8
      addi.w          t1, t1, 1
      li.w            t3, 0
      move            s6, s1
  .Lhs15_gen_tapC:
      add.w           t4, t2, t3
      slli.w          t4, t4, 1
      ldx.hu          t5, a3, t4
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t6, t6, 1
      ldx.h           t6, a4, t6
      mul.w           t6, t5, t6
      add.w           s4, s4, t6
      addi.w          t3, t3, 1
      addi.w          s6, s6, 1
      blt             s6, a6, .Lhs15_gen_tapC

      /* Left-over taps of output 3. */
      ld.w            t2, a5, 12
      addi.w          t1, t1, 1
      move            s6, s1
      li.w            t3, 0
  .Lhs15_gen_tapD:
      add.w           t4, t2, t3
      slli.w          t4, t4, 1
      ldx.hu          t5, a3, t4
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t6, t6, 1
      ldx.h           t6, a4, t6
      mul.w           t6, t5, t6
      add.w           s5, s5, t6
      addi.w          t3, t3, 1
      addi.w          s6, s6, 1
      blt             s6, a6, .Lhs15_gen_tapD

  .Lhs15_gen_store:
      vpickve2gr.w    t1, vr22, 0                 /* combine vector and scalar partial sums */
      vpickve2gr.w    t2, vr22, 1
      vpickve2gr.w    t3, vr22, 2
      vpickve2gr.w    t4, vr22, 3
      add.w           s2, s2, t1
      add.w           s3, s3, t2
      add.w           s4, s4, t3
      add.w           s5, s5, t4
      sra.w           s2, s2, a7
      sra.w           s3, s3, a7
      sra.w           s4, s4, a7
      sra.w           s5, s5, a7
      slt             t1, s2, t0                  /* 1 when the value is below the ceiling */
      slt             t2, s3, t0
      slt             t3, s4, t0
      slt             t4, s5, t0
      maskeqz         s2, s2, t1                  /* keep the value, or ...               */
      maskeqz         s3, s3, t2
      maskeqz         s4, s4, t3
      maskeqz         s5, s5, t4
      masknez         t1, t0, t1                  /* ... substitute the ceiling           */
      masknez         t2, t0, t2
      masknez         t3, t0, t3
      masknez         t4, t0, t4
      or              s2, s2, t1
      or              s3, s3, t2
      or              s4, s4, t3
      or              s5, s5, t4
      st.h            s2, a1, 0
      st.h            s3, a1, 2
      st.h            s4, a1, 4
      st.h            s5, a1, 6
      addi.d          a1, a1, 8
      sub.d           a3, a3, s1                  /* rewind src by 2 * s1 bytes */
      sub.d           a3, a3, s1
      addi.d          a5, a5, 16
      slli.d          t3, a6, 3                   /* filter: forward 4 rows, back 2 * s1 bytes */
      add.d           a4, a4, t3
      sub.d           a4, a4, s1
      sub.d           a4, a4, s1
      addi.d          a2, a2, -4
      bge             a2, t7, .Lhs15_gen4
      b               .Lhs15_tail_check

  /* ---- filterSize == 8: eight outputs per pass ---- */
  .Lhs15_w8_entry:
      blt             a2, t8, .Lhs15_tail_check   /* fewer than 8 outputs requested */
  .Lhs15_w8_loop:
      ld.w            t1, a5, 0
      ld.w            t2, a5, 4
      ld.w            t3, a5, 8
      ld.w            t4, a5, 12
      slli.w          t1, t1, 1
      slli.w          t2, t2, 1
      slli.w          t3, t3, 1
      slli.w          t4, t4, 1
      vldx            vr1, a3, t1
      vldx            vr2, a3, t2
      vldx            vr3, a3, t3
      vldx            vr4, a3, t4
      ld.w            t1, a5, 16
      ld.w            t2, a5, 20
      ld.w            t3, a5, 24
      ld.w            t4, a5, 28
      slli.w          t1, t1, 1
      slli.w          t2, t2, 1
      slli.w          t3, t3, 1
      slli.w          t4, t4, 1
      vldx            vr5, a3, t1
      vldx            vr6, a3, t2
      vldx            vr7, a3, t3
      vldx            vr8, a3, t4
      vld             vr9, a4, 0                  /* eight filter rows of 8 coefficients */
      vld             vr10, a4, 16
      vld             vr11, a4, 32
      vld             vr12, a4, 48
      vld             vr13, a4, 64
      vld             vr14, a4, 80
      vld             vr15, a4, 96
      vld             vr16, a4, 112
      vmulwev.w.hu.h  vr17, vr1, vr9
      vmulwev.w.hu.h  vr18, vr2, vr10
      vmulwev.w.hu.h  vr19, vr3, vr11
      vmulwev.w.hu.h  vr21, vr4, vr12
      vmaddwod.w.hu.h vr17, vr1, vr9
      vmaddwod.w.hu.h vr18, vr2, vr10
      vmaddwod.w.hu.h vr19, vr3, vr11
      vmaddwod.w.hu.h vr21, vr4, vr12
      vmulwev.w.hu.h  vr1, vr5, vr13              /* vr1-vr4 are free again from here */
      vmulwev.w.hu.h  vr2, vr6, vr14
      vmulwev.w.hu.h  vr3, vr7, vr15
      vmulwev.w.hu.h  vr4, vr8, vr16
      vmaddwod.w.hu.h vr1, vr5, vr13
      vmaddwod.w.hu.h vr2, vr6, vr14
      vmaddwod.w.hu.h vr3, vr7, vr15
      vmaddwod.w.hu.h vr4, vr8, vr16
      vhaddw.d.w      vr5, vr1, vr1               /* outputs 4..7 */
      vhaddw.d.w      vr6, vr2, vr2
      vhaddw.d.w      vr7, vr3, vr3
      vhaddw.d.w      vr8, vr4, vr4
      vhaddw.d.w      vr1, vr17, vr17             /* outputs 0..3 */
      vhaddw.d.w      vr2, vr18, vr18
      vhaddw.d.w      vr3, vr19, vr19
      vhaddw.d.w      vr4, vr21, vr21
      vhaddw.q.d      vr1, vr1, vr1
      vhaddw.q.d      vr2, vr2, vr2
      vhaddw.q.d      vr3, vr3, vr3
      vhaddw.q.d      vr4, vr4, vr4
      vhaddw.q.d      vr5, vr5, vr5
      vhaddw.q.d      vr6, vr6, vr6
      vhaddw.q.d      vr7, vr7, vr7
      vhaddw.q.d      vr8, vr8, vr8
      vilvl.w         vr1, vr2, vr1
      vilvl.w         vr3, vr4, vr3
      vilvl.w         vr5, vr6, vr5
      vilvl.w         vr7, vr8, vr7
      vilvl.d         vr1, vr3, vr1
      vilvl.d         vr5, vr7, vr5
      vsra.w          vr1, vr1, vr0
      vsra.w          vr5, vr5, vr0
      vmin.w          vr1, vr1, vr20
      vmin.w          vr5, vr5, vr20
      vpickev.h       vr1, vr5, vr1               /* low halfword of each of the 8 results */
      vst             vr1, a1, 0
      addi.d          a1, a1, 16
      addi.d          a5, a5, 32
      addi.d          a4, a4, 128
      addi.d          a2, a2, -8
      bge             a2, t8, .Lhs15_w8_loop
      b               .Lhs15_tail_check

  /* ---- filterSize == 4: eight outputs per pass ---- */
  .Lhs15_w4_entry:
      blt             a2, t8, .Lhs15_tail_check   /* fewer than 8 outputs requested */
  .Lhs15_w4_loop:
      ld.w            t1, a5, 0
      ld.w            t2, a5, 4
      ld.w            t3, a5, 8
      ld.w            t4, a5, 12
      slli.w          t1, t1, 1
      slli.w          t2, t2, 1
      slli.w          t3, t3, 1
      slli.w          t4, t4, 1
      fldx.d          f1, a3, t1                  /* 4 samples into the low half of vr1..vr4 */
      fldx.d          f2, a3, t2
      fldx.d          f3, a3, t3
      fldx.d          f4, a3, t4
      ld.w            t1, a5, 16
      ld.w            t2, a5, 20
      ld.w            t3, a5, 24
      ld.w            t4, a5, 28
      slli.w          t1, t1, 1
      slli.w          t2, t2, 1
      slli.w          t3, t3, 1
      slli.w          t4, t4, 1
      fldx.d          f5, a3, t1
      fldx.d          f6, a3, t2
      fldx.d          f7, a3, t3
      fldx.d          f8, a3, t4
      vld             vr9, a4, 0                  /* each vector holds two 4-tap filter rows */
      vld             vr10, a4, 16
      vld             vr11, a4, 32
      vld             vr12, a4, 48
      vilvl.d         vr1, vr2, vr1               /* pair the samples of two outputs */
      vilvl.d         vr3, vr4, vr3
      vilvl.d         vr5, vr6, vr5
      vilvl.d         vr7, vr8, vr7
      vmulwev.w.hu.h  vr13, vr1, vr9
      vmulwev.w.hu.h  vr14, vr3, vr10
      vmulwev.w.hu.h  vr15, vr5, vr11
      vmulwev.w.hu.h  vr16, vr7, vr12
      vmaddwod.w.hu.h vr13, vr1, vr9
      vmaddwod.w.hu.h vr14, vr3, vr10
      vmaddwod.w.hu.h vr15, vr5, vr11
      vmaddwod.w.hu.h vr16, vr7, vr12
      vhaddw.d.w      vr13, vr13, vr13
      vhaddw.d.w      vr14, vr14, vr14
      vhaddw.d.w      vr15, vr15, vr15
      vhaddw.d.w      vr16, vr16, vr16
      vpickev.w       vr13, vr14, vr13            /* outputs 0..3 */
      vpickev.w       vr15, vr16, vr15            /* outputs 4..7 */
      vsra.w          vr13, vr13, vr0
      vsra.w          vr15, vr15, vr0
      vmin.w          vr13, vr13, vr20
      vmin.w          vr15, vr15, vr20
      vpickev.h       vr13, vr15, vr13
      vst             vr13, a1, 0
      addi.d          a1, a1, 16
      addi.d          a5, a5, 32
      addi.d          a4, a4, 64
      addi.d          a2, a2, -8
      bge             a2, t8, .Lhs15_w4_loop
      /* fall through */

  /*
   * ---- shared scalar loop ----
   * a2 = outputs still to produce; a1/a4/a5 point at the first of them and
   * a3 is the original src base.  t1 = output index, t3 = tap index,
   * t8 = accumulator.
   */
  .Lhs15_tail_check:
      bge             zero, a2, .Lhs15_done
      li.w            t1, 0
  .Lhs15_tail_out:
      slli.w          t2, t1, 2
      ldx.w           t2, a5, t2                  /* first source sample of this output */
      li.w            t3, 0
      li.w            t8, 0
  .Lhs15_tail_tap:
      add.w           t4, t2, t3
      slli.w          t4, t4, 1
      ldx.hu          t5, a3, t4                  /* unsigned 16-bit sample */
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t7, t6, 1
      ldx.h           t7, a4, t7                  /* signed 16-bit coefficient */
      mul.w           t7, t5, t7
      add.w           t8, t8, t7
      addi.w          t3, t3, 1
      blt             t3, a6, .Lhs15_tail_tap
      sra.w           t8, t8, a7
      slt             t5, t8, t0
      maskeqz         t8, t8, t5
      masknez         t5, t0, t5
      or              t8, t8, t5                  /* t8 = min(t8, 32767) */
      slli.w          t4, t1, 1
      stx.h           t8, a1, t4
      addi.w          t1, t1, 1
      blt             t1, a2, .Lhs15_tail_out

  .Lhs15_done:
      ld.d            s0, sp, 0
      ld.d            s1, sp, 8
      ld.d            s2, sp, 16
      ld.d            s3, sp, 24
      ld.d            s4, sp, 32
      ld.d            s5, sp, 40
      ld.d            s6, sp, 48
      ld.d            s7, sp, 56
      ld.d            s8, sp, 64
      addi.d          sp, sp, 80
  endfunc
  /*
   * void ff_hscale_16_to_19_sub_lsx(SwsInternal *c, int16_t *dst, int dstW,
   *                                 const uint8_t *src, const int16_t *filter,
   *                                 const int32_t *filterPos, int filterSize, int sh)
   *
   * Horizontal FIR scaler.  For every i in [0, dstW):
   *     acc    = sum over j < filterSize of
   *              (unsigned 16-bit) src16[filterPos[i] + j] * filter[filterSize * i + j]
   *     dst[i] = min(acc >> sh, 524287)     (32-bit accumulate, arithmetic shift)
   * There is no lower clamp.  Although the prototype types dst as int16_t *,
   * every result is stored as a 32-bit word (4 bytes per output).
   *
   * Arguments: a1 = dst, a2 = dstW, a3 = src, a4 = filter, a5 = filterPos,
   *            a6 = filterSize, a7 = sh.  a0 (c) is never read.
   *
   * Paths, selected by filterSize:
   *     == 4 : LSX, 8 outputs per iteration
   *     == 8 : LSX, 8 outputs per iteration
   *     >  8 : LSX, 4 outputs per iteration; taps are consumed 8 at a time and
   *            the taps left over are accumulated with scalar code
   *     else : scalar only
   * A vector batch is started only while a whole batch of outputs remains, so
   * nothing is read or written past dstW.  Leftover outputs of every path go
   * through the single scalar loop at the end.
   *
   * s0-s8 are saved and restored.  The frame is 80 bytes (72 used) so that sp
   * stays 16-byte aligned.  `endfunc` is a macro defined outside this chunk;
   * presumably it emits the return.
   */
  function ff_hscale_16_to_19_sub_lsx
      addi.d          sp, sp, -80
      st.d            s0, sp, 0
      st.d            s1, sp, 8
      st.d            s2, sp, 16
      st.d            s3, sp, 24
      st.d            s4, sp, 32
      st.d            s5, sp, 40
      st.d            s6, sp, 48
      st.d            s7, sp, 56
      st.d            s8, sp, 64

      li.w            t0, 524287                  /* output ceiling */
      li.w            t8, 8
      li.w            t7, 4
      vreplgr2vr.w    vr20, t0                    /* ceiling in every 32-bit lane */
      vreplgr2vr.w    vr0, a7                     /* shift count in every 32-bit lane */
      beq             a6, t7, .Lhs19_w4_entry
      beq             a6, t8, .Lhs19_w8_entry
      blt             t8, a6, .Lhs19_gen_entry
      b               .Lhs19_tail_check           /* remaining sizes: scalar only */

  /* ---- filterSize > 8: four outputs per pass ---- */
  .Lhs19_gen_entry:
      blt             a2, t7, .Lhs19_tail_check   /* fewer than 4 outputs requested */
  .Lhs19_gen4:
      li.w            t1, 0                       /* filter row used by the scalar tap loops */
      li.w            s1, 0                       /* taps consumed by the vector loop */
      li.w            s2, 0                       /* scalar partial sums, outputs 0..3 */
      li.w            s3, 0
      li.w            s4, 0
      li.w            s5, 0
      vldi            vr22, 0                     /* vector partial sums, outputs 0..3 */
      addi.w          s0, a6, -7                  /* vector loop runs while s1 < filterSize - 7 */
      slli.w          s7, a6, 1                   /* byte size of one filter row */
      slli.w          s8, a6, 2                   /* two rows */
      add.w           t6, s7, s8                  /* three rows (t6 is reused as scratch below) */
  .Lhs19_gen_taps8:
      ld.w            t2, a5, 0
      ld.w            t3, a5, 4
      ld.w            t4, a5, 8
      ld.w            t5, a5, 12
      slli.w          t2, t2, 1                   /* sample index -> byte offset */
      slli.w          t3, t3, 1
      slli.w          t4, t4, 1
      slli.w          t5, t5, 1
      vldx            vr1, a3, t2                 /* 8 source samples per output */
      vldx            vr2, a3, t3
      vldx            vr3, a3, t4
      vldx            vr4, a3, t5
      vld             vr9, a4, 0                  /* 8 coefficients from each of 4 rows */
      vldx            vr10, a4, s7
      vldx            vr11, a4, s8
      vldx            vr12, a4, t6
      vmulwev.w.hu.h  vr17, vr1, vr9              /* even taps: unsigned sample * signed coeff */
      vmulwev.w.hu.h  vr18, vr2, vr10
      vmulwev.w.hu.h  vr19, vr3, vr11
      vmulwev.w.hu.h  vr21, vr4, vr12
      vmaddwod.w.hu.h vr17, vr1, vr9              /* add odd taps */
      vmaddwod.w.hu.h vr18, vr2, vr10
      vmaddwod.w.hu.h vr19, vr3, vr11
      vmaddwod.w.hu.h vr21, vr4, vr12
      vhaddw.d.w      vr1, vr17, vr17             /* horizontal reduction to one sum each */
      vhaddw.d.w      vr2, vr18, vr18
      vhaddw.d.w      vr3, vr19, vr19
      vhaddw.d.w      vr4, vr21, vr21
      vhaddw.q.d      vr1, vr1, vr1
      vhaddw.q.d      vr2, vr2, vr2
      vhaddw.q.d      vr3, vr3, vr3
      vhaddw.q.d      vr4, vr4, vr4
      vilvl.w         vr1, vr2, vr1               /* gather the four sums into one vector */
      vilvl.w         vr3, vr4, vr3
      vilvl.d         vr1, vr3, vr1
      vadd.w          vr22, vr22, vr1
      addi.w          s1, s1, 8
      addi.d          a3, a3, 16                  /* src and filter move on by 8 taps */
      addi.d          a4, a4, 16
      blt             s1, s0, .Lhs19_gen_taps8
      bge             s1, a6, .Lhs19_gen_store    /* no taps left over */

      /* Left-over taps of output 0.  a3/a4 already point past the taps done above. */
      ld.w            t2, a5, 0
      li.w            t3, 0
      move            s6, s1
  .Lhs19_gen_tapA:
      add.w           t4, t2, t3
      slli.w          t4, t4, 1
      ldx.hu          t5, a3, t4
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t6, t6, 1
      ldx.h           t6, a4, t6
      mul.w           t6, t5, t6
      add.w           s2, s2, t6
      addi.w          t3, t3, 1
      addi.w          s6, s6, 1
      blt             s6, a6, .Lhs19_gen_tapA

      /* Left-over taps of output 1. */
      ld.w            t2, a5, 4
      li.w            t3, 0
      move            s6, s1
      addi.w          t1, t1, 1
  .Lhs19_gen_tapB:
      add.w           t4, t2, t3
      slli.w          t4, t4, 1
      ldx.hu          t5, a3, t4
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t6, t6, 1
      ldx.h           t6, a4, t6
      mul.w           t6, t5, t6
      add.w           s3, s3, t6
      addi.w          t3, t3, 1
      addi.w          s6, s6, 1
      blt             s6, a6, .Lhs19_gen_tapB

      /* Left-over taps of output 2. */
      ld.w            t2, a5, 8
      addi.w          t1, t1, 1
      li.w            t3, 0
      move            s6, s1
  .Lhs19_gen_tapC:
      add.w           t4, t2, t3
      slli.w          t4, t4, 1
      ldx.hu          t5, a3, t4
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t6, t6, 1
      ldx.h           t6, a4, t6
      mul.w           t6, t5, t6
      add.w           s4, s4, t6
      addi.w          t3, t3, 1
      addi.w          s6, s6, 1
      blt             s6, a6, .Lhs19_gen_tapC

      /* Left-over taps of output 3. */
      ld.w            t2, a5, 12
      addi.w          t1, t1, 1
      move            s6, s1
      li.w            t3, 0
  .Lhs19_gen_tapD:
      add.w           t4, t2, t3
      slli.w          t4, t4, 1
      ldx.hu          t5, a3, t4
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t6, t6, 1
      ldx.h           t6, a4, t6
      mul.w           t6, t5, t6
      add.w           s5, s5, t6
      addi.w          t3, t3, 1
      addi.w          s6, s6, 1
      blt             s6, a6, .Lhs19_gen_tapD

  .Lhs19_gen_store:
      vpickve2gr.w    t1, vr22, 0                 /* combine vector and scalar partial sums */
      vpickve2gr.w    t2, vr22, 1
      vpickve2gr.w    t3, vr22, 2
      vpickve2gr.w    t4, vr22, 3
      add.w           s2, s2, t1
      add.w           s3, s3, t2
      add.w           s4, s4, t3
      add.w           s5, s5, t4
      sra.w           s2, s2, a7
      sra.w           s3, s3, a7
      sra.w           s4, s4, a7
      sra.w           s5, s5, a7
      slt             t1, s2, t0                  /* 1 when the value is below the ceiling */
      slt             t2, s3, t0
      slt             t3, s4, t0
      slt             t4, s5, t0
      maskeqz         s2, s2, t1                  /* keep the value, or ...               */
      maskeqz         s3, s3, t2
      maskeqz         s4, s4, t3
      maskeqz         s5, s5, t4
      masknez         t1, t0, t1                  /* ... substitute the ceiling           */
      masknez         t2, t0, t2
      masknez         t3, t0, t3
      masknez         t4, t0, t4
      or              s2, s2, t1
      or              s3, s3, t2
      or              s4, s4, t3
      or              s5, s5, t4
      st.w            s2, a1, 0
      st.w            s3, a1, 4
      st.w            s4, a1, 8
      st.w            s5, a1, 12
      addi.d          a1, a1, 16
      sub.d           a3, a3, s1                  /* rewind src by 2 * s1 bytes */
      sub.d           a3, a3, s1
      addi.d          a5, a5, 16
      slli.d          t3, a6, 3                   /* filter: forward 4 rows, back 2 * s1 bytes */
      add.d           a4, a4, t3
      sub.d           a4, a4, s1
      sub.d           a4, a4, s1
      addi.d          a2, a2, -4
      bge             a2, t7, .Lhs19_gen4
      b               .Lhs19_tail_check

  /* ---- filterSize == 8: eight outputs per pass ---- */
  .Lhs19_w8_entry:
      blt             a2, t8, .Lhs19_tail_check   /* fewer than 8 outputs requested */
  .Lhs19_w8_loop:
      ld.w            t1, a5, 0
      ld.w            t2, a5, 4
      ld.w            t3, a5, 8
      ld.w            t4, a5, 12
      slli.w          t1, t1, 1
      slli.w          t2, t2, 1
      slli.w          t3, t3, 1
      slli.w          t4, t4, 1
      vldx            vr1, a3, t1
      vldx            vr2, a3, t2
      vldx            vr3, a3, t3
      vldx            vr4, a3, t4
      ld.w            t1, a5, 16
      ld.w            t2, a5, 20
      ld.w            t3, a5, 24
      ld.w            t4, a5, 28
      slli.w          t1, t1, 1
      slli.w          t2, t2, 1
      slli.w          t3, t3, 1
      slli.w          t4, t4, 1
      vldx            vr5, a3, t1
      vldx            vr6, a3, t2
      vldx            vr7, a3, t3
      vldx            vr8, a3, t4
      vld             vr9, a4, 0                  /* eight filter rows of 8 coefficients */
      vld             vr10, a4, 16
      vld             vr11, a4, 32
      vld             vr12, a4, 48
      vld             vr13, a4, 64
      vld             vr14, a4, 80
      vld             vr15, a4, 96
      vld             vr16, a4, 112
      vmulwev.w.hu.h  vr17, vr1, vr9
      vmulwev.w.hu.h  vr18, vr2, vr10
      vmulwev.w.hu.h  vr19, vr3, vr11
      vmulwev.w.hu.h  vr21, vr4, vr12
      vmaddwod.w.hu.h vr17, vr1, vr9
      vmaddwod.w.hu.h vr18, vr2, vr10
      vmaddwod.w.hu.h vr19, vr3, vr11
      vmaddwod.w.hu.h vr21, vr4, vr12
      vmulwev.w.hu.h  vr1, vr5, vr13              /* vr1-vr4 are free again from here */
      vmulwev.w.hu.h  vr2, vr6, vr14
      vmulwev.w.hu.h  vr3, vr7, vr15
      vmulwev.w.hu.h  vr4, vr8, vr16
      vmaddwod.w.hu.h vr1, vr5, vr13
      vmaddwod.w.hu.h vr2, vr6, vr14
      vmaddwod.w.hu.h vr3, vr7, vr15
      vmaddwod.w.hu.h vr4, vr8, vr16
      vhaddw.d.w      vr5, vr1, vr1               /* outputs 4..7 */
      vhaddw.d.w      vr6, vr2, vr2
      vhaddw.d.w      vr7, vr3, vr3
      vhaddw.d.w      vr8, vr4, vr4
      vhaddw.d.w      vr1, vr17, vr17             /* outputs 0..3 */
      vhaddw.d.w      vr2, vr18, vr18
      vhaddw.d.w      vr3, vr19, vr19
      vhaddw.d.w      vr4, vr21, vr21
      vhaddw.q.d      vr1, vr1, vr1
      vhaddw.q.d      vr2, vr2, vr2
      vhaddw.q.d      vr3, vr3, vr3
      vhaddw.q.d      vr4, vr4, vr4
      vhaddw.q.d      vr5, vr5, vr5
      vhaddw.q.d      vr6, vr6, vr6
      vhaddw.q.d      vr7, vr7, vr7
      vhaddw.q.d      vr8, vr8, vr8
      vilvl.w         vr1, vr2, vr1
      vilvl.w         vr3, vr4, vr3
      vilvl.w         vr5, vr6, vr5
      vilvl.w         vr7, vr8, vr7
      vilvl.d         vr1, vr3, vr1
      vilvl.d         vr5, vr7, vr5
      vsra.w          vr1, vr1, vr0
      vsra.w          vr5, vr5, vr0
      vmin.w          vr1, vr1, vr20
      vmin.w          vr5, vr5, vr20
      vst             vr1, a1, 0                  /* eight 32-bit results */
      vst             vr5, a1, 16
      addi.d          a1, a1, 32
      addi.d          a5, a5, 32
      addi.d          a4, a4, 128
      addi.d          a2, a2, -8
      bge             a2, t8, .Lhs19_w8_loop
      b               .Lhs19_tail_check

  /* ---- filterSize == 4: eight outputs per pass ---- */
  .Lhs19_w4_entry:
      blt             a2, t8, .Lhs19_tail_check   /* fewer than 8 outputs requested */
  .Lhs19_w4_loop:
      ld.w            t1, a5, 0
      ld.w            t2, a5, 4
      ld.w            t3, a5, 8
      ld.w            t4, a5, 12
      slli.w          t1, t1, 1
      slli.w          t2, t2, 1
      slli.w          t3, t3, 1
      slli.w          t4, t4, 1
      fldx.d          f1, a3, t1                  /* 4 samples into the low half of vr1..vr4 */
      fldx.d          f2, a3, t2
      fldx.d          f3, a3, t3
      fldx.d          f4, a3, t4
      ld.w            t1, a5, 16
      ld.w            t2, a5, 20
      ld.w            t3, a5, 24
      ld.w            t4, a5, 28
      slli.w          t1, t1, 1
      slli.w          t2, t2, 1
      slli.w          t3, t3, 1
      slli.w          t4, t4, 1
      fldx.d          f5, a3, t1
      fldx.d          f6, a3, t2
      fldx.d          f7, a3, t3
      fldx.d          f8, a3, t4
      vld             vr9, a4, 0                  /* each vector holds two 4-tap filter rows */
      vld             vr10, a4, 16
      vld             vr11, a4, 32
      vld             vr12, a4, 48
      vilvl.d         vr1, vr2, vr1               /* pair the samples of two outputs */
      vilvl.d         vr3, vr4, vr3
      vilvl.d         vr5, vr6, vr5
      vilvl.d         vr7, vr8, vr7
      vmulwev.w.hu.h  vr13, vr1, vr9
      vmulwev.w.hu.h  vr14, vr3, vr10
      vmulwev.w.hu.h  vr15, vr5, vr11
      vmulwev.w.hu.h  vr16, vr7, vr12
      vmaddwod.w.hu.h vr13, vr1, vr9
      vmaddwod.w.hu.h vr14, vr3, vr10
      vmaddwod.w.hu.h vr15, vr5, vr11
      vmaddwod.w.hu.h vr16, vr7, vr12
      vhaddw.d.w      vr13, vr13, vr13
      vhaddw.d.w      vr14, vr14, vr14
      vhaddw.d.w      vr15, vr15, vr15
      vhaddw.d.w      vr16, vr16, vr16
      vpickev.w       vr13, vr14, vr13            /* outputs 0..3 */
      vpickev.w       vr15, vr16, vr15            /* outputs 4..7 */
      vsra.w          vr13, vr13, vr0
      vsra.w          vr15, vr15, vr0
      vmin.w          vr13, vr13, vr20
      vmin.w          vr15, vr15, vr20
      vst             vr13, a1, 0                 /* eight 32-bit results */
      vst             vr15, a1, 16
      addi.d          a1, a1, 32
      addi.d          a5, a5, 32
      addi.d          a4, a4, 64
      addi.d          a2, a2, -8
      bge             a2, t8, .Lhs19_w4_loop
      /* fall through */

  /*
   * ---- shared scalar loop ----
   * a2 = outputs still to produce; a1/a4/a5 point at the first of them and
   * a3 is the original src base.  t1 = output index, t3 = tap index,
   * t8 = accumulator.
   */
  .Lhs19_tail_check:
      bge             zero, a2, .Lhs19_done
      li.w            t1, 0
  .Lhs19_tail_out:
      slli.w          t2, t1, 2
      ldx.w           t2, a5, t2                  /* first source sample of this output */
      li.w            t3, 0
      li.w            t8, 0
  .Lhs19_tail_tap:
      add.w           t4, t2, t3
      slli.w          t4, t4, 1
      ldx.hu          t5, a3, t4                  /* unsigned 16-bit sample */
      mul.w           t6, a6, t1
      add.w           t6, t6, t3
      slli.w          t7, t6, 1
      ldx.h           t7, a4, t7                  /* signed 16-bit coefficient */
      mul.w           t7, t5, t7
      add.w           t8, t8, t7
      addi.w          t3, t3, 1
      blt             t3, a6, .Lhs19_tail_tap
      sra.w           t8, t8, a7
      slt             t5, t8, t0
      maskeqz         t8, t8, t5
      masknez         t5, t0, t5
      or              t8, t8, t5                  /* t8 = min(t8, 524287) */
      slli.w          t4, t1, 2
      stx.w           t8, a1, t4
      addi.w          t1, t1, 1
      blt             t1, a2, .Lhs19_tail_out

  .Lhs19_done:
      ld.d            s0, sp, 0
      ld.d            s1, sp, 8
      ld.d            s2, sp, 16
      ld.d            s3, sp, 24
      ld.d            s4, sp, 32
      ld.d            s5, sp, 40
      ld.d            s6, sp, 48
      ld.d            s7, sp, 56
      ld.d            s8, sp, 64
      addi.d          sp, sp, 80
  endfunc
  /*
   * lumRangeFromJpeg_lsx: rescale a buffer of signed 16-bit samples in place.
   *     a0 = sample buffer, a1 = number of samples
   *     x  = (x * 14071 + 33561947) >> 14
   * Eight samples per LSX iteration, then up to seven samples one at a time.
   */
  function lumRangeFromJpeg_lsx
      srli.w          t2, a1, 3                   /* number of 8-sample groups */
      andi            t3, a1, 7                   /* samples left after the groups */
      li.w            t0, 14071                   /* multiplier */
      li.w            t1, 33561947                /* additive bias */
      vreplgr2vr.h    vr0, t0
      beqz            t2, .Llum_from_jpeg_lsx_rest
  .Llum_from_jpeg_lsx_vec:
      vreplgr2vr.w    vr2, t1                     /* accumulators start at the bias */
      vreplgr2vr.w    vr3, t1
      vld             vr1, a0, 0
      vmaddwev.w.h    vr2, vr0, vr1               /* even samples */
      vmaddwod.w.h    vr3, vr0, vr1               /* odd samples */
      vsrai.w         vr2, vr2, 14
      vsrai.w         vr3, vr3, 14
      vpackev.h       vr1, vr3, vr2               /* re-interleave the 16-bit results */
      vst             vr1, a0, 0
      addi.d          t2, t2, -1
      addi.d          a0, a0, 16
      bnez            t2, .Llum_from_jpeg_lsx_vec
  .Llum_from_jpeg_lsx_rest:
      beqz            t3, .Llum_from_jpeg_lsx_done
  .Llum_from_jpeg_lsx_one:
      ld.h            t4, a0, 0
      addi.d          t3, t3, -1
      mul.w           t4, t4, t0
      add.w           t4, t4, t1
      srai.w          t4, t4, 14
      st.h            t4, a0, 0
      addi.d          a0, a0, 2
      bnez            t3, .Llum_from_jpeg_lsx_one
  .Llum_from_jpeg_lsx_done:
  endfunc
  /*
   * lumRangeFromJpeg_lasx: rescale a buffer of signed 16-bit samples in place.
   *     a0 = sample buffer, a1 = number of samples
   *     x  = (x * 14071 + 33561947) >> 14
   * Sixteen samples per LASX iteration, then up to fifteen samples one at a
   * time.
   */
  function lumRangeFromJpeg_lasx
      srli.w          t2, a1, 4                   /* number of 16-sample groups */
      andi            t3, a1, 15                  /* samples left after the groups */
      li.w            t0, 14071                   /* multiplier */
      li.w            t1, 33561947                /* additive bias */
      xvreplgr2vr.h   xr0, t0
      beqz            t2, .Llum_from_jpeg_lasx_rest
  .Llum_from_jpeg_lasx_vec:
      xvreplgr2vr.w   xr2, t1                     /* accumulators start at the bias */
      xvreplgr2vr.w   xr3, t1
      xvld            xr1, a0, 0
      xvmaddwev.w.h   xr2, xr0, xr1               /* even samples */
      xvmaddwod.w.h   xr3, xr0, xr1               /* odd samples */
      xvsrai.w        xr2, xr2, 14
      xvsrai.w        xr3, xr3, 14
      xvpackev.h      xr1, xr3, xr2               /* re-interleave the 16-bit results */
      xvst            xr1, a0, 0
      addi.d          t2, t2, -1
      addi.d          a0, a0, 32
      bnez            t2, .Llum_from_jpeg_lasx_vec
  .Llum_from_jpeg_lasx_rest:
      beqz            t3, .Llum_from_jpeg_lasx_done
  .Llum_from_jpeg_lasx_one:
      ld.h            t4, a0, 0
      addi.d          t3, t3, -1
      mul.w           t4, t4, t0
      add.w           t4, t4, t1
      srai.w          t4, t4, 14
      st.h            t4, a0, 0
      addi.d          a0, a0, 2
      bnez            t3, .Llum_from_jpeg_lasx_one
  .Llum_from_jpeg_lasx_done:
  endfunc
  /*
   * lumRangeToJpeg_lsx: rescale a buffer of signed 16-bit samples in place.
   *     a0 = sample buffer, a1 = number of samples
   *     x  = (min(x, 30189) * 19077 - 39057361) >> 14
   * Eight samples per LSX iteration, then up to seven samples one at a time.
   */
  function lumRangeToJpeg_lsx
      li.w            t2, 30189                   /* input ceiling */
      vreplgr2vr.h    vr4, t2
      li.w            t0, 19077                   /* multiplier */
      vreplgr2vr.h    vr0, t0
      li.w            t1, -39057361               /* additive bias */
      andi            t3, a1, 7                   /* samples left after the groups */
      srli.w          t2, a1, 3                   /* t2 now counts 8-sample groups */
      beqz            t2, .Llum_to_jpeg_lsx_rest
  .Llum_to_jpeg_lsx_vec:
      vreplgr2vr.w    vr2, t1                     /* accumulators start at the bias */
      vreplgr2vr.w    vr3, t1
      vld             vr1, a0, 0
      vmin.h          vr1, vr1, vr4
      vmaddwev.w.h    vr2, vr0, vr1               /* even samples */
      vmaddwod.w.h    vr3, vr0, vr1               /* odd samples */
      vsrai.w         vr2, vr2, 14
      vsrai.w         vr3, vr3, 14
      vpackev.h       vr1, vr3, vr2               /* re-interleave the 16-bit results */
      vst             vr1, a0, 0
      addi.d          t2, t2, -1
      addi.d          a0, a0, 16
      bnez            t2, .Llum_to_jpeg_lsx_vec
  .Llum_to_jpeg_lsx_rest:
      beqz            t3, .Llum_to_jpeg_lsx_done
  .Llum_to_jpeg_lsx_one:
      ld.h            t4, a0, 0
      addi.d          t3, t3, -1
      vreplgr2vr.h    vr1, t4                     /* clamp through the vector unit */
      vmin.h          vr1, vr1, vr4
      vpickve2gr.h    t4, vr1, 0
      mul.w           t4, t4, t0
      add.w           t4, t4, t1
      srai.w          t4, t4, 14
      st.h            t4, a0, 0
      addi.d          a0, a0, 2
      bnez            t3, .Llum_to_jpeg_lsx_one
  .Llum_to_jpeg_lsx_done:
  endfunc
  /*
   * lumRangeToJpeg_lasx: rescale a buffer of signed 16-bit samples in place.
   *     a0 = sample buffer, a1 = number of samples
   *     x  = (min(x, 30189) * 19077 - 39057361) >> 14
   * Sixteen samples per LASX iteration, then up to fifteen samples one at a
   * time.  The single-sample loop clamps with 128-bit LSX instructions; vr4 is
   * the low half of xr4.
   */
  function lumRangeToJpeg_lasx
      li.w            t2, 30189                   /* input ceiling */
      xvreplgr2vr.h   xr4, t2
      li.w            t0, 19077                   /* multiplier */
      xvreplgr2vr.h   xr0, t0
      li.w            t1, -39057361               /* additive bias */
      andi            t3, a1, 15                  /* samples left after the groups */
      srli.w          t2, a1, 4                   /* t2 now counts 16-sample groups */
      beqz            t2, .Llum_to_jpeg_lasx_rest
  .Llum_to_jpeg_lasx_vec:
      xvreplgr2vr.w   xr2, t1                     /* accumulators start at the bias */
      xvreplgr2vr.w   xr3, t1
      xvld            xr1, a0, 0
      xvmin.h         xr1, xr1, xr4
      xvmaddwev.w.h   xr2, xr0, xr1               /* even samples */
      xvmaddwod.w.h   xr3, xr0, xr1               /* odd samples */
      xvsrai.w        xr2, xr2, 14
      xvsrai.w        xr3, xr3, 14
      xvpackev.h      xr1, xr3, xr2               /* re-interleave the 16-bit results */
      xvst            xr1, a0, 0
      addi.d          t2, t2, -1
      addi.d          a0, a0, 32
      bnez            t2, .Llum_to_jpeg_lasx_vec
  .Llum_to_jpeg_lasx_rest:
      beqz            t3, .Llum_to_jpeg_lasx_done
  .Llum_to_jpeg_lasx_one:
      ld.h            t4, a0, 0
      addi.d          t3, t3, -1
      vreplgr2vr.h    vr1, t4                     /* clamp through the vector unit */
      vmin.h          vr1, vr1, vr4
      vpickve2gr.h    t4, vr1, 0
      mul.w           t4, t4, t0
      add.w           t4, t4, t1
      srai.w          t4, t4, 14
      st.h            t4, a0, 0
      addi.d          a0, a0, 2
      bnez            t3, .Llum_to_jpeg_lasx_one
  .Llum_to_jpeg_lasx_done:
  endfunc
  /*
   * chrRangeFromJpeg_lsx: rescale two buffers of signed 16-bit samples in place.
   *     a0 = first buffer, a1 = second buffer, a2 = samples per buffer
   *     x  = (x * 1799 + 4081085) >> 11
   * Eight samples of each buffer per LSX iteration, then up to seven samples
   * of each one at a time.  Both buffers are read before either is written.
   */
  function chrRangeFromJpeg_lsx
      srli.w          t2, a2, 3                   /* number of 8-sample groups */
      andi            t3, a2, 7                   /* samples left after the groups */
      li.w            t0, 1799                    /* multiplier */
      li.w            t1, 4081085                 /* additive bias */
      vreplgr2vr.h    vr0, t0
      beqz            t2, .Lchr_from_jpeg_lsx_rest
  .Lchr_from_jpeg_lsx_vec:
      vld             vr1, a0, 0
      vld             vr2, a1, 0
      /* first buffer */
      vreplgr2vr.w    vr3, t1
      vreplgr2vr.w    vr4, t1
      vmaddwev.w.h    vr3, vr0, vr1
      vmaddwod.w.h    vr4, vr0, vr1
      vsrai.w         vr3, vr3, 11
      vsrai.w         vr4, vr4, 11
      vpackev.h       vr1, vr4, vr3
      /* second buffer */
      vreplgr2vr.w    vr5, t1
      vreplgr2vr.w    vr6, t1
      vmaddwev.w.h    vr5, vr0, vr2
      vmaddwod.w.h    vr6, vr0, vr2
      vsrai.w         vr5, vr5, 11
      vsrai.w         vr6, vr6, 11
      vpackev.h       vr2, vr6, vr5
      vst             vr1, a0, 0
      vst             vr2, a1, 0
      addi.d          t2, t2, -1
      addi.d          a0, a0, 16
      addi.d          a1, a1, 16
      bnez            t2, .Lchr_from_jpeg_lsx_vec
  .Lchr_from_jpeg_lsx_rest:
      beqz            t3, .Lchr_from_jpeg_lsx_done
  .Lchr_from_jpeg_lsx_one:
      ld.h            t4, a0, 0
      ld.h            t5, a1, 0
      addi.d          t3, t3, -1
      mul.w           t4, t4, t0
      add.w           t4, t4, t1
      srai.w          t4, t4, 11
      mul.w           t5, t5, t0
      add.w           t5, t5, t1
      srai.w          t5, t5, 11
      st.h            t4, a0, 0
      st.h            t5, a1, 0
      addi.d          a0, a0, 2
      addi.d          a1, a1, 2
      bnez            t3, .Lchr_from_jpeg_lsx_one
  .Lchr_from_jpeg_lsx_done:
  endfunc
  /*
   * chrRangeFromJpeg_lasx: rescale two buffers of signed 16-bit samples in
   * place.
   *     a0 = first buffer, a1 = second buffer, a2 = samples per buffer
   *     x  = (x * 1799 + 4081085) >> 11
   * Sixteen samples of each buffer per LASX iteration, then up to fifteen
   * samples of each one at a time.  Both buffers are read before either is
   * written.
   */
  function chrRangeFromJpeg_lasx
      srli.w          t2, a2, 4                   /* number of 16-sample groups */
      andi            t3, a2, 15                  /* samples left after the groups */
      li.w            t0, 1799                    /* multiplier */
      li.w            t1, 4081085                 /* additive bias */
      xvreplgr2vr.h   xr0, t0
      beqz            t2, .Lchr_from_jpeg_lasx_rest
  .Lchr_from_jpeg_lasx_vec:
      xvld            xr1, a0, 0
      xvld            xr2, a1, 0
      /* first buffer */
      xvreplgr2vr.w   xr3, t1
      xvreplgr2vr.w   xr4, t1
      xvmaddwev.w.h   xr3, xr0, xr1
      xvmaddwod.w.h   xr4, xr0, xr1
      xvsrai.w        xr3, xr3, 11
      xvsrai.w        xr4, xr4, 11
      xvpackev.h      xr1, xr4, xr3
      /* second buffer */
      xvreplgr2vr.w   xr5, t1
      xvreplgr2vr.w   xr6, t1
      xvmaddwev.w.h   xr5, xr0, xr2
      xvmaddwod.w.h   xr6, xr0, xr2
      xvsrai.w        xr5, xr5, 11
      xvsrai.w        xr6, xr6, 11
      xvpackev.h      xr2, xr6, xr5
      xvst            xr1, a0, 0
      xvst            xr2, a1, 0
      addi.d          t2, t2, -1
      addi.d          a0, a0, 32
      addi.d          a1, a1, 32
      bnez            t2, .Lchr_from_jpeg_lasx_vec
  .Lchr_from_jpeg_lasx_rest:
      beqz            t3, .Lchr_from_jpeg_lasx_done
  .Lchr_from_jpeg_lasx_one:
      ld.h            t4, a0, 0
      ld.h            t5, a1, 0
      addi.d          t3, t3, -1
      mul.w           t4, t4, t0
      add.w           t4, t4, t1
      srai.w          t4, t4, 11
      mul.w           t5, t5, t0
      add.w           t5, t5, t1
      srai.w          t5, t5, 11
      st.h            t4, a0, 0
      st.h            t5, a1, 0
      addi.d          a0, a0, 2
      addi.d          a1, a1, 2
      bnez            t3, .Lchr_from_jpeg_lasx_one
  .Lchr_from_jpeg_lasx_done:
  endfunc
  /*
   * chrRangeToJpeg_lsx: rescale two buffers of signed 16-bit samples in place.
   *     a0 = first buffer, a1 = second buffer, a2 = samples per buffer
   *     x  = (min(x, 30775) * 4663 - 9289992) >> 12
   * Eight samples of each buffer per LSX iteration, then up to seven samples
   * of each one at a time.  Both buffers are read before either is written.
   */
  function chrRangeToJpeg_lsx
      li.w            t2, 30775                   /* input ceiling */
      vreplgr2vr.h    vr7, t2
      li.w            t0, 4663                    /* multiplier */
      vreplgr2vr.h    vr0, t0
      li.w            t1, -9289992                /* additive bias */
      andi            t3, a2, 7                   /* samples left after the groups */
      srli.w          t2, a2, 3                   /* t2 now counts 8-sample groups */
      beqz            t2, .Lchr_to_jpeg_lsx_rest
  .Lchr_to_jpeg_lsx_vec:
      vld             vr1, a0, 0
      vld             vr2, a1, 0
      /* first buffer */
      vreplgr2vr.w    vr3, t1
      vreplgr2vr.w    vr4, t1
      vmin.h          vr1, vr1, vr7
      vmaddwev.w.h    vr3, vr0, vr1
      vmaddwod.w.h    vr4, vr0, vr1
      vsrai.w         vr3, vr3, 12
      vsrai.w         vr4, vr4, 12
      vpackev.h       vr1, vr4, vr3
      /* second buffer */
      vreplgr2vr.w    vr5, t1
      vreplgr2vr.w    vr6, t1
      vmin.h          vr2, vr2, vr7
      vmaddwev.w.h    vr5, vr0, vr2
      vmaddwod.w.h    vr6, vr0, vr2
      vsrai.w         vr5, vr5, 12
      vsrai.w         vr6, vr6, 12
      vpackev.h       vr2, vr6, vr5
      vst             vr1, a0, 0
      vst             vr2, a1, 0
      addi.d          t2, t2, -1
      addi.d          a0, a0, 16
      addi.d          a1, a1, 16
      bnez            t2, .Lchr_to_jpeg_lsx_vec
  .Lchr_to_jpeg_lsx_rest:
      beqz            t3, .Lchr_to_jpeg_lsx_done
  .Lchr_to_jpeg_lsx_one:
      ld.h            t4, a0, 0
      ld.h            t5, a1, 0
      addi.d          t3, t3, -1
      /* first buffer: clamp through the vector unit, then scale */
      vreplgr2vr.h    vr1, t4
      vmin.h          vr1, vr1, vr7
      vpickve2gr.h    t4, vr1, 0
      mul.w           t4, t4, t0
      add.w           t4, t4, t1
      srai.w          t4, t4, 12
      /* second buffer */
      vreplgr2vr.h    vr2, t5
      vmin.h          vr2, vr2, vr7
      vpickve2gr.h    t5, vr2, 0
      mul.w           t5, t5, t0
      add.w           t5, t5, t1
      srai.w          t5, t5, 12
      st.h            t4, a0, 0
      st.h            t5, a1, 0
      addi.d          a0, a0, 2
      addi.d          a1, a1, 2
      bnez            t3, .Lchr_to_jpeg_lsx_one
  .Lchr_to_jpeg_lsx_done:
  endfunc
  /*
   * chrRangeToJpeg_lasx: rescale two buffers of signed 16-bit samples in place.
   *     a0 = first buffer, a1 = second buffer, a2 = samples per buffer
   *     x  = (min(x, 30775) * 4663 - 9289992) >> 12
   * Sixteen samples of each buffer per LASX iteration, then up to fifteen
   * samples of each one at a time.  Both buffers are read before either is
   * written.  The single-sample loop clamps with 128-bit LSX instructions;
   * vr7 is the low half of xr7.
   */
  function chrRangeToJpeg_lasx
      li.w            t2, 30775                   /* input ceiling */
      xvreplgr2vr.h   xr7, t2
      li.w            t0, 4663                    /* multiplier */
      xvreplgr2vr.h   xr0, t0
      li.w            t1, -9289992                /* additive bias */
      andi            t3, a2, 15                  /* samples left after the groups */
      srli.w          t2, a2, 4                   /* t2 now counts 16-sample groups */
      beqz            t2, .Lchr_to_jpeg_lasx_rest
  .Lchr_to_jpeg_lasx_vec:
      xvld            xr1, a0, 0
      xvld            xr2, a1, 0
      /* first buffer */
      xvreplgr2vr.w   xr3, t1
      xvreplgr2vr.w   xr4, t1
      xvmin.h         xr1, xr1, xr7
      xvmaddwev.w.h   xr3, xr0, xr1
      xvmaddwod.w.h   xr4, xr0, xr1
      xvsrai.w        xr3, xr3, 12
      xvsrai.w        xr4, xr4, 12
      xvpackev.h      xr1, xr4, xr3
      /* second buffer */
      xvreplgr2vr.w   xr5, t1
      xvreplgr2vr.w   xr6, t1
      xvmin.h         xr2, xr2, xr7
      xvmaddwev.w.h   xr5, xr0, xr2
      xvmaddwod.w.h   xr6, xr0, xr2
      xvsrai.w        xr5, xr5, 12
      xvsrai.w        xr6, xr6, 12
      xvpackev.h      xr2, xr6, xr5
      xvst            xr1, a0, 0
      xvst            xr2, a1, 0
      addi.d          t2, t2, -1
      addi.d          a0, a0, 32
      addi.d          a1, a1, 32
      bnez            t2, .Lchr_to_jpeg_lasx_vec
  .Lchr_to_jpeg_lasx_rest:
      beqz            t3, .Lchr_to_jpeg_lasx_done
  .Lchr_to_jpeg_lasx_one:
      ld.h            t4, a0, 0
      ld.h            t5, a1, 0
      addi.d          t3, t3, -1
      /* first buffer: clamp through the vector unit, then scale */
      vreplgr2vr.h    vr1, t4
      vmin.h          vr1, vr1, vr7
      vpickve2gr.h    t4, vr1, 0
      mul.w           t4, t4, t0
      add.w           t4, t4, t1
      srai.w          t4, t4, 12
      /* second buffer */
      vreplgr2vr.h    vr2, t5
      vmin.h          vr2, vr2, vr7
      vpickve2gr.h    t5, vr2, 0
      mul.w           t5, t5, t0
      add.w           t5, t5, t1
      srai.w          t5, t5, 12
      st.h            t4, a0, 0
      st.h            t5, a1, 0
      addi.d          a0, a0, 2
      addi.d          a1, a1, 2
      bnez            t3, .Lchr_to_jpeg_lasx_one
  .Lchr_to_jpeg_lasx_done:
  endfunc