swscale_lasx.c 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972
  1. /*
  2. * Copyright (C) 2022 Loongson Technology Corporation Limited
  3. * Contributed by Hao Chen(chenhao@loongson.cn)
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "swscale_loongarch.h"
  22. #include "libavutil/loongarch/loongson_intrinsics.h"
  23. #include "libavutil/intreadwrite.h"
  /*
   * SCALE_8_16(_sh): statement-block macro expanded in the filterSize == 8
   * branch of ff_hscale_8_to_15_lasx.
   *
   * It is not self-contained. The expansion site must provide the pointers
   * src, filter, filterPos and dst, the __m256i temporaries src0..src15 and
   * filter0..filter7, and the vectors vmax and shuf.
   *
   * Per expansion it:
   *   - reads filterPos[0..15] and performs one __lasx_xvldrepl_d load from
   *     src + filterPos[k] for each k;
   *   - loads eight vectors from filter at byte offsets 0, 32, ..., 224;
   *   - multiplies/accumulates with __lasx_xvdp2_w_h and reduces with
   *     __lasx_xvhaddw_d_w / __lasx_xvpickev_w;
   *   - shifts the word results right by _sh (__lasx_xvsrai_w) and takes the
   *     minimum with vmax (__lasx_xvmin_w);
   *   - reorders with shuf, narrows with __lasx_xvpickev_h and writes one
   *     vector to dst with __lasx_xvst.
   *
   * Side effects on the caller's variables: filterPos += 16, filter += 128,
   * dst += 16; src0..src15 and filter0..filter7 are overwritten.
   *
   * NOTE(review): this looks like 16 output samples of an 8-tap filter over
   * 8-bit input per expansion -- confirm the lane layout against the LASX
   * intrinsic reference.
   */
  24. #define SCALE_8_16(_sh) \
  25. { \
  26. src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
  27. src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
  28. src2 = __lasx_xvldrepl_d(src + filterPos[2], 0); \
  29. src3 = __lasx_xvldrepl_d(src + filterPos[3], 0); \
  30. src4 = __lasx_xvldrepl_d(src + filterPos[4], 0); \
  31. src5 = __lasx_xvldrepl_d(src + filterPos[5], 0); \
  32. src6 = __lasx_xvldrepl_d(src + filterPos[6], 0); \
  33. src7 = __lasx_xvldrepl_d(src + filterPos[7], 0); \
  34. src8 = __lasx_xvldrepl_d(src + filterPos[8], 0); \
  35. src9 = __lasx_xvldrepl_d(src + filterPos[9], 0); \
  36. src10 = __lasx_xvldrepl_d(src + filterPos[10], 0); \
  37. src11 = __lasx_xvldrepl_d(src + filterPos[11], 0); \
  38. src12 = __lasx_xvldrepl_d(src + filterPos[12], 0); \
  39. src13 = __lasx_xvldrepl_d(src + filterPos[13], 0); \
  40. src14 = __lasx_xvldrepl_d(src + filterPos[14], 0); \
  41. src15 = __lasx_xvldrepl_d(src + filterPos[15], 0); \
  42. DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64, \
  43. filter, 96, filter0, filter1, filter2, filter3); \
  44. DUP4_ARG2(__lasx_xvld, filter, 128, filter, 160, \
  45. filter, 192, filter, 224, filter4, \
  46. filter5, filter6, filter7); \
  47. DUP4_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2, \
  48. src5, src4, src7, src6, src0, src2, src4, src6); \
  49. DUP4_ARG2(__lasx_xvilvl_d, src9, src8, src11, src10, \
  50. src13, src12, src15, src14, src8, src10, src12, src14); \
  51. DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src4, src6, \
  52. src0, src2, src4, src6); \
  53. DUP4_ARG1(__lasx_vext2xv_hu_bu, src8, src10, src12, \
  54. src14, src8, src10, src12, src14); \
  55. DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2, \
  56. filter2, src4, filter3, src6, src0, src1, src2, src3); \
  57. DUP4_ARG2(__lasx_xvdp2_w_h, filter4, src8, filter5, src10, \
  58. filter6, src12, filter7, src14, src4, src5, src6, src7);\
  59. src0 = __lasx_xvhaddw_d_w(src0, src0); \
  60. src1 = __lasx_xvhaddw_d_w(src1, src1); \
  61. src2 = __lasx_xvhaddw_d_w(src2, src2); \
  62. src3 = __lasx_xvhaddw_d_w(src3, src3); \
  63. src4 = __lasx_xvhaddw_d_w(src4, src4); \
  64. src5 = __lasx_xvhaddw_d_w(src5, src5); \
  65. src6 = __lasx_xvhaddw_d_w(src6, src6); \
  66. src7 = __lasx_xvhaddw_d_w(src7, src7); \
  67. DUP4_ARG2(__lasx_xvpickev_w, src1, src0, src3, src2, \
  68. src5, src4, src7, src6, src0, src1, src2, src3); \
  69. src0 = __lasx_xvhaddw_d_w(src0, src0); \
  70. src1 = __lasx_xvhaddw_d_w(src1, src1); \
  71. src2 = __lasx_xvhaddw_d_w(src2, src2); \
  72. src3 = __lasx_xvhaddw_d_w(src3, src3); \
  73. src0 = __lasx_xvpickev_w(src1, src0); \
  74. src1 = __lasx_xvpickev_w(src3, src2); \
  75. src0 = __lasx_xvsrai_w(src0, _sh); \
  76. src1 = __lasx_xvsrai_w(src1, _sh); \
  77. src0 = __lasx_xvmin_w(src0, vmax); \
  78. src1 = __lasx_xvmin_w(src1, vmax); \
  79. src0 = __lasx_xvperm_w(src0, shuf); \
  80. src1 = __lasx_xvperm_w(src1, shuf); \
  81. src0 = __lasx_xvpickev_h(src1, src0); \
  82. src0 = __lasx_xvpermi_d(src0, 0xd8); \
  83. __lasx_xvst(src0, dst, 0); \
  84. filterPos += 16; \
  85. filter += 128; \
  86. dst += 16; \
  87. }
  /*
   * SCALE_8_8(_sh): statement-block macro for the filterSize == 8 paths.
   *
   * Requires at the expansion site: src, filter, filterPos, the __m256i
   * temporaries src0..src7 and filter0..filter3, and the vectors vmax and
   * shuf.
   *
   * Reads filterPos[0..7], loads from src + filterPos[k] with
   * __lasx_xvldrepl_d, loads four filter vectors at byte offsets 0..96 and
   * reduces the __lasx_xvdp2_w_h products. The result is left in src0 after
   * the shift by _sh, the minimum with vmax and the permutation by shuf.
   *
   * Unlike SCALE_8_16 this macro does NOT store to dst and does NOT advance
   * dst; the caller performs the store. It does advance filterPos by 8 and
   * filter by 64 (right after the loads, before the arithmetic).
   *
   * NOTE(review): presumably 8 output samples of an 8-tap filter per
   * expansion -- confirm against the LASX intrinsic reference.
   */
  88. #define SCALE_8_8(_sh) \
  89. { \
  90. src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
  91. src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
  92. src2 = __lasx_xvldrepl_d(src + filterPos[2], 0); \
  93. src3 = __lasx_xvldrepl_d(src + filterPos[3], 0); \
  94. src4 = __lasx_xvldrepl_d(src + filterPos[4], 0); \
  95. src5 = __lasx_xvldrepl_d(src + filterPos[5], 0); \
  96. src6 = __lasx_xvldrepl_d(src + filterPos[6], 0); \
  97. src7 = __lasx_xvldrepl_d(src + filterPos[7], 0); \
  98. DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64, \
  99. filter, 96, filter0, filter1, filter2, filter3); \
  100. filterPos += 8; \
  101. filter += 64; \
  102. DUP4_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2, \
  103. src5, src4, src7, src6, src0, src2, src4, src6); \
  104. DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src4, src6, \
  105. src0, src2, src4, src6); \
  106. DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2, \
  107. filter2, src4, filter3, src6, src0, src1, src2,src3); \
  108. src0 = __lasx_xvhaddw_d_w(src0, src0); \
  109. src1 = __lasx_xvhaddw_d_w(src1, src1); \
  110. src2 = __lasx_xvhaddw_d_w(src2, src2); \
  111. src3 = __lasx_xvhaddw_d_w(src3, src3); \
  112. src0 = __lasx_xvpickev_w(src1, src0); \
  113. src1 = __lasx_xvpickev_w(src3, src2); \
  114. src0 = __lasx_xvhaddw_d_w(src0, src0); \
  115. src1 = __lasx_xvhaddw_d_w(src1, src1); \
  116. src0 = __lasx_xvpickev_w(src1, src0); \
  117. src0 = __lasx_xvsrai_w(src0, _sh); \
  118. src0 = __lasx_xvmin_w(src0, vmax); \
  119. src0 = __lasx_xvperm_w(src0, shuf); \
  120. }
  /*
   * SCALE_8_4(_sh): statement-block macro for the filterSize == 8 paths.
   *
   * Requires at the expansion site: src, filter, filterPos, the __m256i
   * temporaries src0..src3, filter0 and filter1, and the vectors vmax and
   * shuf.
   *
   * Reads filterPos[0..3], loads two filter vectors (byte offsets 0 and 32),
   * and leaves the shifted (by _sh), vmax-limited, shuf-permuted result in
   * src0. It advances filterPos by 4 and filter by 32; it neither stores to
   * dst nor advances it -- the caller does both.
   *
   * NOTE(review): the operands of __lasx_xvdp2_w_h are passed as
   * (src, filter) here, whereas the sibling macros pass (filter, src).
   * Presumably harmless because the products are commutative -- confirm.
   */
  121. #define SCALE_8_4(_sh) \
  122. { \
  123. src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
  124. src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
  125. src2 = __lasx_xvldrepl_d(src + filterPos[2], 0); \
  126. src3 = __lasx_xvldrepl_d(src + filterPos[3], 0); \
  127. filter0 = __lasx_xvld(filter, 0); \
  128. filter1 = __lasx_xvld(filter, 32); \
  129. filterPos += 4; \
  130. filter += 32; \
  131. src0 = __lasx_xvilvl_d(src1, src0); \
  132. src2 = __lasx_xvilvl_d(src3, src2); \
  133. src0 = __lasx_vext2xv_hu_bu(src0); \
  134. src2 = __lasx_vext2xv_hu_bu(src2); \
  135. src0 = __lasx_xvdp2_w_h(src0, filter0); \
  136. src1 = __lasx_xvdp2_w_h(src2, filter1); \
  137. src0 = __lasx_xvhaddw_d_w(src0, src0); \
  138. src1 = __lasx_xvhaddw_d_w(src1, src1); \
  139. src0 = __lasx_xvpickev_w(src1, src0); \
  140. src0 = __lasx_xvhaddw_d_w(src0, src0); \
  141. src0 = __lasx_xvpickev_w(src0, src0); \
  142. src0 = __lasx_xvsrai_w(src0, _sh); \
  143. src0 = __lasx_xvmin_w(src0, vmax); \
  144. src0 = __lasx_xvperm_w(src0, shuf); \
  145. }
  /*
   * SCALE_8_2(_sh): statement-block macro for the filterSize == 8 paths.
   *
   * Requires at the expansion site: src, filter, filterPos, dst, the
   * __m256i temporaries src0, src1 and filter0, and the vector vmax
   * (shuf is not used here).
   *
   * Reads filterPos[0..1], loads one filter vector, reduces the
   * __lasx_xvdp2_w_h products with __lasx_xvhaddw_d_w and
   * __lasx_xvhaddw_q_d, shifts by _sh and takes the minimum with vmax.
   * The two results are extracted from word elements 0 and 4 of src0 and
   * written to dst[0] and dst[1].
   *
   * Side effects: filterPos += 2, filter += 16, dst += 2.
   */
  146. #define SCALE_8_2(_sh) \
  147. { \
  148. src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
  149. src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
  150. filter0 = __lasx_xvld(filter, 0); \
  151. src0 = __lasx_xvilvl_d(src1, src0); \
  152. src0 = __lasx_vext2xv_hu_bu(src0); \
  153. src0 = __lasx_xvdp2_w_h(filter0, src0); \
  154. src0 = __lasx_xvhaddw_d_w(src0, src0); \
  155. src0 = __lasx_xvhaddw_q_d(src0, src0); \
  156. src0 = __lasx_xvsrai_w(src0, _sh); \
  157. src0 = __lasx_xvmin_w(src0, vmax); \
  158. dst[0] = __lasx_xvpickve2gr_w(src0, 0); \
  159. dst[1] = __lasx_xvpickve2gr_w(src0, 4); \
  160. filterPos += 2; \
  161. filter += 16; \
  162. dst += 2; \
  163. }
  /*
   * SCALE_4_16(_sh): statement-block macro expanded in the filterSize == 4
   * branch of ff_hscale_8_to_15_lasx.
   *
   * Requires at the expansion site: src, filter, filterPos, dst, the
   * __m256i temporaries src0..src15 and filter0..filter3, and the vectors
   * vmax and shuf.
   *
   * Reads filterPos[0..15] with one __lasx_xvldrepl_w load from
   * src + filterPos[k] each, loads four filter vectors (byte offsets
   * 0..96), packs the source loads with __lasx_xvilvl_w / __lasx_xvilvl_d,
   * and reduces the __lasx_xvdp2_w_h products with __lasx_xvhaddw_d_w.
   * The word results are shifted by _sh, limited by vmax, narrowed with
   * __lasx_xvpickev_h, permuted by shuf and stored to dst as one vector.
   *
   * Side effects: filterPos += 16, filter += 64, dst += 16.
   *
   * NOTE(review): presumably 16 output samples of a 4-tap filter per
   * expansion -- confirm against the LASX intrinsic reference.
   */
  164. #define SCALE_4_16(_sh) \
  165. { \
  166. src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
  167. src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
  168. src2 = __lasx_xvldrepl_w(src + filterPos[2], 0); \
  169. src3 = __lasx_xvldrepl_w(src + filterPos[3], 0); \
  170. src4 = __lasx_xvldrepl_w(src + filterPos[4], 0); \
  171. src5 = __lasx_xvldrepl_w(src + filterPos[5], 0); \
  172. src6 = __lasx_xvldrepl_w(src + filterPos[6], 0); \
  173. src7 = __lasx_xvldrepl_w(src + filterPos[7], 0); \
  174. src8 = __lasx_xvldrepl_w(src + filterPos[8], 0); \
  175. src9 = __lasx_xvldrepl_w(src + filterPos[9], 0); \
  176. src10 = __lasx_xvldrepl_w(src + filterPos[10], 0); \
  177. src11 = __lasx_xvldrepl_w(src + filterPos[11], 0); \
  178. src12 = __lasx_xvldrepl_w(src + filterPos[12], 0); \
  179. src13 = __lasx_xvldrepl_w(src + filterPos[13], 0); \
  180. src14 = __lasx_xvldrepl_w(src + filterPos[14], 0); \
  181. src15 = __lasx_xvldrepl_w(src + filterPos[15], 0); \
  182. DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64, \
  183. filter, 96, filter0, filter1, filter2, filter3); \
  184. DUP4_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src5, \
  185. src4, src7, src6, src0, src2, src4, src6); \
  186. DUP4_ARG2(__lasx_xvilvl_w, src9, src8, src11, src10, src13, \
  187. src12, src15, src14, src8, src10, src12, src14); \
  188. DUP4_ARG2(__lasx_xvilvl_d, src2, src0, src6, src4, src10, \
  189. src8, src14, src12, src0, src1, src2, src3); \
  190. DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src1, src2, src3, \
  191. src0, src1, src2, src3); \
  192. DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src1, \
  193. filter2, src2, filter3, src3, src0, src1, src2, src3); \
  194. src0 = __lasx_xvhaddw_d_w(src0, src0); \
  195. src1 = __lasx_xvhaddw_d_w(src1, src1); \
  196. src2 = __lasx_xvhaddw_d_w(src2, src2); \
  197. src3 = __lasx_xvhaddw_d_w(src3, src3); \
  198. src0 = __lasx_xvpickev_w(src1, src0); \
  199. src1 = __lasx_xvpickev_w(src3, src2); \
  200. src0 = __lasx_xvsrai_w(src0, _sh); \
  201. src1 = __lasx_xvsrai_w(src1, _sh); \
  202. src0 = __lasx_xvmin_w(src0, vmax); \
  203. src1 = __lasx_xvmin_w(src1, vmax); \
  204. src0 = __lasx_xvpickev_h(src1, src0); \
  205. src0 = __lasx_xvperm_w(src0, shuf); \
  206. __lasx_xvst(src0, dst, 0); \
  207. filterPos += 16; \
  208. filter += 64; \
  209. dst += 16; \
  210. }
  /*
   * SCALE_4_8(_sh): statement-block macro for the filterSize == 4 paths.
   *
   * Requires at the expansion site: src, filter, filterPos, the __m256i
   * temporaries src0..src7, filter0 and filter1, and the vector vmax.
   *
   * Reads filterPos[0..7], loads two filter vectors (byte offsets 0 and
   * 32) and leaves the shifted (by _sh), vmax-limited word results in
   * src0. No permutation is applied and nothing is stored: each caller
   * applies its own __lasx_xvperm_w / narrowing and performs the store.
   * src1 still holds the second __lasx_xvhaddw_d_w result on exit, and one
   * caller reads it afterwards.
   *
   * Side effects: filterPos += 8, filter += 32 (dst is untouched).
   */
  211. #define SCALE_4_8(_sh) \
  212. { \
  213. src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
  214. src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
  215. src2 = __lasx_xvldrepl_w(src + filterPos[2], 0); \
  216. src3 = __lasx_xvldrepl_w(src + filterPos[3], 0); \
  217. src4 = __lasx_xvldrepl_w(src + filterPos[4], 0); \
  218. src5 = __lasx_xvldrepl_w(src + filterPos[5], 0); \
  219. src6 = __lasx_xvldrepl_w(src + filterPos[6], 0); \
  220. src7 = __lasx_xvldrepl_w(src + filterPos[7], 0); \
  221. filter0 = __lasx_xvld(filter, 0); \
  222. filter1 = __lasx_xvld(filter, 32); \
  223. filterPos += 8; \
  224. filter += 32; \
  225. DUP4_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src5, \
  226. src4, src7, src6, src0, src2, src4, src6); \
  227. src0 = __lasx_xvilvl_d(src2, src0); \
  228. src1 = __lasx_xvilvl_d(src6, src4); \
  229. \
  230. src0 = __lasx_vext2xv_hu_bu(src0); \
  231. src1 = __lasx_vext2xv_hu_bu(src1); \
  232. src0 = __lasx_xvdp2_w_h(filter0, src0); \
  233. src1 = __lasx_xvdp2_w_h(filter1, src1); \
  234. src0 = __lasx_xvhaddw_d_w(src0, src0); \
  235. src1 = __lasx_xvhaddw_d_w(src1, src1); \
  236. src0 = __lasx_xvpickev_w(src1, src0); \
  237. src0 = __lasx_xvsrai_w(src0, _sh); \
  238. src0 = __lasx_xvmin_w(src0, vmax); \
  239. }
  /*
   * SCALE_4_4(_sh): statement-block macro for the filterSize == 4 paths.
   *
   * Requires at the expansion site: src, filter, filterPos, the __m256i
   * temporaries src0..src3 and filter0, and the vector vmax.
   *
   * Reads filterPos[0..3], loads one filter vector and leaves the result
   * in src0 after the shift by _sh, the minimum with vmax, an even-word
   * pick (__lasx_xvpickev_w) and __lasx_xvpermi_d with selector 0xd8.
   * Nothing is stored; the caller writes src0 to dst.
   *
   * Side effects: filterPos += 4, filter += 16 (dst is untouched).
   */
  240. #define SCALE_4_4(_sh) \
  241. { \
  242. src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
  243. src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
  244. src2 = __lasx_xvldrepl_w(src + filterPos[2], 0); \
  245. src3 = __lasx_xvldrepl_w(src + filterPos[3], 0); \
  246. filter0 = __lasx_xvld(filter, 0); \
  247. filterPos += 4; \
  248. filter += 16; \
  249. src0 = __lasx_xvilvl_w(src1, src0); \
  250. src1 = __lasx_xvilvl_w(src3, src2); \
  251. \
  252. src0 = __lasx_xvilvl_d(src1, src0); \
  253. src0 = __lasx_vext2xv_hu_bu(src0); \
  254. src0 = __lasx_xvdp2_w_h(filter0, src0); \
  255. src0 = __lasx_xvhaddw_d_w(src0, src0); \
  256. src0 = __lasx_xvsrai_w(src0, _sh); \
  257. src0 = __lasx_xvmin_w(src0, vmax); \
  258. src0 = __lasx_xvpickev_w(src0, src0); \
  259. src0 = __lasx_xvpermi_d(src0, 0xd8); \
  260. }
  /*
   * SCALE_4_2(_sh): statement-block macro for the filterSize == 4 paths.
   *
   * Requires at the expansion site: src, filter, filterPos, dst, the
   * __m256i temporaries src0, src1 and filter0, and the vector vmax.
   *
   * Reads filterPos[0..1], loads one filter vector, reduces the
   * __lasx_xvdp2_w_h products with __lasx_xvhaddw_d_w, shifts by _sh and
   * takes the minimum with vmax. The two results are extracted from word
   * elements 0 and 2 of src0 and written to dst[0] and dst[1].
   *
   * Side effects: filterPos += 2, filter += 8, dst += 2.
   */
  261. #define SCALE_4_2(_sh) \
  262. { \
  263. src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
  264. src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
  265. filter0 = __lasx_xvld(filter, 0); \
  266. src0 = __lasx_xvilvl_w(src1, src0); \
  267. src0 = __lasx_vext2xv_hu_bu(src0); \
  268. src0 = __lasx_xvdp2_w_h(filter0, src0); \
  269. src0 = __lasx_xvhaddw_d_w(src0, src0); \
  270. src0 = __lasx_xvsrai_w(src0, _sh); \
  271. src0 = __lasx_xvmin_w(src0, vmax); \
  272. dst[0] = __lasx_xvpickve2gr_w(src0, 0); \
  273. dst[1] = __lasx_xvpickve2gr_w(src0, 2); \
  274. filterPos += 2; \
  275. filter += 8; \
  276. dst += 2; \
  277. }
  /*
   * SCALE_16 (8-bit source variant): one accumulation step of the
   * filterSize > 8 loops in ff_hscale_8_to_15_lasx and
   * ff_hscale_8_to_19_lasx. It is #undef'd and redefined further down for
   * 16-bit sources.
   *
   * Requires at the expansion site: the tap index j, the source pointers
   * srcPos1..srcPos4, the coefficient pointers filterStart1..filterStart4,
   * the __m256i temporaries src0..src3, filter0..filter3, out0, out1, the
   * accumulator out, and the vector zero.
   *
   * dex = j << 1 is passed as the offset to __lasx_xvldx on the int16_t
   * filterStart pointers (NOTE(review): presumably a byte offset, i.e.
   * element j -- confirm). The source bytes loaded from srcPosN + j are
   * interleaved with zero (__lasx_xvilvl_b) before __lasx_xvdp2_w_h.
   *
   * The step's partial sums are added into out with __lasx_xvadd_w. The
   * callers read the four totals from word elements 0, 4, 2 and 6 of out
   * for outputs 1..4 respectively. No pointer and not j is modified here;
   * the enclosing for loop advances j by 8.
   */
  278. #define SCALE_16 \
  279. { \
  280. int dex = j << 1; \
  281. src0 = __lasx_xvldrepl_d((srcPos1 + j), 0); \
  282. src1 = __lasx_xvldrepl_d((srcPos2 + j), 0); \
  283. src2 = __lasx_xvldrepl_d((srcPos3 + j), 0); \
  284. src3 = __lasx_xvldrepl_d((srcPos4 + j), 0); \
  285. DUP4_ARG2(__lasx_xvldx, filterStart1, dex, filterStart2, dex, \
  286. filterStart3, dex, filterStart4, dex, filter0, \
  287. filter1, filter2, filter3); \
  288. src0 = __lasx_xvpermi_q(src0, src1, 0x02); \
  289. src1 = __lasx_xvpermi_q(src2, src3, 0x02); \
  290. filter0 = __lasx_xvpermi_q(filter0, filter1, 0x02); \
  291. filter1 = __lasx_xvpermi_q(filter2, filter3, 0x02); \
  292. src0 = __lasx_xvilvl_b(zero, src0); \
  293. src1 = __lasx_xvilvl_b(zero, src1); \
  294. out0 = __lasx_xvdp2_w_h(filter0, src0); \
  295. out1 = __lasx_xvdp2_w_h(filter1, src1); \
  296. src0 = __lasx_xvhaddw_d_w(out0, out0); \
  297. src1 = __lasx_xvhaddw_d_w(out1, out1); \
  298. out0 = __lasx_xvpackev_d(src1, src0); \
  299. out1 = __lasx_xvpackod_d(src1, src0); \
  300. out0 = __lasx_xvadd_w(out0, out1); \
  301. out = __lasx_xvadd_w(out, out0); \
  302. }
  /*
   * Horizontally scale one row of 8-bit samples into int16_t output, with
   * LASX vector paths for the common filter sizes.
   *
   * The scalar fallback at the end of the function spells out the
   * per-sample computation:
   *   dst[i] = FFMIN((sum over j < filterSize of
   *                   src[filterPos[i] + j] * filter[i * filterSize + j])
   *                  >> 7, (1 << 15) - 1)
   * NOTE(review): the vector branches are presumably intended to produce
   * the same values; that equivalence is not established by this file.
   *
   * c           not referenced in this function
   * dst         output buffer, dstW int16_t samples are written
   * dstW        number of output samples
   * src         8-bit input samples
   * filter      coefficients, filterSize consecutive entries per output
   * filterPos   for each output, index into src of its first tap
   * filterSize  number of taps per output sample
   *
   * Dispatch:
   *   filterSize == 8  SCALE_8_16 per 16 outputs, then 8/4/2/1 leftovers
   *   filterSize == 4  SCALE_4_16 per 16 outputs, then 8/4/2/1 leftovers
   *   filterSize  > 8  four outputs per iteration, 8 taps per SCALE_16
   *                    step, leftover taps and leftover outputs afterwards
   *   otherwise        plain scalar loop
   *
   * NOTE(review): several vector loads (__lasx_xvld on filter in the
   * single-output tails) fetch a full 256-bit vector although fewer
   * coefficients are consumed; this presumably relies on padding in the
   * caller's filter buffer -- confirm.
   */
  303. void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
  304. const uint8_t *src, const int16_t *filter,
  305. const int32_t *filterPos, int filterSize)
  306. {
  307. int i;
  /* Upper clamp applied to every output value. */
  308. int max = (1 << 15) - 1;
  309. if (filterSize == 8) {
  310. __m256i src0, src1, src2, src3, src4, src5, src6, src7;
  311. __m256i src8, src9, src10, src11, src12, src13, src14, src15;
  312. __m256i filter0, filter1, filter2, filter3;
  313. __m256i filter4, filter5, filter6, filter7;
  314. __m256i vmax = __lasx_xvreplgr2vr_w(max);
  /* Word index vector handed to __lasx_xvperm_w inside the macros. */
  315. __m256i shuf = {0x0000000400000000, 0x0000000500000001,
  316. 0x0000000600000002, 0x0000000700000003};
  /* len: number of full groups of 16 outputs; res: outputs left over. */
  317. int len = dstW >> 4;
  318. int res = dstW & 15;
  319. while (len--) {
  320. SCALE_8_16(7);
  321. }
  /* Leftovers are handled by the set bits of res: 8, 4, 2, then 1. */
  322. if (res & 8) {
  323. SCALE_8_8(7);
  /* SCALE_8_8 leaves word results in src0; narrow and store here. */
  324. src0 = __lasx_xvpickev_h(src0, src0);
  325. __lasx_xvstelm_d(src0, dst, 0, 0);
  326. __lasx_xvstelm_d(src0, dst, 8, 2);
  327. dst += 8;
  328. }
  329. if (res & 4) {
  330. SCALE_8_4(7);
  331. src0 = __lasx_xvpickev_h(src0, src0);
  332. __lasx_xvstelm_d(src0, dst, 0, 0);
  333. dst += 4;
  334. }
  335. if (res & 2) {
  /* SCALE_8_2 writes dst[0..1] and advances dst itself. */
  336. SCALE_8_2(7);
  337. }
  338. if (res & 1) {
  /* Final single output: the sum is taken from word element 0. */
  339. int val = 0;
  340. src0 = __lasx_xvldrepl_d(src + filterPos[0], 0);
  341. filter0 = __lasx_xvld(filter, 0);
  342. src0 = __lasx_vext2xv_hu_bu(src0);
  343. src0 = __lasx_xvdp2_w_h(filter0, src0);
  344. src0 = __lasx_xvhaddw_d_w(src0, src0);
  345. src0 = __lasx_xvhaddw_q_d(src0, src0);
  346. val = __lasx_xvpickve2gr_w(src0, 0);
  347. dst[0] = FFMIN(val >> 7, max);
  348. }
  349. } else if (filterSize == 4) {
  350. __m256i src0, src1, src2, src3, src4, src5, src6, src7;
  351. __m256i src8, src9, src10, src11, src12, src13, src14, src15;
  352. __m256i filter0, filter1, filter2, filter3;
  353. __m256i vmax = __lasx_xvreplgr2vr_w(max);
  354. __m256i shuf = {0x0000000400000000, 0x0000000500000001,
  355. 0x0000000600000002, 0x0000000700000003};
  356. int len = dstW >> 4;
  357. int res = dstW & 15;
  358. while (len--) {
  359. SCALE_4_16(7);
  360. }
  361. if (res & 8) {
  362. SCALE_4_8(7);
  /* src1 is whatever SCALE_4_8 left in it; only elements 0 and 1 of
     the permuted vector are stored below. */
  363. src0 = __lasx_xvpickev_h(src1, src0);
  364. src0 = __lasx_xvperm_w(src0, shuf);
  365. __lasx_xvstelm_d(src0, dst, 0, 0);
  366. __lasx_xvstelm_d(src0, dst, 8, 1);
  367. dst += 8;
  368. }
  369. if (res & 4) {
  370. SCALE_4_4(7);
  371. src0 = __lasx_xvpickev_h(src0, src0);
  372. __lasx_xvstelm_d(src0, dst, 0, 0);
  373. dst += 4;
  374. }
  375. if (res & 2) {
  376. SCALE_4_2(7);
  377. }
  378. if (res & 1) {
  /* Final single output is computed in plain C. */
  379. int val = 0;
  380. const uint8_t *srcPos = src + filterPos[0];
  381. for (int j = 0; j < filterSize; j++) {
  382. val += ((int)srcPos[j]) * filter[j];
  383. }
  384. dst[0] = FFMIN(val >> 7, max);
  385. }
  386. } else if (filterSize > 8) {
  /* j < filterlen holds exactly while a full group of 8 taps remains. */
  387. int filterlen = filterSize - 7;
  /* len: number of full groups of 4 outputs; res: outputs left over. */
  388. int len = dstW >> 2;
  389. int res = dstW & 3;
  390. __m256i zero = __lasx_xvldi(0);
  391. while (len--) {
  392. __m256i src0, src1, src2, src3;
  393. __m256i filter0, filter1, filter2, filter3, out0, out1;
  394. __m256i out = zero;
  395. const uint8_t *srcPos1 = src + filterPos[0];
  396. const uint8_t *srcPos2 = src + filterPos[1];
  397. const uint8_t *srcPos3 = src + filterPos[2];
  398. const uint8_t *srcPos4 = src + filterPos[3];
  /* Coefficient rows of the four outputs lie back to back. */
  399. const int16_t *filterStart1 = filter;
  400. const int16_t *filterStart2 = filterStart1 + filterSize;
  401. const int16_t *filterStart3 = filterStart2 + filterSize;
  402. const int16_t *filterStart4 = filterStart3 + filterSize;
  403. int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
  404. for (j = 0; j < filterlen; j += 8) {
  405. SCALE_16
  406. }
  /* Partial sums of outputs 1..4 are read from word elements
     0, 4, 2 and 6 of the accumulator, in that order. */
  407. val1 = __lasx_xvpickve2gr_w(out, 0);
  408. val2 = __lasx_xvpickve2gr_w(out, 4);
  409. val3 = __lasx_xvpickve2gr_w(out, 2);
  410. val4 = __lasx_xvpickve2gr_w(out, 6);
  /* Taps that did not fill a group of 8 are added in plain C. */
  411. for (; j < filterSize; j++) {
  412. val1 += ((int)srcPos1[j]) * filterStart1[j];
  413. val2 += ((int)srcPos2[j]) * filterStart2[j];
  414. val3 += ((int)srcPos3[j]) * filterStart3[j];
  415. val4 += ((int)srcPos4[j]) * filterStart4[j];
  416. }
  417. dst[0] = FFMIN(val1 >> 7, max);
  418. dst[1] = FFMIN(val2 >> 7, max);
  419. dst[2] = FFMIN(val3 >> 7, max);
  420. dst[3] = FFMIN(val4 >> 7, max);
  421. dst += 4;
  422. filterPos += 4;
  423. filter = filterStart4 + filterSize;
  424. }
  /* Leftover outputs: dst and filterPos were already advanced above,
     so they are indexed with i here; filter is advanced per output. */
  425. for(i = 0; i < res; i++) {
  426. int j, val = 0;
  427. const uint8_t *srcPos = src + filterPos[i];
  428. __m256i src1, filter0, out0;
  429. for (j = 0; j < filterlen; j += 8) {
  430. src1 = __lasx_xvldrepl_d((srcPos + j), 0);
  431. filter0 = __lasx_xvld(filter + j, 0);
  432. src1 = __lasx_xvilvl_b(zero, src1);
  433. out0 = __lasx_xvdp2_w_h(filter0, src1);
  434. out0 = __lasx_xvhaddw_d_w(out0, out0);
  435. out0 = __lasx_xvhaddw_q_d(out0, out0);
  436. val += __lasx_xvpickve2gr_w(out0, 0);
  437. }
  438. for (; j < filterSize; j++) {
  439. val += ((int)srcPos[j]) * filter[j];
  440. }
  441. dst[i] = FFMIN(val >> 7, max);
  442. filter += filterSize;
  443. }
  444. } else {
  /* Scalar path for every other filterSize. */
  445. for (i = 0; i < dstW; i++) {
  446. int val = 0;
  447. const uint8_t *srcPos = src + filterPos[i];
  448. for (int j = 0; j < filterSize; j++) {
  449. val += ((int)srcPos[j]) * filter[j];
  450. }
  451. dst[i] = FFMIN(val >> 7, max);
  452. filter += filterSize;
  453. }
  454. }
  455. }
  /*
   * Horizontally scale one row of 8-bit samples into 32-bit output, with
   * LASX vector paths for the common filter sizes.
   *
   * The scalar fallback at the end of the function spells out the
   * per-sample computation:
   *   dst[i] = FFMIN((sum over j < filterSize of
   *                   src[filterPos[i] + j] * filter[i * filterSize + j])
   *                  >> 3, (1 << 19) - 1)
   * NOTE(review): the vector branches are presumably intended to produce
   * the same values; that equivalence is not established by this file.
   *
   * c           not referenced in this function
   * _dst        output buffer; declared int16_t * but written through an
   *             int32_t * cast, so each output occupies 32 bits
   * dstW        number of output samples
   * src         8-bit input samples
   * filter      coefficients, filterSize consecutive entries per output
   * filterPos   for each output, index into src of its first tap
   * filterSize  number of taps per output sample
   *
   * Dispatch:
   *   filterSize == 8  SCALE_8_8 per 8 outputs, then 4/2/1 leftovers
   *   filterSize == 4  SCALE_4_8 per 8 outputs, then 4/2/1 leftovers
   *   filterSize  > 8  four outputs per iteration, 8 taps per SCALE_16
   *                    step, leftover taps and leftover outputs afterwards
   *   otherwise        plain scalar loop
   */
  456. void ff_hscale_8_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
  457. const uint8_t *src, const int16_t *filter,
  458. const int32_t *filterPos, int filterSize)
  459. {
  460. int i;
  /* Upper clamp applied to every output value. */
  461. int max = (1 << 19) - 1;
  462. int32_t *dst = (int32_t *) _dst;
  463. if (filterSize == 8) {
  464. __m256i src0, src1, src2, src3, src4, src5, src6, src7;
  465. __m256i filter0, filter1, filter2, filter3;
  466. __m256i vmax = __lasx_xvreplgr2vr_w(max);
  /* Word index vector handed to __lasx_xvperm_w inside the macros. */
  467. __m256i shuf = {0x0000000400000000, 0x0000000500000001,
  468. 0x0000000600000002, 0x0000000700000003};
  /* len: number of full groups of 8 outputs; res: outputs left over. */
  469. int len = dstW >> 3;
  470. int res = dstW & 7;
  471. while (len--) {
  /* SCALE_8_8 leaves the word results in src0; store them as is. */
  472. SCALE_8_8(3);
  473. __lasx_xvst(src0, dst, 0);
  474. dst += 8;
  475. }
  476. if (res & 4) {
  477. SCALE_8_4(3);
  478. __lasx_xvstelm_d(src0, dst, 0, 0);
  479. __lasx_xvstelm_d(src0, dst, 8, 1);
  480. dst += 4;
  481. }
  482. if (res & 2) {
  /* SCALE_8_2 writes dst[0..1] and advances dst itself. */
  483. SCALE_8_2(3);
  484. }
  485. if (res & 1) {
  486. int val = 0;
  /* These locals shadow the branch-level src0 and filter0. */
  487. __m256i src0, filter0, out0;
  488. src0 = __lasx_xvldrepl_d(src + filterPos[0], 0);
  489. filter0 = __lasx_xvld(filter, 0);
  490. src0 = __lasx_vext2xv_hu_bu(src0);
  491. out0 = __lasx_xvdp2_w_h(filter0, src0);
  492. out0 = __lasx_xvhaddw_d_w(out0, out0);
  493. out0 = __lasx_xvhaddw_q_d(out0, out0);
  494. val = __lasx_xvpickve2gr_w(out0, 0);
  495. dst[0] = FFMIN(val >> 3, max);
  496. }
  497. } else if (filterSize == 4) {
  498. __m256i src0, src1, src2, src3, src4, src5, src6, src7;
  499. __m256i filter0, filter1;
  500. __m256i vmax = __lasx_xvreplgr2vr_w(max);
  /* A different word order from the filterSize == 8 branch; applied
     to the SCALE_4_8 result before the full-vector store. */
  501. __m256i shuf = {0x0000000100000000, 0x0000000500000004,
  502. 0x0000000300000002, 0x0000000700000006};
  503. int len = dstW >> 3;
  504. int res = dstW & 7;
  505. while (len--) {
  506. SCALE_4_8(3);
  507. src0 = __lasx_xvperm_w(src0, shuf);
  508. __lasx_xvst(src0, dst, 0);
  509. dst += 8;
  510. }
  511. if (res & 4) {
  512. SCALE_4_4(3);
  513. __lasx_xvstelm_d(src0, dst, 0, 0);
  514. __lasx_xvstelm_d(src0, dst, 8, 1);
  515. dst += 4;
  516. }
  517. if (res & 2) {
  518. SCALE_4_2(3);
  519. }
  520. if (res & 1) {
  /* Final single output is computed in plain C. */
  521. int val = 0;
  522. const uint8_t *srcPos = src + filterPos[0];
  523. for (int j = 0; j < filterSize; j++) {
  524. val += ((int)srcPos[j]) * filter[j];
  525. }
  526. dst[0] = FFMIN(val >> 3, max);
  527. }
  528. } else if (filterSize > 8) {
  /* len: number of full groups of 4 outputs; res: outputs left over. */
  529. int len = dstW >> 2;
  530. int res = dstW & 3;
  /* j < filterlen holds exactly while a full group of 8 taps remains. */
  531. int filterlen = filterSize - 7;
  532. __m256i zero = __lasx_xvldi(0);
  533. while (len--) {
  534. __m256i src0, src1, src2, src3;
  535. __m256i filter0, filter1, filter2, filter3, out0, out1;
  536. __m256i out = zero;
  537. const uint8_t *srcPos1 = src + filterPos[0];
  538. const uint8_t *srcPos2 = src + filterPos[1];
  539. const uint8_t *srcPos3 = src + filterPos[2];
  540. const uint8_t *srcPos4 = src + filterPos[3];
  /* Coefficient rows of the four outputs lie back to back. */
  541. const int16_t *filterStart1 = filter;
  542. const int16_t *filterStart2 = filterStart1 + filterSize;
  543. const int16_t *filterStart3 = filterStart2 + filterSize;
  544. const int16_t *filterStart4 = filterStart3 + filterSize;
  545. int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
  546. for (j = 0; j < filterlen; j += 8) {
  547. SCALE_16
  548. }
  /* Partial sums of outputs 1..4 are read from word elements
     0, 4, 2 and 6 of the accumulator, in that order. */
  549. val1 = __lasx_xvpickve2gr_w(out, 0);
  550. val2 = __lasx_xvpickve2gr_w(out, 4);
  551. val3 = __lasx_xvpickve2gr_w(out, 2);
  552. val4 = __lasx_xvpickve2gr_w(out, 6);
  /* Taps that did not fill a group of 8 are added in plain C. */
  553. for (; j < filterSize; j++) {
  554. val1 += ((int)srcPos1[j]) * filterStart1[j];
  555. val2 += ((int)srcPos2[j]) * filterStart2[j];
  556. val3 += ((int)srcPos3[j]) * filterStart3[j];
  557. val4 += ((int)srcPos4[j]) * filterStart4[j];
  558. }
  559. dst[0] = FFMIN(val1 >> 3, max);
  560. dst[1] = FFMIN(val2 >> 3, max);
  561. dst[2] = FFMIN(val3 >> 3, max);
  562. dst[3] = FFMIN(val4 >> 3, max);
  563. dst += 4;
  564. filterPos += 4;
  565. filter = filterStart4 + filterSize;
  566. }
  /* Leftover outputs: dst and filterPos were already advanced above,
     so they are indexed with i here; filter is advanced per output. */
  567. for (i = 0; i < res; i++) {
  568. int j, val = 0;
  569. const uint8_t *srcPos = src + filterPos[i];
  570. __m256i src1, filter0, out0;
  571. for (j = 0; j < filterlen; j += 8) {
  572. src1 = __lasx_xvldrepl_d((srcPos + j), 0);
  573. filter0 = __lasx_xvld(filter + j, 0);
  574. src1 = __lasx_xvilvl_b(zero, src1);
  575. out0 = __lasx_xvdp2_w_h(filter0, src1);
  576. out0 = __lasx_xvhaddw_d_w(out0, out0);
  577. out0 = __lasx_xvhaddw_q_d(out0, out0);
  578. val += __lasx_xvpickve2gr_w(out0, 0);
  579. }
  580. for (; j < filterSize; j++) {
  581. val += ((int)srcPos[j]) * filter[j];
  582. }
  583. dst[i] = FFMIN(val >> 3, max);
  584. filter += filterSize;
  585. }
  586. } else {
  /* Scalar path for every other filterSize. */
  587. for (i = 0; i < dstW; i++) {
  588. int val = 0;
  589. const uint8_t *srcPos = src + filterPos[i];
  590. for (int j = 0; j < filterSize; j++) {
  591. val += ((int)srcPos[j]) * filter[j];
  592. }
  593. dst[i] = FFMIN(val >> 3, max);
  594. filter += filterSize;
  595. }
  596. }
  597. }
  /*
   * The 8-bit-source SCALE_16 is dropped here so that it can be redefined
   * below for 16-bit sources.
   *
   * SCALE_8: statement-block macro for the filterSize == 8 path of the
   * 16-bit-source scaler.
   *
   * It declares its own __m256i temporaries, but still requires at the
   * expansion site: the pointers src, filter, filterPos and dst, and the
   * vectors shift and v_max.
   *
   * Reads filterPos[0..3], loads one full vector from src + filterPos[k]
   * for each, loads two filter vectors (byte offsets 0 and 32), and
   * multiplies with __lasx_xvdp2_w_hu_h (source operand first). The sums
   * are shifted by the per-element counts in shift (__lasx_xvsra_w),
   * limited by v_max and written to dst[0..3] from word elements 0, 4, 2
   * and 6 respectively.
   *
   * Side effects: filterPos += 4, filter += 32, dst += 4.
   */
  598. #undef SCALE_16
  599. #define SCALE_8 \
  600. { \
  601. __m256i src0, src1, src2, src3, filter0, filter1, out0, out1; \
  602. DUP4_ARG2(__lasx_xvld, src + filterPos[0], 0, src + filterPos[1], 0, \
  603. src + filterPos[2], 0, src + filterPos[3], 0, src0, src1, src2,\
  604. src3); \
  605. filter0 = __lasx_xvld(filter, 0); \
  606. filter1 = __lasx_xvld(filter, 32); \
  607. src0 = __lasx_xvpermi_q(src0, src1, 0x02); \
  608. src2 = __lasx_xvpermi_q(src2, src3, 0x02); \
  609. out0 = __lasx_xvdp2_w_hu_h(src0, filter0); \
  610. out1 = __lasx_xvdp2_w_hu_h(src2, filter1); \
  611. src0 = __lasx_xvhaddw_d_w(out0, out0); \
  612. src1 = __lasx_xvhaddw_d_w(out1, out1); \
  613. out0 = __lasx_xvpackev_d(src1, src0); \
  614. out1 = __lasx_xvpackod_d(src1, src0); \
  615. out0 = __lasx_xvadd_w(out0, out1); \
  616. out0 = __lasx_xvsra_w(out0, shift); \
  617. out0 = __lasx_xvmin_w(out0, v_max); \
  618. dst[0] = __lasx_xvpickve2gr_w(out0, 0); \
  619. dst[1] = __lasx_xvpickve2gr_w(out0, 4); \
  620. dst[2] = __lasx_xvpickve2gr_w(out0, 2); \
  621. dst[3] = __lasx_xvpickve2gr_w(out0, 6); \
  622. filterPos += 4; \
  623. filter += 32; \
  624. dst += 4; \
  625. }
  /*
   * SCALE_16 (16-bit source variant): one accumulation step of the
   * filterSize > 8 loop of the 16-bit-source scaler.
   *
   * Requires at the expansion site: the tap index j, the source pointers
   * srcPos1..srcPos4, the coefficient pointers filterStart1..filterStart4,
   * the __m256i temporaries src0..src3, filter0..filter3, out0, out1, and
   * the accumulator out.
   *
   * dex = j << 1 is used as the __lasx_xvldx offset for both the source
   * and the coefficient pointers (NOTE(review): presumably a byte offset
   * selecting 16-bit element j -- confirm). Products are formed with
   * __lasx_xvdp2_w_hu_h (source operand first), reduced, and added into
   * out with __lasx_xvadd_w.
   *
   * No pointer and not j is modified here; the enclosing for loop advances
   * j by 8. The caller reads the four totals from word elements 0, 4, 2
   * and 6 of out.
   */
  626. #define SCALE_16 \
  627. { \
  628. int dex = j << 1; \
  629. DUP4_ARG2(__lasx_xvldx, srcPos1, dex, srcPos2, dex, srcPos3, dex, \
  630. srcPos4, dex, src0, src1, src2, src3); \
  631. DUP4_ARG2(__lasx_xvldx, filterStart1, dex, filterStart2, dex, \
  632. filterStart3, dex, filterStart4, dex, filter0, \
  633. filter1, filter2, filter3); \
  634. src0 = __lasx_xvpermi_q(src0, src1, 0x02); \
  635. src1 = __lasx_xvpermi_q(src2, src3, 0x02); \
  636. filter0 = __lasx_xvpermi_q(filter0, filter1, 0x02); \
  637. filter1 = __lasx_xvpermi_q(filter2, filter3, 0x02); \
  638. out0 = __lasx_xvdp2_w_hu_h(src0, filter0); \
  639. out1 = __lasx_xvdp2_w_hu_h(src1, filter1); \
  640. src0 = __lasx_xvhaddw_d_w(out0, out0); \
  641. src1 = __lasx_xvhaddw_d_w(out1, out1); \
  642. out0 = __lasx_xvpackev_d(src1, src0); \
  643. out1 = __lasx_xvpackod_d(src1, src0); \
  644. out0 = __lasx_xvadd_w(out0, out1); \
  645. out = __lasx_xvadd_w(out, out0); \
  646. }
  647. void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
  648. const uint8_t *_src, const int16_t *filter,
  649. const int32_t *filterPos, int filterSize)
  650. {
  651. const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
  652. int i;
  653. const uint16_t *src = (const uint16_t *) _src;
  654. int sh = desc->comp[0].depth - 1;
  655. int max = (1 << 15) - 1;
  656. int len = dstW >> 2;
  657. int res = dstW & 3;
  658. __m256i shift;
  659. __m256i zero = __lasx_xvldi(0);
  660. if (sh < 15) {
  661. sh = isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 :
  662. (desc->comp[0].depth - 1);
  663. } else if (desc->flags && AV_PIX_FMT_FLAG_FLOAT) {
  664. sh = 15;
  665. }
  666. shift = __lasx_xvreplgr2vr_w(sh);
  667. if (filterSize == 8) {
  668. __m256i v_max = __lasx_xvreplgr2vr_w(max);
  669. for (i = 0; i < len; i++) {
  670. SCALE_8
  671. }
  672. for (i = 0; i < res; i++) {
  673. int val = 0;
  674. __m256i src0, filter0, out0;
  675. src0 = __lasx_xvld(src + filterPos[i], 0);
  676. filter0 = __lasx_xvld(filter, 0);
  677. out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
  678. out0 = __lasx_xvhaddw_d_w(out0, out0);
  679. out0 = __lasx_xvhaddw_q_d(out0, out0);
  680. val = __lasx_xvpickve2gr_w(out0, 0);
  681. dst[i] = FFMIN(val >> sh, max);
  682. filter += 8;
  683. }
  684. } else if (filterSize == 4) {
  685. __m256i v_max = __lasx_xvreplgr2vr_w(max);
  686. for (i = 0; i < len; i++) {
  687. __m256i src1, src2, src3, src4, src0, filter0, out0;
  688. src1 = __lasx_xvldrepl_d(src + filterPos[0], 0);
  689. src2 = __lasx_xvldrepl_d(src + filterPos[1], 0);
  690. src3 = __lasx_xvldrepl_d(src + filterPos[2], 0);
  691. src4 = __lasx_xvldrepl_d(src + filterPos[3], 0);
  692. filter0 = __lasx_xvld(filter, 0);
  693. src1 = __lasx_xvextrins_d(src1, src2, 0x10);
  694. src3 = __lasx_xvextrins_d(src3, src4, 0x10);
  695. src0 = __lasx_xvpermi_q(src1, src3, 0x02);
  696. out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
  697. out0 = __lasx_xvhaddw_d_w(out0, out0);
  698. out0 = __lasx_xvsra_w(out0, shift);
  699. out0 = __lasx_xvmin_w(out0, v_max);
  700. dst[0] = __lasx_xvpickve2gr_w(out0, 0);
  701. dst[1] = __lasx_xvpickve2gr_w(out0, 2);
  702. dst[2] = __lasx_xvpickve2gr_w(out0, 4);
  703. dst[3] = __lasx_xvpickve2gr_w(out0, 6);
  704. dst += 4;
  705. filterPos += 4;
  706. filter += 16;
  707. }
  708. for (i = 0; i < res; i++) {
  709. int val = 0;
  710. const uint16_t *srcPos = src + filterPos[i];
  711. for (int j = 0; j < filterSize; j++) {
  712. val += ((int)srcPos[j]) * filter[j];
  713. }
  714. dst[i] = FFMIN(val >> sh, max);
  715. filter += 4;
  716. }
  717. } else if (filterSize > 8) {
  718. int filterlen = filterSize - 7;
  719. for (i = 0; i < len; i++) {
  720. __m256i src0, src1, src2, src3;
  721. __m256i filter0, filter1, filter2, filter3, out0, out1;
  722. __m256i out = zero;
  723. const uint16_t *srcPos1 = src + filterPos[0];
  724. const uint16_t *srcPos2 = src + filterPos[1];
  725. const uint16_t *srcPos3 = src + filterPos[2];
  726. const uint16_t *srcPos4 = src + filterPos[3];
  727. const int16_t *filterStart1 = filter;
  728. const int16_t *filterStart2 = filterStart1 + filterSize;
  729. const int16_t *filterStart3 = filterStart2 + filterSize;
  730. const int16_t *filterStart4 = filterStart3 + filterSize;
  731. int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
  732. for (j = 0; j < filterlen; j += 8) {
  733. SCALE_16
  734. }
  735. val1 = __lasx_xvpickve2gr_w(out, 0);
  736. val2 = __lasx_xvpickve2gr_w(out, 4);
  737. val3 = __lasx_xvpickve2gr_w(out, 2);
  738. val4 = __lasx_xvpickve2gr_w(out, 6);
  739. for (; j < filterSize; j++) {
  740. val1 += ((int)srcPos1[j]) * filterStart1[j];
  741. val2 += ((int)srcPos2[j]) * filterStart2[j];
  742. val3 += ((int)srcPos3[j]) * filterStart3[j];
  743. val4 += ((int)srcPos4[j]) * filterStart4[j];
  744. }
  745. dst[0] = FFMIN(val1 >> sh, max);
  746. dst[1] = FFMIN(val2 >> sh, max);
  747. dst[2] = FFMIN(val3 >> sh, max);
  748. dst[3] = FFMIN(val4 >> sh, max);
  749. dst += 4;
  750. filterPos += 4;
  751. filter = filterStart4 + filterSize;
  752. }
  753. for (i = 0; i < res; i++) {
  754. int j, val = 0;
  755. const uint16_t *srcPos = src + filterPos[i];
  756. __m256i src0, filter0, out0;
  757. for (j = 0; j < filterlen; j += 8) {
  758. int dex = j << 1;
  759. src0 = __lasx_xvldx(srcPos, dex);
  760. filter0 = __lasx_xvldx(filter, dex);
  761. out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
  762. out0 = __lasx_xvhaddw_d_w(out0, out0);
  763. out0 = __lasx_xvhaddw_q_d(out0, out0);
  764. val += __lasx_xvpickve2gr_w(out0, 0);
  765. }
  766. for (; j < filterSize; j++) {
  767. val += ((int)srcPos[j]) * filter[j];
  768. }
  769. dst[i] = FFMIN(val >> sh, max);
  770. filter += filterSize;
  771. }
  772. } else {
  773. for (i = 0; i < dstW; i++) {
  774. int val = 0;
  775. const uint16_t *srcPos = src + filterPos[i];
  776. for (int j = 0; j < filterSize; j++) {
  777. val += ((int)srcPos[j]) * filter[j];
  778. }
  779. dst[i] = FFMIN(val >> sh, max);
  780. filter += filterSize;
  781. }
  782. }
  783. }
  /**
   * Filter a row of 16-bit samples into 32-bit outputs capped at 19 bits.
   *
   * As spelled out by the scalar fallback at the bottom of the function, each
   * output i is
   *     min((sum over j < filterSize of src[filterPos[i] + j] * filter[i * filterSize + j]) >> sh,
   *         (1 << 19) - 1)
   * The filterSize == 8, == 4 and > 8 branches are LASX-vectorised variants
   * that handle four outputs per iteration plus a remainder loop; they are
   * presumably meant to give the same result as the scalar form.
   *
   * @param c          context; only c->srcFormat is read in this function
   * @param _dst       output buffer, written as dstW int32_t values
   * @param dstW       number of output samples to produce
   * @param _src       input samples, read as uint16_t
   * @param filter     coefficients, filterSize consecutive int16_t per output
   * @param filterPos  per-output index of the first source sample to read
   * @param filterSize number of taps per output sample
   */
  784. void ff_hscale_16_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
  785. const uint8_t *_src, const int16_t *filter,
  786. const int32_t *filterPos, int filterSize)
  787. {
  788. const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
  789. int i;
  /* The int16_t/uint8_t pointers of the signature are reinterpreted: results
   * are stored as int32_t and the source is read as uint16_t. */
  790. int32_t *dst = (int32_t *) _dst;
  791. const uint16_t *src = (const uint16_t *) _src;
  /* Default right shift is derived from the component depth; it is overridden
   * below for some formats. */
  792. int sh = desc->comp[0].depth - 5;
  /* Upper bound applied to every output (only an upper clamp is performed). */
  793. int max = (1 << 19) - 1;
  /* len: number of complete groups of four outputs; res: leftover outputs. */
  794. int len = dstW >> 2;
  795. int res = dstW & 3;
  796. __m256i shift;
  /* Used as the initial value of the vector accumulator in the > 8 branch. */
  797. __m256i zero = __lasx_xvldi(0);
  /* RGB or PAL8 sources with less than 16 bits of depth use a fixed shift of
   * 9; formats flagged as float use 11. */
  798. if ((isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8)
  799. && desc->comp[0].depth<16) {
  800. sh = 9;
  801. } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) {
  802. sh = 11;
  803. }
  /* Vector copy of sh for the vector shift in the SIMD paths. */
  804. shift = __lasx_xvreplgr2vr_w(sh);
  805. if (filterSize == 8) {
  /* v_max is not referenced by name in this branch; presumably it is consumed
   * inside the SCALE_8 macro (defined outside this view) - TODO confirm. */
  806. __m256i v_max = __lasx_xvreplgr2vr_w(max);
  /* Main loop: one SCALE_8 expansion per group of four outputs. The macro is
   * expected to advance dst, filterPos and filter itself, since the remainder
   * loop below indexes them from 0 again - TODO confirm against the macro. */
  807. for (i = 0; i < len; i++) {
  808. SCALE_8
  809. }
  /* Remainder: one output per iteration. */
  810. for (i = 0; i < res; i++) {
  811. int val = 0;
  812. __m256i src0, filter0, out0;
  /* NOTE(review): these are full 256-bit loads although filter only advances
   * by 8 coefficients per output; presumably the buffers are padded enough
   * for the extra bytes read - confirm. */
  813. src0 = __lasx_xvld(src + filterPos[i], 0);
  814. filter0 = __lasx_xvld(filter, 0);
  /* Multiply samples by coefficients, then widen/add horizontally; the sum
   * for this output is taken from 32-bit lane 0. */
  815. out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
  816. out0 = __lasx_xvhaddw_d_w(out0, out0);
  817. out0 = __lasx_xvhaddw_q_d(out0, out0);
  818. val = __lasx_xvpickve2gr_w(out0, 0);
  819. dst[i] = FFMIN(val >> sh, max);
  820. filter += 8;
  821. }
  822. } else if (filterSize == 4) {
  823. __m256i v_max = __lasx_xvreplgr2vr_w(max);
  /* Main loop: four outputs per iteration, 4 taps each (16 coefficients). */
  824. for (i = 0; i < len; i++) {
  825. __m256i src1, src2, src3, src4, src0, filter0, out0;
  /* Fetch the source window of each of the four outputs. */
  826. src1 = __lasx_xvldrepl_d(src + filterPos[0], 0);
  827. src2 = __lasx_xvldrepl_d(src + filterPos[1], 0);
  828. src3 = __lasx_xvldrepl_d(src + filterPos[2], 0);
  829. src4 = __lasx_xvldrepl_d(src + filterPos[3], 0);
  830. filter0 = __lasx_xvld(filter, 0);
  /* Merge the four windows into the single vector src0 so that it lines up
   * with the 16 coefficients in filter0. */
  831. src1 = __lasx_xvextrins_d(src1, src2, 0x10);
  832. src3 = __lasx_xvextrins_d(src3, src4, 0x10);
  833. src0 = __lasx_xvpermi_q(src1, src3, 0x02);
  834. out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
  835. out0 = __lasx_xvhaddw_d_w(out0, out0);
  /* Shift and clamp in the vector domain, then store the even 32-bit lanes
   * (0, 2, 4, 6) as the four results. */
  836. out0 = __lasx_xvsra_w(out0, shift);
  837. out0 = __lasx_xvmin_w(out0, v_max);
  838. dst[0] = __lasx_xvpickve2gr_w(out0, 0);
  839. dst[1] = __lasx_xvpickve2gr_w(out0, 2);
  840. dst[2] = __lasx_xvpickve2gr_w(out0, 4);
  841. dst[3] = __lasx_xvpickve2gr_w(out0, 6);
  842. dst += 4;
  843. filterPos += 4;
  844. filter += 16;
  845. }
  /* Remainder: plain scalar dot product for the last (dstW & 3) outputs. */
  846. for (i = 0; i < res; i++) {
  847. int val = 0;
  848. const uint16_t *srcPos = src + filterPos[i];
  849. for (int j = 0; j < filterSize; j++) {
  850. val += ((int)srcPos[j]) * filter[j];
  851. }
  852. dst[i] = FFMIN(val >> sh, max);
  853. filter += 4;
  854. }
  855. } else if (filterSize > 8) {
  /* j < filterlen is equivalent to j + 8 <= filterSize, i.e. the vector loops
   * below only run while a full group of eight taps remains. */
  856. int filterlen = filterSize - 7;
  /* Main loop: four outputs per iteration. */
  857. for (i = 0; i < len; i ++) {
  858. __m256i src0, src1, src2, src3;
  859. __m256i filter0, filter1, filter2, filter3, out0, out1;
  860. __m256i out = zero;
  /* Source window and coefficient row for each of the four outputs; the
   * coefficient rows are filterSize entries apart. */
  861. const uint16_t *srcPos1 = src + filterPos[0];
  862. const uint16_t *srcPos2 = src + filterPos[1];
  863. const uint16_t *srcPos3 = src + filterPos[2];
  864. const uint16_t *srcPos4 = src + filterPos[3];
  865. const int16_t *filterStart1 = filter;
  866. const int16_t *filterStart2 = filterStart1 + filterSize;
  867. const int16_t *filterStart3 = filterStart2 + filterSize;
  868. const int16_t *filterStart4 = filterStart3 + filterSize;
  869. int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
  /* Eight taps per step; SCALE_16 (defined outside this view) presumably
   * accumulates the partial sums of all four outputs into 'out'. */
  870. for (j = 0; j < filterlen; j += 8) {
  871. SCALE_16
  872. }
  /* Note the lane order 0, 4, 2, 6: which lane holds which output is
   * determined by SCALE_16 - do not reorder without checking the macro. */
  873. val1 = __lasx_xvpickve2gr_w(out, 0);
  874. val2 = __lasx_xvpickve2gr_w(out, 4);
  875. val3 = __lasx_xvpickve2gr_w(out, 2);
  876. val4 = __lasx_xvpickve2gr_w(out, 6);
  /* Scalar tail for the taps left over after the vector loop. */
  877. for (; j < filterSize; j++) {
  878. val1 += ((int)srcPos1[j]) * filterStart1[j];
  879. val2 += ((int)srcPos2[j]) * filterStart2[j];
  880. val3 += ((int)srcPos3[j]) * filterStart3[j];
  881. val4 += ((int)srcPos4[j]) * filterStart4[j];
  882. }
  883. dst[0] = FFMIN(val1 >> sh, max);
  884. dst[1] = FFMIN(val2 >> sh, max);
  885. dst[2] = FFMIN(val3 >> sh, max);
  886. dst[3] = FFMIN(val4 >> sh, max);
  887. dst += 4;
  888. filterPos += 4;
  /* Skip past the fourth coefficient row to the next group of outputs. */
  889. filter = filterStart4 + filterSize;
  890. }
  /* Remainder: one output per iteration, still eight taps per vector step. */
  891. for (i = 0; i < res; i++) {
  892. int j, val = 0;
  893. const uint16_t *srcPos = src + filterPos[i];
  894. __m256i src0, filter0, out0;
  895. for (j = 0; j < filterlen; j += 8) {
  /* j << 1: presumably the byte offset of tap j (2 bytes per 16-bit
   * element) expected by the indexed load - TODO confirm. */
  896. int dex = j << 1;
  897. src0 = __lasx_xvldx(srcPos, dex);
  898. filter0 = __lasx_xvldx(filter, dex);
  899. out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
  900. out0 = __lasx_xvhaddw_d_w(out0, out0);
  901. out0 = __lasx_xvhaddw_q_d(out0, out0);
  902. val += __lasx_xvpickve2gr_w(out0, 0);
  903. }
  /* Scalar tail for the remaining taps. */
  904. for (; j < filterSize; j++) {
  905. val += ((int)srcPos[j]) * filter[j];
  906. }
  907. dst[i] = FFMIN(val >> sh, max);
  908. filter += filterSize;
  909. }
  910. } else {
  /* Any other filterSize (not 4, not 8, not above 8): scalar reference
   * implementation over all dstW outputs. */
  911. for (i = 0; i < dstW; i++) {
  912. int val = 0;
  913. const uint16_t *srcPos = src + filterPos[i];
  914. for (int j = 0; j < filterSize; j++) {
  915. val += ((int)srcPos[j]) * filter[j];
  916. }
  917. dst[i] = FFMIN(val >> sh, max);
  918. filter += filterSize;
  919. }
  920. }
  921. }
  /* Remove the SCALE_8 and SCALE_16 macro definitions expanded by the scaling
   * loops above so that they are not visible to any code that follows. */
  922. #undef SCALE_8
  923. #undef SCALE_16