output_lasx.c 84 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999
  1. /*
  2. * Copyright (C) 2022 Loongson Technology Corporation Limited
  3. * Contributed by Hao Chen(chenhao@loongson.cn)
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "swscale_loongarch.h"
  22. #include "libavutil/loongarch/loongson_intrinsics.h"
  23. void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
  24. const int16_t **src, uint8_t *dest, int dstW,
  25. const uint8_t *dither, int offset)
  26. {
  27. int i;
  28. int len = dstW - 15;
  29. __m256i mask = {0x1C0C180814041000, 0x1C1814100C080400,
  30. 0x1C0C180814041000, 0x1C1814100C080400};
  31. __m256i val1, val2, val3;
  32. uint8_t dither0 = dither[offset & 7];
  33. uint8_t dither1 = dither[(offset + 1) & 7];
  34. uint8_t dither2 = dither[(offset + 2) & 7];
  35. uint8_t dither3 = dither[(offset + 3) & 7];
  36. uint8_t dither4 = dither[(offset + 4) & 7];
  37. uint8_t dither5 = dither[(offset + 5) & 7];
  38. uint8_t dither6 = dither[(offset + 6) & 7];
  39. uint8_t dither7 = dither[(offset + 7) & 7];
  40. int val_1[8] = {dither0, dither2, dither4, dither6,
  41. dither0, dither2, dither4, dither6};
  42. int val_2[8] = {dither1, dither3, dither5, dither7,
  43. dither1, dither3, dither5, dither7};
  44. int val_3[8] = {dither0, dither1, dither2, dither3,
  45. dither4, dither5, dither6, dither7};
  46. DUP2_ARG2(__lasx_xvld, val_1, 0, val_2, 0, val1, val2);
  47. val3 = __lasx_xvld(val_3, 0);
  48. for (i = 0; i < len; i += 16) {
  49. int j;
  50. __m256i src0, filter0, val;
  51. __m256i val_ev, val_od;
  52. val_ev = __lasx_xvslli_w(val1, 12);
  53. val_od = __lasx_xvslli_w(val2, 12);
  54. for (j = 0; j < filterSize; j++) {
  55. src0 = __lasx_xvld(src[j]+ i, 0);
  56. filter0 = __lasx_xvldrepl_h((filter + j), 0);
  57. val_ev = __lasx_xvmaddwev_w_h(val_ev, src0, filter0);
  58. val_od = __lasx_xvmaddwod_w_h(val_od, src0, filter0);
  59. }
  60. val_ev = __lasx_xvsrai_w(val_ev, 19);
  61. val_od = __lasx_xvsrai_w(val_od, 19);
  62. val_ev = __lasx_xvclip255_w(val_ev);
  63. val_od = __lasx_xvclip255_w(val_od);
  64. val = __lasx_xvshuf_b(val_od, val_ev, mask);
  65. __lasx_xvstelm_d(val, (dest + i), 0, 0);
  66. __lasx_xvstelm_d(val, (dest + i), 8, 2);
  67. }
  68. if (dstW - i >= 8){
  69. int j;
  70. __m256i src0, filter0, val_h;
  71. __m256i val_l;
  72. val_l = __lasx_xvslli_w(val3, 12);
  73. for (j = 0; j < filterSize; j++) {
  74. src0 = __lasx_xvld(src[j] + i, 0);
  75. src0 = __lasx_vext2xv_w_h(src0);
  76. filter0 = __lasx_xvldrepl_h((filter + j), 0);
  77. filter0 = __lasx_vext2xv_w_h(filter0);
  78. val_l = __lasx_xvmadd_w(val_l, src0, filter0);
  79. }
  80. val_l = __lasx_xvsrai_w(val_l, 19);
  81. val_l = __lasx_xvclip255_w(val_l);
  82. val_h = __lasx_xvpermi_d(val_l, 0x4E);
  83. val_l = __lasx_xvshuf_b(val_h, val_l, mask);
  84. __lasx_xvstelm_d(val_l, (dest + i), 0, 1);
  85. i += 8;
  86. }
  87. for (; i < dstW; i++) {
  88. int val = dither[(i + offset) & 7] << 12;
  89. int j;
  90. for (j = 0; j< filterSize; j++)
  91. val += src[j][i] * filter[j];
  92. dest[i] = av_clip_uint8(val >> 19);
  93. }
  94. }
  95. /*Copy from libswscale/output.c*/
  96. static av_always_inline void
  97. yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
  98. unsigned A1, unsigned A2,
  99. const void *_r, const void *_g, const void *_b, int y,
  100. enum AVPixelFormat target, int hasAlpha)
  101. {
  102. if (target == AV_PIX_FMT_ARGB || target == AV_PIX_FMT_RGBA ||
  103. target == AV_PIX_FMT_ABGR || target == AV_PIX_FMT_BGRA) {
  104. uint32_t *dest = (uint32_t *) _dest;
  105. const uint32_t *r = (const uint32_t *) _r;
  106. const uint32_t *g = (const uint32_t *) _g;
  107. const uint32_t *b = (const uint32_t *) _b;
  108. #if CONFIG_SMALL
  109. dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
  110. dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
  111. #else
  112. #if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
  113. int sh = (target == AV_PIX_FMT_RGB32_1 ||
  114. target == AV_PIX_FMT_BGR32_1) ? 0 : 24;
  115. av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0xFF);
  116. #endif
  117. dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
  118. dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
  119. #endif
  120. } else if (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) {
  121. uint8_t *dest = (uint8_t *) _dest;
  122. const uint8_t *r = (const uint8_t *) _r;
  123. const uint8_t *g = (const uint8_t *) _g;
  124. const uint8_t *b = (const uint8_t *) _b;
  125. #define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
  126. #define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
  127. dest[i * 6 + 0] = r_b[Y1];
  128. dest[i * 6 + 1] = g[Y1];
  129. dest[i * 6 + 2] = b_r[Y1];
  130. dest[i * 6 + 3] = r_b[Y2];
  131. dest[i * 6 + 4] = g[Y2];
  132. dest[i * 6 + 5] = b_r[Y2];
  133. #undef r_b
  134. #undef b_r
  135. } else if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565 ||
  136. target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555 ||
  137. target == AV_PIX_FMT_RGB444 || target == AV_PIX_FMT_BGR444) {
  138. uint16_t *dest = (uint16_t *) _dest;
  139. const uint16_t *r = (const uint16_t *) _r;
  140. const uint16_t *g = (const uint16_t *) _g;
  141. const uint16_t *b = (const uint16_t *) _b;
  142. int dr1, dg1, db1, dr2, dg2, db2;
  143. if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565) {
  144. dr1 = ff_dither_2x2_8[ y & 1 ][0];
  145. dg1 = ff_dither_2x2_4[ y & 1 ][0];
  146. db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
  147. dr2 = ff_dither_2x2_8[ y & 1 ][1];
  148. dg2 = ff_dither_2x2_4[ y & 1 ][1];
  149. db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
  150. } else if (target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555) {
  151. dr1 = ff_dither_2x2_8[ y & 1 ][0];
  152. dg1 = ff_dither_2x2_8[ y & 1 ][1];
  153. db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
  154. dr2 = ff_dither_2x2_8[ y & 1 ][1];
  155. dg2 = ff_dither_2x2_8[ y & 1 ][0];
  156. db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
  157. } else {
  158. dr1 = ff_dither_4x4_16[ y & 3 ][0];
  159. dg1 = ff_dither_4x4_16[ y & 3 ][1];
  160. db1 = ff_dither_4x4_16[(y & 3) ^ 3][0];
  161. dr2 = ff_dither_4x4_16[ y & 3 ][1];
  162. dg2 = ff_dither_4x4_16[ y & 3 ][0];
  163. db2 = ff_dither_4x4_16[(y & 3) ^ 3][1];
  164. }
  165. dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
  166. dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
  167. } else /* 8/4 bits */ {
  168. uint8_t *dest = (uint8_t *) _dest;
  169. const uint8_t *r = (const uint8_t *) _r;
  170. const uint8_t *g = (const uint8_t *) _g;
  171. const uint8_t *b = (const uint8_t *) _b;
  172. int dr1, dg1, db1, dr2, dg2, db2;
  173. if (target == AV_PIX_FMT_RGB8 || target == AV_PIX_FMT_BGR8) {
  174. const uint8_t * const d64 = ff_dither_8x8_73[y & 7];
  175. const uint8_t * const d32 = ff_dither_8x8_32[y & 7];
  176. dr1 = dg1 = d32[(i * 2 + 0) & 7];
  177. db1 = d64[(i * 2 + 0) & 7];
  178. dr2 = dg2 = d32[(i * 2 + 1) & 7];
  179. db2 = d64[(i * 2 + 1) & 7];
  180. } else {
  181. const uint8_t * const d64 = ff_dither_8x8_73 [y & 7];
  182. const uint8_t * const d128 = ff_dither_8x8_220[y & 7];
  183. dr1 = db1 = d128[(i * 2 + 0) & 7];
  184. dg1 = d64[(i * 2 + 0) & 7];
  185. dr2 = db2 = d128[(i * 2 + 1) & 7];
  186. dg2 = d64[(i * 2 + 1) & 7];
  187. }
  188. if (target == AV_PIX_FMT_RGB4 || target == AV_PIX_FMT_BGR4) {
  189. dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
  190. ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
  191. } else {
  192. dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
  193. dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
  194. }
  195. }
  196. }
  197. #define WRITE_YUV2RGB(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4) \
  198. { \
  199. Y1 = __lasx_xvpickve2gr_w(vec_y1, t1); \
  200. Y2 = __lasx_xvpickve2gr_w(vec_y2, t2); \
  201. U = __lasx_xvpickve2gr_w(vec_u, t3); \
  202. V = __lasx_xvpickve2gr_w(vec_v, t4); \
  203. r = c->table_rV[V]; \
  204. g = (c->table_gU[U] + c->table_gV[V]); \
  205. b = c->table_bU[U]; \
  206. yuv2rgb_write(dest, count, Y1, Y2, 0, 0, \
  207. r, g, b, y, target, 0); \
  208. count++; \
  209. }
  210. static void
  211. yuv2rgb_X_template_lasx(SwsInternal *c, const int16_t *lumFilter,
  212. const int16_t **lumSrc, int lumFilterSize,
  213. const int16_t *chrFilter, const int16_t **chrUSrc,
  214. const int16_t **chrVSrc, int chrFilterSize,
  215. const int16_t **alpSrc, uint8_t *dest, int dstW,
  216. int y, enum AVPixelFormat target, int hasAlpha)
  217. {
  218. int i, j;
  219. int count = 0;
  220. int t = 1 << 18;
  221. int len = dstW >> 6;
  222. int res = dstW & 63;
  223. int len_count = (dstW + 1) >> 1;
  224. const void *r, *g, *b;
  225. int head = YUVRGB_TABLE_HEADROOM;
  226. __m256i headroom = __lasx_xvreplgr2vr_w(head);
  227. for (i = 0; i < len; i++) {
  228. int Y1, Y2, U, V, count_lum = count << 1;
  229. __m256i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
  230. __m256i yl1_ev, yl1_od, yh1_ev, yh1_od, yl2_ev, yl2_od, yh2_ev, yh2_od;
  231. __m256i u1_ev, u1_od, v1_ev, v1_od, u2_ev, u2_od, v2_ev, v2_od, temp;
  232. yl1_ev = __lasx_xvldrepl_w(&t, 0);
  233. yl1_od = yl1_ev;
  234. yh1_ev = yl1_ev;
  235. yh1_od = yl1_ev;
  236. u1_ev = yl1_ev;
  237. v1_ev = yl1_ev;
  238. u1_od = yl1_ev;
  239. v1_od = yl1_ev;
  240. yl2_ev = yl1_ev;
  241. yl2_od = yl1_ev;
  242. yh2_ev = yl1_ev;
  243. yh2_od = yl1_ev;
  244. u2_ev = yl1_ev;
  245. v2_ev = yl1_ev;
  246. u2_od = yl1_ev;
  247. v2_od = yl1_ev;
  248. for (j = 0; j < lumFilterSize; j++) {
  249. const int16_t *src_lum = lumSrc[j] + count_lum;
  250. temp = __lasx_xvldrepl_h((lumFilter + j), 0);
  251. DUP4_ARG2(__lasx_xvld, src_lum, 0, src_lum, 32, src_lum, 64,
  252. src_lum, 96, l_src1, l_src2, l_src3, l_src4);
  253. yl1_ev = __lasx_xvmaddwev_w_h(yl1_ev, temp, l_src1);
  254. yl1_od = __lasx_xvmaddwod_w_h(yl1_od, temp, l_src1);
  255. yh1_ev = __lasx_xvmaddwev_w_h(yh1_ev, temp, l_src2);
  256. yh1_od = __lasx_xvmaddwod_w_h(yh1_od, temp, l_src2);
  257. yl2_ev = __lasx_xvmaddwev_w_h(yl2_ev, temp, l_src3);
  258. yl2_od = __lasx_xvmaddwod_w_h(yl2_od, temp, l_src3);
  259. yh2_ev = __lasx_xvmaddwev_w_h(yh2_ev, temp, l_src4);
  260. yh2_od = __lasx_xvmaddwod_w_h(yh2_od, temp, l_src4);
  261. }
  262. for (j = 0; j < chrFilterSize; j++) {
  263. DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrUSrc[j] + count, 32,
  264. u_src1, u_src2);
  265. DUP2_ARG2(__lasx_xvld, chrVSrc[j] + count, 0, chrVSrc[j] + count, 32,
  266. v_src1, v_src2);
  267. temp = __lasx_xvldrepl_h((chrFilter + j), 0);
  268. u1_ev = __lasx_xvmaddwev_w_h(u1_ev, temp, u_src1);
  269. u1_od = __lasx_xvmaddwod_w_h(u1_od, temp, u_src1);
  270. v1_ev = __lasx_xvmaddwev_w_h(v1_ev, temp, v_src1);
  271. v1_od = __lasx_xvmaddwod_w_h(v1_od, temp, v_src1);
  272. u2_ev = __lasx_xvmaddwev_w_h(u2_ev, temp, u_src2);
  273. u2_od = __lasx_xvmaddwod_w_h(u2_od, temp, u_src2);
  274. v2_ev = __lasx_xvmaddwev_w_h(v2_ev, temp, v_src2);
  275. v2_od = __lasx_xvmaddwod_w_h(v2_od, temp, v_src2);
  276. }
  277. yl1_ev = __lasx_xvsrai_w(yl1_ev, 19);
  278. yh1_ev = __lasx_xvsrai_w(yh1_ev, 19);
  279. yl1_od = __lasx_xvsrai_w(yl1_od, 19);
  280. yh1_od = __lasx_xvsrai_w(yh1_od, 19);
  281. u1_ev = __lasx_xvsrai_w(u1_ev, 19);
  282. v1_ev = __lasx_xvsrai_w(v1_ev, 19);
  283. u1_od = __lasx_xvsrai_w(u1_od, 19);
  284. v1_od = __lasx_xvsrai_w(v1_od, 19);
  285. yl2_ev = __lasx_xvsrai_w(yl2_ev, 19);
  286. yh2_ev = __lasx_xvsrai_w(yh2_ev, 19);
  287. yl2_od = __lasx_xvsrai_w(yl2_od, 19);
  288. yh2_od = __lasx_xvsrai_w(yh2_od, 19);
  289. u2_ev = __lasx_xvsrai_w(u2_ev, 19);
  290. v2_ev = __lasx_xvsrai_w(v2_ev, 19);
  291. u2_od = __lasx_xvsrai_w(u2_od, 19);
  292. v2_od = __lasx_xvsrai_w(v2_od, 19);
  293. u1_ev = __lasx_xvadd_w(u1_ev, headroom);
  294. v1_ev = __lasx_xvadd_w(v1_ev, headroom);
  295. u1_od = __lasx_xvadd_w(u1_od, headroom);
  296. v1_od = __lasx_xvadd_w(v1_od, headroom);
  297. u2_ev = __lasx_xvadd_w(u2_ev, headroom);
  298. v2_ev = __lasx_xvadd_w(v2_ev, headroom);
  299. u2_od = __lasx_xvadd_w(u2_od, headroom);
  300. v2_od = __lasx_xvadd_w(v2_od, headroom);
  301. WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 0, 0, 0, 0);
  302. WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 1, 1, 0, 0);
  303. WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 2, 2, 1, 1);
  304. WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 3, 3, 1, 1);
  305. WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 4, 4, 2, 2);
  306. WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 5, 5, 2, 2);
  307. WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 6, 6, 3, 3);
  308. WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 7, 7, 3, 3);
  309. WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 0, 0, 4, 4);
  310. WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 1, 1, 4, 4);
  311. WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 2, 2, 5, 5);
  312. WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 3, 3, 5, 5);
  313. WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 4, 4, 6, 6);
  314. WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 5, 5, 6, 6);
  315. WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 6, 6, 7, 7);
  316. WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 7, 7, 7, 7);
  317. WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 0, 0, 0, 0);
  318. WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 1, 1, 0, 0);
  319. WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 2, 2, 1, 1);
  320. WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 3, 3, 1, 1);
  321. WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 4, 4, 2, 2);
  322. WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 5, 5, 2, 2);
  323. WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 6, 6, 3, 3);
  324. WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 7, 7, 3, 3);
  325. WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 0, 0, 4, 4);
  326. WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 1, 1, 4, 4);
  327. WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 2, 2, 5, 5);
  328. WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 3, 3, 5, 5);
  329. WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 4, 4, 6, 6);
  330. WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 5, 5, 6, 6);
  331. WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 6, 6, 7, 7);
  332. WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 7, 7, 7, 7);
  333. }
  334. if (res >= 32) {
  335. int Y1, Y2, U, V, count_lum = count << 1;
  336. __m256i l_src1, l_src2, u_src, v_src;
  337. __m256i yl_ev, yl_od, yh_ev, yh_od;
  338. __m256i u_ev, u_od, v_ev, v_od, temp;
  339. yl_ev = __lasx_xvldrepl_w(&t, 0);
  340. yl_od = yl_ev;
  341. yh_ev = yl_ev;
  342. yh_od = yl_ev;
  343. u_ev = yl_ev;
  344. v_ev = yl_ev;
  345. u_od = yl_ev;
  346. v_od = yl_ev;
  347. for (j = 0; j < lumFilterSize; j++) {
  348. temp = __lasx_xvldrepl_h((lumFilter + j), 0);
  349. DUP2_ARG2(__lasx_xvld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
  350. 32, l_src1, l_src2);
  351. yl_ev = __lasx_xvmaddwev_w_h(yl_ev, temp, l_src1);
  352. yl_od = __lasx_xvmaddwod_w_h(yl_od, temp, l_src1);
  353. yh_ev = __lasx_xvmaddwev_w_h(yh_ev, temp, l_src2);
  354. yh_od = __lasx_xvmaddwod_w_h(yh_od, temp, l_src2);
  355. }
  356. for (j = 0; j < chrFilterSize; j++) {
  357. DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
  358. u_src, v_src);
  359. temp = __lasx_xvldrepl_h((chrFilter + j), 0);
  360. u_ev = __lasx_xvmaddwev_w_h(u_ev, temp, u_src);
  361. u_od = __lasx_xvmaddwod_w_h(u_od, temp, u_src);
  362. v_ev = __lasx_xvmaddwev_w_h(v_ev, temp, v_src);
  363. v_od = __lasx_xvmaddwod_w_h(v_od, temp, v_src);
  364. }
  365. yl_ev = __lasx_xvsrai_w(yl_ev, 19);
  366. yh_ev = __lasx_xvsrai_w(yh_ev, 19);
  367. yl_od = __lasx_xvsrai_w(yl_od, 19);
  368. yh_od = __lasx_xvsrai_w(yh_od, 19);
  369. u_ev = __lasx_xvsrai_w(u_ev, 19);
  370. v_ev = __lasx_xvsrai_w(v_ev, 19);
  371. u_od = __lasx_xvsrai_w(u_od, 19);
  372. v_od = __lasx_xvsrai_w(v_od, 19);
  373. u_ev = __lasx_xvadd_w(u_ev, headroom);
  374. v_ev = __lasx_xvadd_w(v_ev, headroom);
  375. u_od = __lasx_xvadd_w(u_od, headroom);
  376. v_od = __lasx_xvadd_w(v_od, headroom);
  377. WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
  378. WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0);
  379. WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 2, 2, 1, 1);
  380. WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 3, 3, 1, 1);
  381. WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 4, 4, 2, 2);
  382. WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 5, 5, 2, 2);
  383. WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 6, 6, 3, 3);
  384. WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 7, 7, 3, 3);
  385. WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 0, 0, 4, 4);
  386. WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 1, 1, 4, 4);
  387. WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 2, 2, 5, 5);
  388. WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 3, 3, 5, 5);
  389. WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 4, 4, 6, 6);
  390. WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 5, 5, 6, 6);
  391. WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 6, 6, 7, 7);
  392. WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 7, 7, 7, 7);
  393. res -= 32;
  394. }
  395. if (res >= 16) {
  396. int Y1, Y2, U, V;
  397. int count_lum = count << 1;
  398. __m256i l_src, u_src, v_src;
  399. __m256i y_ev, y_od, u, v, temp;
  400. y_ev = __lasx_xvldrepl_w(&t, 0);
  401. y_od = y_ev;
  402. u = y_ev;
  403. v = y_ev;
  404. for (j = 0; j < lumFilterSize; j++) {
  405. temp = __lasx_xvldrepl_h((lumFilter + j), 0);
  406. l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
  407. y_ev = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
  408. y_od = __lasx_xvmaddwod_w_h(y_od, temp, l_src);
  409. }
  410. for (j = 0; j < chrFilterSize; j++) {
  411. DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count,
  412. 0, u_src, v_src);
  413. temp = __lasx_xvldrepl_h((chrFilter + j), 0);
  414. u_src = __lasx_vext2xv_w_h(u_src);
  415. v_src = __lasx_vext2xv_w_h(v_src);
  416. u = __lasx_xvmaddwev_w_h(u, temp, u_src);
  417. v = __lasx_xvmaddwev_w_h(v, temp, v_src);
  418. }
  419. y_ev = __lasx_xvsrai_w(y_ev, 19);
  420. y_od = __lasx_xvsrai_w(y_od, 19);
  421. u = __lasx_xvsrai_w(u, 19);
  422. v = __lasx_xvsrai_w(v, 19);
  423. u = __lasx_xvadd_w(u, headroom);
  424. v = __lasx_xvadd_w(v, headroom);
  425. WRITE_YUV2RGB(y_ev, y_od, u, v, 0, 0, 0, 0);
  426. WRITE_YUV2RGB(y_ev, y_od, u, v, 1, 1, 1, 1);
  427. WRITE_YUV2RGB(y_ev, y_od, u, v, 2, 2, 2, 2);
  428. WRITE_YUV2RGB(y_ev, y_od, u, v, 3, 3, 3, 3);
  429. WRITE_YUV2RGB(y_ev, y_od, u, v, 4, 4, 4, 4);
  430. WRITE_YUV2RGB(y_ev, y_od, u, v, 5, 5, 5, 5);
  431. WRITE_YUV2RGB(y_ev, y_od, u, v, 6, 6, 6, 6);
  432. WRITE_YUV2RGB(y_ev, y_od, u, v, 7, 7, 7, 7);
  433. res -= 16;
  434. }
  435. if (res >= 8) {
  436. int Y1, Y2, U, V;
  437. int count_lum = count << 1;
  438. __m256i l_src, u_src, v_src;
  439. __m256i y_ev, uv, temp;
  440. y_ev = __lasx_xvldrepl_w(&t, 0);
  441. uv = y_ev;
  442. for (j = 0; j < lumFilterSize; j++) {
  443. temp = __lasx_xvldrepl_h((lumFilter + j), 0);
  444. l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
  445. l_src = __lasx_vext2xv_w_h(l_src);
  446. y_ev = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
  447. }
  448. for (j = 0; j < chrFilterSize; j++) {
  449. u_src = __lasx_xvldrepl_d((chrUSrc[j] + count), 0);
  450. v_src = __lasx_xvldrepl_d((chrVSrc[j] + count), 0);
  451. temp = __lasx_xvldrepl_h((chrFilter + j), 0);
  452. u_src = __lasx_xvilvl_d(v_src, u_src);
  453. u_src = __lasx_vext2xv_w_h(u_src);
  454. uv = __lasx_xvmaddwev_w_h(uv, temp, u_src);
  455. }
  456. y_ev = __lasx_xvsrai_w(y_ev, 19);
  457. uv = __lasx_xvsrai_w(uv, 19);
  458. uv = __lasx_xvadd_w(uv, headroom);
  459. WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 0, 1, 0, 4);
  460. WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 2, 3, 1, 5);
  461. WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 4, 5, 2, 6);
  462. WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 6, 7, 3, 7);
  463. }
  464. for (; count < len_count; count++) {
  465. int Y1 = 1 << 18;
  466. int Y2 = Y1;
  467. int U = Y1;
  468. int V = Y1;
  469. for (j = 0; j < lumFilterSize; j++) {
  470. Y1 += lumSrc[j][count * 2] * lumFilter[j];
  471. Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
  472. }
  473. for (j = 0; j < chrFilterSize; j++) {
  474. U += chrUSrc[j][count] * chrFilter[j];
  475. V += chrVSrc[j][count] * chrFilter[j];
  476. }
  477. Y1 >>= 19;
  478. Y2 >>= 19;
  479. U >>= 19;
  480. V >>= 19;
  481. r = c->table_rV[V + YUVRGB_TABLE_HEADROOM];
  482. g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
  483. c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
  484. b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
  485. yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
  486. r, g, b, y, target, 0);
  487. }
  488. }
  489. static void
  490. yuv2rgb_2_template_lasx(SwsInternal *c, const int16_t *buf[2],
  491. const int16_t *ubuf[2], const int16_t *vbuf[2],
  492. const int16_t *abuf[2], uint8_t *dest, int dstW,
  493. int yalpha, int uvalpha, int y,
  494. enum AVPixelFormat target, int hasAlpha)
  495. {
  496. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  497. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
  498. *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
  499. int yalpha1 = 4096 - yalpha;
  500. int uvalpha1 = 4096 - uvalpha;
  501. int i, count = 0;
  502. int len = dstW - 15;
  503. int len_count = (dstW + 1) >> 1;
  504. const void *r, *g, *b;
  505. int head = YUVRGB_TABLE_HEADROOM;
  506. __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
  507. __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
  508. __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
  509. __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
  510. __m256i headroom = __lasx_xvreplgr2vr_w(head);
  511. for (i = 0; i < len; i += 16) {
  512. int Y1, Y2, U, V;
  513. int i_dex = i << 1;
  514. int c_dex = count << 1;
  515. __m256i y0_h, y0_l, y0, u0, v0;
  516. __m256i y1_h, y1_l, y1, u1, v1;
  517. __m256i y_l, y_h, u, v;
  518. DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
  519. buf1, i_dex, y0, u0, v0, y1);
  520. DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
  521. DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
  522. DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h);
  523. DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
  524. y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
  525. y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
  526. u0 = __lasx_xvmul_w(u0, v_uvalpha1);
  527. v0 = __lasx_xvmul_w(v0, v_uvalpha1);
  528. y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
  529. y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
  530. u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
  531. v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
  532. y_l = __lasx_xvsrai_w(y_l, 19);
  533. y_h = __lasx_xvsrai_w(y_h, 19);
  534. u = __lasx_xvsrai_w(u, 19);
  535. v = __lasx_xvsrai_w(v, 19);
  536. u = __lasx_xvadd_w(u, headroom);
  537. v = __lasx_xvadd_w(v, headroom);
  538. WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
  539. WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
  540. WRITE_YUV2RGB(y_h, y_h, u, v, 0, 1, 2, 2);
  541. WRITE_YUV2RGB(y_h, y_h, u, v, 2, 3, 3, 3);
  542. WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 4, 4);
  543. WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 5, 5);
  544. WRITE_YUV2RGB(y_h, y_h, u, v, 4, 5, 6, 6);
  545. WRITE_YUV2RGB(y_h, y_h, u, v, 6, 7, 7, 7);
  546. }
  547. if (dstW - i >= 8) {
  548. int Y1, Y2, U, V;
  549. int i_dex = i << 1;
  550. __m256i y0_l, y0, u0, v0;
  551. __m256i y1_l, y1, u1, v1;
  552. __m256i y_l, u, v;
  553. y0 = __lasx_xvldx(buf0, i_dex);
  554. u0 = __lasx_xvldrepl_d((ubuf0 + count), 0);
  555. v0 = __lasx_xvldrepl_d((vbuf0 + count), 0);
  556. y1 = __lasx_xvldx(buf1, i_dex);
  557. u1 = __lasx_xvldrepl_d((ubuf1 + count), 0);
  558. v1 = __lasx_xvldrepl_d((vbuf1 + count), 0);
  559. DUP2_ARG1(__lasx_vext2xv_w_h, y0, y1, y0_l, y1_l);
  560. DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
  561. y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
  562. u0 = __lasx_xvmul_w(u0, v_uvalpha1);
  563. v0 = __lasx_xvmul_w(v0, v_uvalpha1);
  564. y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
  565. u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
  566. v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
  567. y_l = __lasx_xvsrai_w(y_l, 19);
  568. u = __lasx_xvsrai_w(u, 19);
  569. v = __lasx_xvsrai_w(v, 19);
  570. u = __lasx_xvadd_w(u, headroom);
  571. v = __lasx_xvadd_w(v, headroom);
  572. WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
  573. WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
  574. WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 2, 2);
  575. WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 3, 3);
  576. i += 8;
  577. }
  578. for (; count < len_count; count++) {
  579. int Y1 = (buf0[count * 2] * yalpha1 +
  580. buf1[count * 2] * yalpha) >> 19;
  581. int Y2 = (buf0[count * 2 + 1] * yalpha1 +
  582. buf1[count * 2 + 1] * yalpha) >> 19;
  583. int U = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
  584. int V = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;
  585. r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
  586. g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
  587. c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
  588. b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
  589. yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
  590. r, g, b, y, target, 0);
  591. }
  592. }
  /*
   * Single-luma-line YUV -> RGB output.  Luma comes from buf0 only; the RGB
   * components are looked up in c->table_rV / table_gU / table_gV / table_bU
   * and written through yuv2rgb_write().
   *
   *   uvalpha <  2048 : chroma is read from ubuf[0] / vbuf[0] only; the scalar
   *                     tail computes (x + 64) >> 7 for Y, U and V.
   *   uvalpha >= 2048 : chroma is the sum of both chroma lines; the scalar
   *                     tail computes (u0 + u1 + 128) >> 8 (same for V), while
   *                     Y is still (x + 64) >> 7.
   *
   * Each branch has three stages: a loop in which i advances by 16 luma
   * samples, one optional step for 8 more samples, and a scalar tail that
   * runs `count` up to len_count = (dstW + 1) >> 1.  `count` indexes pairs of
   * luma samples (buf0[count * 2], buf0[count * 2 + 1]) that share one U/V.
   *
   * abuf0 and hasAlpha are not referenced anywhere in this function body.
   *
   * NOTE(review): no statement visible in the vector stages modifies `count`,
   * yet it is used for the chroma load offset and as the start of the scalar
   * tail.  WRITE_YUV2RGB (defined outside this excerpt) presumably advances
   * it once per invocation -- confirm against the macro definition.
   */
  593. static void
  594. yuv2rgb_1_template_lasx(SwsInternal *c, const int16_t *buf0,
  595. const int16_t *ubuf[2], const int16_t *vbuf[2],
  596. const int16_t *abuf0, uint8_t *dest, int dstW,
  597. int uvalpha, int y, enum AVPixelFormat target,
  598. int hasAlpha)
  599. {
  600. const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
  601. int i;
  602. int len = (dstW - 15);
  603. int len_count = (dstW + 1) >> 1;
  604. const void *r, *g, *b;
  /* Branch 1: chroma taken from the first chroma line only. */
  605. if (uvalpha < 2048) {
  606. int count = 0;
  607. int head = YUVRGB_TABLE_HEADROOM;
  608. __m256i headroom = __lasx_xvreplgr2vr_h(head);
  /* Vector stage: i advances by 16 luma samples per iteration. */
  609. for (i = 0; i < len; i += 16) {
  610. int Y1, Y2, U, V;
  611. int i_dex = i << 1;
  612. int c_dex = count << 1;
  613. __m256i src_y, src_u, src_v;
  614. __m256i u, v, y_l, y_h;
  615. DUP2_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, src_y, src_u);
  616. src_v = __lasx_xvldx(vbuf0, c_dex);
  /* From here on src_u carries both the U and the V samples. */
  617. src_u = __lasx_xvpermi_q(src_u, src_v, 0x02);
  618. src_y = __lasx_xvsrari_h(src_y, 7);
  619. src_u = __lasx_xvsrari_h(src_u, 7);
  620. y_l = __lasx_xvsllwil_w_h(src_y, 0);
  621. y_h = __lasx_xvexth_w_h(src_y);
  /* YUVRGB_TABLE_HEADROOM is added here, before the table lookups. */
  622. u = __lasx_xvaddwev_w_h(src_u, headroom);
  623. v = __lasx_xvaddwod_w_h(src_u, headroom);
  624. WRITE_YUV2RGB(y_l, y_l, u, u, 0, 1, 0, 4);
  625. WRITE_YUV2RGB(y_l, y_l, v, v, 2, 3, 0, 4);
  626. WRITE_YUV2RGB(y_h, y_h, u, u, 0, 1, 1, 5);
  627. WRITE_YUV2RGB(y_h, y_h, v, v, 2, 3, 1, 5);
  628. WRITE_YUV2RGB(y_l, y_l, u, u, 4, 5, 2, 6);
  629. WRITE_YUV2RGB(y_l, y_l, v, v, 6, 7, 2, 6);
  630. WRITE_YUV2RGB(y_h, y_h, u, u, 4, 5, 3, 7);
  631. WRITE_YUV2RGB(y_h, y_h, v, v, 6, 7, 3, 7);
  632. }
  /* Optional stage: executed at most once, when 8 or more samples remain. */
  633. if (dstW - i >= 8){
  634. int Y1, Y2, U, V;
  635. int i_dex = i << 1;
  636. __m256i src_y, src_u, src_v;
  637. __m256i y_l, uv;
  638. src_y = __lasx_xvldx(buf0, i_dex);
  639. src_u = __lasx_xvldrepl_d((ubuf0 + count), 0);
  640. src_v = __lasx_xvldrepl_d((vbuf0 + count), 0);
  641. src_u = __lasx_xvilvl_d(src_v, src_u);
  642. y_l = __lasx_xvsrari_h(src_y, 7);
  643. uv = __lasx_xvsrari_h(src_u, 7);
  644. y_l = __lasx_vext2xv_w_h(y_l);
  645. uv = __lasx_vext2xv_w_h(uv);
  646. uv = __lasx_xvaddwev_w_h(uv, headroom);
  647. WRITE_YUV2RGB(y_l, y_l, uv, uv, 0, 1, 0, 4);
  648. WRITE_YUV2RGB(y_l, y_l, uv, uv, 2, 3, 1, 5);
  649. WRITE_YUV2RGB(y_l, y_l, uv, uv, 4, 5, 2, 6);
  650. WRITE_YUV2RGB(y_l, y_l, uv, uv, 6, 7, 3, 7);
  651. i += 8;
  652. }
  /* Scalar tail: one pair of luma samples per iteration, indexed by count. */
  653. for (; count < len_count; count++) {
  654. int Y1 = (buf0[count * 2 ] + 64) >> 7;
  655. int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
  656. int U = (ubuf0[count] + 64) >> 7;
  657. int V = (vbuf0[count] + 64) >> 7;
  658. r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
  659. g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
  660. c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
  661. b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
  662. yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
  663. r, g, b, y, target, 0);
  664. }
  /* Branch 2: chroma is the sum of both chroma lines, rounded and >> 8. */
  665. } else {
  666. const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
  667. int count = 0;
  668. int HEADROOM = YUVRGB_TABLE_HEADROOM;
  669. __m256i headroom = __lasx_xvreplgr2vr_w(HEADROOM);
  /* Vector stage: i advances by 16 luma samples per iteration. */
  670. for (i = 0; i < len; i += 16) {
  671. int Y1, Y2, U, V;
  672. int i_dex = i << 1;
  673. int c_dex = count << 1;
  674. __m256i src_y, src_u0, src_v0, src_u1, src_v1;
  675. __m256i y_l, y_h, u, v;
  676. DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
  677. ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
  678. src_v1 = __lasx_xvldx(vbuf1, c_dex);
  679. src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
  680. src_u1 = __lasx_xvpermi_q(src_u1, src_v1, 0x02);
  681. src_y = __lasx_xvsrari_h(src_y, 7);
  /* Line 0 + line 1 chroma, then the rounding shift by 8 further down. */
  682. u = __lasx_xvaddwev_w_h(src_u0, src_u1);
  683. v = __lasx_xvaddwod_w_h(src_u0, src_u1);
  684. y_l = __lasx_xvsllwil_w_h(src_y, 0);
  685. y_h = __lasx_xvexth_w_h(src_y);
  686. u = __lasx_xvsrari_w(u, 8);
  687. v = __lasx_xvsrari_w(v, 8);
  688. u = __lasx_xvadd_w(u, headroom);
  689. v = __lasx_xvadd_w(v, headroom);
  690. WRITE_YUV2RGB(y_l, y_l, u, u, 0, 1, 0, 4);
  691. WRITE_YUV2RGB(y_l, y_l, v, v, 2, 3, 0, 4);
  692. WRITE_YUV2RGB(y_h, y_h, u, u, 0, 1, 1, 5);
  693. WRITE_YUV2RGB(y_h, y_h, v, v, 2, 3, 1, 5);
  694. WRITE_YUV2RGB(y_l, y_l, u, u, 4, 5, 2, 6);
  695. WRITE_YUV2RGB(y_l, y_l, v, v, 6, 7, 2, 6);
  696. WRITE_YUV2RGB(y_h, y_h, u, u, 4, 5, 3, 7);
  697. WRITE_YUV2RGB(y_h, y_h, v, v, 6, 7, 3, 7);
  698. }
  /* Optional stage: executed at most once, when 8 or more samples remain. */
  699. if (dstW - i >= 8) {
  700. int Y1, Y2, U, V;
  701. int i_dex = i << 1;
  702. __m256i src_y, src_u0, src_v0, src_u1, src_v1;
  703. __m256i uv;
  704. src_y = __lasx_xvldx(buf0, i_dex);
  705. src_u0 = __lasx_xvldrepl_d((ubuf0 + count), 0);
  706. src_v0 = __lasx_xvldrepl_d((vbuf0 + count), 0);
  707. src_u1 = __lasx_xvldrepl_d((ubuf1 + count), 0);
  708. src_v1 = __lasx_xvldrepl_d((vbuf1 + count), 0);
  709. src_u0 = __lasx_xvilvl_h(src_u1, src_u0);
  710. src_v0 = __lasx_xvilvl_h(src_v1, src_v0);
  711. src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
  712. src_y = __lasx_xvsrari_h(src_y, 7);
  713. uv = __lasx_xvhaddw_w_h(src_u0, src_u0);
  714. src_y = __lasx_vext2xv_w_h(src_y);
  715. uv = __lasx_xvsrari_w(uv, 8);
  716. uv = __lasx_xvadd_w(uv, headroom);
  717. WRITE_YUV2RGB(src_y, src_y, uv, uv, 0, 1, 0, 4);
  718. WRITE_YUV2RGB(src_y, src_y, uv, uv, 2, 3, 1, 5);
  719. WRITE_YUV2RGB(src_y, src_y, uv, uv, 4, 5, 2, 6);
  720. WRITE_YUV2RGB(src_y, src_y, uv, uv, 6, 7, 3, 7);
  721. i += 8;
  722. }
  /* Scalar tail: one pair of luma samples per iteration, indexed by count. */
  723. for (; count < len_count; count++) {
  724. int Y1 = (buf0[count * 2 ] + 64) >> 7;
  725. int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
  726. int U = (ubuf0[count] + ubuf1[count] + 128) >> 8;
  727. int V = (vbuf0[count] + vbuf1[count] + 128) >> 8;
  728. r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
  729. g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
  730. c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
  731. b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
  732. yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
  733. r, g, b, y, target, 0);
  734. }
  735. }
  736. }
  /*
   * Define `name##ext##_X_lasx`, the multi-tap (filtered) output entry point
   * for one pixel format.  The generated function forwards all of its
   * arguments to `name##base##_X_template_lasx` and appends the compile-time
   * constants `fmt` and `hasAlpha`.
   */
  #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                      \
  static void name ## ext ## _X_lasx(SwsInternal *c,                           \
                                     const int16_t *lumFilter,                 \
                                     const int16_t **lumSrc,                   \
                                     int lumFilterSize,                        \
                                     const int16_t *chrFilter,                 \
                                     const int16_t **chrUSrc,                  \
                                     const int16_t **chrVSrc,                  \
                                     int chrFilterSize,                        \
                                     const int16_t **alpSrc,                   \
                                     uint8_t *dest, int dstW, int y)           \
  {                                                                            \
      name ## base ## _X_template_lasx(c,                                      \
                                       lumFilter, lumSrc, lumFilterSize,       \
                                       chrFilter, chrUSrc, chrVSrc,            \
                                       chrFilterSize,                          \
                                       alpSrc, dest, dstW, y,                  \
                                       fmt, hasAlpha);                         \
  }
  /*
   * Everything YUV2RGBWRAPPERX defines, plus `name##ext##_2_lasx`: the
   * two-line entry point, which forwards to `name##base##_2_template_lasx`
   * with `fmt` and `hasAlpha` appended.
   */
  #define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                     \
  YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                              \
  static void name ## ext ## _2_lasx(SwsInternal *c,                           \
                                     const int16_t *buf[2],                    \
                                     const int16_t *ubuf[2],                   \
                                     const int16_t *vbuf[2],                   \
                                     const int16_t *abuf[2],                   \
                                     uint8_t *dest, int dstW,                  \
                                     int yalpha, int uvalpha, int y)           \
  {                                                                            \
      name ## base ## _2_template_lasx(c, buf, ubuf, vbuf, abuf,               \
                                       dest, dstW, yalpha, uvalpha, y,         \
                                       fmt, hasAlpha);                         \
  }
  /*
   * Everything YUV2RGBWRAPPERX2 defines, plus `name##ext##_1_lasx`: the
   * single-line entry point, which forwards to `name##base##_1_template_lasx`
   * with `fmt` and `hasAlpha` appended.
   */
  #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)                       \
  YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                             \
  static void name ## ext ## _1_lasx(SwsInternal *c,                           \
                                     const int16_t *buf0,                      \
                                     const int16_t *ubuf[2],                   \
                                     const int16_t *vbuf[2],                   \
                                     const int16_t *abuf0,                     \
                                     uint8_t *dest, int dstW,                  \
                                     int uvalpha, int y)                       \
  {                                                                            \
      name ## base ## _1_template_lasx(c, buf0, ubuf, vbuf, abuf0,             \
                                       dest, dstW, uvalpha, y,                 \
                                       fmt, hasAlpha);                         \
  }
  769. #if CONFIG_SMALL
  770. #else
  771. #if CONFIG_SWSCALE_ALPHA
  772. #endif
  773. YUV2RGBWRAPPER(yuv2rgb,, x32_1, AV_PIX_FMT_RGB32_1, 0)
  774. YUV2RGBWRAPPER(yuv2rgb,, x32, AV_PIX_FMT_RGB32, 0)
  775. #endif
  776. YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
  777. YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
  778. YUV2RGBWRAPPER(yuv2rgb,, 16, AV_PIX_FMT_RGB565, 0)
  779. YUV2RGBWRAPPER(yuv2rgb,, 15, AV_PIX_FMT_RGB555, 0)
  780. YUV2RGBWRAPPER(yuv2rgb,, 12, AV_PIX_FMT_RGB444, 0)
  781. YUV2RGBWRAPPER(yuv2rgb,, 8, AV_PIX_FMT_RGB8, 0)
  782. YUV2RGBWRAPPER(yuv2rgb,, 4, AV_PIX_FMT_RGB4, 0)
  783. YUV2RGBWRAPPER(yuv2rgb,, 4b, AV_PIX_FMT_RGB4_BYTE, 0)
  784. // This function is copied from libswscale/output.c
  785. static av_always_inline void yuv2rgb_write_full(SwsInternal *c,
  786. uint8_t *dest, int i, int R, int A, int G, int B,
  787. int y, enum AVPixelFormat target, int hasAlpha, int err[4])
  788. {
  789. int isrgb8 = target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8;
  790. if ((R | G | B) & 0xC0000000) {
  791. R = av_clip_uintp2(R, 30);
  792. G = av_clip_uintp2(G, 30);
  793. B = av_clip_uintp2(B, 30);
  794. }
  795. switch(target) {
  796. case AV_PIX_FMT_ARGB:
  797. dest[0] = hasAlpha ? A : 255;
  798. dest[1] = R >> 22;
  799. dest[2] = G >> 22;
  800. dest[3] = B >> 22;
  801. break;
  802. case AV_PIX_FMT_RGB24:
  803. dest[0] = R >> 22;
  804. dest[1] = G >> 22;
  805. dest[2] = B >> 22;
  806. break;
  807. case AV_PIX_FMT_RGBA:
  808. dest[0] = R >> 22;
  809. dest[1] = G >> 22;
  810. dest[2] = B >> 22;
  811. dest[3] = hasAlpha ? A : 255;
  812. break;
  813. case AV_PIX_FMT_ABGR:
  814. dest[0] = hasAlpha ? A : 255;
  815. dest[1] = B >> 22;
  816. dest[2] = G >> 22;
  817. dest[3] = R >> 22;
  818. break;
  819. case AV_PIX_FMT_BGR24:
  820. dest[0] = B >> 22;
  821. dest[1] = G >> 22;
  822. dest[2] = R >> 22;
  823. break;
  824. case AV_PIX_FMT_BGRA:
  825. dest[0] = B >> 22;
  826. dest[1] = G >> 22;
  827. dest[2] = R >> 22;
  828. dest[3] = hasAlpha ? A : 255;
  829. break;
  830. case AV_PIX_FMT_BGR4_BYTE:
  831. case AV_PIX_FMT_RGB4_BYTE:
  832. case AV_PIX_FMT_BGR8:
  833. case AV_PIX_FMT_RGB8:
  834. {
  835. int r,g,b;
  836. switch (c->dither) {
  837. default:
  838. case SWS_DITHER_AUTO:
  839. case SWS_DITHER_ED:
  840. R >>= 22;
  841. G >>= 22;
  842. B >>= 22;
  843. R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2])>>4;
  844. G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2])>>4;
  845. B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2])>>4;
  846. c->dither_error[0][i] = err[0];
  847. c->dither_error[1][i] = err[1];
  848. c->dither_error[2][i] = err[2];
  849. r = R >> (isrgb8 ? 5 : 7);
  850. g = G >> (isrgb8 ? 5 : 6);
  851. b = B >> (isrgb8 ? 6 : 7);
  852. r = av_clip(r, 0, isrgb8 ? 7 : 1);
  853. g = av_clip(g, 0, isrgb8 ? 7 : 3);
  854. b = av_clip(b, 0, isrgb8 ? 3 : 1);
  855. err[0] = R - r*(isrgb8 ? 36 : 255);
  856. err[1] = G - g*(isrgb8 ? 36 : 85);
  857. err[2] = B - b*(isrgb8 ? 85 : 255);
  858. break;
  859. case SWS_DITHER_A_DITHER:
  860. if (isrgb8) {
  861. /* see http://pippin.gimp.org/a_dither/ for details/origin */
  862. #define A_DITHER(u,v) (((((u)+((v)*236))*119)&0xff))
  863. r = (((R >> 19) + A_DITHER(i,y) -96)>>8);
  864. g = (((G >> 19) + A_DITHER(i + 17,y) - 96)>>8);
  865. b = (((B >> 20) + A_DITHER(i + 17*2,y) -96)>>8);
  866. r = av_clip_uintp2(r, 3);
  867. g = av_clip_uintp2(g, 3);
  868. b = av_clip_uintp2(b, 2);
  869. } else {
  870. r = (((R >> 21) + A_DITHER(i,y)-256)>>8);
  871. g = (((G >> 19) + A_DITHER(i + 17,y)-256)>>8);
  872. b = (((B >> 21) + A_DITHER(i + 17*2,y)-256)>>8);
  873. r = av_clip_uintp2(r, 1);
  874. g = av_clip_uintp2(g, 2);
  875. b = av_clip_uintp2(b, 1);
  876. }
  877. break;
  878. case SWS_DITHER_X_DITHER:
  879. if (isrgb8) {
  880. /* see http://pippin.gimp.org/a_dither/ for details/origin */
  881. #define X_DITHER(u,v) (((((u)^((v)*237))*181)&0x1ff)/2)
  882. r = (((R >> 19) + X_DITHER(i,y) - 96)>>8);
  883. g = (((G >> 19) + X_DITHER(i + 17,y) - 96)>>8);
  884. b = (((B >> 20) + X_DITHER(i + 17*2,y) - 96)>>8);
  885. r = av_clip_uintp2(r, 3);
  886. g = av_clip_uintp2(g, 3);
  887. b = av_clip_uintp2(b, 2);
  888. } else {
  889. r = (((R >> 21) + X_DITHER(i,y)-256)>>8);
  890. g = (((G >> 19) + X_DITHER(i + 17,y)-256)>>8);
  891. b = (((B >> 21) + X_DITHER(i + 17*2,y)-256)>>8);
  892. r = av_clip_uintp2(r, 1);
  893. g = av_clip_uintp2(g, 2);
  894. b = av_clip_uintp2(b, 1);
  895. }
  896. break;
  897. }
  898. if(target == AV_PIX_FMT_BGR4_BYTE) {
  899. dest[0] = r + 2*g + 8*b;
  900. } else if(target == AV_PIX_FMT_RGB4_BYTE) {
  901. dest[0] = b + 2*g + 8*r;
  902. } else if(target == AV_PIX_FMT_BGR8) {
  903. dest[0] = r + 8*g + 64*b;
  904. } else if(target == AV_PIX_FMT_RGB8) {
  905. dest[0] = b + 4*g + 32*r;
  906. } else
  907. av_assert2(0);
  908. break; }
  909. }
  910. }
  /*
   * Declare, in the scope where the macro is expanded, the scalar YUV -> RGB
   * parameters read from `c` (y_offset, y_coeff, v2r_coe, v2g_coe, u2g_coe,
   * u2b_coe) and, for each of them, a vector built with
   * __lasx_xvreplgr2vr_w (offset, coeff, v2r, v2g, u2g, u2b).
   *
   * Requires a variable named `c` exposing the yuv2rgb_* fields.  Use it as
   * a bare `YUV2RGB_SETUP` line: every declaration already ends in ';'.
   *
   * The final line carries no line-continuation backslash on purpose: a
   * trailing backslash would splice whatever line follows into this macro.
   */
  #define YUV2RGB_SETUP                                           \
      int y_offset   = c->yuv2rgb_y_offset;                       \
      int y_coeff    = c->yuv2rgb_y_coeff;                        \
      int v2r_coe    = c->yuv2rgb_v2r_coeff;                      \
      int v2g_coe    = c->yuv2rgb_v2g_coeff;                      \
      int u2g_coe    = c->yuv2rgb_u2g_coeff;                      \
      int u2b_coe    = c->yuv2rgb_u2b_coeff;                      \
      __m256i offset = __lasx_xvreplgr2vr_w(y_offset);            \
      __m256i coeff  = __lasx_xvreplgr2vr_w(y_coeff);             \
      __m256i v2r    = __lasx_xvreplgr2vr_w(v2r_coe);             \
      __m256i v2g    = __lasx_xvreplgr2vr_w(v2g_coe);             \
      __m256i u2g    = __lasx_xvreplgr2vr_w(u2g_coe);             \
      __m256i u2b    = __lasx_xvreplgr2vr_w(u2b_coe);
  /*
   * Vector YUV -> RGB step.  It mirrors the scalar tails of the functions
   * that use it:
   *     y = (y - offset) * coeff + y_temp
   *     R = y + v * v2r
   *     G = y + v * v2g + u * u2g
   *     B = y + u * u2b
   * The `y` and `v` arguments are overwritten: `v` ends up holding the
   * intermediate y + v * v2g.  R, G and B must be variables distinct from
   * y, u and v.
   */
  #define YUV2RGB(y, u, v, R, G, B, offset, coeff,                     \
                  y_temp, v2r, v2g, u2g, u2b)                          \
  {                                                                    \
      y = __lasx_xvadd_w(__lasx_xvmul_w(__lasx_xvsub_w(y, offset),     \
                                        coeff),                        \
                         y_temp);                                      \
      R = __lasx_xvmadd_w(y, v, v2r);                                  \
      v = __lasx_xvmadd_w(y, v, v2g);                                  \
      G = __lasx_xvmadd_w(v, u, u2g);                                  \
      B = __lasx_xvmadd_w(y, u, u2b);                                  \
  }
  /*
   * Write one pixel with alpha: take element `t1` of the r/g/b/a vectors,
   * clip A to 8 bits when its bit 8 is set, pass the pixel to
   * yuv2rgb_write_full() as pixel index `i + s`, then advance `dest` by
   * `step`.  Uses R, G, B, A, c, dest, i, y, target, hasAlpha, err and step
   * from the enclosing scope.
   */
  #define WRITE_FULL_A(r, g, b, a, t1, s)                                       \
  {                                                                             \
      A = __lasx_xvpickve2gr_w(a, t1);                                          \
      if (A & 0x100)                                                            \
          A = av_clip_uint8(A);                                                 \
      B = __lasx_xvpickve2gr_w(b, t1);                                          \
      G = __lasx_xvpickve2gr_w(g, t1);                                          \
      R = __lasx_xvpickve2gr_w(r, t1);                                          \
      yuv2rgb_write_full(c, dest, i + s, R, A, G, B, y, target, hasAlpha, err); \
      dest += step;                                                             \
  }
  /*
   * Write one pixel without alpha: take element `t1` of the r/g/b vectors,
   * pass the pixel to yuv2rgb_write_full() as pixel index `i + s` with 0 as
   * the alpha value, then advance `dest` by `step`.  Uses R, G, B, c, dest,
   * i, y, target, hasAlpha, err and step from the enclosing scope.
   */
  #define WRITE_FULL(r, g, b, t1, s)                                            \
  {                                                                             \
      B = __lasx_xvpickve2gr_w(b, t1);                                          \
      G = __lasx_xvpickve2gr_w(g, t1);                                          \
      R = __lasx_xvpickve2gr_w(r, t1);                                          \
      yuv2rgb_write_full(c, dest, i + s, R, 0, G, B, y, target, hasAlpha, err); \
      dest += step;                                                             \
  }
  /*
   * Vertical multi-tap filter followed by YUV -> RGB conversion, with one
   * chroma sample per output pixel (chrUSrc / chrVSrc are indexed with the
   * same i as lumSrc).
   *
   * Per pixel, as spelled out by the scalar tail at the end:
   *   Y    = (templ + sum_j lumSrc[j][i]  * lumFilter[j]) >> 10
   *   U, V = (tempc + sum_j chrXSrc[j][i] * chrFilter[j]) >> 10
   *   A    = ((1 << 18) + sum_j alpSrc[j][i] * lumFilter[j]) >> 19,
   *          clipped to 8 bits when bit 8 is set (hasAlpha only, else 0)
   *   Y    = (Y - y_offset) * y_coeff + (1 << 21)
   *   R    = Y + V * v2r_coe
   *   G    = Y + V * v2g_coe + U * u2g_coe
   *   B    = Y + U * u2b_coe
   * and the result is handed to yuv2rgb_write_full().
   *
   * The work is split into a 16-pixel loop, one optional 8-pixel step and a
   * scalar tail.  `dest` advances by `step` bytes per pixel: 3 for
   * RGB24/BGR24, 1 for the BGR4_BYTE/RGB4_BYTE/BGR8/RGB8 targets, 4 otherwise.
   * err[] is passed to every yuv2rgb_write_full() call; its first three
   * entries are stored to c->dither_error[k][i] once all pixels are written.
   */
  954. static void
  955. yuv2rgb_full_X_template_lasx(SwsInternal *c, const int16_t *lumFilter,
  956. const int16_t **lumSrc, int lumFilterSize,
  957. const int16_t *chrFilter, const int16_t **chrUSrc,
  958. const int16_t **chrVSrc, int chrFilterSize,
  959. const int16_t **alpSrc, uint8_t *dest,
  960. int dstW, int y, enum AVPixelFormat target,
  961. int hasAlpha)
  962. {
  963. int i, j, B, G, R, A;
  964. int step = (target == AV_PIX_FMT_RGB24 ||
  965. target == AV_PIX_FMT_BGR24) ? 3 : 4;
  966. int err[4] = {0};
  /* Initial accumulator values: a_temp for alpha, templ for luma, tempc for
   * chroma; ytemp is added after the y_coeff multiplication. */
  967. int a_temp = 1 << 18;
  968. int templ = 1 << 9;
  969. int tempc = templ - (128 << 19);
  970. int ytemp = 1 << 21;
  971. int len = dstW - 15;
  972. __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
  973. YUV2RGB_SETUP
  974. if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
  975. || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
  976. step = 1;
  /* 16 pixels per iteration.  The *_ev / *_od vectors are written out
   * alternately below: element k of *_ev becomes pixel i + 2k and element k
   * of *_od becomes pixel i + 2k + 1. */
  977. for (i = 0; i < len; i += 16) {
  978. __m256i l_src, u_src, v_src;
  979. __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od, temp;
  980. __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
  /* n: i int16_t samples, doubled for use as the __lasx_xvldx offset. */
  981. int n = i << 1;
  982. y_ev = y_od = __lasx_xvreplgr2vr_w(templ);
  983. u_ev = u_od = v_ev = v_od = __lasx_xvreplgr2vr_w(tempc);
  /* Luma taps. */
  984. for (j = 0; j < lumFilterSize; j++) {
  985. temp = __lasx_xvldrepl_h((lumFilter + j), 0);
  986. l_src = __lasx_xvldx(lumSrc[j], n);
  987. y_ev = __lasx_xvmaddwev_w_h(y_ev, l_src, temp);
  988. y_od = __lasx_xvmaddwod_w_h(y_od, l_src, temp);
  989. }
  /* Chroma taps. */
  990. for (j = 0; j < chrFilterSize; j++) {
  991. temp = __lasx_xvldrepl_h((chrFilter + j), 0);
  992. DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n,
  993. u_src, v_src);
  994. DUP2_ARG3(__lasx_xvmaddwev_w_h, u_ev, u_src, temp, v_ev,
  995. v_src, temp, u_ev, v_ev);
  996. DUP2_ARG3(__lasx_xvmaddwod_w_h, u_od, u_src, temp, v_od,
  997. v_src, temp, u_od, v_od);
  998. }
  999. y_ev = __lasx_xvsrai_w(y_ev, 10);
  1000. y_od = __lasx_xvsrai_w(y_od, 10);
  1001. u_ev = __lasx_xvsrai_w(u_ev, 10);
  1002. u_od = __lasx_xvsrai_w(u_od, 10);
  1003. v_ev = __lasx_xvsrai_w(v_ev, 10);
  1004. v_od = __lasx_xvsrai_w(v_od, 10);
  1005. YUV2RGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
  1006. y_temp, v2r, v2g, u2g, u2b);
  1007. YUV2RGB(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
  1008. y_temp, v2r, v2g, u2g, u2b);
  1009. if (hasAlpha) {
  1010. __m256i a_src, a_ev, a_od;
  1011. a_ev = a_od = __lasx_xvreplgr2vr_w(a_temp);
  /* Alpha is filtered with the luma coefficients. */
  1012. for (j = 0; j < lumFilterSize; j++) {
  1013. temp = __lasx_xvldrepl_h(lumFilter + j, 0);
  1014. a_src = __lasx_xvldx(alpSrc[j], n);
  1015. a_ev = __lasx_xvmaddwev_w_h(a_ev, a_src, temp);
  1016. a_od = __lasx_xvmaddwod_w_h(a_od, a_src, temp);
  1017. }
  1018. a_ev = __lasx_xvsrai_w(a_ev, 19);
  1019. a_od = __lasx_xvsrai_w(a_od, 19);
  1020. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
  1021. WRITE_FULL_A(R_od, G_od, B_od, a_od, 0, 1);
  1022. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 2);
  1023. WRITE_FULL_A(R_od, G_od, B_od, a_od, 1, 3);
  1024. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 4);
  1025. WRITE_FULL_A(R_od, G_od, B_od, a_od, 2, 5);
  1026. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 6);
  1027. WRITE_FULL_A(R_od, G_od, B_od, a_od, 3, 7);
  1028. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 8);
  1029. WRITE_FULL_A(R_od, G_od, B_od, a_od, 4, 9);
  1030. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 10);
  1031. WRITE_FULL_A(R_od, G_od, B_od, a_od, 5, 11);
  1032. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 12);
  1033. WRITE_FULL_A(R_od, G_od, B_od, a_od, 6, 13);
  1034. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 14);
  1035. WRITE_FULL_A(R_od, G_od, B_od, a_od, 7, 15);
  1036. } else {
  1037. WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
  1038. WRITE_FULL(R_od, G_od, B_od, 0, 1);
  1039. WRITE_FULL(R_ev, G_ev, B_ev, 1, 2);
  1040. WRITE_FULL(R_od, G_od, B_od, 1, 3);
  1041. WRITE_FULL(R_ev, G_ev, B_ev, 2, 4);
  1042. WRITE_FULL(R_od, G_od, B_od, 2, 5);
  1043. WRITE_FULL(R_ev, G_ev, B_ev, 3, 6);
  1044. WRITE_FULL(R_od, G_od, B_od, 3, 7);
  1045. WRITE_FULL(R_ev, G_ev, B_ev, 4, 8);
  1046. WRITE_FULL(R_od, G_od, B_od, 4, 9);
  1047. WRITE_FULL(R_ev, G_ev, B_ev, 5, 10);
  1048. WRITE_FULL(R_od, G_od, B_od, 5, 11);
  1049. WRITE_FULL(R_ev, G_ev, B_ev, 6, 12);
  1050. WRITE_FULL(R_od, G_od, B_od, 6, 13);
  1051. WRITE_FULL(R_ev, G_ev, B_ev, 7, 14);
  1052. WRITE_FULL(R_od, G_od, B_od, 7, 15);
  1053. }
  1054. }
  /* Optional 8-pixel step, executed at most once.  Here element k of the
   * result vectors is written as pixel i + k (see the WRITE_FULL indices).
   * NOTE(review): the loads below are full-width __lasx_xvldx loads although
   * only 8 pixels are consumed -- confirm the source lines are padded. */
  1055. if (dstW - i >= 8) {
  1056. __m256i l_src, u_src, v_src;
  1057. __m256i y_ev, u_ev, v_ev, uv, temp;
  1058. __m256i R_ev, G_ev, B_ev;
  1059. int n = i << 1;
  1060. y_ev = __lasx_xvreplgr2vr_w(templ);
  1061. u_ev = v_ev = __lasx_xvreplgr2vr_w(tempc);
  1062. for (j = 0; j < lumFilterSize; j++) {
  1063. temp = __lasx_xvldrepl_h((lumFilter + j), 0);
  1064. l_src = __lasx_xvldx(lumSrc[j], n);
  1065. l_src = __lasx_xvpermi_d(l_src, 0xD8);
  1066. l_src = __lasx_xvilvl_h(l_src, l_src);
  1067. y_ev = __lasx_xvmaddwev_w_h(y_ev, l_src, temp);
  1068. }
  1069. for (j = 0; j < chrFilterSize; j++) {
  1070. temp = __lasx_xvldrepl_h((chrFilter + j), 0);
  1071. DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
  1072. u_src = __lasx_xvpermi_d(u_src, 0xD8);
  1073. v_src = __lasx_xvpermi_d(v_src, 0xD8);
  /* U and V are interleaved into one vector, then accumulated separately
   * by the even (u_ev) and odd (v_ev) multiply-adds. */
  1074. uv = __lasx_xvilvl_h(v_src, u_src);
  1075. u_ev = __lasx_xvmaddwev_w_h(u_ev, uv, temp);
  1076. v_ev = __lasx_xvmaddwod_w_h(v_ev, uv, temp);
  1077. }
  1078. y_ev = __lasx_xvsrai_w(y_ev, 10);
  1079. u_ev = __lasx_xvsrai_w(u_ev, 10);
  1080. v_ev = __lasx_xvsrai_w(v_ev, 10);
  1081. YUV2RGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
  1082. y_temp, v2r, v2g, u2g, u2b);
  1083. if (hasAlpha) {
  1084. __m256i a_src, a_ev;
  1085. a_ev = __lasx_xvreplgr2vr_w(a_temp);
  1086. for (j = 0; j < lumFilterSize; j++) {
  1087. temp = __lasx_xvldrepl_h(lumFilter + j, 0);
  1088. a_src = __lasx_xvldx(alpSrc[j], n);
  1089. a_src = __lasx_xvpermi_d(a_src, 0xD8);
  1090. a_src = __lasx_xvilvl_h(a_src, a_src);
  1091. a_ev = __lasx_xvmaddwev_w_h(a_ev, a_src, temp);
  1092. }
  1093. a_ev = __lasx_xvsrai_w(a_ev, 19);
  1094. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
  1095. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 1);
  1096. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 2);
  1097. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 3);
  1098. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 4);
  1099. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 5);
  1100. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 6);
  1101. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 7);
  1102. } else {
  1103. WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
  1104. WRITE_FULL(R_ev, G_ev, B_ev, 1, 1);
  1105. WRITE_FULL(R_ev, G_ev, B_ev, 2, 2);
  1106. WRITE_FULL(R_ev, G_ev, B_ev, 3, 3);
  1107. WRITE_FULL(R_ev, G_ev, B_ev, 4, 4);
  1108. WRITE_FULL(R_ev, G_ev, B_ev, 5, 5);
  1109. WRITE_FULL(R_ev, G_ev, B_ev, 6, 6);
  1110. WRITE_FULL(R_ev, G_ev, B_ev, 7, 7);
  1111. }
  1112. i += 8;
  1113. }
  /* Scalar tail: remaining pixels, one at a time. */
  1114. for (; i < dstW; i++) {
  1115. int Y = templ;
  1116. int V, U = V = tempc;
  1117. A = 0;
  1118. for (j = 0; j < lumFilterSize; j++) {
  1119. Y += lumSrc[j][i] * lumFilter[j];
  1120. }
  1121. for (j = 0; j < chrFilterSize; j++) {
  1122. U += chrUSrc[j][i] * chrFilter[j];
  1123. V += chrVSrc[j][i] * chrFilter[j];
  1124. }
  1125. Y >>= 10;
  1126. U >>= 10;
  1127. V >>= 10;
  1128. if (hasAlpha) {
  1129. A = 1 << 18;
  1130. for (j = 0; j < lumFilterSize; j++) {
  1131. A += alpSrc[j][i] * lumFilter[j];
  1132. }
  1133. A >>= 19;
  1134. if (A & 0x100)
  1135. A = av_clip_uint8(A);
  1136. }
  1137. Y -= y_offset;
  1138. Y *= y_coeff;
  1139. Y += ytemp;
  /* The additions are carried out in unsigned arithmetic (note the cast). */
  1140. R = (unsigned)Y + V * v2r_coe;
  1141. G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
  1142. B = (unsigned)Y + U * u2b_coe;
  1143. yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
  1144. dest += step;
  1145. }
  /* Save the error carried out of the last pixel at the final index i. */
  1146. c->dither_error[0][i] = err[0];
  1147. c->dither_error[1][i] = err[1];
  1148. c->dither_error[2][i] = err[2];
  1149. }
  /*
   * Blend of two input lines followed by YUV -> RGB conversion, with one
   * chroma sample per output pixel (ubuf / vbuf are indexed with the same i
   * as buf).
   *
   * Per pixel, as spelled out by the scalar tail at the end:
   *   Y = (buf0[i]  * (4096 - yalpha)  + buf1[i]  * yalpha) >> 10
   *   U = (ubuf0[i] * (4096 - uvalpha) + ubuf1[i] * uvalpha - (128 << 19)) >> 10
   *   V = the same expression on vbuf0 / vbuf1
   *   A = (abuf0[i] * (4096 - yalpha) + abuf1[i] * yalpha + (1 << 18)) >> 19,
   *       clipped to 8 bits when bit 8 is set (hasAlpha only, else 0)
   *   Y = (Y - y_offset) * y_coeff + (1 << 21)
   *   R = Y + V * v2r_coe
   *   G = Y + V * v2g_coe + U * u2g_coe
   *   B = Y + U * u2b_coe
   * and the result is handed to yuv2rgb_write_full().
   *
   * The work is split into a 16-pixel loop, one optional 8-pixel step and a
   * scalar tail.  `dest` advances by `step` bytes per pixel: 3 for
   * RGB24/BGR24, 1 for the BGR4_BYTE/RGB4_BYTE/BGR8/RGB8 targets, 4 otherwise.
   * err[] is passed to every yuv2rgb_write_full() call; its first three
   * entries are stored to c->dither_error[k][i] once all pixels are written.
   * abuf is only dereferenced when hasAlpha is non-zero.
   */
  1150. static void
  1151. yuv2rgb_full_2_template_lasx(SwsInternal *c, const int16_t *buf[2],
  1152. const int16_t *ubuf[2], const int16_t *vbuf[2],
  1153. const int16_t *abuf[2], uint8_t *dest, int dstW,
  1154. int yalpha, int uvalpha, int y,
  1155. enum AVPixelFormat target, int hasAlpha)
  1156. {
  1157. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  1158. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
  1159. *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
  1160. *abuf0 = hasAlpha ? abuf[0] : NULL,
  1161. *abuf1 = hasAlpha ? abuf[1] : NULL;
  /* Weights of line 0; yalpha / uvalpha are the weights of line 1. */
  1162. int yalpha1 = 4096 - yalpha;
  1163. int uvalpha1 = 4096 - uvalpha;
  1164. int uvtemp = 128 << 19;
  1165. int atemp = 1 << 18;
  1166. int err[4] = {0};
  1167. int ytemp = 1 << 21;
  1168. int len = dstW - 15;
  1169. int i, R, G, B, A;
  1170. int step = (target == AV_PIX_FMT_RGB24 ||
  1171. target == AV_PIX_FMT_BGR24) ? 3 : 4;
  1172. __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
  1173. __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
  1174. __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
  1175. __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
  1176. __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
  1177. __m256i a_bias = __lasx_xvreplgr2vr_w(atemp);
  1178. __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
  1179. YUV2RGB_SETUP
  1180. av_assert2(yalpha <= 4096U);
  1181. av_assert2(uvalpha <= 4096U);
  1182. if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
  1183. || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
  1184. step = 1;
  /* 16 pixels per iteration.  As the WRITE_FULL index pairs below show, the
   * *_l vectors supply pixels i+0..3 (elements 0-3) and i+8..11 (elements
   * 4-7); the *_h vectors supply pixels i+4..7 and i+12..15. */
  1185. for (i = 0; i < len; i += 16) {
  1186. __m256i b0, b1, ub0, ub1, vb0, vb1;
  1187. __m256i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
  1188. __m256i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
  1189. __m256i y_l, y_h, v_l, v_h, u_l, u_h;
  1190. __m256i R_l, R_h, G_l, G_h, B_l, B_h;
  /* n: i int16_t samples, doubled for use as the __lasx_xvldx offset. */
  1191. int n = i << 1;
  1192. DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0,
  1193. n, ubuf1, n, b0, b1, ub0, ub1);
  1194. DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0 , vb1);
  1195. DUP2_ARG2(__lasx_xvsllwil_w_h, b0, 0, b1, 0, y0_l, y1_l);
  1196. DUP4_ARG2(__lasx_xvsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
  1197. u0_l, u1_l, v0_l, v1_l);
  1198. DUP2_ARG1(__lasx_xvexth_w_h, b0, b1, y0_h, y1_h);
  1199. DUP4_ARG1(__lasx_xvexth_w_h, ub0, ub1, vb0, vb1,
  1200. u0_h, u1_h, v0_h, v1_h);
  /* Blend: line0 * (4096 - alpha), then + line1 * alpha. */
  1201. y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
  1202. y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
  1203. u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
  1204. u0_h = __lasx_xvmul_w(u0_h, v_uvalpha1);
  1205. v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
  1206. v0_h = __lasx_xvmul_w(v0_h, v_uvalpha1);
  1207. y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
  1208. y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
  1209. u_l = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
  1210. u_h = __lasx_xvmadd_w(u0_h, v_uvalpha, u1_h);
  1211. v_l = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
  1212. v_h = __lasx_xvmadd_w(v0_h, v_uvalpha, v1_h);
  /* Subtract 128 << 19 from the chroma sums before the shift. */
  1213. u_l = __lasx_xvsub_w(u_l, uv);
  1214. u_h = __lasx_xvsub_w(u_h, uv);
  1215. v_l = __lasx_xvsub_w(v_l, uv);
  1216. v_h = __lasx_xvsub_w(v_h, uv);
  1217. y_l = __lasx_xvsrai_w(y_l, 10);
  1218. y_h = __lasx_xvsrai_w(y_h, 10);
  1219. u_l = __lasx_xvsrai_w(u_l, 10);
  1220. u_h = __lasx_xvsrai_w(u_h, 10);
  1221. v_l = __lasx_xvsrai_w(v_l, 10);
  1222. v_h = __lasx_xvsrai_w(v_h, 10);
  1223. YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
  1224. y_temp, v2r, v2g, u2g, u2b);
  1225. YUV2RGB(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
  1226. y_temp, v2r, v2g, u2g, u2b);
  1227. if (hasAlpha) {
  1228. __m256i a0, a1, a0_l, a0_h;
  1229. __m256i a_l, a_h, a1_l, a1_h;
  1230. DUP2_ARG2(__lasx_xvldx, abuf0, n, abuf1, n, a0, a1);
  1231. DUP2_ARG2(__lasx_xvsllwil_w_h, a0, 0, a1, 0, a0_l, a1_l);
  1232. DUP2_ARG1(__lasx_xvexth_w_h, a0, a1, a0_h, a1_h);
  /* Alpha is blended with the luma weights, starting from a_bias. */
  1233. a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
  1234. a_h = __lasx_xvmadd_w(a_bias, a0_h, v_yalpha1);
  1235. a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
  1236. a_h = __lasx_xvmadd_w(a_h, v_yalpha, a1_h);
  1237. a_l = __lasx_xvsrai_w(a_l, 19);
  1238. a_h = __lasx_xvsrai_w(a_h, 19);
  1239. WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
  1240. WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
  1241. WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
  1242. WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
  1243. WRITE_FULL_A(R_h, G_h, B_h, a_h, 0, 4);
  1244. WRITE_FULL_A(R_h, G_h, B_h, a_h, 1, 5);
  1245. WRITE_FULL_A(R_h, G_h, B_h, a_h, 2, 6);
  1246. WRITE_FULL_A(R_h, G_h, B_h, a_h, 3, 7);
  1247. WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 8);
  1248. WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 9);
  1249. WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 10);
  1250. WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 11);
  1251. WRITE_FULL_A(R_h, G_h, B_h, a_h, 4, 12);
  1252. WRITE_FULL_A(R_h, G_h, B_h, a_h, 5, 13);
  1253. WRITE_FULL_A(R_h, G_h, B_h, a_h, 6, 14);
  1254. WRITE_FULL_A(R_h, G_h, B_h, a_h, 7, 15);
  1255. } else {
  1256. WRITE_FULL(R_l, G_l, B_l, 0, 0);
  1257. WRITE_FULL(R_l, G_l, B_l, 1, 1);
  1258. WRITE_FULL(R_l, G_l, B_l, 2, 2);
  1259. WRITE_FULL(R_l, G_l, B_l, 3, 3);
  1260. WRITE_FULL(R_h, G_h, B_h, 0, 4);
  1261. WRITE_FULL(R_h, G_h, B_h, 1, 5);
  1262. WRITE_FULL(R_h, G_h, B_h, 2, 6);
  1263. WRITE_FULL(R_h, G_h, B_h, 3, 7);
  1264. WRITE_FULL(R_l, G_l, B_l, 4, 8);
  1265. WRITE_FULL(R_l, G_l, B_l, 5, 9);
  1266. WRITE_FULL(R_l, G_l, B_l, 6, 10);
  1267. WRITE_FULL(R_l, G_l, B_l, 7, 11);
  1268. WRITE_FULL(R_h, G_h, B_h, 4, 12);
  1269. WRITE_FULL(R_h, G_h, B_h, 5, 13);
  1270. WRITE_FULL(R_h, G_h, B_h, 6, 14);
  1271. WRITE_FULL(R_h, G_h, B_h, 7, 15);
  1272. }
  1273. }
  /* Optional 8-pixel step, executed at most once.  Here element k of the
   * result vectors is written as pixel i + k (see the WRITE_FULL indices).
   * NOTE(review): the loads below are full-width __lasx_xvldx loads although
   * only 8 pixels are consumed -- confirm the source lines are padded. */
  1274. if (dstW - i >= 8) {
  1275. __m256i b0, b1, ub0, ub1, vb0, vb1;
  1276. __m256i y0_l, y1_l, u0_l;
  1277. __m256i v0_l, u1_l, v1_l;
  1278. __m256i y_l, u_l, v_l;
  1279. __m256i R_l, G_l, B_l;
  1280. int n = i << 1;
  1281. DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0, n,
  1282. ubuf1, n, b0, b1, ub0, ub1);
  1283. DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0, vb1);
  1284. DUP2_ARG1(__lasx_vext2xv_w_h, b0, b1, y0_l, y1_l);
  1285. DUP4_ARG1(__lasx_vext2xv_w_h, ub0, ub1, vb0, vb1,
  1286. u0_l, u1_l, v0_l, v1_l);
  1287. y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
  1288. u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
  1289. v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
  1290. y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
  1291. u_l = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
  1292. v_l = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
  1293. u_l = __lasx_xvsub_w(u_l, uv);
  1294. v_l = __lasx_xvsub_w(v_l, uv);
  1295. y_l = __lasx_xvsrai_w(y_l, 10);
  1296. u_l = __lasx_xvsrai_w(u_l, 10);
  1297. v_l = __lasx_xvsrai_w(v_l, 10);
  1298. YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
  1299. y_temp, v2r, v2g, u2g, u2b);
  1300. if (hasAlpha) {
  1301. __m256i a0, a1, a0_l;
  1302. __m256i a_l, a1_l;
  1303. DUP2_ARG2(__lasx_xvldx, abuf0, n, abuf1, n, a0, a1);
  1304. DUP2_ARG1(__lasx_vext2xv_w_h, a0, a1, a0_l, a1_l);
  1305. a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
  1306. a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
  1307. a_l = __lasx_xvsrai_w(a_l, 19);
  1308. WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
  1309. WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
  1310. WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
  1311. WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
  1312. WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
  1313. WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
  1314. WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
  1315. WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
  1316. } else {
  1317. WRITE_FULL(R_l, G_l, B_l, 0, 0);
  1318. WRITE_FULL(R_l, G_l, B_l, 1, 1);
  1319. WRITE_FULL(R_l, G_l, B_l, 2, 2);
  1320. WRITE_FULL(R_l, G_l, B_l, 3, 3);
  1321. WRITE_FULL(R_l, G_l, B_l, 4, 4);
  1322. WRITE_FULL(R_l, G_l, B_l, 5, 5);
  1323. WRITE_FULL(R_l, G_l, B_l, 6, 6);
  1324. WRITE_FULL(R_l, G_l, B_l, 7, 7);
  1325. }
  1326. i += 8;
  1327. }
  /* Scalar tail: remaining pixels, one at a time. */
  1328. for (; i < dstW; i++){
  1329. int Y = ( buf0[i] * yalpha1 + buf1[i] * yalpha ) >> 10;
  1330. int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha- uvtemp) >> 10;
  1331. int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha- uvtemp) >> 10;
  1332. A = 0;
  1333. if (hasAlpha){
  1334. A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + atemp) >> 19;
  1335. if (A & 0x100)
  1336. A = av_clip_uint8(A);
  1337. }
  1338. Y -= y_offset;
  1339. Y *= y_coeff;
  1340. Y += ytemp;
  /* The additions are carried out in unsigned arithmetic (note the cast). */
  1341. R = (unsigned)Y + V * v2r_coe;
  1342. G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
  1343. B = (unsigned)Y + U * u2b_coe;
  1344. yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
  1345. dest += step;
  1346. }
  /* Save the error carried out of the last pixel at the final index i. */
  1347. c->dither_error[0][i] = err[0];
  1348. c->dither_error[1][i] = err[1];
  1349. c->dither_error[2][i] = err[2];
  1350. }
  1351. static void
  1352. yuv2rgb_full_1_template_lasx(SwsInternal *c, const int16_t *buf0,
  1353. const int16_t *ubuf[2], const int16_t *vbuf[2],
  1354. const int16_t *abuf0, uint8_t *dest, int dstW,
  1355. int uvalpha, int y, enum AVPixelFormat target,
  1356. int hasAlpha)
  1357. {
  1358. const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
  1359. int i, B, G, R, A;
  1360. int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4;
  1361. int err[4] = {0};
  1362. int ytemp = 1 << 21;
  1363. int bias_int = 64;
  1364. int len = dstW - 15;
  1365. __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
  1366. YUV2RGB_SETUP
  1367. if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
  1368. || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
  1369. step = 1;
  1370. if (uvalpha < 2048) {
  1371. int uvtemp = 128 << 7;
  1372. __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
  1373. __m256i bias = __lasx_xvreplgr2vr_w(bias_int);
  1374. for (i = 0; i < len; i += 16) {
  1375. __m256i b, ub, vb, ub_l, ub_h, vb_l, vb_h;
  1376. __m256i y_l, y_h, u_l, u_h, v_l, v_h;
  1377. __m256i R_l, R_h, G_l, G_h, B_l, B_h;
  1378. int n = i << 1;
  1379. DUP2_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, b, ub);
  1380. vb = __lasx_xvldx(vbuf0, n);
  1381. y_l = __lasx_xvsllwil_w_h(b, 2);
  1382. y_h = __lasx_xvexth_w_h(b);
  1383. DUP2_ARG2(__lasx_xvsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
  1384. DUP2_ARG1(__lasx_xvexth_w_h, ub, vb, ub_h, vb_h);
  1385. y_h = __lasx_xvslli_w(y_h, 2);
  1386. u_l = __lasx_xvsub_w(ub_l, uv);
  1387. u_h = __lasx_xvsub_w(ub_h, uv);
  1388. v_l = __lasx_xvsub_w(vb_l, uv);
  1389. v_h = __lasx_xvsub_w(vb_h, uv);
  1390. u_l = __lasx_xvslli_w(u_l, 2);
  1391. u_h = __lasx_xvslli_w(u_h, 2);
  1392. v_l = __lasx_xvslli_w(v_l, 2);
  1393. v_h = __lasx_xvslli_w(v_h, 2);
  1394. YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
  1395. y_temp, v2r, v2g, u2g, u2b);
  1396. YUV2RGB(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
  1397. y_temp, v2r, v2g, u2g, u2b);
  1398. if(hasAlpha) {
  1399. __m256i a_src;
  1400. __m256i a_l, a_h;
  1401. a_src = __lasx_xvld(abuf0 + i, 0);
  1402. a_l = __lasx_xvsllwil_w_h(a_src, 0);
  1403. a_h = __lasx_xvexth_w_h(a_src);
  1404. a_l = __lasx_xvadd_w(a_l, bias);
  1405. a_h = __lasx_xvadd_w(a_h, bias);
  1406. a_l = __lasx_xvsrai_w(a_l, 7);
  1407. a_h = __lasx_xvsrai_w(a_h, 7);
  1408. WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
  1409. WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
  1410. WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
  1411. WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
  1412. WRITE_FULL_A(R_h, G_h, B_h, a_h, 0, 4);
  1413. WRITE_FULL_A(R_h, G_h, B_h, a_h, 1, 5);
  1414. WRITE_FULL_A(R_h, G_h, B_h, a_h, 2, 6);
  1415. WRITE_FULL_A(R_h, G_h, B_h, a_h, 3, 7);
  1416. WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 8);
  1417. WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 9);
  1418. WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 10);
  1419. WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 11);
  1420. WRITE_FULL_A(R_h, G_h, B_h, a_h, 4, 12);
  1421. WRITE_FULL_A(R_h, G_h, B_h, a_h, 5, 13);
  1422. WRITE_FULL_A(R_h, G_h, B_h, a_h, 6, 14);
  1423. WRITE_FULL_A(R_h, G_h, B_h, a_h, 7, 15);
  1424. } else {
  1425. WRITE_FULL(R_l, G_l, B_l, 0, 0);
  1426. WRITE_FULL(R_l, G_l, B_l, 1, 1);
  1427. WRITE_FULL(R_l, G_l, B_l, 2, 2);
  1428. WRITE_FULL(R_l, G_l, B_l, 3, 3);
  1429. WRITE_FULL(R_h, G_h, B_h, 0, 4);
  1430. WRITE_FULL(R_h, G_h, B_h, 1, 5);
  1431. WRITE_FULL(R_h, G_h, B_h, 2, 6);
  1432. WRITE_FULL(R_h, G_h, B_h, 3, 7);
  1433. WRITE_FULL(R_l, G_l, B_l, 4, 8);
  1434. WRITE_FULL(R_l, G_l, B_l, 5, 9);
  1435. WRITE_FULL(R_l, G_l, B_l, 6, 10);
  1436. WRITE_FULL(R_l, G_l, B_l, 7, 11);
  1437. WRITE_FULL(R_h, G_h, B_h, 4, 12);
  1438. WRITE_FULL(R_h, G_h, B_h, 5, 13);
  1439. WRITE_FULL(R_h, G_h, B_h, 6, 14);
  1440. WRITE_FULL(R_h, G_h, B_h, 7, 15);
  1441. }
  1442. }
  1443. if (dstW - i >= 8) {
  1444. __m256i b, ub, vb, ub_l, vb_l;
  1445. __m256i y_l, u_l, v_l;
  1446. __m256i R_l, G_l, B_l;
  1447. int n = i << 1;
  1448. DUP2_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, b, ub);
  1449. vb = __lasx_xvldx(vbuf0, n);
  1450. y_l = __lasx_vext2xv_w_h(b);
  1451. DUP2_ARG1(__lasx_vext2xv_w_h, ub, vb, ub_l, vb_l);
  1452. y_l = __lasx_xvslli_w(y_l, 2);
  1453. u_l = __lasx_xvsub_w(ub_l, uv);
  1454. v_l = __lasx_xvsub_w(vb_l, uv);
  1455. u_l = __lasx_xvslli_w(u_l, 2);
  1456. v_l = __lasx_xvslli_w(v_l, 2);
  1457. YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
  1458. y_temp, v2r, v2g, u2g, u2b);
  1459. if(hasAlpha) {
  1460. __m256i a_src, a_l;
  1461. a_src = __lasx_xvldx(abuf0, n);
  1462. a_src = __lasx_vext2xv_w_h(a_src);
  1463. a_l = __lasx_xvadd_w(bias, a_src);
  1464. a_l = __lasx_xvsrai_w(a_l, 7);
  1465. WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
  1466. WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
  1467. WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
  1468. WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
  1469. WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
  1470. WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
  1471. WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
  1472. WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
  1473. } else {
  1474. WRITE_FULL(R_l, G_l, B_l, 0, 0);
  1475. WRITE_FULL(R_l, G_l, B_l, 1, 1);
  1476. WRITE_FULL(R_l, G_l, B_l, 2, 2);
  1477. WRITE_FULL(R_l, G_l, B_l, 3, 3);
  1478. WRITE_FULL(R_l, G_l, B_l, 4, 4);
  1479. WRITE_FULL(R_l, G_l, B_l, 5, 5);
  1480. WRITE_FULL(R_l, G_l, B_l, 6, 6);
  1481. WRITE_FULL(R_l, G_l, B_l, 7, 7);
  1482. }
  1483. i += 8;
  1484. }
  1485. for (; i < dstW; i++) {
  1486. int Y = buf0[i] << 2;
  1487. int U = (ubuf0[i] - uvtemp) << 2;
  1488. int V = (vbuf0[i] - uvtemp) << 2;
  1489. A = 0;
  1490. if(hasAlpha) {
  1491. A = (abuf0[i] + 64) >> 7;
  1492. if (A & 0x100)
  1493. A = av_clip_uint8(A);
  1494. }
  1495. Y -= y_offset;
  1496. Y *= y_coeff;
  1497. Y += ytemp;
  1498. R = (unsigned)Y + V * v2r_coe;
  1499. G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
  1500. B = (unsigned)Y + U * u2b_coe;
  1501. yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
  1502. dest += step;
  1503. }
  1504. } else {
  1505. const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
  1506. int uvtemp = 128 << 8;
  1507. __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
  1508. __m256i zero = __lasx_xvldi(0);
  1509. __m256i bias = __lasx_xvreplgr2vr_h(bias_int);
  1510. for (i = 0; i < len; i += 16) {
  1511. __m256i b, ub0, ub1, vb0, vb1;
  1512. __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od;
  1513. __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
  1514. int n = i << 1;
  1515. DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
  1516. ubuf1, n, b, ub0, vb0, ub1);
  1517. vb1 = __lasx_xvldx(vbuf, n);
  1518. y_ev = __lasx_xvaddwev_w_h(b, zero);
  1519. y_od = __lasx_xvaddwod_w_h(b, zero);
  1520. DUP2_ARG2(__lasx_xvaddwev_w_h, ub0, vb0, ub1, vb1, u_ev, v_ev);
  1521. DUP2_ARG2(__lasx_xvaddwod_w_h, ub0, vb0, ub1, vb1, u_od, v_od);
  1522. DUP2_ARG2(__lasx_xvslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
  1523. DUP4_ARG2(__lasx_xvsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
  1524. u_ev, u_od, v_ev, v_od);
  1525. DUP4_ARG2(__lasx_xvslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
  1526. u_ev, u_od, v_ev, v_od);
  1527. YUV2RGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
  1528. y_temp, v2r, v2g, u2g, u2b);
  1529. YUV2RGB(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
  1530. y_temp, v2r, v2g, u2g, u2b);
  1531. if(hasAlpha) {
  1532. __m256i a_src;
  1533. __m256i a_ev, a_od;
  1534. a_src = __lasx_xvld(abuf0 + i, 0);
  1535. a_ev = __lasx_xvaddwev_w_h(bias, a_src);
  1536. a_od = __lasx_xvaddwod_w_h(bias, a_src);
  1537. a_ev = __lasx_xvsrai_w(a_ev, 7);
  1538. a_od = __lasx_xvsrai_w(a_od, 7);
  1539. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
  1540. WRITE_FULL_A(R_od, G_od, B_od, a_od, 0, 1);
  1541. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 2);
  1542. WRITE_FULL_A(R_od, G_od, B_od, a_od, 1, 3);
  1543. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 4);
  1544. WRITE_FULL_A(R_od, G_od, B_od, a_od, 2, 5);
  1545. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 6);
  1546. WRITE_FULL_A(R_od, G_od, B_od, a_od, 3, 7);
  1547. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 8);
  1548. WRITE_FULL_A(R_od, G_od, B_od, a_od, 4, 9);
  1549. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 10);
  1550. WRITE_FULL_A(R_od, G_od, B_od, a_od, 5, 11);
  1551. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 12);
  1552. WRITE_FULL_A(R_od, G_od, B_od, a_od, 6, 13);
  1553. WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 14);
  1554. WRITE_FULL_A(R_od, G_od, B_od, a_od, 7, 15);
  1555. } else {
  1556. WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
  1557. WRITE_FULL(R_od, G_od, B_od, 0, 1);
  1558. WRITE_FULL(R_ev, G_ev, B_ev, 1, 2);
  1559. WRITE_FULL(R_od, G_od, B_od, 1, 3);
  1560. WRITE_FULL(R_ev, G_ev, B_ev, 2, 4);
  1561. WRITE_FULL(R_od, G_od, B_od, 2, 5);
  1562. WRITE_FULL(R_ev, G_ev, B_ev, 3, 6);
  1563. WRITE_FULL(R_od, G_od, B_od, 3, 7);
  1564. WRITE_FULL(R_ev, G_ev, B_ev, 4, 8);
  1565. WRITE_FULL(R_od, G_od, B_od, 4, 9);
  1566. WRITE_FULL(R_ev, G_ev, B_ev, 5, 10);
  1567. WRITE_FULL(R_od, G_od, B_od, 5, 11);
  1568. WRITE_FULL(R_ev, G_ev, B_ev, 6, 12);
  1569. WRITE_FULL(R_od, G_od, B_od, 6, 13);
  1570. WRITE_FULL(R_ev, G_ev, B_ev, 7, 14);
  1571. WRITE_FULL(R_od, G_od, B_od, 7, 15);
  1572. }
  1573. }
  1574. if (dstW - i >= 8) {
  1575. __m256i b, ub0, ub1, vb0, vb1;
  1576. __m256i y_l, u_l, v_l;
  1577. __m256i R_l, G_l, B_l;
  1578. int n = i << 1;
  1579. DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
  1580. ubuf1, n, b, ub0, vb0, ub1);
  1581. vb1 = __lasx_xvldx(vbuf1, n);
  1582. y_l = __lasx_vext2xv_w_h(b);
  1583. y_l = __lasx_xvslli_w(y_l, 2);
  1584. DUP4_ARG1(__lasx_vext2xv_w_h, ub0, vb0, ub1, vb1,
  1585. ub0, vb0, ub1, vb1);
  1586. DUP2_ARG2(__lasx_xvadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
  1587. u_l = __lasx_xvsub_w(u_l, uv);
  1588. v_l = __lasx_xvsub_w(v_l, uv);
  1589. u_l = __lasx_xvslli_w(u_l, 1);
  1590. v_l = __lasx_xvslli_w(v_l, 1);
  1591. YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
  1592. y_temp, v2r, v2g, u2g, u2b);
  1593. if(hasAlpha) {
  1594. __m256i a_src;
  1595. __m256i a_l;
  1596. a_src = __lasx_xvld(abuf0 + i, 0);
  1597. a_src = __lasx_xvpermi_d(a_src, 0xD8);
  1598. a_src = __lasx_xvilvl_h(a_src, a_src);
  1599. a_l = __lasx_xvaddwev_w_h(bias, a_src);
  1600. a_l = __lasx_xvsrai_w(a_l, 7);
  1601. WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
  1602. WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
  1603. WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
  1604. WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
  1605. WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
  1606. WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
  1607. WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
  1608. WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
  1609. } else {
  1610. WRITE_FULL(R_l, G_l, B_l, 0, 0);
  1611. WRITE_FULL(R_l, G_l, B_l, 1, 1);
  1612. WRITE_FULL(R_l, G_l, B_l, 2, 2);
  1613. WRITE_FULL(R_l, G_l, B_l, 3, 3);
  1614. WRITE_FULL(R_l, G_l, B_l, 4, 4);
  1615. WRITE_FULL(R_l, G_l, B_l, 5, 5);
  1616. WRITE_FULL(R_l, G_l, B_l, 6, 6);
  1617. WRITE_FULL(R_l, G_l, B_l, 7, 7);
  1618. }
  1619. i += 8;
  1620. }
  1621. for (; i < dstW; i++) {
  1622. int Y = buf0[i] << 2;
  1623. int U = (ubuf0[i] + ubuf1[i] - uvtemp) << 1;
  1624. int V = (vbuf0[i] + vbuf1[i] - uvtemp) << 1;
  1625. A = 0;
  1626. if(hasAlpha) {
  1627. A = (abuf0[i] + 64) >> 7;
  1628. if (A & 0x100)
  1629. A = av_clip_uint8(A);
  1630. }
  1631. Y -= y_offset;
  1632. Y *= y_coeff;
  1633. Y += ytemp;
  1634. R = (unsigned)Y + V * v2r_coe;
  1635. G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
  1636. B = (unsigned)Y + U * u2b_coe;
  1637. yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
  1638. dest += step;
  1639. }
  1640. }
  1641. c->dither_error[0][i] = err[0];
  1642. c->dither_error[1][i] = err[1];
  1643. c->dither_error[2][i] = err[2];
  1644. }
  1645. #if CONFIG_SMALL
  1646. YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA,
  1647. CONFIG_SWSCALE_ALPHA && c->needAlpha)
  1648. YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR,
  1649. CONFIG_SWSCALE_ALPHA && c->needAlpha)
  1650. YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA,
  1651. CONFIG_SWSCALE_ALPHA && c->needAlpha)
  1652. YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB,
  1653. CONFIG_SWSCALE_ALPHA && c->needAlpha)
  1654. #else
  1655. #if CONFIG_SWSCALE_ALPHA
  1656. YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA, 1)
  1657. YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR, 1)
  1658. YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA, 1)
  1659. YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB, 1)
  1660. #endif
  1661. YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
  1662. YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
  1663. YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
  1664. YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
  1665. #endif
  1666. YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
  1667. YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
  1668. YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full, AV_PIX_FMT_BGR4_BYTE, 0)
  1669. YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full, AV_PIX_FMT_RGB4_BYTE, 0)
  1670. YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
  1671. YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
  1672. av_cold void ff_sws_init_output_lasx(SwsInternal *c,
  1673. yuv2planar1_fn *yuv2plane1,
  1674. yuv2planarX_fn *yuv2planeX,
  1675. yuv2interleavedX_fn *yuv2nv12cX,
  1676. yuv2packed1_fn *yuv2packed1,
  1677. yuv2packed2_fn *yuv2packed2,
  1678. yuv2packedX_fn *yuv2packedX,
  1679. yuv2anyX_fn *yuv2anyX)
  1680. {
  1681. enum AVPixelFormat dstFormat = c->dstFormat;
  1682. /* Add initialization once optimized */
  1683. if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) {
  1684. } else if (is16BPS(dstFormat)) {
  1685. } else if (isNBPS(dstFormat)) {
  1686. } else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
  1687. } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
  1688. } else {
  1689. *yuv2plane1 = yuv2plane1_8_lasx;
  1690. *yuv2planeX = yuv2planeX_8_lasx;
  1691. }
  1692. if(c->flags & SWS_FULL_CHR_H_INT) {
  1693. switch (c->dstFormat) {
  1694. case AV_PIX_FMT_RGBA:
  1695. #if CONFIG_SMALL
  1696. c->yuv2packedX = yuv2rgba32_full_X_lasx;
  1697. c->yuv2packed2 = yuv2rgba32_full_2_lasx;
  1698. c->yuv2packed1 = yuv2rgba32_full_1_lasx;
  1699. #else
  1700. #if CONFIG_SWSCALE_ALPHA
  1701. if (c->needAlpha) {
  1702. c->yuv2packedX = yuv2rgba32_full_X_lasx;
  1703. c->yuv2packed2 = yuv2rgba32_full_2_lasx;
  1704. c->yuv2packed1 = yuv2rgba32_full_1_lasx;
  1705. } else
  1706. #endif /* CONFIG_SWSCALE_ALPHA */
  1707. {
  1708. c->yuv2packedX = yuv2rgbx32_full_X_lasx;
  1709. c->yuv2packed2 = yuv2rgbx32_full_2_lasx;
  1710. c->yuv2packed1 = yuv2rgbx32_full_1_lasx;
  1711. }
  1712. #endif /* !CONFIG_SMALL */
  1713. break;
  1714. case AV_PIX_FMT_ARGB:
  1715. #if CONFIG_SMALL
  1716. c->yuv2packedX = yuv2argb32_full_X_lasx;
  1717. c->yuv2packed2 = yuv2argb32_full_2_lasx;
  1718. c->yuv2packed1 = yuv2argb32_full_1_lasx;
  1719. #else
  1720. #if CONFIG_SWSCALE_ALPHA
  1721. if (c->needAlpha) {
  1722. c->yuv2packedX = yuv2argb32_full_X_lasx;
  1723. c->yuv2packed2 = yuv2argb32_full_2_lasx;
  1724. c->yuv2packed1 = yuv2argb32_full_1_lasx;
  1725. } else
  1726. #endif /* CONFIG_SWSCALE_ALPHA */
  1727. {
  1728. c->yuv2packedX = yuv2xrgb32_full_X_lasx;
  1729. c->yuv2packed2 = yuv2xrgb32_full_2_lasx;
  1730. c->yuv2packed1 = yuv2xrgb32_full_1_lasx;
  1731. }
  1732. #endif /* !CONFIG_SMALL */
  1733. break;
  1734. case AV_PIX_FMT_BGRA:
  1735. #if CONFIG_SMALL
  1736. c->yuv2packedX = yuv2bgra32_full_X_lasx;
  1737. c->yuv2packed2 = yuv2bgra32_full_2_lasx;
  1738. c->yuv2packed1 = yuv2bgra32_full_1_lasx;
  1739. #else
  1740. #if CONFIG_SWSCALE_ALPHA
  1741. if (c->needAlpha) {
  1742. c->yuv2packedX = yuv2bgra32_full_X_lasx;
  1743. c->yuv2packed2 = yuv2bgra32_full_2_lasx;
  1744. c->yuv2packed1 = yuv2bgra32_full_1_lasx;
  1745. } else
  1746. #endif /* CONFIG_SWSCALE_ALPHA */
  1747. {
  1748. c->yuv2packedX = yuv2bgrx32_full_X_lasx;
  1749. c->yuv2packed2 = yuv2bgrx32_full_2_lasx;
  1750. c->yuv2packed1 = yuv2bgrx32_full_1_lasx;
  1751. }
  1752. #endif /* !CONFIG_SMALL */
  1753. break;
  1754. case AV_PIX_FMT_ABGR:
  1755. #if CONFIG_SMALL
  1756. c->yuv2packedX = yuv2abgr32_full_X_lasx;
  1757. c->yuv2packed2 = yuv2abgr32_full_2_lasx;
  1758. c->yuv2packed1 = yuv2abgr32_full_1_lasx;
  1759. #else
  1760. #if CONFIG_SWSCALE_ALPHA
  1761. if (c->needAlpha) {
  1762. c->yuv2packedX = yuv2abgr32_full_X_lasx;
  1763. c->yuv2packed2 = yuv2abgr32_full_2_lasx;
  1764. c->yuv2packed1 = yuv2abgr32_full_1_lasx;
  1765. } else
  1766. #endif /* CONFIG_SWSCALE_ALPHA */
  1767. {
  1768. c->yuv2packedX = yuv2xbgr32_full_X_lasx;
  1769. c->yuv2packed2 = yuv2xbgr32_full_2_lasx;
  1770. c->yuv2packed1 = yuv2xbgr32_full_1_lasx;
  1771. }
  1772. #endif /* !CONFIG_SMALL */
  1773. break;
  1774. case AV_PIX_FMT_RGB24:
  1775. c->yuv2packedX = yuv2rgb24_full_X_lasx;
  1776. c->yuv2packed2 = yuv2rgb24_full_2_lasx;
  1777. c->yuv2packed1 = yuv2rgb24_full_1_lasx;
  1778. break;
  1779. case AV_PIX_FMT_BGR24:
  1780. c->yuv2packedX = yuv2bgr24_full_X_lasx;
  1781. c->yuv2packed2 = yuv2bgr24_full_2_lasx;
  1782. c->yuv2packed1 = yuv2bgr24_full_1_lasx;
  1783. break;
  1784. case AV_PIX_FMT_BGR4_BYTE:
  1785. c->yuv2packedX = yuv2bgr4_byte_full_X_lasx;
  1786. c->yuv2packed2 = yuv2bgr4_byte_full_2_lasx;
  1787. c->yuv2packed1 = yuv2bgr4_byte_full_1_lasx;
  1788. break;
  1789. case AV_PIX_FMT_RGB4_BYTE:
  1790. c->yuv2packedX = yuv2rgb4_byte_full_X_lasx;
  1791. c->yuv2packed2 = yuv2rgb4_byte_full_2_lasx;
  1792. c->yuv2packed1 = yuv2rgb4_byte_full_1_lasx;
  1793. break;
  1794. case AV_PIX_FMT_BGR8:
  1795. c->yuv2packedX = yuv2bgr8_full_X_lasx;
  1796. c->yuv2packed2 = yuv2bgr8_full_2_lasx;
  1797. c->yuv2packed1 = yuv2bgr8_full_1_lasx;
  1798. break;
  1799. case AV_PIX_FMT_RGB8:
  1800. c->yuv2packedX = yuv2rgb8_full_X_lasx;
  1801. c->yuv2packed2 = yuv2rgb8_full_2_lasx;
  1802. c->yuv2packed1 = yuv2rgb8_full_1_lasx;
  1803. break;
  1804. }
  1805. } else {
  1806. switch (c->dstFormat) {
  1807. case AV_PIX_FMT_RGB32:
  1808. case AV_PIX_FMT_BGR32:
  1809. #if CONFIG_SMALL
  1810. #else
  1811. #if CONFIG_SWSCALE_ALPHA
  1812. if (c->needAlpha) {
  1813. } else
  1814. #endif /* CONFIG_SWSCALE_ALPHA */
  1815. {
  1816. c->yuv2packed1 = yuv2rgbx32_1_lasx;
  1817. c->yuv2packed2 = yuv2rgbx32_2_lasx;
  1818. c->yuv2packedX = yuv2rgbx32_X_lasx;
  1819. }
  1820. #endif /* !CONFIG_SMALL */
  1821. break;
  1822. case AV_PIX_FMT_RGB32_1:
  1823. case AV_PIX_FMT_BGR32_1:
  1824. #if CONFIG_SMALL
  1825. #else
  1826. #if CONFIG_SWSCALE_ALPHA
  1827. if (c->needAlpha) {
  1828. } else
  1829. #endif /* CONFIG_SWSCALE_ALPHA */
  1830. {
  1831. c->yuv2packed1 = yuv2rgbx32_1_1_lasx;
  1832. c->yuv2packed2 = yuv2rgbx32_1_2_lasx;
  1833. c->yuv2packedX = yuv2rgbx32_1_X_lasx;
  1834. }
  1835. #endif /* !CONFIG_SMALL */
  1836. break;
  1837. case AV_PIX_FMT_RGB24:
  1838. c->yuv2packed1 = yuv2rgb24_1_lasx;
  1839. c->yuv2packed2 = yuv2rgb24_2_lasx;
  1840. c->yuv2packedX = yuv2rgb24_X_lasx;
  1841. break;
  1842. case AV_PIX_FMT_BGR24:
  1843. c->yuv2packed1 = yuv2bgr24_1_lasx;
  1844. c->yuv2packed2 = yuv2bgr24_2_lasx;
  1845. c->yuv2packedX = yuv2bgr24_X_lasx;
  1846. break;
  1847. case AV_PIX_FMT_RGB565LE:
  1848. case AV_PIX_FMT_RGB565BE:
  1849. case AV_PIX_FMT_BGR565LE:
  1850. case AV_PIX_FMT_BGR565BE:
  1851. c->yuv2packed1 = yuv2rgb16_1_lasx;
  1852. c->yuv2packed2 = yuv2rgb16_2_lasx;
  1853. c->yuv2packedX = yuv2rgb16_X_lasx;
  1854. break;
  1855. case AV_PIX_FMT_RGB555LE:
  1856. case AV_PIX_FMT_RGB555BE:
  1857. case AV_PIX_FMT_BGR555LE:
  1858. case AV_PIX_FMT_BGR555BE:
  1859. c->yuv2packed1 = yuv2rgb15_1_lasx;
  1860. c->yuv2packed2 = yuv2rgb15_2_lasx;
  1861. c->yuv2packedX = yuv2rgb15_X_lasx;
  1862. break;
  1863. case AV_PIX_FMT_RGB444LE:
  1864. case AV_PIX_FMT_RGB444BE:
  1865. case AV_PIX_FMT_BGR444LE:
  1866. case AV_PIX_FMT_BGR444BE:
  1867. c->yuv2packed1 = yuv2rgb12_1_lasx;
  1868. c->yuv2packed2 = yuv2rgb12_2_lasx;
  1869. c->yuv2packedX = yuv2rgb12_X_lasx;
  1870. break;
  1871. case AV_PIX_FMT_RGB8:
  1872. case AV_PIX_FMT_BGR8:
  1873. c->yuv2packed1 = yuv2rgb8_1_lasx;
  1874. c->yuv2packed2 = yuv2rgb8_2_lasx;
  1875. c->yuv2packedX = yuv2rgb8_X_lasx;
  1876. break;
  1877. case AV_PIX_FMT_RGB4:
  1878. case AV_PIX_FMT_BGR4:
  1879. c->yuv2packed1 = yuv2rgb4_1_lasx;
  1880. c->yuv2packed2 = yuv2rgb4_2_lasx;
  1881. c->yuv2packedX = yuv2rgb4_X_lasx;
  1882. break;
  1883. case AV_PIX_FMT_RGB4_BYTE:
  1884. case AV_PIX_FMT_BGR4_BYTE:
  1885. c->yuv2packed1 = yuv2rgb4b_1_lasx;
  1886. c->yuv2packed2 = yuv2rgb4b_2_lasx;
  1887. c->yuv2packedX = yuv2rgb4b_X_lasx;
  1888. break;
  1889. }
  1890. }
  1891. }