/*
 * Copyright (C) 2023 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "swscale_loongarch.h"
#include "libavutil/loongarch/loongson_intrinsics.h"

/* Copy from libswscale/output.c */
static av_always_inline void
yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
              unsigned A1, unsigned A2,
              const void *_r, const void *_g, const void *_b, int y,
              enum AVPixelFormat target, int hasAlpha)
{
    if (target == AV_PIX_FMT_ARGB || target == AV_PIX_FMT_RGBA ||
        target == AV_PIX_FMT_ABGR || target == AV_PIX_FMT_BGRA) {
        uint32_t *dest = (uint32_t *) _dest;
        const uint32_t *r = (const uint32_t *) _r;
        const uint32_t *g = (const uint32_t *) _g;
        const uint32_t *b = (const uint32_t *) _b;

#if CONFIG_SMALL
        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
#else
#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
        int sh = (target == AV_PIX_FMT_RGB32_1 ||
                  target == AV_PIX_FMT_BGR32_1) ? 0 : 24;
        av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0xFF);
#endif
        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
#endif
    } else if (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) {
        uint8_t *dest = (uint8_t *) _dest;
        const uint8_t *r = (const uint8_t *) _r;
        const uint8_t *g = (const uint8_t *) _g;
        const uint8_t *b = (const uint8_t *) _b;

#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)

        dest[i * 6 + 0] = r_b[Y1];
        dest[i * 6 + 1] =   g[Y1];
        dest[i * 6 + 2] = b_r[Y1];
        dest[i * 6 + 3] = r_b[Y2];
        dest[i * 6 + 4] =   g[Y2];
        dest[i * 6 + 5] = b_r[Y2];

#undef r_b
#undef b_r
    } else if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565 ||
               target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555 ||
               target == AV_PIX_FMT_RGB444 || target == AV_PIX_FMT_BGR444) {
        uint16_t *dest = (uint16_t *) _dest;
        const uint16_t *r = (const uint16_t *) _r;
        const uint16_t *g = (const uint16_t *) _g;
        const uint16_t *b = (const uint16_t *) _b;
        int dr1, dg1, db1, dr2, dg2, db2;

        if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565) {
            dr1 = ff_dither_2x2_8[ y & 1     ][0];
            dg1 = ff_dither_2x2_4[ y & 1     ][0];
            db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
            dr2 = ff_dither_2x2_8[ y & 1     ][1];
            dg2 = ff_dither_2x2_4[ y & 1     ][1];
            db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
        } else if (target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555) {
            dr1 = ff_dither_2x2_8[ y & 1     ][0];
            dg1 = ff_dither_2x2_8[ y & 1     ][1];
            db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
            dr2 = ff_dither_2x2_8[ y & 1     ][1];
            dg2 = ff_dither_2x2_8[ y & 1     ][0];
            db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
        } else {
            dr1 = ff_dither_4x4_16[ y & 3     ][0];
            dg1 = ff_dither_4x4_16[ y & 3     ][1];
            db1 = ff_dither_4x4_16[(y & 3) ^ 3][0];
            dr2 = ff_dither_4x4_16[ y & 3     ][1];
            dg2 = ff_dither_4x4_16[ y & 3     ][0];
            db2 = ff_dither_4x4_16[(y & 3) ^ 3][1];
        }

        dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
        dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
    } else { /* 8/4 bits */
        uint8_t *dest = (uint8_t *) _dest;
        const uint8_t *r = (const uint8_t *) _r;
        const uint8_t *g = (const uint8_t *) _g;
        const uint8_t *b = (const uint8_t *) _b;
        int dr1, dg1, db1, dr2, dg2, db2;

        if (target == AV_PIX_FMT_RGB8 || target == AV_PIX_FMT_BGR8) {
            const uint8_t * const d64 = ff_dither_8x8_73[y & 7];
            const uint8_t * const d32 = ff_dither_8x8_32[y & 7];
            dr1 = dg1 = d32[(i * 2 + 0) & 7];
            db1 =       d64[(i * 2 + 0) & 7];
            dr2 = dg2 = d32[(i * 2 + 1) & 7];
            db2 =       d64[(i * 2 + 1) & 7];
        } else {
            const uint8_t * const d64  = ff_dither_8x8_73 [y & 7];
            const uint8_t * const d128 = ff_dither_8x8_220[y & 7];
            dr1 = db1 = d128[(i * 2 + 0) & 7];
            dg1 =        d64[(i * 2 + 0) & 7];
            dr2 = db2 = d128[(i * 2 + 1) & 7];
            dg2 =        d64[(i * 2 + 1) & 7];
        }

        if (target == AV_PIX_FMT_RGB4 || target == AV_PIX_FMT_BGR4) {
            dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
                      ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
        } else {
            dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
            dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
        }
    }
}
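
/* Pull one scalar Y1/Y2/U/V quadruple out of the given vector lanes
 * (t1..t4), resolve the per-component RGB contributions through the
 * context lookup tables and emit one pixel pair via yuv2rgb_write().
 * Relies on Y1, Y2, U, V, r, g, b, dest, count, y and target being in
 * scope at the expansion site; advances `count` by one pixel pair. */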
#define WRITE_YUV2RGB_LSX(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4) \
{                                                                       \
    Y1 = __lsx_vpickve2gr_w(vec_y1, t1);                                \
    Y2 = __lsx_vpickve2gr_w(vec_y2, t2);                                \
    U  = __lsx_vpickve2gr_w(vec_u, t3);                                 \
    V  = __lsx_vpickve2gr_w(vec_v, t4);                                 \
    r  = c->table_rV[V];                                                \
    g  = (c->table_gU[U] + c->table_gV[V]);                             \
    b  = c->table_bU[U];                                                \
    yuv2rgb_write(dest, count, Y1, Y2, 0, 0,                            \
                  r, g, b, y, target, 0);                               \
    count++;                                                            \
}
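
/* Full vertical-filter path: every output sample is a weighted sum over
 * lumFilterSize luma lines and chrFilterSize chroma lines. The main loop
 * produces 32 output pixels (16 pixel pairs) per iteration; the residue
 * is handled in progressively narrower 16/8/4/2-pixel blocks, and a
 * scalar loop finishes whatever is left. */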
static void
yuv2rgb_X_template_lsx(SwsInternal *c, const int16_t *lumFilter,
                       const int16_t **lumSrc, int lumFilterSize,
                       const int16_t *chrFilter, const int16_t **chrUSrc,
                       const int16_t **chrVSrc, int chrFilterSize,
                       const int16_t **alpSrc, uint8_t *dest, int dstW,
                       int y, enum AVPixelFormat target, int hasAlpha)
{
    int i, j;
    int count = 0;
    int t     = 1 << 18;
    int len   = dstW >> 5;
    int res   = dstW & 31;
    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;
    int head = YUVRGB_TABLE_HEADROOM;
    __m128i headroom = __lsx_vreplgr2vr_w(head);

    for (i = 0; i < len; i++) {
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
        __m128i yl_ev, yl_ev1, yl_ev2, yl_od1, yl_od2, yh_ev1, yh_ev2, yh_od1, yh_od2;
        __m128i u_ev1, u_ev2, u_od1, u_od2, v_ev1, v_ev2, v_od1, v_od2, temp;

        yl_ev  = __lsx_vldrepl_w(&t, 0);
        yl_ev1 = yl_ev;
        yl_od1 = yl_ev;
        yh_ev1 = yl_ev;
        yh_od1 = yl_ev;
        u_ev1  = yl_ev;
        v_ev1  = yl_ev;
        u_od1  = yl_ev;
        v_od1  = yl_ev;
        yl_ev2 = yl_ev;
        yl_od2 = yl_ev;
        yh_ev2 = yl_ev;
        yh_od2 = yl_ev;
        u_ev2  = yl_ev;
        v_ev2  = yl_ev;
        u_od2  = yl_ev;
        v_od2  = yl_ev;

        for (j = 0; j < lumFilterSize; j++) {
            temp = __lsx_vldrepl_h((lumFilter + j), 0);
            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
                      16, l_src1, l_src2);
            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 32, lumSrc[j] + count_lum,
                      48, l_src3, l_src4);
            yl_ev1 = __lsx_vmaddwev_w_h(yl_ev1, temp, l_src1);
            yl_od1 = __lsx_vmaddwod_w_h(yl_od1, temp, l_src1);
            yh_ev1 = __lsx_vmaddwev_w_h(yh_ev1, temp, l_src3);
            yh_od1 = __lsx_vmaddwod_w_h(yh_od1, temp, l_src3);
            yl_ev2 = __lsx_vmaddwev_w_h(yl_ev2, temp, l_src2);
            yl_od2 = __lsx_vmaddwod_w_h(yl_od2, temp, l_src2);
            yh_ev2 = __lsx_vmaddwev_w_h(yh_ev2, temp, l_src4);
            yh_od2 = __lsx_vmaddwod_w_h(yh_od2, temp, l_src4);
        }
        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src1, v_src1);
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 16, chrVSrc[j] + count, 16,
                      u_src2, v_src2);
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev1 = __lsx_vmaddwev_w_h(u_ev1, temp, u_src1);
            u_od1 = __lsx_vmaddwod_w_h(u_od1, temp, u_src1);
            v_ev1 = __lsx_vmaddwev_w_h(v_ev1, temp, v_src1);
            v_od1 = __lsx_vmaddwod_w_h(v_od1, temp, v_src1);
            u_ev2 = __lsx_vmaddwev_w_h(u_ev2, temp, u_src2);
            u_od2 = __lsx_vmaddwod_w_h(u_od2, temp, u_src2);
            v_ev2 = __lsx_vmaddwev_w_h(v_ev2, temp, v_src2);
            v_od2 = __lsx_vmaddwod_w_h(v_od2, temp, v_src2);
        }
        yl_ev1 = __lsx_vsrai_w(yl_ev1, 19);
        yh_ev1 = __lsx_vsrai_w(yh_ev1, 19);
        yl_od1 = __lsx_vsrai_w(yl_od1, 19);
        yh_od1 = __lsx_vsrai_w(yh_od1, 19);
        u_ev1  = __lsx_vsrai_w(u_ev1, 19);
        v_ev1  = __lsx_vsrai_w(v_ev1, 19);
        u_od1  = __lsx_vsrai_w(u_od1, 19);
        v_od1  = __lsx_vsrai_w(v_od1, 19);
        yl_ev2 = __lsx_vsrai_w(yl_ev2, 19);
        yh_ev2 = __lsx_vsrai_w(yh_ev2, 19);
        yl_od2 = __lsx_vsrai_w(yl_od2, 19);
        yh_od2 = __lsx_vsrai_w(yh_od2, 19);
        u_ev2  = __lsx_vsrai_w(u_ev2, 19);
        v_ev2  = __lsx_vsrai_w(v_ev2, 19);
        u_od2  = __lsx_vsrai_w(u_od2, 19);
        v_od2  = __lsx_vsrai_w(v_od2, 19);
        u_ev1  = __lsx_vadd_w(u_ev1, headroom);
        v_ev1  = __lsx_vadd_w(v_ev1, headroom);
        u_od1  = __lsx_vadd_w(u_od1, headroom);
        v_od1  = __lsx_vadd_w(v_od1, headroom);
        u_ev2  = __lsx_vadd_w(u_ev2, headroom);
        v_ev2  = __lsx_vadd_w(v_ev2, headroom);
        u_od2  = __lsx_vadd_w(u_od2, headroom);
        v_od2  = __lsx_vadd_w(v_od2, headroom);

        WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_ev1, v_ev1, 0, 0, 0, 0);
        WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_od1, v_od1, 1, 1, 0, 0);
        WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_ev1, v_ev1, 2, 2, 1, 1);
        WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_od1, v_od1, 3, 3, 1, 1);
        WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_ev1, v_ev1, 0, 0, 2, 2);
        WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_od1, v_od1, 1, 1, 2, 2);
        WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_ev1, v_ev1, 2, 2, 3, 3);
        WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_od1, v_od1, 3, 3, 3, 3);
        WRITE_YUV2RGB_LSX(yh_ev1, yh_od1, u_ev2, v_ev2, 0, 0, 0, 0);
        WRITE_YUV2RGB_LSX(yh_ev1, yh_od1, u_od2, v_od2, 1, 1, 0, 0);
        WRITE_YUV2RGB_LSX(yh_ev1, yh_od1, u_ev2, v_ev2, 2, 2, 1, 1);
        WRITE_YUV2RGB_LSX(yh_ev1, yh_od1, u_od2, v_od2, 3, 3, 1, 1);
        WRITE_YUV2RGB_LSX(yh_ev2, yh_od2, u_ev2, v_ev2, 0, 0, 2, 2);
        WRITE_YUV2RGB_LSX(yh_ev2, yh_od2, u_od2, v_od2, 1, 1, 2, 2);
        WRITE_YUV2RGB_LSX(yh_ev2, yh_od2, u_ev2, v_ev2, 2, 2, 3, 3);
        WRITE_YUV2RGB_LSX(yh_ev2, yh_od2, u_od2, v_od2, 3, 3, 3, 3);
    }
    if (res >= 16) {
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, l_src2, u_src1, v_src1;
        __m128i yl_ev, yl_ev1, yl_ev2, yl_od1, yl_od2;
        __m128i u_ev1, u_od1, v_ev1, v_od1, temp;

        yl_ev  = __lsx_vldrepl_w(&t, 0);
        yl_ev1 = yl_ev;
        yl_od1 = yl_ev;
        u_ev1  = yl_ev;
        v_ev1  = yl_ev;
        u_od1  = yl_ev;
        v_od1  = yl_ev;
        yl_ev2 = yl_ev;
        yl_od2 = yl_ev;

        for (j = 0; j < lumFilterSize; j++) {
            temp = __lsx_vldrepl_h((lumFilter + j), 0);
            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
                      16, l_src1, l_src2);
            yl_ev1 = __lsx_vmaddwev_w_h(yl_ev1, temp, l_src1);
            yl_od1 = __lsx_vmaddwod_w_h(yl_od1, temp, l_src1);
            yl_ev2 = __lsx_vmaddwev_w_h(yl_ev2, temp, l_src2);
            yl_od2 = __lsx_vmaddwod_w_h(yl_od2, temp, l_src2);
        }
        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src1, v_src1);
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev1 = __lsx_vmaddwev_w_h(u_ev1, temp, u_src1);
            u_od1 = __lsx_vmaddwod_w_h(u_od1, temp, u_src1);
            v_ev1 = __lsx_vmaddwev_w_h(v_ev1, temp, v_src1);
            v_od1 = __lsx_vmaddwod_w_h(v_od1, temp, v_src1);
        }
        yl_ev1 = __lsx_vsrai_w(yl_ev1, 19);
        yl_od1 = __lsx_vsrai_w(yl_od1, 19);
        u_ev1  = __lsx_vsrai_w(u_ev1, 19);
        v_ev1  = __lsx_vsrai_w(v_ev1, 19);
        u_od1  = __lsx_vsrai_w(u_od1, 19);
        v_od1  = __lsx_vsrai_w(v_od1, 19);
        yl_ev2 = __lsx_vsrai_w(yl_ev2, 19);
        yl_od2 = __lsx_vsrai_w(yl_od2, 19);
        u_ev1  = __lsx_vadd_w(u_ev1, headroom);
        v_ev1  = __lsx_vadd_w(v_ev1, headroom);
        u_od1  = __lsx_vadd_w(u_od1, headroom);
        v_od1  = __lsx_vadd_w(v_od1, headroom);

        WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_ev1, v_ev1, 0, 0, 0, 0);
        WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_od1, v_od1, 1, 1, 0, 0);
        WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_ev1, v_ev1, 2, 2, 1, 1);
        WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_od1, v_od1, 3, 3, 1, 1);
        WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_ev1, v_ev1, 0, 0, 2, 2);
        WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_od1, v_od1, 1, 1, 2, 2);
        WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_ev1, v_ev1, 2, 2, 3, 3);
        WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_od1, v_od1, 3, 3, 3, 3);
        res -= 16;
    }
    if (res >= 8) {
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, u_src, v_src;
        __m128i yl_ev, yl_od;
        __m128i u_ev, u_od, v_ev, v_od, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);
        yl_od = yl_ev;
        u_ev  = yl_ev;
        v_ev  = yl_ev;
        u_od  = yl_ev;
        v_od  = yl_ev;

        for (j = 0; j < lumFilterSize; j++) {
            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
            yl_ev  = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
            yl_od  = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
        }
        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src, v_src);
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
            u_od = __lsx_vmaddwod_w_h(u_od, temp, u_src);
            v_ev = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
            v_od = __lsx_vmaddwod_w_h(v_od, temp, v_src);
        }
        yl_ev = __lsx_vsrai_w(yl_ev, 19);
        yl_od = __lsx_vsrai_w(yl_od, 19);
        u_ev  = __lsx_vsrai_w(u_ev, 19);
        v_ev  = __lsx_vsrai_w(v_ev, 19);
        u_od  = __lsx_vsrai_w(u_od, 19);
        v_od  = __lsx_vsrai_w(v_od, 19);
        u_ev  = __lsx_vadd_w(u_ev, headroom);
        v_ev  = __lsx_vadd_w(v_ev, headroom);
        u_od  = __lsx_vadd_w(u_od, headroom);
        v_od  = __lsx_vadd_w(v_od, headroom);

        WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
        WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0);
        WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_ev, v_ev, 2, 2, 1, 1);
        WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_od, v_od, 3, 3, 1, 1);
        res -= 8;
    }
    if (res >= 4) {
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, u_src, v_src;
        __m128i yl_ev, yl_od;
        __m128i u_ev, u_od, v_ev, v_od, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);
        yl_od = yl_ev;
        u_ev  = yl_ev;
        v_ev  = yl_ev;
        u_od  = yl_ev;
        v_od  = yl_ev;

        for (j = 0; j < lumFilterSize; j++) {
            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
            yl_ev  = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
            yl_od  = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
        }
        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src, v_src);
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
            u_od = __lsx_vmaddwod_w_h(u_od, temp, u_src);
            v_ev = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
            v_od = __lsx_vmaddwod_w_h(v_od, temp, v_src);
        }
        yl_ev = __lsx_vsrai_w(yl_ev, 19);
        yl_od = __lsx_vsrai_w(yl_od, 19);
        u_ev  = __lsx_vsrai_w(u_ev, 19);
        v_ev  = __lsx_vsrai_w(v_ev, 19);
        u_od  = __lsx_vsrai_w(u_od, 19);
        v_od  = __lsx_vsrai_w(v_od, 19);
        u_ev  = __lsx_vadd_w(u_ev, headroom);
        v_ev  = __lsx_vadd_w(v_ev, headroom);
        u_od  = __lsx_vadd_w(u_od, headroom);
        v_od  = __lsx_vadd_w(v_od, headroom);

        WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
        WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0);
        res -= 4;
    }
    if (res >= 2) {
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, u_src, v_src;
        __m128i yl_ev, yl_od;
        __m128i u_ev, u_od, v_ev, v_od, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);
        yl_od = yl_ev;
        u_ev  = yl_ev;
        v_ev  = yl_ev;
        u_od  = yl_ev;
        v_od  = yl_ev;

        for (j = 0; j < lumFilterSize; j++) {
            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
            yl_ev  = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
            yl_od  = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
        }
        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src, v_src);
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
            u_od = __lsx_vmaddwod_w_h(u_od, temp, u_src);
            v_ev = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
            v_od = __lsx_vmaddwod_w_h(v_od, temp, v_src);
        }
        yl_ev = __lsx_vsrai_w(yl_ev, 19);
        yl_od = __lsx_vsrai_w(yl_od, 19);
        u_ev  = __lsx_vsrai_w(u_ev, 19);
        v_ev  = __lsx_vsrai_w(v_ev, 19);
        u_od  = __lsx_vsrai_w(u_od, 19);
        v_od  = __lsx_vsrai_w(v_od, 19);
        u_ev  = __lsx_vadd_w(u_ev, headroom);
        v_ev  = __lsx_vadd_w(v_ev, headroom);
        u_od  = __lsx_vadd_w(u_od, headroom);
        v_od  = __lsx_vadd_w(v_od, headroom);

        WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
        res -= 2;
    }
    for (; count < len_count; count++) {
        int Y1 = 1 << 18;
        int Y2 = Y1;
        int U  = Y1;
        int V  = Y1;

        for (j = 0; j < lumFilterSize; j++) {
            Y1 += lumSrc[j][count * 2]     * lumFilter[j];
            Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
        }
        for (j = 0; j < chrFilterSize; j++) {
            U += chrUSrc[j][count] * chrFilter[j];
            V += chrVSrc[j][count] * chrFilter[j];
        }
        Y1 >>= 19;
        Y2 >>= 19;
        U  >>= 19;
        V  >>= 19;
        r = c->table_rV[V + YUVRGB_TABLE_HEADROOM];
        g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
             c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
        b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];

        yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
                      r, g, b, y, target, 0);
    }
}
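
/* Bilinear two-line path: blend buf[0]/buf[1] (and the matching chroma
 * line pairs) with the 12-bit yalpha/uvalpha weights, eight luma samples
 * per iteration, then a 4-sample block and a scalar tail loop. */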
static void
yuv2rgb_2_template_lsx(SwsInternal *c, const int16_t *buf[2],
                       const int16_t *ubuf[2], const int16_t *vbuf[2],
                       const int16_t *abuf[2], uint8_t *dest, int dstW,
                       int yalpha, int uvalpha, int y,
                       enum AVPixelFormat target, int hasAlpha)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
    int yalpha1  = 4096 - yalpha;
    int uvalpha1 = 4096 - uvalpha;
    int i, count = 0;
    int len = dstW - 7;
    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;
    int head = YUVRGB_TABLE_HEADROOM;
    __m128i v_yalpha1  = __lsx_vreplgr2vr_w(yalpha1);
    __m128i v_uvalpha1 = __lsx_vreplgr2vr_w(uvalpha1);
    __m128i v_yalpha   = __lsx_vreplgr2vr_w(yalpha);
    __m128i v_uvalpha  = __lsx_vreplgr2vr_w(uvalpha);
    __m128i headroom   = __lsx_vreplgr2vr_w(head);
    __m128i zero       = __lsx_vldi(0);

    for (i = 0; i < len; i += 8) {
        int Y1, Y2, U, V;
        int i_dex = i << 1;
        int c_dex = count << 1;
        __m128i y0_h, y0_l, y0, u0, v0;
        __m128i y1_h, y1_l, y1, u1, v1;
        __m128i y_l, y_h, u, v;

        DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
                  buf1, i_dex, y0, u0, v0, y1);
        DUP2_ARG2(__lsx_vldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
        DUP2_ARG2(__lsx_vsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
        DUP2_ARG1(__lsx_vexth_w_h, y0, y1, y0_h, y1_h);
        DUP4_ARG2(__lsx_vilvl_h, zero, u0, zero, u1, zero, v0, zero, v1,
                  u0, u1, v0, v1);
        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        y0_h = __lsx_vmul_w(y0_h, v_yalpha1);
        u0   = __lsx_vmul_w(u0, v_uvalpha1);
        v0   = __lsx_vmul_w(v0, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        y_h  = __lsx_vmadd_w(y0_h, v_yalpha, y1_h);
        u    = __lsx_vmadd_w(u0, v_uvalpha, u1);
        v    = __lsx_vmadd_w(v0, v_uvalpha, v1);
        y_l  = __lsx_vsrai_w(y_l, 19);
        y_h  = __lsx_vsrai_w(y_h, 19);
        u    = __lsx_vsrai_w(u, 19);
        v    = __lsx_vsrai_w(v, 19);
        u    = __lsx_vadd_w(u, headroom);
        v    = __lsx_vadd_w(v, headroom);
        WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 0, 1, 0, 0);
        WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 2, 3, 1, 1);
        WRITE_YUV2RGB_LSX(y_h, y_h, u, v, 0, 1, 2, 2);
        WRITE_YUV2RGB_LSX(y_h, y_h, u, v, 2, 3, 3, 3);
    }
    if (dstW - i >= 4) {
        int Y1, Y2, U, V;
        int i_dex = i << 1;
        __m128i y0_l, y0, u0, v0;
        __m128i y1_l, y1, u1, v1;
        __m128i y_l, u, v;

        y0 = __lsx_vldx(buf0, i_dex);
        u0 = __lsx_vldrepl_d((ubuf0 + count), 0);
        v0 = __lsx_vldrepl_d((vbuf0 + count), 0);
        y1 = __lsx_vldx(buf1, i_dex);
        u1 = __lsx_vldrepl_d((ubuf1 + count), 0);
        v1 = __lsx_vldrepl_d((vbuf1 + count), 0);
        DUP2_ARG2(__lsx_vilvl_h, zero, y0, zero, y1, y0_l, y1_l);
        DUP4_ARG2(__lsx_vilvl_h, zero, u0, zero, u1, zero, v0, zero, v1,
                  u0, u1, v0, v1);
        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        u0   = __lsx_vmul_w(u0, v_uvalpha1);
        v0   = __lsx_vmul_w(v0, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        u    = __lsx_vmadd_w(u0, v_uvalpha, u1);
        v    = __lsx_vmadd_w(v0, v_uvalpha, v1);
        y_l  = __lsx_vsrai_w(y_l, 19);
        u    = __lsx_vsrai_w(u, 19);
        v    = __lsx_vsrai_w(v, 19);
        u    = __lsx_vadd_w(u, headroom);
        v    = __lsx_vadd_w(v, headroom);
        WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 0, 1, 0, 0);
        WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 2, 3, 1, 1);
        i += 4;
    }
    for (; count < len_count; count++) {
        int Y1 = (buf0[count * 2]     * yalpha1 +
                  buf1[count * 2]     * yalpha) >> 19;
        int Y2 = (buf0[count * 2 + 1] * yalpha1 +
                  buf1[count * 2 + 1] * yalpha) >> 19;
        int U  = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
        int V  = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;

        r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
        g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
             c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
        b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];

        yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
                      r, g, b, y, target, 0);
    }
}
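
/* Single input line: with uvalpha == 0 the chroma comes from
 * ubuf[0]/vbuf[0] alone (samples rounded down from 14 to 7 bits),
 * otherwise the two chroma lines are blended with uvalpha weights. */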
static void
yuv2rgb_1_template_lsx(SwsInternal *c, const int16_t *buf0,
                       const int16_t *ubuf[2], const int16_t *vbuf[2],
                       const int16_t *abuf0, uint8_t *dest, int dstW,
                       int uvalpha, int y, enum AVPixelFormat target,
                       int hasAlpha)
{
    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
    int i;
    int len = (dstW - 7);
    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;

    if (uvalpha == 0) {
        int count = 0;
        int head = YUVRGB_TABLE_HEADROOM;
        __m128i headroom = __lsx_vreplgr2vr_h(head);

        for (i = 0; i < len; i += 8) {
            int Y1, Y2, U, V;
            int i_dex = i << 1;
            int c_dex = count << 1;
            __m128i src_y, src_u, src_v;
            __m128i u, v, uv, y_l, y_h;

            src_y = __lsx_vldx(buf0, i_dex);
            DUP2_ARG2(__lsx_vldx, ubuf0, c_dex, vbuf0, c_dex, src_u, src_v);
            src_y = __lsx_vsrari_h(src_y, 7);
            src_u = __lsx_vsrari_h(src_u, 7);
            src_v = __lsx_vsrari_h(src_v, 7);
            y_l   = __lsx_vsllwil_w_h(src_y, 0);
            y_h   = __lsx_vexth_w_h(src_y);
            uv    = __lsx_vilvl_h(src_v, src_u);
            u     = __lsx_vaddwev_w_h(uv, headroom);
            v     = __lsx_vaddwod_w_h(uv, headroom);
            WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 0, 1, 0, 0);
            WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 2, 3, 1, 1);
            WRITE_YUV2RGB_LSX(y_h, y_h, u, v, 0, 1, 2, 2);
            WRITE_YUV2RGB_LSX(y_h, y_h, u, v, 2, 3, 3, 3);
        }
        if (dstW - i >= 4) {
            int Y1, Y2, U, V;
            int i_dex = i << 1;
            __m128i src_y, src_u, src_v;
            __m128i y_l, u, v, uv;

            src_y = __lsx_vldx(buf0, i_dex);
            src_u = __lsx_vldrepl_d((ubuf0 + count), 0);
            src_v = __lsx_vldrepl_d((vbuf0 + count), 0);
            y_l   = __lsx_vsrari_h(src_y, 7);
            y_l   = __lsx_vsllwil_w_h(y_l, 0);
            uv    = __lsx_vilvl_h(src_v, src_u);
            uv    = __lsx_vsrari_h(uv, 7);
            u     = __lsx_vaddwev_w_h(uv, headroom);
            v     = __lsx_vaddwod_w_h(uv, headroom);
            WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 0, 1, 0, 0);
            WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 2, 3, 1, 1);
            i += 4;
        }
        for (; count < len_count; count++) {
            int Y1 = (buf0[count * 2    ] + 64) >> 7;
            int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
            int U  = (ubuf0[count] + 64) >> 7;
            int V  = (vbuf0[count] + 64) >> 7;

            r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
            g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
                 c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
            b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];

            yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
                          r, g, b, y, target, 0);
        }
    } else {
        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
        int count = 0;
        int HEADROOM = YUVRGB_TABLE_HEADROOM;
        int uvalpha1 = 4096 - uvalpha;
        __m128i headroom     = __lsx_vreplgr2vr_w(HEADROOM);
        __m128i uvalpha_tmp1 = __lsx_vreplgr2vr_h(uvalpha1);
        __m128i uvalpha_tmp  = __lsx_vreplgr2vr_h(uvalpha);

        for (i = 0; i < len; i += 8) {
            int Y1, Y2, U, V;
            int i_dex = i << 1;
            int c_dex = count << 1;
            __m128i src_y, src_u0, src_v0, src_u1, src_v1;
            __m128i y_l, y_h, u1, u2, v1, v2, u_ev, v_od;

            DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
                      ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
            src_v1 = __lsx_vldx(vbuf1, c_dex);
            src_y  = __lsx_vsrari_h(src_y, 7);
            u_ev   = __lsx_vmulwev_w_h(src_u0, uvalpha_tmp1);
            v_od   = __lsx_vmulwod_w_h(src_u0, uvalpha_tmp1);
            u1     = __lsx_vmaddwev_w_h(u_ev, src_u1, uvalpha_tmp);
            v1     = __lsx_vmaddwod_w_h(v_od, src_u1, uvalpha_tmp);
            u_ev   = __lsx_vmulwev_w_h(src_v0, uvalpha_tmp1);
            v_od   = __lsx_vmulwod_w_h(src_v0, uvalpha_tmp1);
            u2     = __lsx_vmaddwev_w_h(u_ev, src_v1, uvalpha_tmp);
            v2     = __lsx_vmaddwod_w_h(v_od, src_v1, uvalpha_tmp);
            y_l    = __lsx_vsllwil_w_h(src_y, 0);
            y_h    = __lsx_vexth_w_h(src_y);
            u1     = __lsx_vsrari_w(u1, 19);
            v1     = __lsx_vsrari_w(v1, 19);
            u2     = __lsx_vsrari_w(u2, 19);
            v2     = __lsx_vsrari_w(v2, 19);
            u1     = __lsx_vadd_w(u1, headroom);
            v1     = __lsx_vadd_w(v1, headroom);
            u2     = __lsx_vadd_w(u2, headroom);
            v2     = __lsx_vadd_w(v2, headroom);
            WRITE_YUV2RGB_LSX(y_l, y_l, u1, u2, 0, 1, 0, 0);
            WRITE_YUV2RGB_LSX(y_l, y_l, v1, v2, 2, 3, 0, 0);
            WRITE_YUV2RGB_LSX(y_h, y_h, u1, u2, 0, 1, 1, 1);
            WRITE_YUV2RGB_LSX(y_h, y_h, v1, v2, 2, 3, 1, 1);
        }
        for (; count < len_count; count++) {
            int Y1 = (buf0[count * 2    ] + 64) >> 7;
            int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
            int U  = (ubuf0[count] + ubuf1[count] + 128) >> 8;
            int V  = (vbuf0[count] + vbuf1[count] + 128) >> 8;

            r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
            g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
                 c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
            b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];

            yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
                          r, g, b, y, target, 0);
        }
    }
}
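
/* Instantiate the _X/_2/_1 entry points for one output pixel format from
 * the templates above; since the templates are always-inlined with `fmt`
 * and `hasAlpha` as compile-time constants, the per-format branches can
 * be folded away by the compiler. */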
#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                               \
static void name ## ext ## _X_lsx(SwsInternal *c, const int16_t *lumFilter,          \
                                  const int16_t **lumSrc, int lumFilterSize,         \
                                  const int16_t *chrFilter, const int16_t **chrUSrc, \
                                  const int16_t **chrVSrc, int chrFilterSize,        \
                                  const int16_t **alpSrc, uint8_t *dest, int dstW,   \
                                  int y)                                              \
{                                                                                     \
    name ## base ## _X_template_lsx(c, lumFilter, lumSrc, lumFilterSize,              \
                                    chrFilter, chrUSrc, chrVSrc, chrFilterSize,       \
                                    alpSrc, dest, dstW, y, fmt, hasAlpha);            \
}

#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                              \
YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                                       \
static void name ## ext ## _2_lsx(SwsInternal *c, const int16_t *buf[2],             \
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],    \
                                  const int16_t *abuf[2], uint8_t *dest, int dstW,   \
                                  int yalpha, int uvalpha, int y)                     \
{                                                                                     \
    name ## base ## _2_template_lsx(c, buf, ubuf, vbuf, abuf, dest,                   \
                                    dstW, yalpha, uvalpha, y, fmt, hasAlpha);         \
}

#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)                                \
YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                                      \
static void name ## ext ## _1_lsx(SwsInternal *c, const int16_t *buf0,               \
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],    \
                                  const int16_t *abuf0, uint8_t *dest, int dstW,     \
                                  int uvalpha, int y)                                 \
{                                                                                     \
    name ## base ## _1_template_lsx(c, buf0, ubuf, vbuf, abuf0, dest,                 \
                                    dstW, uvalpha, y, fmt, hasAlpha);                 \
}

#if CONFIG_SMALL
#else
#if CONFIG_SWSCALE_ALPHA
#endif
YUV2RGBWRAPPER(yuv2rgb,, x32_1, AV_PIX_FMT_RGB32_1, 0)
YUV2RGBWRAPPER(yuv2rgb,, x32,   AV_PIX_FMT_RGB32,   0)
#endif
YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
YUV2RGBWRAPPER(yuv2rgb,, 16, AV_PIX_FMT_RGB565,    0)
YUV2RGBWRAPPER(yuv2rgb,, 15, AV_PIX_FMT_RGB555,    0)
YUV2RGBWRAPPER(yuv2rgb,, 12, AV_PIX_FMT_RGB444,    0)
YUV2RGBWRAPPER(yuv2rgb,,  8, AV_PIX_FMT_RGB8,      0)
YUV2RGBWRAPPER(yuv2rgb,,  4, AV_PIX_FMT_RGB4,      0)
YUV2RGBWRAPPER(yuv2rgb,, 4b, AV_PIX_FMT_RGB4_BYTE, 0)
// This function is copied from libswscale/output.c
static av_always_inline void yuv2rgb_write_full(SwsInternal *c,
    uint8_t *dest, int i, int R, int A, int G, int B,
    int y, enum AVPixelFormat target, int hasAlpha, int err[4])
{
    int isrgb8 = target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8;

    if ((R | G | B) & 0xC0000000) {
        R = av_clip_uintp2(R, 30);
        G = av_clip_uintp2(G, 30);
        B = av_clip_uintp2(B, 30);
    }

    switch (target) {
    case AV_PIX_FMT_ARGB:
        dest[0] = hasAlpha ? A : 255;
        dest[1] = R >> 22;
        dest[2] = G >> 22;
        dest[3] = B >> 22;
        break;
    case AV_PIX_FMT_RGB24:
        dest[0] = R >> 22;
        dest[1] = G >> 22;
        dest[2] = B >> 22;
        break;
    case AV_PIX_FMT_RGBA:
        dest[0] = R >> 22;
        dest[1] = G >> 22;
        dest[2] = B >> 22;
        dest[3] = hasAlpha ? A : 255;
        break;
    case AV_PIX_FMT_ABGR:
        dest[0] = hasAlpha ? A : 255;
        dest[1] = B >> 22;
        dest[2] = G >> 22;
        dest[3] = R >> 22;
        break;
    case AV_PIX_FMT_BGR24:
        dest[0] = B >> 22;
        dest[1] = G >> 22;
        dest[2] = R >> 22;
        break;
    case AV_PIX_FMT_BGRA:
        dest[0] = B >> 22;
        dest[1] = G >> 22;
        dest[2] = R >> 22;
        dest[3] = hasAlpha ? A : 255;
        break;
    case AV_PIX_FMT_BGR4_BYTE:
    case AV_PIX_FMT_RGB4_BYTE:
    case AV_PIX_FMT_BGR8:
    case AV_PIX_FMT_RGB8:
    {
        int r, g, b;

        switch (c->opts.dither) {
        default:
        case SWS_DITHER_AUTO:
        case SWS_DITHER_ED:
            R >>= 22;
            G >>= 22;
            B >>= 22;
            R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2]) >> 4;
            G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2]) >> 4;
            B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2]) >> 4;
            c->dither_error[0][i] = err[0];
            c->dither_error[1][i] = err[1];
            c->dither_error[2][i] = err[2];
            r = R >> (isrgb8 ? 5 : 7);
            g = G >> (isrgb8 ? 5 : 6);
            b = B >> (isrgb8 ? 6 : 7);
            r = av_clip(r, 0, isrgb8 ? 7 : 1);
            g = av_clip(g, 0, isrgb8 ? 7 : 3);
            b = av_clip(b, 0, isrgb8 ? 3 : 1);
            err[0] = R - r * (isrgb8 ? 36 : 255);
            err[1] = G - g * (isrgb8 ? 36 :  85);
            err[2] = B - b * (isrgb8 ? 85 : 255);
            break;
        case SWS_DITHER_A_DITHER:
            if (isrgb8) {
                /* see http://pippin.gimp.org/a_dither/ for details/origin */
#define A_DITHER(u,v)   (((((u)+((v)*236))*119)&0xff))
                r = (((R >> 19) + A_DITHER(i, y)        - 96) >> 8);
                g = (((G >> 19) + A_DITHER(i + 17, y)   - 96) >> 8);
                b = (((B >> 20) + A_DITHER(i + 17*2, y) - 96) >> 8);
                r = av_clip_uintp2(r, 3);
                g = av_clip_uintp2(g, 3);
                b = av_clip_uintp2(b, 2);
            } else {
                r = (((R >> 21) + A_DITHER(i, y)        - 256) >> 8);
                g = (((G >> 19) + A_DITHER(i + 17, y)   - 256) >> 8);
                b = (((B >> 21) + A_DITHER(i + 17*2, y) - 256) >> 8);
                r = av_clip_uintp2(r, 1);
                g = av_clip_uintp2(g, 2);
                b = av_clip_uintp2(b, 1);
            }
            break;
        case SWS_DITHER_X_DITHER:
            if (isrgb8) {
                /* see http://pippin.gimp.org/a_dither/ for details/origin */
#define X_DITHER(u,v)   (((((u)^((v)*237))*181)&0x1ff)/2)
                r = (((R >> 19) + X_DITHER(i, y)        - 96) >> 8);
                g = (((G >> 19) + X_DITHER(i + 17, y)   - 96) >> 8);
                b = (((B >> 20) + X_DITHER(i + 17*2, y) - 96) >> 8);
                r = av_clip_uintp2(r, 3);
                g = av_clip_uintp2(g, 3);
                b = av_clip_uintp2(b, 2);
            } else {
                r = (((R >> 21) + X_DITHER(i, y)        - 256) >> 8);
                g = (((G >> 19) + X_DITHER(i + 17, y)   - 256) >> 8);
                b = (((B >> 21) + X_DITHER(i + 17*2, y) - 256) >> 8);
                r = av_clip_uintp2(r, 1);
                g = av_clip_uintp2(g, 2);
                b = av_clip_uintp2(b, 1);
            }
            break;
        }

        if (target == AV_PIX_FMT_BGR4_BYTE) {
            dest[0] = r + 2*g + 8*b;
        } else if (target == AV_PIX_FMT_RGB4_BYTE) {
            dest[0] = b + 2*g + 8*r;
        } else if (target == AV_PIX_FMT_BGR8) {
            dest[0] = r + 8*g + 64*b;
        } else if (target == AV_PIX_FMT_RGB8) {
            dest[0] = b + 4*g + 32*r;
        } else
            av_assert2(0);
        break;
    }
    }
}
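
/* YUVTORGB_SETUP_LSX loads the per-context YUV->RGB coefficients into
 * scalars and splats them into LSX vectors; YUVTORGB_LSX then converts
 * one vector each of Y/U/V into R/G/B with those coefficients. Note that
 * the conversion clobbers its y and v inputs. */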
#define YUVTORGB_SETUP_LSX                              \
    int y_offset = c->yuv2rgb_y_offset;                 \
    int y_coeff  = c->yuv2rgb_y_coeff;                  \
    int v2r_coe  = c->yuv2rgb_v2r_coeff;                \
    int v2g_coe  = c->yuv2rgb_v2g_coeff;                \
    int u2g_coe  = c->yuv2rgb_u2g_coeff;                \
    int u2b_coe  = c->yuv2rgb_u2b_coeff;                \
    __m128i offset = __lsx_vreplgr2vr_w(y_offset);      \
    __m128i coeff  = __lsx_vreplgr2vr_w(y_coeff);       \
    __m128i v2r    = __lsx_vreplgr2vr_w(v2r_coe);       \
    __m128i v2g    = __lsx_vreplgr2vr_w(v2g_coe);       \
    __m128i u2g    = __lsx_vreplgr2vr_w(u2g_coe);       \
    __m128i u2b    = __lsx_vreplgr2vr_w(u2b_coe);

#define YUVTORGB_LSX(y, u, v, R, G, B, offset, coeff,   \
                     y_temp, v2r, v2g, u2g, u2b)        \
{                                                       \
    y = __lsx_vsub_w(y, offset);                        \
    y = __lsx_vmul_w(y, coeff);                         \
    y = __lsx_vadd_w(y, y_temp);                        \
    R = __lsx_vmadd_w(y, v, v2r);                       \
    v = __lsx_vmadd_w(y, v, v2g);                       \
    G = __lsx_vmadd_w(v, u, u2g);                       \
    B = __lsx_vmadd_w(y, u, u2b);                       \
}
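
/* Pull lane t1 out of the R/G/B (and, for the _A_ variant, A) vectors
 * and hand the scalars to yuv2rgb_write_full(); the alpha value is
 * clipped to 8 bits if it overflowed, and dest advances by one pixel. */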
#define WRITE_FULL_A_LSX(r, g, b, a, t1, s)                                   \
{                                                                             \
    R = __lsx_vpickve2gr_w(r, t1);                                            \
    G = __lsx_vpickve2gr_w(g, t1);                                            \
    B = __lsx_vpickve2gr_w(b, t1);                                            \
    A = __lsx_vpickve2gr_w(a, t1);                                            \
    if (A & 0x100)                                                            \
        A = av_clip_uint8(A);                                                 \
    yuv2rgb_write_full(c, dest, i + s, R, A, G, B, y, target, hasAlpha, err); \
    dest += step;                                                             \
}

#define WRITE_FULL_LSX(r, g, b, t1, s)                                        \
{                                                                             \
    R = __lsx_vpickve2gr_w(r, t1);                                            \
    G = __lsx_vpickve2gr_w(g, t1);                                            \
    B = __lsx_vpickve2gr_w(b, t1);                                            \
    yuv2rgb_write_full(c, dest, i + s, R, 0, G, B, y, target, hasAlpha, err); \
    dest += step;                                                             \
}
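
/* "Full" chroma path: one chroma sample per luma sample, written pixel by
 * pixel through yuv2rgb_write_full(), with optional alpha filtering and
 * the error-diffusion state carried in err[] and flushed back into
 * c->dither_error[] at the end of the line. */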
static void
yuv2rgb_full_X_template_lsx(SwsInternal *c, const int16_t *lumFilter,
                            const int16_t **lumSrc, int lumFilterSize,
                            const int16_t *chrFilter, const int16_t **chrUSrc,
                            const int16_t **chrVSrc, int chrFilterSize,
                            const int16_t **alpSrc, uint8_t *dest,
                            int dstW, int y, enum AVPixelFormat target,
                            int hasAlpha)
{
    int i, j, B, G, R, A;
    int step = (target == AV_PIX_FMT_RGB24 ||
                target == AV_PIX_FMT_BGR24) ? 3 : 4;
    int err[4] = {0};
    int a_temp = 1 << 18;
    int templ  = 1 << 9;
    int tempc  = templ - (128 << 19);
    int ytemp  = 1 << 21;
    int len    = dstW - 7;
    __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);
    YUVTORGB_SETUP_LSX

    if (target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE ||
        target == AV_PIX_FMT_BGR8      || target == AV_PIX_FMT_RGB8)
        step = 1;

    for (i = 0; i < len; i += 8) {
        __m128i l_src, u_src, v_src;
        __m128i y_ev, y_od, u_ev, u_od, v_ev, v_od, temp;
        __m128i R_ev, R_od, G_ev, G_od, B_ev, B_od;
        int n = i << 1;

        y_ev = y_od = __lsx_vreplgr2vr_w(templ);
        u_ev = u_od = v_ev = v_od = __lsx_vreplgr2vr_w(tempc);
        for (j = 0; j < lumFilterSize; j++) {
            temp  = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src = __lsx_vldx(lumSrc[j], n);
            y_ev  = __lsx_vmaddwev_w_h(y_ev, l_src, temp);
            y_od  = __lsx_vmaddwod_w_h(y_od, l_src, temp);
        }
        for (j = 0; j < chrFilterSize; j++) {
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            DUP2_ARG2(__lsx_vldx, chrUSrc[j], n, chrVSrc[j], n,
                      u_src, v_src);
            DUP2_ARG3(__lsx_vmaddwev_w_h, u_ev, u_src, temp, v_ev,
                      v_src, temp, u_ev, v_ev);
            DUP2_ARG3(__lsx_vmaddwod_w_h, u_od, u_src, temp, v_od,
                      v_src, temp, u_od, v_od);
        }
        y_ev = __lsx_vsrai_w(y_ev, 10);
        y_od = __lsx_vsrai_w(y_od, 10);
        u_ev = __lsx_vsrai_w(u_ev, 10);
        u_od = __lsx_vsrai_w(u_od, 10);
        v_ev = __lsx_vsrai_w(v_ev, 10);
        v_od = __lsx_vsrai_w(v_od, 10);
        YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
                     y_temp, v2r, v2g, u2g, u2b);
        YUVTORGB_LSX(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
                     y_temp, v2r, v2g, u2g, u2b);

        if (hasAlpha) {
            __m128i a_src, a_ev, a_od;

            a_ev = a_od = __lsx_vreplgr2vr_w(a_temp);
            for (j = 0; j < lumFilterSize; j++) {
                temp  = __lsx_vldrepl_h(lumFilter + j, 0);
                a_src = __lsx_vldx(alpSrc[j], n);
                a_ev  = __lsx_vmaddwev_w_h(a_ev, a_src, temp);
                a_od  = __lsx_vmaddwod_w_h(a_od, a_src, temp);
            }
            a_ev = __lsx_vsrai_w(a_ev, 19);
            a_od = __lsx_vsrai_w(a_od, 19);
            WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 0, 0);
            WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 0, 1);
            WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 1, 2);
            WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 1, 3);
            WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 2, 4);
            WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 2, 5);
            WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 3, 6);
            WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 3, 7);
        } else {
            WRITE_FULL_LSX(R_ev, G_ev, B_ev, 0, 0);
            WRITE_FULL_LSX(R_od, G_od, B_od, 0, 1);
            WRITE_FULL_LSX(R_ev, G_ev, B_ev, 1, 2);
            WRITE_FULL_LSX(R_od, G_od, B_od, 1, 3);
            WRITE_FULL_LSX(R_ev, G_ev, B_ev, 2, 4);
            WRITE_FULL_LSX(R_od, G_od, B_od, 2, 5);
            WRITE_FULL_LSX(R_ev, G_ev, B_ev, 3, 6);
            WRITE_FULL_LSX(R_od, G_od, B_od, 3, 7);
        }
    }
    if (dstW - i >= 4) {
        __m128i l_src, u_src, v_src;
        __m128i y_ev, u_ev, v_ev, uv, temp;
        __m128i R_ev, G_ev, B_ev;
        int n = i << 1;

        y_ev = __lsx_vreplgr2vr_w(templ);
        u_ev = v_ev = __lsx_vreplgr2vr_w(tempc);
        for (j = 0; j < lumFilterSize; j++) {
            temp  = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src = __lsx_vldx(lumSrc[j], n);
            l_src = __lsx_vilvl_h(l_src, l_src);
            y_ev  = __lsx_vmaddwev_w_h(y_ev, l_src, temp);
        }
        for (j = 0; j < chrFilterSize; j++) {
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            DUP2_ARG2(__lsx_vldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
            uv   = __lsx_vilvl_h(v_src, u_src);
            u_ev = __lsx_vmaddwev_w_h(u_ev, uv, temp);
            v_ev = __lsx_vmaddwod_w_h(v_ev, uv, temp);
        }
        y_ev = __lsx_vsrai_w(y_ev, 10);
        u_ev = __lsx_vsrai_w(u_ev, 10);
        v_ev = __lsx_vsrai_w(v_ev, 10);
        YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
                     y_temp, v2r, v2g, u2g, u2b);

        if (hasAlpha) {
            __m128i a_src, a_ev;

            a_ev = __lsx_vreplgr2vr_w(a_temp);
            for (j = 0; j < lumFilterSize; j++) {
                temp  = __lsx_vldrepl_h(lumFilter + j, 0);
                a_src = __lsx_vldx(alpSrc[j], n);
                a_src = __lsx_vilvl_h(a_src, a_src);
                a_ev  = __lsx_vmaddwev_w_h(a_ev, a_src, temp);
            }
            a_ev = __lsx_vsrai_w(a_ev, 19);
            WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 0, 0);
            WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 1, 1);
            WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 2, 2);
            WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 3, 3);
        } else {
            WRITE_FULL_LSX(R_ev, G_ev, B_ev, 0, 0);
            WRITE_FULL_LSX(R_ev, G_ev, B_ev, 1, 1);
            WRITE_FULL_LSX(R_ev, G_ev, B_ev, 2, 2);
            WRITE_FULL_LSX(R_ev, G_ev, B_ev, 3, 3);
        }
        i += 4;
    }
    for (; i < dstW; i++) {
        int Y = templ;
        int V, U = V = tempc;

        A = 0;
        for (j = 0; j < lumFilterSize; j++) {
            Y += lumSrc[j][i] * lumFilter[j];
        }
        for (j = 0; j < chrFilterSize; j++) {
            U += chrUSrc[j][i] * chrFilter[j];
            V += chrVSrc[j][i] * chrFilter[j];
        }
        Y >>= 10;
        U >>= 10;
        V >>= 10;
        if (hasAlpha) {
            A = 1 << 18;
            for (j = 0; j < lumFilterSize; j++) {
                A += alpSrc[j][i] * lumFilter[j];
            }
            A >>= 19;
            if (A & 0x100)
                A = av_clip_uint8(A);
        }
        Y -= y_offset;
        Y *= y_coeff;
        Y += ytemp;
        R  = (unsigned)Y + V * v2r_coe;
        G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
        B  = (unsigned)Y + U * u2b_coe;
        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
        dest += step;
    }
    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
}
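
/* Bilinear two-line variant of the full-chroma path: blends the two
 * input lines with yalpha/uvalpha before conversion. */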
  1056. static void
  1057. yuv2rgb_full_2_template_lsx(SwsInternal *c, const int16_t *buf[2],
  1058. const int16_t *ubuf[2], const int16_t *vbuf[2],
  1059. const int16_t *abuf[2], uint8_t *dest, int dstW,
  1060. int yalpha, int uvalpha, int y,
  1061. enum AVPixelFormat target, int hasAlpha)
  1062. {
  1063. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  1064. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
  1065. *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
  1066. *abuf0 = hasAlpha ? abuf[0] : NULL,
  1067. *abuf1 = hasAlpha ? abuf[1] : NULL;
  1068. int yalpha1 = 4096 - yalpha;
  1069. int uvalpha1 = 4096 - uvalpha;
  1070. int uvtemp = 128 << 19;
  1071. int atemp = 1 << 18;
  1072. int err[4] = {0};
  1073. int ytemp = 1 << 21;
  1074. int len = dstW - 7;
  1075. int i, R, G, B, A;
  1076. int step = (target == AV_PIX_FMT_RGB24 ||
  1077. target == AV_PIX_FMT_BGR24) ? 3 : 4;
  1078. __m128i v_uvalpha1 = __lsx_vreplgr2vr_w(uvalpha1);
  1079. __m128i v_yalpha1 = __lsx_vreplgr2vr_w(yalpha1);
  1080. __m128i v_uvalpha = __lsx_vreplgr2vr_w(uvalpha);
  1081. __m128i v_yalpha = __lsx_vreplgr2vr_w(yalpha);
  1082. __m128i uv = __lsx_vreplgr2vr_w(uvtemp);
  1083. __m128i a_bias = __lsx_vreplgr2vr_w(atemp);
  1084. __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);
  1085. YUVTORGB_SETUP_LSX
  1086. av_assert2(yalpha <= 4096U);
  1087. av_assert2(uvalpha <= 4096U);
  1088. if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
  1089. || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
  1090. step = 1;
  1091. for (i = 0; i < len; i += 8) {
  1092. __m128i b0, b1, ub0, ub1, vb0, vb1;
  1093. __m128i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
  1094. __m128i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
  1095. __m128i y_l, y_h, v_l, v_h, u_l, u_h;
  1096. __m128i R_l, R_h, G_l, G_h, B_l, B_h;
  1097. int n = i << 1;
  1098. DUP4_ARG2(__lsx_vldx, buf0, n, buf1, n, ubuf0,
  1099. n, ubuf1, n, b0, b1, ub0, ub1);
  1100. DUP2_ARG2(__lsx_vldx, vbuf0, n, vbuf1, n, vb0 , vb1);
  1101. DUP2_ARG2(__lsx_vsllwil_w_h, b0, 0, b1, 0, y0_l, y1_l);
  1102. DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
  1103. u0_l, u1_l, v0_l, v1_l);
  1104. DUP2_ARG1(__lsx_vexth_w_h, b0, b1, y0_h, y1_h);
  1105. DUP4_ARG1(__lsx_vexth_w_h, ub0, ub1, vb0, vb1,
  1106. u0_h, u1_h, v0_h, v1_h);
  1107. y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
  1108. y0_h = __lsx_vmul_w(y0_h, v_yalpha1);
  1109. u0_l = __lsx_vmul_w(u0_l, v_uvalpha1);
  1110. u0_h = __lsx_vmul_w(u0_h, v_uvalpha1);
  1111. v0_l = __lsx_vmul_w(v0_l, v_uvalpha1);
  1112. v0_h = __lsx_vmul_w(v0_h, v_uvalpha1);
  1113. y_l = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
  1114. y_h = __lsx_vmadd_w(y0_h, v_yalpha, y1_h);
  1115. u_l = __lsx_vmadd_w(u0_l, v_uvalpha, u1_l);
  1116. u_h = __lsx_vmadd_w(u0_h, v_uvalpha, u1_h);
  1117. v_l = __lsx_vmadd_w(v0_l, v_uvalpha, v1_l);
  1118. v_h = __lsx_vmadd_w(v0_h, v_uvalpha, v1_h);
  1119. u_l = __lsx_vsub_w(u_l, uv);
  1120. u_h = __lsx_vsub_w(u_h, uv);
  1121. v_l = __lsx_vsub_w(v_l, uv);
  1122. v_h = __lsx_vsub_w(v_h, uv);
  1123. y_l = __lsx_vsrai_w(y_l, 10);
  1124. y_h = __lsx_vsrai_w(y_h, 10);
  1125. u_l = __lsx_vsrai_w(u_l, 10);
  1126. u_h = __lsx_vsrai_w(u_h, 10);
  1127. v_l = __lsx_vsrai_w(v_l, 10);
  1128. v_h = __lsx_vsrai_w(v_h, 10);
  1129. YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
  1130. y_temp, v2r, v2g, u2g, u2b);
  1131. YUVTORGB_LSX(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
  1132. y_temp, v2r, v2g, u2g, u2b);
  1133. if (hasAlpha) {
  1134. __m128i a0, a1, a0_l, a0_h;
  1135. __m128i a_l, a_h, a1_l, a1_h;
  1136. DUP2_ARG2(__lsx_vldx, abuf0, n, abuf1, n, a0, a1);
  1137. DUP2_ARG2(__lsx_vsllwil_w_h, a0, 0, a1, 0, a0_l, a1_l);
  1138. DUP2_ARG1(__lsx_vexth_w_h, a0, a1, a0_h, a1_h);
  1139. a_l = __lsx_vmadd_w(a_bias, a0_l, v_yalpha1);
  1140. a_h = __lsx_vmadd_w(a_bias, a0_h, v_yalpha1);
  1141. a_l = __lsx_vmadd_w(a_l, v_yalpha, a1_l);
  1142. a_h = __lsx_vmadd_w(a_h, v_yalpha, a1_h);
  1143. a_l = __lsx_vsrai_w(a_l, 19);
  1144. a_h = __lsx_vsrai_w(a_h, 19);
  1145. WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
  1146. WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
  1147. WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
  1148. WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
  1149. WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 0, 4);
  1150. WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 1, 5);
  1151. WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 2, 6);
  1152. WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 3, 7);
  1153. } else {
  1154. WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
  1155. WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
  1156. WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
  1157. WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
  1158. WRITE_FULL_LSX(R_h, G_h, B_h, 0, 4);
  1159. WRITE_FULL_LSX(R_h, G_h, B_h, 1, 5);
  1160. WRITE_FULL_LSX(R_h, G_h, B_h, 2, 6);
  1161. WRITE_FULL_LSX(R_h, G_h, B_h, 3, 7);
  1162. }
  1163. }
  1164. if (dstW - i >= 4) {
  1165. __m128i b0, b1, ub0, ub1, vb0, vb1;
  1166. __m128i y0_l, y1_l, u0_l;
  1167. __m128i v0_l, u1_l, v1_l;
  1168. __m128i y_l, u_l, v_l;
  1169. __m128i R_l, G_l, B_l;
  1170. int n = i << 1;
  1171. DUP4_ARG2(__lsx_vldx, buf0, n, buf1, n, ubuf0, n,
  1172. ubuf1, n, b0, b1, ub0, ub1);
  1173. DUP2_ARG2(__lsx_vldx, vbuf0, n, vbuf1, n, vb0, vb1);
  1174. DUP2_ARG2(__lsx_vsllwil_w_h, b0, 0, b1, 0, y0_l, y1_l);
  1175. DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
  1176. u0_l, u1_l, v0_l, v1_l);
  1177. y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
  1178. u0_l = __lsx_vmul_w(u0_l, v_uvalpha1);
  1179. v0_l = __lsx_vmul_w(v0_l, v_uvalpha1);
  1180. y_l = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
  1181. u_l = __lsx_vmadd_w(u0_l, v_uvalpha, u1_l);
  1182. v_l = __lsx_vmadd_w(v0_l, v_uvalpha, v1_l);
  1183. u_l = __lsx_vsub_w(u_l, uv);
  1184. v_l = __lsx_vsub_w(v_l, uv);
  1185. y_l = __lsx_vsrai_w(y_l, 10);
  1186. u_l = __lsx_vsrai_w(u_l, 10);
  1187. v_l = __lsx_vsrai_w(v_l, 10);
  1188. YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
  1189. y_temp, v2r, v2g, u2g, u2b);
  1190. if (hasAlpha) {
  1191. __m128i a0, a1, a0_l;
  1192. __m128i a_l, a1_l;
  1193. DUP2_ARG2(__lsx_vldx, abuf0, n, abuf1, n, a0, a1);
  1194. DUP2_ARG2(__lsx_vsllwil_w_h, a0, 0, a1, 0, a0_l, a1_l);
  1195. a_l = __lsx_vmadd_w(a_bias, a0_l, v_yalpha1);
  1196. a_l = __lsx_vmadd_w(a_l, v_yalpha, a1_l);
  1197. a_l = __lsx_vsrai_w(a_l, 19);
  1198. WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
  1199. WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
  1200. WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
  1201. WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
  1202. } else {
  1203. WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
  1204. WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
  1205. WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
  1206. WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
  1207. }
  1208. i += 4;
  1209. }
    for (; i < dstW; i++) {
        int Y = ( buf0[i] * yalpha1  +  buf1[i] * yalpha) >> 10;
        int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha - uvtemp) >> 10;
        int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha - uvtemp) >> 10;

        A = 0;
        if (hasAlpha) {
            A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + atemp) >> 19;
            if (A & 0x100)
                A = av_clip_uint8(A);
        }
        Y -= y_offset;
        Y *= y_coeff;
        Y += ytemp;
        R  = (unsigned)Y + V * v2r_coe;
        G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
        B  = (unsigned)Y + U * u2b_coe;
        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
        dest += step;
    }
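    /* Save the propagated dither error for the next output line. */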
    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
}
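
/* Write one line of packed RGB output from a single (unblended) set of input
 * planes; chroma is averaged between two input lines when uvalpha >= 2048. */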
static void
yuv2rgb_full_1_template_lsx(SwsInternal *c, const int16_t *buf0,
                            const int16_t *ubuf[2], const int16_t *vbuf[2],
                            const int16_t *abuf0, uint8_t *dest, int dstW,
                            int uvalpha, int y, enum AVPixelFormat target,
                            int hasAlpha)
{
    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
    int i, B, G, R, A;
    int step       = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4;
    int err[4]     = {0};
    int ytemp      = 1 << 21;
    int bias_int   = 64;
    int len        = dstW - 7;
    __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);
    YUVTORGB_SETUP_LSX

    if (target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
        || target == AV_PIX_FMT_BGR8   || target == AV_PIX_FMT_RGB8)
        step = 1;
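    /* uvalpha < 2048: chroma comes from a single input line, no averaging. */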
    if (uvalpha < 2048) {
        int uvtemp   = 128 << 7;
        __m128i uv   = __lsx_vreplgr2vr_w(uvtemp);
        __m128i bias = __lsx_vreplgr2vr_w(bias_int);

        for (i = 0; i < len; i += 8) {
            __m128i b, ub, vb, ub_l, ub_h, vb_l, vb_h;
            __m128i y_l, y_h, u_l, u_h, v_l, v_h;
            __m128i R_l, R_h, G_l, G_h, B_l, B_h;
            int n = i << 1;

            DUP2_ARG2(__lsx_vldx, buf0, n, ubuf0, n, b, ub);
            vb  = __lsx_vldx(vbuf0, n);
            y_l = __lsx_vsllwil_w_h(b, 2);
            y_h = __lsx_vexth_w_h(b);
            DUP2_ARG2(__lsx_vsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
            DUP2_ARG1(__lsx_vexth_w_h, ub, vb, ub_h, vb_h);
            y_h = __lsx_vslli_w(y_h, 2);
            u_l = __lsx_vsub_w(ub_l, uv);
            u_h = __lsx_vsub_w(ub_h, uv);
            v_l = __lsx_vsub_w(vb_l, uv);
            v_h = __lsx_vsub_w(vb_h, uv);
            u_l = __lsx_vslli_w(u_l, 2);
            u_h = __lsx_vslli_w(u_h, 2);
            v_l = __lsx_vslli_w(v_l, 2);
            v_h = __lsx_vslli_w(v_h, 2);
            YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);
            YUVTORGB_LSX(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);

            if (hasAlpha) {
                __m128i a_src;
                __m128i a_l, a_h;

                a_src = __lsx_vld(abuf0 + i, 0);
                a_l   = __lsx_vsllwil_w_h(a_src, 0);
                a_h   = __lsx_vexth_w_h(a_src);
                a_l   = __lsx_vadd_w(a_l, bias);
                a_h   = __lsx_vadd_w(a_h, bias);
                a_l   = __lsx_vsrai_w(a_l, 7);
                a_h   = __lsx_vsrai_w(a_h, 7);
                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
                WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 0, 4);
                WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 1, 5);
                WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 2, 6);
                WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 3, 7);
            } else {
                WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
                WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
                WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
                WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
                WRITE_FULL_LSX(R_h, G_h, B_h, 0, 4);
                WRITE_FULL_LSX(R_h, G_h, B_h, 1, 5);
                WRITE_FULL_LSX(R_h, G_h, B_h, 2, 6);
                WRITE_FULL_LSX(R_h, G_h, B_h, 3, 7);
            }
        }
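        /* Convert four more pixels if at least four remain. */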
        if (dstW - i >= 4) {
            __m128i b, ub, vb, ub_l, vb_l;
            __m128i y_l, u_l, v_l;
            __m128i R_l, G_l, B_l;
            int n = i << 1;

            DUP2_ARG2(__lsx_vldx, buf0, n, ubuf0, n, b, ub);
            vb  = __lsx_vldx(vbuf0, n);
            y_l = __lsx_vsllwil_w_h(b, 0);
            DUP2_ARG2(__lsx_vsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
            y_l = __lsx_vslli_w(y_l, 2);
            u_l = __lsx_vsub_w(ub_l, uv);
            v_l = __lsx_vsub_w(vb_l, uv);
            u_l = __lsx_vslli_w(u_l, 2);
            v_l = __lsx_vslli_w(v_l, 2);
            YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);

            if (hasAlpha) {
                __m128i a_src, a_l;

                a_src = __lsx_vldx(abuf0, n);
                a_src = __lsx_vsllwil_w_h(a_src, 0);
                a_l   = __lsx_vadd_w(bias, a_src);
                a_l   = __lsx_vsrai_w(a_l, 7);
                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
            } else {
                WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
                WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
                WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
                WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
            }
            i += 4;
        }
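        /* Scalar tail: no line blending in the single-line case. */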
        for (; i < dstW; i++) {
            int Y = buf0[i] << 2;
            int U = (ubuf0[i] - uvtemp) << 2;
            int V = (vbuf0[i] - uvtemp) << 2;

            A = 0;
            if (hasAlpha) {
                A = (abuf0[i] + 64) >> 7;
                if (A & 0x100)
                    A = av_clip_uint8(A);
            }
            Y -= y_offset;
            Y *= y_coeff;
            Y += ytemp;
            R  = (unsigned)Y + V * v2r_coe;
            G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
            B  = (unsigned)Y + U * u2b_coe;
            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
            dest += step;
        }
    } else {
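        /* uvalpha >= 2048: average the chroma of the two input lines. */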
        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
        int uvtemp   = 128 << 8;
        __m128i uv   = __lsx_vreplgr2vr_w(uvtemp);
        __m128i zero = __lsx_vldi(0);
        __m128i bias = __lsx_vreplgr2vr_h(bias_int);

        for (i = 0; i < len; i += 8) {
            __m128i b, ub0, ub1, vb0, vb1;
            __m128i y_ev, y_od, u_ev, u_od, v_ev, v_od;
            __m128i R_ev, R_od, G_ev, G_od, B_ev, B_od;
            int n = i << 1;

            DUP4_ARG2(__lsx_vldx, buf0, n, ubuf0, n, vbuf0, n,
                      ubuf1, n, b, ub0, vb0, ub1);
            vb1  = __lsx_vldx(vbuf1, n);
            y_ev = __lsx_vaddwev_w_h(b, zero);
            y_od = __lsx_vaddwod_w_h(b, zero);
            DUP2_ARG2(__lsx_vaddwev_w_h, ub0, vb0, ub1, vb1, u_ev, v_ev);
            DUP2_ARG2(__lsx_vaddwod_w_h, ub0, vb0, ub1, vb1, u_od, v_od);
            DUP2_ARG2(__lsx_vslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
            DUP4_ARG2(__lsx_vsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
                      u_ev, u_od, v_ev, v_od);
            DUP4_ARG2(__lsx_vslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
                      u_ev, u_od, v_ev, v_od);
            YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);
            YUVTORGB_LSX(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);

            if (hasAlpha) {
                __m128i a_src;
                __m128i a_ev, a_od;

                a_src = __lsx_vld(abuf0 + i, 0);
                a_ev  = __lsx_vaddwev_w_h(bias, a_src);
                a_od  = __lsx_vaddwod_w_h(bias, a_src);
                a_ev  = __lsx_vsrai_w(a_ev, 7);
                a_od  = __lsx_vsrai_w(a_od, 7);
                WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 0, 0);
                WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 0, 1);
                WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 1, 2);
                WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 1, 3);
                WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 2, 4);
                WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 2, 5);
                WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 3, 6);
                WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 3, 7);
            } else {
                WRITE_FULL_LSX(R_ev, G_ev, B_ev, 0, 0);
                WRITE_FULL_LSX(R_od, G_od, B_od, 0, 1);
                WRITE_FULL_LSX(R_ev, G_ev, B_ev, 1, 2);
                WRITE_FULL_LSX(R_od, G_od, B_od, 1, 3);
                WRITE_FULL_LSX(R_ev, G_ev, B_ev, 2, 4);
                WRITE_FULL_LSX(R_od, G_od, B_od, 2, 5);
                WRITE_FULL_LSX(R_ev, G_ev, B_ev, 3, 6);
                WRITE_FULL_LSX(R_od, G_od, B_od, 3, 7);
            }
        }
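        /* Four-pixel tail group, with chroma averaging. */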
        if (dstW - i >= 4) {
            __m128i b, ub0, ub1, vb0, vb1;
            __m128i y_l, u_l, v_l;
            __m128i R_l, G_l, B_l;
            int n = i << 1;

            DUP4_ARG2(__lsx_vldx, buf0, n, ubuf0, n, vbuf0, n,
                      ubuf1, n, b, ub0, vb0, ub1);
            vb1 = __lsx_vldx(vbuf1, n);
            y_l = __lsx_vsllwil_w_h(b, 0);
            y_l = __lsx_vslli_w(y_l, 2);
            DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, vb0, 0, ub1, 0, vb1, 0,
                      ub0, vb0, ub1, vb1);
            DUP2_ARG2(__lsx_vadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
            u_l = __lsx_vsub_w(u_l, uv);
            v_l = __lsx_vsub_w(v_l, uv);
            u_l = __lsx_vslli_w(u_l, 1);
            v_l = __lsx_vslli_w(v_l, 1);
            YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);

            if (hasAlpha) {
                __m128i a_src;
                __m128i a_l;

                a_src = __lsx_vld(abuf0 + i, 0);
                a_src = __lsx_vilvl_h(a_src, a_src);
                a_l   = __lsx_vaddwev_w_h(bias, a_src);
                a_l   = __lsx_vsrai_w(a_l, 7);
                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
            } else {
                WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
                WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
                WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
                WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
            }
            i += 4;
        }
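        /* Scalar tail with chroma averaging. */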
        for (; i < dstW; i++) {
            int Y = buf0[i] << 2;
            int U = (ubuf0[i] + ubuf1[i] - uvtemp) << 1;
            int V = (vbuf0[i] + vbuf1[i] - uvtemp) << 1;

            A = 0;
            if (hasAlpha) {
                A = (abuf0[i] + 64) >> 7;
                if (A & 0x100)
                    A = av_clip_uint8(A);
            }
            Y -= y_offset;
            Y *= y_coeff;
            Y += ytemp;
            R  = (unsigned)Y + V * v2r_coe;
            G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
            B  = (unsigned)Y + U * u2b_coe;
            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
            dest += step;
        }
    }
    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
}
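
/* Instantiate the full-chroma output wrappers for every packed RGB
 * destination format handled by this file. */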
#if CONFIG_SMALL
YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA,
               CONFIG_SWSCALE_ALPHA && c->needAlpha)
YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR,
               CONFIG_SWSCALE_ALPHA && c->needAlpha)
YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA,
               CONFIG_SWSCALE_ALPHA && c->needAlpha)
YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB,
               CONFIG_SWSCALE_ALPHA && c->needAlpha)
#else
#if CONFIG_SWSCALE_ALPHA
YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA, 1)
YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR, 1)
YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA, 1)
YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB, 1)
#endif
YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
#endif
YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full, AV_PIX_FMT_BGR4_BYTE, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full, AV_PIX_FMT_RGB4_BYTE, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
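
/* Hook the LSX implementations into the output function-pointer table;
 * formats not handled here keep their previously selected implementations. */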
av_cold void ff_sws_init_output_lsx(SwsInternal *c,
                                    yuv2planar1_fn *yuv2plane1,
                                    yuv2planarX_fn *yuv2planeX,
                                    yuv2interleavedX_fn *yuv2nv12cX,
                                    yuv2packed1_fn *yuv2packed1,
                                    yuv2packed2_fn *yuv2packed2,
                                    yuv2packedX_fn *yuv2packedX,
                                    yuv2anyX_fn *yuv2anyX)
{
    enum AVPixelFormat dstFormat = c->opts.dst_format;

    /* Add initialization once optimized */
    if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) {
    } else if (is16BPS(dstFormat)) {
    } else if (isNBPS(dstFormat)) {
    } else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
    } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
    } else {
        *yuv2plane1 = yuv2plane1_8_lsx;
        *yuv2planeX = yuv2planeX_8_lsx;
    }

    if (c->opts.flags & SWS_FULL_CHR_H_INT) {
        switch (c->opts.dst_format) {
        case AV_PIX_FMT_RGBA:
#if CONFIG_SMALL
            c->yuv2packedX = yuv2rgba32_full_X_lsx;
            c->yuv2packed2 = yuv2rgba32_full_2_lsx;
            c->yuv2packed1 = yuv2rgba32_full_1_lsx;
#else
#if CONFIG_SWSCALE_ALPHA
            if (c->needAlpha) {
                c->yuv2packedX = yuv2rgba32_full_X_lsx;
                c->yuv2packed2 = yuv2rgba32_full_2_lsx;
                c->yuv2packed1 = yuv2rgba32_full_1_lsx;
            } else
#endif /* CONFIG_SWSCALE_ALPHA */
            {
                c->yuv2packedX = yuv2rgbx32_full_X_lsx;
                c->yuv2packed2 = yuv2rgbx32_full_2_lsx;
                c->yuv2packed1 = yuv2rgbx32_full_1_lsx;
            }
#endif /* !CONFIG_SMALL */
            break;
        case AV_PIX_FMT_ARGB:
#if CONFIG_SMALL
            c->yuv2packedX = yuv2argb32_full_X_lsx;
            c->yuv2packed2 = yuv2argb32_full_2_lsx;
            c->yuv2packed1 = yuv2argb32_full_1_lsx;
#else
#if CONFIG_SWSCALE_ALPHA
            if (c->needAlpha) {
                c->yuv2packedX = yuv2argb32_full_X_lsx;
                c->yuv2packed2 = yuv2argb32_full_2_lsx;
                c->yuv2packed1 = yuv2argb32_full_1_lsx;
            } else
#endif /* CONFIG_SWSCALE_ALPHA */
            {
                c->yuv2packedX = yuv2xrgb32_full_X_lsx;
                c->yuv2packed2 = yuv2xrgb32_full_2_lsx;
                c->yuv2packed1 = yuv2xrgb32_full_1_lsx;
            }
#endif /* !CONFIG_SMALL */
            break;
        case AV_PIX_FMT_BGRA:
#if CONFIG_SMALL
            c->yuv2packedX = yuv2bgra32_full_X_lsx;
            c->yuv2packed2 = yuv2bgra32_full_2_lsx;
            c->yuv2packed1 = yuv2bgra32_full_1_lsx;
#else
#if CONFIG_SWSCALE_ALPHA
            if (c->needAlpha) {
                c->yuv2packedX = yuv2bgra32_full_X_lsx;
                c->yuv2packed2 = yuv2bgra32_full_2_lsx;
                c->yuv2packed1 = yuv2bgra32_full_1_lsx;
            } else
#endif /* CONFIG_SWSCALE_ALPHA */
            {
                c->yuv2packedX = yuv2bgrx32_full_X_lsx;
                c->yuv2packed2 = yuv2bgrx32_full_2_lsx;
                c->yuv2packed1 = yuv2bgrx32_full_1_lsx;
            }
#endif /* !CONFIG_SMALL */
            break;
        case AV_PIX_FMT_ABGR:
#if CONFIG_SMALL
            c->yuv2packedX = yuv2abgr32_full_X_lsx;
            c->yuv2packed2 = yuv2abgr32_full_2_lsx;
            c->yuv2packed1 = yuv2abgr32_full_1_lsx;
#else
#if CONFIG_SWSCALE_ALPHA
            if (c->needAlpha) {
                c->yuv2packedX = yuv2abgr32_full_X_lsx;
                c->yuv2packed2 = yuv2abgr32_full_2_lsx;
                c->yuv2packed1 = yuv2abgr32_full_1_lsx;
            } else
#endif /* CONFIG_SWSCALE_ALPHA */
            {
                c->yuv2packedX = yuv2xbgr32_full_X_lsx;
                c->yuv2packed2 = yuv2xbgr32_full_2_lsx;
                c->yuv2packed1 = yuv2xbgr32_full_1_lsx;
            }
#endif /* !CONFIG_SMALL */
            break;
        case AV_PIX_FMT_RGB24:
            c->yuv2packedX = yuv2rgb24_full_X_lsx;
            c->yuv2packed2 = yuv2rgb24_full_2_lsx;
            c->yuv2packed1 = yuv2rgb24_full_1_lsx;
            break;
        case AV_PIX_FMT_BGR24:
            c->yuv2packedX = yuv2bgr24_full_X_lsx;
            c->yuv2packed2 = yuv2bgr24_full_2_lsx;
            c->yuv2packed1 = yuv2bgr24_full_1_lsx;
            break;
        case AV_PIX_FMT_BGR4_BYTE:
            c->yuv2packedX = yuv2bgr4_byte_full_X_lsx;
            c->yuv2packed2 = yuv2bgr4_byte_full_2_lsx;
            c->yuv2packed1 = yuv2bgr4_byte_full_1_lsx;
            break;
        case AV_PIX_FMT_RGB4_BYTE:
            c->yuv2packedX = yuv2rgb4_byte_full_X_lsx;
            c->yuv2packed2 = yuv2rgb4_byte_full_2_lsx;
            c->yuv2packed1 = yuv2rgb4_byte_full_1_lsx;
            break;
        case AV_PIX_FMT_BGR8:
            c->yuv2packedX = yuv2bgr8_full_X_lsx;
            c->yuv2packed2 = yuv2bgr8_full_2_lsx;
            c->yuv2packed1 = yuv2bgr8_full_1_lsx;
            break;
        case AV_PIX_FMT_RGB8:
            c->yuv2packedX = yuv2rgb8_full_X_lsx;
            c->yuv2packed2 = yuv2rgb8_full_2_lsx;
            c->yuv2packed1 = yuv2rgb8_full_1_lsx;
            break;
        }
    } else {
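        /* Non-full-chroma path: per-format packed writers. */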
        switch (c->opts.dst_format) {
        case AV_PIX_FMT_RGB32:
        case AV_PIX_FMT_BGR32:
#if CONFIG_SMALL
#else
#if CONFIG_SWSCALE_ALPHA
            if (c->needAlpha) {
            } else
#endif /* CONFIG_SWSCALE_ALPHA */
            {
                c->yuv2packed1 = yuv2rgbx32_1_lsx;
                c->yuv2packed2 = yuv2rgbx32_2_lsx;
                c->yuv2packedX = yuv2rgbx32_X_lsx;
            }
#endif /* !CONFIG_SMALL */
            break;
        case AV_PIX_FMT_RGB32_1:
        case AV_PIX_FMT_BGR32_1:
#if CONFIG_SMALL
#else
#if CONFIG_SWSCALE_ALPHA
            if (c->needAlpha) {
            } else
#endif /* CONFIG_SWSCALE_ALPHA */
            {
                c->yuv2packed1 = yuv2rgbx32_1_1_lsx;
                c->yuv2packed2 = yuv2rgbx32_1_2_lsx;
                c->yuv2packedX = yuv2rgbx32_1_X_lsx;
            }
#endif /* !CONFIG_SMALL */
            break;
        case AV_PIX_FMT_RGB24:
            c->yuv2packed1 = yuv2rgb24_1_lsx;
            c->yuv2packed2 = yuv2rgb24_2_lsx;
            c->yuv2packedX = yuv2rgb24_X_lsx;
            break;
        case AV_PIX_FMT_BGR24:
            c->yuv2packed1 = yuv2bgr24_1_lsx;
            c->yuv2packed2 = yuv2bgr24_2_lsx;
            c->yuv2packedX = yuv2bgr24_X_lsx;
            break;
        case AV_PIX_FMT_RGB565LE:
        case AV_PIX_FMT_RGB565BE:
        case AV_PIX_FMT_BGR565LE:
        case AV_PIX_FMT_BGR565BE:
            c->yuv2packed1 = yuv2rgb16_1_lsx;
            c->yuv2packed2 = yuv2rgb16_2_lsx;
            c->yuv2packedX = yuv2rgb16_X_lsx;
            break;
        case AV_PIX_FMT_RGB555LE:
        case AV_PIX_FMT_RGB555BE:
        case AV_PIX_FMT_BGR555LE:
        case AV_PIX_FMT_BGR555BE:
            c->yuv2packed1 = yuv2rgb15_1_lsx;
            c->yuv2packed2 = yuv2rgb15_2_lsx;
            c->yuv2packedX = yuv2rgb15_X_lsx;
            break;
        case AV_PIX_FMT_RGB444LE:
        case AV_PIX_FMT_RGB444BE:
        case AV_PIX_FMT_BGR444LE:
        case AV_PIX_FMT_BGR444BE:
            c->yuv2packed1 = yuv2rgb12_1_lsx;
            c->yuv2packed2 = yuv2rgb12_2_lsx;
            c->yuv2packedX = yuv2rgb12_X_lsx;
            break;
        case AV_PIX_FMT_RGB8:
        case AV_PIX_FMT_BGR8:
            c->yuv2packed1 = yuv2rgb8_1_lsx;
            c->yuv2packed2 = yuv2rgb8_2_lsx;
            c->yuv2packedX = yuv2rgb8_X_lsx;
            break;
        case AV_PIX_FMT_RGB4:
        case AV_PIX_FMT_BGR4:
            c->yuv2packed1 = yuv2rgb4_1_lsx;
            c->yuv2packed2 = yuv2rgb4_2_lsx;
            c->yuv2packedX = yuv2rgb4_X_lsx;
            break;
        case AV_PIX_FMT_RGB4_BYTE:
        case AV_PIX_FMT_BGR4_BYTE:
            c->yuv2packed1 = yuv2rgb4b_1_lsx;
            c->yuv2packed2 = yuv2rgb4b_2_lsx;
            c->yuv2packedX = yuv2rgb4b_X_lsx;
            break;
        }
    }
}