swscale_template.c

  1. /*
  2. * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #undef REAL_MOVNTQ
  21. #undef MOVNTQ
  22. #undef MOVNTQ2
  23. #undef PREFETCH
  24. #if COMPILE_TEMPLATE_MMX2
  25. #define PREFETCH "prefetchnta"
  26. #else
  27. #define PREFETCH " # nop"
  28. #endif
  29. #if COMPILE_TEMPLATE_MMX2
  30. #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
  31. #define MOVNTQ2 "movntq "
  32. #else
  33. #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
  34. #define MOVNTQ2 "movq "
  35. #endif
  36. #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
  37. #if !COMPILE_TEMPLATE_MMX2
  38. static av_always_inline void
  39. dither_8to16(const uint8_t *srcDither, int rot)
  40. {
  41. if (rot) {
  42. __asm__ volatile("pxor %%mm0, %%mm0\n\t"
  43. "movq (%0), %%mm3\n\t"
  44. "movq %%mm3, %%mm4\n\t"
  45. "psrlq $24, %%mm3\n\t"
  46. "psllq $40, %%mm4\n\t"
  47. "por %%mm4, %%mm3\n\t"
  48. "movq %%mm3, %%mm4\n\t"
  49. "punpcklbw %%mm0, %%mm3\n\t"
  50. "punpckhbw %%mm0, %%mm4\n\t"
  51. :: "r"(srcDither)
  52. );
  53. } else {
  54. __asm__ volatile("pxor %%mm0, %%mm0\n\t"
  55. "movq (%0), %%mm3\n\t"
  56. "movq %%mm3, %%mm4\n\t"
  57. "punpcklbw %%mm0, %%mm3\n\t"
  58. "punpckhbw %%mm0, %%mm4\n\t"
  59. :: "r"(srcDither)
  60. );
  61. }
  62. }
  63. #endif
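/* dither_8to16() above loads the 8 dither bytes from srcDither into %mm3/%mm4
 * as eight zero-extended 16-bit words (low four in mm3, high four in mm4);
 * when rot is set, the 8 bytes are first rotated by 3 positions
 * (psrlq $24 / psllq $40 / por).  The values are deliberately left in the MMX
 * registers for the caller's following inline-asm block to pick up. */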
  64. static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
  65. const int16_t **src, uint8_t *dest, int dstW,
  66. const uint8_t *dither, int offset)
  67. {
  68. dither_8to16(dither, offset);
  69. __asm__ volatile(\
  70. "psraw $4, %%mm3\n\t"
  71. "psraw $4, %%mm4\n\t"
  72. "movq %%mm3, %%mm6\n\t"
  73. "movq %%mm4, %%mm7\n\t"
  74. "movl %3, %%ecx\n\t"
  75. "mov %0, %%"REG_d" \n\t"\
  76. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  77. ".p2align 4 \n\t" /* FIXME Unroll? */\
  78. "1: \n\t"\
  79. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  80. "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\
  81. "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\
  82. "add $16, %%"REG_d" \n\t"\
  83. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  84. "test %%"REG_S", %%"REG_S" \n\t"\
  85. "pmulhw %%mm0, %%mm2 \n\t"\
  86. "pmulhw %%mm0, %%mm5 \n\t"\
  87. "paddw %%mm2, %%mm3 \n\t"\
  88. "paddw %%mm5, %%mm4 \n\t"\
  89. " jnz 1b \n\t"\
  90. "psraw $3, %%mm3 \n\t"\
  91. "psraw $3, %%mm4 \n\t"\
  92. "packuswb %%mm4, %%mm3 \n\t"
  93. MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t"
  94. "add $8, %%"REG_c" \n\t"\
  95. "cmp %2, %%"REG_c" \n\t"\
  96. "movq %%mm6, %%mm3\n\t"
  97. "movq %%mm7, %%mm4\n\t"
  98. "mov %0, %%"REG_d" \n\t"\
  99. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  100. "jb 1b \n\t"\
  101. :: "g" (filter),
  102. "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
  103. : "%"REG_d, "%"REG_S, "%"REG_c
  104. );
  105. }
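/* yuv2yuvX() runs the vertical multi-tap filter for one planar output line.
 * The caller-prepared filter blob consists of 16-byte entries (a source-line
 * pointer followed by a replicated 16-bit coefficient), terminated by a NULL
 * pointer; each iteration accumulates pmulhw(coeff, src) for 8 pixels on top
 * of the dither words from dither_8to16(), then shifts and packs to bytes.
 * A rough scalar sketch (dither_term() is only a placeholder for the dither
 * bias set up above; the exact rounding of the MMX path differs slightly):
 *
 *     for (i = 0; i < dstW; i++) {
 *         int val = dither_term(i);
 *         for (j = 0; j < filterSize; j++)
 *             val += (src[j][i] * filter[j]) >> 16;
 *         dest[i] = av_clip_uint8(val >> 3);
 *     }
 */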
  106. static void RENAME(yuv2yuv1_ar)(const int16_t *src, uint8_t *dst, int dstW, const uint8_t *dither, int offset)
  107. {
  108. dither_8to16(dither, offset);
  109. __asm__ volatile(
  110. "mov %2, %%"REG_a" \n\t"
  111. ".p2align 4 \n\t" /* FIXME Unroll? */
  112. "1: \n\t"
  113. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
  114. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
  115. "paddsw %%mm3, %%mm0 \n\t"
  116. "paddsw %%mm4, %%mm1 \n\t"
  117. "psraw $7, %%mm0 \n\t"
  118. "psraw $7, %%mm1 \n\t"
  119. "packuswb %%mm1, %%mm0 \n\t"
  120. MOVNTQ(%%mm0, (%1, %%REGa))
  121. "add $8, %%"REG_a" \n\t"
  122. "jnc 1b \n\t"
  123. :: "r" (src + dstW), "r" (dst + dstW),
  124. "g" ((x86_reg)-dstW)
  125. : "%"REG_a
  126. );
  127. }
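/* yuv2yuv1_ar(): unscaled (single-tap) vertical path with accurate rounding.
 * Each 16-bit input sample gets the dither word added (saturating), is shifted
 * down by 7 and packed to unsigned bytes, 8 pixels per iteration.  The loop
 * counts a negative index (-dstW) up towards zero, which is why the source and
 * destination pointers are biased by dstW in the operand list. */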
  128. #define YSCALEYUV2PACKEDX_UV \
  129. __asm__ volatile(\
  130. "xor %%"REG_a", %%"REG_a" \n\t"\
  131. ".p2align 4 \n\t"\
  132. "nop \n\t"\
  133. "1: \n\t"\
  134. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  135. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  136. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  137. "movq %%mm3, %%mm4 \n\t"\
  138. ".p2align 4 \n\t"\
  139. "2: \n\t"\
  140. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  141. "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
  142. "add %6, %%"REG_S" \n\t" \
  143. "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
  144. "add $16, %%"REG_d" \n\t"\
  145. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  146. "pmulhw %%mm0, %%mm2 \n\t"\
  147. "pmulhw %%mm0, %%mm5 \n\t"\
  148. "paddw %%mm2, %%mm3 \n\t"\
  149. "paddw %%mm5, %%mm4 \n\t"\
  150. "test %%"REG_S", %%"REG_S" \n\t"\
  151. " jnz 2b \n\t"\
  152. #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
  153. "lea "offset"(%0), %%"REG_d" \n\t"\
  154. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  155. "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
  156. "movq "#dst1", "#dst2" \n\t"\
  157. ".p2align 4 \n\t"\
  158. "2: \n\t"\
  159. "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
  160. "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
  161. "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
  162. "add $16, %%"REG_d" \n\t"\
  163. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  164. "pmulhw "#coeff", "#src1" \n\t"\
  165. "pmulhw "#coeff", "#src2" \n\t"\
  166. "paddw "#src1", "#dst1" \n\t"\
  167. "paddw "#src2", "#dst2" \n\t"\
  168. "test %%"REG_S", %%"REG_S" \n\t"\
  169. " jnz 2b \n\t"\
  170. #define YSCALEYUV2PACKEDX \
  171. YSCALEYUV2PACKEDX_UV \
  172. YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
  173. #define YSCALEYUV2PACKEDX_END \
  174. :: "r" (&c->redDither), \
  175. "m" (dummy), "m" (dummy), "m" (dummy),\
  176. "r" (dest), "m" (dstW_reg), "m"(uv_off) \
  177. : "%"REG_a, "%"REG_d, "%"REG_S \
  178. );
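/* The YSCALEYUV2PACKEDX* macros implement the multi-tap vertical filter for
 * the packed-output (RGB/YUY2) cases: _UV accumulates the chroma taps into
 * %mm3 (U) and %mm4 (V), reaching the V plane by adding the uv_off operand to
 * the U pointer; _YA does the same for two groups of four luma samples.
 * The outer per-pixel loop opened at label 1 is closed later by the WRITE*
 * macro.  _END supplies the shared operand list: %0 is &c->redDither (the base
 * pointer the *_OFFSET constants are relative to), %4 the destination, %5 dstW
 * and %6 the chroma offset. */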
  179. #define YSCALEYUV2PACKEDX_ACCURATE_UV \
  180. __asm__ volatile(\
  181. "xor %%"REG_a", %%"REG_a" \n\t"\
  182. ".p2align 4 \n\t"\
  183. "nop \n\t"\
  184. "1: \n\t"\
  185. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  186. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  187. "pxor %%mm4, %%mm4 \n\t"\
  188. "pxor %%mm5, %%mm5 \n\t"\
  189. "pxor %%mm6, %%mm6 \n\t"\
  190. "pxor %%mm7, %%mm7 \n\t"\
  191. ".p2align 4 \n\t"\
  192. "2: \n\t"\
  193. "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
  194. "add %6, %%"REG_S" \n\t" \
  195. "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
  196. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  197. "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
  198. "movq %%mm0, %%mm3 \n\t"\
  199. "punpcklwd %%mm1, %%mm0 \n\t"\
  200. "punpckhwd %%mm1, %%mm3 \n\t"\
  201. "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
  202. "pmaddwd %%mm1, %%mm0 \n\t"\
  203. "pmaddwd %%mm1, %%mm3 \n\t"\
  204. "paddd %%mm0, %%mm4 \n\t"\
  205. "paddd %%mm3, %%mm5 \n\t"\
  206. "add %6, %%"REG_S" \n\t" \
  207. "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
  208. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  209. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  210. "test %%"REG_S", %%"REG_S" \n\t"\
  211. "movq %%mm2, %%mm0 \n\t"\
  212. "punpcklwd %%mm3, %%mm2 \n\t"\
  213. "punpckhwd %%mm3, %%mm0 \n\t"\
  214. "pmaddwd %%mm1, %%mm2 \n\t"\
  215. "pmaddwd %%mm1, %%mm0 \n\t"\
  216. "paddd %%mm2, %%mm6 \n\t"\
  217. "paddd %%mm0, %%mm7 \n\t"\
  218. " jnz 2b \n\t"\
  219. "psrad $16, %%mm4 \n\t"\
  220. "psrad $16, %%mm5 \n\t"\
  221. "psrad $16, %%mm6 \n\t"\
  222. "psrad $16, %%mm7 \n\t"\
  223. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  224. "packssdw %%mm5, %%mm4 \n\t"\
  225. "packssdw %%mm7, %%mm6 \n\t"\
  226. "paddw %%mm0, %%mm4 \n\t"\
  227. "paddw %%mm0, %%mm6 \n\t"\
  228. "movq %%mm4, "U_TEMP"(%0) \n\t"\
  229. "movq %%mm6, "V_TEMP"(%0) \n\t"\
  230. #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
  231. "lea "offset"(%0), %%"REG_d" \n\t"\
  232. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  233. "pxor %%mm1, %%mm1 \n\t"\
  234. "pxor %%mm5, %%mm5 \n\t"\
  235. "pxor %%mm7, %%mm7 \n\t"\
  236. "pxor %%mm6, %%mm6 \n\t"\
  237. ".p2align 4 \n\t"\
  238. "2: \n\t"\
  239. "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
  240. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
  241. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  242. "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
  243. "movq %%mm0, %%mm3 \n\t"\
  244. "punpcklwd %%mm4, %%mm0 \n\t"\
  245. "punpckhwd %%mm4, %%mm3 \n\t"\
  246. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
  247. "pmaddwd %%mm4, %%mm0 \n\t"\
  248. "pmaddwd %%mm4, %%mm3 \n\t"\
  249. "paddd %%mm0, %%mm1 \n\t"\
  250. "paddd %%mm3, %%mm5 \n\t"\
  251. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
  252. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  253. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  254. "test %%"REG_S", %%"REG_S" \n\t"\
  255. "movq %%mm2, %%mm0 \n\t"\
  256. "punpcklwd %%mm3, %%mm2 \n\t"\
  257. "punpckhwd %%mm3, %%mm0 \n\t"\
  258. "pmaddwd %%mm4, %%mm2 \n\t"\
  259. "pmaddwd %%mm4, %%mm0 \n\t"\
  260. "paddd %%mm2, %%mm7 \n\t"\
  261. "paddd %%mm0, %%mm6 \n\t"\
  262. " jnz 2b \n\t"\
  263. "psrad $16, %%mm1 \n\t"\
  264. "psrad $16, %%mm5 \n\t"\
  265. "psrad $16, %%mm7 \n\t"\
  266. "psrad $16, %%mm6 \n\t"\
  267. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  268. "packssdw %%mm5, %%mm1 \n\t"\
  269. "packssdw %%mm6, %%mm7 \n\t"\
  270. "paddw %%mm0, %%mm1 \n\t"\
  271. "paddw %%mm0, %%mm7 \n\t"\
  272. "movq "U_TEMP"(%0), %%mm3 \n\t"\
  273. "movq "V_TEMP"(%0), %%mm4 \n\t"\
  274. #define YSCALEYUV2PACKEDX_ACCURATE \
  275. YSCALEYUV2PACKEDX_ACCURATE_UV \
  276. YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
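/* The _ACCURATE variants do the same accumulation with pmaddwd into 32-bit
 * sums (two filter taps per multiply-add, the second line coming from
 * APCK_PTR2), shift the sums down by 16, pack back to words and add the
 * rounder, avoiding the precision loss of the plain pmulhw path; the chroma
 * result is parked in U_TEMP/V_TEMP in the context so the luma pass can reuse
 * the registers. */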
  277. #define YSCALEYUV2RGBX \
  278. "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
  279. "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
  280. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  281. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  282. "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
  283. "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
  284. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  285. "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
  286. "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
  287. "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
  288. "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
  289. "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
  290. "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
  291. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  292. "paddw %%mm3, %%mm4 \n\t"\
  293. "movq %%mm2, %%mm0 \n\t"\
  294. "movq %%mm5, %%mm6 \n\t"\
  295. "movq %%mm4, %%mm3 \n\t"\
  296. "punpcklwd %%mm2, %%mm2 \n\t"\
  297. "punpcklwd %%mm5, %%mm5 \n\t"\
  298. "punpcklwd %%mm4, %%mm4 \n\t"\
  299. "paddw %%mm1, %%mm2 \n\t"\
  300. "paddw %%mm1, %%mm5 \n\t"\
  301. "paddw %%mm1, %%mm4 \n\t"\
  302. "punpckhwd %%mm0, %%mm0 \n\t"\
  303. "punpckhwd %%mm6, %%mm6 \n\t"\
  304. "punpckhwd %%mm3, %%mm3 \n\t"\
  305. "paddw %%mm7, %%mm0 \n\t"\
  306. "paddw %%mm7, %%mm6 \n\t"\
  307. "paddw %%mm7, %%mm3 \n\t"\
  308. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  309. "packuswb %%mm0, %%mm2 \n\t"\
  310. "packuswb %%mm6, %%mm5 \n\t"\
  311. "packuswb %%mm3, %%mm4 \n\t"\
  312. #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
  313. "movq "#b", "#q2" \n\t" /* B */\
  314. "movq "#r", "#t" \n\t" /* R */\
  315. "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
  316. "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
  317. "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
  318. "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
  319. "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
  320. "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
  321. "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
  322. "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
  323. "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
  324. "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
  325. \
  326. MOVNTQ( q0, (dst, index, 4))\
  327. MOVNTQ( b, 8(dst, index, 4))\
  328. MOVNTQ( q2, 16(dst, index, 4))\
  329. MOVNTQ( q3, 24(dst, index, 4))\
  330. \
  331. "add $8, "#index" \n\t"\
  332. "cmp "#dstw", "#index" \n\t"\
  333. " jb 1b \n\t"
  334. #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
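/* YSCALEYUV2RGBX converts the accumulated Y (%mm1/%mm7) and U/V (%mm3/%mm4)
 * words to RGB using the per-context coefficients (Y_COEFF, the UB/UG/VG/VR
 * coefficients and the Y/U/V offsets) and leaves 8 packed bytes per component:
 * B in %mm2, G in %mm4, R in %mm5.  WRITEBGR32 then interleaves those with the
 * alpha register into four quadwords of 32-bit pixels and stores 8 pixels
 * (32 bytes) per loop iteration. */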
  335. static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
  336. const int16_t **lumSrc, int lumFilterSize,
  337. const int16_t *chrFilter, const int16_t **chrUSrc,
  338. const int16_t **chrVSrc,
  339. int chrFilterSize, const int16_t **alpSrc,
  340. uint8_t *dest, int dstW, int dstY)
  341. {
  342. x86_reg dummy=0;
  343. x86_reg dstW_reg = dstW;
  344. x86_reg uv_off = c->uv_offx2;
  345. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  346. YSCALEYUV2PACKEDX_ACCURATE
  347. YSCALEYUV2RGBX
  348. "movq %%mm2, "U_TEMP"(%0) \n\t"
  349. "movq %%mm4, "V_TEMP"(%0) \n\t"
  350. "movq %%mm5, "Y_TEMP"(%0) \n\t"
  351. YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
  352. "movq "Y_TEMP"(%0), %%mm5 \n\t"
  353. "psraw $3, %%mm1 \n\t"
  354. "psraw $3, %%mm7 \n\t"
  355. "packuswb %%mm7, %%mm1 \n\t"
  356. WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
  357. YSCALEYUV2PACKEDX_END
  358. } else {
  359. YSCALEYUV2PACKEDX_ACCURATE
  360. YSCALEYUV2RGBX
  361. "pcmpeqd %%mm7, %%mm7 \n\t"
  362. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  363. YSCALEYUV2PACKEDX_END
  364. }
  365. }
  366. static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
  367. const int16_t **lumSrc, int lumFilterSize,
  368. const int16_t *chrFilter, const int16_t **chrUSrc,
  369. const int16_t **chrVSrc,
  370. int chrFilterSize, const int16_t **alpSrc,
  371. uint8_t *dest, int dstW, int dstY)
  372. {
  373. x86_reg dummy=0;
  374. x86_reg dstW_reg = dstW;
  375. x86_reg uv_off = c->uv_offx2;
  376. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  377. YSCALEYUV2PACKEDX
  378. YSCALEYUV2RGBX
  379. YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
  380. "psraw $3, %%mm1 \n\t"
  381. "psraw $3, %%mm7 \n\t"
  382. "packuswb %%mm7, %%mm1 \n\t"
  383. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  384. YSCALEYUV2PACKEDX_END
  385. } else {
  386. YSCALEYUV2PACKEDX
  387. YSCALEYUV2RGBX
  388. "pcmpeqd %%mm7, %%mm7 \n\t"
  389. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  390. YSCALEYUV2PACKEDX_END
  391. }
  392. }
  393. #define REAL_WRITERGB16(dst, dstw, index) \
  394. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  395. "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
  396. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  397. "psrlq $3, %%mm2 \n\t"\
  398. \
  399. "movq %%mm2, %%mm1 \n\t"\
  400. "movq %%mm4, %%mm3 \n\t"\
  401. \
  402. "punpcklbw %%mm7, %%mm3 \n\t"\
  403. "punpcklbw %%mm5, %%mm2 \n\t"\
  404. "punpckhbw %%mm7, %%mm4 \n\t"\
  405. "punpckhbw %%mm5, %%mm1 \n\t"\
  406. \
  407. "psllq $3, %%mm3 \n\t"\
  408. "psllq $3, %%mm4 \n\t"\
  409. \
  410. "por %%mm3, %%mm2 \n\t"\
  411. "por %%mm4, %%mm1 \n\t"\
  412. \
  413. MOVNTQ(%%mm2, (dst, index, 2))\
  414. MOVNTQ(%%mm1, 8(dst, index, 2))\
  415. \
  416. "add $8, "#index" \n\t"\
  417. "cmp "#dstw", "#index" \n\t"\
  418. " jb 1b \n\t"
  419. #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
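/* WRITERGB16 packs the byte-sized B/G/R registers into 16-bit RGB565 pixels:
 * each output word is R[7:3]<<11 | G[7:2]<<5 | B[7:3] (the bF8/bFC masks keep
 * the 5 resp. 6 significant bits), writing 8 pixels = 16 bytes per iteration. */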
  420. static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
  421. const int16_t **lumSrc, int lumFilterSize,
  422. const int16_t *chrFilter, const int16_t **chrUSrc,
  423. const int16_t **chrVSrc,
  424. int chrFilterSize, const int16_t **alpSrc,
  425. uint8_t *dest, int dstW, int dstY)
  426. {
  427. x86_reg dummy=0;
  428. x86_reg dstW_reg = dstW;
  429. x86_reg uv_off = c->uv_offx2;
  430. YSCALEYUV2PACKEDX_ACCURATE
  431. YSCALEYUV2RGBX
  432. "pxor %%mm7, %%mm7 \n\t"
  433. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  434. #ifdef DITHER1XBPP
  435. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  436. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  437. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  438. #endif
  439. WRITERGB16(%4, %5, %%REGa)
  440. YSCALEYUV2PACKEDX_END
  441. }
  442. static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
  443. const int16_t **lumSrc, int lumFilterSize,
  444. const int16_t *chrFilter, const int16_t **chrUSrc,
  445. const int16_t **chrVSrc,
  446. int chrFilterSize, const int16_t **alpSrc,
  447. uint8_t *dest, int dstW, int dstY)
  448. {
  449. x86_reg dummy=0;
  450. x86_reg dstW_reg = dstW;
  451. x86_reg uv_off = c->uv_offx2;
  452. YSCALEYUV2PACKEDX
  453. YSCALEYUV2RGBX
  454. "pxor %%mm7, %%mm7 \n\t"
  455. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  456. #ifdef DITHER1XBPP
  457. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  458. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  459. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  460. #endif
  461. WRITERGB16(%4, %5, %%REGa)
  462. YSCALEYUV2PACKEDX_END
  463. }
  464. #define REAL_WRITERGB15(dst, dstw, index) \
  465. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  466. "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
  467. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  468. "psrlq $3, %%mm2 \n\t"\
  469. "psrlq $1, %%mm5 \n\t"\
  470. \
  471. "movq %%mm2, %%mm1 \n\t"\
  472. "movq %%mm4, %%mm3 \n\t"\
  473. \
  474. "punpcklbw %%mm7, %%mm3 \n\t"\
  475. "punpcklbw %%mm5, %%mm2 \n\t"\
  476. "punpckhbw %%mm7, %%mm4 \n\t"\
  477. "punpckhbw %%mm5, %%mm1 \n\t"\
  478. \
  479. "psllq $2, %%mm3 \n\t"\
  480. "psllq $2, %%mm4 \n\t"\
  481. \
  482. "por %%mm3, %%mm2 \n\t"\
  483. "por %%mm4, %%mm1 \n\t"\
  484. \
  485. MOVNTQ(%%mm2, (dst, index, 2))\
  486. MOVNTQ(%%mm1, 8(dst, index, 2))\
  487. \
  488. "add $8, "#index" \n\t"\
  489. "cmp "#dstw", "#index" \n\t"\
  490. " jb 1b \n\t"
  491. #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
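/* WRITERGB15 is the 5-5-5 variant: each output word is
 * R[7:3]<<10 | G[7:3]<<5 | B[7:3], with bit 15 left clear. */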
  492. static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
  493. const int16_t **lumSrc, int lumFilterSize,
  494. const int16_t *chrFilter, const int16_t **chrUSrc,
  495. const int16_t **chrVSrc,
  496. int chrFilterSize, const int16_t **alpSrc,
  497. uint8_t *dest, int dstW, int dstY)
  498. {
  499. x86_reg dummy=0;
  500. x86_reg dstW_reg = dstW;
  501. x86_reg uv_off = c->uv_offx2;
  502. YSCALEYUV2PACKEDX_ACCURATE
  503. YSCALEYUV2RGBX
  504. "pxor %%mm7, %%mm7 \n\t"
  505. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  506. #ifdef DITHER1XBPP
  507. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  508. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  509. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  510. #endif
  511. WRITERGB15(%4, %5, %%REGa)
  512. YSCALEYUV2PACKEDX_END
  513. }
  514. static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
  515. const int16_t **lumSrc, int lumFilterSize,
  516. const int16_t *chrFilter, const int16_t **chrUSrc,
  517. const int16_t **chrVSrc,
  518. int chrFilterSize, const int16_t **alpSrc,
  519. uint8_t *dest, int dstW, int dstY)
  520. {
  521. x86_reg dummy=0;
  522. x86_reg dstW_reg = dstW;
  523. x86_reg uv_off = c->uv_offx2;
  524. YSCALEYUV2PACKEDX
  525. YSCALEYUV2RGBX
  526. "pxor %%mm7, %%mm7 \n\t"
  527. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  528. #ifdef DITHER1XBPP
  529. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  530. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  531. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  532. #endif
  533. WRITERGB15(%4, %5, %%REGa)
  534. YSCALEYUV2PACKEDX_END
  535. }
  536. #define WRITEBGR24MMX(dst, dstw, index) \
  537. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  538. "movq %%mm2, %%mm1 \n\t" /* B */\
  539. "movq %%mm5, %%mm6 \n\t" /* R */\
  540. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  541. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  542. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  543. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  544. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  545. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  546. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  547. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  548. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  549. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  550. \
  551. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  552. "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
  553. "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
  554. "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
  555. \
  556. "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
  557. "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
  558. "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
  559. "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
  560. \
  561. "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
  562. "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
  563. "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
  564. "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
  565. \
  566. "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
  567. "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
  568. "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
  569. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  570. MOVNTQ(%%mm0, (dst))\
  571. \
  572. "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
  573. "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
  574. "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
  575. "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
  576. MOVNTQ(%%mm6, 8(dst))\
  577. \
  578. "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
  579. "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
  580. "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
  581. MOVNTQ(%%mm5, 16(dst))\
  582. \
  583. "add $24, "#dst" \n\t"\
  584. \
  585. "add $8, "#index" \n\t"\
  586. "cmp "#dstw", "#index" \n\t"\
  587. " jb 1b \n\t"
  588. #define WRITEBGR24MMX2(dst, dstw, index) \
  589. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  590. "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
  591. "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
  592. "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
  593. "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
  594. "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
  595. \
  596. "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
  597. "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
  598. "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
  599. \
  600. "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
  601. "por %%mm1, %%mm6 \n\t"\
  602. "por %%mm3, %%mm6 \n\t"\
  603. MOVNTQ(%%mm6, (dst))\
  604. \
  605. "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
  606. "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
  607. "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
  608. "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
  609. \
  610. "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
  611. "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
  612. "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
  613. \
  614. "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
  615. "por %%mm3, %%mm6 \n\t"\
  616. MOVNTQ(%%mm6, 8(dst))\
  617. \
  618. "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
  619. "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
  620. "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
  621. \
  622. "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
  623. "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
  624. "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
  625. \
  626. "por %%mm1, %%mm3 \n\t"\
  627. "por %%mm3, %%mm6 \n\t"\
  628. MOVNTQ(%%mm6, 16(dst))\
  629. \
  630. "add $24, "#dst" \n\t"\
  631. \
  632. "add $8, "#index" \n\t"\
  633. "cmp "#dstw", "#index" \n\t"\
  634. " jb 1b \n\t"
  635. #if COMPILE_TEMPLATE_MMX2
  636. #undef WRITEBGR24
  637. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
  638. #else
  639. #undef WRITEBGR24
  640. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
  641. #endif
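/* Both WRITEBGR24 variants emit 8 pixels as 24 packed bytes per iteration:
 * the plain MMX version builds 0RGB dwords and shifts/ORs them together,
 * while the MMX2 version shuffles the bytes directly with pshufw and the
 * ff_M24A/ff_M24B/ff_M24C masks. */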
  642. static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
  643. const int16_t **lumSrc, int lumFilterSize,
  644. const int16_t *chrFilter, const int16_t **chrUSrc,
  645. const int16_t **chrVSrc,
  646. int chrFilterSize, const int16_t **alpSrc,
  647. uint8_t *dest, int dstW, int dstY)
  648. {
  649. x86_reg dummy=0;
  650. x86_reg dstW_reg = dstW;
  651. x86_reg uv_off = c->uv_offx2;
  652. YSCALEYUV2PACKEDX_ACCURATE
  653. YSCALEYUV2RGBX
  654. "pxor %%mm7, %%mm7 \n\t"
  655. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
  656. "add %4, %%"REG_c" \n\t"
  657. WRITEBGR24(%%REGc, %5, %%REGa)
  658. :: "r" (&c->redDither),
  659. "m" (dummy), "m" (dummy), "m" (dummy),
  660. "r" (dest), "m" (dstW_reg), "m"(uv_off)
  661. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  662. );
  663. }
  664. static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
  665. const int16_t **lumSrc, int lumFilterSize,
  666. const int16_t *chrFilter, const int16_t **chrUSrc,
  667. const int16_t **chrVSrc,
  668. int chrFilterSize, const int16_t **alpSrc,
  669. uint8_t *dest, int dstW, int dstY)
  670. {
  671. x86_reg dummy=0;
  672. x86_reg dstW_reg = dstW;
  673. x86_reg uv_off = c->uv_offx2;
  674. YSCALEYUV2PACKEDX
  675. YSCALEYUV2RGBX
  676. "pxor %%mm7, %%mm7 \n\t"
  677. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
  678. "add %4, %%"REG_c" \n\t"
  679. WRITEBGR24(%%REGc, %5, %%REGa)
  680. :: "r" (&c->redDither),
  681. "m" (dummy), "m" (dummy), "m" (dummy),
  682. "r" (dest), "m" (dstW_reg), "m"(uv_off)
  683. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  684. );
  685. }
  686. #define REAL_WRITEYUY2(dst, dstw, index) \
  687. "packuswb %%mm3, %%mm3 \n\t"\
  688. "packuswb %%mm4, %%mm4 \n\t"\
  689. "packuswb %%mm7, %%mm1 \n\t"\
  690. "punpcklbw %%mm4, %%mm3 \n\t"\
  691. "movq %%mm1, %%mm7 \n\t"\
  692. "punpcklbw %%mm3, %%mm1 \n\t"\
  693. "punpckhbw %%mm3, %%mm7 \n\t"\
  694. \
  695. MOVNTQ(%%mm1, (dst, index, 2))\
  696. MOVNTQ(%%mm7, 8(dst, index, 2))\
  697. \
  698. "add $8, "#index" \n\t"\
  699. "cmp "#dstw", "#index" \n\t"\
  700. " jb 1b \n\t"
  701. #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
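/* WRITEYUY2 packs Y (%mm1/%mm7), U (%mm3) and V (%mm4) down to bytes and
 * interleaves them as Y0 U0 Y1 V0 Y2 U1 Y3 V1 ..., i.e. YUYV (YUY2) order,
 * writing 8 luma samples (16 output bytes) per iteration. */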
  702. static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
  703. const int16_t **lumSrc, int lumFilterSize,
  704. const int16_t *chrFilter, const int16_t **chrUSrc,
  705. const int16_t **chrVSrc,
  706. int chrFilterSize, const int16_t **alpSrc,
  707. uint8_t *dest, int dstW, int dstY)
  708. {
  709. x86_reg dummy=0;
  710. x86_reg dstW_reg = dstW;
  711. x86_reg uv_off = c->uv_offx2;
  712. YSCALEYUV2PACKEDX_ACCURATE
  713. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  714. "psraw $3, %%mm3 \n\t"
  715. "psraw $3, %%mm4 \n\t"
  716. "psraw $3, %%mm1 \n\t"
  717. "psraw $3, %%mm7 \n\t"
  718. WRITEYUY2(%4, %5, %%REGa)
  719. YSCALEYUV2PACKEDX_END
  720. }
  721. static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
  722. const int16_t **lumSrc, int lumFilterSize,
  723. const int16_t *chrFilter, const int16_t **chrUSrc,
  724. const int16_t **chrVSrc,
  725. int chrFilterSize, const int16_t **alpSrc,
  726. uint8_t *dest, int dstW, int dstY)
  727. {
  728. x86_reg dummy=0;
  729. x86_reg dstW_reg = dstW;
  730. x86_reg uv_off = c->uv_offx2;
  731. YSCALEYUV2PACKEDX
  732. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  733. "psraw $3, %%mm3 \n\t"
  734. "psraw $3, %%mm4 \n\t"
  735. "psraw $3, %%mm1 \n\t"
  736. "psraw $3, %%mm7 \n\t"
  737. WRITEYUY2(%4, %5, %%REGa)
  738. YSCALEYUV2PACKEDX_END
  739. }
  740. #define REAL_YSCALEYUV2RGB_UV(index, c) \
  741. "xor "#index", "#index" \n\t"\
  742. ".p2align 4 \n\t"\
  743. "1: \n\t"\
  744. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  745. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  746. "add "UV_OFFx2"("#c"), "#index" \n\t" \
  747. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  748. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  749. "sub "UV_OFFx2"("#c"), "#index" \n\t" \
  750. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  751. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  752. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  753. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  754. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  755. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  756. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  757. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  758. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  759. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  760. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  761. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  762. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  763. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  764. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  765. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  766. #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
  767. "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  768. "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  769. "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  770. "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  771. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  772. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  773. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  774. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  775. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  776. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  777. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  778. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  779. #define REAL_YSCALEYUV2RGB_COEFF(c) \
  780. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  781. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  782. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  783. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  784. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  785. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  786. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  787. "paddw %%mm3, %%mm4 \n\t"\
  788. "movq %%mm2, %%mm0 \n\t"\
  789. "movq %%mm5, %%mm6 \n\t"\
  790. "movq %%mm4, %%mm3 \n\t"\
  791. "punpcklwd %%mm2, %%mm2 \n\t"\
  792. "punpcklwd %%mm5, %%mm5 \n\t"\
  793. "punpcklwd %%mm4, %%mm4 \n\t"\
  794. "paddw %%mm1, %%mm2 \n\t"\
  795. "paddw %%mm1, %%mm5 \n\t"\
  796. "paddw %%mm1, %%mm4 \n\t"\
  797. "punpckhwd %%mm0, %%mm0 \n\t"\
  798. "punpckhwd %%mm6, %%mm6 \n\t"\
  799. "punpckhwd %%mm3, %%mm3 \n\t"\
  800. "paddw %%mm7, %%mm0 \n\t"\
  801. "paddw %%mm7, %%mm6 \n\t"\
  802. "paddw %%mm7, %%mm3 \n\t"\
  803. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  804. "packuswb %%mm0, %%mm2 \n\t"\
  805. "packuswb %%mm6, %%mm5 \n\t"\
  806. "packuswb %%mm3, %%mm4 \n\t"\
  807. #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
  808. #define YSCALEYUV2RGB(index, c) \
  809. REAL_YSCALEYUV2RGB_UV(index, c) \
  810. REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
  811. REAL_YSCALEYUV2RGB_COEFF(c)
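/* YSCALEYUV2RGB is the 2-tap (vertical bilinear) version: luma and chroma are
 * blended between the two input lines roughly as
 *     val = (buf1[i] >> 4) + (((buf0[i] - buf1[i]) * alpha) >> 16)
 * with alpha taken from the CHR/LUM_MMX_FILTER_OFFSET+8 slots, and the result
 * is then run through the same YUV->RGB coefficient block as YSCALEYUV2RGBX. */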
  812. /**
  813. * vertical bilinear scale YV12 to RGB
  814. */
  815. static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
  816. const int16_t *ubuf[2], const int16_t *vbuf[2],
  817. const int16_t *abuf[2], uint8_t *dest,
  818. int dstW, int yalpha, int uvalpha, int y)
  819. {
  820. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  821. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  822. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  823. const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
  824. #if ARCH_X86_64
  825. __asm__ volatile(
  826. YSCALEYUV2RGB(%%r8, %5)
  827. YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
  828. "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  829. "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  830. "packuswb %%mm7, %%mm1 \n\t"
  831. WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  832. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
  833. "a" (&c->redDither),
  834. "r" (abuf0), "r" (abuf1)
  835. : "%r8"
  836. );
  837. #else
  838. c->u_temp=(intptr_t)abuf0;
  839. c->v_temp=(intptr_t)abuf1;
  840. __asm__ volatile(
  841. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  842. "mov %4, %%"REG_b" \n\t"
  843. "push %%"REG_BP" \n\t"
  844. YSCALEYUV2RGB(%%REGBP, %5)
  845. "push %0 \n\t"
  846. "push %1 \n\t"
  847. "mov "U_TEMP"(%5), %0 \n\t"
  848. "mov "V_TEMP"(%5), %1 \n\t"
  849. YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
  850. "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  851. "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  852. "packuswb %%mm7, %%mm1 \n\t"
  853. "pop %1 \n\t"
  854. "pop %0 \n\t"
  855. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  856. "pop %%"REG_BP" \n\t"
  857. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  858. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  859. "a" (&c->redDither)
  860. );
  861. #endif
  862. } else {
  863. __asm__ volatile(
  864. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  865. "mov %4, %%"REG_b" \n\t"
  866. "push %%"REG_BP" \n\t"
  867. YSCALEYUV2RGB(%%REGBP, %5)
  868. "pcmpeqd %%mm7, %%mm7 \n\t"
  869. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  870. "pop %%"REG_BP" \n\t"
  871. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  872. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  873. "a" (&c->redDither)
  874. );
  875. }
  876. }
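/* On x86-32 these functions cannot simply clobber %ebx (the PIC register) or
 * %ebp (the frame pointer), so they free the two registers by hand: %ebx is
 * stashed in the SwsContext at ESP_OFFSET and %ebp is pushed on the stack,
 * and both are restored after the write-out macro. */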
  877. static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
  878. const int16_t *ubuf[2], const int16_t *vbuf[2],
  879. const int16_t *abuf[2], uint8_t *dest,
  880. int dstW, int yalpha, int uvalpha, int y)
  881. {
  882. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  883. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  884. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  885. __asm__ volatile(
  886. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  887. "mov %4, %%"REG_b" \n\t"
  888. "push %%"REG_BP" \n\t"
  889. YSCALEYUV2RGB(%%REGBP, %5)
  890. "pxor %%mm7, %%mm7 \n\t"
  891. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  892. "pop %%"REG_BP" \n\t"
  893. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  894. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  895. "a" (&c->redDither)
  896. );
  897. }
  898. static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
  899. const int16_t *ubuf[2], const int16_t *vbuf[2],
  900. const int16_t *abuf[2], uint8_t *dest,
  901. int dstW, int yalpha, int uvalpha, int y)
  902. {
  903. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  904. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  905. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  906. __asm__ volatile(
  907. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  908. "mov %4, %%"REG_b" \n\t"
  909. "push %%"REG_BP" \n\t"
  910. YSCALEYUV2RGB(%%REGBP, %5)
  911. "pxor %%mm7, %%mm7 \n\t"
  912. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  913. #ifdef DITHER1XBPP
  914. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  915. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  916. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  917. #endif
  918. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  919. "pop %%"REG_BP" \n\t"
  920. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  921. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  922. "a" (&c->redDither)
  923. );
  924. }
  925. static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
  926. const int16_t *ubuf[2], const int16_t *vbuf[2],
  927. const int16_t *abuf[2], uint8_t *dest,
  928. int dstW, int yalpha, int uvalpha, int y)
  929. {
  930. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  931. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  932. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  933. __asm__ volatile(
  934. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  935. "mov %4, %%"REG_b" \n\t"
  936. "push %%"REG_BP" \n\t"
  937. YSCALEYUV2RGB(%%REGBP, %5)
  938. "pxor %%mm7, %%mm7 \n\t"
  939. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  940. #ifdef DITHER1XBPP
  941. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  942. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  943. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  944. #endif
  945. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  946. "pop %%"REG_BP" \n\t"
  947. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  948. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  949. "a" (&c->redDither)
  950. );
  951. }
  952. #define REAL_YSCALEYUV2PACKED(index, c) \
  953. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  954. "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
  955. "psraw $3, %%mm0 \n\t"\
  956. "psraw $3, %%mm1 \n\t"\
  957. "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  958. "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  959. "xor "#index", "#index" \n\t"\
  960. ".p2align 4 \n\t"\
  961. "1: \n\t"\
  962. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  963. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  964. "add "UV_OFFx2"("#c"), "#index" \n\t" \
  965. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  966. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  967. "sub "UV_OFFx2"("#c"), "#index" \n\t" \
  968. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  969. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  970. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  971. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  972. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  973. "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  974. "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  975. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  976. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  977. "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  978. "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  979. "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  980. "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  981. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  982. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  983. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  984. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  985. "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  986. "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  987. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  988. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  989. #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
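/* YSCALEYUV2PACKED is the packed-YUV counterpart of YSCALEYUV2RGB: it first
 * pre-shifts the two blend coefficients right by 3, then blends luma and
 * chroma between the two input lines the same way as the RGB path, but shifts
 * by 7 instead of 4 so the values stay in 8-bit range for WRITEYUY2 (no RGB
 * matrix is applied). */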
  990. static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
  991. const int16_t *ubuf[2], const int16_t *vbuf[2],
  992. const int16_t *abuf[2], uint8_t *dest,
  993. int dstW, int yalpha, int uvalpha, int y)
  994. {
  995. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  996. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  997. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  998. __asm__ volatile(
  999. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1000. "mov %4, %%"REG_b" \n\t"
  1001. "push %%"REG_BP" \n\t"
  1002. YSCALEYUV2PACKED(%%REGBP, %5)
  1003. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1004. "pop %%"REG_BP" \n\t"
  1005. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1006. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1007. "a" (&c->redDither)
  1008. );
  1009. }
  1010. #define REAL_YSCALEYUV2RGB1(index, c) \
  1011. "xor "#index", "#index" \n\t"\
  1012. ".p2align 4 \n\t"\
  1013. "1: \n\t"\
  1014. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  1015. "add "UV_OFFx2"("#c"), "#index" \n\t" \
  1016. "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  1017. "sub "UV_OFFx2"("#c"), "#index" \n\t" \
  1018. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  1019. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  1020. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  1021. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  1022. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  1023. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  1024. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  1025. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  1026. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  1027. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1028. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  1029. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1030. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1031. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  1032. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  1033. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  1034. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  1035. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  1036. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  1037. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  1038. "paddw %%mm3, %%mm4 \n\t"\
  1039. "movq %%mm2, %%mm0 \n\t"\
  1040. "movq %%mm5, %%mm6 \n\t"\
  1041. "movq %%mm4, %%mm3 \n\t"\
  1042. "punpcklwd %%mm2, %%mm2 \n\t"\
  1043. "punpcklwd %%mm5, %%mm5 \n\t"\
  1044. "punpcklwd %%mm4, %%mm4 \n\t"\
  1045. "paddw %%mm1, %%mm2 \n\t"\
  1046. "paddw %%mm1, %%mm5 \n\t"\
  1047. "paddw %%mm1, %%mm4 \n\t"\
  1048. "punpckhwd %%mm0, %%mm0 \n\t"\
  1049. "punpckhwd %%mm6, %%mm6 \n\t"\
  1050. "punpckhwd %%mm3, %%mm3 \n\t"\
  1051. "paddw %%mm7, %%mm0 \n\t"\
  1052. "paddw %%mm7, %%mm6 \n\t"\
  1053. "paddw %%mm7, %%mm3 \n\t"\
  1054. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  1055. "packuswb %%mm0, %%mm2 \n\t"\
  1056. "packuswb %%mm6, %%mm5 \n\t"\
  1057. "packuswb %%mm3, %%mm4 \n\t"\
  1058. #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
  1059. // do vertical chrominance interpolation
  1060. #define REAL_YSCALEYUV2RGB1b(index, c) \
  1061. "xor "#index", "#index" \n\t"\
  1062. ".p2align 4 \n\t"\
  1063. "1: \n\t"\
  1064. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  1065. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  1066. "add "UV_OFFx2"("#c"), "#index" \n\t" \
  1067. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  1068. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  1069. "sub "UV_OFFx2"("#c"), "#index" \n\t" \
  1070. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  1071. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  1072. "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
  1073. "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
  1074. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  1075. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  1076. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  1077. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  1078. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  1079. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  1080. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  1081. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1082. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  1083. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1084. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1085. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  1086. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  1087. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  1088. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  1089. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  1090. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  1091. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  1092. "paddw %%mm3, %%mm4 \n\t"\
  1093. "movq %%mm2, %%mm0 \n\t"\
  1094. "movq %%mm5, %%mm6 \n\t"\
  1095. "movq %%mm4, %%mm3 \n\t"\
  1096. "punpcklwd %%mm2, %%mm2 \n\t"\
  1097. "punpcklwd %%mm5, %%mm5 \n\t"\
  1098. "punpcklwd %%mm4, %%mm4 \n\t"\
  1099. "paddw %%mm1, %%mm2 \n\t"\
  1100. "paddw %%mm1, %%mm5 \n\t"\
  1101. "paddw %%mm1, %%mm4 \n\t"\
  1102. "punpckhwd %%mm0, %%mm0 \n\t"\
  1103. "punpckhwd %%mm6, %%mm6 \n\t"\
  1104. "punpckhwd %%mm3, %%mm3 \n\t"\
  1105. "paddw %%mm7, %%mm0 \n\t"\
  1106. "paddw %%mm7, %%mm6 \n\t"\
  1107. "paddw %%mm7, %%mm3 \n\t"\
  1108. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  1109. "packuswb %%mm0, %%mm2 \n\t"\
  1110. "packuswb %%mm6, %%mm5 \n\t"\
  1111. "packuswb %%mm3, %%mm4 \n\t"\
  1112. #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
  1113. #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
  1114. "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
  1115. "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
  1116. "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
  1117. "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
  1118. "packuswb %%mm1, %%mm7 \n\t"
  1119. #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
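/* The *RGB1* macros form the unscaled single-line path: YSCALEYUV2RGB1 takes
 * chroma from uvbuf0 only (used when uvalpha < 2048, i.e. nearest-line
 * chroma), YSCALEYUV2RGB1b averages uvbuf0 and uvbuf1, and
 * YSCALEYUV2RGB1_ALPHA fetches the alpha plane from operand %1.  Luma always
 * comes from buf0 in both cases. */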
  1120. /**
  1121. * YV12 to RGB without scaling or interpolating
  1122. */
  1123. static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
  1124. const int16_t *ubuf[2], const int16_t *bguf[2],
  1125. const int16_t *abuf0, uint8_t *dest,
  1126. int dstW, int uvalpha, int y)
  1127. {
  1128. const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1129. const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1130. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1131. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1132. __asm__ volatile(
  1133. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1134. "mov %4, %%"REG_b" \n\t"
  1135. "push %%"REG_BP" \n\t"
  1136. YSCALEYUV2RGB1(%%REGBP, %5)
  1137. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1138. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1139. "pop %%"REG_BP" \n\t"
  1140. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1141. :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1142. "a" (&c->redDither)
  1143. );
  1144. } else {
  1145. __asm__ volatile(
  1146. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1147. "mov %4, %%"REG_b" \n\t"
  1148. "push %%"REG_BP" \n\t"
  1149. YSCALEYUV2RGB1(%%REGBP, %5)
  1150. "pcmpeqd %%mm7, %%mm7 \n\t"
  1151. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1152. "pop %%"REG_BP" \n\t"
  1153. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1154. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1155. "a" (&c->redDither)
  1156. );
  1157. }
  1158. } else {
  1159. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1160. __asm__ volatile(
  1161. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1162. "mov %4, %%"REG_b" \n\t"
  1163. "push %%"REG_BP" \n\t"
  1164. YSCALEYUV2RGB1b(%%REGBP, %5)
  1165. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1166. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1167. "pop %%"REG_BP" \n\t"
  1168. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1169. :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1170. "a" (&c->redDither)
  1171. );
  1172. } else {
  1173. __asm__ volatile(
  1174. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1175. "mov %4, %%"REG_b" \n\t"
  1176. "push %%"REG_BP" \n\t"
  1177. YSCALEYUV2RGB1b(%%REGBP, %5)
  1178. "pcmpeqd %%mm7, %%mm7 \n\t"
  1179. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1180. "pop %%"REG_BP" \n\t"
  1181. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1182. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1183. "a" (&c->redDither)
  1184. );
  1185. }
  1186. }
  1187. }
  1188. static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
  1189. const int16_t *ubuf[2], const int16_t *bguf[2],
  1190. const int16_t *abuf0, uint8_t *dest,
  1191. int dstW, int uvalpha, int y)
  1192. {
  1193. const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1194. const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1195. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1196. __asm__ volatile(
  1197. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1198. "mov %4, %%"REG_b" \n\t"
  1199. "push %%"REG_BP" \n\t"
  1200. YSCALEYUV2RGB1(%%REGBP, %5)
  1201. "pxor %%mm7, %%mm7 \n\t"
  1202. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1203. "pop %%"REG_BP" \n\t"
  1204. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1205. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1206. "a" (&c->redDither)
  1207. );
  1208. } else {
  1209. __asm__ volatile(
  1210. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1211. "mov %4, %%"REG_b" \n\t"
  1212. "push %%"REG_BP" \n\t"
  1213. YSCALEYUV2RGB1b(%%REGBP, %5)
  1214. "pxor %%mm7, %%mm7 \n\t"
  1215. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1216. "pop %%"REG_BP" \n\t"
  1217. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1218. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1219. "a" (&c->redDither)
  1220. );
  1221. }
  1222. }

static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *bguf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}

static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *bguf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
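
/*
 * With DITHER1XBPP defined, both 15- and 16-bit writers above add a per-channel
 * bias (paddusb against the blue/green/red dither rows held in the context)
 * before WRITERGB15/WRITERGB16 truncate the 8-bit channels to 5 or 6 bits.
 * A scalar sketch of that saturating add plus truncation for RGB565;
 * illustrative only, pack565_dither_sketch and the d* biases are hypothetical.
 */
#if 0
static uint16_t pack565_dither_sketch(uint8_t r, uint8_t g, uint8_t b,
                                      uint8_t dr, uint8_t dg, uint8_t db)
{
    int R = FFMIN(r + dr, 255); /* paddusb: add with unsigned saturation */
    int G = FFMIN(g + dg, 255);
    int B = FFMIN(b + db, 255);
    return (uint16_t)(((R >> 3) << 11) | ((G >> 2) << 5) | (B >> 3));
}
#endif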

#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"

#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
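
/*
 * YSCALEYUV2PACKED1 reads chroma from uvbuf0 only and shifts the 15-bit samples
 * down by 7 (the "uvalpha < 2048" fast path that the callers flag as slightly
 * incorrect), while YSCALEYUV2PACKED1b sums uvbuf0 and uvbuf1 and shifts by 8,
 * i.e. averages the two input lines at the same scale. In scalar form, roughly
 * (illustrative only, hypothetical names):
 */
#if 0
static int chroma_1_sketch(const int16_t *uvbuf0, int i)
{
    return uvbuf0[i] >> 7;               /* single-line fast path */
}

static int chroma_1b_sketch(const int16_t *uvbuf0, const int16_t *uvbuf1, int i)
{
    return (uvbuf0[i] + uvbuf1[i]) >> 8; /* average of both lines */
}
#endif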

static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
                                  const int16_t *ubuf[2], const int16_t *bguf[2],
                                  const int16_t *abuf0, uint8_t *dest,
                                  int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1b(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}

#if !COMPILE_TEMPLATE_MMX2
//FIXME yuy2* can read up to 7 samples too much

static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
                            int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}

static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2,
                             int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}

/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src + 1, ...) would have 100% unaligned accesses. */
static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
                            int width, uint32_t *unused)
{
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}

static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2,
                             int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}
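
/*
 * The four packed-YUV readers above only gather every other byte: YUY2 stores
 * Y0 U Y1 V and UYVY stores U Y0 V Y1, and the bm01010101 mask / psrlw $8 /
 * packuswb sequences select the even or odd bytes accordingly. Scalar sketch
 * for the YUY2 case (illustrative only, hypothetical names):
 */
#if 0
static void yuy2_to_y_sketch(uint8_t *dst, const uint8_t *src, int width)
{
    for (int i = 0; i < width; i++)
        dst[i] = src[2 * i];          /* even bytes are luma in YUY2 */
}

static void yuy2_to_uv_sketch(uint8_t *dstU, uint8_t *dstV,
                              const uint8_t *src, int width)
{
    for (int i = 0; i < width; i++) {
        dstU[i] = src[4 * i + 1];     /* U sits between the two lumas */
        dstV[i] = src[4 * i + 3];
    }
}
#endif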

static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                              const uint8_t *src, int width)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "movq %%mm2, (%3, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
}
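
/*
 * nvXXtoUV simply deinterleaves the packed chroma plane; nv12ToUV and nv21ToUV
 * below only swap the destination order. Scalar sketch (illustrative only,
 * hypothetical name):
 */
#if 0
static void nvxx_to_uv_sketch(uint8_t *dst1, uint8_t *dst2,
                              const uint8_t *src, int width)
{
    for (int i = 0; i < width; i++) {
        dst1[i] = src[2 * i];         /* even bytes -> first plane  */
        dst2[i] = src[2 * i + 1];     /* odd bytes  -> second plane */
    }
}
#endif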

static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2,
                             int width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}

static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2,
                             int width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
#endif /* !COMPILE_TEMPLATE_MMX2 */

static av_always_inline void RENAME(bgr24ToY_mmx)(int16_t *dst, const uint8_t *src,
                                                  int width, enum PixelFormat srcFormat)
{
    if (srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
            :
        );
    }

    __asm__ volatile(
        "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
        "mov %2, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "movd 6(%0), %%mm2 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm4, %%mm0 \n\t"
        "paddd %%mm4, %%mm2 \n\t"
        "psrad $9, %%mm0 \n\t"
        "psrad $9, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dst+width), "g" ((x86_reg)-2*width)
        : "%"REG_a
    );
}
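
/*
 * The loop above converts four packed 24-bit pixels per iteration: each pixel's
 * bytes are widened to 16 bits, multiplied against the two halves of the luma
 * coefficient vector with pmaddwd, summed, offset and shifted right by 9 to
 * give one 15-bit luma sample. A scalar sketch of the same structure; the
 * actual coefficient and offset values live in ff_bgr24toY1Coeff /
 * ff_bgr24toY2Coeff / ff_bgr24toYOffset and are left symbolic here
 * (illustrative only, hypothetical name).
 */
#if 0
static int16_t bgr24_to_y_sketch(const uint8_t *p, int ry, int gy, int by,
                                 int y_offset)
{
    int b = p[0], g = p[1], r = p[2]; /* BGR byte order; RGB24 swaps the table */
    return (int16_t)((ry * r + gy * g + by * b + y_offset) >> 9);
}
#endif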

static void RENAME(bgr24ToY)(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
                             int width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
}

static void RENAME(rgb24ToY)(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
                             int width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
}

static av_always_inline void RENAME(bgr24ToUV_mmx)(int16_t *dstU, int16_t *dstV,
                                                   const uint8_t *src, int width,
                                                   enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq 24(%4), %%mm6 \n\t"
        "mov %3, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pmaddwd (%4), %%mm0 \n\t"
        "pmaddwd 8(%4), %%mm1 \n\t"
        "pmaddwd 16(%4), %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "movd 6(%0), %%mm1 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "pmaddwd (%4), %%mm1 \n\t"
        "pmaddwd 8(%4), %%mm3 \n\t"
        "pmaddwd 16(%4), %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm5 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm5, %%mm4 \n\t"
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
        "paddd %%mm3, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm3, %%mm4 \n\t"
        "psrad $9, %%mm0 \n\t"
        "psrad $9, %%mm2 \n\t"
        "psrad $9, %%mm1 \n\t"
        "psrad $9, %%mm4 \n\t"
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm4, %%mm2 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "movq %%mm2, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-2*width), "r" (ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
        : "%"REG_a
    );
}
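
/* The chroma path above mirrors the luma loop: the same 12-byte / 4-pixel
 * unrolling, but each pixel is multiplied against the U and V coefficient rows
 * of ff_bgr24toUV[] (indexed by srcFormat to cover both BGR24 and RGB24),
 * offset with ff_bgr24toUVOffset, shifted right by 9 and packed into the
 * 15-bit dstU/dstV samples. */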

static void RENAME(bgr24ToUV)(int16_t *dstU, int16_t *dstV,
                              const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2,
                              int width, uint32_t *unused)
{
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
    assert(src1 == src2);
}

static void RENAME(rgb24ToUV)(int16_t *dstU, int16_t *dstV,
                              const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2,
                              int width, uint32_t *unused)
{
    assert(src1 == src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
}

#if COMPILE_TEMPLATE_MMX2
static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                 int dstWidth, const uint8_t *src,
                                 int srcW, int xInc)
{
    int16_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    void *mmx2FilterCode = c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %5 \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
#if ARCH_X86_64
#define CALL_MMX2_FILTER_CODE \
        "movl (%%"REG_b"), %%esi \n\t"\
        "call *%4 \n\t"\
        "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
        "add %%"REG_S", %%"REG_c" \n\t"\
        "add %%"REG_a", %%"REG_D" \n\t"\
        "xor %%"REG_a", %%"REG_a" \n\t"\

#else
#define CALL_MMX2_FILTER_CODE \
        "movl (%%"REG_b"), %%esi \n\t"\
        "call *%4 \n\t"\
        "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
        "add %%"REG_a", %%"REG_D" \n\t"\
        "xor %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
#if defined(PIC)
        "mov %5, %%"REG_b" \n\t"
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );

    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
        dst[i] = src[srcW - 1] * 128;
}
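
/*
 * The generated filter code called above performs the fast bilinear horizontal
 * scale; the trailing C loop pads outputs that would read past the last input
 * sample with the edge pixel scaled to the 15-bit range. Roughly, in scalar
 * form (illustrative only, hypothetical name; like the assembly path, the
 * blend loop may read one byte past the last sample before the padding loop
 * overwrites those outputs). The chroma variant below does the same for two
 * planes at once.
 */
#if 0
static void hyscale_fast_sketch(int16_t *dst, int dstWidth,
                                const uint8_t *src, int srcW, int xInc)
{
    unsigned xpos = 0;
    for (int i = 0; i < dstWidth; i++, xpos += xInc) {
        unsigned xx     = xpos >> 16;
        unsigned xalpha = (xpos & 0xFFFF) >> 9;   /* 7-bit blend weight */
        dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
    }
    for (int i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
        dst[i] = src[srcW - 1] * 128;             /* edge padding */
}
#endif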

static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
                                 int dstWidth, const uint8_t *src1,
                                 const uint8_t *src2, int srcW, int xInc)
{
    int16_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    void *mmx2FilterCode = c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %7 \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "mov %5, %%"REG_c" \n\t" // src
        "mov %6, %%"REG_D" \n\t" // buf2
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
#if defined(PIC)
        "mov %7, %%"REG_b" \n\t"
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode), "m" (src2), "m" (dst2)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );

    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
        dst1[i] = src1[srcW - 1] * 128;
        dst2[i] = src2[srcW - 1] * 128;
    }
}
#endif /* COMPILE_TEMPLATE_MMX2 */
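
/*
 * sws_init_swScale wires the kernels from this template into the SwsContext:
 * the vertical scalers and packed-output writers (the accurate-rounding "_ar"
 * variants when SWS_ACCURATE_RND is set), the MMX2 fast bilinear horizontal
 * scalers when SWS_FAST_BILINEAR is allowed, and the packed-input readers
 * (YUY2/UYVY/NV12/NV21 only in the non-MMX2 build, BGR24/RGB24 in both).
 */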

static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat,
                     dstFormat = c->dstFormat;

    c->use_mmx_vfilter = 0;
    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != PIX_FMT_NV12
        && dstFormat != PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
        c->yuv2plane1 = RENAME(yuv2yuv1_ar);
        if (c->flags & SWS_ACCURATE_RND) {
            //c->yuv2yuv1 = RENAME(yuv2yuv1_ar);
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
                case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
                case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
                case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
                case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
                default: break;
                }
            }
        } else {
            int should_dither = isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
            //c->yuv2plane1 = should_dither ? RENAME(yuv2yuv1_ar) : RENAME(yuv2yuv1);
            c->use_mmx_vfilter = 1;
            c->yuv2planeX = RENAME(yuv2yuvX);
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
                case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
                case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
                case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
                case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
                default: break;
                }
            }
        }
        if (!(c->flags & SWS_FULL_CHR_H_INT)) {
            switch (c->dstFormat) {
            case PIX_FMT_RGB32:
                c->yuv2packed1 = RENAME(yuv2rgb32_1);
                c->yuv2packed2 = RENAME(yuv2rgb32_2);
                break;
            case PIX_FMT_BGR24:
                c->yuv2packed1 = RENAME(yuv2bgr24_1);
                c->yuv2packed2 = RENAME(yuv2bgr24_2);
                break;
            case PIX_FMT_RGB555:
                c->yuv2packed1 = RENAME(yuv2rgb555_1);
                c->yuv2packed2 = RENAME(yuv2rgb555_2);
                break;
            case PIX_FMT_RGB565:
                c->yuv2packed1 = RENAME(yuv2rgb565_1);
                c->yuv2packed2 = RENAME(yuv2rgb565_2);
                break;
            case PIX_FMT_YUYV422:
                c->yuv2packed1 = RENAME(yuv2yuyv422_1);
                c->yuv2packed2 = RENAME(yuv2yuyv422_2);
                break;
            default:
                break;
            }
        }
    }

    if (c->srcBpc == 8 && c->dstBpc <= 10) {
        // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMX2
        if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed) {
            c->hyscale_fast = RENAME(hyscale_fast);
            c->hcscale_fast = RENAME(hcscale_fast);
        } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
            c->hyscale_fast = NULL;
            c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMX2
        }
#endif /* COMPILE_TEMPLATE_MMX2 */
    }

#if !COMPILE_TEMPLATE_MMX2
    switch (srcFormat) {
    case PIX_FMT_YUYV422: c->chrToYV12 = RENAME(yuy2ToUV); break;
    case PIX_FMT_UYVY422: c->chrToYV12 = RENAME(uyvyToUV); break;
    case PIX_FMT_NV12:    c->chrToYV12 = RENAME(nv12ToUV); break;
    case PIX_FMT_NV21:    c->chrToYV12 = RENAME(nv21ToUV); break;
    default: break;
    }
#endif /* !COMPILE_TEMPLATE_MMX2 */
    if (!c->chrSrcHSubSample) {
        switch (srcFormat) {
        case PIX_FMT_BGR24: c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_RGB24: c->chrToYV12 = RENAME(rgb24ToUV); break;
        default: break;
        }
    }

    switch (srcFormat) {
#if !COMPILE_TEMPLATE_MMX2
    case PIX_FMT_YUYV422:
    case PIX_FMT_Y400A:   c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422: c->lumToYV12 = RENAME(uyvyToY); break;
#endif /* !COMPILE_TEMPLATE_MMX2 */
    case PIX_FMT_BGR24:   c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_RGB24:   c->lumToYV12 = RENAME(rgb24ToY); break;
    default: break;
    }
#if !COMPILE_TEMPLATE_MMX2
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_Y400A: c->alpToYV12 = RENAME(yuy2ToY); break;
        default: break;
        }
    }
#endif /* !COMPILE_TEMPLATE_MMX2 */
}