swscale_template.c

  1. /*
  2. * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  3. *
  4. * This file is part of Libav.
  5. *
  6. * Libav is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * Libav is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with Libav; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #undef REAL_MOVNTQ
  21. #undef MOVNTQ
  22. #undef PREFETCH
  23. #if COMPILE_TEMPLATE_MMX2
  24. #define PREFETCH "prefetchnta"
  25. #else
  26. #define PREFETCH " # nop"
  27. #endif
  28. #if COMPILE_TEMPLATE_MMX2
  29. #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
  30. #else
  31. #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
  32. #endif
  33. #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
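/*
 * Note: on MMX2-capable builds the stores below use non-temporal MOVNTQ
 * (the scaler output is not read back here, so bypassing the cache is a win)
 * and PREFETCHNTA hints; on plain MMX both degrade to an ordinary MOVQ and a
 * no-op. The MOVNTQ(a,b) wrapper only exists so its operands are macro-expanded
 * before stringification.
 */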
  34. #define YSCALEYUV2YV12X(offset, dest, end, pos) \
  35. __asm__ volatile(\
  36. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  37. "movq %%mm3, %%mm4 \n\t"\
  38. "lea " offset "(%0), %%"REG_d" \n\t"\
  39. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  40. ".p2align 4 \n\t" /* FIXME Unroll? */\
  41. "1: \n\t"\
  42. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  43. "movq (%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
  44. "movq 8(%%"REG_S", %3, 2), %%mm5 \n\t" /* srcData */\
  45. "add $16, %%"REG_d" \n\t"\
  46. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  47. "test %%"REG_S", %%"REG_S" \n\t"\
  48. "pmulhw %%mm0, %%mm2 \n\t"\
  49. "pmulhw %%mm0, %%mm5 \n\t"\
  50. "paddw %%mm2, %%mm3 \n\t"\
  51. "paddw %%mm5, %%mm4 \n\t"\
  52. " jnz 1b \n\t"\
  53. "psraw $3, %%mm3 \n\t"\
  54. "psraw $3, %%mm4 \n\t"\
  55. "packuswb %%mm4, %%mm3 \n\t"\
  56. MOVNTQ(%%mm3, (%1, %3))\
  57. "add $8, %3 \n\t"\
  58. "cmp %2, %3 \n\t"\
  59. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  60. "movq %%mm3, %%mm4 \n\t"\
  61. "lea " offset "(%0), %%"REG_d" \n\t"\
  62. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  63. "jb 1b \n\t"\
  64. :: "r" (&c->redDither),\
  65. "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
  66. : "%"REG_d, "%"REG_S\
  67. );
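/*
 * YSCALEYUV2YV12X is one row of the vertical scaler: for each output byte it
 * accumulates filterCoeff[j] * srcLine[j][i] over the filter taps (pmulhw keeps
 * the top 16 bits of each product), starts from the VROUNDER_OFFSET rounding
 * bias, shifts down by 3 and packs with unsigned saturation. Illustrative
 * scalar sketch, along the lines of the plain-C reference path:
 *
 *     for (i = 0; i < dstW; i++) {
 *         int val = 1 << 18;                       // rounding bias
 *         for (j = 0; j < lumFilterSize; j++)
 *             val += lumSrc[j][i] * lumFilter[j];
 *         dest[i] = av_clip_uint8(val >> 19);      // pmulhw (>>16) + psraw $3
 *     }
 */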
  68. static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
  69. const int16_t **lumSrc, int lumFilterSize,
  70. const int16_t *chrFilter, const int16_t **chrUSrc,
  71. const int16_t **chrVSrc,
  72. int chrFilterSize, const int16_t **alpSrc,
  73. uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
  74. uint8_t *aDest, int dstW, int chrDstW)
  75. {
  76. if (uDest) {
  77. x86_reg uv_off = c->uv_off;
  78. YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
  79. YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
  80. }
  81. if (CONFIG_SWSCALE_ALPHA && aDest) {
  82. YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
  83. }
  84. YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
  85. }
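/*
 * The _ACCURATE variant below performs the same vertical filtering but keeps
 * 32-bit intermediates: pmaddwd accumulates pairs of coeff*sample products as
 * dwords, which are only narrowed back to words (packssdw), rounded and shifted
 * once the whole filter has been applied, avoiding the per-tap truncation of
 * pmulhw above. It backs the accurate-rounding (_ar) entry points.
 */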
  86. #define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
  87. __asm__ volatile(\
  88. "lea " offset "(%0), %%"REG_d" \n\t"\
  89. "pxor %%mm4, %%mm4 \n\t"\
  90. "pxor %%mm5, %%mm5 \n\t"\
  91. "pxor %%mm6, %%mm6 \n\t"\
  92. "pxor %%mm7, %%mm7 \n\t"\
  93. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  94. ".p2align 4 \n\t"\
  95. "1: \n\t"\
  96. "movq (%%"REG_S", %3, 2), %%mm0 \n\t" /* srcData */\
  97. "movq 8(%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
  98. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  99. "movq (%%"REG_S", %3, 2), %%mm1 \n\t" /* srcData */\
  100. "movq %%mm0, %%mm3 \n\t"\
  101. "punpcklwd %%mm1, %%mm0 \n\t"\
  102. "punpckhwd %%mm1, %%mm3 \n\t"\
  103. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
  104. "pmaddwd %%mm1, %%mm0 \n\t"\
  105. "pmaddwd %%mm1, %%mm3 \n\t"\
  106. "paddd %%mm0, %%mm4 \n\t"\
  107. "paddd %%mm3, %%mm5 \n\t"\
  108. "movq 8(%%"REG_S", %3, 2), %%mm3 \n\t" /* srcData */\
  109. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  110. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  111. "test %%"REG_S", %%"REG_S" \n\t"\
  112. "movq %%mm2, %%mm0 \n\t"\
  113. "punpcklwd %%mm3, %%mm2 \n\t"\
  114. "punpckhwd %%mm3, %%mm0 \n\t"\
  115. "pmaddwd %%mm1, %%mm2 \n\t"\
  116. "pmaddwd %%mm1, %%mm0 \n\t"\
  117. "paddd %%mm2, %%mm6 \n\t"\
  118. "paddd %%mm0, %%mm7 \n\t"\
  119. " jnz 1b \n\t"\
  120. "psrad $16, %%mm4 \n\t"\
  121. "psrad $16, %%mm5 \n\t"\
  122. "psrad $16, %%mm6 \n\t"\
  123. "psrad $16, %%mm7 \n\t"\
  124. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  125. "packssdw %%mm5, %%mm4 \n\t"\
  126. "packssdw %%mm7, %%mm6 \n\t"\
  127. "paddw %%mm0, %%mm4 \n\t"\
  128. "paddw %%mm0, %%mm6 \n\t"\
  129. "psraw $3, %%mm4 \n\t"\
  130. "psraw $3, %%mm6 \n\t"\
  131. "packuswb %%mm6, %%mm4 \n\t"\
  132. MOVNTQ(%%mm4, (%1, %3))\
  133. "add $8, %3 \n\t"\
  134. "cmp %2, %3 \n\t"\
  135. "lea " offset "(%0), %%"REG_d" \n\t"\
  136. "pxor %%mm4, %%mm4 \n\t"\
  137. "pxor %%mm5, %%mm5 \n\t"\
  138. "pxor %%mm6, %%mm6 \n\t"\
  139. "pxor %%mm7, %%mm7 \n\t"\
  140. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  141. "jb 1b \n\t"\
  142. :: "r" (&c->redDither),\
  143. "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
  144. : "%"REG_a, "%"REG_d, "%"REG_S\
  145. );
  146. static inline void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
  147. const int16_t **lumSrc, int lumFilterSize,
  148. const int16_t *chrFilter, const int16_t **chrUSrc,
  149. const int16_t **chrVSrc,
  150. int chrFilterSize, const int16_t **alpSrc,
  151. uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
  152. uint8_t *aDest, int dstW, int chrDstW)
  153. {
  154. if (uDest) {
  155. x86_reg uv_off = c->uv_off;
  156. YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
  157. YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
  158. }
  159. if (CONFIG_SWSCALE_ALPHA && aDest) {
  160. YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
  161. }
  162. YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
  163. }
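/*
 * YSCALEYUV2YV121 is the 1:1 (unscaled) vertical path: each 16-bit intermediate
 * sample is simply shifted back down to 8 bits and packed with saturation.
 * Illustrative scalar equivalent:
 *
 *     for (i = 0; i < dstW; i++)
 *         dest[i] = av_clip_uint8(src[i] >> 7);
 *
 * The _ACCURATE version further down adds a bias of 64 (built with
 * pcmpeqw/psrlw/psllw) before the shift, so the result is rounded to nearest
 * rather than truncated.
 */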
  164. #define YSCALEYUV2YV121 \
  165. "mov %2, %%"REG_a" \n\t"\
  166. ".p2align 4 \n\t" /* FIXME Unroll? */\
  167. "1: \n\t"\
  168. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
  169. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
  170. "psraw $7, %%mm0 \n\t"\
  171. "psraw $7, %%mm1 \n\t"\
  172. "packuswb %%mm1, %%mm0 \n\t"\
  173. MOVNTQ(%%mm0, (%1, %%REGa))\
  174. "add $8, %%"REG_a" \n\t"\
  175. "jnc 1b \n\t"
  176. static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
  177. const int16_t *chrUSrc, const int16_t *chrVSrc,
  178. const int16_t *alpSrc,
  179. uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
  180. uint8_t *aDest, int dstW, int chrDstW)
  181. {
  182. int p= 4;
  183. const uint8_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
  184. uint8_t *dst[4]= { aDest, dest, uDest, vDest };
  185. x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
  186. while (p--) {
  187. if (dst[p]) {
  188. __asm__ volatile(
  189. YSCALEYUV2YV121
  190. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  191. "g" (-counter[p])
  192. : "%"REG_a
  193. );
  194. }
  195. }
  196. }
  197. #define YSCALEYUV2YV121_ACCURATE \
  198. "mov %2, %%"REG_a" \n\t"\
  199. "pcmpeqw %%mm7, %%mm7 \n\t"\
  200. "psrlw $15, %%mm7 \n\t"\
  201. "psllw $6, %%mm7 \n\t"\
  202. ".p2align 4 \n\t" /* FIXME Unroll? */\
  203. "1: \n\t"\
  204. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
  205. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
  206. "paddsw %%mm7, %%mm0 \n\t"\
  207. "paddsw %%mm7, %%mm1 \n\t"\
  208. "psraw $7, %%mm0 \n\t"\
  209. "psraw $7, %%mm1 \n\t"\
  210. "packuswb %%mm1, %%mm0 \n\t"\
  211. MOVNTQ(%%mm0, (%1, %%REGa))\
  212. "add $8, %%"REG_a" \n\t"\
  213. "jnc 1b \n\t"
  214. static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
  215. const int16_t *chrUSrc, const int16_t *chrVSrc,
  216. const int16_t *alpSrc,
  217. uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
  218. uint8_t *aDest, int dstW, int chrDstW)
  219. {
  220. int p= 4;
  221. const uint8_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
  222. uint8_t *dst[4]= { aDest, dest, uDest, vDest };
  223. x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
  224. while (p--) {
  225. if (dst[p]) {
  226. __asm__ volatile(
  227. YSCALEYUV2YV121_ACCURATE
  228. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  229. "g" (-counter[p])
  230. : "%"REG_a
  231. );
  232. }
  233. }
  234. }
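/*
 * The YSCALEYUV2PACKEDX* macros below run the vertical filter for the packed
 * output formats (RGB, YUY2). The _UV part walks the chroma filter and
 * accumulates U into %%mm3 and V into %%mm4; the V samples of a line sit
 * uv_off bytes (operand %6) past its U samples, so the source pointer is
 * bumped by %6 between the two loads. The _YA part does the same for two
 * groups of four luma (or alpha) samples, leaving them in the given
 * destination registers.
 */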
  235. #define YSCALEYUV2PACKEDX_UV \
  236. __asm__ volatile(\
  237. "xor %%"REG_a", %%"REG_a" \n\t"\
  238. ".p2align 4 \n\t"\
  239. "nop \n\t"\
  240. "1: \n\t"\
  241. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  242. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  243. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  244. "movq %%mm3, %%mm4 \n\t"\
  245. ".p2align 4 \n\t"\
  246. "2: \n\t"\
  247. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  248. "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
  249. "add %6, %%"REG_S" \n\t" \
  250. "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
  251. "add $16, %%"REG_d" \n\t"\
  252. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  253. "pmulhw %%mm0, %%mm2 \n\t"\
  254. "pmulhw %%mm0, %%mm5 \n\t"\
  255. "paddw %%mm2, %%mm3 \n\t"\
  256. "paddw %%mm5, %%mm4 \n\t"\
  257. "test %%"REG_S", %%"REG_S" \n\t"\
  258. " jnz 2b \n\t"\
  259. #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
  260. "lea "offset"(%0), %%"REG_d" \n\t"\
  261. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  262. "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
  263. "movq "#dst1", "#dst2" \n\t"\
  264. ".p2align 4 \n\t"\
  265. "2: \n\t"\
  266. "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
  267. "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
  268. "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
  269. "add $16, %%"REG_d" \n\t"\
  270. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  271. "pmulhw "#coeff", "#src1" \n\t"\
  272. "pmulhw "#coeff", "#src2" \n\t"\
  273. "paddw "#src1", "#dst1" \n\t"\
  274. "paddw "#src2", "#dst2" \n\t"\
  275. "test %%"REG_S", %%"REG_S" \n\t"\
  276. " jnz 2b \n\t"\
  277. #define YSCALEYUV2PACKEDX \
  278. YSCALEYUV2PACKEDX_UV \
  279. YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
  280. #define YSCALEYUV2PACKEDX_END \
  281. :: "r" (&c->redDither), \
  282. "m" (dummy), "m" (dummy), "m" (dummy),\
  283. "r" (dest), "m" (dstW_reg), "m"(uv_off) \
  284. : "%"REG_a, "%"REG_d, "%"REG_S \
  285. );
  286. #define YSCALEYUV2PACKEDX_ACCURATE_UV \
  287. __asm__ volatile(\
  288. "xor %%"REG_a", %%"REG_a" \n\t"\
  289. ".p2align 4 \n\t"\
  290. "nop \n\t"\
  291. "1: \n\t"\
  292. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  293. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  294. "pxor %%mm4, %%mm4 \n\t"\
  295. "pxor %%mm5, %%mm5 \n\t"\
  296. "pxor %%mm6, %%mm6 \n\t"\
  297. "pxor %%mm7, %%mm7 \n\t"\
  298. ".p2align 4 \n\t"\
  299. "2: \n\t"\
  300. "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
  301. "add %6, %%"REG_S" \n\t" \
  302. "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
  303. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  304. "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
  305. "movq %%mm0, %%mm3 \n\t"\
  306. "punpcklwd %%mm1, %%mm0 \n\t"\
  307. "punpckhwd %%mm1, %%mm3 \n\t"\
  308. "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
  309. "pmaddwd %%mm1, %%mm0 \n\t"\
  310. "pmaddwd %%mm1, %%mm3 \n\t"\
  311. "paddd %%mm0, %%mm4 \n\t"\
  312. "paddd %%mm3, %%mm5 \n\t"\
  313. "add %6, %%"REG_S" \n\t" \
  314. "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
  315. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  316. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  317. "test %%"REG_S", %%"REG_S" \n\t"\
  318. "movq %%mm2, %%mm0 \n\t"\
  319. "punpcklwd %%mm3, %%mm2 \n\t"\
  320. "punpckhwd %%mm3, %%mm0 \n\t"\
  321. "pmaddwd %%mm1, %%mm2 \n\t"\
  322. "pmaddwd %%mm1, %%mm0 \n\t"\
  323. "paddd %%mm2, %%mm6 \n\t"\
  324. "paddd %%mm0, %%mm7 \n\t"\
  325. " jnz 2b \n\t"\
  326. "psrad $16, %%mm4 \n\t"\
  327. "psrad $16, %%mm5 \n\t"\
  328. "psrad $16, %%mm6 \n\t"\
  329. "psrad $16, %%mm7 \n\t"\
  330. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  331. "packssdw %%mm5, %%mm4 \n\t"\
  332. "packssdw %%mm7, %%mm6 \n\t"\
  333. "paddw %%mm0, %%mm4 \n\t"\
  334. "paddw %%mm0, %%mm6 \n\t"\
  335. "movq %%mm4, "U_TEMP"(%0) \n\t"\
  336. "movq %%mm6, "V_TEMP"(%0) \n\t"\
  337. #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
  338. "lea "offset"(%0), %%"REG_d" \n\t"\
  339. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  340. "pxor %%mm1, %%mm1 \n\t"\
  341. "pxor %%mm5, %%mm5 \n\t"\
  342. "pxor %%mm7, %%mm7 \n\t"\
  343. "pxor %%mm6, %%mm6 \n\t"\
  344. ".p2align 4 \n\t"\
  345. "2: \n\t"\
  346. "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
  347. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
  348. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  349. "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
  350. "movq %%mm0, %%mm3 \n\t"\
  351. "punpcklwd %%mm4, %%mm0 \n\t"\
  352. "punpckhwd %%mm4, %%mm3 \n\t"\
  353. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
  354. "pmaddwd %%mm4, %%mm0 \n\t"\
  355. "pmaddwd %%mm4, %%mm3 \n\t"\
  356. "paddd %%mm0, %%mm1 \n\t"\
  357. "paddd %%mm3, %%mm5 \n\t"\
  358. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
  359. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  360. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  361. "test %%"REG_S", %%"REG_S" \n\t"\
  362. "movq %%mm2, %%mm0 \n\t"\
  363. "punpcklwd %%mm3, %%mm2 \n\t"\
  364. "punpckhwd %%mm3, %%mm0 \n\t"\
  365. "pmaddwd %%mm4, %%mm2 \n\t"\
  366. "pmaddwd %%mm4, %%mm0 \n\t"\
  367. "paddd %%mm2, %%mm7 \n\t"\
  368. "paddd %%mm0, %%mm6 \n\t"\
  369. " jnz 2b \n\t"\
  370. "psrad $16, %%mm1 \n\t"\
  371. "psrad $16, %%mm5 \n\t"\
  372. "psrad $16, %%mm7 \n\t"\
  373. "psrad $16, %%mm6 \n\t"\
  374. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  375. "packssdw %%mm5, %%mm1 \n\t"\
  376. "packssdw %%mm6, %%mm7 \n\t"\
  377. "paddw %%mm0, %%mm1 \n\t"\
  378. "paddw %%mm0, %%mm7 \n\t"\
  379. "movq "U_TEMP"(%0), %%mm3 \n\t"\
  380. "movq "V_TEMP"(%0), %%mm4 \n\t"\
  381. #define YSCALEYUV2PACKEDX_ACCURATE \
  382. YSCALEYUV2PACKEDX_ACCURATE_UV \
  383. YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
  384. #define YSCALEYUV2RGBX \
  385. "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
  386. "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
  387. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  388. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  389. "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
  390. "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
  391. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  392. "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
  393. "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
  394. "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
  395. "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
  396. "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
  397. "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
  398. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  399. "paddw %%mm3, %%mm4 \n\t"\
  400. "movq %%mm2, %%mm0 \n\t"\
  401. "movq %%mm5, %%mm6 \n\t"\
  402. "movq %%mm4, %%mm3 \n\t"\
  403. "punpcklwd %%mm2, %%mm2 \n\t"\
  404. "punpcklwd %%mm5, %%mm5 \n\t"\
  405. "punpcklwd %%mm4, %%mm4 \n\t"\
  406. "paddw %%mm1, %%mm2 \n\t"\
  407. "paddw %%mm1, %%mm5 \n\t"\
  408. "paddw %%mm1, %%mm4 \n\t"\
  409. "punpckhwd %%mm0, %%mm0 \n\t"\
  410. "punpckhwd %%mm6, %%mm6 \n\t"\
  411. "punpckhwd %%mm3, %%mm3 \n\t"\
  412. "paddw %%mm7, %%mm0 \n\t"\
  413. "paddw %%mm7, %%mm6 \n\t"\
  414. "paddw %%mm7, %%mm3 \n\t"\
  415. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  416. "packuswb %%mm0, %%mm2 \n\t"\
  417. "packuswb %%mm6, %%mm5 \n\t"\
  418. "packuswb %%mm3, %%mm4 \n\t"\
  419. #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
  420. "movq "#b", "#q2" \n\t" /* B */\
  421. "movq "#r", "#t" \n\t" /* R */\
  422. "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
  423. "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
  424. "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
  425. "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
  426. "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
  427. "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
  428. "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
  429. "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
  430. "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
  431. "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
  432. \
  433. MOVNTQ( q0, (dst, index, 4))\
  434. MOVNTQ( b, 8(dst, index, 4))\
  435. MOVNTQ( q2, 16(dst, index, 4))\
  436. MOVNTQ( q3, 24(dst, index, 4))\
  437. \
  438. "add $8, "#index" \n\t"\
  439. "cmp "#dstw", "#index" \n\t"\
  440. " jb 1b \n\t"
  441. #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
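/*
 * YSCALEYUV2RGBX applies the fixed-point YUV->RGB matrix (coefficients and
 * offsets live in the SwsContext at the *_COEFF / *_OFFSET slots), roughly:
 *
 *     Y' = (Y - y_offset) * y_coeff
 *     B  = Y' + (U - 128) * ub_coeff
 *     G  = Y' + (U - 128) * ug_coeff + (V - 128) * vg_coeff
 *     R  = Y' + (V - 128) * vr_coeff
 *
 * (all products via pmulhw, i.e. >>16; illustrative formulas, not the exact
 * fixed-point scaling). WRITEBGR32 then interleaves the saturated B, G, R and
 * A bytes into 32-bit pixels and stores 32 bytes per loop iteration.
 */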
  442. static inline void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
  443. const int16_t **lumSrc, int lumFilterSize,
  444. const int16_t *chrFilter, const int16_t **chrUSrc,
  445. const int16_t **chrVSrc,
  446. int chrFilterSize, const int16_t **alpSrc,
  447. uint8_t *dest, int dstW, int dstY)
  448. {
  449. x86_reg dummy=0;
  450. x86_reg dstW_reg = dstW;
  451. x86_reg uv_off = c->uv_off << 1;
  452. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  453. YSCALEYUV2PACKEDX_ACCURATE
  454. YSCALEYUV2RGBX
  455. "movq %%mm2, "U_TEMP"(%0) \n\t"
  456. "movq %%mm4, "V_TEMP"(%0) \n\t"
  457. "movq %%mm5, "Y_TEMP"(%0) \n\t"
  458. YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
  459. "movq "Y_TEMP"(%0), %%mm5 \n\t"
  460. "psraw $3, %%mm1 \n\t"
  461. "psraw $3, %%mm7 \n\t"
  462. "packuswb %%mm7, %%mm1 \n\t"
  463. WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
  464. YSCALEYUV2PACKEDX_END
  465. } else {
  466. YSCALEYUV2PACKEDX_ACCURATE
  467. YSCALEYUV2RGBX
  468. "pcmpeqd %%mm7, %%mm7 \n\t"
  469. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  470. YSCALEYUV2PACKEDX_END
  471. }
  472. }
  473. static inline void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
  474. const int16_t **lumSrc, int lumFilterSize,
  475. const int16_t *chrFilter, const int16_t **chrUSrc,
  476. const int16_t **chrVSrc,
  477. int chrFilterSize, const int16_t **alpSrc,
  478. uint8_t *dest, int dstW, int dstY)
  479. {
  480. x86_reg dummy=0;
  481. x86_reg dstW_reg = dstW;
  482. x86_reg uv_off = c->uv_off << 1;
  483. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  484. YSCALEYUV2PACKEDX
  485. YSCALEYUV2RGBX
  486. YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
  487. "psraw $3, %%mm1 \n\t"
  488. "psraw $3, %%mm7 \n\t"
  489. "packuswb %%mm7, %%mm1 \n\t"
  490. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  491. YSCALEYUV2PACKEDX_END
  492. } else {
  493. YSCALEYUV2PACKEDX
  494. YSCALEYUV2RGBX
  495. "pcmpeqd %%mm7, %%mm7 \n\t"
  496. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  497. YSCALEYUV2PACKEDX_END
  498. }
  499. }
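/*
 * WRITERGB16 (and WRITERGB15 further down) packs the saturated 8-bit B, G, R
 * registers into 16-bit 5-6-5 (respectively 5-5-5) pixels: each channel is
 * masked down to its top 5 or 6 bits, shifted into position and OR-ed
 * together, two quadwords (8 pixels) per iteration. The callers optionally
 * add a per-channel ordered-dither bias (DITHER1XBPP) before invoking the
 * writer.
 */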
  500. #define REAL_WRITERGB16(dst, dstw, index) \
  501. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  502. "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
  503. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  504. "psrlq $3, %%mm2 \n\t"\
  505. \
  506. "movq %%mm2, %%mm1 \n\t"\
  507. "movq %%mm4, %%mm3 \n\t"\
  508. \
  509. "punpcklbw %%mm7, %%mm3 \n\t"\
  510. "punpcklbw %%mm5, %%mm2 \n\t"\
  511. "punpckhbw %%mm7, %%mm4 \n\t"\
  512. "punpckhbw %%mm5, %%mm1 \n\t"\
  513. \
  514. "psllq $3, %%mm3 \n\t"\
  515. "psllq $3, %%mm4 \n\t"\
  516. \
  517. "por %%mm3, %%mm2 \n\t"\
  518. "por %%mm4, %%mm1 \n\t"\
  519. \
  520. MOVNTQ(%%mm2, (dst, index, 2))\
  521. MOVNTQ(%%mm1, 8(dst, index, 2))\
  522. \
  523. "add $8, "#index" \n\t"\
  524. "cmp "#dstw", "#index" \n\t"\
  525. " jb 1b \n\t"
  526. #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
  527. static inline void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
  528. const int16_t **lumSrc, int lumFilterSize,
  529. const int16_t *chrFilter, const int16_t **chrUSrc,
  530. const int16_t **chrVSrc,
  531. int chrFilterSize, const int16_t **alpSrc,
  532. uint8_t *dest, int dstW, int dstY)
  533. {
  534. x86_reg dummy=0;
  535. x86_reg dstW_reg = dstW;
  536. x86_reg uv_off = c->uv_off << 1;
  537. YSCALEYUV2PACKEDX_ACCURATE
  538. YSCALEYUV2RGBX
  539. "pxor %%mm7, %%mm7 \n\t"
  540. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  541. #ifdef DITHER1XBPP
  542. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  543. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  544. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  545. #endif
  546. WRITERGB16(%4, %5, %%REGa)
  547. YSCALEYUV2PACKEDX_END
  548. }
  549. static inline void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
  550. const int16_t **lumSrc, int lumFilterSize,
  551. const int16_t *chrFilter, const int16_t **chrUSrc,
  552. const int16_t **chrVSrc,
  553. int chrFilterSize, const int16_t **alpSrc,
  554. uint8_t *dest, int dstW, int dstY)
  555. {
  556. x86_reg dummy=0;
  557. x86_reg dstW_reg = dstW;
  558. x86_reg uv_off = c->uv_off << 1;
  559. YSCALEYUV2PACKEDX
  560. YSCALEYUV2RGBX
  561. "pxor %%mm7, %%mm7 \n\t"
  562. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  563. #ifdef DITHER1XBPP
  564. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  565. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  566. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  567. #endif
  568. WRITERGB16(%4, %5, %%REGa)
  569. YSCALEYUV2PACKEDX_END
  570. }
  571. #define REAL_WRITERGB15(dst, dstw, index) \
  572. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  573. "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
  574. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  575. "psrlq $3, %%mm2 \n\t"\
  576. "psrlq $1, %%mm5 \n\t"\
  577. \
  578. "movq %%mm2, %%mm1 \n\t"\
  579. "movq %%mm4, %%mm3 \n\t"\
  580. \
  581. "punpcklbw %%mm7, %%mm3 \n\t"\
  582. "punpcklbw %%mm5, %%mm2 \n\t"\
  583. "punpckhbw %%mm7, %%mm4 \n\t"\
  584. "punpckhbw %%mm5, %%mm1 \n\t"\
  585. \
  586. "psllq $2, %%mm3 \n\t"\
  587. "psllq $2, %%mm4 \n\t"\
  588. \
  589. "por %%mm3, %%mm2 \n\t"\
  590. "por %%mm4, %%mm1 \n\t"\
  591. \
  592. MOVNTQ(%%mm2, (dst, index, 2))\
  593. MOVNTQ(%%mm1, 8(dst, index, 2))\
  594. \
  595. "add $8, "#index" \n\t"\
  596. "cmp "#dstw", "#index" \n\t"\
  597. " jb 1b \n\t"
  598. #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
  599. static inline void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
  600. const int16_t **lumSrc, int lumFilterSize,
  601. const int16_t *chrFilter, const int16_t **chrUSrc,
  602. const int16_t **chrVSrc,
  603. int chrFilterSize, const int16_t **alpSrc,
  604. uint8_t *dest, int dstW, int dstY)
  605. {
  606. x86_reg dummy=0;
  607. x86_reg dstW_reg = dstW;
  608. x86_reg uv_off = c->uv_off << 1;
  609. YSCALEYUV2PACKEDX_ACCURATE
  610. YSCALEYUV2RGBX
  611. "pxor %%mm7, %%mm7 \n\t"
  612. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  613. #ifdef DITHER1XBPP
  614. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  615. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  616. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  617. #endif
  618. WRITERGB15(%4, %5, %%REGa)
  619. YSCALEYUV2PACKEDX_END
  620. }
  621. static inline void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
  622. const int16_t **lumSrc, int lumFilterSize,
  623. const int16_t *chrFilter, const int16_t **chrUSrc,
  624. const int16_t **chrVSrc,
  625. int chrFilterSize, const int16_t **alpSrc,
  626. uint8_t *dest, int dstW, int dstY)
  627. {
  628. x86_reg dummy=0;
  629. x86_reg dstW_reg = dstW;
  630. x86_reg uv_off = c->uv_off << 1;
  631. YSCALEYUV2PACKEDX
  632. YSCALEYUV2RGBX
  633. "pxor %%mm7, %%mm7 \n\t"
  634. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  635. #ifdef DITHER1XBPP
  636. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  637. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  638. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  639. #endif
  640. WRITERGB15(%4, %5, %%REGa)
  641. YSCALEYUV2PACKEDX_END
  642. }
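/*
 * Two 24-bit writers follow: WRITEBGR24MMX assembles the packed R8G8B8 stream
 * with shift/OR juggling of quadwords, while WRITEBGR24MMX2 uses pshufw plus
 * the ff_M24A/B/C masks to gather the same 24 output bytes with fewer
 * instructions; the COMPILE_TEMPLATE_MMX2 switch below picks one of them as
 * WRITEBGR24.
 */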
  643. #define WRITEBGR24MMX(dst, dstw, index) \
  644. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  645. "movq %%mm2, %%mm1 \n\t" /* B */\
  646. "movq %%mm5, %%mm6 \n\t" /* R */\
  647. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  648. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  649. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  650. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  651. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  652. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  653. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  654. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  655. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  656. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  657. \
  658. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  659. "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
  660. "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
  661. "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
  662. \
  663. "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
  664. "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
  665. "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
  666. "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
  667. \
  668. "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
  669. "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
  670. "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
  671. "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
  672. \
  673. "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
  674. "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
  675. "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
  676. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  677. MOVNTQ(%%mm0, (dst))\
  678. \
  679. "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
  680. "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
  681. "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
  682. "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
  683. MOVNTQ(%%mm6, 8(dst))\
  684. \
  685. "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
  686. "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
  687. "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
  688. MOVNTQ(%%mm5, 16(dst))\
  689. \
  690. "add $24, "#dst" \n\t"\
  691. \
  692. "add $8, "#index" \n\t"\
  693. "cmp "#dstw", "#index" \n\t"\
  694. " jb 1b \n\t"
  695. #define WRITEBGR24MMX2(dst, dstw, index) \
  696. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  697. "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
  698. "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
  699. "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
  700. "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
  701. "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
  702. \
  703. "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
  704. "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
  705. "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
  706. \
  707. "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
  708. "por %%mm1, %%mm6 \n\t"\
  709. "por %%mm3, %%mm6 \n\t"\
  710. MOVNTQ(%%mm6, (dst))\
  711. \
  712. "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
  713. "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
  714. "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
  715. "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
  716. \
  717. "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
  718. "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
  719. "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
  720. \
  721. "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
  722. "por %%mm3, %%mm6 \n\t"\
  723. MOVNTQ(%%mm6, 8(dst))\
  724. \
  725. "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
  726. "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
  727. "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
  728. \
  729. "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
  730. "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
  731. "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
  732. \
  733. "por %%mm1, %%mm3 \n\t"\
  734. "por %%mm3, %%mm6 \n\t"\
  735. MOVNTQ(%%mm6, 16(dst))\
  736. \
  737. "add $24, "#dst" \n\t"\
  738. \
  739. "add $8, "#index" \n\t"\
  740. "cmp "#dstw", "#index" \n\t"\
  741. " jb 1b \n\t"
  742. #if COMPILE_TEMPLATE_MMX2
  743. #undef WRITEBGR24
  744. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
  745. #else
  746. #undef WRITEBGR24
  747. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
  748. #endif
  749. static inline void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
  750. const int16_t **lumSrc, int lumFilterSize,
  751. const int16_t *chrFilter, const int16_t **chrUSrc,
  752. const int16_t **chrVSrc,
  753. int chrFilterSize, const int16_t **alpSrc,
  754. uint8_t *dest, int dstW, int dstY)
  755. {
  756. x86_reg dummy=0;
  757. x86_reg dstW_reg = dstW;
  758. x86_reg uv_off = c->uv_off << 1;
  759. YSCALEYUV2PACKEDX_ACCURATE
  760. YSCALEYUV2RGBX
  761. "pxor %%mm7, %%mm7 \n\t"
  762. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
  763. "add %4, %%"REG_c" \n\t"
  764. WRITEBGR24(%%REGc, %5, %%REGa)
  765. :: "r" (&c->redDither),
  766. "m" (dummy), "m" (dummy), "m" (dummy),
  767. "r" (dest), "m" (dstW_reg), "m"(uv_off)
  768. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  769. );
  770. }
  771. static inline void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
  772. const int16_t **lumSrc, int lumFilterSize,
  773. const int16_t *chrFilter, const int16_t **chrUSrc,
  774. const int16_t **chrVSrc,
  775. int chrFilterSize, const int16_t **alpSrc,
  776. uint8_t *dest, int dstW, int dstY)
  777. {
  778. x86_reg dummy=0;
  779. x86_reg dstW_reg = dstW;
  780. x86_reg uv_off = c->uv_off << 1;
  781. YSCALEYUV2PACKEDX
  782. YSCALEYUV2RGBX
  783. "pxor %%mm7, %%mm7 \n\t"
  784. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
  785. "add %4, %%"REG_c" \n\t"
  786. WRITEBGR24(%%REGc, %5, %%REGa)
  787. :: "r" (&c->redDither),
  788. "m" (dummy), "m" (dummy), "m" (dummy),
  789. "r" (dest), "m" (dstW_reg), "m"(uv_off)
  790. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  791. );
  792. }
  793. #define REAL_WRITEYUY2(dst, dstw, index) \
  794. "packuswb %%mm3, %%mm3 \n\t"\
  795. "packuswb %%mm4, %%mm4 \n\t"\
  796. "packuswb %%mm7, %%mm1 \n\t"\
  797. "punpcklbw %%mm4, %%mm3 \n\t"\
  798. "movq %%mm1, %%mm7 \n\t"\
  799. "punpcklbw %%mm3, %%mm1 \n\t"\
  800. "punpckhbw %%mm3, %%mm7 \n\t"\
  801. \
  802. MOVNTQ(%%mm1, (dst, index, 2))\
  803. MOVNTQ(%%mm7, 8(dst, index, 2))\
  804. \
  805. "add $8, "#index" \n\t"\
  806. "cmp "#dstw", "#index" \n\t"\
  807. " jb 1b \n\t"
  808. #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
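/*
 * REAL_WRITEYUY2 above emits YUYV (YUY2) ordering, i.e. Y0 U0 Y1 V0 Y2 U1 ...:
 * %%mm3/%%mm4 carry U/V and %%mm1/%%mm7 the two luma groups; punpcklbw first
 * merges U with V, then the final punpckl/hbw pair interleaves luma with
 * chroma before the 16-byte store.
 */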
  809. static inline void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
  810. const int16_t **lumSrc, int lumFilterSize,
  811. const int16_t *chrFilter, const int16_t **chrUSrc,
  812. const int16_t **chrVSrc,
  813. int chrFilterSize, const int16_t **alpSrc,
  814. uint8_t *dest, int dstW, int dstY)
  815. {
  816. x86_reg dummy=0;
  817. x86_reg dstW_reg = dstW;
  818. x86_reg uv_off = c->uv_off << 1;
  819. YSCALEYUV2PACKEDX_ACCURATE
  820. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  821. "psraw $3, %%mm3 \n\t"
  822. "psraw $3, %%mm4 \n\t"
  823. "psraw $3, %%mm1 \n\t"
  824. "psraw $3, %%mm7 \n\t"
  825. WRITEYUY2(%4, %5, %%REGa)
  826. YSCALEYUV2PACKEDX_END
  827. }
  828. static inline void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
  829. const int16_t **lumSrc, int lumFilterSize,
  830. const int16_t *chrFilter, const int16_t **chrUSrc,
  831. const int16_t **chrVSrc,
  832. int chrFilterSize, const int16_t **alpSrc,
  833. uint8_t *dest, int dstW, int dstY)
  834. {
  835. x86_reg dummy=0;
  836. x86_reg dstW_reg = dstW;
  837. x86_reg uv_off = c->uv_off << 1;
  838. YSCALEYUV2PACKEDX
  839. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  840. "psraw $3, %%mm3 \n\t"
  841. "psraw $3, %%mm4 \n\t"
  842. "psraw $3, %%mm1 \n\t"
  843. "psraw $3, %%mm7 \n\t"
  844. WRITEYUY2(%4, %5, %%REGa)
  845. YSCALEYUV2PACKEDX_END
  846. }
  847. #define REAL_YSCALEYUV2RGB_UV(index, c) \
  848. "xor "#index", "#index" \n\t"\
  849. ".p2align 4 \n\t"\
  850. "1: \n\t"\
  851. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  852. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  853. "add "UV_OFFx2"("#c"), "#index" \n\t" \
  854. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  855. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  856. "sub "UV_OFFx2"("#c"), "#index" \n\t" \
  857. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  858. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  859. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  860. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  861. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  862. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  863. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  864. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  865. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  866. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  867. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  868. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  869. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  870. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  871. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  872. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  873. #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
  874. "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  875. "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  876. "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  877. "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  878. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  879. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  880. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  881. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  882. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  883. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  884. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  885. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  886. #define REAL_YSCALEYUV2RGB_COEFF(c) \
  887. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  888. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  889. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  890. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  891. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  892. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  893. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  894. "paddw %%mm3, %%mm4 \n\t"\
  895. "movq %%mm2, %%mm0 \n\t"\
  896. "movq %%mm5, %%mm6 \n\t"\
  897. "movq %%mm4, %%mm3 \n\t"\
  898. "punpcklwd %%mm2, %%mm2 \n\t"\
  899. "punpcklwd %%mm5, %%mm5 \n\t"\
  900. "punpcklwd %%mm4, %%mm4 \n\t"\
  901. "paddw %%mm1, %%mm2 \n\t"\
  902. "paddw %%mm1, %%mm5 \n\t"\
  903. "paddw %%mm1, %%mm4 \n\t"\
  904. "punpckhwd %%mm0, %%mm0 \n\t"\
  905. "punpckhwd %%mm6, %%mm6 \n\t"\
  906. "punpckhwd %%mm3, %%mm3 \n\t"\
  907. "paddw %%mm7, %%mm0 \n\t"\
  908. "paddw %%mm7, %%mm6 \n\t"\
  909. "paddw %%mm7, %%mm3 \n\t"\
  910. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  911. "packuswb %%mm0, %%mm2 \n\t"\
  912. "packuswb %%mm6, %%mm5 \n\t"\
  913. "packuswb %%mm3, %%mm4 \n\t"\
  914. #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
  915. #define YSCALEYUV2RGB(index, c) \
  916. REAL_YSCALEYUV2RGB_UV(index, c) \
  917. REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
  918. REAL_YSCALEYUV2RGB_COEFF(c)
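/*
 * The *_2 functions below blend two adjacent source rows (buf0/buf1 for luma,
 * uvbuf0/uvbuf1 for chroma) according to the current vertical phase before the
 * RGB conversion. Illustrative scalar form of the _UV / _YA interpolation as
 * read off the assembly:
 *
 *     Y = (buf1[i]   >> 4) + (((buf0[i]   - buf1[i])   * yalpha1)  >> 16)
 *     U = (uvbuf1[i] >> 4) + (((uvbuf0[i] - uvbuf1[i]) * uvalpha1) >> 16)
 *
 * i.e. a fixed-point buf0*alpha + buf1*(1-alpha), kept at reduced precision
 * for the following matrix multiply.
 */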
  919. /**
  920. * vertical bilinear scale YV12 to RGB
  921. */
  922. static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0,
  923. const uint16_t *buf1, const uint16_t *ubuf0,
  924. const uint16_t *ubuf1, const uint16_t *vbuf0,
  925. const uint16_t *vbuf1, const uint16_t *abuf0,
  926. const uint16_t *abuf1, uint8_t *dest,
  927. int dstW, int yalpha, int uvalpha, int y)
  928. {
  929. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  930. #if ARCH_X86_64
  931. __asm__ volatile(
  932. YSCALEYUV2RGB(%%r8, %5)
  933. YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
  934. "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  935. "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  936. "packuswb %%mm7, %%mm1 \n\t"
  937. WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  938. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
  939. "a" (&c->redDither),
  940. "r" (abuf0), "r" (abuf1)
  941. : "%r8"
  942. );
  943. #else
  944. *(const uint16_t **)(&c->u_temp)=abuf0;
  945. *(const uint16_t **)(&c->v_temp)=abuf1;
  946. __asm__ volatile(
  947. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  948. "mov %4, %%"REG_b" \n\t"
  949. "push %%"REG_BP" \n\t"
  950. YSCALEYUV2RGB(%%REGBP, %5)
  951. "push %0 \n\t"
  952. "push %1 \n\t"
  953. "mov "U_TEMP"(%5), %0 \n\t"
  954. "mov "V_TEMP"(%5), %1 \n\t"
  955. YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
  956. "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  957. "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  958. "packuswb %%mm7, %%mm1 \n\t"
  959. "pop %1 \n\t"
  960. "pop %0 \n\t"
  961. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  962. "pop %%"REG_BP" \n\t"
  963. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  964. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  965. "a" (&c->redDither)
  966. );
  967. #endif
  968. } else {
  969. __asm__ volatile(
  970. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  971. "mov %4, %%"REG_b" \n\t"
  972. "push %%"REG_BP" \n\t"
  973. YSCALEYUV2RGB(%%REGBP, %5)
  974. "pcmpeqd %%mm7, %%mm7 \n\t"
  975. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  976. "pop %%"REG_BP" \n\t"
  977. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  978. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  979. "a" (&c->redDither)
  980. );
  981. }
  982. }
  983. static inline void RENAME(yuv2bgr24_2)(SwsContext *c, const uint16_t *buf0,
  984. const uint16_t *buf1, const uint16_t *ubuf0,
  985. const uint16_t *ubuf1, const uint16_t *vbuf0,
  986. const uint16_t *vbuf1, const uint16_t *abuf0,
  987. const uint16_t *abuf1, uint8_t *dest,
  988. int dstW, int yalpha, int uvalpha, int y)
  989. {
  990. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  991. __asm__ volatile(
  992. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  993. "mov %4, %%"REG_b" \n\t"
  994. "push %%"REG_BP" \n\t"
  995. YSCALEYUV2RGB(%%REGBP, %5)
  996. "pxor %%mm7, %%mm7 \n\t"
  997. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  998. "pop %%"REG_BP" \n\t"
  999. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1000. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1001. "a" (&c->redDither)
  1002. );
  1003. }
  1004. static inline void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0,
  1005. const uint16_t *buf1, const uint16_t *ubuf0,
  1006. const uint16_t *ubuf1, const uint16_t *vbuf0,
  1007. const uint16_t *vbuf1, const uint16_t *abuf0,
  1008. const uint16_t *abuf1, uint8_t *dest,
  1009. int dstW, int yalpha, int uvalpha, int y)
  1010. {
  1011. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  1012. __asm__ volatile(
  1013. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1014. "mov %4, %%"REG_b" \n\t"
  1015. "push %%"REG_BP" \n\t"
  1016. YSCALEYUV2RGB(%%REGBP, %5)
  1017. "pxor %%mm7, %%mm7 \n\t"
  1018. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1019. #ifdef DITHER1XBPP
  1020. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1021. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1022. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1023. #endif
  1024. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1025. "pop %%"REG_BP" \n\t"
  1026. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1027. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1028. "a" (&c->redDither)
  1029. );
  1030. }
  1031. static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0,
  1032. const uint16_t *buf1, const uint16_t *ubuf0,
  1033. const uint16_t *ubuf1, const uint16_t *vbuf0,
  1034. const uint16_t *vbuf1, const uint16_t *abuf0,
  1035. const uint16_t *abuf1, uint8_t *dest,
  1036. int dstW, int yalpha, int uvalpha, int y)
  1037. {
  1038. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  1039. __asm__ volatile(
  1040. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1041. "mov %4, %%"REG_b" \n\t"
  1042. "push %%"REG_BP" \n\t"
  1043. YSCALEYUV2RGB(%%REGBP, %5)
  1044. "pxor %%mm7, %%mm7 \n\t"
  1045. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1046. #ifdef DITHER1XBPP
  1047. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1048. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1049. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1050. #endif
  1051. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1052. "pop %%"REG_BP" \n\t"
  1053. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1054. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1055. "a" (&c->redDither)
  1056. );
  1057. }
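/*
 * REAL_YSCALEYUV2PACKED below is the same two-row blend, but for YUY2 output:
 * the blend weights are pre-shifted right by 3 and the samples by 7, so the
 * result already sits in the 8-bit range and can be fed straight to WRITEYUY2
 * without passing through the RGB matrix.
 */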
  1058. #define REAL_YSCALEYUV2PACKED(index, c) \
  1059. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  1060. "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
  1061. "psraw $3, %%mm0 \n\t"\
  1062. "psraw $3, %%mm1 \n\t"\
  1063. "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  1064. "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  1065. "xor "#index", "#index" \n\t"\
  1066. ".p2align 4 \n\t"\
  1067. "1: \n\t"\
  1068. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  1069. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  1070. "add "UV_OFFx2"("#c"), "#index" \n\t" \
  1071. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  1072. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  1073. "sub "UV_OFFx2"("#c"), "#index" \n\t" \
  1074. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  1075. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  1076. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  1077. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  1078. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  1079. "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  1080. "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  1081. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  1082. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  1083. "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  1084. "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  1085. "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  1086. "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  1087. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  1088. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  1089. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  1090. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  1091. "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1092. "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1093. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  1094. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  1095. #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
  1096. static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0,
  1097. const uint16_t *buf1, const uint16_t *ubuf0,
  1098. const uint16_t *ubuf1, const uint16_t *vbuf0,
  1099. const uint16_t *vbuf1, const uint16_t *abuf0,
  1100. const uint16_t *abuf1, uint8_t *dest,
  1101. int dstW, int yalpha, int uvalpha, int y)
  1102. {
  1103. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  1104. __asm__ volatile(
  1105. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1106. "mov %4, %%"REG_b" \n\t"
  1107. "push %%"REG_BP" \n\t"
  1108. YSCALEYUV2PACKED(%%REGBP, %5)
  1109. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1110. "pop %%"REG_BP" \n\t"
  1111. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1112. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1113. "a" (&c->redDither)
  1114. );
  1115. }
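/*
 * The *_1 variants that follow skip vertical interpolation entirely and read
 * only buf0: YSCALEYUV2RGB1 uses a single chroma row (taken when
 * uvalpha < 2048, i.e. the phase is close enough to a source row), while
 * YSCALEYUV2RGB1b averages the two chroma rows with a plain (a + b) >> 1
 * instead of a weighted blend.
 */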
  1116. #define REAL_YSCALEYUV2RGB1(index, c) \
  1117. "xor "#index", "#index" \n\t"\
  1118. ".p2align 4 \n\t"\
  1119. "1: \n\t"\
  1120. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  1121. "add "UV_OFFx2"("#c"), "#index" \n\t" \
  1122. "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  1123. "sub "UV_OFFx2"("#c"), "#index" \n\t" \
  1124. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  1125. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  1126. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  1127. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  1128. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  1129. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  1130. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  1131. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  1132. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  1133. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1134. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  1135. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1136. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1137. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  1138. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  1139. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  1140. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  1141. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  1142. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  1143. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  1144. "paddw %%mm3, %%mm4 \n\t"\
  1145. "movq %%mm2, %%mm0 \n\t"\
  1146. "movq %%mm5, %%mm6 \n\t"\
  1147. "movq %%mm4, %%mm3 \n\t"\
  1148. "punpcklwd %%mm2, %%mm2 \n\t"\
  1149. "punpcklwd %%mm5, %%mm5 \n\t"\
  1150. "punpcklwd %%mm4, %%mm4 \n\t"\
  1151. "paddw %%mm1, %%mm2 \n\t"\
  1152. "paddw %%mm1, %%mm5 \n\t"\
  1153. "paddw %%mm1, %%mm4 \n\t"\
  1154. "punpckhwd %%mm0, %%mm0 \n\t"\
  1155. "punpckhwd %%mm6, %%mm6 \n\t"\
  1156. "punpckhwd %%mm3, %%mm3 \n\t"\
  1157. "paddw %%mm7, %%mm0 \n\t"\
  1158. "paddw %%mm7, %%mm6 \n\t"\
  1159. "paddw %%mm7, %%mm3 \n\t"\
  1160. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  1161. "packuswb %%mm0, %%mm2 \n\t"\
  1162. "packuswb %%mm6, %%mm5 \n\t"\
  1163. "packuswb %%mm3, %%mm4 \n\t"\
  1164. #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
  1165. // do vertical chrominance interpolation
  1166. #define REAL_YSCALEYUV2RGB1b(index, c) \
  1167. "xor "#index", "#index" \n\t"\
  1168. ".p2align 4 \n\t"\
  1169. "1: \n\t"\
  1170. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  1171. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  1172. "add "UV_OFFx2"("#c"), "#index" \n\t" \
  1173. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  1174. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  1175. "sub "UV_OFFx2"("#c"), "#index" \n\t" \
  1176. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  1177. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  1178. "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
  1179. "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
  1180. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  1181. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  1182. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  1183. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  1184. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  1185. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  1186. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  1187. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1188. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1189. "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
1190. "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
  1191. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  1192. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  1193. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  1194. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  1195. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  1196. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  1197. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  1198. "paddw %%mm3, %%mm4 \n\t"\
  1199. "movq %%mm2, %%mm0 \n\t"\
  1200. "movq %%mm5, %%mm6 \n\t"\
  1201. "movq %%mm4, %%mm3 \n\t"\
  1202. "punpcklwd %%mm2, %%mm2 \n\t"\
  1203. "punpcklwd %%mm5, %%mm5 \n\t"\
  1204. "punpcklwd %%mm4, %%mm4 \n\t"\
  1205. "paddw %%mm1, %%mm2 \n\t"\
  1206. "paddw %%mm1, %%mm5 \n\t"\
  1207. "paddw %%mm1, %%mm4 \n\t"\
  1208. "punpckhwd %%mm0, %%mm0 \n\t"\
  1209. "punpckhwd %%mm6, %%mm6 \n\t"\
  1210. "punpckhwd %%mm3, %%mm3 \n\t"\
  1211. "paddw %%mm7, %%mm0 \n\t"\
  1212. "paddw %%mm7, %%mm6 \n\t"\
  1213. "paddw %%mm7, %%mm3 \n\t"\
  1214. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  1215. "packuswb %%mm0, %%mm2 \n\t"\
  1216. "packuswb %%mm6, %%mm5 \n\t"\
  1217. "packuswb %%mm3, %%mm4 \n\t"\
  1218. #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
  1219. #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
  1220. "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
  1221. "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
  1222. "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
  1223. "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
  1224. "packuswb %%mm1, %%mm7 \n\t"
  1225. #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
  1226. /**
  1227. * YV12 to RGB without scaling or interpolating
  1228. */
  1229. static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0,
  1230. const uint16_t *ubuf0, const uint16_t *ubuf1,
  1231. const uint16_t *vbuf0, const uint16_t *vbuf1,
  1232. const uint16_t *abuf0, uint8_t *dest,
  1233. int dstW, int uvalpha, enum PixelFormat dstFormat,
  1234. int flags, int y)
  1235. {
  1236. const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1237. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1238. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1239. __asm__ volatile(
  1240. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1241. "mov %4, %%"REG_b" \n\t"
  1242. "push %%"REG_BP" \n\t"
  1243. YSCALEYUV2RGB1(%%REGBP, %5)
  1244. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1245. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1246. "pop %%"REG_BP" \n\t"
  1247. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1248. :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1249. "a" (&c->redDither)
  1250. );
  1251. } else {
  1252. __asm__ volatile(
  1253. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1254. "mov %4, %%"REG_b" \n\t"
  1255. "push %%"REG_BP" \n\t"
  1256. YSCALEYUV2RGB1(%%REGBP, %5)
  1257. "pcmpeqd %%mm7, %%mm7 \n\t"
  1258. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1259. "pop %%"REG_BP" \n\t"
  1260. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1261. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1262. "a" (&c->redDither)
  1263. );
  1264. }
  1265. } else {
  1266. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1267. __asm__ volatile(
  1268. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1269. "mov %4, %%"REG_b" \n\t"
  1270. "push %%"REG_BP" \n\t"
  1271. YSCALEYUV2RGB1b(%%REGBP, %5)
  1272. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1273. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1274. "pop %%"REG_BP" \n\t"
  1275. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1276. :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1277. "a" (&c->redDither)
  1278. );
  1279. } else {
  1280. __asm__ volatile(
  1281. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1282. "mov %4, %%"REG_b" \n\t"
  1283. "push %%"REG_BP" \n\t"
  1284. YSCALEYUV2RGB1b(%%REGBP, %5)
  1285. "pcmpeqd %%mm7, %%mm7 \n\t"
  1286. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1287. "pop %%"REG_BP" \n\t"
  1288. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1289. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1290. "a" (&c->redDither)
  1291. );
  1292. }
  1293. }
  1294. }
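/**
 * YV12 to BGR24 without scaling or interpolating
 */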
  1295. static inline void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0,
  1296. const uint16_t *ubuf0, const uint16_t *ubuf1,
  1297. const uint16_t *vbuf0, const uint16_t *vbuf1,
  1298. const uint16_t *abuf0, uint8_t *dest,
  1299. int dstW, int uvalpha, enum PixelFormat dstFormat,
  1300. int flags, int y)
  1301. {
  1302. const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1303. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1304. __asm__ volatile(
  1305. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1306. "mov %4, %%"REG_b" \n\t"
  1307. "push %%"REG_BP" \n\t"
  1308. YSCALEYUV2RGB1(%%REGBP, %5)
  1309. "pxor %%mm7, %%mm7 \n\t"
  1310. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1311. "pop %%"REG_BP" \n\t"
  1312. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1313. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1314. "a" (&c->redDither)
  1315. );
  1316. } else {
  1317. __asm__ volatile(
  1318. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1319. "mov %4, %%"REG_b" \n\t"
  1320. "push %%"REG_BP" \n\t"
  1321. YSCALEYUV2RGB1b(%%REGBP, %5)
  1322. "pxor %%mm7, %%mm7 \n\t"
  1323. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1324. "pop %%"REG_BP" \n\t"
  1325. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1326. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1327. "a" (&c->redDither)
  1328. );
  1329. }
  1330. }
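/**
 * YV12 to RGB555 without scaling or interpolating
 */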
  1331. static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0,
  1332. const uint16_t *ubuf0, const uint16_t *ubuf1,
  1333. const uint16_t *vbuf0, const uint16_t *vbuf1,
  1334. const uint16_t *abuf0, uint8_t *dest,
  1335. int dstW, int uvalpha, enum PixelFormat dstFormat,
  1336. int flags, int y)
  1337. {
  1338. const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1339. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1340. __asm__ volatile(
  1341. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1342. "mov %4, %%"REG_b" \n\t"
  1343. "push %%"REG_BP" \n\t"
  1344. YSCALEYUV2RGB1(%%REGBP, %5)
  1345. "pxor %%mm7, %%mm7 \n\t"
  1346. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1347. #ifdef DITHER1XBPP
  1348. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1349. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1350. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1351. #endif
  1352. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1353. "pop %%"REG_BP" \n\t"
  1354. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1355. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1356. "a" (&c->redDither)
  1357. );
  1358. } else {
  1359. __asm__ volatile(
  1360. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1361. "mov %4, %%"REG_b" \n\t"
  1362. "push %%"REG_BP" \n\t"
  1363. YSCALEYUV2RGB1b(%%REGBP, %5)
  1364. "pxor %%mm7, %%mm7 \n\t"
  1365. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1366. #ifdef DITHER1XBPP
  1367. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1368. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1369. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1370. #endif
  1371. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1372. "pop %%"REG_BP" \n\t"
  1373. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1374. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1375. "a" (&c->redDither)
  1376. );
  1377. }
  1378. }
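/**
 * YV12 to RGB565 without scaling or interpolating
 */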
  1379. static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
  1380. const uint16_t *ubuf0, const uint16_t *ubuf1,
  1381. const uint16_t *vbuf0, const uint16_t *vbuf1,
  1382. const uint16_t *abuf0, uint8_t *dest,
  1383. int dstW, int uvalpha, enum PixelFormat dstFormat,
  1384. int flags, int y)
  1385. {
  1386. const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1387. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1388. __asm__ volatile(
  1389. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1390. "mov %4, %%"REG_b" \n\t"
  1391. "push %%"REG_BP" \n\t"
  1392. YSCALEYUV2RGB1(%%REGBP, %5)
  1393. "pxor %%mm7, %%mm7 \n\t"
  1394. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1395. #ifdef DITHER1XBPP
  1396. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1397. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1398. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1399. #endif
  1400. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1401. "pop %%"REG_BP" \n\t"
  1402. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1403. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1404. "a" (&c->redDither)
  1405. );
  1406. } else {
  1407. __asm__ volatile(
  1408. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1409. "mov %4, %%"REG_b" \n\t"
  1410. "push %%"REG_BP" \n\t"
  1411. YSCALEYUV2RGB1b(%%REGBP, %5)
  1412. "pxor %%mm7, %%mm7 \n\t"
  1413. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1414. #ifdef DITHER1XBPP
  1415. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1416. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1417. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1418. #endif
  1419. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1420. "pop %%"REG_BP" \n\t"
  1421. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1422. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1423. "a" (&c->redDither)
  1424. );
  1425. }
  1426. }
  1427. #define REAL_YSCALEYUV2PACKED1(index, c) \
  1428. "xor "#index", "#index" \n\t"\
  1429. ".p2align 4 \n\t"\
  1430. "1: \n\t"\
  1431. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  1432. "add "UV_OFFx2"("#c"), "#index" \n\t" \
  1433. "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  1434. "sub "UV_OFFx2"("#c"), "#index" \n\t" \
  1435. "psraw $7, %%mm3 \n\t" \
  1436. "psraw $7, %%mm4 \n\t" \
  1437. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1438. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
  1439. "psraw $7, %%mm1 \n\t" \
  1440. "psraw $7, %%mm7 \n\t" \
  1441. #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
  1442. #define REAL_YSCALEYUV2PACKED1b(index, c) \
  1443. "xor "#index", "#index" \n\t"\
  1444. ".p2align 4 \n\t"\
  1445. "1: \n\t"\
  1446. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  1447. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  1448. "add "UV_OFFx2"("#c"), "#index" \n\t" \
  1449. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  1450. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  1451. "sub "UV_OFFx2"("#c"), "#index" \n\t" \
  1452. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  1453. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  1454. "psrlw $8, %%mm3 \n\t" \
  1455. "psrlw $8, %%mm4 \n\t" \
  1456. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1457. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
  1458. "psraw $7, %%mm1 \n\t" \
  1459. "psraw $7, %%mm7 \n\t"
  1460. #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
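/**
 * YV12 to YUYV422 without scaling or interpolating
 */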
  1461. static inline void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0,
  1462. const uint16_t *ubuf0, const uint16_t *ubuf1,
  1463. const uint16_t *vbuf0, const uint16_t *vbuf1,
  1464. const uint16_t *abuf0, uint8_t *dest,
  1465. int dstW, int uvalpha, enum PixelFormat dstFormat,
  1466. int flags, int y)
  1467. {
  1468. const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1469. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1470. __asm__ volatile(
  1471. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1472. "mov %4, %%"REG_b" \n\t"
  1473. "push %%"REG_BP" \n\t"
  1474. YSCALEYUV2PACKED1(%%REGBP, %5)
  1475. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1476. "pop %%"REG_BP" \n\t"
  1477. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1478. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1479. "a" (&c->redDither)
  1480. );
  1481. } else {
  1482. __asm__ volatile(
  1483. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1484. "mov %4, %%"REG_b" \n\t"
  1485. "push %%"REG_BP" \n\t"
  1486. YSCALEYUV2PACKED1b(%%REGBP, %5)
  1487. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1488. "pop %%"REG_BP" \n\t"
  1489. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1490. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1491. "a" (&c->redDither)
  1492. );
  1493. }
  1494. }
  1495. #if !COMPILE_TEMPLATE_MMX2
1496. //FIXME yuy2* can read up to 7 samples too many
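/**
 * Extract the luma samples from packed YUYV422 input.
 */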
  1497. static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
  1498. {
  1499. __asm__ volatile(
  1500. "movq "MANGLE(bm01010101)", %%mm2 \n\t"
  1501. "mov %0, %%"REG_a" \n\t"
  1502. "1: \n\t"
  1503. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1504. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1505. "pand %%mm2, %%mm0 \n\t"
  1506. "pand %%mm2, %%mm1 \n\t"
  1507. "packuswb %%mm1, %%mm0 \n\t"
  1508. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1509. "add $8, %%"REG_a" \n\t"
  1510. " js 1b \n\t"
  1511. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1512. : "%"REG_a
  1513. );
  1514. }
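/**
 * Extract and deinterleave the U and V samples from packed YUYV422 input.
 */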
  1515. static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
  1516. {
  1517. __asm__ volatile(
  1518. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1519. "mov %0, %%"REG_a" \n\t"
  1520. "1: \n\t"
  1521. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1522. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1523. "psrlw $8, %%mm0 \n\t"
  1524. "psrlw $8, %%mm1 \n\t"
  1525. "packuswb %%mm1, %%mm0 \n\t"
  1526. "movq %%mm0, %%mm1 \n\t"
  1527. "psrlw $8, %%mm0 \n\t"
  1528. "pand %%mm4, %%mm1 \n\t"
  1529. "packuswb %%mm0, %%mm0 \n\t"
  1530. "packuswb %%mm1, %%mm1 \n\t"
  1531. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1532. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1533. "add $4, %%"REG_a" \n\t"
  1534. " js 1b \n\t"
  1535. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1536. : "%"REG_a
  1537. );
  1538. assert(src1 == src2);
  1539. }
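/**
 * Keep the most significant byte of each 16-bit little-endian chroma sample
 * from the two input planes (used for the YUV42xP16LE formats).
 */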
  1540. static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
  1541. {
  1542. __asm__ volatile(
  1543. "mov %0, %%"REG_a" \n\t"
  1544. "1: \n\t"
  1545. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1546. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1547. "movq (%2, %%"REG_a",2), %%mm2 \n\t"
  1548. "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
  1549. "psrlw $8, %%mm0 \n\t"
  1550. "psrlw $8, %%mm1 \n\t"
  1551. "psrlw $8, %%mm2 \n\t"
  1552. "psrlw $8, %%mm3 \n\t"
  1553. "packuswb %%mm1, %%mm0 \n\t"
  1554. "packuswb %%mm3, %%mm2 \n\t"
  1555. "movq %%mm0, (%3, %%"REG_a") \n\t"
  1556. "movq %%mm2, (%4, %%"REG_a") \n\t"
  1557. "add $8, %%"REG_a" \n\t"
  1558. " js 1b \n\t"
  1559. : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
  1560. : "%"REG_a
  1561. );
  1562. }
1563. /* This is almost identical to the previous one, and exists only because
1564. * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
  1565. static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
  1566. {
  1567. __asm__ volatile(
  1568. "mov %0, %%"REG_a" \n\t"
  1569. "1: \n\t"
  1570. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1571. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1572. "psrlw $8, %%mm0 \n\t"
  1573. "psrlw $8, %%mm1 \n\t"
  1574. "packuswb %%mm1, %%mm0 \n\t"
  1575. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1576. "add $8, %%"REG_a" \n\t"
  1577. " js 1b \n\t"
  1578. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1579. : "%"REG_a
  1580. );
  1581. }
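/**
 * Extract and deinterleave the U and V samples from packed UYVY422 input.
 */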
  1582. static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
  1583. {
  1584. __asm__ volatile(
  1585. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1586. "mov %0, %%"REG_a" \n\t"
  1587. "1: \n\t"
  1588. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1589. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1590. "pand %%mm4, %%mm0 \n\t"
  1591. "pand %%mm4, %%mm1 \n\t"
  1592. "packuswb %%mm1, %%mm0 \n\t"
  1593. "movq %%mm0, %%mm1 \n\t"
  1594. "psrlw $8, %%mm0 \n\t"
  1595. "pand %%mm4, %%mm1 \n\t"
  1596. "packuswb %%mm0, %%mm0 \n\t"
  1597. "packuswb %%mm1, %%mm1 \n\t"
  1598. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1599. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1600. "add $4, %%"REG_a" \n\t"
  1601. " js 1b \n\t"
  1602. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1603. : "%"REG_a
  1604. );
  1605. assert(src1 == src2);
  1606. }
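/**
 * Keep the most significant byte of each 16-bit big-endian chroma sample
 * from the two input planes (used for the YUV42xP16BE formats).
 */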
  1607. static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
  1608. {
  1609. __asm__ volatile(
  1610. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1611. "mov %0, %%"REG_a" \n\t"
  1612. "1: \n\t"
  1613. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1614. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1615. "movq (%2, %%"REG_a",2), %%mm2 \n\t"
  1616. "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
  1617. "pand %%mm4, %%mm0 \n\t"
  1618. "pand %%mm4, %%mm1 \n\t"
  1619. "pand %%mm4, %%mm2 \n\t"
  1620. "pand %%mm4, %%mm3 \n\t"
  1621. "packuswb %%mm1, %%mm0 \n\t"
  1622. "packuswb %%mm3, %%mm2 \n\t"
  1623. "movq %%mm0, (%3, %%"REG_a") \n\t"
  1624. "movq %%mm2, (%4, %%"REG_a") \n\t"
  1625. "add $8, %%"REG_a" \n\t"
  1626. " js 1b \n\t"
  1627. : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
  1628. : "%"REG_a
  1629. );
  1630. }
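/**
 * Deinterleave the packed chroma plane of NV12/NV21 into two separate planes.
 */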
  1631. static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
  1632. const uint8_t *src, int width)
  1633. {
  1634. __asm__ volatile(
  1635. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1636. "mov %0, %%"REG_a" \n\t"
  1637. "1: \n\t"
  1638. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1639. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1640. "movq %%mm0, %%mm2 \n\t"
  1641. "movq %%mm1, %%mm3 \n\t"
  1642. "pand %%mm4, %%mm0 \n\t"
  1643. "pand %%mm4, %%mm1 \n\t"
  1644. "psrlw $8, %%mm2 \n\t"
  1645. "psrlw $8, %%mm3 \n\t"
  1646. "packuswb %%mm1, %%mm0 \n\t"
  1647. "packuswb %%mm3, %%mm2 \n\t"
  1648. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1649. "movq %%mm2, (%3, %%"REG_a") \n\t"
  1650. "add $8, %%"REG_a" \n\t"
  1651. " js 1b \n\t"
  1652. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
  1653. : "%"REG_a
  1654. );
  1655. }
  1656. static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
  1657. const uint8_t *src1, const uint8_t *src2,
  1658. int width, uint32_t *unused)
  1659. {
  1660. RENAME(nvXXtoUV)(dstU, dstV, src1, width);
  1661. }
  1662. static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
  1663. const uint8_t *src1, const uint8_t *src2,
  1664. int width, uint32_t *unused)
  1665. {
  1666. RENAME(nvXXtoUV)(dstV, dstU, src1, width);
  1667. }
  1668. #endif /* !COMPILE_TEMPLATE_MMX2 */
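/**
 * Compute the luma plane from packed 24-bit BGR or RGB input, using the
 * pmaddwd coefficient tables selected according to srcFormat.
 */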
  1669. static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, int width, enum PixelFormat srcFormat)
  1670. {
  1671. if(srcFormat == PIX_FMT_BGR24) {
  1672. __asm__ volatile(
  1673. "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
  1674. "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
  1675. :
  1676. );
  1677. } else {
  1678. __asm__ volatile(
  1679. "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
  1680. "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
  1681. :
  1682. );
  1683. }
  1684. __asm__ volatile(
  1685. "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
  1686. "mov %2, %%"REG_a" \n\t"
  1687. "pxor %%mm7, %%mm7 \n\t"
  1688. "1: \n\t"
  1689. PREFETCH" 64(%0) \n\t"
  1690. "movd (%0), %%mm0 \n\t"
  1691. "movd 2(%0), %%mm1 \n\t"
  1692. "movd 6(%0), %%mm2 \n\t"
  1693. "movd 8(%0), %%mm3 \n\t"
  1694. "add $12, %0 \n\t"
  1695. "punpcklbw %%mm7, %%mm0 \n\t"
  1696. "punpcklbw %%mm7, %%mm1 \n\t"
  1697. "punpcklbw %%mm7, %%mm2 \n\t"
  1698. "punpcklbw %%mm7, %%mm3 \n\t"
  1699. "pmaddwd %%mm5, %%mm0 \n\t"
  1700. "pmaddwd %%mm6, %%mm1 \n\t"
  1701. "pmaddwd %%mm5, %%mm2 \n\t"
  1702. "pmaddwd %%mm6, %%mm3 \n\t"
  1703. "paddd %%mm1, %%mm0 \n\t"
  1704. "paddd %%mm3, %%mm2 \n\t"
  1705. "paddd %%mm4, %%mm0 \n\t"
  1706. "paddd %%mm4, %%mm2 \n\t"
  1707. "psrad $15, %%mm0 \n\t"
  1708. "psrad $15, %%mm2 \n\t"
  1709. "packssdw %%mm2, %%mm0 \n\t"
  1710. "packuswb %%mm0, %%mm0 \n\t"
  1711. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1712. "add $4, %%"REG_a" \n\t"
  1713. " js 1b \n\t"
  1714. : "+r" (src)
  1715. : "r" (dst+width), "g" ((x86_reg)-width)
  1716. : "%"REG_a
  1717. );
  1718. }
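/**
 * Compute the chroma planes from packed 24-bit BGR or RGB input; the
 * coefficient table is taken from ff_bgr24toUV[] according to srcFormat.
 */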
  1719. static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, int width, enum PixelFormat srcFormat)
  1720. {
  1721. __asm__ volatile(
  1722. "movq 24(%4), %%mm6 \n\t"
  1723. "mov %3, %%"REG_a" \n\t"
  1724. "pxor %%mm7, %%mm7 \n\t"
  1725. "1: \n\t"
  1726. PREFETCH" 64(%0) \n\t"
  1727. "movd (%0), %%mm0 \n\t"
  1728. "movd 2(%0), %%mm1 \n\t"
  1729. "punpcklbw %%mm7, %%mm0 \n\t"
  1730. "punpcklbw %%mm7, %%mm1 \n\t"
  1731. "movq %%mm0, %%mm2 \n\t"
  1732. "movq %%mm1, %%mm3 \n\t"
  1733. "pmaddwd (%4), %%mm0 \n\t"
  1734. "pmaddwd 8(%4), %%mm1 \n\t"
  1735. "pmaddwd 16(%4), %%mm2 \n\t"
  1736. "pmaddwd %%mm6, %%mm3 \n\t"
  1737. "paddd %%mm1, %%mm0 \n\t"
  1738. "paddd %%mm3, %%mm2 \n\t"
  1739. "movd 6(%0), %%mm1 \n\t"
  1740. "movd 8(%0), %%mm3 \n\t"
  1741. "add $12, %0 \n\t"
  1742. "punpcklbw %%mm7, %%mm1 \n\t"
  1743. "punpcklbw %%mm7, %%mm3 \n\t"
  1744. "movq %%mm1, %%mm4 \n\t"
  1745. "movq %%mm3, %%mm5 \n\t"
  1746. "pmaddwd (%4), %%mm1 \n\t"
  1747. "pmaddwd 8(%4), %%mm3 \n\t"
  1748. "pmaddwd 16(%4), %%mm4 \n\t"
  1749. "pmaddwd %%mm6, %%mm5 \n\t"
  1750. "paddd %%mm3, %%mm1 \n\t"
  1751. "paddd %%mm5, %%mm4 \n\t"
  1752. "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
  1753. "paddd %%mm3, %%mm0 \n\t"
  1754. "paddd %%mm3, %%mm2 \n\t"
  1755. "paddd %%mm3, %%mm1 \n\t"
  1756. "paddd %%mm3, %%mm4 \n\t"
  1757. "psrad $15, %%mm0 \n\t"
  1758. "psrad $15, %%mm2 \n\t"
  1759. "psrad $15, %%mm1 \n\t"
  1760. "psrad $15, %%mm4 \n\t"
  1761. "packssdw %%mm1, %%mm0 \n\t"
  1762. "packssdw %%mm4, %%mm2 \n\t"
  1763. "packuswb %%mm0, %%mm0 \n\t"
  1764. "packuswb %%mm2, %%mm2 \n\t"
  1765. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1766. "movd %%mm2, (%2, %%"REG_a") \n\t"
  1767. "add $4, %%"REG_a" \n\t"
  1768. " js 1b \n\t"
  1769. : "+r" (src)
  1770. : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
  1771. : "%"REG_a
  1772. );
  1773. }
  1774. static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
  1775. {
  1776. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
  1777. }
  1778. static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
  1779. {
  1780. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
  1781. assert(src1 == src2);
  1782. }
  1783. static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
  1784. {
  1785. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
  1786. }
  1787. static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
  1788. {
  1789. assert(src1==src2);
  1790. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
  1791. }
  1792. #if !COMPILE_TEMPLATE_MMX2
  1793. // bilinear / bicubic scaling
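/**
 * Horizontal scaling with an arbitrary filter: dedicated MMX loops for
 * filterSize 4 and 8, and a generic inner loop for larger filter sizes.
 */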
  1794. static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
  1795. const int16_t *filter, const int16_t *filterPos, int filterSize)
  1796. {
  1797. assert(filterSize % 4 == 0 && filterSize>0);
  1798. if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
  1799. x86_reg counter= -2*dstW;
  1800. filter-= counter*2;
  1801. filterPos-= counter/2;
  1802. dst-= counter/2;
  1803. __asm__ volatile(
  1804. #if defined(PIC)
  1805. "push %%"REG_b" \n\t"
  1806. #endif
  1807. "pxor %%mm7, %%mm7 \n\t"
  1808. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  1809. "mov %%"REG_a", %%"REG_BP" \n\t"
  1810. ".p2align 4 \n\t"
  1811. "1: \n\t"
  1812. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  1813. "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
  1814. "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
  1815. "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
  1816. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  1817. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  1818. "punpcklbw %%mm7, %%mm0 \n\t"
  1819. "punpcklbw %%mm7, %%mm2 \n\t"
  1820. "pmaddwd %%mm1, %%mm0 \n\t"
  1821. "pmaddwd %%mm2, %%mm3 \n\t"
  1822. "movq %%mm0, %%mm4 \n\t"
  1823. "punpckldq %%mm3, %%mm0 \n\t"
  1824. "punpckhdq %%mm3, %%mm4 \n\t"
  1825. "paddd %%mm4, %%mm0 \n\t"
  1826. "psrad $7, %%mm0 \n\t"
  1827. "packssdw %%mm0, %%mm0 \n\t"
  1828. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  1829. "add $4, %%"REG_BP" \n\t"
  1830. " jnc 1b \n\t"
  1831. "pop %%"REG_BP" \n\t"
  1832. #if defined(PIC)
  1833. "pop %%"REG_b" \n\t"
  1834. #endif
  1835. : "+a" (counter)
  1836. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  1837. #if !defined(PIC)
  1838. : "%"REG_b
  1839. #endif
  1840. );
  1841. } else if (filterSize==8) {
  1842. x86_reg counter= -2*dstW;
  1843. filter-= counter*4;
  1844. filterPos-= counter/2;
  1845. dst-= counter/2;
  1846. __asm__ volatile(
  1847. #if defined(PIC)
  1848. "push %%"REG_b" \n\t"
  1849. #endif
  1850. "pxor %%mm7, %%mm7 \n\t"
  1851. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  1852. "mov %%"REG_a", %%"REG_BP" \n\t"
  1853. ".p2align 4 \n\t"
  1854. "1: \n\t"
  1855. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  1856. "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
  1857. "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
  1858. "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
  1859. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  1860. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  1861. "punpcklbw %%mm7, %%mm0 \n\t"
  1862. "punpcklbw %%mm7, %%mm2 \n\t"
  1863. "pmaddwd %%mm1, %%mm0 \n\t"
  1864. "pmaddwd %%mm2, %%mm3 \n\t"
  1865. "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
  1866. "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
  1867. "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
  1868. "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
  1869. "punpcklbw %%mm7, %%mm4 \n\t"
  1870. "punpcklbw %%mm7, %%mm2 \n\t"
  1871. "pmaddwd %%mm1, %%mm4 \n\t"
  1872. "pmaddwd %%mm2, %%mm5 \n\t"
  1873. "paddd %%mm4, %%mm0 \n\t"
  1874. "paddd %%mm5, %%mm3 \n\t"
  1875. "movq %%mm0, %%mm4 \n\t"
  1876. "punpckldq %%mm3, %%mm0 \n\t"
  1877. "punpckhdq %%mm3, %%mm4 \n\t"
  1878. "paddd %%mm4, %%mm0 \n\t"
  1879. "psrad $7, %%mm0 \n\t"
  1880. "packssdw %%mm0, %%mm0 \n\t"
  1881. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  1882. "add $4, %%"REG_BP" \n\t"
  1883. " jnc 1b \n\t"
  1884. "pop %%"REG_BP" \n\t"
  1885. #if defined(PIC)
  1886. "pop %%"REG_b" \n\t"
  1887. #endif
  1888. : "+a" (counter)
  1889. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  1890. #if !defined(PIC)
  1891. : "%"REG_b
  1892. #endif
  1893. );
  1894. } else {
  1895. const uint8_t *offset = src+filterSize;
  1896. x86_reg counter= -2*dstW;
  1897. //filter-= counter*filterSize/2;
  1898. filterPos-= counter/2;
  1899. dst-= counter/2;
  1900. __asm__ volatile(
  1901. "pxor %%mm7, %%mm7 \n\t"
  1902. ".p2align 4 \n\t"
  1903. "1: \n\t"
  1904. "mov %2, %%"REG_c" \n\t"
  1905. "movzwl (%%"REG_c", %0), %%eax \n\t"
  1906. "movzwl 2(%%"REG_c", %0), %%edx \n\t"
  1907. "mov %5, %%"REG_c" \n\t"
  1908. "pxor %%mm4, %%mm4 \n\t"
  1909. "pxor %%mm5, %%mm5 \n\t"
  1910. "2: \n\t"
  1911. "movq (%1), %%mm1 \n\t"
  1912. "movq (%1, %6), %%mm3 \n\t"
  1913. "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
  1914. "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
  1915. "punpcklbw %%mm7, %%mm0 \n\t"
  1916. "punpcklbw %%mm7, %%mm2 \n\t"
  1917. "pmaddwd %%mm1, %%mm0 \n\t"
  1918. "pmaddwd %%mm2, %%mm3 \n\t"
  1919. "paddd %%mm3, %%mm5 \n\t"
  1920. "paddd %%mm0, %%mm4 \n\t"
  1921. "add $8, %1 \n\t"
  1922. "add $4, %%"REG_c" \n\t"
  1923. "cmp %4, %%"REG_c" \n\t"
  1924. " jb 2b \n\t"
  1925. "add %6, %1 \n\t"
  1926. "movq %%mm4, %%mm0 \n\t"
  1927. "punpckldq %%mm5, %%mm4 \n\t"
  1928. "punpckhdq %%mm5, %%mm0 \n\t"
  1929. "paddd %%mm0, %%mm4 \n\t"
  1930. "psrad $7, %%mm4 \n\t"
  1931. "packssdw %%mm4, %%mm4 \n\t"
  1932. "mov %3, %%"REG_a" \n\t"
  1933. "movd %%mm4, (%%"REG_a", %0) \n\t"
  1934. "add $4, %0 \n\t"
  1935. " jnc 1b \n\t"
  1936. : "+r" (counter), "+r" (filter)
  1937. : "m" (filterPos), "m" (dst), "m"(offset),
  1938. "m" (src), "r" ((x86_reg)filterSize*2)
  1939. : "%"REG_a, "%"REG_c, "%"REG_d
  1940. );
  1941. }
  1942. }
  1943. #endif /* !COMPILE_TEMPLATE_MMX2 */
  1944. #if COMPILE_TEMPLATE_MMX2
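/**
 * Fast bilinear horizontal luma scaling using the runtime-generated MMX2
 * filter code stored in c->lumMmx2FilterCode.
 */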
  1945. static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
  1946. int dstWidth, const uint8_t *src, int srcW,
  1947. int xInc)
  1948. {
  1949. int32_t *filterPos = c->hLumFilterPos;
  1950. int16_t *filter = c->hLumFilter;
  1951. void *mmx2FilterCode= c->lumMmx2FilterCode;
  1952. int i;
  1953. #if defined(PIC)
  1954. DECLARE_ALIGNED(8, uint64_t, ebxsave);
  1955. #endif
  1956. __asm__ volatile(
  1957. #if defined(PIC)
  1958. "mov %%"REG_b", %5 \n\t"
  1959. #endif
  1960. "pxor %%mm7, %%mm7 \n\t"
  1961. "mov %0, %%"REG_c" \n\t"
  1962. "mov %1, %%"REG_D" \n\t"
  1963. "mov %2, %%"REG_d" \n\t"
  1964. "mov %3, %%"REG_b" \n\t"
  1965. "xor %%"REG_a", %%"REG_a" \n\t" // i
  1966. PREFETCH" (%%"REG_c") \n\t"
  1967. PREFETCH" 32(%%"REG_c") \n\t"
  1968. PREFETCH" 64(%%"REG_c") \n\t"
  1969. #if ARCH_X86_64
  1970. #define CALL_MMX2_FILTER_CODE \
  1971. "movl (%%"REG_b"), %%esi \n\t"\
  1972. "call *%4 \n\t"\
  1973. "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
  1974. "add %%"REG_S", %%"REG_c" \n\t"\
  1975. "add %%"REG_a", %%"REG_D" \n\t"\
  1976. "xor %%"REG_a", %%"REG_a" \n\t"\
  1977. #else
  1978. #define CALL_MMX2_FILTER_CODE \
  1979. "movl (%%"REG_b"), %%esi \n\t"\
  1980. "call *%4 \n\t"\
  1981. "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
  1982. "add %%"REG_a", %%"REG_D" \n\t"\
  1983. "xor %%"REG_a", %%"REG_a" \n\t"\
  1984. #endif /* ARCH_X86_64 */
  1985. CALL_MMX2_FILTER_CODE
  1986. CALL_MMX2_FILTER_CODE
  1987. CALL_MMX2_FILTER_CODE
  1988. CALL_MMX2_FILTER_CODE
  1989. CALL_MMX2_FILTER_CODE
  1990. CALL_MMX2_FILTER_CODE
  1991. CALL_MMX2_FILTER_CODE
  1992. CALL_MMX2_FILTER_CODE
  1993. #if defined(PIC)
  1994. "mov %5, %%"REG_b" \n\t"
  1995. #endif
  1996. :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
  1997. "m" (mmx2FilterCode)
  1998. #if defined(PIC)
  1999. ,"m" (ebxsave)
  2000. #endif
  2001. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2002. #if !defined(PIC)
  2003. ,"%"REG_b
  2004. #endif
  2005. );
  2006. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
  2007. dst[i] = src[srcW-1]*128;
  2008. }
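/**
 * Fast bilinear horizontal chroma scaling using the runtime-generated MMX2
 * filter code stored in c->chrMmx2FilterCode; processes both chroma planes.
 */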
  2009. static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
  2010. int dstWidth, const uint8_t *src1,
  2011. const uint8_t *src2, int srcW, int xInc)
  2012. {
  2013. int32_t *filterPos = c->hChrFilterPos;
  2014. int16_t *filter = c->hChrFilter;
  2015. void *mmx2FilterCode= c->chrMmx2FilterCode;
  2016. int i;
  2017. #if defined(PIC)
  2018. DECLARE_ALIGNED(8, uint64_t, ebxsave);
  2019. #endif
  2020. __asm__ volatile(
  2021. #if defined(PIC)
  2022. "mov %%"REG_b", %7 \n\t"
  2023. #endif
  2024. "pxor %%mm7, %%mm7 \n\t"
  2025. "mov %0, %%"REG_c" \n\t"
  2026. "mov %1, %%"REG_D" \n\t"
  2027. "mov %2, %%"REG_d" \n\t"
  2028. "mov %3, %%"REG_b" \n\t"
  2029. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2030. PREFETCH" (%%"REG_c") \n\t"
  2031. PREFETCH" 32(%%"REG_c") \n\t"
  2032. PREFETCH" 64(%%"REG_c") \n\t"
  2033. CALL_MMX2_FILTER_CODE
  2034. CALL_MMX2_FILTER_CODE
  2035. CALL_MMX2_FILTER_CODE
  2036. CALL_MMX2_FILTER_CODE
  2037. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2038. "mov %5, %%"REG_c" \n\t" // src
  2039. "mov %6, %%"REG_D" \n\t" // buf2
  2040. PREFETCH" (%%"REG_c") \n\t"
  2041. PREFETCH" 32(%%"REG_c") \n\t"
  2042. PREFETCH" 64(%%"REG_c") \n\t"
  2043. CALL_MMX2_FILTER_CODE
  2044. CALL_MMX2_FILTER_CODE
  2045. CALL_MMX2_FILTER_CODE
  2046. CALL_MMX2_FILTER_CODE
  2047. #if defined(PIC)
  2048. "mov %7, %%"REG_b" \n\t"
  2049. #endif
  2050. :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
  2051. "m" (mmx2FilterCode), "m" (src2), "m"(dst2)
  2052. #if defined(PIC)
  2053. ,"m" (ebxsave)
  2054. #endif
  2055. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2056. #if !defined(PIC)
  2057. ,"%"REG_b
  2058. #endif
  2059. );
  2060. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
  2061. dst1[i] = src1[srcW-1]*128;
  2062. dst2[i] = src2[srcW-1]*128;
  2063. }
  2064. }
  2065. #endif /* COMPILE_TEMPLATE_MMX2 */
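/**
 * Select the MMX/MMX2 scaling and conversion function pointers for this
 * SwsContext according to srcFormat, dstFormat and flags.
 */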
  2066. static void RENAME(sws_init_swScale)(SwsContext *c)
  2067. {
  2068. enum PixelFormat srcFormat = c->srcFormat;
  2069. if (!(c->flags & SWS_BITEXACT)) {
  2070. if (c->flags & SWS_ACCURATE_RND) {
  2071. c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
  2072. c->yuv2yuvX = RENAME(yuv2yuvX_ar );
  2073. switch (c->dstFormat) {
  2074. case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
  2075. case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
  2076. case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
  2077. case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
  2078. case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
  2079. default: break;
  2080. }
  2081. } else {
  2082. c->yuv2yuv1 = RENAME(yuv2yuv1 );
  2083. c->yuv2yuvX = RENAME(yuv2yuvX );
  2084. switch (c->dstFormat) {
  2085. case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
  2086. case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
  2087. case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
  2088. case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
  2089. case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
  2090. default: break;
  2091. }
  2092. }
  2093. switch (c->dstFormat) {
  2094. case PIX_FMT_RGB32:
  2095. c->yuv2packed1 = RENAME(yuv2rgb32_1);
  2096. c->yuv2packed2 = RENAME(yuv2rgb32_2);
  2097. break;
  2098. case PIX_FMT_BGR24:
  2099. c->yuv2packed1 = RENAME(yuv2bgr24_1);
  2100. c->yuv2packed2 = RENAME(yuv2bgr24_2);
  2101. break;
  2102. case PIX_FMT_RGB555:
  2103. c->yuv2packed1 = RENAME(yuv2rgb555_1);
  2104. c->yuv2packed2 = RENAME(yuv2rgb555_2);
  2105. break;
  2106. case PIX_FMT_RGB565:
  2107. c->yuv2packed1 = RENAME(yuv2rgb565_1);
  2108. c->yuv2packed2 = RENAME(yuv2rgb565_2);
  2109. break;
  2110. case PIX_FMT_YUYV422:
  2111. c->yuv2packed1 = RENAME(yuv2yuyv422_1);
  2112. c->yuv2packed2 = RENAME(yuv2yuyv422_2);
  2113. break;
  2114. default:
  2115. break;
  2116. }
  2117. }
  2118. #if !COMPILE_TEMPLATE_MMX2
  2119. c->hScale = RENAME(hScale );
  2120. #endif /* !COMPILE_TEMPLATE_MMX2 */
  2121. // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
  2122. #if COMPILE_TEMPLATE_MMX2
2123. if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed) {
  2125. c->hyscale_fast = RENAME(hyscale_fast);
  2126. c->hcscale_fast = RENAME(hcscale_fast);
  2127. } else {
  2128. #endif /* COMPILE_TEMPLATE_MMX2 */
  2129. c->hyscale_fast = NULL;
  2130. c->hcscale_fast = NULL;
  2131. #if COMPILE_TEMPLATE_MMX2
  2132. }
  2133. #endif /* COMPILE_TEMPLATE_MMX2 */
  2134. #if !COMPILE_TEMPLATE_MMX2
  2135. switch(srcFormat) {
  2136. case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
  2137. case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
  2138. case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
  2139. case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
  2140. case PIX_FMT_YUV420P16BE:
  2141. case PIX_FMT_YUV422P16BE:
  2142. case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
  2143. case PIX_FMT_YUV420P16LE:
  2144. case PIX_FMT_YUV422P16LE:
  2145. case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
  2146. default: break;
  2147. }
  2148. #endif /* !COMPILE_TEMPLATE_MMX2 */
  2149. if (!c->chrSrcHSubSample) {
  2150. switch(srcFormat) {
  2151. case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
  2152. case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
  2153. default: break;
  2154. }
  2155. }
  2156. switch (srcFormat) {
  2157. #if !COMPILE_TEMPLATE_MMX2
  2158. case PIX_FMT_YUYV422 :
  2159. case PIX_FMT_YUV420P16BE:
  2160. case PIX_FMT_YUV422P16BE:
  2161. case PIX_FMT_YUV444P16BE:
  2162. case PIX_FMT_Y400A :
  2163. case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
  2164. case PIX_FMT_UYVY422 :
  2165. case PIX_FMT_YUV420P16LE:
  2166. case PIX_FMT_YUV422P16LE:
  2167. case PIX_FMT_YUV444P16LE:
  2168. case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
  2169. #endif /* !COMPILE_TEMPLATE_MMX2 */
  2170. case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
  2171. case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
  2172. default: break;
  2173. }
  2174. #if !COMPILE_TEMPLATE_MMX2
  2175. if (c->alpPixBuf) {
  2176. switch (srcFormat) {
  2177. case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
  2178. default: break;
  2179. }
  2180. }
  2181. #endif /* !COMPILE_TEMPLATE_MMX2 */
  2182. }