swscale_template.c
  1. /*
  2. * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. *
  20. * The C code (not assembly, MMX, ...) of this file can be used
  21. * under the LGPL license.
  22. */
  23. #undef REAL_MOVNTQ
  24. #undef MOVNTQ
  25. #undef PAVGB
  26. #undef PREFETCH
  27. #if COMPILE_TEMPLATE_AMD3DNOW
  28. #define PREFETCH "prefetch"
  29. #elif COMPILE_TEMPLATE_MMX2
  30. #define PREFETCH "prefetchnta"
  31. #else
  32. #define PREFETCH " # nop"
  33. #endif
  34. #if COMPILE_TEMPLATE_MMX2
  35. #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
  36. #elif COMPILE_TEMPLATE_AMD3DNOW
  37. #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
  38. #endif
  39. #if COMPILE_TEMPLATE_MMX2
  40. #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
  41. #else
  42. #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
  43. #endif
  44. #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
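/* PREFETCH selects prefetchnta (MMX2), prefetch (3DNow!) or a nop, and MOVNTQ
 * maps to the non-temporal movntq store on MMX2 while falling back to an
 * ordinary movq elsewhere, so the single template body below can be compiled
 * once per CPU flavour (MMX, MMX2, 3DNow!). */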
  45. #if COMPILE_TEMPLATE_ALTIVEC
  46. #include "ppc/swscale_altivec_template.c"
  47. #endif
  48. #define YSCALEYUV2YV12X(x, offset, dest, width) \
  49. __asm__ volatile(\
  50. "xor %%"REG_a", %%"REG_a" \n\t"\
  51. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  52. "movq %%mm3, %%mm4 \n\t"\
  53. "lea " offset "(%0), %%"REG_d" \n\t"\
  54. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  55. ASMALIGN(4) /* FIXME Unroll? */\
  56. "1: \n\t"\
  57. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  58. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
  59. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
  60. "add $16, %%"REG_d" \n\t"\
  61. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  62. "test %%"REG_S", %%"REG_S" \n\t"\
  63. "pmulhw %%mm0, %%mm2 \n\t"\
  64. "pmulhw %%mm0, %%mm5 \n\t"\
  65. "paddw %%mm2, %%mm3 \n\t"\
  66. "paddw %%mm5, %%mm4 \n\t"\
  67. " jnz 1b \n\t"\
  68. "psraw $3, %%mm3 \n\t"\
  69. "psraw $3, %%mm4 \n\t"\
  70. "packuswb %%mm4, %%mm3 \n\t"\
  71. MOVNTQ(%%mm3, (%1, %%REGa))\
  72. "add $8, %%"REG_a" \n\t"\
  73. "cmp %2, %%"REG_a" \n\t"\
  74. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  75. "movq %%mm3, %%mm4 \n\t"\
  76. "lea " offset "(%0), %%"REG_d" \n\t"\
  77. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  78. "jb 1b \n\t"\
  79. :: "r" (&c->redDither),\
  80. "r" (dest), "g" (width)\
  81. : "%"REG_a, "%"REG_d, "%"REG_S\
  82. );
  83. #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
  84. __asm__ volatile(\
  85. "lea " offset "(%0), %%"REG_d" \n\t"\
  86. "xor %%"REG_a", %%"REG_a" \n\t"\
  87. "pxor %%mm4, %%mm4 \n\t"\
  88. "pxor %%mm5, %%mm5 \n\t"\
  89. "pxor %%mm6, %%mm6 \n\t"\
  90. "pxor %%mm7, %%mm7 \n\t"\
  91. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  92. ASMALIGN(4) \
  93. "1: \n\t"\
  94. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
  95. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
  96. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  97. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
  98. "movq %%mm0, %%mm3 \n\t"\
  99. "punpcklwd %%mm1, %%mm0 \n\t"\
  100. "punpckhwd %%mm1, %%mm3 \n\t"\
  101. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
  102. "pmaddwd %%mm1, %%mm0 \n\t"\
  103. "pmaddwd %%mm1, %%mm3 \n\t"\
  104. "paddd %%mm0, %%mm4 \n\t"\
  105. "paddd %%mm3, %%mm5 \n\t"\
  106. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
  107. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  108. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  109. "test %%"REG_S", %%"REG_S" \n\t"\
  110. "movq %%mm2, %%mm0 \n\t"\
  111. "punpcklwd %%mm3, %%mm2 \n\t"\
  112. "punpckhwd %%mm3, %%mm0 \n\t"\
  113. "pmaddwd %%mm1, %%mm2 \n\t"\
  114. "pmaddwd %%mm1, %%mm0 \n\t"\
  115. "paddd %%mm2, %%mm6 \n\t"\
  116. "paddd %%mm0, %%mm7 \n\t"\
  117. " jnz 1b \n\t"\
  118. "psrad $16, %%mm4 \n\t"\
  119. "psrad $16, %%mm5 \n\t"\
  120. "psrad $16, %%mm6 \n\t"\
  121. "psrad $16, %%mm7 \n\t"\
  122. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  123. "packssdw %%mm5, %%mm4 \n\t"\
  124. "packssdw %%mm7, %%mm6 \n\t"\
  125. "paddw %%mm0, %%mm4 \n\t"\
  126. "paddw %%mm0, %%mm6 \n\t"\
  127. "psraw $3, %%mm4 \n\t"\
  128. "psraw $3, %%mm6 \n\t"\
  129. "packuswb %%mm6, %%mm4 \n\t"\
  130. MOVNTQ(%%mm4, (%1, %%REGa))\
  131. "add $8, %%"REG_a" \n\t"\
  132. "cmp %2, %%"REG_a" \n\t"\
  133. "lea " offset "(%0), %%"REG_d" \n\t"\
  134. "pxor %%mm4, %%mm4 \n\t"\
  135. "pxor %%mm5, %%mm5 \n\t"\
  136. "pxor %%mm6, %%mm6 \n\t"\
  137. "pxor %%mm7, %%mm7 \n\t"\
  138. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  139. "jb 1b \n\t"\
  140. :: "r" (&c->redDither),\
  141. "r" (dest), "g" (width)\
  142. : "%"REG_a, "%"REG_d, "%"REG_S\
  143. );
  144. #define YSCALEYUV2YV121 \
  145. "mov %2, %%"REG_a" \n\t"\
  146. ASMALIGN(4) /* FIXME Unroll? */\
  147. "1: \n\t"\
  148. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
  149. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
  150. "psraw $7, %%mm0 \n\t"\
  151. "psraw $7, %%mm1 \n\t"\
  152. "packuswb %%mm1, %%mm0 \n\t"\
  153. MOVNTQ(%%mm0, (%1, %%REGa))\
  154. "add $8, %%"REG_a" \n\t"\
  155. "jnc 1b \n\t"
  156. #define YSCALEYUV2YV121_ACCURATE \
  157. "mov %2, %%"REG_a" \n\t"\
  158. "pcmpeqw %%mm7, %%mm7 \n\t"\
  159. "psrlw $15, %%mm7 \n\t"\
  160. "psllw $6, %%mm7 \n\t"\
  161. ASMALIGN(4) /* FIXME Unroll? */\
  162. "1: \n\t"\
  163. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
  164. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
  165. "paddsw %%mm7, %%mm0 \n\t"\
  166. "paddsw %%mm7, %%mm1 \n\t"\
  167. "psraw $7, %%mm0 \n\t"\
  168. "psraw $7, %%mm1 \n\t"\
  169. "packuswb %%mm1, %%mm0 \n\t"\
  170. MOVNTQ(%%mm0, (%1, %%REGa))\
  171. "add $8, %%"REG_a" \n\t"\
  172. "jnc 1b \n\t"
  173. /*
  174. :: "m" (-lumFilterSize), "m" (-chrFilterSize),
  175. "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
  176. "r" (dest), "m" (dstW),
  177. "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
  178. : "%eax", "%ebx", "%ecx", "%edx", "%esi"
  179. */
  180. #define YSCALEYUV2PACKEDX_UV \
  181. __asm__ volatile(\
  182. "xor %%"REG_a", %%"REG_a" \n\t"\
  183. ASMALIGN(4)\
  184. "nop \n\t"\
  185. "1: \n\t"\
  186. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  187. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  188. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  189. "movq %%mm3, %%mm4 \n\t"\
  190. ASMALIGN(4)\
  191. "2: \n\t"\
  192. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  193. "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
  194. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
  195. "add $16, %%"REG_d" \n\t"\
  196. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  197. "pmulhw %%mm0, %%mm2 \n\t"\
  198. "pmulhw %%mm0, %%mm5 \n\t"\
  199. "paddw %%mm2, %%mm3 \n\t"\
  200. "paddw %%mm5, %%mm4 \n\t"\
  201. "test %%"REG_S", %%"REG_S" \n\t"\
  202. " jnz 2b \n\t"\
  203. #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
  204. "lea "offset"(%0), %%"REG_d" \n\t"\
  205. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  206. "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
  207. "movq "#dst1", "#dst2" \n\t"\
  208. ASMALIGN(4)\
  209. "2: \n\t"\
  210. "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
  211. "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
  212. "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
  213. "add $16, %%"REG_d" \n\t"\
  214. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  215. "pmulhw "#coeff", "#src1" \n\t"\
  216. "pmulhw "#coeff", "#src2" \n\t"\
  217. "paddw "#src1", "#dst1" \n\t"\
  218. "paddw "#src2", "#dst2" \n\t"\
  219. "test %%"REG_S", %%"REG_S" \n\t"\
  220. " jnz 2b \n\t"\
  221. #define YSCALEYUV2PACKEDX \
  222. YSCALEYUV2PACKEDX_UV \
  223. YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7)
  224. #define YSCALEYUV2PACKEDX_END \
  225. :: "r" (&c->redDither), \
  226. "m" (dummy), "m" (dummy), "m" (dummy),\
  227. "r" (dest), "m" (dstW) \
  228. : "%"REG_a, "%"REG_d, "%"REG_S \
  229. );
  230. #define YSCALEYUV2PACKEDX_ACCURATE_UV \
  231. __asm__ volatile(\
  232. "xor %%"REG_a", %%"REG_a" \n\t"\
  233. ASMALIGN(4)\
  234. "nop \n\t"\
  235. "1: \n\t"\
  236. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  237. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  238. "pxor %%mm4, %%mm4 \n\t"\
  239. "pxor %%mm5, %%mm5 \n\t"\
  240. "pxor %%mm6, %%mm6 \n\t"\
  241. "pxor %%mm7, %%mm7 \n\t"\
  242. ASMALIGN(4)\
  243. "2: \n\t"\
  244. "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
  245. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
  246. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  247. "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
  248. "movq %%mm0, %%mm3 \n\t"\
  249. "punpcklwd %%mm1, %%mm0 \n\t"\
  250. "punpckhwd %%mm1, %%mm3 \n\t"\
  251. "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
  252. "pmaddwd %%mm1, %%mm0 \n\t"\
  253. "pmaddwd %%mm1, %%mm3 \n\t"\
  254. "paddd %%mm0, %%mm4 \n\t"\
  255. "paddd %%mm3, %%mm5 \n\t"\
  256. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
  257. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  258. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  259. "test %%"REG_S", %%"REG_S" \n\t"\
  260. "movq %%mm2, %%mm0 \n\t"\
  261. "punpcklwd %%mm3, %%mm2 \n\t"\
  262. "punpckhwd %%mm3, %%mm0 \n\t"\
  263. "pmaddwd %%mm1, %%mm2 \n\t"\
  264. "pmaddwd %%mm1, %%mm0 \n\t"\
  265. "paddd %%mm2, %%mm6 \n\t"\
  266. "paddd %%mm0, %%mm7 \n\t"\
  267. " jnz 2b \n\t"\
  268. "psrad $16, %%mm4 \n\t"\
  269. "psrad $16, %%mm5 \n\t"\
  270. "psrad $16, %%mm6 \n\t"\
  271. "psrad $16, %%mm7 \n\t"\
  272. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  273. "packssdw %%mm5, %%mm4 \n\t"\
  274. "packssdw %%mm7, %%mm6 \n\t"\
  275. "paddw %%mm0, %%mm4 \n\t"\
  276. "paddw %%mm0, %%mm6 \n\t"\
  277. "movq %%mm4, "U_TEMP"(%0) \n\t"\
  278. "movq %%mm6, "V_TEMP"(%0) \n\t"\
  279. #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
  280. "lea "offset"(%0), %%"REG_d" \n\t"\
  281. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  282. "pxor %%mm1, %%mm1 \n\t"\
  283. "pxor %%mm5, %%mm5 \n\t"\
  284. "pxor %%mm7, %%mm7 \n\t"\
  285. "pxor %%mm6, %%mm6 \n\t"\
  286. ASMALIGN(4)\
  287. "2: \n\t"\
  288. "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
  289. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
  290. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  291. "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
  292. "movq %%mm0, %%mm3 \n\t"\
  293. "punpcklwd %%mm4, %%mm0 \n\t"\
  294. "punpckhwd %%mm4, %%mm3 \n\t"\
  295. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
  296. "pmaddwd %%mm4, %%mm0 \n\t"\
  297. "pmaddwd %%mm4, %%mm3 \n\t"\
  298. "paddd %%mm0, %%mm1 \n\t"\
  299. "paddd %%mm3, %%mm5 \n\t"\
  300. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
  301. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  302. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  303. "test %%"REG_S", %%"REG_S" \n\t"\
  304. "movq %%mm2, %%mm0 \n\t"\
  305. "punpcklwd %%mm3, %%mm2 \n\t"\
  306. "punpckhwd %%mm3, %%mm0 \n\t"\
  307. "pmaddwd %%mm4, %%mm2 \n\t"\
  308. "pmaddwd %%mm4, %%mm0 \n\t"\
  309. "paddd %%mm2, %%mm7 \n\t"\
  310. "paddd %%mm0, %%mm6 \n\t"\
  311. " jnz 2b \n\t"\
  312. "psrad $16, %%mm1 \n\t"\
  313. "psrad $16, %%mm5 \n\t"\
  314. "psrad $16, %%mm7 \n\t"\
  315. "psrad $16, %%mm6 \n\t"\
  316. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  317. "packssdw %%mm5, %%mm1 \n\t"\
  318. "packssdw %%mm6, %%mm7 \n\t"\
  319. "paddw %%mm0, %%mm1 \n\t"\
  320. "paddw %%mm0, %%mm7 \n\t"\
  321. "movq "U_TEMP"(%0), %%mm3 \n\t"\
  322. "movq "V_TEMP"(%0), %%mm4 \n\t"\
  323. #define YSCALEYUV2PACKEDX_ACCURATE \
  324. YSCALEYUV2PACKEDX_ACCURATE_UV \
  325. YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
  326. #define YSCALEYUV2RGBX \
  327. "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
  328. "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
  329. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  330. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  331. "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
  332. "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
  333. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  334. "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
  335. "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
  336. "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
  337. "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
  338. "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
  339. "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
  340. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  341. "paddw %%mm3, %%mm4 \n\t"\
  342. "movq %%mm2, %%mm0 \n\t"\
  343. "movq %%mm5, %%mm6 \n\t"\
  344. "movq %%mm4, %%mm3 \n\t"\
  345. "punpcklwd %%mm2, %%mm2 \n\t"\
  346. "punpcklwd %%mm5, %%mm5 \n\t"\
  347. "punpcklwd %%mm4, %%mm4 \n\t"\
  348. "paddw %%mm1, %%mm2 \n\t"\
  349. "paddw %%mm1, %%mm5 \n\t"\
  350. "paddw %%mm1, %%mm4 \n\t"\
  351. "punpckhwd %%mm0, %%mm0 \n\t"\
  352. "punpckhwd %%mm6, %%mm6 \n\t"\
  353. "punpckhwd %%mm3, %%mm3 \n\t"\
  354. "paddw %%mm7, %%mm0 \n\t"\
  355. "paddw %%mm7, %%mm6 \n\t"\
  356. "paddw %%mm7, %%mm3 \n\t"\
  357. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  358. "packuswb %%mm0, %%mm2 \n\t"\
  359. "packuswb %%mm6, %%mm5 \n\t"\
  360. "packuswb %%mm3, %%mm4 \n\t"\
  361. #define REAL_YSCALEYUV2PACKED(index, c) \
  362. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  363. "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
  364. "psraw $3, %%mm0 \n\t"\
  365. "psraw $3, %%mm1 \n\t"\
  366. "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  367. "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  368. "xor "#index", "#index" \n\t"\
  369. ASMALIGN(4)\
  370. "1: \n\t"\
  371. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  372. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  373. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  374. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  375. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  376. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  377. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  378. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  379. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  380. "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  381. "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  382. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  383. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  384. "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  385. "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  386. "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  387. "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  388. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  389. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  390. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  391. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  392. "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  393. "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  394. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  395. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  396. #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
  397. #define REAL_YSCALEYUV2RGB_UV(index, c) \
  398. "xor "#index", "#index" \n\t"\
  399. ASMALIGN(4)\
  400. "1: \n\t"\
  401. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  402. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  403. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  404. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  405. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  406. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  407. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  408. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  409. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  410. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  411. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  412. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  413. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  414. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  415. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  416. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  417. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  418. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  419. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  420. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */
  421. #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
  422. "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  423. "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  424. "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  425. "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  426. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  427. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  428. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  429. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  430. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  431. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  432. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  433. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  434. #define REAL_YSCALEYUV2RGB_COEFF(c) \
  435. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  436. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  437. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  438. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  439. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  440. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  441. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  442. "paddw %%mm3, %%mm4 \n\t"\
  443. "movq %%mm2, %%mm0 \n\t"\
  444. "movq %%mm5, %%mm6 \n\t"\
  445. "movq %%mm4, %%mm3 \n\t"\
  446. "punpcklwd %%mm2, %%mm2 \n\t"\
  447. "punpcklwd %%mm5, %%mm5 \n\t"\
  448. "punpcklwd %%mm4, %%mm4 \n\t"\
  449. "paddw %%mm1, %%mm2 \n\t"\
  450. "paddw %%mm1, %%mm5 \n\t"\
  451. "paddw %%mm1, %%mm4 \n\t"\
  452. "punpckhwd %%mm0, %%mm0 \n\t"\
  453. "punpckhwd %%mm6, %%mm6 \n\t"\
  454. "punpckhwd %%mm3, %%mm3 \n\t"\
  455. "paddw %%mm7, %%mm0 \n\t"\
  456. "paddw %%mm7, %%mm6 \n\t"\
  457. "paddw %%mm7, %%mm3 \n\t"\
  458. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  459. "packuswb %%mm0, %%mm2 \n\t"\
  460. "packuswb %%mm6, %%mm5 \n\t"\
  461. "packuswb %%mm3, %%mm4 \n\t"\
  462. #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
  463. #define YSCALEYUV2RGB(index, c) \
  464. REAL_YSCALEYUV2RGB_UV(index, c) \
  465. REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
  466. REAL_YSCALEYUV2RGB_COEFF(c)
  467. #define REAL_YSCALEYUV2PACKED1(index, c) \
  468. "xor "#index", "#index" \n\t"\
  469. ASMALIGN(4)\
  470. "1: \n\t"\
  471. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  472. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  473. "psraw $7, %%mm3 \n\t" \
  474. "psraw $7, %%mm4 \n\t" \
  475. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  476. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  477. "psraw $7, %%mm1 \n\t" \
  478. "psraw $7, %%mm7 \n\t" \
  479. #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
  480. #define REAL_YSCALEYUV2RGB1(index, c) \
  481. "xor "#index", "#index" \n\t"\
  482. ASMALIGN(4)\
  483. "1: \n\t"\
  484. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  485. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  486. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  487. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  488. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  489. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  490. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  491. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  492. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  493. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  494. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  495. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  496. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  497. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  498. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  499. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  500. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  501. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  502. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  503. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  504. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  505. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  506. "paddw %%mm3, %%mm4 \n\t"\
  507. "movq %%mm2, %%mm0 \n\t"\
  508. "movq %%mm5, %%mm6 \n\t"\
  509. "movq %%mm4, %%mm3 \n\t"\
  510. "punpcklwd %%mm2, %%mm2 \n\t"\
  511. "punpcklwd %%mm5, %%mm5 \n\t"\
  512. "punpcklwd %%mm4, %%mm4 \n\t"\
  513. "paddw %%mm1, %%mm2 \n\t"\
  514. "paddw %%mm1, %%mm5 \n\t"\
  515. "paddw %%mm1, %%mm4 \n\t"\
  516. "punpckhwd %%mm0, %%mm0 \n\t"\
  517. "punpckhwd %%mm6, %%mm6 \n\t"\
  518. "punpckhwd %%mm3, %%mm3 \n\t"\
  519. "paddw %%mm7, %%mm0 \n\t"\
  520. "paddw %%mm7, %%mm6 \n\t"\
  521. "paddw %%mm7, %%mm3 \n\t"\
  522. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  523. "packuswb %%mm0, %%mm2 \n\t"\
  524. "packuswb %%mm6, %%mm5 \n\t"\
  525. "packuswb %%mm3, %%mm4 \n\t"\
  526. #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
  527. #define REAL_YSCALEYUV2PACKED1b(index, c) \
  528. "xor "#index", "#index" \n\t"\
  529. ASMALIGN(4)\
  530. "1: \n\t"\
  531. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  532. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  533. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  534. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  535. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  536. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  537. "psrlw $8, %%mm3 \n\t" \
  538. "psrlw $8, %%mm4 \n\t" \
  539. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  540. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  541. "psraw $7, %%mm1 \n\t" \
  542. "psraw $7, %%mm7 \n\t"
  543. #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
  544. // do vertical chrominance interpolation
  545. #define REAL_YSCALEYUV2RGB1b(index, c) \
  546. "xor "#index", "#index" \n\t"\
  547. ASMALIGN(4)\
  548. "1: \n\t"\
  549. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  550. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  551. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  552. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  553. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  554. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  555. "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
  556. "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
  557. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  558. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  559. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  560. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  561. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  562. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  563. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  564. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  565. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  566. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  567. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  568. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  569. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  570. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  571. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  572. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  573. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  574. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  575. "paddw %%mm3, %%mm4 \n\t"\
  576. "movq %%mm2, %%mm0 \n\t"\
  577. "movq %%mm5, %%mm6 \n\t"\
  578. "movq %%mm4, %%mm3 \n\t"\
  579. "punpcklwd %%mm2, %%mm2 \n\t"\
  580. "punpcklwd %%mm5, %%mm5 \n\t"\
  581. "punpcklwd %%mm4, %%mm4 \n\t"\
  582. "paddw %%mm1, %%mm2 \n\t"\
  583. "paddw %%mm1, %%mm5 \n\t"\
  584. "paddw %%mm1, %%mm4 \n\t"\
  585. "punpckhwd %%mm0, %%mm0 \n\t"\
  586. "punpckhwd %%mm6, %%mm6 \n\t"\
  587. "punpckhwd %%mm3, %%mm3 \n\t"\
  588. "paddw %%mm7, %%mm0 \n\t"\
  589. "paddw %%mm7, %%mm6 \n\t"\
  590. "paddw %%mm7, %%mm3 \n\t"\
  591. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  592. "packuswb %%mm0, %%mm2 \n\t"\
  593. "packuswb %%mm6, %%mm5 \n\t"\
  594. "packuswb %%mm3, %%mm4 \n\t"\
  595. #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
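/* The *RGB1 macro above reads a single chroma line (uvbuf0, >>4) while *RGB1b
 * averages two chroma lines first; as a scalar sketch:
 *
 *     U = (uvbuf0[i] + uvbuf1[i]) >> 5;   // 1b: average of both lines
 *     U =  uvbuf0[i]              >> 4;   // 1:  single line, same scale
 *
 * after which both take the same (U-128)/(V-128) offset and coefficient path. */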
  596. #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
  597. "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
  598. "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
  599. "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
  600. "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
  601. "packuswb %%mm1, %%mm7 \n\t"
  602. #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
  603. #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
  604. "movq "#b", "#q2" \n\t" /* B */\
  605. "movq "#r", "#t" \n\t" /* R */\
  606. "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
  607. "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
  608. "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
  609. "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
  610. "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
  611. "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
  612. "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
  613. "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
  614. "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
  615. "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
  616. \
  617. MOVNTQ( q0, (dst, index, 4))\
  618. MOVNTQ( b, 8(dst, index, 4))\
  619. MOVNTQ( q2, 16(dst, index, 4))\
  620. MOVNTQ( q3, 24(dst, index, 4))\
  621. \
  622. "add $8, "#index" \n\t"\
  623. "cmp "#dstw", "#index" \n\t"\
  624. " jb 1b \n\t"
  625. #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
  626. #define REAL_WRITERGB16(dst, dstw, index) \
  627. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  628. "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
  629. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  630. "psrlq $3, %%mm2 \n\t"\
  631. \
  632. "movq %%mm2, %%mm1 \n\t"\
  633. "movq %%mm4, %%mm3 \n\t"\
  634. \
  635. "punpcklbw %%mm7, %%mm3 \n\t"\
  636. "punpcklbw %%mm5, %%mm2 \n\t"\
  637. "punpckhbw %%mm7, %%mm4 \n\t"\
  638. "punpckhbw %%mm5, %%mm1 \n\t"\
  639. \
  640. "psllq $3, %%mm3 \n\t"\
  641. "psllq $3, %%mm4 \n\t"\
  642. \
  643. "por %%mm3, %%mm2 \n\t"\
  644. "por %%mm4, %%mm1 \n\t"\
  645. \
  646. MOVNTQ(%%mm2, (dst, index, 2))\
  647. MOVNTQ(%%mm1, 8(dst, index, 2))\
  648. \
  649. "add $8, "#index" \n\t"\
  650. "cmp "#dstw", "#index" \n\t"\
  651. " jb 1b \n\t"
  652. #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
  653. #define REAL_WRITERGB15(dst, dstw, index) \
  654. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  655. "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
  656. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  657. "psrlq $3, %%mm2 \n\t"\
  658. "psrlq $1, %%mm5 \n\t"\
  659. \
  660. "movq %%mm2, %%mm1 \n\t"\
  661. "movq %%mm4, %%mm3 \n\t"\
  662. \
  663. "punpcklbw %%mm7, %%mm3 \n\t"\
  664. "punpcklbw %%mm5, %%mm2 \n\t"\
  665. "punpckhbw %%mm7, %%mm4 \n\t"\
  666. "punpckhbw %%mm5, %%mm1 \n\t"\
  667. \
  668. "psllq $2, %%mm3 \n\t"\
  669. "psllq $2, %%mm4 \n\t"\
  670. \
  671. "por %%mm3, %%mm2 \n\t"\
  672. "por %%mm4, %%mm1 \n\t"\
  673. \
  674. MOVNTQ(%%mm2, (dst, index, 2))\
  675. MOVNTQ(%%mm1, 8(dst, index, 2))\
  676. \
  677. "add $8, "#index" \n\t"\
  678. "cmp "#dstw", "#index" \n\t"\
  679. " jb 1b \n\t"
  680. #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
  681. #define WRITEBGR24OLD(dst, dstw, index) \
  682. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  683. "movq %%mm2, %%mm1 \n\t" /* B */\
  684. "movq %%mm5, %%mm6 \n\t" /* R */\
  685. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  686. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  687. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  688. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  689. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  690. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  691. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  692. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  693. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  694. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  695. \
  696. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  697. "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
  698. "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
  699. "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
  700. "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
  701. "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
  702. "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
  703. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  704. \
  705. "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  706. "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
  707. "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
  708. "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
  709. "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
  710. "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
  711. "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
  712. "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
  713. "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
  714. "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
  715. "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
  716. "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
  717. "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
  718. \
  719. "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
  720. "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
  721. "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
  722. "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
  723. "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
  724. "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
  725. "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
  726. "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
  727. \
  728. MOVNTQ(%%mm0, (dst))\
  729. MOVNTQ(%%mm2, 8(dst))\
  730. MOVNTQ(%%mm3, 16(dst))\
  731. "add $24, "#dst" \n\t"\
  732. \
  733. "add $8, "#index" \n\t"\
  734. "cmp "#dstw", "#index" \n\t"\
  735. " jb 1b \n\t"
  736. #define WRITEBGR24MMX(dst, dstw, index) \
  737. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  738. "movq %%mm2, %%mm1 \n\t" /* B */\
  739. "movq %%mm5, %%mm6 \n\t" /* R */\
  740. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  741. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  742. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  743. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  744. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  745. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  746. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  747. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  748. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  749. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  750. \
  751. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  752. "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
  753. "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
  754. "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
  755. \
  756. "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
  757. "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
  758. "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
  759. "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
  760. \
  761. "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
  762. "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
  763. "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
  764. "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
  765. \
  766. "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
  767. "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
  768. "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
  769. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  770. MOVNTQ(%%mm0, (dst))\
  771. \
  772. "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
  773. "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
  774. "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
  775. "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
  776. MOVNTQ(%%mm6, 8(dst))\
  777. \
  778. "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
  779. "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
  780. "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
  781. MOVNTQ(%%mm5, 16(dst))\
  782. \
  783. "add $24, "#dst" \n\t"\
  784. \
  785. "add $8, "#index" \n\t"\
  786. "cmp "#dstw", "#index" \n\t"\
  787. " jb 1b \n\t"
  788. #define WRITEBGR24MMX2(dst, dstw, index) \
  789. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  790. "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
  791. "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
  792. "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
  793. "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
  794. "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
  795. \
  796. "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
  797. "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
  798. "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
  799. \
  800. "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
  801. "por %%mm1, %%mm6 \n\t"\
  802. "por %%mm3, %%mm6 \n\t"\
  803. MOVNTQ(%%mm6, (dst))\
  804. \
  805. "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
  806. "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
  807. "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
  808. "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
  809. \
  810. "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
  811. "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
  812. "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
  813. \
  814. "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
  815. "por %%mm3, %%mm6 \n\t"\
  816. MOVNTQ(%%mm6, 8(dst))\
  817. \
  818. "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
  819. "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
  820. "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
  821. \
  822. "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
  823. "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
  824. "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
  825. \
  826. "por %%mm1, %%mm3 \n\t"\
  827. "por %%mm3, %%mm6 \n\t"\
  828. MOVNTQ(%%mm6, 16(dst))\
  829. \
  830. "add $24, "#dst" \n\t"\
  831. \
  832. "add $8, "#index" \n\t"\
  833. "cmp "#dstw", "#index" \n\t"\
  834. " jb 1b \n\t"
  835. #if COMPILE_TEMPLATE_MMX2
  836. #undef WRITEBGR24
  837. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
  838. #else
  839. #undef WRITEBGR24
  840. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
  841. #endif
  842. #define REAL_WRITEYUY2(dst, dstw, index) \
  843. "packuswb %%mm3, %%mm3 \n\t"\
  844. "packuswb %%mm4, %%mm4 \n\t"\
  845. "packuswb %%mm7, %%mm1 \n\t"\
  846. "punpcklbw %%mm4, %%mm3 \n\t"\
  847. "movq %%mm1, %%mm7 \n\t"\
  848. "punpcklbw %%mm3, %%mm1 \n\t"\
  849. "punpckhbw %%mm3, %%mm7 \n\t"\
  850. \
  851. MOVNTQ(%%mm1, (dst, index, 2))\
  852. MOVNTQ(%%mm7, 8(dst, index, 2))\
  853. \
  854. "add $8, "#index" \n\t"\
  855. "cmp "#dstw", "#index" \n\t"\
  856. " jb 1b \n\t"
  857. #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
  858. static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
  859. const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
  860. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
  861. {
  862. #if COMPILE_TEMPLATE_MMX
  863. if(!(c->flags & SWS_BITEXACT)) {
  864. if (c->flags & SWS_ACCURATE_RND) {
  865. if (uDest) {
  866. YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
  867. YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
  868. }
  869. if (CONFIG_SWSCALE_ALPHA && aDest) {
  870. YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
  871. }
  872. YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
  873. } else {
  874. if (uDest) {
  875. YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
  876. YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
  877. }
  878. if (CONFIG_SWSCALE_ALPHA && aDest) {
  879. YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
  880. }
  881. YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
  882. }
  883. return;
  884. }
  885. #endif
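/* The MMX path above computes, per output sample, approximately
 *
 *     int acc = rounder;                                  // VROUNDER_OFFSET
 *     for (j = 0; j < lumFilterSize; j++)
 *         acc += (lumSrc[j][i] * lumFilter[j]) >> 16;     // pmulhw
 *     dest[i] = av_clip_uint8(acc >> 3);                  // psraw $3 + packuswb
 *
 * with the _ACCURATE variant keeping 32-bit intermediates (pmaddwd) before the
 * same rounding and clip; the non-MMX paths below use the AltiVec or plain-C
 * implementations instead. */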
  886. #if COMPILE_TEMPLATE_ALTIVEC
  887. yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
  888. chrFilter, chrSrc, chrFilterSize,
  889. dest, uDest, vDest, dstW, chrDstW);
  890. #else //COMPILE_TEMPLATE_ALTIVEC
  891. yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
  892. chrFilter, chrSrc, chrFilterSize,
  893. alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
  894. #endif //!COMPILE_TEMPLATE_ALTIVEC
  895. }
  896. static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
  897. const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
  898. uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
  899. {
  900. yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
  901. chrFilter, chrSrc, chrFilterSize,
  902. dest, uDest, dstW, chrDstW, dstFormat);
  903. }
  904. static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
  905. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
  906. {
  907. int i;
  908. #if COMPILE_TEMPLATE_MMX
  909. if(!(c->flags & SWS_BITEXACT)) {
  910. long p= 4;
  911. const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
  912. uint8_t *dst[4]= {aDest, dest, uDest, vDest};
  913. x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
  914. if (c->flags & SWS_ACCURATE_RND) {
  915. while(p--) {
  916. if (dst[p]) {
  917. __asm__ volatile(
  918. YSCALEYUV2YV121_ACCURATE
  919. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  920. "g" (-counter[p])
  921. : "%"REG_a
  922. );
  923. }
  924. }
  925. } else {
  926. while(p--) {
  927. if (dst[p]) {
  928. __asm__ volatile(
  929. YSCALEYUV2YV121
  930. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  931. "g" (-counter[p])
  932. : "%"REG_a
  933. );
  934. }
  935. }
  936. }
  937. return;
  938. }
  939. #endif
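/* C fallback for the unscaled vertical pass.  The (val & 256) tests below are a
 * cheap out-of-range check: with 16-bit input, (src + 64) >> 7 always lies in
 * [-256, 256], and bit 8 is set exactly when the result falls outside 0..255,
 * so the full clamp only runs on that rare path. */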
  940. for (i=0; i<dstW; i++) {
  941. int val= (lumSrc[i]+64)>>7;
  942. if (val&256) {
  943. if (val<0) val=0;
  944. else val=255;
  945. }
  946. dest[i]= val;
  947. }
  948. if (uDest)
  949. for (i=0; i<chrDstW; i++) {
  950. int u=(chrSrc[i ]+64)>>7;
  951. int v=(chrSrc[i + VOFW]+64)>>7;
  952. if ((u|v)&256) {
  953. if (u<0) u=0;
  954. else if (u>255) u=255;
  955. if (v<0) v=0;
  956. else if (v>255) v=255;
  957. }
  958. uDest[i]= u;
  959. vDest[i]= v;
  960. }
  961. if (CONFIG_SWSCALE_ALPHA && aDest)
  962. for (i=0; i<dstW; i++) {
  963. int val= (alpSrc[i]+64)>>7;
  964. aDest[i]= av_clip_uint8(val);
  965. }
  966. }
  967. /**
  968. * vertical scale YV12 to RGB
  969. */
  970. static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
  971. const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
  972. const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
  973. {
  974. #if COMPILE_TEMPLATE_MMX
  975. x86_reg dummy=0;
  976. if(!(c->flags & SWS_BITEXACT)) {
  977. if (c->flags & SWS_ACCURATE_RND) {
  978. switch(c->dstFormat) {
  979. case PIX_FMT_RGB32:
  980. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  981. YSCALEYUV2PACKEDX_ACCURATE
  982. YSCALEYUV2RGBX
  983. "movq %%mm2, "U_TEMP"(%0) \n\t"
  984. "movq %%mm4, "V_TEMP"(%0) \n\t"
  985. "movq %%mm5, "Y_TEMP"(%0) \n\t"
  986. YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
  987. "movq "Y_TEMP"(%0), %%mm5 \n\t"
  988. "psraw $3, %%mm1 \n\t"
  989. "psraw $3, %%mm7 \n\t"
  990. "packuswb %%mm7, %%mm1 \n\t"
  991. WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
  992. YSCALEYUV2PACKEDX_END
  993. } else {
  994. YSCALEYUV2PACKEDX_ACCURATE
  995. YSCALEYUV2RGBX
  996. "pcmpeqd %%mm7, %%mm7 \n\t"
  997. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  998. YSCALEYUV2PACKEDX_END
  999. }
  1000. return;
  1001. case PIX_FMT_BGR24:
  1002. YSCALEYUV2PACKEDX_ACCURATE
  1003. YSCALEYUV2RGBX
  1004. "pxor %%mm7, %%mm7 \n\t"
  1005. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
  1006. "add %4, %%"REG_c" \n\t"
  1007. WRITEBGR24(%%REGc, %5, %%REGa)
  1008. :: "r" (&c->redDither),
  1009. "m" (dummy), "m" (dummy), "m" (dummy),
  1010. "r" (dest), "m" (dstW)
  1011. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  1012. );
  1013. return;
  1014. case PIX_FMT_RGB555:
  1015. YSCALEYUV2PACKEDX_ACCURATE
  1016. YSCALEYUV2RGBX
  1017. "pxor %%mm7, %%mm7 \n\t"
  1018. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1019. #ifdef DITHER1XBPP
  1020. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  1021. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  1022. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  1023. #endif
  1024. WRITERGB15(%4, %5, %%REGa)
  1025. YSCALEYUV2PACKEDX_END
  1026. return;
  1027. case PIX_FMT_RGB565:
  1028. YSCALEYUV2PACKEDX_ACCURATE
  1029. YSCALEYUV2RGBX
  1030. "pxor %%mm7, %%mm7 \n\t"
  1031. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1032. #ifdef DITHER1XBPP
  1033. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  1034. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  1035. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  1036. #endif
  1037. WRITERGB16(%4, %5, %%REGa)
  1038. YSCALEYUV2PACKEDX_END
  1039. return;
  1040. case PIX_FMT_YUYV422:
  1041. YSCALEYUV2PACKEDX_ACCURATE
  1042. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1043. "psraw $3, %%mm3 \n\t"
  1044. "psraw $3, %%mm4 \n\t"
  1045. "psraw $3, %%mm1 \n\t"
  1046. "psraw $3, %%mm7 \n\t"
  1047. WRITEYUY2(%4, %5, %%REGa)
  1048. YSCALEYUV2PACKEDX_END
  1049. return;
  1050. }
  1051. } else {
  1052. switch(c->dstFormat) {
  1053. case PIX_FMT_RGB32:
  1054. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1055. YSCALEYUV2PACKEDX
  1056. YSCALEYUV2RGBX
  1057. YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
  1058. "psraw $3, %%mm1 \n\t"
  1059. "psraw $3, %%mm7 \n\t"
  1060. "packuswb %%mm7, %%mm1 \n\t"
  1061. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1062. YSCALEYUV2PACKEDX_END
  1063. } else {
  1064. YSCALEYUV2PACKEDX
  1065. YSCALEYUV2RGBX
  1066. "pcmpeqd %%mm7, %%mm7 \n\t"
  1067. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1068. YSCALEYUV2PACKEDX_END
  1069. }
  1070. return;
  1071. case PIX_FMT_BGR24:
  1072. YSCALEYUV2PACKEDX
  1073. YSCALEYUV2RGBX
  1074. "pxor %%mm7, %%mm7 \n\t"
  1075. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
  1076. "add %4, %%"REG_c" \n\t"
  1077. WRITEBGR24(%%REGc, %5, %%REGa)
  1078. :: "r" (&c->redDither),
  1079. "m" (dummy), "m" (dummy), "m" (dummy),
  1080. "r" (dest), "m" (dstW)
  1081. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  1082. );
  1083. return;
  1084. case PIX_FMT_RGB555:
  1085. YSCALEYUV2PACKEDX
  1086. YSCALEYUV2RGBX
  1087. "pxor %%mm7, %%mm7 \n\t"
  1088. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1089. #ifdef DITHER1XBPP
  1090. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  1091. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  1092. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  1093. #endif
  1094. WRITERGB15(%4, %5, %%REGa)
  1095. YSCALEYUV2PACKEDX_END
  1096. return;
  1097. case PIX_FMT_RGB565:
  1098. YSCALEYUV2PACKEDX
  1099. YSCALEYUV2RGBX
  1100. "pxor %%mm7, %%mm7 \n\t"
  1101. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1102. #ifdef DITHER1XBPP
  1103. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  1104. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  1105. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  1106. #endif
  1107. WRITERGB16(%4, %5, %%REGa)
  1108. YSCALEYUV2PACKEDX_END
  1109. return;
  1110. case PIX_FMT_YUYV422:
  1111. YSCALEYUV2PACKEDX
  1112. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1113. "psraw $3, %%mm3 \n\t"
  1114. "psraw $3, %%mm4 \n\t"
  1115. "psraw $3, %%mm1 \n\t"
  1116. "psraw $3, %%mm7 \n\t"
  1117. WRITEYUY2(%4, %5, %%REGa)
  1118. YSCALEYUV2PACKEDX_END
  1119. return;
  1120. }
  1121. }
  1122. }
  1123. #endif /* COMPILE_TEMPLATE_MMX */
  1124. #if COMPILE_TEMPLATE_ALTIVEC
  1125. /* The following list of supported dstFormat values should
  1126. match what's found in the body of ff_yuv2packedX_altivec() */
  1127. if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
  1128. (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
  1129. c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
  1130. c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
  1131. ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
  1132. chrFilter, chrSrc, chrFilterSize,
  1133. dest, dstW, dstY);
  1134. else
  1135. #endif
  1136. yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
  1137. chrFilter, chrSrc, chrFilterSize,
  1138. alpSrc, dest, dstW, dstY);
  1139. }
  1140. /**
  1141. * vertical bilinear scale YV12 to RGB
  1142. */
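/* buf0/buf1 (and uvbuf0/uvbuf1) are two adjacent source lines; each output
 * sample is the 12-bit weighted blend buf0*yalpha1 + buf1*yalpha (and
 * uvbuf0*uvalpha1 + uvbuf1*uvalpha, with yalpha1 = 4095 - yalpha and
 * uvalpha1 = 4095 - uvalpha) before conversion to the packed destination
 * format. */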
  1143. static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
  1144. const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
  1145. {
  1146. int yalpha1=4095- yalpha;
  1147. int uvalpha1=4095-uvalpha;
  1148. int i;
  1149. #if COMPILE_TEMPLATE_MMX
  1150. if(!(c->flags & SWS_BITEXACT)) {
  1151. switch(c->dstFormat) {
  1152. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  1153. case PIX_FMT_RGB32:
  1154. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1155. #if ARCH_X86_64
  1156. __asm__ volatile(
  1157. YSCALEYUV2RGB(%%r8, %5)
  1158. YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
  1159. "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1160. "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1161. "packuswb %%mm7, %%mm1 \n\t"
  1162. WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1163. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
  1164. "a" (&c->redDither)
  1165. ,"r" (abuf0), "r" (abuf1)
  1166. : "%r8"
  1167. );
  1168. #else
  1169. *(const uint16_t **)(&c->u_temp)=abuf0;
  1170. *(const uint16_t **)(&c->v_temp)=abuf1;
  1171. __asm__ volatile(
  1172. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1173. "mov %4, %%"REG_b" \n\t"
  1174. "push %%"REG_BP" \n\t"
  1175. YSCALEYUV2RGB(%%REGBP, %5)
  1176. "push %0 \n\t"
  1177. "push %1 \n\t"
  1178. "mov "U_TEMP"(%5), %0 \n\t"
  1179. "mov "V_TEMP"(%5), %1 \n\t"
  1180. YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
  1181. "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1182. "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1183. "packuswb %%mm7, %%mm1 \n\t"
  1184. "pop %1 \n\t"
  1185. "pop %0 \n\t"
  1186. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1187. "pop %%"REG_BP" \n\t"
  1188. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1189. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1190. "a" (&c->redDither)
  1191. );
  1192. #endif
  1193. } else {
  1194. __asm__ volatile(
  1195. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1196. "mov %4, %%"REG_b" \n\t"
  1197. "push %%"REG_BP" \n\t"
  1198. YSCALEYUV2RGB(%%REGBP, %5)
  1199. "pcmpeqd %%mm7, %%mm7 \n\t"
  1200. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1201. "pop %%"REG_BP" \n\t"
  1202. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1203. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1204. "a" (&c->redDither)
  1205. );
  1206. }
  1207. return;
  1208. case PIX_FMT_BGR24:
  1209. __asm__ volatile(
  1210. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1211. "mov %4, %%"REG_b" \n\t"
  1212. "push %%"REG_BP" \n\t"
  1213. YSCALEYUV2RGB(%%REGBP, %5)
  1214. "pxor %%mm7, %%mm7 \n\t"
  1215. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1216. "pop %%"REG_BP" \n\t"
  1217. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1218. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1219. "a" (&c->redDither)
  1220. );
  1221. return;
  1222. case PIX_FMT_RGB555:
  1223. __asm__ volatile(
  1224. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1225. "mov %4, %%"REG_b" \n\t"
  1226. "push %%"REG_BP" \n\t"
  1227. YSCALEYUV2RGB(%%REGBP, %5)
  1228. "pxor %%mm7, %%mm7 \n\t"
  1229. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1230. #ifdef DITHER1XBPP
  1231. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1232. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1233. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1234. #endif
  1235. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1236. "pop %%"REG_BP" \n\t"
  1237. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1238. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1239. "a" (&c->redDither)
  1240. );
  1241. return;
  1242. case PIX_FMT_RGB565:
  1243. __asm__ volatile(
  1244. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1245. "mov %4, %%"REG_b" \n\t"
  1246. "push %%"REG_BP" \n\t"
  1247. YSCALEYUV2RGB(%%REGBP, %5)
  1248. "pxor %%mm7, %%mm7 \n\t"
  1249. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1250. #ifdef DITHER1XBPP
  1251. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1252. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1253. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1254. #endif
  1255. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1256. "pop %%"REG_BP" \n\t"
  1257. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1258. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1259. "a" (&c->redDither)
  1260. );
  1261. return;
  1262. case PIX_FMT_YUYV422:
  1263. __asm__ volatile(
  1264. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1265. "mov %4, %%"REG_b" \n\t"
  1266. "push %%"REG_BP" \n\t"
  1267. YSCALEYUV2PACKED(%%REGBP, %5)
  1268. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1269. "pop %%"REG_BP" \n\t"
  1270. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1271. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1272. "a" (&c->redDither)
  1273. );
  1274. return;
  1275. default: break;
  1276. }
  1277. }
  1278. #endif //COMPILE_TEMPLATE_MMX
  1279. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
  1280. }
  1281. /**
  1282. * YV12 to RGB without scaling or interpolating
  1283. */
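/* Chroma handling depends on uvalpha: below 2048 a single chroma line is used
 * as-is (the cheaper path noted in the comment inside the function, which
 * shifts chroma by half a line); at 2048 and above uvbuf0 and uvbuf1 are
 * averaged via the "1b" variants. */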
  1284. static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
  1285. const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
  1286. {
  1287. const int yalpha1=0;
  1288. int i;
  1289. const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1290. const int yalpha= 4096; //FIXME ...
  1291. if (flags&SWS_FULL_CHR_H_INT) {
  1292. c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
  1293. return;
  1294. }
  1295. #if COMPILE_TEMPLATE_MMX
  1296. if(!(flags & SWS_BITEXACT)) {
  1297. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1298. switch(dstFormat) {
  1299. case PIX_FMT_RGB32:
  1300. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1301. __asm__ volatile(
  1302. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1303. "mov %4, %%"REG_b" \n\t"
  1304. "push %%"REG_BP" \n\t"
  1305. YSCALEYUV2RGB1(%%REGBP, %5)
  1306. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1307. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1308. "pop %%"REG_BP" \n\t"
  1309. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1310. :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1311. "a" (&c->redDither)
  1312. );
  1313. } else {
  1314. __asm__ volatile(
  1315. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1316. "mov %4, %%"REG_b" \n\t"
  1317. "push %%"REG_BP" \n\t"
  1318. YSCALEYUV2RGB1(%%REGBP, %5)
  1319. "pcmpeqd %%mm7, %%mm7 \n\t"
  1320. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1321. "pop %%"REG_BP" \n\t"
  1322. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1323. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1324. "a" (&c->redDither)
  1325. );
  1326. }
  1327. return;
  1328. case PIX_FMT_BGR24:
  1329. __asm__ volatile(
  1330. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1331. "mov %4, %%"REG_b" \n\t"
  1332. "push %%"REG_BP" \n\t"
  1333. YSCALEYUV2RGB1(%%REGBP, %5)
  1334. "pxor %%mm7, %%mm7 \n\t"
  1335. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1336. "pop %%"REG_BP" \n\t"
  1337. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1338. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1339. "a" (&c->redDither)
  1340. );
  1341. return;
  1342. case PIX_FMT_RGB555:
  1343. __asm__ volatile(
  1344. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1345. "mov %4, %%"REG_b" \n\t"
  1346. "push %%"REG_BP" \n\t"
  1347. YSCALEYUV2RGB1(%%REGBP, %5)
  1348. "pxor %%mm7, %%mm7 \n\t"
  1349. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1350. #ifdef DITHER1XBPP
  1351. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1352. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1353. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1354. #endif
  1355. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1356. "pop %%"REG_BP" \n\t"
  1357. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1358. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1359. "a" (&c->redDither)
  1360. );
  1361. return;
  1362. case PIX_FMT_RGB565:
  1363. __asm__ volatile(
  1364. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1365. "mov %4, %%"REG_b" \n\t"
  1366. "push %%"REG_BP" \n\t"
  1367. YSCALEYUV2RGB1(%%REGBP, %5)
  1368. "pxor %%mm7, %%mm7 \n\t"
  1369. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1370. #ifdef DITHER1XBPP
  1371. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1372. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1373. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1374. #endif
  1375. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1376. "pop %%"REG_BP" \n\t"
  1377. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1378. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1379. "a" (&c->redDither)
  1380. );
  1381. return;
  1382. case PIX_FMT_YUYV422:
  1383. __asm__ volatile(
  1384. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1385. "mov %4, %%"REG_b" \n\t"
  1386. "push %%"REG_BP" \n\t"
  1387. YSCALEYUV2PACKED1(%%REGBP, %5)
  1388. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1389. "pop %%"REG_BP" \n\t"
  1390. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1391. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1392. "a" (&c->redDither)
  1393. );
  1394. return;
  1395. }
  1396. } else {
  1397. switch(dstFormat) {
  1398. case PIX_FMT_RGB32:
  1399. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1400. __asm__ volatile(
  1401. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1402. "mov %4, %%"REG_b" \n\t"
  1403. "push %%"REG_BP" \n\t"
  1404. YSCALEYUV2RGB1b(%%REGBP, %5)
  1405. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1406. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1407. "pop %%"REG_BP" \n\t"
  1408. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1409. :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1410. "a" (&c->redDither)
  1411. );
  1412. } else {
  1413. __asm__ volatile(
  1414. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1415. "mov %4, %%"REG_b" \n\t"
  1416. "push %%"REG_BP" \n\t"
  1417. YSCALEYUV2RGB1b(%%REGBP, %5)
  1418. "pcmpeqd %%mm7, %%mm7 \n\t"
  1419. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1420. "pop %%"REG_BP" \n\t"
  1421. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1422. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1423. "a" (&c->redDither)
  1424. );
  1425. }
  1426. return;
  1427. case PIX_FMT_BGR24:
  1428. __asm__ volatile(
  1429. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1430. "mov %4, %%"REG_b" \n\t"
  1431. "push %%"REG_BP" \n\t"
  1432. YSCALEYUV2RGB1b(%%REGBP, %5)
  1433. "pxor %%mm7, %%mm7 \n\t"
  1434. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1435. "pop %%"REG_BP" \n\t"
  1436. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1437. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1438. "a" (&c->redDither)
  1439. );
  1440. return;
  1441. case PIX_FMT_RGB555:
  1442. __asm__ volatile(
  1443. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1444. "mov %4, %%"REG_b" \n\t"
  1445. "push %%"REG_BP" \n\t"
  1446. YSCALEYUV2RGB1b(%%REGBP, %5)
  1447. "pxor %%mm7, %%mm7 \n\t"
  1448. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1449. #ifdef DITHER1XBPP
  1450. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1451. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1452. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1453. #endif
  1454. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1455. "pop %%"REG_BP" \n\t"
  1456. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1457. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1458. "a" (&c->redDither)
  1459. );
  1460. return;
  1461. case PIX_FMT_RGB565:
  1462. __asm__ volatile(
  1463. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1464. "mov %4, %%"REG_b" \n\t"
  1465. "push %%"REG_BP" \n\t"
  1466. YSCALEYUV2RGB1b(%%REGBP, %5)
  1467. "pxor %%mm7, %%mm7 \n\t"
  1468. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1469. #ifdef DITHER1XBPP
  1470. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1471. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1472. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1473. #endif
  1474. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1475. "pop %%"REG_BP" \n\t"
  1476. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1477. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1478. "a" (&c->redDither)
  1479. );
  1480. return;
  1481. case PIX_FMT_YUYV422:
  1482. __asm__ volatile(
  1483. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1484. "mov %4, %%"REG_b" \n\t"
  1485. "push %%"REG_BP" \n\t"
  1486. YSCALEYUV2PACKED1b(%%REGBP, %5)
  1487. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1488. "pop %%"REG_BP" \n\t"
  1489. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1490. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1491. "a" (&c->redDither)
  1492. );
  1493. return;
  1494. }
  1495. }
  1496. }
  1497. #endif /* COMPILE_TEMPLATE_MMX */
  1498. if (uvalpha < 2048) {
  1499. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
  1500. } else {
  1501. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
  1502. }
  1503. }
1504. //FIXME yuy2* can read up to 7 samples too many
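/* Packed YUYV is laid out as Y0 U Y1 V per pixel pair, so luma lives at even
 * byte offsets and chroma at odd ones; the MMX paths mask with bm01010101 or
 * shift right by 8 to pick out the respective bytes (see the scalar
 * fallbacks: dst[i] = src[2*i], dstU[i] = src1[4*i+1], dstV[i] = src1[4*i+3]). */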
  1505. static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1506. {
  1507. #if COMPILE_TEMPLATE_MMX
  1508. __asm__ volatile(
  1509. "movq "MANGLE(bm01010101)", %%mm2 \n\t"
  1510. "mov %0, %%"REG_a" \n\t"
  1511. "1: \n\t"
  1512. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1513. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1514. "pand %%mm2, %%mm0 \n\t"
  1515. "pand %%mm2, %%mm1 \n\t"
  1516. "packuswb %%mm1, %%mm0 \n\t"
  1517. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1518. "add $8, %%"REG_a" \n\t"
  1519. " js 1b \n\t"
  1520. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1521. : "%"REG_a
  1522. );
  1523. #else
  1524. int i;
  1525. for (i=0; i<width; i++)
  1526. dst[i]= src[2*i];
  1527. #endif
  1528. }
  1529. static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1530. {
  1531. #if COMPILE_TEMPLATE_MMX
  1532. __asm__ volatile(
  1533. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1534. "mov %0, %%"REG_a" \n\t"
  1535. "1: \n\t"
  1536. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1537. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1538. "psrlw $8, %%mm0 \n\t"
  1539. "psrlw $8, %%mm1 \n\t"
  1540. "packuswb %%mm1, %%mm0 \n\t"
  1541. "movq %%mm0, %%mm1 \n\t"
  1542. "psrlw $8, %%mm0 \n\t"
  1543. "pand %%mm4, %%mm1 \n\t"
  1544. "packuswb %%mm0, %%mm0 \n\t"
  1545. "packuswb %%mm1, %%mm1 \n\t"
  1546. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1547. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1548. "add $4, %%"REG_a" \n\t"
  1549. " js 1b \n\t"
  1550. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1551. : "%"REG_a
  1552. );
  1553. #else
  1554. int i;
  1555. for (i=0; i<width; i++) {
  1556. dstU[i]= src1[4*i + 1];
  1557. dstV[i]= src1[4*i + 3];
  1558. }
  1559. #endif
  1560. assert(src1 == src2);
  1561. }
  1562. static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1563. {
  1564. #if COMPILE_TEMPLATE_MMX
  1565. __asm__ volatile(
  1566. "mov %0, %%"REG_a" \n\t"
  1567. "1: \n\t"
  1568. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1569. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1570. "movq (%2, %%"REG_a",2), %%mm2 \n\t"
  1571. "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
  1572. "psrlw $8, %%mm0 \n\t"
  1573. "psrlw $8, %%mm1 \n\t"
  1574. "psrlw $8, %%mm2 \n\t"
  1575. "psrlw $8, %%mm3 \n\t"
  1576. "packuswb %%mm1, %%mm0 \n\t"
  1577. "packuswb %%mm3, %%mm2 \n\t"
  1578. "movq %%mm0, (%3, %%"REG_a") \n\t"
  1579. "movq %%mm2, (%4, %%"REG_a") \n\t"
  1580. "add $8, %%"REG_a" \n\t"
  1581. " js 1b \n\t"
  1582. : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
  1583. : "%"REG_a
  1584. );
  1585. #else
  1586. int i;
  1587. for (i=0; i<width; i++) {
  1588. dstU[i]= src1[2*i + 1];
  1589. dstV[i]= src2[2*i + 1];
  1590. }
  1591. #endif
  1592. }
1593. /* This is almost identical to the previous function, and exists only because
1594. * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
  1595. static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1596. {
  1597. #if COMPILE_TEMPLATE_MMX
  1598. __asm__ volatile(
  1599. "mov %0, %%"REG_a" \n\t"
  1600. "1: \n\t"
  1601. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1602. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1603. "psrlw $8, %%mm0 \n\t"
  1604. "psrlw $8, %%mm1 \n\t"
  1605. "packuswb %%mm1, %%mm0 \n\t"
  1606. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1607. "add $8, %%"REG_a" \n\t"
  1608. " js 1b \n\t"
  1609. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1610. : "%"REG_a
  1611. );
  1612. #else
  1613. int i;
  1614. for (i=0; i<width; i++)
  1615. dst[i]= src[2*i+1];
  1616. #endif
  1617. }
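/* UYVY swaps the byte order to U Y0 V Y1, so here chroma sits at even offsets
 * (dstU[i] = src1[4*i+0], dstV[i] = src1[4*i+2]) and the mask/shift roles are
 * reversed relative to the YUYV variants above. */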
  1618. static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1619. {
  1620. #if COMPILE_TEMPLATE_MMX
  1621. __asm__ volatile(
  1622. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1623. "mov %0, %%"REG_a" \n\t"
  1624. "1: \n\t"
  1625. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1626. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1627. "pand %%mm4, %%mm0 \n\t"
  1628. "pand %%mm4, %%mm1 \n\t"
  1629. "packuswb %%mm1, %%mm0 \n\t"
  1630. "movq %%mm0, %%mm1 \n\t"
  1631. "psrlw $8, %%mm0 \n\t"
  1632. "pand %%mm4, %%mm1 \n\t"
  1633. "packuswb %%mm0, %%mm0 \n\t"
  1634. "packuswb %%mm1, %%mm1 \n\t"
  1635. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1636. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1637. "add $4, %%"REG_a" \n\t"
  1638. " js 1b \n\t"
  1639. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1640. : "%"REG_a
  1641. );
  1642. #else
  1643. int i;
  1644. for (i=0; i<width; i++) {
  1645. dstU[i]= src1[4*i + 0];
  1646. dstV[i]= src1[4*i + 2];
  1647. }
  1648. #endif
  1649. assert(src1 == src2);
  1650. }
  1651. static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1652. {
  1653. #if COMPILE_TEMPLATE_MMX
  1654. __asm__ volatile(
  1655. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1656. "mov %0, %%"REG_a" \n\t"
  1657. "1: \n\t"
  1658. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1659. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1660. "movq (%2, %%"REG_a",2), %%mm2 \n\t"
  1661. "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
  1662. "pand %%mm4, %%mm0 \n\t"
  1663. "pand %%mm4, %%mm1 \n\t"
  1664. "pand %%mm4, %%mm2 \n\t"
  1665. "pand %%mm4, %%mm3 \n\t"
  1666. "packuswb %%mm1, %%mm0 \n\t"
  1667. "packuswb %%mm3, %%mm2 \n\t"
  1668. "movq %%mm0, (%3, %%"REG_a") \n\t"
  1669. "movq %%mm2, (%4, %%"REG_a") \n\t"
  1670. "add $8, %%"REG_a" \n\t"
  1671. " js 1b \n\t"
  1672. : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
  1673. : "%"REG_a
  1674. );
  1675. #else
  1676. int i;
  1677. for (i=0; i<width; i++) {
  1678. dstU[i]= src1[2*i];
  1679. dstV[i]= src2[2*i];
  1680. }
  1681. #endif
  1682. }
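/* NV12/NV21 store chroma as a single interleaved U/V (or V/U) plane;
 * nvXXtoUV de-interleaves one such line into two planar lines, and the
 * nv12ToUV/nv21ToUV wrappers below merely swap the destination pointers. */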
  1683. static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
  1684. const uint8_t *src, long width)
  1685. {
  1686. #if COMPILE_TEMPLATE_MMX
  1687. __asm__ volatile(
  1688. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1689. "mov %0, %%"REG_a" \n\t"
  1690. "1: \n\t"
  1691. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1692. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1693. "movq %%mm0, %%mm2 \n\t"
  1694. "movq %%mm1, %%mm3 \n\t"
  1695. "pand %%mm4, %%mm0 \n\t"
  1696. "pand %%mm4, %%mm1 \n\t"
  1697. "psrlw $8, %%mm2 \n\t"
  1698. "psrlw $8, %%mm3 \n\t"
  1699. "packuswb %%mm1, %%mm0 \n\t"
  1700. "packuswb %%mm3, %%mm2 \n\t"
  1701. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1702. "movq %%mm2, (%3, %%"REG_a") \n\t"
  1703. "add $8, %%"REG_a" \n\t"
  1704. " js 1b \n\t"
  1705. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
  1706. : "%"REG_a
  1707. );
  1708. #else
  1709. int i;
  1710. for (i = 0; i < width; i++) {
  1711. dst1[i] = src[2*i+0];
  1712. dst2[i] = src[2*i+1];
  1713. }
  1714. #endif
  1715. }
  1716. static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
  1717. const uint8_t *src1, const uint8_t *src2,
  1718. long width, uint32_t *unused)
  1719. {
  1720. RENAME(nvXXtoUV)(dstU, dstV, src1, width);
  1721. }
  1722. static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
  1723. const uint8_t *src1, const uint8_t *src2,
  1724. long width, uint32_t *unused)
  1725. {
  1726. RENAME(nvXXtoUV)(dstV, dstU, src1, width);
  1727. }
  1728. #if COMPILE_TEMPLATE_MMX
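/* MMX RGB24/BGR24 -> Y: each iteration consumes 12 bytes (four pixels),
 * widens them to 16 bits, multiplies them against the packed coefficient
 * tables (ff_bgr24toY*Coeff or the rgb24 variants) with pmaddwd, adds the
 * rounding offset and shifts right by 15 -- the same RY*r + GY*g + BY*b
 * weighting as the scalar fallback further below. */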
  1729. static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
  1730. {
  1731. if(srcFormat == PIX_FMT_BGR24) {
  1732. __asm__ volatile(
  1733. "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
  1734. "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
  1735. :
  1736. );
  1737. } else {
  1738. __asm__ volatile(
  1739. "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
  1740. "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
  1741. :
  1742. );
  1743. }
  1744. __asm__ volatile(
  1745. "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
  1746. "mov %2, %%"REG_a" \n\t"
  1747. "pxor %%mm7, %%mm7 \n\t"
  1748. "1: \n\t"
  1749. PREFETCH" 64(%0) \n\t"
  1750. "movd (%0), %%mm0 \n\t"
  1751. "movd 2(%0), %%mm1 \n\t"
  1752. "movd 6(%0), %%mm2 \n\t"
  1753. "movd 8(%0), %%mm3 \n\t"
  1754. "add $12, %0 \n\t"
  1755. "punpcklbw %%mm7, %%mm0 \n\t"
  1756. "punpcklbw %%mm7, %%mm1 \n\t"
  1757. "punpcklbw %%mm7, %%mm2 \n\t"
  1758. "punpcklbw %%mm7, %%mm3 \n\t"
  1759. "pmaddwd %%mm5, %%mm0 \n\t"
  1760. "pmaddwd %%mm6, %%mm1 \n\t"
  1761. "pmaddwd %%mm5, %%mm2 \n\t"
  1762. "pmaddwd %%mm6, %%mm3 \n\t"
  1763. "paddd %%mm1, %%mm0 \n\t"
  1764. "paddd %%mm3, %%mm2 \n\t"
  1765. "paddd %%mm4, %%mm0 \n\t"
  1766. "paddd %%mm4, %%mm2 \n\t"
  1767. "psrad $15, %%mm0 \n\t"
  1768. "psrad $15, %%mm2 \n\t"
  1769. "packssdw %%mm2, %%mm0 \n\t"
  1770. "packuswb %%mm0, %%mm0 \n\t"
  1771. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1772. "add $4, %%"REG_a" \n\t"
  1773. " js 1b \n\t"
  1774. : "+r" (src)
  1775. : "r" (dst+width), "g" ((x86_reg)-width)
  1776. : "%"REG_a
  1777. );
  1778. }
  1779. static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
  1780. {
  1781. __asm__ volatile(
  1782. "movq 24+%4, %%mm6 \n\t"
  1783. "mov %3, %%"REG_a" \n\t"
  1784. "pxor %%mm7, %%mm7 \n\t"
  1785. "1: \n\t"
  1786. PREFETCH" 64(%0) \n\t"
  1787. "movd (%0), %%mm0 \n\t"
  1788. "movd 2(%0), %%mm1 \n\t"
  1789. "punpcklbw %%mm7, %%mm0 \n\t"
  1790. "punpcklbw %%mm7, %%mm1 \n\t"
  1791. "movq %%mm0, %%mm2 \n\t"
  1792. "movq %%mm1, %%mm3 \n\t"
  1793. "pmaddwd %4, %%mm0 \n\t"
  1794. "pmaddwd 8+%4, %%mm1 \n\t"
  1795. "pmaddwd 16+%4, %%mm2 \n\t"
  1796. "pmaddwd %%mm6, %%mm3 \n\t"
  1797. "paddd %%mm1, %%mm0 \n\t"
  1798. "paddd %%mm3, %%mm2 \n\t"
  1799. "movd 6(%0), %%mm1 \n\t"
  1800. "movd 8(%0), %%mm3 \n\t"
  1801. "add $12, %0 \n\t"
  1802. "punpcklbw %%mm7, %%mm1 \n\t"
  1803. "punpcklbw %%mm7, %%mm3 \n\t"
  1804. "movq %%mm1, %%mm4 \n\t"
  1805. "movq %%mm3, %%mm5 \n\t"
  1806. "pmaddwd %4, %%mm1 \n\t"
  1807. "pmaddwd 8+%4, %%mm3 \n\t"
  1808. "pmaddwd 16+%4, %%mm4 \n\t"
  1809. "pmaddwd %%mm6, %%mm5 \n\t"
  1810. "paddd %%mm3, %%mm1 \n\t"
  1811. "paddd %%mm5, %%mm4 \n\t"
  1812. "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
  1813. "paddd %%mm3, %%mm0 \n\t"
  1814. "paddd %%mm3, %%mm2 \n\t"
  1815. "paddd %%mm3, %%mm1 \n\t"
  1816. "paddd %%mm3, %%mm4 \n\t"
  1817. "psrad $15, %%mm0 \n\t"
  1818. "psrad $15, %%mm2 \n\t"
  1819. "psrad $15, %%mm1 \n\t"
  1820. "psrad $15, %%mm4 \n\t"
  1821. "packssdw %%mm1, %%mm0 \n\t"
  1822. "packssdw %%mm4, %%mm2 \n\t"
  1823. "packuswb %%mm0, %%mm0 \n\t"
  1824. "packuswb %%mm2, %%mm2 \n\t"
  1825. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1826. "movd %%mm2, (%2, %%"REG_a") \n\t"
  1827. "add $4, %%"REG_a" \n\t"
  1828. " js 1b \n\t"
  1829. : "+r" (src)
  1830. : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
  1831. : "%"REG_a
  1832. );
  1833. }
  1834. #endif
  1835. static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1836. {
  1837. #if COMPILE_TEMPLATE_MMX
  1838. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
  1839. #else
  1840. int i;
  1841. for (i=0; i<width; i++) {
  1842. int b= src[i*3+0];
  1843. int g= src[i*3+1];
  1844. int r= src[i*3+2];
  1845. dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
  1846. }
  1847. #endif /* COMPILE_TEMPLATE_MMX */
  1848. }
  1849. static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1850. {
  1851. #if COMPILE_TEMPLATE_MMX
  1852. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
  1853. #else
  1854. int i;
  1855. for (i=0; i<width; i++) {
  1856. int b= src1[3*i + 0];
  1857. int g= src1[3*i + 1];
  1858. int r= src1[3*i + 2];
  1859. dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1860. dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1861. }
  1862. #endif /* COMPILE_TEMPLATE_MMX */
  1863. assert(src1 == src2);
  1864. }
  1865. static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1866. {
  1867. int i;
  1868. for (i=0; i<width; i++) {
  1869. int b= src1[6*i + 0] + src1[6*i + 3];
  1870. int g= src1[6*i + 1] + src1[6*i + 4];
  1871. int r= src1[6*i + 2] + src1[6*i + 5];
  1872. dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1873. dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1874. }
  1875. assert(src1 == src2);
  1876. }
  1877. static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1878. {
  1879. #if COMPILE_TEMPLATE_MMX
  1880. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
  1881. #else
  1882. int i;
  1883. for (i=0; i<width; i++) {
  1884. int r= src[i*3+0];
  1885. int g= src[i*3+1];
  1886. int b= src[i*3+2];
  1887. dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
  1888. }
  1889. #endif
  1890. }
  1891. static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1892. {
  1893. #if COMPILE_TEMPLATE_MMX
  1894. assert(src1==src2);
  1895. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
  1896. #else
  1897. int i;
  1898. assert(src1==src2);
  1899. for (i=0; i<width; i++) {
  1900. int r= src1[3*i + 0];
  1901. int g= src1[3*i + 1];
  1902. int b= src1[3*i + 2];
  1903. dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1904. dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1905. }
  1906. #endif
  1907. }
  1908. static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1909. {
  1910. int i;
  1911. assert(src1==src2);
  1912. for (i=0; i<width; i++) {
  1913. int r= src1[6*i + 0] + src1[6*i + 3];
  1914. int g= src1[6*i + 1] + src1[6*i + 4];
  1915. int b= src1[6*i + 2] + src1[6*i + 5];
  1916. dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1917. dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1918. }
  1919. }
  1920. // bilinear / bicubic scaling
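/* hScale is a generic horizontal FIR filter: for each output sample it
 * accumulates filterSize taps, roughly
 *     dst[i] = min( (sum_j src[filterPos[i]+j] * filter[filterSize*i+j]) >> 7, 32767 )
 * as in the scalar fallback at the end of the function. The MMX paths
 * special-case filterSize 4 and 8 and emit two output samples per loop. */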
  1921. static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
  1922. const int16_t *filter, const int16_t *filterPos, long filterSize)
  1923. {
  1924. #if COMPILE_TEMPLATE_MMX
  1925. assert(filterSize % 4 == 0 && filterSize>0);
1926. if (filterSize==4) { // Always true for upscaling, sometimes for downscaling, too.
  1927. x86_reg counter= -2*dstW;
  1928. filter-= counter*2;
  1929. filterPos-= counter/2;
  1930. dst-= counter/2;
  1931. __asm__ volatile(
  1932. #if defined(PIC)
  1933. "push %%"REG_b" \n\t"
  1934. #endif
  1935. "pxor %%mm7, %%mm7 \n\t"
  1936. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  1937. "mov %%"REG_a", %%"REG_BP" \n\t"
  1938. ASMALIGN(4)
  1939. "1: \n\t"
  1940. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  1941. "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
  1942. "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
  1943. "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
  1944. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  1945. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  1946. "punpcklbw %%mm7, %%mm0 \n\t"
  1947. "punpcklbw %%mm7, %%mm2 \n\t"
  1948. "pmaddwd %%mm1, %%mm0 \n\t"
  1949. "pmaddwd %%mm2, %%mm3 \n\t"
  1950. "movq %%mm0, %%mm4 \n\t"
  1951. "punpckldq %%mm3, %%mm0 \n\t"
  1952. "punpckhdq %%mm3, %%mm4 \n\t"
  1953. "paddd %%mm4, %%mm0 \n\t"
  1954. "psrad $7, %%mm0 \n\t"
  1955. "packssdw %%mm0, %%mm0 \n\t"
  1956. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  1957. "add $4, %%"REG_BP" \n\t"
  1958. " jnc 1b \n\t"
  1959. "pop %%"REG_BP" \n\t"
  1960. #if defined(PIC)
  1961. "pop %%"REG_b" \n\t"
  1962. #endif
  1963. : "+a" (counter)
  1964. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  1965. #if !defined(PIC)
  1966. : "%"REG_b
  1967. #endif
  1968. );
  1969. } else if (filterSize==8) {
  1970. x86_reg counter= -2*dstW;
  1971. filter-= counter*4;
  1972. filterPos-= counter/2;
  1973. dst-= counter/2;
  1974. __asm__ volatile(
  1975. #if defined(PIC)
  1976. "push %%"REG_b" \n\t"
  1977. #endif
  1978. "pxor %%mm7, %%mm7 \n\t"
  1979. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  1980. "mov %%"REG_a", %%"REG_BP" \n\t"
  1981. ASMALIGN(4)
  1982. "1: \n\t"
  1983. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  1984. "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
  1985. "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
  1986. "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
  1987. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  1988. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  1989. "punpcklbw %%mm7, %%mm0 \n\t"
  1990. "punpcklbw %%mm7, %%mm2 \n\t"
  1991. "pmaddwd %%mm1, %%mm0 \n\t"
  1992. "pmaddwd %%mm2, %%mm3 \n\t"
  1993. "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
  1994. "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
  1995. "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
  1996. "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
  1997. "punpcklbw %%mm7, %%mm4 \n\t"
  1998. "punpcklbw %%mm7, %%mm2 \n\t"
  1999. "pmaddwd %%mm1, %%mm4 \n\t"
  2000. "pmaddwd %%mm2, %%mm5 \n\t"
  2001. "paddd %%mm4, %%mm0 \n\t"
  2002. "paddd %%mm5, %%mm3 \n\t"
  2003. "movq %%mm0, %%mm4 \n\t"
  2004. "punpckldq %%mm3, %%mm0 \n\t"
  2005. "punpckhdq %%mm3, %%mm4 \n\t"
  2006. "paddd %%mm4, %%mm0 \n\t"
  2007. "psrad $7, %%mm0 \n\t"
  2008. "packssdw %%mm0, %%mm0 \n\t"
  2009. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  2010. "add $4, %%"REG_BP" \n\t"
  2011. " jnc 1b \n\t"
  2012. "pop %%"REG_BP" \n\t"
  2013. #if defined(PIC)
  2014. "pop %%"REG_b" \n\t"
  2015. #endif
  2016. : "+a" (counter)
  2017. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  2018. #if !defined(PIC)
  2019. : "%"REG_b
  2020. #endif
  2021. );
  2022. } else {
  2023. const uint8_t *offset = src+filterSize;
  2024. x86_reg counter= -2*dstW;
  2025. //filter-= counter*filterSize/2;
  2026. filterPos-= counter/2;
  2027. dst-= counter/2;
  2028. __asm__ volatile(
  2029. "pxor %%mm7, %%mm7 \n\t"
  2030. ASMALIGN(4)
  2031. "1: \n\t"
  2032. "mov %2, %%"REG_c" \n\t"
  2033. "movzwl (%%"REG_c", %0), %%eax \n\t"
  2034. "movzwl 2(%%"REG_c", %0), %%edx \n\t"
  2035. "mov %5, %%"REG_c" \n\t"
  2036. "pxor %%mm4, %%mm4 \n\t"
  2037. "pxor %%mm5, %%mm5 \n\t"
  2038. "2: \n\t"
  2039. "movq (%1), %%mm1 \n\t"
  2040. "movq (%1, %6), %%mm3 \n\t"
  2041. "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
  2042. "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
  2043. "punpcklbw %%mm7, %%mm0 \n\t"
  2044. "punpcklbw %%mm7, %%mm2 \n\t"
  2045. "pmaddwd %%mm1, %%mm0 \n\t"
  2046. "pmaddwd %%mm2, %%mm3 \n\t"
  2047. "paddd %%mm3, %%mm5 \n\t"
  2048. "paddd %%mm0, %%mm4 \n\t"
  2049. "add $8, %1 \n\t"
  2050. "add $4, %%"REG_c" \n\t"
  2051. "cmp %4, %%"REG_c" \n\t"
  2052. " jb 2b \n\t"
  2053. "add %6, %1 \n\t"
  2054. "movq %%mm4, %%mm0 \n\t"
  2055. "punpckldq %%mm5, %%mm4 \n\t"
  2056. "punpckhdq %%mm5, %%mm0 \n\t"
  2057. "paddd %%mm0, %%mm4 \n\t"
  2058. "psrad $7, %%mm4 \n\t"
  2059. "packssdw %%mm4, %%mm4 \n\t"
  2060. "mov %3, %%"REG_a" \n\t"
  2061. "movd %%mm4, (%%"REG_a", %0) \n\t"
  2062. "add $4, %0 \n\t"
  2063. " jnc 1b \n\t"
  2064. : "+r" (counter), "+r" (filter)
  2065. : "m" (filterPos), "m" (dst), "m"(offset),
  2066. "m" (src), "r" ((x86_reg)filterSize*2)
  2067. : "%"REG_a, "%"REG_c, "%"REG_d
  2068. );
  2069. }
  2070. #else
  2071. #if COMPILE_TEMPLATE_ALTIVEC
  2072. hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
  2073. #else
  2074. int i;
  2075. for (i=0; i<dstW; i++) {
  2076. int j;
  2077. int srcPos= filterPos[i];
  2078. int val=0;
  2079. //printf("filterPos: %d\n", filterPos[i]);
  2080. for (j=0; j<filterSize; j++) {
  2081. //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
  2082. val += ((int)src[srcPos + j])*filter[filterSize*i + j];
  2083. }
  2084. //filter += hFilterSize;
  2085. dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
  2086. //dst[i] = val>>7;
  2087. }
2088. #endif /* COMPILE_TEMPLATE_ALTIVEC */
2089. #endif /* COMPILE_TEMPLATE_MMX */
  2090. }
2091. //FIXME all pal and rgb srcFormats could do this conversion as well
  2092. //FIXME all scalers more complex than bilinear could do half of this transform
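/* The *RangeToJpeg / *RangeFromJpeg helpers convert the 15-bit intermediate
 * samples between limited (MPEG, 16-235 / 16-240) and full (JPEG, 0-255)
 * range in place; the magic constants are fixed-point forms of the usual
 * 255/219, 219/255, 255/224 and 224/255 scale factors plus matching offsets. */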
  2093. static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
  2094. {
  2095. int i;
  2096. for (i = 0; i < width; i++) {
  2097. dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
  2098. dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
  2099. }
  2100. }
  2101. static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
  2102. {
  2103. int i;
  2104. for (i = 0; i < width; i++) {
  2105. dst[i ] = (dst[i ]*1799 + 4081085)>>11; //1469
  2106. dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
  2107. }
  2108. }
  2109. static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
  2110. {
  2111. int i;
  2112. for (i = 0; i < width; i++)
  2113. dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
  2114. }
  2115. static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
  2116. {
  2117. int i;
  2118. for (i = 0; i < width; i++)
  2119. dst[i] = (dst[i]*14071 + 33561947)>>14;
  2120. }
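/* Fast bilinear horizontal scaling: the source position advances in 16.16
 * fixed point by xInc; the integer part selects src[xx] and the top 7 bits of
 * the fraction form the blend weight, giving
 *     dst[i] = (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha
 * (see the plain-C fallback in hyscale_fast below). FAST_BILINEAR_X86 is the
 * asm equivalent of that inner expression. */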
  2121. #define FAST_BILINEAR_X86 \
  2122. "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
  2123. "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
  2124. "shll $16, %%edi \n\t" \
  2125. "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
  2126. "mov %1, %%"REG_D"\n\t" \
  2127. "shrl $9, %%esi \n\t" \
  2128. static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
  2129. long dstWidth, const uint8_t *src, int srcW,
  2130. int xInc)
  2131. {
  2132. #if ARCH_X86 && CONFIG_GPL
  2133. #if COMPILE_TEMPLATE_MMX2
  2134. int32_t *filterPos = c->hLumFilterPos;
  2135. int16_t *filter = c->hLumFilter;
  2136. int canMMX2BeUsed = c->canMMX2BeUsed;
  2137. void *mmx2FilterCode= c->lumMmx2FilterCode;
  2138. int i;
  2139. #if defined(PIC)
  2140. DECLARE_ALIGNED(8, uint64_t, ebxsave);
  2141. #endif
  2142. if (canMMX2BeUsed) {
  2143. __asm__ volatile(
  2144. #if defined(PIC)
  2145. "mov %%"REG_b", %5 \n\t"
  2146. #endif
  2147. "pxor %%mm7, %%mm7 \n\t"
  2148. "mov %0, %%"REG_c" \n\t"
  2149. "mov %1, %%"REG_D" \n\t"
  2150. "mov %2, %%"REG_d" \n\t"
  2151. "mov %3, %%"REG_b" \n\t"
  2152. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2153. PREFETCH" (%%"REG_c") \n\t"
  2154. PREFETCH" 32(%%"REG_c") \n\t"
  2155. PREFETCH" 64(%%"REG_c") \n\t"
  2156. #if ARCH_X86_64
  2157. #define CALL_MMX2_FILTER_CODE \
  2158. "movl (%%"REG_b"), %%esi \n\t"\
  2159. "call *%4 \n\t"\
  2160. "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
  2161. "add %%"REG_S", %%"REG_c" \n\t"\
  2162. "add %%"REG_a", %%"REG_D" \n\t"\
  2163. "xor %%"REG_a", %%"REG_a" \n\t"\
  2164. #else
  2165. #define CALL_MMX2_FILTER_CODE \
  2166. "movl (%%"REG_b"), %%esi \n\t"\
  2167. "call *%4 \n\t"\
  2168. "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
  2169. "add %%"REG_a", %%"REG_D" \n\t"\
  2170. "xor %%"REG_a", %%"REG_a" \n\t"\
  2171. #endif /* ARCH_X86_64 */
  2172. CALL_MMX2_FILTER_CODE
  2173. CALL_MMX2_FILTER_CODE
  2174. CALL_MMX2_FILTER_CODE
  2175. CALL_MMX2_FILTER_CODE
  2176. CALL_MMX2_FILTER_CODE
  2177. CALL_MMX2_FILTER_CODE
  2178. CALL_MMX2_FILTER_CODE
  2179. CALL_MMX2_FILTER_CODE
  2180. #if defined(PIC)
  2181. "mov %5, %%"REG_b" \n\t"
  2182. #endif
  2183. :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
  2184. "m" (mmx2FilterCode)
  2185. #if defined(PIC)
  2186. ,"m" (ebxsave)
  2187. #endif
  2188. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2189. #if !defined(PIC)
  2190. ,"%"REG_b
  2191. #endif
  2192. );
  2193. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
  2194. } else {
  2195. #endif /* COMPILE_TEMPLATE_MMX2 */
  2196. x86_reg xInc_shr16 = xInc >> 16;
  2197. uint16_t xInc_mask = xInc & 0xffff;
  2198. //NO MMX just normal asm ...
  2199. __asm__ volatile(
  2200. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2201. "xor %%"REG_d", %%"REG_d" \n\t" // xx
  2202. "xorl %%ecx, %%ecx \n\t" // xalpha
  2203. ASMALIGN(4)
  2204. "1: \n\t"
  2205. "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
  2206. "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2207. FAST_BILINEAR_X86
  2208. "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
  2209. "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
  2210. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
  2211. "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
  2212. "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2213. FAST_BILINEAR_X86
  2214. "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
  2215. "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
  2216. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
  2217. "add $2, %%"REG_a" \n\t"
  2218. "cmp %2, %%"REG_a" \n\t"
  2219. " jb 1b \n\t"
  2220. :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
  2221. : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
  2222. );
  2223. #if COMPILE_TEMPLATE_MMX2
  2224. } //if MMX2 can't be used
  2225. #endif
  2226. #else
  2227. int i;
  2228. unsigned int xpos=0;
  2229. for (i=0;i<dstWidth;i++) {
  2230. register unsigned int xx=xpos>>16;
  2231. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  2232. dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
  2233. xpos+=xInc;
  2234. }
  2235. #endif /* ARCH_X86 */
  2236. }
  2237. // *** horizontal scale Y line to temp buffer
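/* hyscale: optionally convert the input line to 8-bit luma (lumToYV12 /
 * alpToYV12), horizontally scale it with either the generic hScale or the
 * fast bilinear path, then apply the luma range conversion if one is set. */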
  2238. static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
  2239. const int16_t *hLumFilter,
  2240. const int16_t *hLumFilterPos, int hLumFilterSize,
  2241. uint8_t *formatConvBuffer,
  2242. uint32_t *pal, int isAlpha)
  2243. {
  2244. void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
  2245. void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
  2246. src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
  2247. if (toYV12) {
  2248. toYV12(formatConvBuffer, src, srcW, pal);
  2249. src= formatConvBuffer;
  2250. }
  2251. if (!c->hyscale_fast) {
  2252. c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
  2253. } else { // fast bilinear upscale / crap downscale
  2254. c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
  2255. }
  2256. if (convertRange)
  2257. convertRange(dst, dstWidth);
  2258. }
  2259. static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
  2260. long dstWidth, const uint8_t *src1,
  2261. const uint8_t *src2, int srcW, int xInc)
  2262. {
  2263. #if ARCH_X86 && CONFIG_GPL
  2264. #if COMPILE_TEMPLATE_MMX2
  2265. int32_t *filterPos = c->hChrFilterPos;
  2266. int16_t *filter = c->hChrFilter;
  2267. int canMMX2BeUsed = c->canMMX2BeUsed;
  2268. void *mmx2FilterCode= c->chrMmx2FilterCode;
  2269. int i;
  2270. #if defined(PIC)
  2271. DECLARE_ALIGNED(8, uint64_t, ebxsave);
  2272. #endif
  2273. if (canMMX2BeUsed) {
  2274. __asm__ volatile(
  2275. #if defined(PIC)
  2276. "mov %%"REG_b", %6 \n\t"
  2277. #endif
  2278. "pxor %%mm7, %%mm7 \n\t"
  2279. "mov %0, %%"REG_c" \n\t"
  2280. "mov %1, %%"REG_D" \n\t"
  2281. "mov %2, %%"REG_d" \n\t"
  2282. "mov %3, %%"REG_b" \n\t"
  2283. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2284. PREFETCH" (%%"REG_c") \n\t"
  2285. PREFETCH" 32(%%"REG_c") \n\t"
  2286. PREFETCH" 64(%%"REG_c") \n\t"
  2287. CALL_MMX2_FILTER_CODE
  2288. CALL_MMX2_FILTER_CODE
  2289. CALL_MMX2_FILTER_CODE
  2290. CALL_MMX2_FILTER_CODE
  2291. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2292. "mov %5, %%"REG_c" \n\t" // src
  2293. "mov %1, %%"REG_D" \n\t" // buf1
  2294. "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
  2295. PREFETCH" (%%"REG_c") \n\t"
  2296. PREFETCH" 32(%%"REG_c") \n\t"
  2297. PREFETCH" 64(%%"REG_c") \n\t"
  2298. CALL_MMX2_FILTER_CODE
  2299. CALL_MMX2_FILTER_CODE
  2300. CALL_MMX2_FILTER_CODE
  2301. CALL_MMX2_FILTER_CODE
  2302. #if defined(PIC)
  2303. "mov %6, %%"REG_b" \n\t"
  2304. #endif
  2305. :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
  2306. "m" (mmx2FilterCode), "m" (src2)
  2307. #if defined(PIC)
  2308. ,"m" (ebxsave)
  2309. #endif
  2310. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2311. #if !defined(PIC)
  2312. ,"%"REG_b
  2313. #endif
  2314. );
  2315. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
  2316. //printf("%d %d %d\n", dstWidth, i, srcW);
  2317. dst[i] = src1[srcW-1]*128;
  2318. dst[i+VOFW] = src2[srcW-1]*128;
  2319. }
  2320. } else {
  2321. #endif /* COMPILE_TEMPLATE_MMX2 */
  2322. x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
  2323. uint16_t xInc_mask = xInc & 0xffff;
  2324. __asm__ volatile(
  2325. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2326. "xor %%"REG_d", %%"REG_d" \n\t" // xx
  2327. "xorl %%ecx, %%ecx \n\t" // xalpha
  2328. ASMALIGN(4)
  2329. "1: \n\t"
  2330. "mov %0, %%"REG_S" \n\t"
  2331. "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
  2332. "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
  2333. FAST_BILINEAR_X86
  2334. "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
  2335. "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
  2336. "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2337. FAST_BILINEAR_X86
  2338. "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
  2339. "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
  2340. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
  2341. "add $1, %%"REG_a" \n\t"
  2342. "cmp %2, %%"REG_a" \n\t"
  2343. " jb 1b \n\t"
  2344. /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
  2345. which is needed to support GCC 4.0. */
  2346. #if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
  2347. :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
  2348. #else
  2349. :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
  2350. #endif
  2351. "r" (src2)
  2352. : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
  2353. );
  2354. #if COMPILE_TEMPLATE_MMX2
  2355. } //if MMX2 can't be used
  2356. #endif
  2357. #else
  2358. int i;
  2359. unsigned int xpos=0;
  2360. for (i=0;i<dstWidth;i++) {
  2361. register unsigned int xx=xpos>>16;
  2362. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  2363. dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
  2364. dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
  2365. /* slower
  2366. dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
  2367. dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
  2368. */
  2369. xpos+=xInc;
  2370. }
  2371. #endif /* ARCH_X86 */
  2372. }
  2373. inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
  2374. int srcW, int xInc, const int16_t *hChrFilter,
  2375. const int16_t *hChrFilterPos, int hChrFilterSize,
  2376. uint8_t *formatConvBuffer,
  2377. uint32_t *pal)
  2378. {
  2379. src1 += c->chrSrcOffset;
  2380. src2 += c->chrSrcOffset;
  2381. if (c->chrToYV12) {
  2382. c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2383. src1= formatConvBuffer;
  2384. src2= formatConvBuffer+VOFW;
  2385. }
  2386. if (!c->hcscale_fast) {
  2387. c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  2388. c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  2389. } else { // fast bilinear upscale / crap downscale
  2390. c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
  2391. }
  2392. if (c->chrConvertRange)
  2393. c->chrConvertRange(dst, dstWidth);
  2394. }
  2395. #define DEBUG_SWSCALE_BUFFERS 0
  2396. #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
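/* swScale main loop: horizontally scaled lines are kept in the lumPixBuf /
 * chrPixBuf (and optionally alpPixBuf) ring buffers; for every output line
 * the still-missing source lines of the current slice are scaled in, then the
 * vertical filter writes the destination line in the requested format. */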
  2397. static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
  2398. int srcSliceH, uint8_t* dst[], int dstStride[])
  2399. {
2400. /* load a few things into local vars to make the code more readable and faster */
  2401. const int srcW= c->srcW;
  2402. const int dstW= c->dstW;
  2403. const int dstH= c->dstH;
  2404. const int chrDstW= c->chrDstW;
  2405. const int chrSrcW= c->chrSrcW;
  2406. const int lumXInc= c->lumXInc;
  2407. const int chrXInc= c->chrXInc;
  2408. const enum PixelFormat dstFormat= c->dstFormat;
  2409. const int flags= c->flags;
  2410. int16_t *vLumFilterPos= c->vLumFilterPos;
  2411. int16_t *vChrFilterPos= c->vChrFilterPos;
  2412. int16_t *hLumFilterPos= c->hLumFilterPos;
  2413. int16_t *hChrFilterPos= c->hChrFilterPos;
  2414. int16_t *vLumFilter= c->vLumFilter;
  2415. int16_t *vChrFilter= c->vChrFilter;
  2416. int16_t *hLumFilter= c->hLumFilter;
  2417. int16_t *hChrFilter= c->hChrFilter;
  2418. int32_t *lumMmxFilter= c->lumMmxFilter;
  2419. int32_t *chrMmxFilter= c->chrMmxFilter;
  2420. int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
  2421. const int vLumFilterSize= c->vLumFilterSize;
  2422. const int vChrFilterSize= c->vChrFilterSize;
  2423. const int hLumFilterSize= c->hLumFilterSize;
  2424. const int hChrFilterSize= c->hChrFilterSize;
  2425. int16_t **lumPixBuf= c->lumPixBuf;
  2426. int16_t **chrPixBuf= c->chrPixBuf;
  2427. int16_t **alpPixBuf= c->alpPixBuf;
  2428. const int vLumBufSize= c->vLumBufSize;
  2429. const int vChrBufSize= c->vChrBufSize;
  2430. uint8_t *formatConvBuffer= c->formatConvBuffer;
  2431. const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
  2432. const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
  2433. int lastDstY;
  2434. uint32_t *pal=c->pal_yuv;
  2435. /* vars which will change and which we need to store back in the context */
  2436. int dstY= c->dstY;
  2437. int lumBufIndex= c->lumBufIndex;
  2438. int chrBufIndex= c->chrBufIndex;
  2439. int lastInLumBuf= c->lastInLumBuf;
  2440. int lastInChrBuf= c->lastInChrBuf;
  2441. if (isPacked(c->srcFormat)) {
  2442. src[0]=
  2443. src[1]=
  2444. src[2]=
  2445. src[3]= src[0];
  2446. srcStride[0]=
  2447. srcStride[1]=
  2448. srcStride[2]=
  2449. srcStride[3]= srcStride[0];
  2450. }
  2451. srcStride[1]<<= c->vChrDrop;
  2452. srcStride[2]<<= c->vChrDrop;
  2453. DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
  2454. src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
  2455. dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
  2456. DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
  2457. srcSliceY, srcSliceH, dstY, dstH);
  2458. DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
  2459. vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
  2460. if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
  2461. static int warnedAlready=0; //FIXME move this into the context perhaps
  2462. if (flags & SWS_PRINT_INFO && !warnedAlready) {
  2463. av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
  2464. " ->cannot do aligned memory accesses anymore\n");
  2465. warnedAlready=1;
  2466. }
  2467. }
2468. /* Note: the user might start scaling the picture in the middle, so this
2469. will not get executed. This is not really intended, but it works
2470. currently, so people might do it. */
  2471. if (srcSliceY ==0) {
  2472. lumBufIndex=-1;
  2473. chrBufIndex=-1;
  2474. dstY=0;
  2475. lastInLumBuf= -1;
  2476. lastInChrBuf= -1;
  2477. }
  2478. lastDstY= dstY;
  2479. for (;dstY < dstH; dstY++) {
  2480. unsigned char *dest =dst[0]+dstStride[0]*dstY;
  2481. const int chrDstY= dstY>>c->chrDstVSubSample;
  2482. unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
  2483. unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
  2484. unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
  2485. const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
  2486. const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
  2487. const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
  2488. int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
  2489. int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
  2490. int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
  2491. int enough_lines;
  2492. //handle holes (FAST_BILINEAR & weird filters)
  2493. if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
  2494. if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
  2495. assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
  2496. assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
  2497. DEBUG_BUFFERS("dstY: %d\n", dstY);
  2498. DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
  2499. firstLumSrcY, lastLumSrcY, lastInLumBuf);
  2500. DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
  2501. firstChrSrcY, lastChrSrcY, lastInChrBuf);
2502. // Do we have enough lines in this slice to output the dstY line?
  2503. enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
  2504. if (!enough_lines) {
  2505. lastLumSrcY = srcSliceY + srcSliceH - 1;
  2506. lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
  2507. DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
  2508. lastLumSrcY, lastChrSrcY);
  2509. }
  2510. //Do horizontal scaling
  2511. while(lastInLumBuf < lastLumSrcY) {
  2512. const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
  2513. const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
  2514. lumBufIndex++;
  2515. assert(lumBufIndex < 2*vLumBufSize);
  2516. assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
  2517. assert(lastInLumBuf + 1 - srcSliceY >= 0);
  2518. RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
  2519. hLumFilter, hLumFilterPos, hLumFilterSize,
  2520. formatConvBuffer,
  2521. pal, 0);
  2522. if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
  2523. RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
  2524. hLumFilter, hLumFilterPos, hLumFilterSize,
  2525. formatConvBuffer,
  2526. pal, 1);
  2527. lastInLumBuf++;
  2528. DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
  2529. lumBufIndex, lastInLumBuf);
  2530. }
  2531. while(lastInChrBuf < lastChrSrcY) {
  2532. const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
  2533. const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
  2534. chrBufIndex++;
  2535. assert(chrBufIndex < 2*vChrBufSize);
  2536. assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
  2537. assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2538. //FIXME pass parameters through the context struct (at least some of them)
  2539. if (c->needs_hcscale)
  2540. RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
  2541. hChrFilter, hChrFilterPos, hChrFilterSize,
  2542. formatConvBuffer,
  2543. pal);
  2544. lastInChrBuf++;
  2545. DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
  2546. chrBufIndex, lastInChrBuf);
  2547. }
  2548. //wrap buf index around to stay inside the ring buffer
  2549. if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
  2550. if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
  2551. if (!enough_lines)
  2552. break; //we can't output a dstY line so let's try with the next slice
  2553. #if COMPILE_TEMPLATE_MMX
  2554. c->blueDither= ff_dither8[dstY&1];
  2555. if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
  2556. c->greenDither= ff_dither8[dstY&1];
  2557. else
  2558. c->greenDither= ff_dither4[dstY&1];
  2559. c->redDither= ff_dither8[(dstY+1)&1];
  2560. #endif
  2561. if (dstY < dstH-2) {
  2562. const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
  2563. const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
  2564. const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
  2565. #if COMPILE_TEMPLATE_MMX
  2566. int i;
  2567. if (flags & SWS_ACCURATE_RND) {
  2568. int s= APCK_SIZE / 8;
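/* Accurate-rounding layout: for every pair of vertical taps the two source
 * line pointers are stored together with one 32-bit word holding both
 * coefficients (tap i in the low 16 bits, tap i+1 in the high 16 bits),
 * matching the APCK_PTR2 / APCK_COEF offsets used by the MMX code. */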
  2569. for (i=0; i<vLumFilterSize; i+=2) {
  2570. *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
  2571. *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
  2572. lumMmxFilter[s*i+APCK_COEF/4 ]=
  2573. lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
  2574. + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
  2575. if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
  2576. *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
  2577. *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
  2578. alpMmxFilter[s*i+APCK_COEF/4 ]=
  2579. alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
  2580. }
  2581. }
  2582. for (i=0; i<vChrFilterSize; i+=2) {
  2583. *(const void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
  2584. *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
  2585. chrMmxFilter[s*i+APCK_COEF/4 ]=
  2586. chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
  2587. + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
  2588. }
  2589. } else {
  2590. for (i=0; i<vLumFilterSize; i++) {
  2591. lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
  2592. lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
  2593. lumMmxFilter[4*i+2]=
  2594. lumMmxFilter[4*i+3]=
  2595. ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
  2596. if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
  2597. alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
  2598. alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
  2599. alpMmxFilter[4*i+2]=
  2600. alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
  2601. }
  2602. }
  2603. for (i=0; i<vChrFilterSize; i++) {
  2604. chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
  2605. chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
  2606. chrMmxFilter[4*i+2]=
  2607. chrMmxFilter[4*i+3]=
  2608. ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
  2609. }
  2610. }
  2611. #endif
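            /* Vertically scale and write out this line, dispatching on the
               destination format: interleaved-chroma NV12/NV21, planar YUV/gray
               (with 16-bit and unscaled fast paths), or packed RGB-like output. */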
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                             vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                        vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                        dstFormat);
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *lumBuf = lumSrcPtr[0];
                    const int16_t *chrBuf= chrSrcPtr[0];
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                    c->yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                } else { //general RGB
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packedX(c,
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                       alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
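            /* Same format dispatch as above, but restricted to the plain C
               output functions for these last output lines. */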
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                        vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                        dstFormat);
                } else {
                    yuv2yuvXinC(
                        vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT) {
                    yuv2rgbXinC_full(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                } else {
                    yuv2packedXinC(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }
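    /* If the destination has an alpha plane but no alpha was produced from the
       source, fill the rows written in this call with opaque (255). */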
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if COMPILE_TEMPLATE_MMX
    if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
    else                            __asm__ volatile("emms"  :::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
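
/* Select the (possibly CPU-specific) low-level input conversion, scaling and
   output routines for this context based on formats, flags and range settings. */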
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;

    c->yuv2nv12X   = RENAME(yuv2nv12X  );
    c->yuv2yuv1    = RENAME(yuv2yuv1   );
    c->yuv2yuvX    = RENAME(yuv2yuvX   );
    c->yuv2packed1 = RENAME(yuv2packed1);
    c->yuv2packed2 = RENAME(yuv2packed2);
    c->yuv2packedX = RENAME(yuv2packedX);
    c->hScale      = RENAME(hScale     );

#if COMPILE_TEMPLATE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
#else
    if (c->flags & SWS_FAST_BILINEAR)
#endif
    {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    }
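
    /* Input unpackers: convert one source line of chroma into the planar
       intermediate layout expected by the horizontal scaler. */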
    c->chrToYV12 = NULL;
    switch(srcFormat) {
    case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
    case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
    case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
    case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
    }
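    /* For RGB-like sources the chroma unpacker depends on horizontal chroma
       subsampling: the *_half variants average two adjacent source pixels per
       chroma sample. */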
    if (c->chrSrcHSubSample) {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV_half; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV_half; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
        }
    } else {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
        }
    }
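    /* Luma (and alpha, where applicable) unpackers for the same source formats. */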
    c->lumToYV12 = NULL;
    c->alpToYV12 = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUYV422  :
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
    case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422  :
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
    case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
    case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY; break;
    case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY; break;
    case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
    case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY; break;
    case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY; break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
    case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
    case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
    case PIX_FMT_RGB32  :
    case PIX_FMT_RGB32_1: c->lumToYV12 = bgr32ToY; break;
    case PIX_FMT_BGR32  :
    case PIX_FMT_BGR32_1: c->lumToYV12 = rgb32ToY; break;
    case PIX_FMT_RGB48BE:
    case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
    }
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
        }
    }
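    /* Byte offsets applied to the packed source before conversion: where the
       alpha byte sits for RGB32/BGR32, the ALT32_CORR shift for the alpha-first
       *_1 variants, and (presumably) the high byte of each 16-bit component for
       RGB48LE. */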
    switch (srcFormat) {
    case PIX_FMT_RGB32  :
    case PIX_FMT_BGR32  :
        c->alpSrcOffset = 3;
        break;
    case PIX_FMT_RGB32_1:
    case PIX_FMT_BGR32_1:
        c->lumSrcOffset = ALT32_CORR;
        c->chrSrcOffset = ALT32_CORR;
        break;
    case PIX_FMT_RGB48LE:
        c->lumSrcOffset = 1;
        c->chrSrcOffset = 1;
        c->alpSrcOffset = 1;
        break;
    }
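    /* Luma/chroma range conversion (JPEG/full range <-> MPEG/limited range) is
       only needed when the source and destination ranges differ and the
       destination is not RGB. */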
    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
        if (c->srcRange) {
            c->lumConvertRange = RENAME(lumRangeFromJpeg);
            c->chrConvertRange = RENAME(chrRangeFromJpeg);
        } else {
            c->lumConvertRange = RENAME(lumRangeToJpeg);
            c->chrConvertRange = RENAME(chrRangeToJpeg);
        }
    }

    if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
          srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
        c->needs_hcscale = 1;
}