swscale_template.c 141 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542
055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943
2944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118
  1. /*
  2. * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #undef REAL_MOVNTQ
  21. #undef MOVNTQ
  22. #undef PAVGB
  23. #undef PREFETCH
  24. #if COMPILE_TEMPLATE_AMD3DNOW
  25. #define PREFETCH "prefetch"
  26. #elif COMPILE_TEMPLATE_MMX2
  27. #define PREFETCH "prefetchnta"
  28. #else
  29. #define PREFETCH " # nop"
  30. #endif
  31. #if COMPILE_TEMPLATE_MMX2
  32. #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
  33. #elif COMPILE_TEMPLATE_AMD3DNOW
  34. #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
  35. #endif
  36. #if COMPILE_TEMPLATE_MMX2
  37. #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
  38. #else
  39. #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
  40. #endif
  41. #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
  42. #if COMPILE_TEMPLATE_ALTIVEC
  43. #include "ppc/swscale_altivec_template.c"
  44. #endif
  45. #define YSCALEYUV2YV12X(x, offset, dest, width) \
  46. __asm__ volatile(\
  47. "xor %%"REG_a", %%"REG_a" \n\t"\
  48. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  49. "movq %%mm3, %%mm4 \n\t"\
  50. "lea " offset "(%0), %%"REG_d" \n\t"\
  51. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  52. ".p2align 4 \n\t" /* FIXME Unroll? */\
  53. "1: \n\t"\
  54. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  55. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
  56. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
  57. "add $16, %%"REG_d" \n\t"\
  58. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  59. "test %%"REG_S", %%"REG_S" \n\t"\
  60. "pmulhw %%mm0, %%mm2 \n\t"\
  61. "pmulhw %%mm0, %%mm5 \n\t"\
  62. "paddw %%mm2, %%mm3 \n\t"\
  63. "paddw %%mm5, %%mm4 \n\t"\
  64. " jnz 1b \n\t"\
  65. "psraw $3, %%mm3 \n\t"\
  66. "psraw $3, %%mm4 \n\t"\
  67. "packuswb %%mm4, %%mm3 \n\t"\
  68. MOVNTQ(%%mm3, (%1, %%REGa))\
  69. "add $8, %%"REG_a" \n\t"\
  70. "cmp %2, %%"REG_a" \n\t"\
  71. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  72. "movq %%mm3, %%mm4 \n\t"\
  73. "lea " offset "(%0), %%"REG_d" \n\t"\
  74. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  75. "jb 1b \n\t"\
  76. :: "r" (&c->redDither),\
  77. "r" (dest), "g" ((x86_reg)width)\
  78. : "%"REG_a, "%"REG_d, "%"REG_S\
  79. );
  80. #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
  81. __asm__ volatile(\
  82. "lea " offset "(%0), %%"REG_d" \n\t"\
  83. "xor %%"REG_a", %%"REG_a" \n\t"\
  84. "pxor %%mm4, %%mm4 \n\t"\
  85. "pxor %%mm5, %%mm5 \n\t"\
  86. "pxor %%mm6, %%mm6 \n\t"\
  87. "pxor %%mm7, %%mm7 \n\t"\
  88. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  89. ".p2align 4 \n\t"\
  90. "1: \n\t"\
  91. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
  92. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
  93. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  94. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
  95. "movq %%mm0, %%mm3 \n\t"\
  96. "punpcklwd %%mm1, %%mm0 \n\t"\
  97. "punpckhwd %%mm1, %%mm3 \n\t"\
  98. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
  99. "pmaddwd %%mm1, %%mm0 \n\t"\
  100. "pmaddwd %%mm1, %%mm3 \n\t"\
  101. "paddd %%mm0, %%mm4 \n\t"\
  102. "paddd %%mm3, %%mm5 \n\t"\
  103. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
  104. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  105. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  106. "test %%"REG_S", %%"REG_S" \n\t"\
  107. "movq %%mm2, %%mm0 \n\t"\
  108. "punpcklwd %%mm3, %%mm2 \n\t"\
  109. "punpckhwd %%mm3, %%mm0 \n\t"\
  110. "pmaddwd %%mm1, %%mm2 \n\t"\
  111. "pmaddwd %%mm1, %%mm0 \n\t"\
  112. "paddd %%mm2, %%mm6 \n\t"\
  113. "paddd %%mm0, %%mm7 \n\t"\
  114. " jnz 1b \n\t"\
  115. "psrad $16, %%mm4 \n\t"\
  116. "psrad $16, %%mm5 \n\t"\
  117. "psrad $16, %%mm6 \n\t"\
  118. "psrad $16, %%mm7 \n\t"\
  119. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  120. "packssdw %%mm5, %%mm4 \n\t"\
  121. "packssdw %%mm7, %%mm6 \n\t"\
  122. "paddw %%mm0, %%mm4 \n\t"\
  123. "paddw %%mm0, %%mm6 \n\t"\
  124. "psraw $3, %%mm4 \n\t"\
  125. "psraw $3, %%mm6 \n\t"\
  126. "packuswb %%mm6, %%mm4 \n\t"\
  127. MOVNTQ(%%mm4, (%1, %%REGa))\
  128. "add $8, %%"REG_a" \n\t"\
  129. "cmp %2, %%"REG_a" \n\t"\
  130. "lea " offset "(%0), %%"REG_d" \n\t"\
  131. "pxor %%mm4, %%mm4 \n\t"\
  132. "pxor %%mm5, %%mm5 \n\t"\
  133. "pxor %%mm6, %%mm6 \n\t"\
  134. "pxor %%mm7, %%mm7 \n\t"\
  135. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  136. "jb 1b \n\t"\
  137. :: "r" (&c->redDither),\
  138. "r" (dest), "g" ((x86_reg)width)\
  139. : "%"REG_a, "%"REG_d, "%"REG_S\
  140. );
  141. #define YSCALEYUV2YV121 \
  142. "mov %2, %%"REG_a" \n\t"\
  143. ".p2align 4 \n\t" /* FIXME Unroll? */\
  144. "1: \n\t"\
  145. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
  146. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
  147. "psraw $7, %%mm0 \n\t"\
  148. "psraw $7, %%mm1 \n\t"\
  149. "packuswb %%mm1, %%mm0 \n\t"\
  150. MOVNTQ(%%mm0, (%1, %%REGa))\
  151. "add $8, %%"REG_a" \n\t"\
  152. "jnc 1b \n\t"
  153. #define YSCALEYUV2YV121_ACCURATE \
  154. "mov %2, %%"REG_a" \n\t"\
  155. "pcmpeqw %%mm7, %%mm7 \n\t"\
  156. "psrlw $15, %%mm7 \n\t"\
  157. "psllw $6, %%mm7 \n\t"\
  158. ".p2align 4 \n\t" /* FIXME Unroll? */\
  159. "1: \n\t"\
  160. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
  161. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
  162. "paddsw %%mm7, %%mm0 \n\t"\
  163. "paddsw %%mm7, %%mm1 \n\t"\
  164. "psraw $7, %%mm0 \n\t"\
  165. "psraw $7, %%mm1 \n\t"\
  166. "packuswb %%mm1, %%mm0 \n\t"\
  167. MOVNTQ(%%mm0, (%1, %%REGa))\
  168. "add $8, %%"REG_a" \n\t"\
  169. "jnc 1b \n\t"
  170. /*
  171. :: "m" (-lumFilterSize), "m" (-chrFilterSize),
  172. "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
  173. "r" (dest), "m" (dstW_reg),
  174. "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
  175. : "%eax", "%ebx", "%ecx", "%edx", "%esi"
  176. */
  177. #define YSCALEYUV2PACKEDX_UV \
  178. __asm__ volatile(\
  179. "xor %%"REG_a", %%"REG_a" \n\t"\
  180. ".p2align 4 \n\t"\
  181. "nop \n\t"\
  182. "1: \n\t"\
  183. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  184. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  185. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  186. "movq %%mm3, %%mm4 \n\t"\
  187. ".p2align 4 \n\t"\
  188. "2: \n\t"\
  189. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  190. "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
  191. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
  192. "add $16, %%"REG_d" \n\t"\
  193. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  194. "pmulhw %%mm0, %%mm2 \n\t"\
  195. "pmulhw %%mm0, %%mm5 \n\t"\
  196. "paddw %%mm2, %%mm3 \n\t"\
  197. "paddw %%mm5, %%mm4 \n\t"\
  198. "test %%"REG_S", %%"REG_S" \n\t"\
  199. " jnz 2b \n\t"\
  200. #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
  201. "lea "offset"(%0), %%"REG_d" \n\t"\
  202. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  203. "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
  204. "movq "#dst1", "#dst2" \n\t"\
  205. ".p2align 4 \n\t"\
  206. "2: \n\t"\
  207. "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
  208. "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
  209. "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
  210. "add $16, %%"REG_d" \n\t"\
  211. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  212. "pmulhw "#coeff", "#src1" \n\t"\
  213. "pmulhw "#coeff", "#src2" \n\t"\
  214. "paddw "#src1", "#dst1" \n\t"\
  215. "paddw "#src2", "#dst2" \n\t"\
  216. "test %%"REG_S", %%"REG_S" \n\t"\
  217. " jnz 2b \n\t"\
  218. #define YSCALEYUV2PACKEDX \
  219. YSCALEYUV2PACKEDX_UV \
  220. YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
  221. #define YSCALEYUV2PACKEDX_END \
  222. :: "r" (&c->redDither), \
  223. "m" (dummy), "m" (dummy), "m" (dummy),\
  224. "r" (dest), "m" (dstW_reg) \
  225. : "%"REG_a, "%"REG_d, "%"REG_S \
  226. );
  227. #define YSCALEYUV2PACKEDX_ACCURATE_UV \
  228. __asm__ volatile(\
  229. "xor %%"REG_a", %%"REG_a" \n\t"\
  230. ".p2align 4 \n\t"\
  231. "nop \n\t"\
  232. "1: \n\t"\
  233. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  234. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  235. "pxor %%mm4, %%mm4 \n\t"\
  236. "pxor %%mm5, %%mm5 \n\t"\
  237. "pxor %%mm6, %%mm6 \n\t"\
  238. "pxor %%mm7, %%mm7 \n\t"\
  239. ".p2align 4 \n\t"\
  240. "2: \n\t"\
  241. "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
  242. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
  243. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  244. "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
  245. "movq %%mm0, %%mm3 \n\t"\
  246. "punpcklwd %%mm1, %%mm0 \n\t"\
  247. "punpckhwd %%mm1, %%mm3 \n\t"\
  248. "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
  249. "pmaddwd %%mm1, %%mm0 \n\t"\
  250. "pmaddwd %%mm1, %%mm3 \n\t"\
  251. "paddd %%mm0, %%mm4 \n\t"\
  252. "paddd %%mm3, %%mm5 \n\t"\
  253. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
  254. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  255. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  256. "test %%"REG_S", %%"REG_S" \n\t"\
  257. "movq %%mm2, %%mm0 \n\t"\
  258. "punpcklwd %%mm3, %%mm2 \n\t"\
  259. "punpckhwd %%mm3, %%mm0 \n\t"\
  260. "pmaddwd %%mm1, %%mm2 \n\t"\
  261. "pmaddwd %%mm1, %%mm0 \n\t"\
  262. "paddd %%mm2, %%mm6 \n\t"\
  263. "paddd %%mm0, %%mm7 \n\t"\
  264. " jnz 2b \n\t"\
  265. "psrad $16, %%mm4 \n\t"\
  266. "psrad $16, %%mm5 \n\t"\
  267. "psrad $16, %%mm6 \n\t"\
  268. "psrad $16, %%mm7 \n\t"\
  269. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  270. "packssdw %%mm5, %%mm4 \n\t"\
  271. "packssdw %%mm7, %%mm6 \n\t"\
  272. "paddw %%mm0, %%mm4 \n\t"\
  273. "paddw %%mm0, %%mm6 \n\t"\
  274. "movq %%mm4, "U_TEMP"(%0) \n\t"\
  275. "movq %%mm6, "V_TEMP"(%0) \n\t"\
  276. #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
  277. "lea "offset"(%0), %%"REG_d" \n\t"\
  278. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  279. "pxor %%mm1, %%mm1 \n\t"\
  280. "pxor %%mm5, %%mm5 \n\t"\
  281. "pxor %%mm7, %%mm7 \n\t"\
  282. "pxor %%mm6, %%mm6 \n\t"\
  283. ".p2align 4 \n\t"\
  284. "2: \n\t"\
  285. "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
  286. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
  287. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  288. "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
  289. "movq %%mm0, %%mm3 \n\t"\
  290. "punpcklwd %%mm4, %%mm0 \n\t"\
  291. "punpckhwd %%mm4, %%mm3 \n\t"\
  292. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
  293. "pmaddwd %%mm4, %%mm0 \n\t"\
  294. "pmaddwd %%mm4, %%mm3 \n\t"\
  295. "paddd %%mm0, %%mm1 \n\t"\
  296. "paddd %%mm3, %%mm5 \n\t"\
  297. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
  298. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  299. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  300. "test %%"REG_S", %%"REG_S" \n\t"\
  301. "movq %%mm2, %%mm0 \n\t"\
  302. "punpcklwd %%mm3, %%mm2 \n\t"\
  303. "punpckhwd %%mm3, %%mm0 \n\t"\
  304. "pmaddwd %%mm4, %%mm2 \n\t"\
  305. "pmaddwd %%mm4, %%mm0 \n\t"\
  306. "paddd %%mm2, %%mm7 \n\t"\
  307. "paddd %%mm0, %%mm6 \n\t"\
  308. " jnz 2b \n\t"\
  309. "psrad $16, %%mm1 \n\t"\
  310. "psrad $16, %%mm5 \n\t"\
  311. "psrad $16, %%mm7 \n\t"\
  312. "psrad $16, %%mm6 \n\t"\
  313. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  314. "packssdw %%mm5, %%mm1 \n\t"\
  315. "packssdw %%mm6, %%mm7 \n\t"\
  316. "paddw %%mm0, %%mm1 \n\t"\
  317. "paddw %%mm0, %%mm7 \n\t"\
  318. "movq "U_TEMP"(%0), %%mm3 \n\t"\
  319. "movq "V_TEMP"(%0), %%mm4 \n\t"\
  320. #define YSCALEYUV2PACKEDX_ACCURATE \
  321. YSCALEYUV2PACKEDX_ACCURATE_UV \
  322. YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
  323. #define YSCALEYUV2RGBX \
  324. "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
  325. "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
  326. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  327. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  328. "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
  329. "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
  330. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  331. "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
  332. "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
  333. "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
  334. "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
  335. "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
  336. "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
  337. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  338. "paddw %%mm3, %%mm4 \n\t"\
  339. "movq %%mm2, %%mm0 \n\t"\
  340. "movq %%mm5, %%mm6 \n\t"\
  341. "movq %%mm4, %%mm3 \n\t"\
  342. "punpcklwd %%mm2, %%mm2 \n\t"\
  343. "punpcklwd %%mm5, %%mm5 \n\t"\
  344. "punpcklwd %%mm4, %%mm4 \n\t"\
  345. "paddw %%mm1, %%mm2 \n\t"\
  346. "paddw %%mm1, %%mm5 \n\t"\
  347. "paddw %%mm1, %%mm4 \n\t"\
  348. "punpckhwd %%mm0, %%mm0 \n\t"\
  349. "punpckhwd %%mm6, %%mm6 \n\t"\
  350. "punpckhwd %%mm3, %%mm3 \n\t"\
  351. "paddw %%mm7, %%mm0 \n\t"\
  352. "paddw %%mm7, %%mm6 \n\t"\
  353. "paddw %%mm7, %%mm3 \n\t"\
  354. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  355. "packuswb %%mm0, %%mm2 \n\t"\
  356. "packuswb %%mm6, %%mm5 \n\t"\
  357. "packuswb %%mm3, %%mm4 \n\t"\
  358. #define REAL_YSCALEYUV2PACKED(index, c) \
  359. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  360. "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
  361. "psraw $3, %%mm0 \n\t"\
  362. "psraw $3, %%mm1 \n\t"\
  363. "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  364. "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  365. "xor "#index", "#index" \n\t"\
  366. ".p2align 4 \n\t"\
  367. "1: \n\t"\
  368. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  369. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  370. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  371. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  372. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  373. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  374. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  375. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  376. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  377. "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  378. "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  379. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  380. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  381. "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  382. "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  383. "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  384. "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  385. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  386. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  387. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  388. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  389. "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  390. "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  391. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  392. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  393. #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
  394. #define REAL_YSCALEYUV2RGB_UV(index, c) \
  395. "xor "#index", "#index" \n\t"\
  396. ".p2align 4 \n\t"\
  397. "1: \n\t"\
  398. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  399. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  400. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  401. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  402. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  403. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  404. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  405. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  406. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  407. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  408. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  409. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  410. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  411. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  412. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  413. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  414. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  415. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  416. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  417. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  418. #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
  419. "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  420. "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  421. "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  422. "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  423. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  424. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  425. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  426. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  427. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  428. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  429. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  430. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  431. #define REAL_YSCALEYUV2RGB_COEFF(c) \
  432. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  433. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  434. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  435. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  436. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  437. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  438. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  439. "paddw %%mm3, %%mm4 \n\t"\
  440. "movq %%mm2, %%mm0 \n\t"\
  441. "movq %%mm5, %%mm6 \n\t"\
  442. "movq %%mm4, %%mm3 \n\t"\
  443. "punpcklwd %%mm2, %%mm2 \n\t"\
  444. "punpcklwd %%mm5, %%mm5 \n\t"\
  445. "punpcklwd %%mm4, %%mm4 \n\t"\
  446. "paddw %%mm1, %%mm2 \n\t"\
  447. "paddw %%mm1, %%mm5 \n\t"\
  448. "paddw %%mm1, %%mm4 \n\t"\
  449. "punpckhwd %%mm0, %%mm0 \n\t"\
  450. "punpckhwd %%mm6, %%mm6 \n\t"\
  451. "punpckhwd %%mm3, %%mm3 \n\t"\
  452. "paddw %%mm7, %%mm0 \n\t"\
  453. "paddw %%mm7, %%mm6 \n\t"\
  454. "paddw %%mm7, %%mm3 \n\t"\
  455. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  456. "packuswb %%mm0, %%mm2 \n\t"\
  457. "packuswb %%mm6, %%mm5 \n\t"\
  458. "packuswb %%mm3, %%mm4 \n\t"\
  459. #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
  460. #define YSCALEYUV2RGB(index, c) \
  461. REAL_YSCALEYUV2RGB_UV(index, c) \
  462. REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
  463. REAL_YSCALEYUV2RGB_COEFF(c)
  464. #define REAL_YSCALEYUV2PACKED1(index, c) \
  465. "xor "#index", "#index" \n\t"\
  466. ".p2align 4 \n\t"\
  467. "1: \n\t"\
  468. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  469. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  470. "psraw $7, %%mm3 \n\t" \
  471. "psraw $7, %%mm4 \n\t" \
  472. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  473. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  474. "psraw $7, %%mm1 \n\t" \
  475. "psraw $7, %%mm7 \n\t" \
  476. #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
  477. #define REAL_YSCALEYUV2RGB1(index, c) \
  478. "xor "#index", "#index" \n\t"\
  479. ".p2align 4 \n\t"\
  480. "1: \n\t"\
  481. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  482. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  483. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  484. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  485. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  486. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  487. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  488. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  489. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  490. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  491. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  492. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  493. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  494. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  495. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  496. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  497. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  498. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  499. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  500. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  501. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  502. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  503. "paddw %%mm3, %%mm4 \n\t"\
  504. "movq %%mm2, %%mm0 \n\t"\
  505. "movq %%mm5, %%mm6 \n\t"\
  506. "movq %%mm4, %%mm3 \n\t"\
  507. "punpcklwd %%mm2, %%mm2 \n\t"\
  508. "punpcklwd %%mm5, %%mm5 \n\t"\
  509. "punpcklwd %%mm4, %%mm4 \n\t"\
  510. "paddw %%mm1, %%mm2 \n\t"\
  511. "paddw %%mm1, %%mm5 \n\t"\
  512. "paddw %%mm1, %%mm4 \n\t"\
  513. "punpckhwd %%mm0, %%mm0 \n\t"\
  514. "punpckhwd %%mm6, %%mm6 \n\t"\
  515. "punpckhwd %%mm3, %%mm3 \n\t"\
  516. "paddw %%mm7, %%mm0 \n\t"\
  517. "paddw %%mm7, %%mm6 \n\t"\
  518. "paddw %%mm7, %%mm3 \n\t"\
  519. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  520. "packuswb %%mm0, %%mm2 \n\t"\
  521. "packuswb %%mm6, %%mm5 \n\t"\
  522. "packuswb %%mm3, %%mm4 \n\t"\
  523. #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
  524. #define REAL_YSCALEYUV2PACKED1b(index, c) \
  525. "xor "#index", "#index" \n\t"\
  526. ".p2align 4 \n\t"\
  527. "1: \n\t"\
  528. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  529. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  530. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  531. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  532. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  533. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  534. "psrlw $8, %%mm3 \n\t" \
  535. "psrlw $8, %%mm4 \n\t" \
  536. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  537. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  538. "psraw $7, %%mm1 \n\t" \
  539. "psraw $7, %%mm7 \n\t"
  540. #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
  541. // do vertical chrominance interpolation
  542. #define REAL_YSCALEYUV2RGB1b(index, c) \
  543. "xor "#index", "#index" \n\t"\
  544. ".p2align 4 \n\t"\
  545. "1: \n\t"\
  546. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  547. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  548. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  549. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  550. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  551. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  552. "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
  553. "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
  554. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  555. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  556. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  557. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  558. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  559. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  560. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  561. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  562. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  563. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  564. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  565. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  566. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  567. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  568. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  569. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  570. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  571. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  572. "paddw %%mm3, %%mm4 \n\t"\
  573. "movq %%mm2, %%mm0 \n\t"\
  574. "movq %%mm5, %%mm6 \n\t"\
  575. "movq %%mm4, %%mm3 \n\t"\
  576. "punpcklwd %%mm2, %%mm2 \n\t"\
  577. "punpcklwd %%mm5, %%mm5 \n\t"\
  578. "punpcklwd %%mm4, %%mm4 \n\t"\
  579. "paddw %%mm1, %%mm2 \n\t"\
  580. "paddw %%mm1, %%mm5 \n\t"\
  581. "paddw %%mm1, %%mm4 \n\t"\
  582. "punpckhwd %%mm0, %%mm0 \n\t"\
  583. "punpckhwd %%mm6, %%mm6 \n\t"\
  584. "punpckhwd %%mm3, %%mm3 \n\t"\
  585. "paddw %%mm7, %%mm0 \n\t"\
  586. "paddw %%mm7, %%mm6 \n\t"\
  587. "paddw %%mm7, %%mm3 \n\t"\
  588. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  589. "packuswb %%mm0, %%mm2 \n\t"\
  590. "packuswb %%mm6, %%mm5 \n\t"\
  591. "packuswb %%mm3, %%mm4 \n\t"\
  592. #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
  593. #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
  594. "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
  595. "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
  596. "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
  597. "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
  598. "packuswb %%mm1, %%mm7 \n\t"
  599. #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
  600. #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
  601. "movq "#b", "#q2" \n\t" /* B */\
  602. "movq "#r", "#t" \n\t" /* R */\
  603. "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
  604. "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
  605. "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
  606. "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
  607. "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
  608. "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
  609. "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
  610. "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
  611. "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
  612. "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
  613. \
  614. MOVNTQ( q0, (dst, index, 4))\
  615. MOVNTQ( b, 8(dst, index, 4))\
  616. MOVNTQ( q2, 16(dst, index, 4))\
  617. MOVNTQ( q3, 24(dst, index, 4))\
  618. \
  619. "add $8, "#index" \n\t"\
  620. "cmp "#dstw", "#index" \n\t"\
  621. " jb 1b \n\t"
  622. #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
  623. #define REAL_WRITERGB16(dst, dstw, index) \
  624. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  625. "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
  626. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  627. "psrlq $3, %%mm2 \n\t"\
  628. \
  629. "movq %%mm2, %%mm1 \n\t"\
  630. "movq %%mm4, %%mm3 \n\t"\
  631. \
  632. "punpcklbw %%mm7, %%mm3 \n\t"\
  633. "punpcklbw %%mm5, %%mm2 \n\t"\
  634. "punpckhbw %%mm7, %%mm4 \n\t"\
  635. "punpckhbw %%mm5, %%mm1 \n\t"\
  636. \
  637. "psllq $3, %%mm3 \n\t"\
  638. "psllq $3, %%mm4 \n\t"\
  639. \
  640. "por %%mm3, %%mm2 \n\t"\
  641. "por %%mm4, %%mm1 \n\t"\
  642. \
  643. MOVNTQ(%%mm2, (dst, index, 2))\
  644. MOVNTQ(%%mm1, 8(dst, index, 2))\
  645. \
  646. "add $8, "#index" \n\t"\
  647. "cmp "#dstw", "#index" \n\t"\
  648. " jb 1b \n\t"
  649. #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
  650. #define REAL_WRITERGB15(dst, dstw, index) \
  651. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  652. "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
  653. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  654. "psrlq $3, %%mm2 \n\t"\
  655. "psrlq $1, %%mm5 \n\t"\
  656. \
  657. "movq %%mm2, %%mm1 \n\t"\
  658. "movq %%mm4, %%mm3 \n\t"\
  659. \
  660. "punpcklbw %%mm7, %%mm3 \n\t"\
  661. "punpcklbw %%mm5, %%mm2 \n\t"\
  662. "punpckhbw %%mm7, %%mm4 \n\t"\
  663. "punpckhbw %%mm5, %%mm1 \n\t"\
  664. \
  665. "psllq $2, %%mm3 \n\t"\
  666. "psllq $2, %%mm4 \n\t"\
  667. \
  668. "por %%mm3, %%mm2 \n\t"\
  669. "por %%mm4, %%mm1 \n\t"\
  670. \
  671. MOVNTQ(%%mm2, (dst, index, 2))\
  672. MOVNTQ(%%mm1, 8(dst, index, 2))\
  673. \
  674. "add $8, "#index" \n\t"\
  675. "cmp "#dstw", "#index" \n\t"\
  676. " jb 1b \n\t"
  677. #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
  678. #define WRITEBGR24OLD(dst, dstw, index) \
  679. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  680. "movq %%mm2, %%mm1 \n\t" /* B */\
  681. "movq %%mm5, %%mm6 \n\t" /* R */\
  682. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  683. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  684. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  685. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  686. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  687. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  688. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  689. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  690. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  691. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  692. \
  693. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  694. "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
  695. "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
  696. "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
  697. "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
  698. "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
  699. "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
  700. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  701. \
  702. "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  703. "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
  704. "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
  705. "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
  706. "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
  707. "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
  708. "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
  709. "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
  710. "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
  711. "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
  712. "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
  713. "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
  714. "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
  715. \
  716. "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
  717. "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
  718. "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
  719. "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
  720. "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
  721. "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
  722. "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
  723. "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
  724. \
  725. MOVNTQ(%%mm0, (dst))\
  726. MOVNTQ(%%mm2, 8(dst))\
  727. MOVNTQ(%%mm3, 16(dst))\
  728. "add $24, "#dst" \n\t"\
  729. \
  730. "add $8, "#index" \n\t"\
  731. "cmp "#dstw", "#index" \n\t"\
  732. " jb 1b \n\t"
  733. #define WRITEBGR24MMX(dst, dstw, index) \
  734. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  735. "movq %%mm2, %%mm1 \n\t" /* B */\
  736. "movq %%mm5, %%mm6 \n\t" /* R */\
  737. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  738. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  739. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  740. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  741. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  742. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  743. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  744. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  745. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  746. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  747. \
  748. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  749. "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
  750. "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
  751. "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
  752. \
  753. "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
  754. "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
  755. "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
  756. "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
  757. \
  758. "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
  759. "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
  760. "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
  761. "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
  762. \
  763. "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
  764. "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
  765. "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
  766. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  767. MOVNTQ(%%mm0, (dst))\
  768. \
  769. "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
  770. "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
  771. "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
  772. "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
  773. MOVNTQ(%%mm6, 8(dst))\
  774. \
  775. "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
  776. "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
  777. "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
  778. MOVNTQ(%%mm5, 16(dst))\
  779. \
  780. "add $24, "#dst" \n\t"\
  781. \
  782. "add $8, "#index" \n\t"\
  783. "cmp "#dstw", "#index" \n\t"\
  784. " jb 1b \n\t"
  785. #define WRITEBGR24MMX2(dst, dstw, index) \
  786. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  787. "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
  788. "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
  789. "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
  790. "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
  791. "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
  792. \
  793. "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
  794. "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
  795. "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
  796. \
  797. "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
  798. "por %%mm1, %%mm6 \n\t"\
  799. "por %%mm3, %%mm6 \n\t"\
  800. MOVNTQ(%%mm6, (dst))\
  801. \
  802. "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
  803. "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
  804. "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
  805. "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
  806. \
  807. "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
  808. "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
  809. "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
  810. \
  811. "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
  812. "por %%mm3, %%mm6 \n\t"\
  813. MOVNTQ(%%mm6, 8(dst))\
  814. \
  815. "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
  816. "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
  817. "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
  818. \
  819. "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
  820. "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
  821. "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
  822. \
  823. "por %%mm1, %%mm3 \n\t"\
  824. "por %%mm3, %%mm6 \n\t"\
  825. MOVNTQ(%%mm6, 16(dst))\
  826. \
  827. "add $24, "#dst" \n\t"\
  828. \
  829. "add $8, "#index" \n\t"\
  830. "cmp "#dstw", "#index" \n\t"\
  831. " jb 1b \n\t"
  832. #if COMPILE_TEMPLATE_MMX2
  833. #undef WRITEBGR24
  834. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
  835. #else
  836. #undef WRITEBGR24
  837. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
  838. #endif
  839. #define REAL_WRITEYUY2(dst, dstw, index) \
  840. "packuswb %%mm3, %%mm3 \n\t"\
  841. "packuswb %%mm4, %%mm4 \n\t"\
  842. "packuswb %%mm7, %%mm1 \n\t"\
  843. "punpcklbw %%mm4, %%mm3 \n\t"\
  844. "movq %%mm1, %%mm7 \n\t"\
  845. "punpcklbw %%mm3, %%mm1 \n\t"\
  846. "punpckhbw %%mm3, %%mm7 \n\t"\
  847. \
  848. MOVNTQ(%%mm1, (dst, index, 2))\
  849. MOVNTQ(%%mm7, 8(dst, index, 2))\
  850. \
  851. "add $8, "#index" \n\t"\
  852. "cmp "#dstw", "#index" \n\t"\
  853. " jb 1b \n\t"
  854. #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
  855. static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
  856. const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
  857. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
  858. {
  859. #if COMPILE_TEMPLATE_MMX
  860. if(!(c->flags & SWS_BITEXACT)) {
  861. if (c->flags & SWS_ACCURATE_RND) {
  862. if (uDest) {
  863. YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
  864. YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
  865. }
  866. if (CONFIG_SWSCALE_ALPHA && aDest) {
  867. YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
  868. }
  869. YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
  870. } else {
  871. if (uDest) {
  872. YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
  873. YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
  874. }
  875. if (CONFIG_SWSCALE_ALPHA && aDest) {
  876. YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
  877. }
  878. YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
  879. }
  880. return;
  881. }
  882. #endif
  883. #if COMPILE_TEMPLATE_ALTIVEC
  884. yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
  885. chrFilter, chrSrc, chrFilterSize,
  886. dest, uDest, vDest, dstW, chrDstW);
  887. #else //COMPILE_TEMPLATE_ALTIVEC
  888. yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
  889. chrFilter, chrSrc, chrFilterSize,
  890. alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
  891. #endif //!COMPILE_TEMPLATE_ALTIVEC
  892. }
  893. static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
  894. const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
  895. uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
  896. {
  897. yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
  898. chrFilter, chrSrc, chrFilterSize,
  899. dest, uDest, dstW, chrDstW, dstFormat);
  900. }
  901. static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
  902. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
  903. {
  904. int i;
  905. #if COMPILE_TEMPLATE_MMX
  906. if(!(c->flags & SWS_BITEXACT)) {
  907. long p= 4;
  908. const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
  909. uint8_t *dst[4]= {aDest, dest, uDest, vDest};
  910. x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
  911. if (c->flags & SWS_ACCURATE_RND) {
  912. while(p--) {
  913. if (dst[p]) {
  914. __asm__ volatile(
  915. YSCALEYUV2YV121_ACCURATE
  916. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  917. "g" (-counter[p])
  918. : "%"REG_a
  919. );
  920. }
  921. }
  922. } else {
  923. while(p--) {
  924. if (dst[p]) {
  925. __asm__ volatile(
  926. YSCALEYUV2YV121
  927. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  928. "g" (-counter[p])
  929. : "%"REG_a
  930. );
  931. }
  932. }
  933. }
  934. return;
  935. }
  936. #endif
  937. for (i=0; i<dstW; i++) {
  938. int val= (lumSrc[i]+64)>>7;
  939. if (val&256) {
  940. if (val<0) val=0;
  941. else val=255;
  942. }
  943. dest[i]= val;
  944. }
  945. if (uDest)
  946. for (i=0; i<chrDstW; i++) {
  947. int u=(chrSrc[i ]+64)>>7;
  948. int v=(chrSrc[i + VOFW]+64)>>7;
  949. if ((u|v)&256) {
  950. if (u<0) u=0;
  951. else if (u>255) u=255;
  952. if (v<0) v=0;
  953. else if (v>255) v=255;
  954. }
  955. uDest[i]= u;
  956. vDest[i]= v;
  957. }
  958. if (CONFIG_SWSCALE_ALPHA && aDest)
  959. for (i=0; i<dstW; i++) {
  960. int val= (alpSrc[i]+64)>>7;
  961. aDest[i]= av_clip_uint8(val);
  962. }
  963. }
  964. /**
  965. * vertical scale YV12 to RGB
  966. */
  967. static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
  968. const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
  969. const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
  970. {
  971. #if COMPILE_TEMPLATE_MMX
  972. x86_reg dummy=0;
  973. x86_reg dstW_reg = dstW;
  974. if(!(c->flags & SWS_BITEXACT)) {
  975. if (c->flags & SWS_ACCURATE_RND) {
  976. switch(c->dstFormat) {
  977. case PIX_FMT_RGB32:
  978. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  979. YSCALEYUV2PACKEDX_ACCURATE
  980. YSCALEYUV2RGBX
  981. "movq %%mm2, "U_TEMP"(%0) \n\t"
  982. "movq %%mm4, "V_TEMP"(%0) \n\t"
  983. "movq %%mm5, "Y_TEMP"(%0) \n\t"
  984. YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
  985. "movq "Y_TEMP"(%0), %%mm5 \n\t"
  986. "psraw $3, %%mm1 \n\t"
  987. "psraw $3, %%mm7 \n\t"
  988. "packuswb %%mm7, %%mm1 \n\t"
  989. WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
  990. YSCALEYUV2PACKEDX_END
  991. } else {
  992. YSCALEYUV2PACKEDX_ACCURATE
  993. YSCALEYUV2RGBX
  994. "pcmpeqd %%mm7, %%mm7 \n\t"
  995. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  996. YSCALEYUV2PACKEDX_END
  997. }
  998. return;
  999. case PIX_FMT_BGR24:
  1000. YSCALEYUV2PACKEDX_ACCURATE
  1001. YSCALEYUV2RGBX
  1002. "pxor %%mm7, %%mm7 \n\t"
  1003. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
  1004. "add %4, %%"REG_c" \n\t"
  1005. WRITEBGR24(%%REGc, %5, %%REGa)
  1006. :: "r" (&c->redDither),
  1007. "m" (dummy), "m" (dummy), "m" (dummy),
  1008. "r" (dest), "m" (dstW_reg)
  1009. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  1010. );
  1011. return;
  1012. case PIX_FMT_RGB555:
  1013. YSCALEYUV2PACKEDX_ACCURATE
  1014. YSCALEYUV2RGBX
  1015. "pxor %%mm7, %%mm7 \n\t"
  1016. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1017. #ifdef DITHER1XBPP
  1018. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  1019. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  1020. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  1021. #endif
  1022. WRITERGB15(%4, %5, %%REGa)
  1023. YSCALEYUV2PACKEDX_END
  1024. return;
  1025. case PIX_FMT_RGB565:
  1026. YSCALEYUV2PACKEDX_ACCURATE
  1027. YSCALEYUV2RGBX
  1028. "pxor %%mm7, %%mm7 \n\t"
  1029. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1030. #ifdef DITHER1XBPP
  1031. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  1032. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  1033. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  1034. #endif
  1035. WRITERGB16(%4, %5, %%REGa)
  1036. YSCALEYUV2PACKEDX_END
  1037. return;
  1038. case PIX_FMT_YUYV422:
  1039. YSCALEYUV2PACKEDX_ACCURATE
  1040. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1041. "psraw $3, %%mm3 \n\t"
  1042. "psraw $3, %%mm4 \n\t"
  1043. "psraw $3, %%mm1 \n\t"
  1044. "psraw $3, %%mm7 \n\t"
  1045. WRITEYUY2(%4, %5, %%REGa)
  1046. YSCALEYUV2PACKEDX_END
  1047. return;
  1048. }
  1049. } else {
  1050. switch(c->dstFormat) {
  1051. case PIX_FMT_RGB32:
  1052. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1053. YSCALEYUV2PACKEDX
  1054. YSCALEYUV2RGBX
  1055. YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
  1056. "psraw $3, %%mm1 \n\t"
  1057. "psraw $3, %%mm7 \n\t"
  1058. "packuswb %%mm7, %%mm1 \n\t"
  1059. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1060. YSCALEYUV2PACKEDX_END
  1061. } else {
  1062. YSCALEYUV2PACKEDX
  1063. YSCALEYUV2RGBX
  1064. "pcmpeqd %%mm7, %%mm7 \n\t"
  1065. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1066. YSCALEYUV2PACKEDX_END
  1067. }
  1068. return;
  1069. case PIX_FMT_BGR24:
  1070. YSCALEYUV2PACKEDX
  1071. YSCALEYUV2RGBX
  1072. "pxor %%mm7, %%mm7 \n\t"
  1073. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
  1074. "add %4, %%"REG_c" \n\t"
  1075. WRITEBGR24(%%REGc, %5, %%REGa)
  1076. :: "r" (&c->redDither),
  1077. "m" (dummy), "m" (dummy), "m" (dummy),
  1078. "r" (dest), "m" (dstW_reg)
  1079. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  1080. );
  1081. return;
  1082. case PIX_FMT_RGB555:
  1083. YSCALEYUV2PACKEDX
  1084. YSCALEYUV2RGBX
  1085. "pxor %%mm7, %%mm7 \n\t"
  1086. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1087. #ifdef DITHER1XBPP
  1088. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  1089. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  1090. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  1091. #endif
  1092. WRITERGB15(%4, %5, %%REGa)
  1093. YSCALEYUV2PACKEDX_END
  1094. return;
  1095. case PIX_FMT_RGB565:
  1096. YSCALEYUV2PACKEDX
  1097. YSCALEYUV2RGBX
  1098. "pxor %%mm7, %%mm7 \n\t"
  1099. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1100. #ifdef DITHER1XBPP
  1101. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  1102. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  1103. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  1104. #endif
  1105. WRITERGB16(%4, %5, %%REGa)
  1106. YSCALEYUV2PACKEDX_END
  1107. return;
  1108. case PIX_FMT_YUYV422:
  1109. YSCALEYUV2PACKEDX
  1110. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1111. "psraw $3, %%mm3 \n\t"
  1112. "psraw $3, %%mm4 \n\t"
  1113. "psraw $3, %%mm1 \n\t"
  1114. "psraw $3, %%mm7 \n\t"
  1115. WRITEYUY2(%4, %5, %%REGa)
  1116. YSCALEYUV2PACKEDX_END
  1117. return;
  1118. }
  1119. }
  1120. }
  1121. #endif /* COMPILE_TEMPLATE_MMX */
  1122. #if COMPILE_TEMPLATE_ALTIVEC
  1123. /* The following list of supported dstFormat values should
  1124. match what's found in the body of ff_yuv2packedX_altivec() */
  1125. if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
  1126. (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
  1127. c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
  1128. c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
  1129. ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
  1130. chrFilter, chrSrc, chrFilterSize,
  1131. dest, dstW, dstY);
  1132. else
  1133. #endif
  1134. yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
  1135. chrFilter, chrSrc, chrFilterSize,
  1136. alpSrc, dest, dstW, dstY);
  1137. }
  1138. /**
  1139. * vertical bilinear scale YV12 to RGB
  1140. */
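/* Vertical blend of two adjacent source lines: yalpha and uvalpha are 12-bit
 * weights for the second line (buf1/uvbuf1), while yalpha1 = 4095 - yalpha and
 * uvalpha1 = 4095 - uvalpha (set just below) weight the first line. */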
  1141. static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
  1142. const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
  1143. {
  1144. int yalpha1=4095- yalpha;
  1145. int uvalpha1=4095-uvalpha;
  1146. int i;
  1147. #if COMPILE_TEMPLATE_MMX
  1148. if(!(c->flags & SWS_BITEXACT)) {
  1149. switch(c->dstFormat) {
  1150. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  1151. case PIX_FMT_RGB32:
  1152. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1153. #if ARCH_X86_64
  1154. __asm__ volatile(
  1155. YSCALEYUV2RGB(%%r8, %5)
  1156. YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
  1157. "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1158. "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1159. "packuswb %%mm7, %%mm1 \n\t"
  1160. WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1161. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
  1162. "a" (&c->redDither)
  1163. ,"r" (abuf0), "r" (abuf1)
  1164. : "%r8"
  1165. );
  1166. #else
  1167. c->u_temp=(intptr_t)abuf0;
  1168. c->v_temp=(intptr_t)abuf1;
  1169. __asm__ volatile(
  1170. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1171. "mov %4, %%"REG_b" \n\t"
  1172. "push %%"REG_BP" \n\t"
  1173. YSCALEYUV2RGB(%%REGBP, %5)
  1174. "push %0 \n\t"
  1175. "push %1 \n\t"
  1176. "mov "U_TEMP"(%5), %0 \n\t"
  1177. "mov "V_TEMP"(%5), %1 \n\t"
  1178. YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
  1179. "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1180. "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1181. "packuswb %%mm7, %%mm1 \n\t"
  1182. "pop %1 \n\t"
  1183. "pop %0 \n\t"
  1184. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1185. "pop %%"REG_BP" \n\t"
  1186. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1187. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1188. "a" (&c->redDither)
  1189. );
  1190. #endif
  1191. } else {
  1192. __asm__ volatile(
  1193. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1194. "mov %4, %%"REG_b" \n\t"
  1195. "push %%"REG_BP" \n\t"
  1196. YSCALEYUV2RGB(%%REGBP, %5)
  1197. "pcmpeqd %%mm7, %%mm7 \n\t"
  1198. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1199. "pop %%"REG_BP" \n\t"
  1200. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1201. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1202. "a" (&c->redDither)
  1203. );
  1204. }
  1205. return;
  1206. case PIX_FMT_BGR24:
  1207. __asm__ volatile(
  1208. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1209. "mov %4, %%"REG_b" \n\t"
  1210. "push %%"REG_BP" \n\t"
  1211. YSCALEYUV2RGB(%%REGBP, %5)
  1212. "pxor %%mm7, %%mm7 \n\t"
  1213. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1214. "pop %%"REG_BP" \n\t"
  1215. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1216. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1217. "a" (&c->redDither)
  1218. );
  1219. return;
  1220. case PIX_FMT_RGB555:
  1221. __asm__ volatile(
  1222. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1223. "mov %4, %%"REG_b" \n\t"
  1224. "push %%"REG_BP" \n\t"
  1225. YSCALEYUV2RGB(%%REGBP, %5)
  1226. "pxor %%mm7, %%mm7 \n\t"
  1227. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1228. #ifdef DITHER1XBPP
  1229. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1230. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1231. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1232. #endif
  1233. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1234. "pop %%"REG_BP" \n\t"
  1235. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1236. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1237. "a" (&c->redDither)
  1238. );
  1239. return;
  1240. case PIX_FMT_RGB565:
  1241. __asm__ volatile(
  1242. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1243. "mov %4, %%"REG_b" \n\t"
  1244. "push %%"REG_BP" \n\t"
  1245. YSCALEYUV2RGB(%%REGBP, %5)
  1246. "pxor %%mm7, %%mm7 \n\t"
  1247. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1248. #ifdef DITHER1XBPP
  1249. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1250. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1251. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1252. #endif
  1253. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1254. "pop %%"REG_BP" \n\t"
  1255. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1256. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1257. "a" (&c->redDither)
  1258. );
  1259. return;
  1260. case PIX_FMT_YUYV422:
  1261. __asm__ volatile(
  1262. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1263. "mov %4, %%"REG_b" \n\t"
  1264. "push %%"REG_BP" \n\t"
  1265. YSCALEYUV2PACKED(%%REGBP, %5)
  1266. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1267. "pop %%"REG_BP" \n\t"
  1268. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1269. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1270. "a" (&c->redDither)
  1271. );
  1272. return;
  1273. default: break;
  1274. }
  1275. }
  1276. #endif //COMPILE_TEMPLATE_MMX
  1277. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
  1278. }
  1279. /**
  1280. * YV12 to RGB without scaling or interpolating
  1281. */
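/* Single-line output path: only buf0 is used for luma (buf1 aliases it);
 * for chroma, the uvalpha < 2048 branch below takes uvbuf0 alone (the faster
 * "1" variants), otherwise uvbuf0 and uvbuf1 are averaged (the "1b" variants). */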
  1282. static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
  1283. const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
  1284. {
  1285. const int yalpha1=0;
  1286. int i;
  1287. const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1288. const int yalpha= 4096; //FIXME ...
  1289. if (flags&SWS_FULL_CHR_H_INT) {
  1290. c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
  1291. return;
  1292. }
  1293. #if COMPILE_TEMPLATE_MMX
  1294. if(!(flags & SWS_BITEXACT)) {
  1295. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1296. switch(dstFormat) {
  1297. case PIX_FMT_RGB32:
  1298. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1299. __asm__ volatile(
  1300. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1301. "mov %4, %%"REG_b" \n\t"
  1302. "push %%"REG_BP" \n\t"
  1303. YSCALEYUV2RGB1(%%REGBP, %5)
  1304. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1305. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1306. "pop %%"REG_BP" \n\t"
  1307. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1308. :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1309. "a" (&c->redDither)
  1310. );
  1311. } else {
  1312. __asm__ volatile(
  1313. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1314. "mov %4, %%"REG_b" \n\t"
  1315. "push %%"REG_BP" \n\t"
  1316. YSCALEYUV2RGB1(%%REGBP, %5)
  1317. "pcmpeqd %%mm7, %%mm7 \n\t"
  1318. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1319. "pop %%"REG_BP" \n\t"
  1320. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1321. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1322. "a" (&c->redDither)
  1323. );
  1324. }
  1325. return;
  1326. case PIX_FMT_BGR24:
  1327. __asm__ volatile(
  1328. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1329. "mov %4, %%"REG_b" \n\t"
  1330. "push %%"REG_BP" \n\t"
  1331. YSCALEYUV2RGB1(%%REGBP, %5)
  1332. "pxor %%mm7, %%mm7 \n\t"
  1333. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1334. "pop %%"REG_BP" \n\t"
  1335. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1336. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1337. "a" (&c->redDither)
  1338. );
  1339. return;
  1340. case PIX_FMT_RGB555:
  1341. __asm__ volatile(
  1342. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1343. "mov %4, %%"REG_b" \n\t"
  1344. "push %%"REG_BP" \n\t"
  1345. YSCALEYUV2RGB1(%%REGBP, %5)
  1346. "pxor %%mm7, %%mm7 \n\t"
  1347. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1348. #ifdef DITHER1XBPP
  1349. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1350. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1351. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1352. #endif
  1353. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1354. "pop %%"REG_BP" \n\t"
  1355. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1356. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1357. "a" (&c->redDither)
  1358. );
  1359. return;
  1360. case PIX_FMT_RGB565:
  1361. __asm__ volatile(
  1362. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1363. "mov %4, %%"REG_b" \n\t"
  1364. "push %%"REG_BP" \n\t"
  1365. YSCALEYUV2RGB1(%%REGBP, %5)
  1366. "pxor %%mm7, %%mm7 \n\t"
  1367. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1368. #ifdef DITHER1XBPP
  1369. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1370. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1371. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1372. #endif
  1373. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1374. "pop %%"REG_BP" \n\t"
  1375. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1376. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1377. "a" (&c->redDither)
  1378. );
  1379. return;
  1380. case PIX_FMT_YUYV422:
  1381. __asm__ volatile(
  1382. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1383. "mov %4, %%"REG_b" \n\t"
  1384. "push %%"REG_BP" \n\t"
  1385. YSCALEYUV2PACKED1(%%REGBP, %5)
  1386. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1387. "pop %%"REG_BP" \n\t"
  1388. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1389. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1390. "a" (&c->redDither)
  1391. );
  1392. return;
  1393. }
  1394. } else {
  1395. switch(dstFormat) {
  1396. case PIX_FMT_RGB32:
  1397. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1398. __asm__ volatile(
  1399. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1400. "mov %4, %%"REG_b" \n\t"
  1401. "push %%"REG_BP" \n\t"
  1402. YSCALEYUV2RGB1b(%%REGBP, %5)
  1403. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1404. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1405. "pop %%"REG_BP" \n\t"
  1406. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1407. :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1408. "a" (&c->redDither)
  1409. );
  1410. } else {
  1411. __asm__ volatile(
  1412. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1413. "mov %4, %%"REG_b" \n\t"
  1414. "push %%"REG_BP" \n\t"
  1415. YSCALEYUV2RGB1b(%%REGBP, %5)
  1416. "pcmpeqd %%mm7, %%mm7 \n\t"
  1417. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1418. "pop %%"REG_BP" \n\t"
  1419. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1420. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1421. "a" (&c->redDither)
  1422. );
  1423. }
  1424. return;
  1425. case PIX_FMT_BGR24:
  1426. __asm__ volatile(
  1427. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1428. "mov %4, %%"REG_b" \n\t"
  1429. "push %%"REG_BP" \n\t"
  1430. YSCALEYUV2RGB1b(%%REGBP, %5)
  1431. "pxor %%mm7, %%mm7 \n\t"
  1432. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1433. "pop %%"REG_BP" \n\t"
  1434. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1435. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1436. "a" (&c->redDither)
  1437. );
  1438. return;
  1439. case PIX_FMT_RGB555:
  1440. __asm__ volatile(
  1441. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1442. "mov %4, %%"REG_b" \n\t"
  1443. "push %%"REG_BP" \n\t"
  1444. YSCALEYUV2RGB1b(%%REGBP, %5)
  1445. "pxor %%mm7, %%mm7 \n\t"
  1446. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1447. #ifdef DITHER1XBPP
  1448. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1449. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1450. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1451. #endif
  1452. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1453. "pop %%"REG_BP" \n\t"
  1454. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1455. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1456. "a" (&c->redDither)
  1457. );
  1458. return;
  1459. case PIX_FMT_RGB565:
  1460. __asm__ volatile(
  1461. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1462. "mov %4, %%"REG_b" \n\t"
  1463. "push %%"REG_BP" \n\t"
  1464. YSCALEYUV2RGB1b(%%REGBP, %5)
  1465. "pxor %%mm7, %%mm7 \n\t"
  1466. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1467. #ifdef DITHER1XBPP
  1468. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1469. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1470. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1471. #endif
  1472. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1473. "pop %%"REG_BP" \n\t"
  1474. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1475. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1476. "a" (&c->redDither)
  1477. );
  1478. return;
  1479. case PIX_FMT_YUYV422:
  1480. __asm__ volatile(
  1481. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1482. "mov %4, %%"REG_b" \n\t"
  1483. "push %%"REG_BP" \n\t"
  1484. YSCALEYUV2PACKED1b(%%REGBP, %5)
  1485. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1486. "pop %%"REG_BP" \n\t"
  1487. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1488. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1489. "a" (&c->redDither)
  1490. );
  1491. return;
  1492. }
  1493. }
  1494. }
  1495. #endif /* COMPILE_TEMPLATE_MMX */
  1496. if (uvalpha < 2048) {
  1497. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
  1498. } else {
  1499. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
  1500. }
  1501. }
  1502. //FIXME yuy2* can read up to 7 samples too much
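/* YUYV stores pixels as Y0 U0 Y1 V0: luma is every second byte (src[2*i]) and
 * chroma alternates U/V on the odd bytes. The MMX path below masks with
 * bm01010101 to keep the even bytes and emits 8 luma samples per iteration. */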
  1503. static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1504. {
  1505. #if COMPILE_TEMPLATE_MMX
  1506. __asm__ volatile(
  1507. "movq "MANGLE(bm01010101)", %%mm2 \n\t"
  1508. "mov %0, %%"REG_a" \n\t"
  1509. "1: \n\t"
  1510. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1511. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1512. "pand %%mm2, %%mm0 \n\t"
  1513. "pand %%mm2, %%mm1 \n\t"
  1514. "packuswb %%mm1, %%mm0 \n\t"
  1515. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1516. "add $8, %%"REG_a" \n\t"
  1517. " js 1b \n\t"
  1518. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1519. : "%"REG_a
  1520. );
  1521. #else
  1522. int i;
  1523. for (i=0; i<width; i++)
  1524. dst[i]= src[2*i];
  1525. #endif
  1526. }
  1527. static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1528. {
  1529. #if COMPILE_TEMPLATE_MMX
  1530. __asm__ volatile(
  1531. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1532. "mov %0, %%"REG_a" \n\t"
  1533. "1: \n\t"
  1534. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1535. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1536. "psrlw $8, %%mm0 \n\t"
  1537. "psrlw $8, %%mm1 \n\t"
  1538. "packuswb %%mm1, %%mm0 \n\t"
  1539. "movq %%mm0, %%mm1 \n\t"
  1540. "psrlw $8, %%mm0 \n\t"
  1541. "pand %%mm4, %%mm1 \n\t"
  1542. "packuswb %%mm0, %%mm0 \n\t"
  1543. "packuswb %%mm1, %%mm1 \n\t"
  1544. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1545. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1546. "add $4, %%"REG_a" \n\t"
  1547. " js 1b \n\t"
  1548. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1549. : "%"REG_a
  1550. );
  1551. #else
  1552. int i;
  1553. for (i=0; i<width; i++) {
  1554. dstU[i]= src1[4*i + 1];
  1555. dstV[i]= src1[4*i + 3];
  1556. }
  1557. #endif
  1558. assert(src1 == src2);
  1559. }
  1560. static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1561. {
  1562. #if COMPILE_TEMPLATE_MMX
  1563. __asm__ volatile(
  1564. "mov %0, %%"REG_a" \n\t"
  1565. "1: \n\t"
  1566. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1567. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1568. "movq (%2, %%"REG_a",2), %%mm2 \n\t"
  1569. "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
  1570. "psrlw $8, %%mm0 \n\t"
  1571. "psrlw $8, %%mm1 \n\t"
  1572. "psrlw $8, %%mm2 \n\t"
  1573. "psrlw $8, %%mm3 \n\t"
  1574. "packuswb %%mm1, %%mm0 \n\t"
  1575. "packuswb %%mm3, %%mm2 \n\t"
  1576. "movq %%mm0, (%3, %%"REG_a") \n\t"
  1577. "movq %%mm2, (%4, %%"REG_a") \n\t"
  1578. "add $8, %%"REG_a" \n\t"
  1579. " js 1b \n\t"
  1580. : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
  1581. : "%"REG_a
  1582. );
  1583. #else
  1584. int i;
  1585. // FIXME I don't think this code is right for YUV444/422, since then h is not subsampled so
  1586. // we need to skip each second pixel. Same for BEToUV.
  1587. for (i=0; i<width; i++) {
  1588. dstU[i]= src1[2*i + 1];
  1589. dstV[i]= src2[2*i + 1];
  1590. }
  1591. #endif
  1592. }
1593. /* This is almost identical to the previous, and exists only because
1594. * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
  1595. static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1596. {
  1597. #if COMPILE_TEMPLATE_MMX
  1598. __asm__ volatile(
  1599. "mov %0, %%"REG_a" \n\t"
  1600. "1: \n\t"
  1601. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1602. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1603. "psrlw $8, %%mm0 \n\t"
  1604. "psrlw $8, %%mm1 \n\t"
  1605. "packuswb %%mm1, %%mm0 \n\t"
  1606. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1607. "add $8, %%"REG_a" \n\t"
  1608. " js 1b \n\t"
  1609. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1610. : "%"REG_a
  1611. );
  1612. #else
  1613. int i;
  1614. for (i=0; i<width; i++)
  1615. dst[i]= src[2*i+1];
  1616. #endif
  1617. }
  1618. static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1619. {
  1620. #if COMPILE_TEMPLATE_MMX
  1621. __asm__ volatile(
  1622. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1623. "mov %0, %%"REG_a" \n\t"
  1624. "1: \n\t"
  1625. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1626. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1627. "pand %%mm4, %%mm0 \n\t"
  1628. "pand %%mm4, %%mm1 \n\t"
  1629. "packuswb %%mm1, %%mm0 \n\t"
  1630. "movq %%mm0, %%mm1 \n\t"
  1631. "psrlw $8, %%mm0 \n\t"
  1632. "pand %%mm4, %%mm1 \n\t"
  1633. "packuswb %%mm0, %%mm0 \n\t"
  1634. "packuswb %%mm1, %%mm1 \n\t"
  1635. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1636. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1637. "add $4, %%"REG_a" \n\t"
  1638. " js 1b \n\t"
  1639. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1640. : "%"REG_a
  1641. );
  1642. #else
  1643. int i;
  1644. for (i=0; i<width; i++) {
  1645. dstU[i]= src1[4*i + 0];
  1646. dstV[i]= src1[4*i + 2];
  1647. }
  1648. #endif
  1649. assert(src1 == src2);
  1650. }
  1651. static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1652. {
  1653. #if COMPILE_TEMPLATE_MMX
  1654. __asm__ volatile(
  1655. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1656. "mov %0, %%"REG_a" \n\t"
  1657. "1: \n\t"
  1658. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1659. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1660. "movq (%2, %%"REG_a",2), %%mm2 \n\t"
  1661. "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
  1662. "pand %%mm4, %%mm0 \n\t"
  1663. "pand %%mm4, %%mm1 \n\t"
  1664. "pand %%mm4, %%mm2 \n\t"
  1665. "pand %%mm4, %%mm3 \n\t"
  1666. "packuswb %%mm1, %%mm0 \n\t"
  1667. "packuswb %%mm3, %%mm2 \n\t"
  1668. "movq %%mm0, (%3, %%"REG_a") \n\t"
  1669. "movq %%mm2, (%4, %%"REG_a") \n\t"
  1670. "add $8, %%"REG_a" \n\t"
  1671. " js 1b \n\t"
  1672. : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
  1673. : "%"REG_a
  1674. );
  1675. #else
  1676. int i;
  1677. for (i=0; i<width; i++) {
  1678. dstU[i]= src1[2*i];
  1679. dstV[i]= src2[2*i];
  1680. }
  1681. #endif
  1682. }
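/* NV12/NV21 carry interleaved chroma in a single plane; nvXXtoUV splits the
 * packed UV pairs into two planar buffers, and the nv12ToUV/nv21ToUV wrappers
 * below simply swap the destination order. */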
  1683. static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
  1684. const uint8_t *src, long width)
  1685. {
  1686. #if COMPILE_TEMPLATE_MMX
  1687. __asm__ volatile(
  1688. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1689. "mov %0, %%"REG_a" \n\t"
  1690. "1: \n\t"
  1691. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1692. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1693. "movq %%mm0, %%mm2 \n\t"
  1694. "movq %%mm1, %%mm3 \n\t"
  1695. "pand %%mm4, %%mm0 \n\t"
  1696. "pand %%mm4, %%mm1 \n\t"
  1697. "psrlw $8, %%mm2 \n\t"
  1698. "psrlw $8, %%mm3 \n\t"
  1699. "packuswb %%mm1, %%mm0 \n\t"
  1700. "packuswb %%mm3, %%mm2 \n\t"
  1701. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1702. "movq %%mm2, (%3, %%"REG_a") \n\t"
  1703. "add $8, %%"REG_a" \n\t"
  1704. " js 1b \n\t"
  1705. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
  1706. : "%"REG_a
  1707. );
  1708. #else
  1709. int i;
  1710. for (i = 0; i < width; i++) {
  1711. dst1[i] = src[2*i+0];
  1712. dst2[i] = src[2*i+1];
  1713. }
  1714. #endif
  1715. }
  1716. static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
  1717. const uint8_t *src1, const uint8_t *src2,
  1718. long width, uint32_t *unused)
  1719. {
  1720. RENAME(nvXXtoUV)(dstU, dstV, src1, width);
  1721. }
  1722. static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
  1723. const uint8_t *src1, const uint8_t *src2,
  1724. long width, uint32_t *unused)
  1725. {
  1726. RENAME(nvXXtoUV)(dstV, dstU, src1, width);
  1727. }
  1728. // FIXME Maybe dither instead.
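/* YUV_NBPS generates LE/BE input converters for 9- and 10-bit planar YUV:
 * each sample is read with the given rfunc (AV_RL16/AV_RB16) and right-shifted
 * by depth-8 to produce the 8-bit values the rest of the chain expects. */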
  1729. #ifndef YUV_NBPS
  1730. #define YUV_NBPS(depth, endianness, rfunc) \
  1731. static inline void endianness ## depth ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
  1732. const uint16_t *srcU, const uint16_t *srcV, \
  1733. long width, uint32_t *unused) \
  1734. { \
  1735. int i; \
  1736. for (i = 0; i < width; i++) { \
  1737. dstU[i] = rfunc(&srcU[i])>>(depth-8); \
  1738. dstV[i] = rfunc(&srcV[i])>>(depth-8); \
  1739. } \
  1740. } \
  1741. \
  1742. static inline void endianness ## depth ## ToY_c(uint8_t *dstY, const uint16_t *srcY, long width, uint32_t *unused) \
  1743. { \
  1744. int i; \
  1745. for (i = 0; i < width; i++) \
  1746. dstY[i] = rfunc(&srcY[i])>>(depth-8); \
1747. }
  1748. YUV_NBPS( 9, LE, AV_RL16)
  1749. YUV_NBPS( 9, BE, AV_RB16)
  1750. YUV_NBPS(10, LE, AV_RL16)
  1751. YUV_NBPS(10, BE, AV_RB16)
  1752. #endif // YUV_NBPS
  1753. #if COMPILE_TEMPLATE_MMX
  1754. static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
  1755. {
  1756. if(srcFormat == PIX_FMT_BGR24) {
  1757. __asm__ volatile(
  1758. "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
  1759. "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
  1760. :
  1761. );
  1762. } else {
  1763. __asm__ volatile(
  1764. "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
  1765. "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
  1766. :
  1767. );
  1768. }
  1769. __asm__ volatile(
  1770. "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
  1771. "mov %2, %%"REG_a" \n\t"
  1772. "pxor %%mm7, %%mm7 \n\t"
  1773. "1: \n\t"
  1774. PREFETCH" 64(%0) \n\t"
  1775. "movd (%0), %%mm0 \n\t"
  1776. "movd 2(%0), %%mm1 \n\t"
  1777. "movd 6(%0), %%mm2 \n\t"
  1778. "movd 8(%0), %%mm3 \n\t"
  1779. "add $12, %0 \n\t"
  1780. "punpcklbw %%mm7, %%mm0 \n\t"
  1781. "punpcklbw %%mm7, %%mm1 \n\t"
  1782. "punpcklbw %%mm7, %%mm2 \n\t"
  1783. "punpcklbw %%mm7, %%mm3 \n\t"
  1784. "pmaddwd %%mm5, %%mm0 \n\t"
  1785. "pmaddwd %%mm6, %%mm1 \n\t"
  1786. "pmaddwd %%mm5, %%mm2 \n\t"
  1787. "pmaddwd %%mm6, %%mm3 \n\t"
  1788. "paddd %%mm1, %%mm0 \n\t"
  1789. "paddd %%mm3, %%mm2 \n\t"
  1790. "paddd %%mm4, %%mm0 \n\t"
  1791. "paddd %%mm4, %%mm2 \n\t"
  1792. "psrad $15, %%mm0 \n\t"
  1793. "psrad $15, %%mm2 \n\t"
  1794. "packssdw %%mm2, %%mm0 \n\t"
  1795. "packuswb %%mm0, %%mm0 \n\t"
  1796. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1797. "add $4, %%"REG_a" \n\t"
  1798. " js 1b \n\t"
  1799. : "+r" (src)
  1800. : "r" (dst+width), "g" ((x86_reg)-width)
  1801. : "%"REG_a
  1802. );
  1803. }
  1804. static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
  1805. {
  1806. __asm__ volatile(
  1807. "movq 24(%4), %%mm6 \n\t"
  1808. "mov %3, %%"REG_a" \n\t"
  1809. "pxor %%mm7, %%mm7 \n\t"
  1810. "1: \n\t"
  1811. PREFETCH" 64(%0) \n\t"
  1812. "movd (%0), %%mm0 \n\t"
  1813. "movd 2(%0), %%mm1 \n\t"
  1814. "punpcklbw %%mm7, %%mm0 \n\t"
  1815. "punpcklbw %%mm7, %%mm1 \n\t"
  1816. "movq %%mm0, %%mm2 \n\t"
  1817. "movq %%mm1, %%mm3 \n\t"
  1818. "pmaddwd (%4), %%mm0 \n\t"
  1819. "pmaddwd 8(%4), %%mm1 \n\t"
  1820. "pmaddwd 16(%4), %%mm2 \n\t"
  1821. "pmaddwd %%mm6, %%mm3 \n\t"
  1822. "paddd %%mm1, %%mm0 \n\t"
  1823. "paddd %%mm3, %%mm2 \n\t"
  1824. "movd 6(%0), %%mm1 \n\t"
  1825. "movd 8(%0), %%mm3 \n\t"
  1826. "add $12, %0 \n\t"
  1827. "punpcklbw %%mm7, %%mm1 \n\t"
  1828. "punpcklbw %%mm7, %%mm3 \n\t"
  1829. "movq %%mm1, %%mm4 \n\t"
  1830. "movq %%mm3, %%mm5 \n\t"
  1831. "pmaddwd (%4), %%mm1 \n\t"
  1832. "pmaddwd 8(%4), %%mm3 \n\t"
  1833. "pmaddwd 16(%4), %%mm4 \n\t"
  1834. "pmaddwd %%mm6, %%mm5 \n\t"
  1835. "paddd %%mm3, %%mm1 \n\t"
  1836. "paddd %%mm5, %%mm4 \n\t"
  1837. "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
  1838. "paddd %%mm3, %%mm0 \n\t"
  1839. "paddd %%mm3, %%mm2 \n\t"
  1840. "paddd %%mm3, %%mm1 \n\t"
  1841. "paddd %%mm3, %%mm4 \n\t"
  1842. "psrad $15, %%mm0 \n\t"
  1843. "psrad $15, %%mm2 \n\t"
  1844. "psrad $15, %%mm1 \n\t"
  1845. "psrad $15, %%mm4 \n\t"
  1846. "packssdw %%mm1, %%mm0 \n\t"
  1847. "packssdw %%mm4, %%mm2 \n\t"
  1848. "packuswb %%mm0, %%mm0 \n\t"
  1849. "packuswb %%mm2, %%mm2 \n\t"
  1850. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1851. "movd %%mm2, (%2, %%"REG_a") \n\t"
  1852. "add $4, %%"REG_a" \n\t"
  1853. " js 1b \n\t"
  1854. : "+r" (src)
  1855. : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
  1856. : "%"REG_a
  1857. );
  1858. }
  1859. #endif
  1860. static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1861. {
  1862. #if COMPILE_TEMPLATE_MMX
  1863. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
  1864. #else
  1865. int i;
  1866. for (i=0; i<width; i++) {
  1867. int b= src[i*3+0];
  1868. int g= src[i*3+1];
  1869. int r= src[i*3+2];
  1870. dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
  1871. }
  1872. #endif /* COMPILE_TEMPLATE_MMX */
  1873. }
  1874. static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1875. {
  1876. #if COMPILE_TEMPLATE_MMX
  1877. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
  1878. #else
  1879. int i;
  1880. for (i=0; i<width; i++) {
  1881. int b= src1[3*i + 0];
  1882. int g= src1[3*i + 1];
  1883. int r= src1[3*i + 2];
  1884. dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1885. dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1886. }
  1887. #endif /* COMPILE_TEMPLATE_MMX */
  1888. assert(src1 == src2);
  1889. }
  1890. static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1891. {
  1892. int i;
  1893. for (i=0; i<width; i++) {
  1894. int b= src1[6*i + 0] + src1[6*i + 3];
  1895. int g= src1[6*i + 1] + src1[6*i + 4];
  1896. int r= src1[6*i + 2] + src1[6*i + 5];
  1897. dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1898. dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1899. }
  1900. assert(src1 == src2);
  1901. }
  1902. static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1903. {
  1904. #if COMPILE_TEMPLATE_MMX
  1905. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
  1906. #else
  1907. int i;
  1908. for (i=0; i<width; i++) {
  1909. int r= src[i*3+0];
  1910. int g= src[i*3+1];
  1911. int b= src[i*3+2];
  1912. dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
  1913. }
  1914. #endif
  1915. }
  1916. static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1917. {
  1918. #if COMPILE_TEMPLATE_MMX
  1919. assert(src1==src2);
  1920. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
  1921. #else
  1922. int i;
  1923. assert(src1==src2);
  1924. for (i=0; i<width; i++) {
  1925. int r= src1[3*i + 0];
  1926. int g= src1[3*i + 1];
  1927. int b= src1[3*i + 2];
  1928. dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1929. dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1930. }
  1931. #endif
  1932. }
  1933. static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1934. {
  1935. int i;
  1936. assert(src1==src2);
  1937. for (i=0; i<width; i++) {
  1938. int r= src1[6*i + 0] + src1[6*i + 3];
  1939. int g= src1[6*i + 1] + src1[6*i + 4];
  1940. int b= src1[6*i + 2] + src1[6*i + 5];
  1941. dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1942. dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1943. }
  1944. }
  1945. // bilinear / bicubic scaling
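/* hScale applies a generic horizontal FIR filter in 15-bit fixed point:
 * dst[i] = min((sum_j src[filterPos[i]+j] * filter[filterSize*i+j]) >> 7, 32767),
 * which is what the C fallback at the bottom of this function computes; the MMX
 * paths specialize filterSize 4 and 8 and produce two output samples per loop. */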
  1946. static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
  1947. const int16_t *filter, const int16_t *filterPos, long filterSize)
  1948. {
  1949. #if COMPILE_TEMPLATE_MMX
  1950. assert(filterSize % 4 == 0 && filterSize>0);
  1951. if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
  1952. x86_reg counter= -2*dstW;
  1953. filter-= counter*2;
  1954. filterPos-= counter/2;
  1955. dst-= counter/2;
  1956. __asm__ volatile(
  1957. #if defined(PIC)
  1958. "push %%"REG_b" \n\t"
  1959. #endif
  1960. "pxor %%mm7, %%mm7 \n\t"
  1961. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  1962. "mov %%"REG_a", %%"REG_BP" \n\t"
  1963. ".p2align 4 \n\t"
  1964. "1: \n\t"
  1965. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  1966. "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
  1967. "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
  1968. "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
  1969. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  1970. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  1971. "punpcklbw %%mm7, %%mm0 \n\t"
  1972. "punpcklbw %%mm7, %%mm2 \n\t"
  1973. "pmaddwd %%mm1, %%mm0 \n\t"
  1974. "pmaddwd %%mm2, %%mm3 \n\t"
  1975. "movq %%mm0, %%mm4 \n\t"
  1976. "punpckldq %%mm3, %%mm0 \n\t"
  1977. "punpckhdq %%mm3, %%mm4 \n\t"
  1978. "paddd %%mm4, %%mm0 \n\t"
  1979. "psrad $7, %%mm0 \n\t"
  1980. "packssdw %%mm0, %%mm0 \n\t"
  1981. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  1982. "add $4, %%"REG_BP" \n\t"
  1983. " jnc 1b \n\t"
  1984. "pop %%"REG_BP" \n\t"
  1985. #if defined(PIC)
  1986. "pop %%"REG_b" \n\t"
  1987. #endif
  1988. : "+a" (counter)
  1989. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  1990. #if !defined(PIC)
  1991. : "%"REG_b
  1992. #endif
  1993. );
  1994. } else if (filterSize==8) {
  1995. x86_reg counter= -2*dstW;
  1996. filter-= counter*4;
  1997. filterPos-= counter/2;
  1998. dst-= counter/2;
  1999. __asm__ volatile(
  2000. #if defined(PIC)
  2001. "push %%"REG_b" \n\t"
  2002. #endif
  2003. "pxor %%mm7, %%mm7 \n\t"
  2004. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  2005. "mov %%"REG_a", %%"REG_BP" \n\t"
  2006. ".p2align 4 \n\t"
  2007. "1: \n\t"
  2008. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  2009. "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
  2010. "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
  2011. "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
  2012. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  2013. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  2014. "punpcklbw %%mm7, %%mm0 \n\t"
  2015. "punpcklbw %%mm7, %%mm2 \n\t"
  2016. "pmaddwd %%mm1, %%mm0 \n\t"
  2017. "pmaddwd %%mm2, %%mm3 \n\t"
  2018. "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
  2019. "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
  2020. "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
  2021. "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
  2022. "punpcklbw %%mm7, %%mm4 \n\t"
  2023. "punpcklbw %%mm7, %%mm2 \n\t"
  2024. "pmaddwd %%mm1, %%mm4 \n\t"
  2025. "pmaddwd %%mm2, %%mm5 \n\t"
  2026. "paddd %%mm4, %%mm0 \n\t"
  2027. "paddd %%mm5, %%mm3 \n\t"
  2028. "movq %%mm0, %%mm4 \n\t"
  2029. "punpckldq %%mm3, %%mm0 \n\t"
  2030. "punpckhdq %%mm3, %%mm4 \n\t"
  2031. "paddd %%mm4, %%mm0 \n\t"
  2032. "psrad $7, %%mm0 \n\t"
  2033. "packssdw %%mm0, %%mm0 \n\t"
  2034. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  2035. "add $4, %%"REG_BP" \n\t"
  2036. " jnc 1b \n\t"
  2037. "pop %%"REG_BP" \n\t"
  2038. #if defined(PIC)
  2039. "pop %%"REG_b" \n\t"
  2040. #endif
  2041. : "+a" (counter)
  2042. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  2043. #if !defined(PIC)
  2044. : "%"REG_b
  2045. #endif
  2046. );
  2047. } else {
  2048. const uint8_t *offset = src+filterSize;
  2049. x86_reg counter= -2*dstW;
  2050. //filter-= counter*filterSize/2;
  2051. filterPos-= counter/2;
  2052. dst-= counter/2;
  2053. __asm__ volatile(
  2054. "pxor %%mm7, %%mm7 \n\t"
  2055. ".p2align 4 \n\t"
  2056. "1: \n\t"
  2057. "mov %2, %%"REG_c" \n\t"
  2058. "movzwl (%%"REG_c", %0), %%eax \n\t"
  2059. "movzwl 2(%%"REG_c", %0), %%edx \n\t"
  2060. "mov %5, %%"REG_c" \n\t"
  2061. "pxor %%mm4, %%mm4 \n\t"
  2062. "pxor %%mm5, %%mm5 \n\t"
  2063. "2: \n\t"
  2064. "movq (%1), %%mm1 \n\t"
  2065. "movq (%1, %6), %%mm3 \n\t"
  2066. "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
  2067. "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
  2068. "punpcklbw %%mm7, %%mm0 \n\t"
  2069. "punpcklbw %%mm7, %%mm2 \n\t"
  2070. "pmaddwd %%mm1, %%mm0 \n\t"
  2071. "pmaddwd %%mm2, %%mm3 \n\t"
  2072. "paddd %%mm3, %%mm5 \n\t"
  2073. "paddd %%mm0, %%mm4 \n\t"
  2074. "add $8, %1 \n\t"
  2075. "add $4, %%"REG_c" \n\t"
  2076. "cmp %4, %%"REG_c" \n\t"
  2077. " jb 2b \n\t"
  2078. "add %6, %1 \n\t"
  2079. "movq %%mm4, %%mm0 \n\t"
  2080. "punpckldq %%mm5, %%mm4 \n\t"
  2081. "punpckhdq %%mm5, %%mm0 \n\t"
  2082. "paddd %%mm0, %%mm4 \n\t"
  2083. "psrad $7, %%mm4 \n\t"
  2084. "packssdw %%mm4, %%mm4 \n\t"
  2085. "mov %3, %%"REG_a" \n\t"
  2086. "movd %%mm4, (%%"REG_a", %0) \n\t"
  2087. "add $4, %0 \n\t"
  2088. " jnc 1b \n\t"
  2089. : "+r" (counter), "+r" (filter)
  2090. : "m" (filterPos), "m" (dst), "m"(offset),
  2091. "m" (src), "r" ((x86_reg)filterSize*2)
  2092. : "%"REG_a, "%"REG_c, "%"REG_d
  2093. );
  2094. }
  2095. #else
  2096. #if COMPILE_TEMPLATE_ALTIVEC
  2097. hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
  2098. #else
  2099. int i;
  2100. for (i=0; i<dstW; i++) {
  2101. int j;
  2102. int srcPos= filterPos[i];
  2103. int val=0;
  2104. //printf("filterPos: %d\n", filterPos[i]);
  2105. for (j=0; j<filterSize; j++) {
  2106. //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
  2107. val += ((int)src[srcPos + j])*filter[filterSize*i + j];
  2108. }
  2109. //filter += hFilterSize;
  2110. dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
  2111. //dst[i] = val>>7;
  2112. }
  2113. #endif /* COMPILE_TEMPLATE_ALTIVEC */
  2114. #endif /* COMPILE_MMX */
  2115. }
2116. //FIXME all pal and rgb srcFormats could do this conversion as well
  2117. //FIXME all scalers more complex than bilinear could do half of this transform
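/* The four range helpers below convert the 15-bit fixed-point intermediate
 * samples between limited (MPEG, 16-235/240) and full (JPEG, 0-255) range;
 * the multipliers approximate 255/219 (luma) and 255/224 (chroma) and their
 * inverses, with the offsets re-centering around 16<<7 resp. 128<<7. */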
  2118. static void RENAME(chrRangeToJpeg)(int16_t *dst, int width)
  2119. {
  2120. int i;
  2121. for (i = 0; i < width; i++) {
  2122. dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
  2123. dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
  2124. }
  2125. }
  2126. static void RENAME(chrRangeFromJpeg)(int16_t *dst, int width)
  2127. {
  2128. int i;
  2129. for (i = 0; i < width; i++) {
  2130. dst[i ] = (dst[i ]*1799 + 4081085)>>11; //1469
  2131. dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
  2132. }
  2133. }
  2134. static void RENAME(lumRangeToJpeg)(int16_t *dst, int width)
  2135. {
  2136. int i;
  2137. for (i = 0; i < width; i++)
  2138. dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
  2139. }
  2140. static void RENAME(lumRangeFromJpeg)(int16_t *dst, int width)
  2141. {
  2142. int i;
  2143. for (i = 0; i < width; i++)
  2144. dst[i] = (dst[i]*14071 + 33561947)>>14;
  2145. }
  2146. #define FAST_BILINEAR_X86 \
  2147. "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
  2148. "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
  2149. "shll $16, %%edi \n\t" \
  2150. "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
  2151. "mov %1, %%"REG_D"\n\t" \
2152. "shrl $9, %%esi \n\t"
  2153. static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
  2154. long dstWidth, const uint8_t *src, int srcW,
  2155. int xInc)
  2156. {
  2157. #if ARCH_X86
  2158. #if COMPILE_TEMPLATE_MMX2
  2159. int32_t *filterPos = c->hLumFilterPos;
  2160. int16_t *filter = c->hLumFilter;
  2161. int canMMX2BeUsed = c->canMMX2BeUsed;
  2162. void *mmx2FilterCode= c->lumMmx2FilterCode;
  2163. int i;
  2164. #if defined(PIC)
  2165. DECLARE_ALIGNED(8, uint64_t, ebxsave);
  2166. #endif
  2167. if (canMMX2BeUsed) {
  2168. __asm__ volatile(
  2169. #if defined(PIC)
  2170. "mov %%"REG_b", %5 \n\t"
  2171. #endif
  2172. "pxor %%mm7, %%mm7 \n\t"
  2173. "mov %0, %%"REG_c" \n\t"
  2174. "mov %1, %%"REG_D" \n\t"
  2175. "mov %2, %%"REG_d" \n\t"
  2176. "mov %3, %%"REG_b" \n\t"
  2177. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2178. PREFETCH" (%%"REG_c") \n\t"
  2179. PREFETCH" 32(%%"REG_c") \n\t"
  2180. PREFETCH" 64(%%"REG_c") \n\t"
  2181. #if ARCH_X86_64
  2182. #define CALL_MMX2_FILTER_CODE \
  2183. "movl (%%"REG_b"), %%esi \n\t"\
  2184. "call *%4 \n\t"\
  2185. "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
  2186. "add %%"REG_S", %%"REG_c" \n\t"\
  2187. "add %%"REG_a", %%"REG_D" \n\t"\
2188. "xor %%"REG_a", %%"REG_a" \n\t"
  2189. #else
  2190. #define CALL_MMX2_FILTER_CODE \
  2191. "movl (%%"REG_b"), %%esi \n\t"\
  2192. "call *%4 \n\t"\
  2193. "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
  2194. "add %%"REG_a", %%"REG_D" \n\t"\
2195. "xor %%"REG_a", %%"REG_a" \n\t"
  2196. #endif /* ARCH_X86_64 */
  2197. CALL_MMX2_FILTER_CODE
  2198. CALL_MMX2_FILTER_CODE
  2199. CALL_MMX2_FILTER_CODE
  2200. CALL_MMX2_FILTER_CODE
  2201. CALL_MMX2_FILTER_CODE
  2202. CALL_MMX2_FILTER_CODE
  2203. CALL_MMX2_FILTER_CODE
  2204. CALL_MMX2_FILTER_CODE
  2205. #if defined(PIC)
  2206. "mov %5, %%"REG_b" \n\t"
  2207. #endif
  2208. :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
  2209. "m" (mmx2FilterCode)
  2210. #if defined(PIC)
  2211. ,"m" (ebxsave)
  2212. #endif
  2213. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2214. #if !defined(PIC)
  2215. ,"%"REG_b
  2216. #endif
  2217. );
  2218. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
  2219. } else {
  2220. #endif /* COMPILE_TEMPLATE_MMX2 */
  2221. x86_reg xInc_shr16 = xInc >> 16;
  2222. uint16_t xInc_mask = xInc & 0xffff;
  2223. x86_reg dstWidth_reg = dstWidth;
  2224. //NO MMX just normal asm ...
  2225. __asm__ volatile(
  2226. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2227. "xor %%"REG_d", %%"REG_d" \n\t" // xx
  2228. "xorl %%ecx, %%ecx \n\t" // xalpha
  2229. ".p2align 4 \n\t"
  2230. "1: \n\t"
  2231. "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
  2232. "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2233. FAST_BILINEAR_X86
  2234. "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
  2235. "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
  2236. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
  2237. "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
  2238. "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2239. FAST_BILINEAR_X86
  2240. "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
  2241. "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
  2242. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
  2243. "add $2, %%"REG_a" \n\t"
  2244. "cmp %2, %%"REG_a" \n\t"
  2245. " jb 1b \n\t"
  2246. :: "r" (src), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask)
  2247. : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
  2248. );
  2249. #if COMPILE_TEMPLATE_MMX2
  2250. } //if MMX2 can't be used
  2251. #endif
  2252. #else
  2253. int i;
  2254. unsigned int xpos=0;
  2255. for (i=0;i<dstWidth;i++) {
  2256. register unsigned int xx=xpos>>16;
  2257. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  2258. dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
  2259. xpos+=xInc;
  2260. }
  2261. #endif /* ARCH_X86 */
  2262. }
  2263. // *** horizontal scale Y line to temp buffer
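/* hyscale: optionally convert the input line to 8-bit luma (toYV12), then
 * horizontally scale it with either the generic hScale or the fast bilinear
 * path, and finally apply the luma range conversion if one is configured. */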
  2264. static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
  2265. const int16_t *hLumFilter,
  2266. const int16_t *hLumFilterPos, int hLumFilterSize,
  2267. uint8_t *formatConvBuffer,
  2268. uint32_t *pal, int isAlpha)
  2269. {
  2270. void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
  2271. void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
  2272. src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
  2273. if (toYV12) {
  2274. toYV12(formatConvBuffer, src, srcW, pal);
  2275. src= formatConvBuffer;
  2276. }
  2277. if (!c->hyscale_fast) {
  2278. c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
  2279. } else { // fast bilinear upscale / crap downscale
  2280. c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
  2281. }
  2282. if (convertRange)
  2283. convertRange(dst, dstWidth);
  2284. }
  2285. static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
  2286. long dstWidth, const uint8_t *src1,
  2287. const uint8_t *src2, int srcW, int xInc)
  2288. {
  2289. #if ARCH_X86
  2290. #if COMPILE_TEMPLATE_MMX2
  2291. int32_t *filterPos = c->hChrFilterPos;
  2292. int16_t *filter = c->hChrFilter;
  2293. int canMMX2BeUsed = c->canMMX2BeUsed;
  2294. void *mmx2FilterCode= c->chrMmx2FilterCode;
  2295. int i;
  2296. #if defined(PIC)
  2297. DECLARE_ALIGNED(8, uint64_t, ebxsave);
  2298. #endif
  2299. if (canMMX2BeUsed) {
  2300. __asm__ volatile(
  2301. #if defined(PIC)
  2302. "mov %%"REG_b", %6 \n\t"
  2303. #endif
  2304. "pxor %%mm7, %%mm7 \n\t"
  2305. "mov %0, %%"REG_c" \n\t"
  2306. "mov %1, %%"REG_D" \n\t"
  2307. "mov %2, %%"REG_d" \n\t"
  2308. "mov %3, %%"REG_b" \n\t"
  2309. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2310. PREFETCH" (%%"REG_c") \n\t"
  2311. PREFETCH" 32(%%"REG_c") \n\t"
  2312. PREFETCH" 64(%%"REG_c") \n\t"
  2313. CALL_MMX2_FILTER_CODE
  2314. CALL_MMX2_FILTER_CODE
  2315. CALL_MMX2_FILTER_CODE
  2316. CALL_MMX2_FILTER_CODE
  2317. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2318. "mov %5, %%"REG_c" \n\t" // src
  2319. "mov %1, %%"REG_D" \n\t" // buf1
  2320. "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
  2321. PREFETCH" (%%"REG_c") \n\t"
  2322. PREFETCH" 32(%%"REG_c") \n\t"
  2323. PREFETCH" 64(%%"REG_c") \n\t"
  2324. CALL_MMX2_FILTER_CODE
  2325. CALL_MMX2_FILTER_CODE
  2326. CALL_MMX2_FILTER_CODE
  2327. CALL_MMX2_FILTER_CODE
  2328. #if defined(PIC)
  2329. "mov %6, %%"REG_b" \n\t"
  2330. #endif
  2331. :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
  2332. "m" (mmx2FilterCode), "m" (src2)
  2333. #if defined(PIC)
  2334. ,"m" (ebxsave)
  2335. #endif
  2336. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2337. #if !defined(PIC)
  2338. ,"%"REG_b
  2339. #endif
  2340. );
  2341. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
  2342. //printf("%d %d %d\n", dstWidth, i, srcW);
  2343. dst[i] = src1[srcW-1]*128;
  2344. dst[i+VOFW] = src2[srcW-1]*128;
  2345. }
  2346. } else {
  2347. #endif /* COMPILE_TEMPLATE_MMX2 */
  2348. x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
  2349. uint16_t xInc_mask = xInc & 0xffff;
  2350. x86_reg dstWidth_reg = dstWidth;
  2351. __asm__ volatile(
  2352. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2353. "xor %%"REG_d", %%"REG_d" \n\t" // xx
  2354. "xorl %%ecx, %%ecx \n\t" // xalpha
  2355. ".p2align 4 \n\t"
  2356. "1: \n\t"
  2357. "mov %0, %%"REG_S" \n\t"
  2358. "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
  2359. "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
  2360. FAST_BILINEAR_X86
  2361. "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
  2362. "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
  2363. "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2364. FAST_BILINEAR_X86
  2365. "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
  2366. "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
  2367. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
  2368. "add $1, %%"REG_a" \n\t"
  2369. "cmp %2, %%"REG_a" \n\t"
  2370. " jb 1b \n\t"
  2371. /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
  2372. which is needed to support GCC 4.0. */
  2373. #if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
  2374. :: "m" (src1), "m" (dst), "g" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
  2375. #else
  2376. :: "m" (src1), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
  2377. #endif
  2378. "r" (src2)
  2379. : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
  2380. );
  2381. #if COMPILE_TEMPLATE_MMX2
  2382. } //if MMX2 can't be used
  2383. #endif
  2384. #else
  2385. int i;
  2386. unsigned int xpos=0;
  2387. for (i=0;i<dstWidth;i++) {
  2388. register unsigned int xx=xpos>>16;
  2389. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  2390. dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
  2391. dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
  2392. /* slower
  2393. dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
  2394. dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
  2395. */
  2396. xpos+=xInc;
  2397. }
  2398. #endif /* ARCH_X86 */
  2399. }
  2400. inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
  2401. int srcW, int xInc, const int16_t *hChrFilter,
  2402. const int16_t *hChrFilterPos, int hChrFilterSize,
  2403. uint8_t *formatConvBuffer,
  2404. uint32_t *pal)
  2405. {
  2406. src1 += c->chrSrcOffset;
  2407. src2 += c->chrSrcOffset;
  2408. if (c->chrToYV12) {
  2409. c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2410. src1= formatConvBuffer;
  2411. src2= formatConvBuffer+VOFW;
  2412. }
  2413. if (!c->hcscale_fast) {
  2414. c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  2415. c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  2416. } else { // fast bilinear upscale / crap downscale
  2417. c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
  2418. }
  2419. if (c->chrConvertRange)
  2420. c->chrConvertRange(dst, dstWidth);
  2421. }
  2422. #define DEBUG_SWSCALE_BUFFERS 0
  2423. #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
  2424. static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
  2425. int srcSliceH, uint8_t* dst[], int dstStride[])
  2426. {
2427. /* load a few things into local vars to make the code more readable and faster */
  2428. const int srcW= c->srcW;
  2429. const int dstW= c->dstW;
  2430. const int dstH= c->dstH;
  2431. const int chrDstW= c->chrDstW;
  2432. const int chrSrcW= c->chrSrcW;
  2433. const int lumXInc= c->lumXInc;
  2434. const int chrXInc= c->chrXInc;
  2435. const enum PixelFormat dstFormat= c->dstFormat;
  2436. const int flags= c->flags;
  2437. int16_t *vLumFilterPos= c->vLumFilterPos;
  2438. int16_t *vChrFilterPos= c->vChrFilterPos;
  2439. int16_t *hLumFilterPos= c->hLumFilterPos;
  2440. int16_t *hChrFilterPos= c->hChrFilterPos;
  2441. int16_t *vLumFilter= c->vLumFilter;
  2442. int16_t *vChrFilter= c->vChrFilter;
  2443. int16_t *hLumFilter= c->hLumFilter;
  2444. int16_t *hChrFilter= c->hChrFilter;
  2445. int32_t *lumMmxFilter= c->lumMmxFilter;
  2446. int32_t *chrMmxFilter= c->chrMmxFilter;
  2447. int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
  2448. const int vLumFilterSize= c->vLumFilterSize;
  2449. const int vChrFilterSize= c->vChrFilterSize;
  2450. const int hLumFilterSize= c->hLumFilterSize;
  2451. const int hChrFilterSize= c->hChrFilterSize;
  2452. int16_t **lumPixBuf= c->lumPixBuf;
  2453. int16_t **chrPixBuf= c->chrPixBuf;
  2454. int16_t **alpPixBuf= c->alpPixBuf;
  2455. const int vLumBufSize= c->vLumBufSize;
  2456. const int vChrBufSize= c->vChrBufSize;
  2457. uint8_t *formatConvBuffer= c->formatConvBuffer;
  2458. const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
  2459. const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
  2460. int lastDstY;
  2461. uint32_t *pal=c->pal_yuv;
  2462. /* vars which will change and which we need to store back in the context */
  2463. int dstY= c->dstY;
  2464. int lumBufIndex= c->lumBufIndex;
  2465. int chrBufIndex= c->chrBufIndex;
  2466. int lastInLumBuf= c->lastInLumBuf;
  2467. int lastInChrBuf= c->lastInChrBuf;
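/* Packed input formats keep all components in plane 0, so alias the remaining
 * plane pointers and strides to it before the per-plane code below runs. */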
  2468. if (isPacked(c->srcFormat)) {
  2469. src[0]=
  2470. src[1]=
  2471. src[2]=
  2472. src[3]= src[0];
  2473. srcStride[0]=
  2474. srcStride[1]=
  2475. srcStride[2]=
  2476. srcStride[3]= srcStride[0];
  2477. }
  2478. srcStride[1]<<= c->vChrDrop;
  2479. srcStride[2]<<= c->vChrDrop;
  2480. DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
  2481. src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
  2482. dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
  2483. DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
  2484. srcSliceY, srcSliceH, dstY, dstH);
  2485. DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
  2486. vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
  2487. if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
  2488. static int warnedAlready=0; //FIXME move this into the context perhaps
  2489. if (flags & SWS_PRINT_INFO && !warnedAlready) {
  2490. av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
  2491. " ->cannot do aligned memory accesses anymore\n");
  2492. warnedAlready=1;
  2493. }
  2494. }
  2495. /* Note the user might start scaling the picture in the middle so this
  2496. will not get executed. This is not really intended but works
  2497. currently, so people might do it. */
  2498. if (srcSliceY ==0) {
  2499. lumBufIndex=-1;
  2500. chrBufIndex=-1;
  2501. dstY=0;
  2502. lastInLumBuf= -1;
  2503. lastInChrBuf= -1;
  2504. }
  2505. lastDstY= dstY;
  2506. for (;dstY < dstH; dstY++) {
  2507. unsigned char *dest =dst[0]+dstStride[0]*dstY;
  2508. const int chrDstY= dstY>>c->chrDstVSubSample;
  2509. unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
  2510. unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
  2511. unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
  2512. const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
  2513. const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
  2514. const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
  2515. int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
  2516. int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
  2517. int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
  2518. int enough_lines;
  2519. //handle holes (FAST_BILINEAR & weird filters)
  2520. if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
  2521. if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
  2522. assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
  2523. assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
  2524. DEBUG_BUFFERS("dstY: %d\n", dstY);
  2525. DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
  2526. firstLumSrcY, lastLumSrcY, lastInLumBuf);
  2527. DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
  2528. firstChrSrcY, lastChrSrcY, lastInChrBuf);
  2529. // Do we have enough lines in this slice to output the dstY line
  2530. enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
  2531. if (!enough_lines) {
  2532. lastLumSrcY = srcSliceY + srcSliceH - 1;
  2533. lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
  2534. DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
  2535. lastLumSrcY, lastChrSrcY);
  2536. }
  2537. //Do horizontal scaling
  2538. while(lastInLumBuf < lastLumSrcY) {
  2539. const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
  2540. const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
  2541. lumBufIndex++;
  2542. assert(lumBufIndex < 2*vLumBufSize);
  2543. assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
  2544. assert(lastInLumBuf + 1 - srcSliceY >= 0);
  2545. RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
  2546. hLumFilter, hLumFilterPos, hLumFilterSize,
  2547. formatConvBuffer,
  2548. pal, 0);
  2549. if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
  2550. RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
  2551. hLumFilter, hLumFilterPos, hLumFilterSize,
  2552. formatConvBuffer,
  2553. pal, 1);
  2554. lastInLumBuf++;
  2555. DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
  2556. lumBufIndex, lastInLumBuf);
  2557. }
  2558. while(lastInChrBuf < lastChrSrcY) {
  2559. const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
  2560. const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
  2561. chrBufIndex++;
  2562. assert(chrBufIndex < 2*vChrBufSize);
  2563. assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
  2564. assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
  2565. //FIXME replace parameters through context struct (some at least)
  2566. if (c->needs_hcscale)
  2567. RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
  2568. hChrFilter, hChrFilterPos, hChrFilterSize,
  2569. formatConvBuffer,
  2570. pal);
  2571. lastInChrBuf++;
  2572. DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
  2573. chrBufIndex, lastInChrBuf);
  2574. }
  2575. //wrap buf index around to stay inside the ring buffer
  2576. if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
  2577. if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
  2578. if (!enough_lines)
  2579. break; //we can't output a dstY line so let's try with the next slice
  2580. #if COMPILE_TEMPLATE_MMX
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif

        if (dstY < dstH-2) {
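            // Point into the ring buffers at the entry holding source line firstLumSrcY
            // (resp. firstChrSrcY): lumBufIndex is the entry of the most recently buffered
            // source line (lastInLumBuf), so adding (firstLumSrcY - lastInLumBuf) steps back
            // to the first filter tap; the +vLumBufSize bias keeps the index in the mirrored
            // upper half so the next vLumFilterSize entries can be read contiguously.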
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;

#if COMPILE_TEMPLATE_MMX
            int i;
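            // Build the per-line filter tables consumed by the MMX/MMX2 vertical scalers.
            // With SWS_ACCURATE_RND two adjacent taps are packed per entry: their two
            // source-line pointers plus both 16-bit coefficients packed into one 32-bit
            // word (the APCK_* layout). Otherwise each entry holds the line pointer split
            // into low/high 32-bit halves (so 64-bit pointers fit) and the coefficient
            // duplicated into both 16-bit halves of a dword via the *0x10001 multiply.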
            if (flags & SWS_ACCURATE_RND) {
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2) {
                    *(const void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                    lumMmxFilter[s*i+APCK_COEF/4  ]=
                    lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                                                   + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                        *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                        alpMmxFilter[s*i+APCK_COEF/4  ]=
                        alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                    }
                }
                for (i=0; i<vChrFilterSize; i+=2) {
                    *(const void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                    chrMmxFilter[s*i+APCK_COEF/4  ]=
                    chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                                                   + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            } else {
                for (i=0; i<vLumFilterSize; i++) {
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2]=
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                    }
                }
                for (i=0; i<vChrFilterSize; i++) {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
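            // Output dispatch: NV12/NV21 (interleaved chroma) first, then planar YUV/GRAY8
            // (9/10/16-bit planar via the C 16-bit writer, a 1-tap fast path for vertically
            // unscaled YV12, and the general vertical scaler), and finally packed output.
            // chrSkipMask suppresses chroma output on lines skipped by the vertical chroma
            // subsampling by passing NULL chroma destinations.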
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
                    yuv2yuvX16inC(
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                        dstFormat);
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *lumBuf = lumSrcPtr[0];
                    const int16_t *chrBuf= chrSrcPtr[0];
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                    c->yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
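                // Packed output: special-case a vertically unscaled line (1 luma tap; the
                // chroma filter typically has 2 taps here because chroma is vertically
                // interpolated), a 2-tap bilinear blend, and the general N-tap case.
                // SWS_FULL_CHR_H_INT always takes the C full-chroma path, since no
                // packed1_full/packed2_full variants exist yet (see the FIXMEs below).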
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                } else { //general RGB
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packedX(c,
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                       alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
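            // The last output lines fall back to the plain C output functions
            // (yuv2nv12XinC / yuv2yuvXinC / yuv2packedXinC / yuv2rgbXinC_full); the MMX
            // paths are avoided here, apparently because they would write past the tail
            // of the buffers involved (see the note above).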
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;

            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
                    yuv2yuvX16inC(
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                        dstFormat);
                } else {
                    yuv2yuvXinC(
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT) {
                    yuv2rgbXinC_full(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                } else {
                    yuv2packedXinC(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }
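    // If the destination carries an alpha plane but the source provided none, fill the
    // alpha plane of the lines produced by this call with opaque (255).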
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if COMPILE_TEMPLATE_MMX
    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
    else                            __asm__ volatile("emms"  :::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}

static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;

    c->yuv2nv12X    = RENAME(yuv2nv12X  );
    c->yuv2yuv1     = RENAME(yuv2yuv1   );
    c->yuv2yuvX     = RENAME(yuv2yuvX   );
    c->yuv2packed1  = RENAME(yuv2packed1);
    c->yuv2packed2  = RENAME(yuv2packed2);
    c->yuv2packedX  = RENAME(yuv2packedX);

    c->hScale       = RENAME(hScale     );

#if COMPILE_TEMPLATE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
#else
    if (c->flags & SWS_FAST_BILINEAR)
#endif
    {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    }

    c->chrToYV12 = NULL;
    switch(srcFormat) {
    case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
    case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
    case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
    case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
    case PIX_FMT_YUV420P9BE: c->chrToYV12 = BE9ToUV_c; break;
    case PIX_FMT_YUV420P9LE: c->chrToYV12 = LE9ToUV_c; break;
    case PIX_FMT_YUV422P10BE:
    case PIX_FMT_YUV420P10BE: c->chrToYV12 = BE10ToUV_c; break;
    case PIX_FMT_YUV422P10LE:
    case PIX_FMT_YUV420P10LE: c->chrToYV12 = LE10ToUV_c; break;
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
    }
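    // With horizontal chroma subsampling, the *_half RGB->UV converters average two
    // horizontally adjacent source pixels while extracting chroma; otherwise the plain
    // per-pixel converters are used.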
    if (c->chrSrcHSubSample) {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
        case PIX_FMT_BGR48BE:
        case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV_half; break;
        case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV_half; break;
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_half; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
        case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV_half; break;
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_half; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
        }
    } else {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
        case PIX_FMT_BGR48BE:
        case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV; break;
        case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV; break;
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
        case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV; break;
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
        }
    }

    c->lumToYV12 = NULL;
    c->alpToYV12 = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUV420P9BE: c->lumToYV12 = BE9ToY_c; break;
    case PIX_FMT_YUV420P9LE: c->lumToYV12 = LE9ToY_c; break;
    case PIX_FMT_YUV422P10BE:
    case PIX_FMT_YUV420P10BE: c->lumToYV12 = BE10ToY_c; break;
    case PIX_FMT_YUV422P10LE:
    case PIX_FMT_YUV420P10LE: c->lumToYV12 = LE10ToY_c; break;
    case PIX_FMT_YUYV422  :
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
    case PIX_FMT_GRAY8A   :
    case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422  :
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
    case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
    case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY; break;
    case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY; break;
    case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
    case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY; break;
    case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY; break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
    case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
    case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
    case PIX_FMT_RGB32    : c->lumToYV12 = bgr32ToY; break;
    case PIX_FMT_RGB32_1  : c->lumToYV12 = bgr321ToY; break;
    case PIX_FMT_BGR32    : c->lumToYV12 = rgb32ToY; break;
    case PIX_FMT_BGR32_1  : c->lumToYV12 = rgb321ToY; break;
    case PIX_FMT_RGB48BE  :
    case PIX_FMT_RGB48LE  : c->lumToYV12 = rgb48ToY; break;
    case PIX_FMT_BGR48BE  :
    case PIX_FMT_BGR48LE  : c->lumToYV12 = bgr48ToY; break;
    }
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
        case PIX_FMT_GRAY8A : c->alpToYV12 = RENAME(yuy2ToY); break;
        case PIX_FMT_PAL8   : c->alpToYV12 = palToA; break;
        }
    }
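    // Byte offsets at which the input readers above start within each packed pixel:
    // GRAY8A carries alpha in the second byte, RGB32/BGR32 in byte 3, and for the
    // little-endian 48-bit formats an offset of 1 makes the byte-based converters pick
    // up the most significant byte of each 16-bit component.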
    switch (srcFormat) {
    case PIX_FMT_GRAY8A :
        c->alpSrcOffset = 1;
        break;
    case PIX_FMT_RGB32  :
    case PIX_FMT_BGR32  :
        c->alpSrcOffset = 3;
        break;
    case PIX_FMT_RGB48LE:
    case PIX_FMT_BGR48LE:
        c->lumSrcOffset = 1;
        c->chrSrcOffset = 1;
        c->alpSrcOffset = 1;
        break;
    }
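    // Range conversion between limited (MPEG) and full (JPEG) range is applied to the
    // scaled intermediate lines, but only for non-RGB destinations; RGB output handles
    // the source range in its own YUV->RGB conversion setup.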
    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
        if (c->srcRange) {
            c->lumConvertRange = RENAME(lumRangeFromJpeg);
            c->chrConvertRange = RENAME(chrRangeFromJpeg);
        } else {
            c->lumConvertRange = RENAME(lumRangeToJpeg);
            c->chrConvertRange = RENAME(chrRangeToJpeg);
        }
    }
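    // Grayscale on either side and 1-bit monochrome sources have no chroma to scale
    // horizontally, so hcscale is skipped for them; everything else needs it.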
    if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
          srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
        c->needs_hcscale = 1;
}