swscale_template.c

/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "swscale_template.h"

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PREFETCH

#if COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
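
/*
 * Store/prefetch dispatch for this template: on MMX2-capable CPUs MOVNTQ
 * becomes movntq, a non-temporal store that bypasses the cache (the scaler
 * output is written once and not read back), and PREFETCH becomes
 * prefetchnta; on plain MMX they fall back to an ordinary movq and a nop.
 */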

#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ".p2align 4 \n\t" /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" ((x86_reg)width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
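
/*
 * YSCALEYUV2YV12X performs the vertical FIR pass for one plane: it walks a
 * list of {srcPointer, coefficient} entries terminated by a NULL pointer,
 * accumulating pmulhw products (pmulhw computes (src*coeff)>>16) on top of
 * the rounding constant loaded from VROUNDER_OFFSET, then shifts the sums
 * right by 3 and clips to unsigned bytes with packuswb.
 */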

static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
        uint8_t *aDest, long dstW, long chrDstW)
{
    if (uDest) {
        YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
        YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
    }
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
    }
    YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
}

#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" ((x86_reg)width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
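
/*
 * The _ACCURATE variant trades speed for precision: it interleaves two taps
 * with punpcklwd/punpckhwd and accumulates full 32-bit dot products via
 * pmaddwd, narrowing (psrad, packssdw) only after all taps are summed, so
 * per-tap truncation error does not accumulate as it does with pmulhw above.
 */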

static inline void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
        uint8_t *aDest, long dstW, long chrDstW)
{
    if (uDest) {
        YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
        YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
    }
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
    }
    YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
}

#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ".p2align 4 \n\t" /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
        const int16_t *chrSrc, const int16_t *alpSrc,
        uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
        uint8_t *aDest, long dstW, long chrDstW)
{
    long p= 4;
    const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW };
    uint8_t *dst[4]= { aDest, dest, uDest, vDest };
    x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
    while (p--) {
        if (dst[p]) {
            __asm__ volatile(
                YSCALEYUV2YV121
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p])
                : "%"REG_a
            );
        }
    }
}

#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t"\
    ".p2align 4 \n\t" /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
        const int16_t *chrSrc, const int16_t *alpSrc,
        uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
        uint8_t *aDest, long dstW, long chrDstW)
{
    long p= 4;
    const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW };
    uint8_t *dst[4]= { aDest, dest, uDest, vDest };
    x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
    while (p--) {
        if (dst[p]) {
            __asm__ volatile(
                YSCALEYUV2YV121_ACCURATE
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p])
                : "%"REG_a
            );
        }
    }
}

#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ".p2align 4 \n\t"\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW_reg) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );
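
/*
 * The PACKEDX helpers run the same NULL-terminated vertical filter loop as
 * the planar code above, but leave the results in registers (chroma in
 * mm3/mm4, the two luma quads in mm1/mm7) for a packed-pixel writer to
 * consume; YSCALEYUV2PACKEDX_END closes the asm statement with the shared
 * operand list and clobbers.
 */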

#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ".p2align 4 \n\t"\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
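
/*
 * There are not enough MMX registers to keep chroma live through the
 * accurate luma pass, so the finished U/V sums are spilled to the
 * U_TEMP/V_TEMP scratch slots in the context and reloaded into mm3/mm4 at
 * the end of the YA block.
 */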

#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\


#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
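
/*
 * WRITEBGR32 interleaves the four byte-packed channels (b, g, r, a) into
 * eight 4-byte pixels: one punpck round pairs B with G and R with A, a
 * second widens the pairs to full pixels, and four MOVNTQs store the
 * resulting 32 bytes per loop iteration.
 */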

static inline void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, long dstW, long dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "movq %%mm2, "U_TEMP"(%0) \n\t"
        "movq %%mm4, "V_TEMP"(%0) \n\t"
        "movq %%mm5, "Y_TEMP"(%0) \n\t"
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
        "movq "Y_TEMP"(%0), %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

static inline void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, long dstW, long dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
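
/*
 * WRITERGB16 packs to RGB565: bF8/bFC are broadcast byte masks keeping the
 * top 5 bits (blue, red) or 6 bits (green) of each channel, which are then
 * shifted into their field positions and OR-merged, four 16-bit pixels per
 * movntq.
 */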

static inline void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, long dstW, long dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static inline void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, long dstW, long dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)

static inline void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, long dstW, long dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static inline void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, long dstW, long dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
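
/*
 * The MMX2 flavour avoids the long shift/merge chain by using pshufw to
 * replicate source bytes into position and masking with the ff_M24A/B/C
 * patterns; WRITEBGR24 then selects whichever variant this template is
 * being compiled for.
 */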

static inline void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, long dstW, long dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}

static inline void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, long dstW, long dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}

#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
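
/*
 * WRITEYUY2 narrows Y (mm1/mm7), U (mm3) and V (mm4) to bytes with
 * packuswb and interleaves them into Y U Y V order, emitting eight packed
 * pixels (16 bytes) per iteration.
 */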

static inline void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, long dstW, long dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    YSCALEYUV2PACKEDX_ACCURATE
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static inline void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, long dstW, long dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    YSCALEYUV2PACKEDX
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]*uvalpha1 + uvbuf1[eax]*(1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]*uvalpha1 + uvbuf1[eax+2048]*(1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]*yalpha1 + buf1[eax]*(1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]*yalpha1 + buf1[eax]*(1-yalpha1) >>16*/\

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
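
/*
 * The two-row blends above compute result = buf1 + (buf0 - buf1)*alpha in
 * fixed point: the row difference goes through pmulhw with the blend
 * coefficient while buf1 is shifted down to the same scale, i.e. a
 * vertical linear interpolation between the two buffered lines before the
 * usual fixed-point RGB conversion.
 */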

/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0,
        const uint16_t *buf1, const uint16_t *uvbuf0,
        const uint16_t *uvbuf1, const uint16_t *abuf0,
        const uint16_t *abuf1, uint8_t *dest,
        int dstW, int yalpha, int uvalpha, int y)
{
    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
        __asm__ volatile(
            YSCALEYUV2RGB(%%r8, %5)
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
               "a" (&c->redDither),
               "r" (abuf0), "r" (abuf1)
            : "%r8"
        );
#else
        c->u_temp=(intptr_t)abuf0;
        c->v_temp=(intptr_t)abuf1;
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "push %0 \n\t"
            "push %1 \n\t"
            "mov "U_TEMP"(%5), %0 \n\t"
            "mov "V_TEMP"(%5), %1 \n\t"
            YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            "pop %1 \n\t"
            "pop %0 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
#endif
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}

static inline void RENAME(yuv2bgr24_2)(SwsContext *c, const uint16_t *buf0,
        const uint16_t *buf1, const uint16_t *uvbuf0,
        const uint16_t *uvbuf1, const uint16_t *abuf0,
        const uint16_t *abuf1, uint8_t *dest,
        int dstW, int yalpha, int uvalpha, int y)
{
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

static inline void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0,
        const uint16_t *buf1, const uint16_t *uvbuf0,
        const uint16_t *uvbuf1, const uint16_t *abuf0,
        const uint16_t *abuf1, uint8_t *dest,
        int dstW, int yalpha, int uvalpha, int y)
{
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB15(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0,
        const uint16_t *buf1, const uint16_t *uvbuf0,
        const uint16_t *uvbuf1, const uint16_t *abuf0,
        const uint16_t *abuf1, uint8_t *dest,
        int dstW, int yalpha, int uvalpha, int y)
{
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB16(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]*uvalpha1 + uvbuf1[eax]*(1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]*uvalpha1 + uvbuf1[eax+2048]*(1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]*yalpha1 + buf1[eax]*(1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]*yalpha1 + buf1[eax]*(1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0,
        const uint16_t *buf1, const uint16_t *uvbuf0,
        const uint16_t *uvbuf1, const uint16_t *abuf0,
        const uint16_t *abuf1, uint8_t *dest,
        int dstW, int yalpha, int uvalpha, int y)
{
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2PACKED(%%REGBP, %5)
        WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
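
/*
 * The *1 variants render from a single source line instead of blending
 * two: YSCALEYUV2RGB1 takes chroma from uvbuf0 alone, while the "1b"
 * variant below averages uvbuf0 and uvbuf1; yuv2rgb32_1 further down
 * picks between them based on uvalpha.
 */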
// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0[eax] */\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0[eax+4] */\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4 */\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4 */\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
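/* Editor's note: a scalar sketch of the per-pixel arithmetic performed by
 * YSCALEYUV2RGB1/1b above, for reference. Everything is kept at 8x scale so
 * that pmulhw's (a*b)>>16 maps onto the >>16 below; the "b" variant averages
 * the two chroma buffers via (u0+u1)>>5. The coefficient values are an
 * assumption (BT.601 constants scaled by 65536/8); the real code loads them
 * from the SwsContext tables (Y_COEFF, UB_COEFF, ...). Illustrative only,
 * never called. */
static av_unused void RENAME(sketch_yuv2rgb1b_pixel)(int y_in, int u0, int u1,
                                                     int v0, int v1, uint8_t *r,
                                                     uint8_t *g, uint8_t *b)
{
    /* the pixel buffers hold samples <<7; (u0+u1)>>5 == (their average)<<3 */
    const int u = ((u0 + u1) >> 5) - 128*8;  /* psrlw $5 + psubw U_OFFSET */
    const int v = ((v0 + v1) >> 5) - 128*8;  /* psrlw $5 + psubw V_OFFSET */
    const int y = ( y_in     >> 4) -  16*8;  /* psraw $4 + psubw Y_OFFSET */
    /* assumed: 1.164, 1.596, 2.018, 0.391, 0.813, each times 65536/8 */
    const int luma = (y * 9535) >> 16;                        /* pmulhw Y_COEFF */
    *r = av_clip_uint8(luma + ((v * 13074) >> 16));           /* paddw vr, packuswb */
    *g = av_clip_uint8(luma - ((u * 3203 + v * 6660) >> 16)); /* paddw ug+vg */
    *b = av_clip_uint8(luma + ((u * 16531) >> 16));           /* paddw ub */
}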
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
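/* Editor's note: scalar equivalent of YSCALEYUV2RGB1_ALPHA, assuming the
 * alpha buffer holds samples <<7 like the luma buffer; psraw $7 restores the
 * 8-bit value and packuswb saturates it. Illustrative only. */
static av_unused uint8_t RENAME(sketch_alpha1_sample)(int a_in)
{
    return av_clip_uint8(a_in >> 7);
}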
/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0,
                                       const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, uint8_t *dest,
                                       int dstW, int uvalpha, enum PixelFormat dstFormat,
                                       int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    } else {
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    }
}
static inline void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0,
                                       const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, uint8_t *dest,
                                       int dstW, int uvalpha, enum PixelFormat dstFormat,
                                       int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0,
                                        const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                        const uint16_t *abuf0, uint8_t *dest,
                                        int dstW, int uvalpha, enum PixelFormat dstFormat,
                                        int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
                                        const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                        const uint16_t *abuf0, uint8_t *dest,
                                        int dstW, int uvalpha, enum PixelFormat dstFormat,
                                        int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0[eax] */\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0[eax+4] */\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0[eax] */\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0[eax+4] */\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
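/* Editor's note: scalar sketch of the two packed-output setup macros above.
 * The plain variant takes luma and chroma straight out of the <<7 buffers;
 * the "b" variant averages the two chroma buffers, since
 * (u0 + u1) >> 8 == ((u0>>7) + (u1>>7)) >> 1 for <<7 inputs.
 * Illustrative only. */
static av_unused void RENAME(sketch_packed1b_sample)(int y_in, int u0, int u1,
                                                     int *y, int *u)
{
    *y = y_in >> 7;       /* psraw $7 */
    *u = (u0 + u1) >> 8;  /* paddw + psrlw $8 */
}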
static inline void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0,
                                         const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                         const uint16_t *abuf0, uint8_t *dest,
                                         int dstW, int uvalpha, enum PixelFormat dstFormat,
                                         int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1b(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
#if !COMPILE_TEMPLATE_MMX2
//FIXME yuy2* can read up to 7 samples too much
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}
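/* Editor's note: scalar reference for the two YUYV readers above; the MMX
 * code does the same split 8 (luma) or 4 (chroma) output bytes at a time
 * with pand/psrlw + packuswb. width is in luma samples here. Illustrative
 * only. */
static av_unused void RENAME(sketch_yuy2_split)(uint8_t *dstY, uint8_t *dstU,
                                                uint8_t *dstV,
                                                const uint8_t *src, long width)
{
    long i;
    for (i = 0; i < width; i++)
        dstY[i] = src[2*i];      /* even bytes: pand bm01010101 */
    for (i = 0; i < width/2; i++) {
        dstU[i] = src[4*i + 1];  /* odd bytes: psrlw $8 */
        dstV[i] = src[4*i + 3];
    }
}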
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
}
/* This is almost identical to the previous, and exists only because
 * yuy2To(Y|UV)(dst, src+1, ...) would have 100% unaligned accesses. */
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "pand %%mm4, %%mm2 \n\t"
        "pand %%mm4, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
}
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, long width)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "movq %%mm2, (%3, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
}
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}
static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
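/* Editor's note: scalar reference for the NV12/NV21 readers above. The two
 * formats differ only in which chroma byte comes first, which is why
 * nv21ToUV just swaps the destination pointers. Illustrative only. */
static av_unused void RENAME(sketch_nv12_split)(uint8_t *dstU, uint8_t *dstV,
                                                const uint8_t *src, long width)
{
    long i;
    for (i = 0; i < width; i++) {
        dstU[i] = src[2*i    ];  /* pand bm01010101 */
        dstV[i] = src[2*i + 1];  /* psrlw $8 */
    }
}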
#endif /* !COMPILE_TEMPLATE_MMX2 */
static inline void RENAME(bgr24ToY_mmx)(int16_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    if (srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
            :
        );
    }
    __asm__ volatile(
        "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
        "mov %2, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "movd 6(%0), %%mm2 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm4, %%mm0 \n\t"
        "paddd %%mm4, %%mm2 \n\t"
        "psrad $9, %%mm0 \n\t"
        "psrad $9, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dst+width), "g" ((x86_reg)-2*width)
        : "%"REG_a
    );
}
static inline void RENAME(bgr24ToUV_mmx)(int16_t *dstU, int16_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq 24(%4), %%mm6 \n\t"
        "mov %3, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pmaddwd (%4), %%mm0 \n\t"
        "pmaddwd 8(%4), %%mm1 \n\t"
        "pmaddwd 16(%4), %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "movd 6(%0), %%mm1 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "pmaddwd (%4), %%mm1 \n\t"
        "pmaddwd 8(%4), %%mm3 \n\t"
        "pmaddwd 16(%4), %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm5 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm5, %%mm4 \n\t"
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
        "paddd %%mm3, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm3, %%mm4 \n\t"
        "psrad $9, %%mm0 \n\t"
        "psrad $9, %%mm2 \n\t"
        "psrad $9, %%mm1 \n\t"
        "psrad $9, %%mm4 \n\t"
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm4, %%mm2 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "movq %%mm2, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-2*width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
        : "%"REG_a
    );
}
static inline void RENAME(bgr24ToY)(int16_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
}
static inline void RENAME(bgr24ToUV)(int16_t *dstU, int16_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
    assert(src1 == src2);
}
static inline void RENAME(rgb24ToY)(int16_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
}
static inline void RENAME(rgb24ToUV)(int16_t *dstU, int16_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    assert(src1 == src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
}
#if !COMPILE_TEMPLATE_MMX2
// bilinear / bicubic scaling
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
{
    assert(filterSize % 4 == 0 && filterSize > 0);
    if (filterSize == 4) { // Always true for upscaling, sometimes for down, too.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad $7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize == 8) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
            "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
            "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm4 \n\t"
            "pmaddwd %%mm2, %%mm5 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "paddd %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad $7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor %%mm7, %%mm7 \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "mov %2, %%"REG_c" \n\t"
            "movzwl (%%"REG_c", %0), %%eax \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
            "mov %5, %%"REG_c" \n\t"
            "pxor %%mm4, %%mm4 \n\t"
            "pxor %%mm5, %%mm5 \n\t"
            "2: \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1, %6), %%mm3 \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "paddd %%mm3, %%mm5 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "add $8, %1 \n\t"
            "add $4, %%"REG_c" \n\t"
            "cmp %4, %%"REG_c" \n\t"
            " jb 2b \n\t"
            "add %6, %1 \n\t"
            "movq %%mm4, %%mm0 \n\t"
            "punpckldq %%mm5, %%mm4 \n\t"
            "punpckhdq %%mm5, %%mm0 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "psrad $7, %%mm4 \n\t"
            "packssdw %%mm4, %%mm4 \n\t"
            "mov %3, %%"REG_a" \n\t"
            "movd %%mm4, (%%"REG_a", %0) \n\t"
            "add $4, %0 \n\t"
            " jnc 1b \n\t"
            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
}
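/* Editor's note: scalar reference for RENAME(hScale) above, matching its
 * psrad $7 rounding and following the same saturation pattern as the C
 * fallback in RENAME(hScale16) below. Illustrative only. */
static av_unused void RENAME(sketch_hScale_c)(int16_t *dst, int dstW,
                                              const uint8_t *src,
                                              const int16_t *filter,
                                              const int16_t *filterPos,
                                              long filterSize)
{
    int i, j;
    for (i = 0; i < dstW; i++) {
        int val = 0;
        for (j = 0; j < filterSize; j++)
            val += ((int)src[filterPos[i] + j]) * filter[filterSize*i + j];
        dst[i] = FFMIN(val >> 7, (1 << 15) - 1); /* psrad $7 + packssdw */
    }
}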
#endif /* !COMPILE_TEMPLATE_MMX2 */
static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
                                    const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
{
    int i, j;
    assert(filterSize % 4 == 0 && filterSize > 0);
    if (filterSize == 4 && shift < 15) { // Always true for upscaling, sometimes for down, too.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd %5, %%mm7 \n\t"
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
            "movq (%3, %%"REG_a", 2), %%mm0 \n\t"
            "movq (%3, %%"REG_b", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad %%mm7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize == 8 && shift < 15) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd %5, %%mm7 \n\t"
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
            "movq (%3, %%"REG_a", 2), %%mm0 \n\t"
            "movq (%3, %%"REG_b", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
            "movq 8(%3, %%"REG_a", 2), %%mm4 \n\t"
            "movq 8(%3, %%"REG_b", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm4 \n\t"
            "pmaddwd %%mm2, %%mm5 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "paddd %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad %%mm7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (shift < 15) {
        const uint16_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd %7, %%mm7 \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "mov %2, %%"REG_c" \n\t"
            "movzwl (%%"REG_c", %0), %%eax \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
            "mov %5, %%"REG_c" \n\t"
            "pxor %%mm4, %%mm4 \n\t"
            "pxor %%mm5, %%mm5 \n\t"
            "2: \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1, %6), %%mm3 \n\t"
            "movq (%%"REG_c", %%"REG_a", 2), %%mm0 \n\t"
            "movq (%%"REG_c", %%"REG_d", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "paddd %%mm3, %%mm5 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "add $8, %1 \n\t"
            "add $8, %%"REG_c" \n\t"
            "cmp %4, %%"REG_c" \n\t"
            " jb 2b \n\t"
            "add %6, %1 \n\t"
            "movq %%mm4, %%mm0 \n\t"
            "punpckldq %%mm5, %%mm4 \n\t"
            "punpckhdq %%mm5, %%mm0 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "psrad %%mm7, %%mm4 \n\t"
            "packssdw %%mm4, %%mm4 \n\t"
            "mov %3, %%"REG_a" \n\t"
            "movd %%mm4, (%%"REG_a", %0) \n\t"
            "add $4, %0 \n\t"
            " jnc 1b \n\t"
            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" ((x86_reg)filterSize*2), "m"(shift)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    } else
        for (i=0; i<dstW; i++) {
            int srcPos= filterPos[i];
            int val=0;
            for (j=0; j<filterSize; j++) {
                val += ((int)src[srcPos + j])*filter[filterSize*i + j];
            }
            dst[i] = FFMIN(val>>shift, (1<<15)-1); // the cubic equation does overflow ...
        }
}
#if COMPILE_TEMPLATE_MMX2
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    int canMMX2BeUsed = c->canMMX2BeUsed;
    void *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %5 \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
#if ARCH_X86_64
#define CALL_MMX2_FILTER_CODE \
    "movl (%%"REG_b"), %%esi \n\t"\
    "call *%4 \n\t"\
    "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
    "add %%"REG_S", %%"REG_c" \n\t"\
    "add %%"REG_a", %%"REG_D" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"
#else
#define CALL_MMX2_FILTER_CODE \
    "movl (%%"REG_b"), %%esi \n\t"\
    "call *%4 \n\t"\
    "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
    "add %%"REG_a", %%"REG_D" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"
#endif /* ARCH_X86_64 */
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
#if defined(PIC)
        "mov %5, %%"REG_b" \n\t"
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );
    for (i=dstWidth-1; (i*xInc)>>16 >= srcW-1; i--)
        dst[i] = src[srcW-1]*128;
}
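/* Editor's note: the trailing loop above (and its twin at the end of
 * hcscale_fast below) pads the output for positions where the generated
 * MMX2 code would have read past the end of the source line: it replicates
 * the last input pixel at the <<7 scale of the pixel buffers (*128 == <<7).
 * Scalar form, illustrative only: */
static av_unused void RENAME(sketch_edge_replicate)(int16_t *dst, long dstWidth,
                                                    const uint8_t *src, int srcW,
                                                    int xInc)
{
    long i;
    for (i = dstWidth - 1; (i*xInc) >> 16 >= srcW - 1; i--)
        dst[i] = src[srcW - 1] * 128;
}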
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    int canMMX2BeUsed = c->canMMX2BeUsed;
    void *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %6 \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "mov %5, %%"REG_c" \n\t" // src
        "mov %1, %%"REG_D" \n\t" // buf1
        "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
#if defined(PIC)
        "mov %6, %%"REG_b" \n\t"
#endif
        :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );
    for (i=dstWidth-1; (i*xInc)>>16 >= srcW-1; i--) {
        dst[i] = src1[srcW-1]*128;
        dst[i+VOFW] = src2[srcW-1]*128;
    }
}
#endif /* COMPILE_TEMPLATE_MMX2 */
#if !COMPILE_TEMPLATE_MMX2
static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
                                  int lastInLumBuf, int lastInChrBuf)
{
    const int dstH= c->dstH;
    const int flags= c->flags;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int chrDstY= dstY>>c->chrDstVSubSample;
    const int firstLumSrcY= vLumFilterPos[dstY];    // first line needed as input
    const int firstChrSrcY= vChrFilterPos[chrDstY]; // first line needed as input
    c->blueDither= ff_dither8[dstY&1];
    if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
        c->greenDither= ff_dither8[dstY&1];
    else
        c->greenDither= ff_dither4[dstY&1];
    c->redDither= ff_dither8[(dstY+1)&1];
    if (dstY < dstH - 2) {
        const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
        const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
        const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
        int i;
        if (flags & SWS_ACCURATE_RND) {
            int s= APCK_SIZE / 8;
            for (i=0; i<vLumFilterSize; i+=2) {
                *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
                *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
                lumMmxFilter[s*i+APCK_COEF/4 ]=
                lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
                    + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                    *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
                    *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
                    alpMmxFilter[s*i+APCK_COEF/4 ]=
                    alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
                }
            }
            for (i=0; i<vChrFilterSize; i+=2) {
                *(const void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
                *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
                chrMmxFilter[s*i+APCK_COEF/4 ]=
                chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
                    + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
            }
        } else {
            for (i=0; i<vLumFilterSize; i++) {
                lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                lumMmxFilter[4*i+2]=
                lumMmxFilter[4*i+3]=
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                    alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                    alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                    alpMmxFilter[4*i+2]=
                    alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                }
            }
            for (i=0; i<vChrFilterSize; i++) {
                chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                chrMmxFilter[4*i+2]=
                chrMmxFilter[4*i+3]=
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
            }
        }
    }
}
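/* Editor's note: the plain (non-SWS_ACCURATE_RND) branch above packs, per
 * vertical filter tap, four int32 slots that the MMX code consumes roughly
 * like this struct (an assumed view for illustration, not a type used by
 * libswscale): */
struct RENAME(sketch_mmx_filter_tap) {
    int32_t src_lo;  /* low 32 bits of the int16_t* source-line pointer */
    int32_t src_hi;  /* high 32 bits of that pointer (zero on 32-bit x86) */
    int32_t coef;    /* 16-bit coefficient replicated: ((uint16_t)c)*0x10001 */
    int32_t coef2;   /* same again, so pmulhw sees the coefficient in every word */
};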
#endif /* !COMPILE_TEMPLATE_MMX2 */
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;
    if (!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
            c->yuv2yuvX = RENAME(yuv2yuvX_ar );
            switch (c->dstFormat) {
            case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
            case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
            case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
            case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
            case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
            default: break;
            }
        } else {
            c->yuv2yuv1 = RENAME(yuv2yuv1 );
            c->yuv2yuvX = RENAME(yuv2yuvX );
            switch (c->dstFormat) {
            case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
            case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
            case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
            case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
            case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
            default: break;
            }
        }
        switch (c->dstFormat) {
        case PIX_FMT_RGB32:
            c->yuv2packed1 = RENAME(yuv2rgb32_1);
            c->yuv2packed2 = RENAME(yuv2rgb32_2);
            break;
        case PIX_FMT_BGR24:
            c->yuv2packed1 = RENAME(yuv2bgr24_1);
            c->yuv2packed2 = RENAME(yuv2bgr24_2);
            break;
        case PIX_FMT_RGB555:
            c->yuv2packed1 = RENAME(yuv2rgb555_1);
            c->yuv2packed2 = RENAME(yuv2rgb555_2);
            break;
        case PIX_FMT_RGB565:
            c->yuv2packed1 = RENAME(yuv2rgb565_1);
            c->yuv2packed2 = RENAME(yuv2rgb565_2);
            break;
        case PIX_FMT_YUYV422:
            c->yuv2packed1 = RENAME(yuv2yuyv422_1);
            c->yuv2packed2 = RENAME(yuv2yuyv422_2);
            break;
        default:
            break;
        }
    }
#if !COMPILE_TEMPLATE_MMX2
    c->hScale = RENAME(hScale );
#endif /* !COMPILE_TEMPLATE_MMX2 */
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMX2
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed) {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
        c->hyscale_fast = NULL;
        c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMX2
    }
#endif /* COMPILE_TEMPLATE_MMX2 */
#if !COMPILE_TEMPLATE_MMX2
    switch (srcFormat) {
    case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
    case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
    case PIX_FMT_NV12    : c->chrToYV12 = RENAME(nv12ToUV); break;
    case PIX_FMT_NV21    : c->chrToYV12 = RENAME(nv21ToUV); break;
    case PIX_FMT_GRAY16LE:
    case PIX_FMT_YUV420P9LE:
    case PIX_FMT_YUV422P10LE:
    case PIX_FMT_YUV420P10LE:
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE: c->hScale16= RENAME(hScale16); break;
    }
#endif /* !COMPILE_TEMPLATE_MMX2 */
    if (!c->chrSrcHSubSample) {
        switch (srcFormat) {
        case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
        default: break;
        }
    }
    switch (srcFormat) {
#if !COMPILE_TEMPLATE_MMX2
    case PIX_FMT_YUYV422 :
    case PIX_FMT_Y400A   :
        c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422 :
        c->lumToYV12 = RENAME(uyvyToY); break;
#endif /* !COMPILE_TEMPLATE_MMX2 */
    case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
    default: break;
    }
#if !COMPILE_TEMPLATE_MMX2
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
        default: break;
        }
    }
#endif /* !COMPILE_TEMPLATE_MMX2 */
    if (isAnyRGB(c->srcFormat))
        c->hScale16= RENAME(hScale16);
}