/*
 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/x86/asm.h"
#include "libswscale/swscale_internal.h"

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef MOVNTQ2
#undef PREFETCH

#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#define MOVNTQ2 "movq "
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
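
/* Load 8 dither bytes into %%mm3/%%mm4 as 16-bit words (rotated by 3 bytes
 * when rot is non-zero) so that yuv2yuvX below can add them before rounding.
 * Clobbers %%mm0, %%mm3 and %%mm4. */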
#if !COMPILE_TEMPLATE_MMXEXT
static av_always_inline void
dither_8to16(const uint8_t *srcDither, int rot)
{
    if (rot) {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
        "movq (%0), %%mm3\n\t"
        "movq %%mm3, %%mm4\n\t"
        "psrlq $24, %%mm3\n\t"
        "psllq $40, %%mm4\n\t"
        "por %%mm4, %%mm3\n\t"
        "movq %%mm3, %%mm4\n\t"
        "punpcklbw %%mm0, %%mm3\n\t"
        "punpckhbw %%mm0, %%mm4\n\t"
        :: "r"(srcDither)
        );
    } else {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
        "movq (%0), %%mm3\n\t"
        "movq %%mm3, %%mm4\n\t"
        "punpcklbw %%mm0, %%mm3\n\t"
        "punpckhbw %%mm0, %%mm4\n\t"
        :: "r"(srcDither)
        );
    }
}
#endif
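
/* Vertical scaling of one output line of a luma or chroma plane: %%mm3/%%mm4
 * start from the dither words prepared above (pre-biased by the filter size),
 * each filter tap is applied to 8 source samples at a time with pmulhw, and
 * the accumulated words are shifted down and packed to unsigned bytes. */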
static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
        const int16_t **src, uint8_t *dest, int dstW,
        const uint8_t *dither, int offset)
{
    dither_8to16(dither, offset);
    filterSize--;
    __asm__ volatile(
    "movd %0, %%mm1\n\t"
    "punpcklwd %%mm1, %%mm1\n\t"
    "punpckldq %%mm1, %%mm1\n\t"
    "psllw $3, %%mm1\n\t"
    "paddw %%mm1, %%mm3\n\t"
    "paddw %%mm1, %%mm4\n\t"
    "psraw $4, %%mm3\n\t"
    "psraw $4, %%mm4\n\t"
    ::"m"(filterSize)
    );
    __asm__ volatile(\
    "movq %%mm3, %%mm6\n\t"
    "movq %%mm4, %%mm7\n\t"
    "movl %3, %%ecx\n\t"
    "mov %0, %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    ".p2align 4 \n\t" /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"FF_REG_S", %%"FF_REG_c", 2), %%mm2 \n\t" /* srcData */\
    "movq 8(%%"FF_REG_S", %%"FF_REG_c", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"
    MOVNTQ2 " %%mm3, (%1, %%"FF_REG_c")\n\t"
    "add $8, %%"FF_REG_c" \n\t"\
    "cmp %2, %%"FF_REG_c" \n\t"\
    "movq %%mm6, %%mm3\n\t"
    "movq %%mm7, %%mm4\n\t"
    "mov %0, %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "g" (filter),
       "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
    : "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
    );
}
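
/* The YSCALEYUV2PACKEDX* macros below implement the inner loops of the
 * multi-tap ("X") vertical filter for packed output: %%"FF_REG_a" indexes the
 * current output pixel group, label 1: is the per-pixel loop and label 2: the
 * per-tap loop.  _UV accumulates chroma into %%mm3/%%mm4, _YA accumulates
 * luma (or alpha) into the destination registers passed to it. */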
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
    ".p2align 4 \n\t"\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* UsrcData */\
    "add %6, %%"FF_REG_S" \n\t" \
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"FF_REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW_reg), "m"(uv_off) \
       NAMED_CONSTRAINTS_ADD(bF8,bFC) \
    : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S \
    );
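
/* "Accurate" variants of the same loops: instead of one pmulhw per tap they
 * process two taps at a time with pmaddwd, keep 32-bit accumulators and only
 * round once at the end, which costs a few extra instructions but loses less
 * precision. */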
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
    ".p2align 4 \n\t"\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm0 \n\t" /* UsrcData */\
    "add %6, %%"FF_REG_S" \n\t" \
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"FF_REG_d"),%%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "add %6, %%"FF_REG_S" \n\t" \
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
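
/* Convert the vertically filtered values to RGB: %%mm1/%%mm7 hold two groups
 * of four luma samples, %%mm3/%%mm4 the shared chroma.  The per-context
 * offsets and coefficients (Y_OFFSET, U/V_OFFSET, *_COEFF) are applied with
 * pmulhw and the result is packed to unsigned bytes in %%mm2 (B), %%mm4 (G)
 * and %%mm5 (R). */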
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
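
/* yuv2*_X_ar and yuv2*_X below are the multi-tap vertical filter outputs for
 * the various packed formats; the _ar ("accurate rounding") versions use the
 * YSCALEYUV2PACKEDX_ACCURATE loops above, the plain versions the faster
 * pmulhw loops.  When alpha is enabled, the alpha plane goes through the same
 * luma loop via ALP_MMX_FILTER_OFFSET. */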
static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrUSrc,
        const int16_t **chrVSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;
    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "movq %%mm2, "U_TEMP"(%0) \n\t"
        "movq %%mm4, "V_TEMP"(%0) \n\t"
        "movq %%mm5, "Y_TEMP"(%0) \n\t"
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
        "movq "Y_TEMP"(%0), %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrUSrc,
        const int16_t **chrVSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;
    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

static void RENAME(yuv2bgr32_X)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrUSrc,
        const int16_t **chrVSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;
    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}
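
/* Pack B/G/R bytes to RGB565: mask the components to 5/6/5 bits, shift and
 * interleave them, then store 8 pixels (16 bytes) per iteration. */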
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)

static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrUSrc,
        const int16_t **chrVSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;
    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB16(%4, "%5", %%FF_REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrUSrc,
        const int16_t **chrVSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;
    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, "%5", %%FF_REGa)
    YSCALEYUV2PACKEDX_END
}
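
/* RGB555 variant of the writer above: all three components are masked to
 * 5 bits and the red channel is shifted down one extra bit. */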
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)

static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrUSrc,
        const int16_t **chrVSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;
    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB15(%4, "%5", %%FF_REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrUSrc,
        const int16_t **chrVSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;
    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, "%5", %%FF_REGa)
    YSCALEYUV2PACKEDX_END
}
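
/* 24-bit writer, plain MMX version: build four 0RGB0RGB quadwords, then shift
 * and OR them together so that three quadwords (24 bytes, i.e. 8 pixels) can
 * be stored without the padding bytes. */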
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR24MMXEXT(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if COMPILE_TEMPLATE_MMXEXT
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif

#if HAVE_6REGS
static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrUSrc,
        const int16_t **chrVSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;
    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c"\n\t" //FIXME optimize
    "add %4, %%"FF_REG_c" \n\t"
    WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
    );
}

static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrUSrc,
        const int16_t **chrVSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;
    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t" //FIXME optimize
    "add %4, %%"FF_REG_c" \n\t"
    WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
    );
}
#endif /* HAVE_6REGS */
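
/* Pack luma from %%mm1/%%mm7 and chroma from %%mm3/%%mm4 into YUYV order and
 * store 8 pixels (16 bytes) per iteration. */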
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)

static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrUSrc,
        const int16_t **chrVSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;
    YSCALEYUV2PACKEDX_ACCURATE
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, "%5", %%FF_REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
        const int16_t **lumSrc, int lumFilterSize,
        const int16_t *chrFilter, const int16_t **chrUSrc,
        const int16_t **chrVSrc,
        int chrFilterSize, const int16_t **alpSrc,
        uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;
    YSCALEYUV2PACKEDX
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, "%5", %%FF_REGa)
    YSCALEYUV2PACKEDX_END
}
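
/* The two-row ("_2") path: REAL_YSCALEYUV2RGB_UV/_YA interpolate vertically
 * between two source rows (buf0/buf1, uvbuf0/uvbuf1) with the single blending
 * coefficient stored in the context, and REAL_YSCALEYUV2RGB_COEFF then does
 * the same YUV->RGB conversion as YSCALEYUV2RGBX above. */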
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
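
/* The *_2 functions below drive the loops above.  On x86-32 they need six
 * pointer registers, so %ebx is stashed in the context (ESP_OFFSET) and %ebp
 * is pushed, freeing both to act as destination pointer and pixel index; the
 * x86-64 alpha path can simply use %r8 instead. */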
/**
 * vertical bilinear scale YV12 to RGB
 */
static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
        const int16_t *ubuf[2], const int16_t *vbuf[2],
        const int16_t *abuf[2], uint8_t *dest,
        int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
        *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
        const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
#if ARCH_X86_64
        __asm__ volatile(
        YSCALEYUV2RGB(%%r8, %5)
        YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
        "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
        "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
           "a" (&c->redDither),
           "r" (abuf0), "r" (abuf1)
        : "%r8"
        );
#else
        c->u_temp=(intptr_t)abuf0;
        c->v_temp=(intptr_t)abuf1;
        __asm__ volatile(
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)
        "push %0 \n\t"
        "push %1 \n\t"
        "mov "U_TEMP"(%5), %0 \n\t"
        "mov "V_TEMP"(%5), %1 \n\t"
        YSCALEYUV2RGB_YA(%%FF_REGBP, %5, %0, %1)
        "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
        "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
        "packuswb %%mm7, %%mm1 \n\t"
        "pop %1 \n\t"
        "pop %0 \n\t"
        WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        "pop %%"FF_REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
        );
#endif
    } else {
        __asm__ volatile(
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        "pop %%"FF_REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
        );
    }
}

static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
        const int16_t *ubuf[2], const int16_t *vbuf[2],
        const int16_t *abuf[2], uint8_t *dest,
        int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
        *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    __asm__ volatile(
    "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
    "mov %4, %%"FF_REG_b" \n\t"
    "push %%"FF_REG_BP" \n\t"
    YSCALEYUV2RGB(%%FF_REGBP, %5)
    "pxor %%mm7, %%mm7 \n\t"
    WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
    "pop %%"FF_REG_BP" \n\t"
    "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
    :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
       "a" (&c->redDither)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    );
}

static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
        const int16_t *ubuf[2], const int16_t *vbuf[2],
        const int16_t *abuf[2], uint8_t *dest,
        int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
        *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    __asm__ volatile(
    "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
    "mov %4, %%"FF_REG_b" \n\t"
    "push %%"FF_REG_BP" \n\t"
    YSCALEYUV2RGB(%%FF_REGBP, %5)
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
    WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
    "pop %%"FF_REG_BP" \n\t"
    "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
    :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
       "a" (&c->redDither)
       NAMED_CONSTRAINTS_ADD(bF8)
    );
}

static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
        const int16_t *ubuf[2], const int16_t *vbuf[2],
        const int16_t *abuf[2], uint8_t *dest,
        int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
        *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    __asm__ volatile(
    "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
    "mov %4, %%"FF_REG_b" \n\t"
    "push %%"FF_REG_BP" \n\t"
    YSCALEYUV2RGB(%%FF_REGBP, %5)
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
    WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
    "pop %%"FF_REG_BP" \n\t"
    "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
    :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
       "a" (&c->redDither)
       NAMED_CONSTRAINTS_ADD(bF8,bFC)
    );
}
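
/* Same two-row interpolation but for packed YUV output (yuyv422): the
 * blending coefficients are pre-shifted right by 3 and the samples by 7, so
 * the interpolated values are already in byte range when WRITEYUY2 packs
 * them. */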
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
        const int16_t *ubuf[2], const int16_t *vbuf[2],
        const int16_t *abuf[2], uint8_t *dest,
        int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
        *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    __asm__ volatile(
    "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
    "mov %4, %%"FF_REG_b" \n\t"
    "push %%"FF_REG_BP" \n\t"
    YSCALEYUV2PACKED(%%FF_REGBP, %5)
    WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
    "pop %%"FF_REG_BP" \n\t"
    "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
    :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
       "a" (&c->redDither)
    );
}
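
/* Single-row ("_1") path, used when no vertical interpolation is wanted:
 * YSCALEYUV2RGB1 reads one chroma row, YSCALEYUV2RGB1b averages the two
 * neighbouring chroma rows (chosen when uvalpha >= 2048), and both then do
 * the usual YUV->RGB conversion. */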
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
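
/* Load the alpha channel for the single-row path: abuf0 samples are shifted
 * down to bytes in %%mm7 so WRITEBGR32 can store them directly. */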
  1130. #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
  1131. "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
  1132. "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
  1133. "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
  1134. "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
  1135. "packuswb %%mm1, %%mm7 \n\t"
  1136. #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
  1137. /**
  1138. * YV12 to RGB without scaling or interpolating
  1139. */
  1140. static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
  1141. const int16_t *ubuf[2], const int16_t *vbuf[2],
  1142. const int16_t *abuf0, uint8_t *dest,
  1143. int dstW, int uvalpha, int y)
  1144. {
  1145. const int16_t *ubuf0 = ubuf[0];
  1146. const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1147. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1148. const int16_t *ubuf1 = ubuf[0];
  1149. if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
  1150. __asm__ volatile(
  1151. "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
  1152. "mov %4, %%"FF_REG_b" \n\t"
  1153. "push %%"FF_REG_BP" \n\t"
  1154. YSCALEYUV2RGB1(%%FF_REGBP, %5)
  1155. YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
  1156. WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1157. "pop %%"FF_REG_BP" \n\t"
  1158. "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
  1159. :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1160. "a" (&c->redDither)
  1161. );
  1162. } else {
  1163. __asm__ volatile(
  1164. "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
  1165. "mov %4, %%"FF_REG_b" \n\t"
  1166. "push %%"FF_REG_BP" \n\t"
  1167. YSCALEYUV2RGB1(%%FF_REGBP, %5)
  1168. "pcmpeqd %%mm7, %%mm7 \n\t"
  1169. WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1170. "pop %%"FF_REG_BP" \n\t"
  1171. "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
  1172. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1173. "a" (&c->redDither)
  1174. );
  1175. }
  1176. } else {
  1177. const int16_t *ubuf1 = ubuf[1];
  1178. if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
  1179. __asm__ volatile(
  1180. "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
  1181. "mov %4, %%"FF_REG_b" \n\t"
  1182. "push %%"FF_REG_BP" \n\t"
  1183. YSCALEYUV2RGB1b(%%FF_REGBP, %5)
  1184. YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
  1185. WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1186. "pop %%"FF_REG_BP" \n\t"
  1187. "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
  1188. :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1189. "a" (&c->redDither)
  1190. );
  1191. } else {
  1192. __asm__ volatile(
  1193. "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
  1194. "mov %4, %%"FF_REG_b" \n\t"
  1195. "push %%"FF_REG_BP" \n\t"
  1196. YSCALEYUV2RGB1b(%%FF_REGBP, %5)
  1197. "pcmpeqd %%mm7, %%mm7 \n\t"
  1198. WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1199. "pop %%"FF_REG_BP" \n\t"
  1200. "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
  1201. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1202. "a" (&c->redDither)
  1203. );
  1204. }
  1205. }
  1206. }
  1207. static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
  1208. const int16_t *ubuf[2], const int16_t *vbuf[2],
  1209. const int16_t *abuf0, uint8_t *dest,
  1210. int dstW, int uvalpha, int y)
  1211. {
  1212. const int16_t *ubuf0 = ubuf[0];
  1213. const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1214. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1215. const int16_t *ubuf1 = ubuf[0];
  1216. __asm__ volatile(
  1217. "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
  1218. "mov %4, %%"FF_REG_b" \n\t"
  1219. "push %%"FF_REG_BP" \n\t"
  1220. YSCALEYUV2RGB1(%%FF_REGBP, %5)
  1221. "pxor %%mm7, %%mm7 \n\t"
  1222. WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
  1223. "pop %%"FF_REG_BP" \n\t"
  1224. "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
  1225. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1226. "a" (&c->redDither)
  1227. NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
  1228. );
  1229. } else {
  1230. const int16_t *ubuf1 = ubuf[1];
  1231. __asm__ volatile(
  1232. "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
  1233. "mov %4, %%"FF_REG_b" \n\t"
  1234. "push %%"FF_REG_BP" \n\t"
  1235. YSCALEYUV2RGB1b(%%FF_REGBP, %5)
  1236. "pxor %%mm7, %%mm7 \n\t"
  1237. WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
  1238. "pop %%"FF_REG_BP" \n\t"
  1239. "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
  1240. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1241. "a" (&c->redDither)
  1242. NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
  1243. );
  1244. }
  1245. }
static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"FF_REG_b" \n\t"
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2RGB1(%%FF_REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"FF_REG_b" \n\t"
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%FF_REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8)
        );
    }
}
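
/* Same single-line path packing to RGB565, again with optional DITHER1XBPP
 * ordered dithering before the 5-6-5 pack. */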
static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"FF_REG_b" \n\t"
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2RGB1(%%FF_REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8,bFC)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"FF_REG_b" \n\t"
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%FF_REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8,bFC)
        );
    }
}
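
/* Load helpers for packed-YUV output from a single luma line: the plain
 * variant reads one chroma line, the "b" variant sums two chroma lines and
 * halves the result (average); both scale the 16-bit intermediates down to
 * 8-bit range without going through the YUV->RGB matrix. */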
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
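
/* Single-line output to packed YUYV 4:2:2 using the PACKED1 helpers above. */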
static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf0, uint8_t *dest,
                                  int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"FF_REG_b" \n\t"
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2PACKED1(%%FF_REGBP, %5)
            WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"FF_REG_b" \n\t"
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2PACKED1b(%%FF_REGBP, %5)
            WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
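
/* Hook the MMX/MMXEXT code paths above into the SwsContext: vertical output
 * functions (yuv2planeX / yuv2packedX / yuv2packed1 / yuv2packed2) are chosen
 * according to destination format and flags, and the fast bilinear horizontal
 * scalers are enabled when MMXEXT is compiled in and usable. */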
static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
{
    enum AVPixelFormat dstFormat = c->dstFormat;
    c->use_mmx_vfilter= 0;
    if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat)
        && dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE
        && !(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
#if HAVE_6REGS
                case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
#endif
                case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
                case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
                case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
                default: break;
                }
            }
        } else {
            c->use_mmx_vfilter= 1;
            c->yuv2planeX = RENAME(yuv2yuvX );
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
                case AV_PIX_FMT_BGR32: c->yuv2packedX = RENAME(yuv2bgr32_X); break;
#if HAVE_6REGS
                case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
#endif
                case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
                case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
                case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
                default: break;
                }
            }
        }
        if (!(c->flags & SWS_FULL_CHR_H_INT)) {
            switch (c->dstFormat) {
            case AV_PIX_FMT_RGB32:
                c->yuv2packed1 = RENAME(yuv2rgb32_1);
                c->yuv2packed2 = RENAME(yuv2rgb32_2);
                break;
            case AV_PIX_FMT_BGR24:
                c->yuv2packed1 = RENAME(yuv2bgr24_1);
                c->yuv2packed2 = RENAME(yuv2bgr24_2);
                break;
            case AV_PIX_FMT_RGB555:
                c->yuv2packed1 = RENAME(yuv2rgb555_1);
                c->yuv2packed2 = RENAME(yuv2rgb555_2);
                break;
            case AV_PIX_FMT_RGB565:
                c->yuv2packed1 = RENAME(yuv2rgb565_1);
                c->yuv2packed2 = RENAME(yuv2rgb565_2);
                break;
            case AV_PIX_FMT_YUYV422:
                c->yuv2packed1 = RENAME(yuv2yuyv422_1);
                c->yuv2packed2 = RENAME(yuv2yuyv422_2);
                break;
            default:
                break;
            }
        }
    }
    if (c->srcBpc == 8 && c->dstBpc <= 14) {
        // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMXEXT
        if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
            c->hyscale_fast = ff_hyscale_fast_mmxext;
            c->hcscale_fast = ff_hcscale_fast_mmxext;
        } else {
#endif /* COMPILE_TEMPLATE_MMXEXT */
            c->hyscale_fast = NULL;
            c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMXEXT
        }
#endif /* COMPILE_TEMPLATE_MMXEXT */
    }
}