/*
 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/x86/asm.h"
#include "libswscale/swscale_internal.h"

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef MOVNTQ2
#undef PREFETCH

#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#define MOVNTQ2 "movq "
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
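
/*
 * On MMXEXT builds the MOVNTQ macros above expand to movntq, a non-temporal
 * store: the destination image is written once and never read back here, so
 * bypassing the cache avoids evicting useful data. Plain MMX builds fall
 * back to an ordinary movq.
 */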

#if !COMPILE_TEMPLATE_MMXEXT
static av_always_inline void
dither_8to16(const uint8_t *srcDither, int rot)
{
    if (rot) {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "psrlq $24, %%mm3\n\t"
                         "psllq $40, %%mm4\n\t"
                         "por %%mm4, %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
                         :: "r"(srcDither)
                         );
    } else {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
                         :: "r"(srcDither)
                         );
    }
}
#endif /* !COMPILE_TEMPLATE_MMXEXT */
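
/*
 * dither_8to16() is deliberately not RENAME()d: this template is instantiated
 * once per instruction set into the same translation unit, so the helper is
 * defined only in the plain-MMX pass and reused by the MMXEXT pass. It leaves
 * the 8 dither bytes zero-extended to words in mm3/mm4, rotated by three byte
 * positions when rot is non-zero.
 */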

static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
                             const int16_t **src, uint8_t *dest, int dstW,
                             const uint8_t *dither, int offset)
{
    dither_8to16(dither, offset);
    filterSize--;
    __asm__ volatile(
        "movd %0, %%mm1\n\t"
        "punpcklwd %%mm1, %%mm1\n\t"
        "punpckldq %%mm1, %%mm1\n\t"
        "psllw $3, %%mm1\n\t"
        "paddw %%mm1, %%mm3\n\t"
        "paddw %%mm1, %%mm4\n\t"
        "psraw $4, %%mm3\n\t"
        "psraw $4, %%mm4\n\t"
        ::"m"(filterSize)
    );
    __asm__ volatile(
        "movq %%mm3, %%mm6\n\t"
        "movq %%mm4, %%mm7\n\t"
        "movl %3, %%ecx\n\t"
        "mov %0, %%"FF_REG_d" \n\t"
        "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"
        ".p2align 4 \n\t" /* FIXME Unroll? */
        "1: \n\t"
        "movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */
        "movq (%%"FF_REG_S", %%"FF_REG_c", 2), %%mm2 \n\t" /* srcData */
        "movq 8(%%"FF_REG_S", %%"FF_REG_c", 2), %%mm5 \n\t" /* srcData */
        "add $16, %%"FF_REG_d" \n\t"
        "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"
        "test %%"FF_REG_S", %%"FF_REG_S" \n\t"
        "pmulhw %%mm0, %%mm2 \n\t"
        "pmulhw %%mm0, %%mm5 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "paddw %%mm5, %%mm4 \n\t"
        " jnz 1b \n\t"
        "psraw $3, %%mm3 \n\t"
        "psraw $3, %%mm4 \n\t"
        "packuswb %%mm4, %%mm3 \n\t"
        MOVNTQ2 " %%mm3, (%1, %%"FF_REG_c")\n\t"
        "add $8, %%"FF_REG_c" \n\t"
        "cmp %2, %%"FF_REG_c" \n\t"
        "movq %%mm6, %%mm3\n\t"
        "movq %%mm7, %%mm4\n\t"
        "mov %0, %%"FF_REG_d" \n\t"
        "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"
        "jb 1b \n\t"
        :: "g" (filter),
           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
        : "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
    );
}
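
/*
 * For readability, a scalar sketch of what the loop above computes, modelled
 * on the generic C path (dither term expanded to 12 bits of headroom, the
 * filtered sum shifted down by 19 and clipped to a byte). The helper name is
 * illustrative only; this block is not part of the build.
 */
#if 0
static void yuv2yuvX_ref(const int16_t *filter, int filterSize,
                         const int16_t **src, uint8_t *dest, int dstW,
                         const uint8_t *dither, int offset)
{
    for (int i = 0; i < dstW; i++) {
        int val = dither[(i + offset) & 7] << 12;   /* rounding/dither term */
        for (int j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];           /* vertical FIR */
        dest[i] = av_clip_uint8(val >> 19);         /* back down to 8 bits */
    }
}
#endif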

#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
        "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* UsrcData */\
        "add %6, %%"FF_REG_S" \n\t"\
        "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm5 \n\t" /* VsrcData */\
        "add $16, %%"FF_REG_d" \n\t"\
        "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
        " jnz 2b \n\t"

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"FF_REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
    " jnz 2b \n\t"

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7)

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW_reg), "m"(uv_off) \
       NAMED_CONSTRAINTS_ADD(bF8,bFC) \
    : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S \
    );

#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
        "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm0 \n\t" /* UsrcData */\
        "add %6, %%"FF_REG_S" \n\t"\
        "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
        "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm1 \n\t" /* UsrcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "add %6, %%"FF_REG_S" \n\t"\
        "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm3 \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
        "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "movq %%mm4, "U_TEMP"(%0) \n\t"\
        "movq %%mm6, "V_TEMP"(%0) \n\t"

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)

#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
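
/*
 * In memory this emits little-endian B,G,R,A per pixel; the "ARGB" in the
 * comments reads each 32-bit word most-significant byte first. Eight pixels
 * (32 bytes) are interleaved and stored per loop iteration.
 */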

static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "movq %%mm2, "U_TEMP"(%0) \n\t"
        "movq %%mm4, "V_TEMP"(%0) \n\t"
        "movq %%mm5, "Y_TEMP"(%0) \n\t"
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
        "movq "Y_TEMP"(%0), %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

static void RENAME(yuv2bgr32_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
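
/*
 * Scalar equivalent of the RGB565 pack above, per pixel (b, g, r being the
 * already-clipped 8-bit components):
 *
 *     uint16_t px = ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3);
 *
 * The MMX version does this bytewise: mask to the 5/6/5 significant bits,
 * interleave B with R, then shift the widened G into place and OR.
 */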

static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB16(%4, "%5", %%FF_REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, "%5", %%FF_REGa)
    YSCALEYUV2PACKEDX_END
}

#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
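
/*
 * Same scheme as WRITERGB16 but 5 bits per channel: R is pre-shifted right
 * by one, so the scalar pack is
 *
 *     uint16_t px = ((r & 0xF8) << 7) | ((g & 0xF8) << 2) | (b >> 3);
 */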

static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB15(%4, "%5", %%FF_REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, "%5", %%FF_REGa)
    YSCALEYUV2PACKEDX_END
}

#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMXEXT(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if COMPILE_TEMPLATE_MMXEXT
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
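
/*
 * Both 24-bit writers store packed B,G,R triplets (dest[3*i] = B); the
 * MMXEXT variant trades the long unpack/shift chain for pshufw shuffles
 * masked against the ff_M24A/B/C constants.
 */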

#if HAVE_6REGS
static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c"\n\t" //FIXME optimize
    "add %4, %%"FF_REG_c" \n\t"
    WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
    );
}

static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t" //FIXME optimize
    "add %4, %%"FF_REG_c" \n\t"
    WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
    );
}
#endif /* HAVE_6REGS */

#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
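
/*
 * YUY2/YUYV layout: after the packs, the bytes are interleaved as
 * Y0 U0 Y1 V0 Y2 U1 Y3 V1 ..., i.e. one U/V sample pair per two luma
 * samples, 16 bytes (8 pixels) stored per iteration.
 */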

static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                     const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrUSrc,
                                     const int16_t **chrVSrc,
                                     int chrFilterSize, const int16_t **alpSrc,
                                     uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    /* mm3=U, %%mm4=V, %%mm1=Y1, %%mm7=Y2 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, "%5", %%FF_REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
                                  const int16_t **lumSrc, int lumFilterSize,
                                  const int16_t *chrFilter, const int16_t **chrUSrc,
                                  const int16_t **chrVSrc,
                                  int chrFilterSize, const int16_t **alpSrc,
                                  uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    /* mm3=U, %%mm4=V, %%mm1=Y1, %%mm7=Y2 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, "%5", %%FF_REGa)
    YSCALEYUV2PACKEDX_END
}

#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax+4] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
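
/*
 * The two-tap vertical blend above, in scalar form (yalpha1/uvalpha1 denote
 * the blend weights the context stores at LUM/CHR_MMX_FILTER_OFFSET+8):
 *
 *     Y = (buf1[i]   >> 4) + (((buf0[i]   - buf1[i])   * yalpha1)  >> 16)
 *     U = (uvbuf1[i] >> 4) + (((uvbuf0[i] - uvbuf1[i]) * uvalpha1) >> 16)
 *
 * followed by the same coefficient math as YSCALEYUV2RGBX.
 */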

/**
 * vertical bilinear scale YV12 to RGB
 */
static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
        const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
#if ARCH_X86_64
        __asm__ volatile(
            YSCALEYUV2RGB(%%r8, %5)
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
               "a" (&c->redDither),
               "r" (abuf0), "r" (abuf1)
            : "%r8"
        );
#else
        c->u_temp = (intptr_t)abuf0;
        c->v_temp = (intptr_t)abuf1;
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"FF_REG_b" \n\t"
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2RGB(%%FF_REGBP, %5)
            "push %0 \n\t"
            "push %1 \n\t"
            "mov "U_TEMP"(%5), %0 \n\t"
            "mov "V_TEMP"(%5), %1 \n\t"
            YSCALEYUV2RGB_YA(%%FF_REGBP, %5, %0, %1)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            "pop %1 \n\t"
            "pop %0 \n\t"
            WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
#endif
    } else {
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"FF_REG_b" \n\t"
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2RGB(%%FF_REGBP, %5)
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}

static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    );
}

static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(bF8)
    );
}

static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(bF8,bFC)
    );
}

#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax+4] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
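
/*
 * Same two-tap blend as YSCALEYUV2RGB, but with the stored weights
 * pre-shifted right by 3 and the buffers kept at >>7: the packed YUV writer
 * wants 8-bit Y/U/V directly rather than the headroom the RGB coefficient
 * math needs.
 */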

static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf[2], uint8_t *dest,
                                  int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        YSCALEYUV2PACKED(%%FF_REGBP, %5)
        WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
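
/*
 * One-tap variants: YSCALEYUV2RGB1 takes a single chroma row (uv >> 4),
 * while YSCALEYUV2RGB1b averages two rows ((uv0 + uv1) >> 5, hence the
 * overflow FIXME above). Alpha, when present, is simply abuf0 >> 7.
 */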

/**
 * YV12 to RGB without scaling or interpolating
 */
static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
            __asm__ volatile(
                "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"FF_REG_b" \n\t"
                "push %%"FF_REG_BP" \n\t"
                YSCALEYUV2RGB1(%%FF_REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
                WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"FF_REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"FF_REG_b" \n\t"
                "push %%"FF_REG_BP" \n\t"
                YSCALEYUV2RGB1(%%FF_REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"FF_REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    } else {
        const int16_t *ubuf1 = ubuf[1];
        if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
            __asm__ volatile(
                "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"FF_REG_b" \n\t"
                "push %%"FF_REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%FF_REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
                WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"FF_REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"FF_REG_b" \n\t"
                "push %%"FF_REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%FF_REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"FF_REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    }
}

static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"FF_REG_b" \n\t"
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2RGB1(%%FF_REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"FF_REG_b" \n\t"
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%FF_REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
        );
    }
}

static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1  = buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // not strictly correct (shifts chrominance by 0.5 pixels), but a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)  \n\t"
            "mov %4, %%"FF_REG_b"                \n\t"
            "push %%"FF_REG_BP"                  \n\t"
            YSCALEYUV2RGB1(%%FF_REGBP, %5)
            "pxor %%mm7, %%mm7                   \n\t"
            /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
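            /* Add a per-channel ordered-dither bias (tables stored in the
             * SwsContext) before each channel is truncated to 5 bits;
             * paddusb saturates, so bright values cannot wrap around. */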
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2    \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4   \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5     \n\t"
#endif
            WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP"                   \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"  \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)  \n\t"
            "mov %4, %%"FF_REG_b"                \n\t"
            "push %%"FF_REG_BP"                  \n\t"
            YSCALEYUV2RGB1b(%%FF_REGBP, %5)
            "pxor %%mm7, %%mm7                   \n\t"
            /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2    \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4   \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5     \n\t"
#endif
            WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP"                   \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"  \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8)
        );
    }
}

static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1  = buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // not strictly correct (shifts chrominance by 0.5 pixels), but a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)  \n\t"
            "mov %4, %%"FF_REG_b"                \n\t"
            "push %%"FF_REG_BP"                  \n\t"
            YSCALEYUV2RGB1(%%FF_REGBP, %5)
            "pxor %%mm7, %%mm7                   \n\t"
            /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2    \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4   \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5     \n\t"
#endif
            WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP"                   \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"  \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8,bFC)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)  \n\t"
            "mov %4, %%"FF_REG_b"                \n\t"
            "push %%"FF_REG_BP"                  \n\t"
            YSCALEYUV2RGB1b(%%FF_REGBP, %5)
            "pxor %%mm7, %%mm7                   \n\t"
            /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2    \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4   \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5     \n\t"
#endif
            WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP"                   \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"  \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8,bFC)
        );
    }
}
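
/* bF8 and bFC above are byte-mask constants (0xF8 / 0xFC per byte, judging
 * by their names): WRITERGB15 only needs the 5-bit mask, while WRITERGB16
 * also needs the 6-bit mask for the wider green channel. */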

#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index"                \n\t"\
    ".p2align 4                            \n\t"\
    "1:                                    \n\t"\
    "movq (%2, "#index"), %%mm3            \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index"     \n\t"\
    "movq (%2, "#index"), %%mm4            \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index"     \n\t"\
    "psraw $7, %%mm3                       \n\t"\
    "psraw $7, %%mm4                       \n\t"\
    "movq (%0, "#index", 2), %%mm1         \n\t" /* buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7        \n\t" /* buf0[eax+4]*/\
    "psraw $7, %%mm1                       \n\t"\
    "psraw $7, %%mm7                       \n\t"

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"                \n\t"\
    ".p2align 4                            \n\t"\
    "1:                                    \n\t"\
    "movq (%2, "#index"), %%mm2            \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3            \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index"     \n\t"\
    "movq (%2, "#index"), %%mm5            \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4            \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index"     \n\t"\
    "paddw %%mm2, %%mm3                    \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4                    \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3                       \n\t"\
    "psrlw $8, %%mm4                       \n\t"\
    "movq (%0, "#index", 2), %%mm1         \n\t" /* buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7        \n\t" /* buf0[eax+4]*/\
    "psraw $7, %%mm1                       \n\t"\
    "psraw $7, %%mm7                       \n\t"

#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
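
/* A rough scalar equivalent of the two chroma variants above, assuming the
 * usual 15-bit fixed-point intermediates (8-bit sample << 7):
 *
 *     // YSCALEYUV2PACKED1: take the single chroma line as-is
 *     int u = ubuf0[i] >> 7;
 *     // YSCALEYUV2PACKED1b: midpoint of the two chroma lines;
 *     // (a + b) >> 8 equals ((a >> 7) + (b >> 7)) / 2 up to rounding
 *     int u = (ubuf0[i] + ubuf1[i]) >> 8;
 *
 * Luma is handled identically in both variants (psraw $7). */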

static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf0, uint8_t *dest,
                                  int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1  = buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // not strictly correct (shifts chrominance by 0.5 pixels), but a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)  \n\t"
            "mov %4, %%"FF_REG_b"                \n\t"
            "push %%"FF_REG_BP"                  \n\t"
            YSCALEYUV2PACKED1(%%FF_REGBP, %5)
            WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP"                   \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"  \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)  \n\t"
            "mov %4, %%"FF_REG_b"                \n\t"
            "push %%"FF_REG_BP"                  \n\t"
            YSCALEYUV2PACKED1b(%%FF_REGBP, %5)
            WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP"                   \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"  \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
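
/* Naming convention for the output helpers selected below (as used throughout
 * swscale): yuv2packed1 handles a single unscaled source line, yuv2packed2
 * blends two lines bilinearly, and yuv2packedX applies a full N-tap vertical
 * filter; the _ar variants trade speed for accurate rounding
 * (SWS_ACCURATE_RND). */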
static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
{
    enum AVPixelFormat dstFormat = c->dstFormat;

    c->use_mmx_vfilter = 0;
    if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12
        && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case AV_PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
#if HAVE_6REGS
                case AV_PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
#endif
                case AV_PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
                case AV_PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
                case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
                default: break;
                }
            }
        } else {
            c->use_mmx_vfilter = 1;
            c->yuv2planeX = RENAME(yuv2yuvX);
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case AV_PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
                case AV_PIX_FMT_BGR32:   c->yuv2packedX = RENAME(yuv2bgr32_X);   break;
#if HAVE_6REGS
                case AV_PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
#endif
                case AV_PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
                case AV_PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
                case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
                default: break;
                }
            }
        }
        if (!(c->flags & SWS_FULL_CHR_H_INT)) {
            switch (c->dstFormat) {
            case AV_PIX_FMT_RGB32:
                c->yuv2packed1 = RENAME(yuv2rgb32_1);
                c->yuv2packed2 = RENAME(yuv2rgb32_2);
                break;
            case AV_PIX_FMT_BGR24:
                c->yuv2packed1 = RENAME(yuv2bgr24_1);
                c->yuv2packed2 = RENAME(yuv2bgr24_2);
                break;
            case AV_PIX_FMT_RGB555:
                c->yuv2packed1 = RENAME(yuv2rgb555_1);
                c->yuv2packed2 = RENAME(yuv2rgb555_2);
                break;
            case AV_PIX_FMT_RGB565:
                c->yuv2packed1 = RENAME(yuv2rgb565_1);
                c->yuv2packed2 = RENAME(yuv2rgb565_2);
                break;
            case AV_PIX_FMT_YUYV422:
                c->yuv2packed1 = RENAME(yuv2yuyv422_1);
                c->yuv2packed2 = RENAME(yuv2yuyv422_2);
                break;
            default:
                break;
            }
        }
    }
  1480. if (c->srcBpc == 8 && c->dstBpc <= 14) {
  1481. // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
  1482. #if COMPILE_TEMPLATE_MMXEXT
  1483. if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
  1484. c->hyscale_fast = ff_hyscale_fast_mmxext;
  1485. c->hcscale_fast = ff_hcscale_fast_mmxext;
  1486. } else {
  1487. #endif /* COMPILE_TEMPLATE_MMXEXT */
  1488. c->hyscale_fast = NULL;
  1489. c->hcscale_fast = NULL;
  1490. #if COMPILE_TEMPLATE_MMXEXT
  1491. }
  1492. #endif /* COMPILE_TEMPLATE_MMXEXT */
  1493. }
  1494. }
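
/* For context: this template is compiled once per instruction-set variant by
 * the file that includes it; a minimal sketch of the usual FFmpeg pattern
 * (macro values assumed, not part of this file):
 *
 *     #define COMPILE_TEMPLATE_MMXEXT 0
 *     #define RENAME(a) a ## _mmx
 *     #include "swscale_template.c"
 *
 *     #undef  RENAME
 *     #undef  COMPILE_TEMPLATE_MMXEXT
 *     #define COMPILE_TEMPLATE_MMXEXT 1
 *     #define RENAME(a) a ## _mmxext
 *     #include "swscale_template.c"
 */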