rgb2rgb.c 100 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542
05520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441
  1. /*
  2. * software RGB to RGB converter
  3. * pluralize by software PAL8 to RGB converter
  4. * software YUV to YUV converter
  5. * software YUV to RGB converter
  6. * Written by Nick Kurshev.
  7. * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  8. *
  9. * This file is part of FFmpeg.
  10. *
  11. * FFmpeg is free software; you can redistribute it and/or
  12. * modify it under the terms of the GNU Lesser General Public
  13. * License as published by the Free Software Foundation; either
  14. * version 2.1 of the License, or (at your option) any later version.
  15. *
  16. * FFmpeg is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  19. * Lesser General Public License for more details.
  20. *
  21. * You should have received a copy of the GNU Lesser General Public
  22. * License along with FFmpeg; if not, write to the Free Software
  23. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24. */
  25. #include <stdint.h>
  26. #include "config.h"
  27. #include "libavutil/attributes.h"
  28. #include "libavutil/x86/cpu.h"
  29. #include "libavutil/cpu.h"
  30. #include "libavutil/bswap.h"
  31. #include "libavutil/mem_internal.h"
  32. #include "libswscale/rgb2rgb.h"
  33. #include "libswscale/swscale.h"
  34. #include "libswscale/swscale_internal.h"
  35. #if HAVE_INLINE_ASM
  36. #include "libavutil/x86/asm.h"
  37. DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL;
  38. DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL;
  39. DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL;
  40. DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL;
  41. DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL;
  42. DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL;
  43. DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL;
  44. DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL;
  45. DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL;
  46. DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL;
  47. DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL;
  48. DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL;
  49. DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL;
  50. DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
  51. DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
  52. DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL;
  53. DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL;
  54. DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL;
  55. #define mask16b mask15b
  56. DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL;
  57. DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL;
  58. DECLARE_ASM_CONST(8, uint64_t, red_16mask) = 0x0000f8000000f800ULL;
  59. DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
  60. DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL;
  61. DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL;
  62. DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
  63. DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL;
  64. DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL;
  65. DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL;
  66. DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL;
  67. DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2YOffset);
  68. DECLARE_ALIGNED(8, extern const uint64_t, ff_w1111);
  69. DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
  70. #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
  71. #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
  72. #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
  73. #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
  74. #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
  75. #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
  76. #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
  77. #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
  78. #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
  79. // MMXEXT versions
  80. #define PREFETCH "prefetchnta"
  81. #define PAVGB "pavgb"
  82. #define MOVNTQ "movntq"
  83. #define SFENCE "sfence"
  84. #define EMMS "emms"
  85. static inline void rgb24tobgr32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  86. {
  87. uint8_t *dest = dst;
  88. const uint8_t *s = src;
  89. const uint8_t *end;
  90. const uint8_t *mm_end;
  91. end = s + src_size;
  92. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  93. mm_end = end - 23;
  94. __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
  95. while (s < mm_end) {
  96. __asm__ volatile(
  97. PREFETCH" 32(%1) \n\t"
  98. "movd (%1), %%mm0 \n\t"
  99. "punpckldq 3(%1), %%mm0 \n\t"
  100. "movd 6(%1), %%mm1 \n\t"
  101. "punpckldq 9(%1), %%mm1 \n\t"
  102. "movd 12(%1), %%mm2 \n\t"
  103. "punpckldq 15(%1), %%mm2 \n\t"
  104. "movd 18(%1), %%mm3 \n\t"
  105. "punpckldq 21(%1), %%mm3 \n\t"
  106. "por %%mm7, %%mm0 \n\t"
  107. "por %%mm7, %%mm1 \n\t"
  108. "por %%mm7, %%mm2 \n\t"
  109. "por %%mm7, %%mm3 \n\t"
  110. MOVNTQ" %%mm0, (%0) \n\t"
  111. MOVNTQ" %%mm1, 8(%0) \n\t"
  112. MOVNTQ" %%mm2, 16(%0) \n\t"
  113. MOVNTQ" %%mm3, 24(%0)"
  114. :: "r"(dest), "r"(s)
  115. :"memory");
  116. dest += 32;
  117. s += 24;
  118. }
  119. __asm__ volatile(SFENCE:::"memory");
  120. __asm__ volatile(EMMS:::"memory");
  121. while (s < end) {
  122. *dest++ = *s++;
  123. *dest++ = *s++;
  124. *dest++ = *s++;
  125. *dest++ = 255;
  126. }
  127. }
  128. #define STORE_BGR24_MMX \
  129. "psrlq $8, %%mm2 \n\t" \
  130. "psrlq $8, %%mm3 \n\t" \
  131. "psrlq $8, %%mm6 \n\t" \
  132. "psrlq $8, %%mm7 \n\t" \
  133. "pand "MANGLE(mask24l)", %%mm0\n\t" \
  134. "pand "MANGLE(mask24l)", %%mm1\n\t" \
  135. "pand "MANGLE(mask24l)", %%mm4\n\t" \
  136. "pand "MANGLE(mask24l)", %%mm5\n\t" \
  137. "pand "MANGLE(mask24h)", %%mm2\n\t" \
  138. "pand "MANGLE(mask24h)", %%mm3\n\t" \
  139. "pand "MANGLE(mask24h)", %%mm6\n\t" \
  140. "pand "MANGLE(mask24h)", %%mm7\n\t" \
  141. "por %%mm2, %%mm0 \n\t" \
  142. "por %%mm3, %%mm1 \n\t" \
  143. "por %%mm6, %%mm4 \n\t" \
  144. "por %%mm7, %%mm5 \n\t" \
  145. \
  146. "movq %%mm1, %%mm2 \n\t" \
  147. "movq %%mm4, %%mm3 \n\t" \
  148. "psllq $48, %%mm2 \n\t" \
  149. "psllq $32, %%mm3 \n\t" \
  150. "por %%mm2, %%mm0 \n\t" \
  151. "psrlq $16, %%mm1 \n\t" \
  152. "psrlq $32, %%mm4 \n\t" \
  153. "psllq $16, %%mm5 \n\t" \
  154. "por %%mm3, %%mm1 \n\t" \
  155. "por %%mm5, %%mm4 \n\t" \
  156. \
  157. MOVNTQ" %%mm0, (%0) \n\t" \
  158. MOVNTQ" %%mm1, 8(%0) \n\t" \
  159. MOVNTQ" %%mm4, 16(%0)"
  160. static inline void rgb32tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  161. {
  162. uint8_t *dest = dst;
  163. const uint8_t *s = src;
  164. const uint8_t *end;
  165. const uint8_t *mm_end;
  166. end = s + src_size;
  167. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  168. mm_end = end - 31;
  169. while (s < mm_end) {
  170. __asm__ volatile(
  171. PREFETCH" 32(%1) \n\t"
  172. "movq (%1), %%mm0 \n\t"
  173. "movq 8(%1), %%mm1 \n\t"
  174. "movq 16(%1), %%mm4 \n\t"
  175. "movq 24(%1), %%mm5 \n\t"
  176. "movq %%mm0, %%mm2 \n\t"
  177. "movq %%mm1, %%mm3 \n\t"
  178. "movq %%mm4, %%mm6 \n\t"
  179. "movq %%mm5, %%mm7 \n\t"
  180. STORE_BGR24_MMX
  181. :: "r"(dest), "r"(s)
  182. NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
  183. :"memory");
  184. dest += 24;
  185. s += 32;
  186. }
  187. __asm__ volatile(SFENCE:::"memory");
  188. __asm__ volatile(EMMS:::"memory");
  189. while (s < end) {
  190. *dest++ = *s++;
  191. *dest++ = *s++;
  192. *dest++ = *s++;
  193. s++;
  194. }
  195. }
  196. /*
  197. original by Strepto/Astral
  198. ported to gcc & bugfixed: A'rpi
  199. MMXEXT, 3DNOW optimization by Nick Kurshev
  200. 32-bit C version, and and&add trick by Michael Niedermayer
  201. */
  202. static inline void rgb15to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  203. {
  204. register const uint8_t* s=src;
  205. register uint8_t* d=dst;
  206. register const uint8_t *end;
  207. const uint8_t *mm_end;
  208. end = s + src_size;
  209. __asm__ volatile(PREFETCH" %0"::"m"(*s));
  210. __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
  211. mm_end = end - 15;
  212. while (s<mm_end) {
  213. __asm__ volatile(
  214. PREFETCH" 32(%1) \n\t"
  215. "movq (%1), %%mm0 \n\t"
  216. "movq 8(%1), %%mm2 \n\t"
  217. "movq %%mm0, %%mm1 \n\t"
  218. "movq %%mm2, %%mm3 \n\t"
  219. "pand %%mm4, %%mm0 \n\t"
  220. "pand %%mm4, %%mm2 \n\t"
  221. "paddw %%mm1, %%mm0 \n\t"
  222. "paddw %%mm3, %%mm2 \n\t"
  223. MOVNTQ" %%mm0, (%0) \n\t"
  224. MOVNTQ" %%mm2, 8(%0)"
  225. :: "r"(d), "r"(s)
  226. );
  227. d+=16;
  228. s+=16;
  229. }
  230. __asm__ volatile(SFENCE:::"memory");
  231. __asm__ volatile(EMMS:::"memory");
  232. mm_end = end - 3;
  233. while (s < mm_end) {
  234. register unsigned x= *((const uint32_t *)s);
  235. *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
  236. d+=4;
  237. s+=4;
  238. }
  239. if (s < end) {
  240. register unsigned short x= *((const uint16_t *)s);
  241. *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
  242. }
  243. }
  244. static inline void rgb16to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  245. {
  246. register const uint8_t* s=src;
  247. register uint8_t* d=dst;
  248. register const uint8_t *end;
  249. const uint8_t *mm_end;
  250. end = s + src_size;
  251. __asm__ volatile(PREFETCH" %0"::"m"(*s));
  252. __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
  253. __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
  254. mm_end = end - 15;
  255. while (s<mm_end) {
  256. __asm__ volatile(
  257. PREFETCH" 32(%1) \n\t"
  258. "movq (%1), %%mm0 \n\t"
  259. "movq 8(%1), %%mm2 \n\t"
  260. "movq %%mm0, %%mm1 \n\t"
  261. "movq %%mm2, %%mm3 \n\t"
  262. "psrlq $1, %%mm0 \n\t"
  263. "psrlq $1, %%mm2 \n\t"
  264. "pand %%mm7, %%mm0 \n\t"
  265. "pand %%mm7, %%mm2 \n\t"
  266. "pand %%mm6, %%mm1 \n\t"
  267. "pand %%mm6, %%mm3 \n\t"
  268. "por %%mm1, %%mm0 \n\t"
  269. "por %%mm3, %%mm2 \n\t"
  270. MOVNTQ" %%mm0, (%0) \n\t"
  271. MOVNTQ" %%mm2, 8(%0)"
  272. :: "r"(d), "r"(s)
  273. );
  274. d+=16;
  275. s+=16;
  276. }
  277. __asm__ volatile(SFENCE:::"memory");
  278. __asm__ volatile(EMMS:::"memory");
  279. mm_end = end - 3;
  280. while (s < mm_end) {
  281. register uint32_t x= *((const uint32_t*)s);
  282. *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
  283. s+=4;
  284. d+=4;
  285. }
  286. if (s < end) {
  287. register uint16_t x= *((const uint16_t*)s);
  288. *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
  289. }
  290. }
  291. static inline void rgb32to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  292. {
  293. const uint8_t *s = src;
  294. const uint8_t *end;
  295. const uint8_t *mm_end;
  296. uint16_t *d = (uint16_t *)dst;
  297. end = s + src_size;
  298. mm_end = end - 15;
  299. __asm__ volatile(
  300. "movq %3, %%mm5 \n\t"
  301. "movq %4, %%mm6 \n\t"
  302. "movq %5, %%mm7 \n\t"
  303. "jmp 2f \n\t"
  304. ".p2align 4 \n\t"
  305. "1: \n\t"
  306. PREFETCH" 32(%1) \n\t"
  307. "movd (%1), %%mm0 \n\t"
  308. "movd 4(%1), %%mm3 \n\t"
  309. "punpckldq 8(%1), %%mm0 \n\t"
  310. "punpckldq 12(%1), %%mm3 \n\t"
  311. "movq %%mm0, %%mm1 \n\t"
  312. "movq %%mm3, %%mm4 \n\t"
  313. "pand %%mm6, %%mm0 \n\t"
  314. "pand %%mm6, %%mm3 \n\t"
  315. "pmaddwd %%mm7, %%mm0 \n\t"
  316. "pmaddwd %%mm7, %%mm3 \n\t"
  317. "pand %%mm5, %%mm1 \n\t"
  318. "pand %%mm5, %%mm4 \n\t"
  319. "por %%mm1, %%mm0 \n\t"
  320. "por %%mm4, %%mm3 \n\t"
  321. "psrld $5, %%mm0 \n\t"
  322. "pslld $11, %%mm3 \n\t"
  323. "por %%mm3, %%mm0 \n\t"
  324. MOVNTQ" %%mm0, (%0) \n\t"
  325. "add $16, %1 \n\t"
  326. "add $8, %0 \n\t"
  327. "2: \n\t"
  328. "cmp %2, %1 \n\t"
  329. " jb 1b \n\t"
  330. : "+r" (d), "+r"(s)
  331. : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
  332. );
  333. __asm__ volatile(SFENCE:::"memory");
  334. __asm__ volatile(EMMS:::"memory");
  335. while (s < end) {
  336. register int rgb = *(const uint32_t*)s; s += 4;
  337. *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
  338. }
  339. }
  340. static inline void rgb32tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  341. {
  342. const uint8_t *s = src;
  343. const uint8_t *end;
  344. const uint8_t *mm_end;
  345. uint16_t *d = (uint16_t *)dst;
  346. end = s + src_size;
  347. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  348. __asm__ volatile(
  349. "movq %0, %%mm7 \n\t"
  350. "movq %1, %%mm6 \n\t"
  351. ::"m"(red_16mask),"m"(green_16mask));
  352. mm_end = end - 15;
  353. while (s < mm_end) {
  354. __asm__ volatile(
  355. PREFETCH" 32(%1) \n\t"
  356. "movd (%1), %%mm0 \n\t"
  357. "movd 4(%1), %%mm3 \n\t"
  358. "punpckldq 8(%1), %%mm0 \n\t"
  359. "punpckldq 12(%1), %%mm3 \n\t"
  360. "movq %%mm0, %%mm1 \n\t"
  361. "movq %%mm0, %%mm2 \n\t"
  362. "movq %%mm3, %%mm4 \n\t"
  363. "movq %%mm3, %%mm5 \n\t"
  364. "psllq $8, %%mm0 \n\t"
  365. "psllq $8, %%mm3 \n\t"
  366. "pand %%mm7, %%mm0 \n\t"
  367. "pand %%mm7, %%mm3 \n\t"
  368. "psrlq $5, %%mm1 \n\t"
  369. "psrlq $5, %%mm4 \n\t"
  370. "pand %%mm6, %%mm1 \n\t"
  371. "pand %%mm6, %%mm4 \n\t"
  372. "psrlq $19, %%mm2 \n\t"
  373. "psrlq $19, %%mm5 \n\t"
  374. "pand %2, %%mm2 \n\t"
  375. "pand %2, %%mm5 \n\t"
  376. "por %%mm1, %%mm0 \n\t"
  377. "por %%mm4, %%mm3 \n\t"
  378. "por %%mm2, %%mm0 \n\t"
  379. "por %%mm5, %%mm3 \n\t"
  380. "psllq $16, %%mm3 \n\t"
  381. "por %%mm3, %%mm0 \n\t"
  382. MOVNTQ" %%mm0, (%0) \n\t"
  383. :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
  384. d += 4;
  385. s += 16;
  386. }
  387. __asm__ volatile(SFENCE:::"memory");
  388. __asm__ volatile(EMMS:::"memory");
  389. while (s < end) {
  390. register int rgb = *(const uint32_t*)s; s += 4;
  391. *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
  392. }
  393. }
  394. static inline void rgb32to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  395. {
  396. const uint8_t *s = src;
  397. const uint8_t *end;
  398. const uint8_t *mm_end;
  399. uint16_t *d = (uint16_t *)dst;
  400. end = s + src_size;
  401. mm_end = end - 15;
  402. __asm__ volatile(
  403. "movq %3, %%mm5 \n\t"
  404. "movq %4, %%mm6 \n\t"
  405. "movq %5, %%mm7 \n\t"
  406. "jmp 2f \n\t"
  407. ".p2align 4 \n\t"
  408. "1: \n\t"
  409. PREFETCH" 32(%1) \n\t"
  410. "movd (%1), %%mm0 \n\t"
  411. "movd 4(%1), %%mm3 \n\t"
  412. "punpckldq 8(%1), %%mm0 \n\t"
  413. "punpckldq 12(%1), %%mm3 \n\t"
  414. "movq %%mm0, %%mm1 \n\t"
  415. "movq %%mm3, %%mm4 \n\t"
  416. "pand %%mm6, %%mm0 \n\t"
  417. "pand %%mm6, %%mm3 \n\t"
  418. "pmaddwd %%mm7, %%mm0 \n\t"
  419. "pmaddwd %%mm7, %%mm3 \n\t"
  420. "pand %%mm5, %%mm1 \n\t"
  421. "pand %%mm5, %%mm4 \n\t"
  422. "por %%mm1, %%mm0 \n\t"
  423. "por %%mm4, %%mm3 \n\t"
  424. "psrld $6, %%mm0 \n\t"
  425. "pslld $10, %%mm3 \n\t"
  426. "por %%mm3, %%mm0 \n\t"
  427. MOVNTQ" %%mm0, (%0) \n\t"
  428. "add $16, %1 \n\t"
  429. "add $8, %0 \n\t"
  430. "2: \n\t"
  431. "cmp %2, %1 \n\t"
  432. " jb 1b \n\t"
  433. : "+r" (d), "+r"(s)
  434. : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
  435. );
  436. __asm__ volatile(SFENCE:::"memory");
  437. __asm__ volatile(EMMS:::"memory");
  438. while (s < end) {
  439. register int rgb = *(const uint32_t*)s; s += 4;
  440. *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
  441. }
  442. }
  443. static inline void rgb32tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  444. {
  445. const uint8_t *s = src;
  446. const uint8_t *end;
  447. const uint8_t *mm_end;
  448. uint16_t *d = (uint16_t *)dst;
  449. end = s + src_size;
  450. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  451. __asm__ volatile(
  452. "movq %0, %%mm7 \n\t"
  453. "movq %1, %%mm6 \n\t"
  454. ::"m"(red_15mask),"m"(green_15mask));
  455. mm_end = end - 15;
  456. while (s < mm_end) {
  457. __asm__ volatile(
  458. PREFETCH" 32(%1) \n\t"
  459. "movd (%1), %%mm0 \n\t"
  460. "movd 4(%1), %%mm3 \n\t"
  461. "punpckldq 8(%1), %%mm0 \n\t"
  462. "punpckldq 12(%1), %%mm3 \n\t"
  463. "movq %%mm0, %%mm1 \n\t"
  464. "movq %%mm0, %%mm2 \n\t"
  465. "movq %%mm3, %%mm4 \n\t"
  466. "movq %%mm3, %%mm5 \n\t"
  467. "psllq $7, %%mm0 \n\t"
  468. "psllq $7, %%mm3 \n\t"
  469. "pand %%mm7, %%mm0 \n\t"
  470. "pand %%mm7, %%mm3 \n\t"
  471. "psrlq $6, %%mm1 \n\t"
  472. "psrlq $6, %%mm4 \n\t"
  473. "pand %%mm6, %%mm1 \n\t"
  474. "pand %%mm6, %%mm4 \n\t"
  475. "psrlq $19, %%mm2 \n\t"
  476. "psrlq $19, %%mm5 \n\t"
  477. "pand %2, %%mm2 \n\t"
  478. "pand %2, %%mm5 \n\t"
  479. "por %%mm1, %%mm0 \n\t"
  480. "por %%mm4, %%mm3 \n\t"
  481. "por %%mm2, %%mm0 \n\t"
  482. "por %%mm5, %%mm3 \n\t"
  483. "psllq $16, %%mm3 \n\t"
  484. "por %%mm3, %%mm0 \n\t"
  485. MOVNTQ" %%mm0, (%0) \n\t"
  486. ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
  487. d += 4;
  488. s += 16;
  489. }
  490. __asm__ volatile(SFENCE:::"memory");
  491. __asm__ volatile(EMMS:::"memory");
  492. while (s < end) {
  493. register int rgb = *(const uint32_t*)s; s += 4;
  494. *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
  495. }
  496. }
  497. static inline void rgb24tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  498. {
  499. const uint8_t *s = src;
  500. const uint8_t *end;
  501. const uint8_t *mm_end;
  502. uint16_t *d = (uint16_t *)dst;
  503. end = s + src_size;
  504. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  505. __asm__ volatile(
  506. "movq %0, %%mm7 \n\t"
  507. "movq %1, %%mm6 \n\t"
  508. ::"m"(red_16mask),"m"(green_16mask));
  509. mm_end = end - 11;
  510. while (s < mm_end) {
  511. __asm__ volatile(
  512. PREFETCH" 32(%1) \n\t"
  513. "movd (%1), %%mm0 \n\t"
  514. "movd 3(%1), %%mm3 \n\t"
  515. "punpckldq 6(%1), %%mm0 \n\t"
  516. "punpckldq 9(%1), %%mm3 \n\t"
  517. "movq %%mm0, %%mm1 \n\t"
  518. "movq %%mm0, %%mm2 \n\t"
  519. "movq %%mm3, %%mm4 \n\t"
  520. "movq %%mm3, %%mm5 \n\t"
  521. "psrlq $3, %%mm0 \n\t"
  522. "psrlq $3, %%mm3 \n\t"
  523. "pand %2, %%mm0 \n\t"
  524. "pand %2, %%mm3 \n\t"
  525. "psrlq $5, %%mm1 \n\t"
  526. "psrlq $5, %%mm4 \n\t"
  527. "pand %%mm6, %%mm1 \n\t"
  528. "pand %%mm6, %%mm4 \n\t"
  529. "psrlq $8, %%mm2 \n\t"
  530. "psrlq $8, %%mm5 \n\t"
  531. "pand %%mm7, %%mm2 \n\t"
  532. "pand %%mm7, %%mm5 \n\t"
  533. "por %%mm1, %%mm0 \n\t"
  534. "por %%mm4, %%mm3 \n\t"
  535. "por %%mm2, %%mm0 \n\t"
  536. "por %%mm5, %%mm3 \n\t"
  537. "psllq $16, %%mm3 \n\t"
  538. "por %%mm3, %%mm0 \n\t"
  539. MOVNTQ" %%mm0, (%0) \n\t"
  540. ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
  541. d += 4;
  542. s += 12;
  543. }
  544. __asm__ volatile(SFENCE:::"memory");
  545. __asm__ volatile(EMMS:::"memory");
  546. while (s < end) {
  547. const int b = *s++;
  548. const int g = *s++;
  549. const int r = *s++;
  550. *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
  551. }
  552. }
  553. static inline void rgb24to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  554. {
  555. const uint8_t *s = src;
  556. const uint8_t *end;
  557. const uint8_t *mm_end;
  558. uint16_t *d = (uint16_t *)dst;
  559. end = s + src_size;
  560. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  561. __asm__ volatile(
  562. "movq %0, %%mm7 \n\t"
  563. "movq %1, %%mm6 \n\t"
  564. ::"m"(red_16mask),"m"(green_16mask));
  565. mm_end = end - 15;
  566. while (s < mm_end) {
  567. __asm__ volatile(
  568. PREFETCH" 32(%1) \n\t"
  569. "movd (%1), %%mm0 \n\t"
  570. "movd 3(%1), %%mm3 \n\t"
  571. "punpckldq 6(%1), %%mm0 \n\t"
  572. "punpckldq 9(%1), %%mm3 \n\t"
  573. "movq %%mm0, %%mm1 \n\t"
  574. "movq %%mm0, %%mm2 \n\t"
  575. "movq %%mm3, %%mm4 \n\t"
  576. "movq %%mm3, %%mm5 \n\t"
  577. "psllq $8, %%mm0 \n\t"
  578. "psllq $8, %%mm3 \n\t"
  579. "pand %%mm7, %%mm0 \n\t"
  580. "pand %%mm7, %%mm3 \n\t"
  581. "psrlq $5, %%mm1 \n\t"
  582. "psrlq $5, %%mm4 \n\t"
  583. "pand %%mm6, %%mm1 \n\t"
  584. "pand %%mm6, %%mm4 \n\t"
  585. "psrlq $19, %%mm2 \n\t"
  586. "psrlq $19, %%mm5 \n\t"
  587. "pand %2, %%mm2 \n\t"
  588. "pand %2, %%mm5 \n\t"
  589. "por %%mm1, %%mm0 \n\t"
  590. "por %%mm4, %%mm3 \n\t"
  591. "por %%mm2, %%mm0 \n\t"
  592. "por %%mm5, %%mm3 \n\t"
  593. "psllq $16, %%mm3 \n\t"
  594. "por %%mm3, %%mm0 \n\t"
  595. MOVNTQ" %%mm0, (%0) \n\t"
  596. ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
  597. d += 4;
  598. s += 12;
  599. }
  600. __asm__ volatile(SFENCE:::"memory");
  601. __asm__ volatile(EMMS:::"memory");
  602. while (s < end) {
  603. const int r = *s++;
  604. const int g = *s++;
  605. const int b = *s++;
  606. *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
  607. }
  608. }
  609. static inline void rgb24tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  610. {
  611. const uint8_t *s = src;
  612. const uint8_t *end;
  613. const uint8_t *mm_end;
  614. uint16_t *d = (uint16_t *)dst;
  615. end = s + src_size;
  616. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  617. __asm__ volatile(
  618. "movq %0, %%mm7 \n\t"
  619. "movq %1, %%mm6 \n\t"
  620. ::"m"(red_15mask),"m"(green_15mask));
  621. mm_end = end - 11;
  622. while (s < mm_end) {
  623. __asm__ volatile(
  624. PREFETCH" 32(%1) \n\t"
  625. "movd (%1), %%mm0 \n\t"
  626. "movd 3(%1), %%mm3 \n\t"
  627. "punpckldq 6(%1), %%mm0 \n\t"
  628. "punpckldq 9(%1), %%mm3 \n\t"
  629. "movq %%mm0, %%mm1 \n\t"
  630. "movq %%mm0, %%mm2 \n\t"
  631. "movq %%mm3, %%mm4 \n\t"
  632. "movq %%mm3, %%mm5 \n\t"
  633. "psrlq $3, %%mm0 \n\t"
  634. "psrlq $3, %%mm3 \n\t"
  635. "pand %2, %%mm0 \n\t"
  636. "pand %2, %%mm3 \n\t"
  637. "psrlq $6, %%mm1 \n\t"
  638. "psrlq $6, %%mm4 \n\t"
  639. "pand %%mm6, %%mm1 \n\t"
  640. "pand %%mm6, %%mm4 \n\t"
  641. "psrlq $9, %%mm2 \n\t"
  642. "psrlq $9, %%mm5 \n\t"
  643. "pand %%mm7, %%mm2 \n\t"
  644. "pand %%mm7, %%mm5 \n\t"
  645. "por %%mm1, %%mm0 \n\t"
  646. "por %%mm4, %%mm3 \n\t"
  647. "por %%mm2, %%mm0 \n\t"
  648. "por %%mm5, %%mm3 \n\t"
  649. "psllq $16, %%mm3 \n\t"
  650. "por %%mm3, %%mm0 \n\t"
  651. MOVNTQ" %%mm0, (%0) \n\t"
  652. ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
  653. d += 4;
  654. s += 12;
  655. }
  656. __asm__ volatile(SFENCE:::"memory");
  657. __asm__ volatile(EMMS:::"memory");
  658. while (s < end) {
  659. const int b = *s++;
  660. const int g = *s++;
  661. const int r = *s++;
  662. *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
  663. }
  664. }
  665. static inline void rgb24to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  666. {
  667. const uint8_t *s = src;
  668. const uint8_t *end;
  669. const uint8_t *mm_end;
  670. uint16_t *d = (uint16_t *)dst;
  671. end = s + src_size;
  672. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  673. __asm__ volatile(
  674. "movq %0, %%mm7 \n\t"
  675. "movq %1, %%mm6 \n\t"
  676. ::"m"(red_15mask),"m"(green_15mask));
  677. mm_end = end - 15;
  678. while (s < mm_end) {
  679. __asm__ volatile(
  680. PREFETCH" 32(%1) \n\t"
  681. "movd (%1), %%mm0 \n\t"
  682. "movd 3(%1), %%mm3 \n\t"
  683. "punpckldq 6(%1), %%mm0 \n\t"
  684. "punpckldq 9(%1), %%mm3 \n\t"
  685. "movq %%mm0, %%mm1 \n\t"
  686. "movq %%mm0, %%mm2 \n\t"
  687. "movq %%mm3, %%mm4 \n\t"
  688. "movq %%mm3, %%mm5 \n\t"
  689. "psllq $7, %%mm0 \n\t"
  690. "psllq $7, %%mm3 \n\t"
  691. "pand %%mm7, %%mm0 \n\t"
  692. "pand %%mm7, %%mm3 \n\t"
  693. "psrlq $6, %%mm1 \n\t"
  694. "psrlq $6, %%mm4 \n\t"
  695. "pand %%mm6, %%mm1 \n\t"
  696. "pand %%mm6, %%mm4 \n\t"
  697. "psrlq $19, %%mm2 \n\t"
  698. "psrlq $19, %%mm5 \n\t"
  699. "pand %2, %%mm2 \n\t"
  700. "pand %2, %%mm5 \n\t"
  701. "por %%mm1, %%mm0 \n\t"
  702. "por %%mm4, %%mm3 \n\t"
  703. "por %%mm2, %%mm0 \n\t"
  704. "por %%mm5, %%mm3 \n\t"
  705. "psllq $16, %%mm3 \n\t"
  706. "por %%mm3, %%mm0 \n\t"
  707. MOVNTQ" %%mm0, (%0) \n\t"
  708. ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
  709. d += 4;
  710. s += 12;
  711. }
  712. __asm__ volatile(SFENCE:::"memory");
  713. __asm__ volatile(EMMS:::"memory");
  714. while (s < end) {
  715. const int r = *s++;
  716. const int g = *s++;
  717. const int b = *s++;
  718. *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
  719. }
  720. }
  721. static inline void rgb15tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  722. {
  723. const uint16_t *end;
  724. const uint16_t *mm_end;
  725. uint8_t *d = dst;
  726. const uint16_t *s = (const uint16_t*)src;
  727. end = s + src_size/2;
  728. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  729. mm_end = end - 7;
  730. while (s < mm_end) {
  731. __asm__ volatile(
  732. PREFETCH" 32(%1) \n\t"
  733. "movq (%1), %%mm0 \n\t"
  734. "movq (%1), %%mm1 \n\t"
  735. "movq (%1), %%mm2 \n\t"
  736. "pand %2, %%mm0 \n\t"
  737. "pand %3, %%mm1 \n\t"
  738. "pand %4, %%mm2 \n\t"
  739. "psllq $5, %%mm0 \n\t"
  740. "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
  741. "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
  742. "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
  743. "movq %%mm0, %%mm3 \n\t"
  744. "movq %%mm1, %%mm4 \n\t"
  745. "movq %%mm2, %%mm5 \n\t"
  746. "punpcklwd %5, %%mm0 \n\t"
  747. "punpcklwd %5, %%mm1 \n\t"
  748. "punpcklwd %5, %%mm2 \n\t"
  749. "punpckhwd %5, %%mm3 \n\t"
  750. "punpckhwd %5, %%mm4 \n\t"
  751. "punpckhwd %5, %%mm5 \n\t"
  752. "psllq $8, %%mm1 \n\t"
  753. "psllq $16, %%mm2 \n\t"
  754. "por %%mm1, %%mm0 \n\t"
  755. "por %%mm2, %%mm0 \n\t"
  756. "psllq $8, %%mm4 \n\t"
  757. "psllq $16, %%mm5 \n\t"
  758. "por %%mm4, %%mm3 \n\t"
  759. "por %%mm5, %%mm3 \n\t"
  760. "movq %%mm0, %%mm6 \n\t"
  761. "movq %%mm3, %%mm7 \n\t"
  762. "movq 8(%1), %%mm0 \n\t"
  763. "movq 8(%1), %%mm1 \n\t"
  764. "movq 8(%1), %%mm2 \n\t"
  765. "pand %2, %%mm0 \n\t"
  766. "pand %3, %%mm1 \n\t"
  767. "pand %4, %%mm2 \n\t"
  768. "psllq $5, %%mm0 \n\t"
  769. "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
  770. "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
  771. "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
  772. "movq %%mm0, %%mm3 \n\t"
  773. "movq %%mm1, %%mm4 \n\t"
  774. "movq %%mm2, %%mm5 \n\t"
  775. "punpcklwd %5, %%mm0 \n\t"
  776. "punpcklwd %5, %%mm1 \n\t"
  777. "punpcklwd %5, %%mm2 \n\t"
  778. "punpckhwd %5, %%mm3 \n\t"
  779. "punpckhwd %5, %%mm4 \n\t"
  780. "punpckhwd %5, %%mm5 \n\t"
  781. "psllq $8, %%mm1 \n\t"
  782. "psllq $16, %%mm2 \n\t"
  783. "por %%mm1, %%mm0 \n\t"
  784. "por %%mm2, %%mm0 \n\t"
  785. "psllq $8, %%mm4 \n\t"
  786. "psllq $16, %%mm5 \n\t"
  787. "por %%mm4, %%mm3 \n\t"
  788. "por %%mm5, %%mm3 \n\t"
  789. :"=m"(*d)
  790. :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
  791. NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi)
  792. :"memory");
  793. /* borrowed 32 to 24 */
  794. __asm__ volatile(
  795. "movq %%mm0, %%mm4 \n\t"
  796. "movq %%mm3, %%mm5 \n\t"
  797. "movq %%mm6, %%mm0 \n\t"
  798. "movq %%mm7, %%mm1 \n\t"
  799. "movq %%mm4, %%mm6 \n\t"
  800. "movq %%mm5, %%mm7 \n\t"
  801. "movq %%mm0, %%mm2 \n\t"
  802. "movq %%mm1, %%mm3 \n\t"
  803. STORE_BGR24_MMX
  804. :: "r"(d), "m"(*s)
  805. NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
  806. :"memory");
  807. d += 24;
  808. s += 8;
  809. }
  810. __asm__ volatile(SFENCE:::"memory");
  811. __asm__ volatile(EMMS:::"memory");
  812. while (s < end) {
  813. register uint16_t bgr;
  814. bgr = *s++;
  815. *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
  816. *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
  817. *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
  818. }
  819. }
  820. static inline void rgb16tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  821. {
  822. const uint16_t *end;
  823. const uint16_t *mm_end;
  824. uint8_t *d = (uint8_t *)dst;
  825. const uint16_t *s = (const uint16_t *)src;
  826. end = s + src_size/2;
  827. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  828. mm_end = end - 7;
  829. while (s < mm_end) {
  830. __asm__ volatile(
  831. PREFETCH" 32(%1) \n\t"
  832. "movq (%1), %%mm0 \n\t"
  833. "movq (%1), %%mm1 \n\t"
  834. "movq (%1), %%mm2 \n\t"
  835. "pand %2, %%mm0 \n\t"
  836. "pand %3, %%mm1 \n\t"
  837. "pand %4, %%mm2 \n\t"
  838. "psllq $5, %%mm0 \n\t"
  839. "psrlq $1, %%mm2 \n\t"
  840. "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
  841. "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
  842. "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
  843. "movq %%mm0, %%mm3 \n\t"
  844. "movq %%mm1, %%mm4 \n\t"
  845. "movq %%mm2, %%mm5 \n\t"
  846. "punpcklwd %5, %%mm0 \n\t"
  847. "punpcklwd %5, %%mm1 \n\t"
  848. "punpcklwd %5, %%mm2 \n\t"
  849. "punpckhwd %5, %%mm3 \n\t"
  850. "punpckhwd %5, %%mm4 \n\t"
  851. "punpckhwd %5, %%mm5 \n\t"
  852. "psllq $8, %%mm1 \n\t"
  853. "psllq $16, %%mm2 \n\t"
  854. "por %%mm1, %%mm0 \n\t"
  855. "por %%mm2, %%mm0 \n\t"
  856. "psllq $8, %%mm4 \n\t"
  857. "psllq $16, %%mm5 \n\t"
  858. "por %%mm4, %%mm3 \n\t"
  859. "por %%mm5, %%mm3 \n\t"
  860. "movq %%mm0, %%mm6 \n\t"
  861. "movq %%mm3, %%mm7 \n\t"
  862. "movq 8(%1), %%mm0 \n\t"
  863. "movq 8(%1), %%mm1 \n\t"
  864. "movq 8(%1), %%mm2 \n\t"
  865. "pand %2, %%mm0 \n\t"
  866. "pand %3, %%mm1 \n\t"
  867. "pand %4, %%mm2 \n\t"
  868. "psllq $5, %%mm0 \n\t"
  869. "psrlq $1, %%mm2 \n\t"
  870. "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
  871. "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
  872. "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
  873. "movq %%mm0, %%mm3 \n\t"
  874. "movq %%mm1, %%mm4 \n\t"
  875. "movq %%mm2, %%mm5 \n\t"
  876. "punpcklwd %5, %%mm0 \n\t"
  877. "punpcklwd %5, %%mm1 \n\t"
  878. "punpcklwd %5, %%mm2 \n\t"
  879. "punpckhwd %5, %%mm3 \n\t"
  880. "punpckhwd %5, %%mm4 \n\t"
  881. "punpckhwd %5, %%mm5 \n\t"
  882. "psllq $8, %%mm1 \n\t"
  883. "psllq $16, %%mm2 \n\t"
  884. "por %%mm1, %%mm0 \n\t"
  885. "por %%mm2, %%mm0 \n\t"
  886. "psllq $8, %%mm4 \n\t"
  887. "psllq $16, %%mm5 \n\t"
  888. "por %%mm4, %%mm3 \n\t"
  889. "por %%mm5, %%mm3 \n\t"
  890. :"=m"(*d)
  891. :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
  892. NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi)
  893. :"memory");
  894. /* borrowed 32 to 24 */
  895. __asm__ volatile(
  896. "movq %%mm0, %%mm4 \n\t"
  897. "movq %%mm3, %%mm5 \n\t"
  898. "movq %%mm6, %%mm0 \n\t"
  899. "movq %%mm7, %%mm1 \n\t"
  900. "movq %%mm4, %%mm6 \n\t"
  901. "movq %%mm5, %%mm7 \n\t"
  902. "movq %%mm0, %%mm2 \n\t"
  903. "movq %%mm1, %%mm3 \n\t"
  904. STORE_BGR24_MMX
  905. :: "r"(d), "m"(*s)
  906. NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
  907. :"memory");
  908. d += 24;
  909. s += 8;
  910. }
  911. __asm__ volatile(SFENCE:::"memory");
  912. __asm__ volatile(EMMS:::"memory");
  913. while (s < end) {
  914. register uint16_t bgr;
  915. bgr = *s++;
  916. *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
  917. *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
  918. *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
  919. }
  920. }
  921. /*
  922. * mm0 = 00 B3 00 B2 00 B1 00 B0
  923. * mm1 = 00 G3 00 G2 00 G1 00 G0
  924. * mm2 = 00 R3 00 R2 00 R1 00 R0
  925. * mm6 = FF FF FF FF FF FF FF FF
  926. * mm7 = 00 00 00 00 00 00 00 00
  927. */
  928. #define PACK_RGB32 \
  929. "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
  930. "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
  931. "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
  932. "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
  933. "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
  934. "movq %%mm0, %%mm3 \n\t" \
  935. "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
  936. "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
  937. MOVNTQ" %%mm0, (%0) \n\t" \
  938. MOVNTQ" %%mm3, 8(%0) \n\t" \
  939. static inline void rgb15to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  940. {
  941. const uint16_t *end;
  942. const uint16_t *mm_end;
  943. uint8_t *d = dst;
  944. const uint16_t *s = (const uint16_t *)src;
  945. end = s + src_size/2;
  946. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  947. __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
  948. __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
  949. mm_end = end - 3;
  950. while (s < mm_end) {
  951. __asm__ volatile(
  952. PREFETCH" 32(%1) \n\t"
  953. "movq (%1), %%mm0 \n\t"
  954. "movq (%1), %%mm1 \n\t"
  955. "movq (%1), %%mm2 \n\t"
  956. "pand %2, %%mm0 \n\t"
  957. "pand %3, %%mm1 \n\t"
  958. "pand %4, %%mm2 \n\t"
  959. "psllq $5, %%mm0 \n\t"
  960. "pmulhw %5, %%mm0 \n\t"
  961. "pmulhw %5, %%mm1 \n\t"
  962. "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
  963. PACK_RGB32
  964. ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
  965. NAMED_CONSTRAINTS_ADD(mul15_hi)
  966. :"memory");
  967. d += 16;
  968. s += 4;
  969. }
  970. __asm__ volatile(SFENCE:::"memory");
  971. __asm__ volatile(EMMS:::"memory");
  972. while (s < end) {
  973. register uint16_t bgr;
  974. bgr = *s++;
  975. *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
  976. *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
  977. *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
  978. *d++ = 255;
  979. }
  980. }
  981. static inline void rgb16to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  982. {
  983. const uint16_t *end;
  984. const uint16_t *mm_end;
  985. uint8_t *d = dst;
  986. const uint16_t *s = (const uint16_t*)src;
  987. end = s + src_size/2;
  988. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  989. __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
  990. __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
  991. mm_end = end - 3;
  992. while (s < mm_end) {
  993. __asm__ volatile(
  994. PREFETCH" 32(%1) \n\t"
  995. "movq (%1), %%mm0 \n\t"
  996. "movq (%1), %%mm1 \n\t"
  997. "movq (%1), %%mm2 \n\t"
  998. "pand %2, %%mm0 \n\t"
  999. "pand %3, %%mm1 \n\t"
  1000. "pand %4, %%mm2 \n\t"
  1001. "psllq $5, %%mm0 \n\t"
  1002. "psrlq $1, %%mm2 \n\t"
  1003. "pmulhw %5, %%mm0 \n\t"
  1004. "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
  1005. "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
  1006. PACK_RGB32
  1007. ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
  1008. NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi)
  1009. :"memory");
  1010. d += 16;
  1011. s += 4;
  1012. }
  1013. __asm__ volatile(SFENCE:::"memory");
  1014. __asm__ volatile(EMMS:::"memory");
  1015. while (s < end) {
  1016. register uint16_t bgr;
  1017. bgr = *s++;
  1018. *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
  1019. *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
  1020. *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
  1021. *d++ = 255;
  1022. }
  1023. }
  1024. static inline void rgb24tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  1025. {
  1026. x86_reg mmx_size= 23 - src_size;
  1027. __asm__ volatile (
  1028. "test %%"FF_REG_a", %%"FF_REG_a" \n\t"
  1029. "jns 2f \n\t"
  1030. "movq "MANGLE(mask24r)", %%mm5 \n\t"
  1031. "movq "MANGLE(mask24g)", %%mm6 \n\t"
  1032. "movq "MANGLE(mask24b)", %%mm7 \n\t"
  1033. ".p2align 4 \n\t"
  1034. "1: \n\t"
  1035. PREFETCH" 32(%1, %%"FF_REG_a") \n\t"
  1036. "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG
  1037. "movq (%1, %%"FF_REG_a"), %%mm1 \n\t" // BGR BGR BG
  1038. "movq 2(%1, %%"FF_REG_a"), %%mm2 \n\t" // R BGR BGR B
  1039. "psllq $16, %%mm0 \n\t" // 00 BGR BGR
  1040. "pand %%mm5, %%mm0 \n\t"
  1041. "pand %%mm6, %%mm1 \n\t"
  1042. "pand %%mm7, %%mm2 \n\t"
  1043. "por %%mm0, %%mm1 \n\t"
  1044. "por %%mm2, %%mm1 \n\t"
  1045. "movq 6(%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG
  1046. MOVNTQ" %%mm1,(%2, %%"FF_REG_a") \n\t" // RGB RGB RG
  1047. "movq 8(%1, %%"FF_REG_a"), %%mm1 \n\t" // R BGR BGR B
  1048. "movq 10(%1, %%"FF_REG_a"), %%mm2 \n\t" // GR BGR BGR
  1049. "pand %%mm7, %%mm0 \n\t"
  1050. "pand %%mm5, %%mm1 \n\t"
  1051. "pand %%mm6, %%mm2 \n\t"
  1052. "por %%mm0, %%mm1 \n\t"
  1053. "por %%mm2, %%mm1 \n\t"
  1054. "movq 14(%1, %%"FF_REG_a"), %%mm0 \n\t" // R BGR BGR B
  1055. MOVNTQ" %%mm1, 8(%2, %%"FF_REG_a")\n\t" // B RGB RGB R
  1056. "movq 16(%1, %%"FF_REG_a"), %%mm1 \n\t" // GR BGR BGR
  1057. "movq 18(%1, %%"FF_REG_a"), %%mm2 \n\t" // BGR BGR BG
  1058. "pand %%mm6, %%mm0 \n\t"
  1059. "pand %%mm7, %%mm1 \n\t"
  1060. "pand %%mm5, %%mm2 \n\t"
  1061. "por %%mm0, %%mm1 \n\t"
  1062. "por %%mm2, %%mm1 \n\t"
  1063. MOVNTQ" %%mm1, 16(%2, %%"FF_REG_a") \n\t"
  1064. "add $24, %%"FF_REG_a" \n\t"
  1065. " js 1b \n\t"
  1066. "2: \n\t"
  1067. : "+a" (mmx_size)
  1068. : "r" (src-mmx_size), "r"(dst-mmx_size)
  1069. NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b)
  1070. );
  1071. __asm__ volatile(SFENCE:::"memory");
  1072. __asm__ volatile(EMMS:::"memory");
  1073. if (mmx_size==23) return; //finished, was multiple of 8
  1074. src+= src_size;
  1075. dst+= src_size;
  1076. src_size= 23-mmx_size;
  1077. src-= src_size;
  1078. dst-= src_size;
  1079. for (unsigned i = 0; i < src_size; i +=3) {
  1080. register uint8_t x;
  1081. x = src[i + 2];
  1082. dst[i + 1] = src[i + 1];
  1083. dst[i + 2] = src[i + 0];
  1084. dst[i + 0] = x;
  1085. }
  1086. }
  1087. static inline void yuvPlanartoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1088. int width, int height,
  1089. int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
  1090. {
  1091. const x86_reg chromWidth= width>>1;
  1092. for (int y = 0; y < height; y++) {
  1093. //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
  1094. __asm__ volatile(
  1095. "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
  1096. ".p2align 4 \n\t"
  1097. "1: \n\t"
  1098. PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t"
  1099. PREFETCH" 32(%2, %%"FF_REG_a") \n\t"
  1100. PREFETCH" 32(%3, %%"FF_REG_a") \n\t"
  1101. "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0)
  1102. "movq %%mm0, %%mm2 \n\t" // U(0)
  1103. "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0)
  1104. "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1105. "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
  1106. "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0)
  1107. "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8)
  1108. "movq %%mm3, %%mm4 \n\t" // Y(0)
  1109. "movq %%mm5, %%mm6 \n\t" // Y(8)
  1110. "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
  1111. "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
  1112. "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
  1113. "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
  1114. MOVNTQ" %%mm3, (%0, %%"FF_REG_a", 4) \n\t"
  1115. MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t"
  1116. MOVNTQ" %%mm5, 16(%0, %%"FF_REG_a", 4) \n\t"
  1117. MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t"
  1118. "add $8, %%"FF_REG_a" \n\t"
  1119. "cmp %4, %%"FF_REG_a" \n\t"
  1120. " jb 1b \n\t"
  1121. ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
  1122. : "%"FF_REG_a
  1123. );
  1124. if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
  1125. usrc += chromStride;
  1126. vsrc += chromStride;
  1127. }
  1128. ysrc += lumStride;
  1129. dst += dstStride;
  1130. }
  1131. __asm__(EMMS" \n\t"
  1132. SFENCE" \n\t"
  1133. :::"memory");
  1134. }
  1135. /**
  1136. * Height should be a multiple of 2 and width should be a multiple of 16.
  1137. * (If this is a problem for anyone then tell me, and I will fix it.)
  1138. */
  1139. static inline void yv12toyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1140. int width, int height,
  1141. int lumStride, int chromStride, int dstStride)
  1142. {
  1143. //FIXME interpolate chroma
  1144. yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1145. }
  1146. static inline void yuvPlanartouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1147. int width, int height,
  1148. int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
  1149. {
  1150. const x86_reg chromWidth= width>>1;
  1151. for (int y = 0; y < height; y++) {
  1152. //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
  1153. __asm__ volatile(
  1154. "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
  1155. ".p2align 4 \n\t"
  1156. "1: \n\t"
  1157. PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t"
  1158. PREFETCH" 32(%2, %%"FF_REG_a") \n\t"
  1159. PREFETCH" 32(%3, %%"FF_REG_a") \n\t"
  1160. "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0)
  1161. "movq %%mm0, %%mm2 \n\t" // U(0)
  1162. "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0)
  1163. "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1164. "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
  1165. "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0)
  1166. "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8)
  1167. "movq %%mm0, %%mm4 \n\t" // Y(0)
  1168. "movq %%mm2, %%mm6 \n\t" // Y(8)
  1169. "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
  1170. "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
  1171. "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
  1172. "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
  1173. MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 4) \n\t"
  1174. MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t"
  1175. MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 4) \n\t"
  1176. MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t"
  1177. "add $8, %%"FF_REG_a" \n\t"
  1178. "cmp %4, %%"FF_REG_a" \n\t"
  1179. " jb 1b \n\t"
  1180. ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
  1181. : "%"FF_REG_a
  1182. );
  1183. if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
  1184. usrc += chromStride;
  1185. vsrc += chromStride;
  1186. }
  1187. ysrc += lumStride;
  1188. dst += dstStride;
  1189. }
  1190. __asm__(EMMS" \n\t"
  1191. SFENCE" \n\t"
  1192. :::"memory");
  1193. }
  1194. /**
  1195. * Height should be a multiple of 2 and width should be a multiple of 16
  1196. * (If this is a problem for anyone then tell me, and I will fix it.)
  1197. */
  1198. static inline void yv12touyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1199. int width, int height,
  1200. int lumStride, int chromStride, int dstStride)
  1201. {
  1202. //FIXME interpolate chroma
  1203. yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1204. }
  1205. /**
  1206. * Width should be a multiple of 16.
  1207. */
  1208. static inline void yuv422ptouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1209. int width, int height,
  1210. int lumStride, int chromStride, int dstStride)
  1211. {
  1212. yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
  1213. }
  1214. /**
  1215. * Width should be a multiple of 16.
  1216. */
  1217. static inline void yuv422ptoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1218. int width, int height,
  1219. int lumStride, int chromStride, int dstStride)
  1220. {
  1221. yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
  1222. }
  1223. /**
  1224. * Height should be a multiple of 2 and width should be a multiple of 16.
  1225. * (If this is a problem for anyone then tell me, and I will fix it.)
  1226. */
  1227. static inline void yuy2toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1228. int width, int height,
  1229. int lumStride, int chromStride, int srcStride)
  1230. {
  1231. const x86_reg chromWidth= width>>1;
  1232. for (int y = 0; y < height; y += 2) {
  1233. __asm__ volatile(
  1234. "xor %%"FF_REG_a", %%"FF_REG_a"\n\t"
  1235. "pcmpeqw %%mm7, %%mm7 \n\t"
  1236. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1237. ".p2align 4 \n\t"
  1238. "1: \n\t"
  1239. PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t"
  1240. "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1241. "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
  1242. "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
  1243. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
  1244. "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
  1245. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
  1246. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1247. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1248. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1249. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1250. MOVNTQ" %%mm2, (%1, %%"FF_REG_a", 2) \n\t"
  1251. "movq 16(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
  1252. "movq 24(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
  1253. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
  1254. "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
  1255. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
  1256. "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
  1257. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1258. "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1259. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1260. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1261. MOVNTQ" %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t"
  1262. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1263. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1264. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1265. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1266. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1267. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1268. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1269. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1270. MOVNTQ" %%mm0, (%3, %%"FF_REG_a") \n\t"
  1271. MOVNTQ" %%mm2, (%2, %%"FF_REG_a") \n\t"
  1272. "add $8, %%"FF_REG_a" \n\t"
  1273. "cmp %4, %%"FF_REG_a" \n\t"
  1274. " jb 1b \n\t"
  1275. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1276. : "memory", "%"FF_REG_a
  1277. );
  1278. ydst += lumStride;
  1279. src += srcStride;
  1280. __asm__ volatile(
  1281. "xor %%"FF_REG_a", %%"FF_REG_a"\n\t"
  1282. ".p2align 4 \n\t"
  1283. "1: \n\t"
  1284. PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t"
  1285. "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1286. "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
  1287. "movq 16(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
  1288. "movq 24(%0, %%"FF_REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
  1289. "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1290. "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1291. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1292. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1293. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1294. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1295. MOVNTQ" %%mm0, (%1, %%"FF_REG_a", 2) \n\t"
  1296. MOVNTQ" %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t"
  1297. "add $8, %%"FF_REG_a"\n\t"
  1298. "cmp %4, %%"FF_REG_a"\n\t"
  1299. " jb 1b \n\t"
  1300. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1301. : "memory", "%"FF_REG_a
  1302. );
  1303. udst += chromStride;
  1304. vdst += chromStride;
  1305. ydst += lumStride;
  1306. src += srcStride;
  1307. }
  1308. __asm__ volatile(EMMS" \n\t"
  1309. SFENCE" \n\t"
  1310. :::"memory");
  1311. }
static inline void planar2x_mmxext(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    dst[0]= src[0];

    // first line
    for (int x = 0; x < srcWidth - 1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (int y = 1; y < srcHeight; y++) {
        x86_reg mmxSize= srcWidth&~15;

        if (mmxSize) {
            __asm__ volatile(
                "mov %4, %%"FF_REG_a" \n\t"
                "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
                "movq (%0, %%"FF_REG_a"), %%mm4 \n\t"
                "movq %%mm4, %%mm2 \n\t"
                "psllq $8, %%mm4 \n\t"
                "pand %%mm0, %%mm2 \n\t"
                "por %%mm2, %%mm4 \n\t"
                "movq (%1, %%"FF_REG_a"), %%mm5 \n\t"
                "movq %%mm5, %%mm3 \n\t"
                "psllq $8, %%mm5 \n\t"
                "pand %%mm0, %%mm3 \n\t"
                "por %%mm3, %%mm5 \n\t"
                "1: \n\t"
                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
                "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
                "movq 1(%0, %%"FF_REG_a"), %%mm2 \n\t"
                "movq 1(%1, %%"FF_REG_a"), %%mm3 \n\t"
                PAVGB" %%mm0, %%mm5 \n\t"
                PAVGB" %%mm0, %%mm3 \n\t"
                PAVGB" %%mm0, %%mm5 \n\t"
                PAVGB" %%mm0, %%mm3 \n\t"
                PAVGB" %%mm1, %%mm4 \n\t"
                PAVGB" %%mm1, %%mm2 \n\t"
                PAVGB" %%mm1, %%mm4 \n\t"
                PAVGB" %%mm1, %%mm2 \n\t"
                "movq %%mm5, %%mm7 \n\t"
                "movq %%mm4, %%mm6 \n\t"
                "punpcklbw %%mm3, %%mm5 \n\t"
                "punpckhbw %%mm3, %%mm7 \n\t"
                "punpcklbw %%mm2, %%mm4 \n\t"
                "punpckhbw %%mm2, %%mm6 \n\t"
                MOVNTQ" %%mm5, (%2, %%"FF_REG_a", 2) \n\t"
                MOVNTQ" %%mm7, 8(%2, %%"FF_REG_a", 2) \n\t"
                MOVNTQ" %%mm4, (%3, %%"FF_REG_a", 2) \n\t"
                MOVNTQ" %%mm6, 8(%3, %%"FF_REG_a", 2) \n\t"
                "add $8, %%"FF_REG_a" \n\t"
                "movq -1(%0, %%"FF_REG_a"), %%mm4 \n\t"
                "movq -1(%1, %%"FF_REG_a"), %%mm5 \n\t"
                " js 1b \n\t"
                :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
                   "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
                   "g" (-mmxSize)
                   NAMED_CONSTRAINTS_ADD(mmx_ff)
                : "%"FF_REG_a
            );
        } else {
            mmxSize = 1;

            dst[0]         = (src[0] * 3 + src[srcStride]) >> 2;
            dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
        }

        for (int x = mmxSize - 1; x < srcWidth - 1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
    dst[0]= src[0];

    for (int x = 0; x < srcWidth - 1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    __asm__ volatile(EMMS"   \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}

/**
 * Height and width should both be multiples of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line,
 * others are ignored in the C version.
 * FIXME: Write HQ version.
 */
#if HAVE_7REGS
static inline void rgb24toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride,
                                      int32_t *rgb2yuv)
{
#define BGR2Y_IDX "16*4+16*32"
#define BGR2U_IDX "16*4+16*33"
#define BGR2V_IDX "16*4+16*34"
    int y;
    const x86_reg chromWidth= width>>1;

    if (height > 2) {
        ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv);
        src  += 2*srcStride;
        ydst += 2*lumStride;
        udst += chromStride;
        vdst += chromStride;
        height -= 2;
    }

    for (y = 0; y < height - 2; y += 2) {
        for (int i = 0; i < 2; i++) {
            __asm__ volatile(
                "mov %2, %%"FF_REG_a"\n\t"
                "movq "BGR2Y_IDX"(%3), %%mm6 \n\t"
                "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
                ".p2align 4 \n\t"
                "1: \n\t"
                PREFETCH" 64(%0, %%"FF_REG_d") \n\t"
                "movd (%0, %%"FF_REG_d"), %%mm0 \n\t"
                "movd 3(%0, %%"FF_REG_d"), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 6(%0, %%"FF_REG_d"), %%mm2 \n\t"
                "movd 9(%0, %%"FF_REG_d"), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm0 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
                "psrad $8, %%mm0 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
                "packssdw %%mm1, %%mm0 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm0 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "packssdw %%mm2, %%mm0 \n\t"
                "psraw $7, %%mm0 \n\t"

                "movd 12(%0, %%"FF_REG_d"), %%mm4 \n\t"
                "movd 15(%0, %%"FF_REG_d"), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 18(%0, %%"FF_REG_d"), %%mm2 \n\t"
                "movd 21(%0, %%"FF_REG_d"), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm4 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
                "psrad $8, %%mm4 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
                "packssdw %%mm1, %%mm4 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm4 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "add $24, %%"FF_REG_d"\n\t"
                "packssdw %%mm2, %%mm4 \n\t"
                "psraw $7, %%mm4 \n\t"

                "packuswb %%mm4, %%mm0 \n\t"
                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
                MOVNTQ" %%mm0, (%1, %%"FF_REG_a") \n\t"
                "add $8, %%"FF_REG_a" \n\t"
                " js 1b \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
                  NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset)
                : "%"FF_REG_a, "%"FF_REG_d
            );
            ydst += lumStride;
            src  += srcStride;
        }
        src -= srcStride*2;
        __asm__ volatile(
            "mov %4, %%"FF_REG_a"\n\t"
            "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
            "movq "BGR2U_IDX"(%5), %%mm6 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
            "add %%"FF_REG_d", %%"FF_REG_d"\n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"FF_REG_d") \n\t"
            PREFETCH" 64(%1, %%"FF_REG_d") \n\t"
            "movq (%0, %%"FF_REG_d"), %%mm0 \n\t"
            "movq (%1, %%"FF_REG_d"), %%mm1 \n\t"
            "movq 6(%0, %%"FF_REG_d"), %%mm2 \n\t"
            "movq 6(%1, %%"FF_REG_d"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm0 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm3 \n\t"
            "pmaddwd %%mm0, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm0 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
            "psrad $8, %%mm0 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
            "packssdw %%mm2, %%mm0 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm0 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0 \n\t"

            "movq 12(%0, %%"FF_REG_d"), %%mm4 \n\t"
            "movq 12(%1, %%"FF_REG_d"), %%mm1 \n\t"
            "movq 18(%0, %%"FF_REG_d"), %%mm2 \n\t"
            "movq 18(%1, %%"FF_REG_d"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm4, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm4 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm3 \n\t"
            "pmaddwd %%mm4, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm4 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
            "psrad $8, %%mm4 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
            "packssdw %%mm2, %%mm4 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm4 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "add $24, %%"FF_REG_d"\n\t"
            "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4 \n\t"

            "movq %%mm0, %%mm1 \n\t"
            "punpckldq %%mm4, %%mm0 \n\t"
            "punpckhdq %%mm4, %%mm1 \n\t"
            "packsswb %%mm1, %%mm0 \n\t"
            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
            "movd %%mm0, (%2, %%"FF_REG_a") \n\t"
            "punpckhdq %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%3, %%"FF_REG_a") \n\t"
            "add $4, %%"FF_REG_a" \n\t"
            " js 1b \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
              NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset)
            : "%"FF_REG_a, "%"FF_REG_d
        );

        udst += chromStride;
        vdst += chromStride;
        src  += srcStride*2;
    }

    __asm__ volatile(EMMS"   \n\t"
                     SFENCE" \n\t"
                     :::"memory");

    ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
}
#endif /* HAVE_7REGS */
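
/* Upsample two chroma planes by pixel duplication: each source byte is written
 * twice per output row and each source row feeds two output rows, i.e. a
 * nearest-neighbour 2x upscale of the U and V planes in both directions. */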
static inline void vu9_to_vu12_mmxext(const uint8_t *src1, const uint8_t *src2,
                                      uint8_t *dst1, uint8_t *dst2,
                                      int width, int height,
                                      int srcStride1, int srcStride2,
                                      int dstStride1, int dstStride2)
{
    int w,h;
    w=width/2; h=height/2;
    __asm__ volatile(
        PREFETCH" %0 \n\t"
        PREFETCH" %1 \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
    for (x86_reg y = 0; y < h; y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x86_reg x = 0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32(%1,%2) \n\t"
                "movq (%1,%2), %%mm0 \n\t"
                "movq 8(%1,%2), %%mm2 \n\t"
                "movq 16(%1,%2), %%mm4 \n\t"
                "movq 24(%1,%2), %%mm6 \n\t"
                "movq %%mm0, %%mm1 \n\t"
                "movq %%mm2, %%mm3 \n\t"
                "movq %%mm4, %%mm5 \n\t"
                "movq %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, (%0,%2,2) \n\t"
                MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
                MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
                MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
                MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
                MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
                MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
                MOVNTQ" %%mm7, 56(%0,%2,2)"
                :: "r"(d), "r"(s1), "r"(x)
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    for (x86_reg y = 0; y < h; y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x86_reg x = 0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32(%1,%2) \n\t"
                "movq (%1,%2), %%mm0 \n\t"
                "movq 8(%1,%2), %%mm2 \n\t"
                "movq 16(%1,%2), %%mm4 \n\t"
                "movq 24(%1,%2), %%mm6 \n\t"
                "movq %%mm0, %%mm1 \n\t"
                "movq %%mm2, %%mm3 \n\t"
                "movq %%mm4, %%mm5 \n\t"
                "movq %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, (%0,%2,2) \n\t"
                MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
                MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
                MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
                MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
                MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
                MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
                MOVNTQ" %%mm7, 56(%0,%2,2)"
                :: "r"(d), "r"(s2), "r"(x)
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
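
/* Interleave YVU9 planes (full-resolution Y plus U/V subsampled by 4 in both
 * directions) into packed YUYV: each chroma sample is reused for four luma
 * samples per line and for four consecutive lines (note the y>>2 on the
 * chroma pointers). */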
static inline void yvu9_to_yuy2_mmxext(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                       uint8_t *dst,
                                       int width, int height,
                                       int srcStride1, int srcStride2,
                                       int srcStride3, int dstStride)
{
    int w,h;
    w=width/2; h=height;
    for (int y = 0; y < h; y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x86_reg x = 0;
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH" 32(%1, %0) \n\t"
                PREFETCH" 32(%2, %0) \n\t"
                PREFETCH" 32(%3, %0) \n\t"
                "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
                "movq %%mm1, %%mm6 \n\t"
                "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
                "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq 8(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
                "movq %%mm4, %%mm6 \n\t"
                "movq 16(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm5, %%mm4 \n\t"
                "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
                "punpckhbw %%mm5, %%mm6 \n\t"
                "movq 24(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
        for (; x<w; x++) {
            const int x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
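
/* Copy every even-indexed byte of src into dst (e.g. the Y samples of a YUYV
 * line); count is the number of bytes written to dst. */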
static void extract_even_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst += count;
    src += 2*count;
    count= - count;

    if(count <= -16) {
        count += 15;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -30(%1, %0, 2), %%mm0 \n\t"
            "movq -22(%1, %0, 2), %%mm1 \n\t"
            "movq -14(%1, %0, 2), %%mm2 \n\t"
            "movq -6(%1, %0, 2), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0,-15(%2, %0) \n\t"
            MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
            "add $16, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
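
/* Copy every odd-indexed byte of src into dst (e.g. the Y samples of a UYVY
 * line). */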
static void extract_odd_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    src ++;
    dst += count;
    src += 2*count;
    count= - count;

    if(count < -16) {
        count += 16;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -32(%1, %0, 2), %%mm0 \n\t"
            "movq -24(%1, %0, 2), %%mm1 \n\t"
            "movq -16(%1, %0, 2), %%mm2 \n\t"
            "movq -8(%1, %0, 2), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0,-16(%2, %0) \n\t"
            MOVNTQ" %%mm2,- 8(%2, %0) \n\t"
            "add $16, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 16;
    }
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}

#if ARCH_X86_32
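/* Deinterleave the even bytes of 4-byte groups into two planes: dst0 takes
 * byte 0 and dst1 takes byte 2 of every group (the U and V samples of a UYVY
 * line). Only used on x86-32. */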
static void extract_even2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
#endif /* ARCH_X86_32 */
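
/* Like extract_even2_mmxext, but the two input lines src0 and src1 are
 * averaged before the even bytes are split into dst0/dst1; used for the
 * vertically averaged chroma of the 4:2:0 outputs below. */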
static void extract_even2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
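
/* Deinterleave the odd bytes of 4-byte groups into two planes: dst0 takes
 * byte 1 and dst1 takes byte 3 of every group (the U and V samples of a YUYV
 * line). */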
static void extract_odd2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    src++;
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
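
/* Like extract_odd2_mmxext, but averages the two input lines src0 and src1
 * first (vertical chroma downsampling for the 4:2:0 conversions below). */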
static void extract_odd2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src0++;
    src1++;
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
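
/* Packed YUYV (4:2:2) -> planar YUV 4:2:0: luma is extracted from every line,
 * chroma from the average of each pair of source lines. */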
static void yuyvtoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                int width, int height,
                                int lumStride, int chromStride, int srcStride)
{
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);

    for (int y = 0; y < height; y++) {
        extract_even_mmxext(src, ydst, width);
        if(y&1) {
            extract_odd2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
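
/* Packed YUYV (4:2:2) -> planar YUV 4:2:2: one luma and one chroma line are
 * produced per source line. */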
static void yuyvtoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                int width, int height,
                                int lumStride, int chromStride, int srcStride)
{
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);

    for (int y = 0; y < height; y++) {
        extract_even_mmxext(src, ydst, width);
        extract_odd2_mmxext(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
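
/* Packed UYVY (4:2:2) -> planar YUV 4:2:0, analogous to yuyvtoyuv420_mmxext
 * above but with the even/odd byte roles swapped. */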
static void uyvytoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                int width, int height,
                                int lumStride, int chromStride, int srcStride)
{
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);

    for (int y = 0; y < height; y++) {
        extract_odd_mmxext(src, ydst, width);
        if(y&1) {
            extract_even2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}

#if ARCH_X86_32
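/* Packed UYVY (4:2:2) -> planar YUV 4:2:2. Only built on x86-32; on 64-bit
 * builds the external SSE2/AVX implementations are assigned instead in
 * rgb2rgb_init_x86() below. */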
static void uyvytoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                int width, int height,
                                int lumStride, int chromStride, int srcStride)
{
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);

    for (int y = 0; y < height; y++) {
        extract_odd_mmxext(src, ydst, width);
        extract_even2_mmxext(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
#endif /* ARCH_X86_32 */

static av_cold void rgb2rgb_init_mmxext(void)
{
    rgb15to16     = rgb15to16_mmxext;
    rgb15tobgr24  = rgb15tobgr24_mmxext;
    rgb15to32     = rgb15to32_mmxext;
    rgb16tobgr24  = rgb16tobgr24_mmxext;
    rgb16to32     = rgb16to32_mmxext;
    rgb16to15     = rgb16to15_mmxext;
    rgb24tobgr16  = rgb24tobgr16_mmxext;
    rgb24tobgr15  = rgb24tobgr15_mmxext;
    rgb24tobgr32  = rgb24tobgr32_mmxext;
    rgb32to16     = rgb32to16_mmxext;
    rgb32to15     = rgb32to15_mmxext;
    rgb32tobgr24  = rgb32tobgr24_mmxext;
    rgb24to15     = rgb24to15_mmxext;
    rgb24to16     = rgb24to16_mmxext;
    rgb24tobgr24  = rgb24tobgr24_mmxext;
    rgb32tobgr16  = rgb32tobgr16_mmxext;
    rgb32tobgr15  = rgb32tobgr15_mmxext;
    yv12toyuy2    = yv12toyuy2_mmxext;
    yv12touyvy    = yv12touyvy_mmxext;
    yuv422ptoyuy2 = yuv422ptoyuy2_mmxext;
    yuv422ptouyvy = yuv422ptouyvy_mmxext;
    yuy2toyv12    = yuy2toyv12_mmxext;
    vu9_to_vu12   = vu9_to_vu12_mmxext;
    yvu9_to_yuy2  = yvu9_to_yuy2_mmxext;
#if ARCH_X86_32
    uyvytoyuv422  = uyvytoyuv422_mmxext;
#endif
    yuyvtoyuv422  = yuyvtoyuv422_mmxext;

    planar2x      = planar2x_mmxext;
#if HAVE_7REGS
    ff_rgb24toyv12 = rgb24toyv12_mmxext;
#endif /* HAVE_7REGS */

    yuyvtoyuv420  = yuyvtoyuv420_mmxext;
    uyvytoyuv420  = uyvytoyuv420_mmxext;
}

//SSE2 versions
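/* Interleave two byte planes into one: dest[2*i] = src1[i], dest[2*i+1] = src2[i].
 * Uses aligned SSE2 loads/stores when src1, src2 and dest are all 16-byte
 * aligned, an MMX path otherwise, and a scalar loop for the last width%16
 * pixels of each row. */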
static void interleave_bytes_sse2(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                                  int width, int height, int src1Stride,
                                  int src2Stride, int dstStride)
{
    for (int h = 0; h < height; h++) {
        if (width >= 16) {
            if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) {
                __asm__(
                    "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
                    "1: \n\t"
                    PREFETCH" 64(%1, %%"FF_REG_a") \n\t"
                    PREFETCH" 64(%2, %%"FF_REG_a") \n\t"
                    "movdqa (%1, %%"FF_REG_a"), %%xmm0 \n\t"
                    "movdqa (%1, %%"FF_REG_a"), %%xmm1 \n\t"
                    "movdqa (%2, %%"FF_REG_a"), %%xmm2 \n\t"
                    "punpcklbw %%xmm2, %%xmm0 \n\t"
                    "punpckhbw %%xmm2, %%xmm1 \n\t"
                    "movntdq %%xmm0, (%0, %%"FF_REG_a", 2) \n\t"
                    "movntdq %%xmm1, 16(%0, %%"FF_REG_a", 2) \n\t"
                    "add $16, %%"FF_REG_a" \n\t"
                    "cmp %3, %%"FF_REG_a" \n\t"
                    " jb 1b \n\t"
                    ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
                    : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"FF_REG_a
                );
            } else
                __asm__(
                    "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
                    "1: \n\t"
                    PREFETCH" 64(%1, %%"FF_REG_a") \n\t"
                    PREFETCH" 64(%2, %%"FF_REG_a") \n\t"
                    "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
                    "movq 8(%1, %%"FF_REG_a"), %%mm2 \n\t"
                    "movq %%mm0, %%mm1 \n\t"
                    "movq %%mm2, %%mm3 \n\t"
                    "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
                    "movq 8(%2, %%"FF_REG_a"), %%mm5 \n\t"
                    "punpcklbw %%mm4, %%mm0 \n\t"
                    "punpckhbw %%mm4, %%mm1 \n\t"
                    "punpcklbw %%mm5, %%mm2 \n\t"
                    "punpckhbw %%mm5, %%mm3 \n\t"
                    MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 2) \n\t"
                    MOVNTQ" %%mm1, 8(%0, %%"FF_REG_a", 2) \n\t"
                    MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 2) \n\t"
                    MOVNTQ" %%mm3, 24(%0, %%"FF_REG_a", 2) \n\t"
                    "add $16, %%"FF_REG_a" \n\t"
                    "cmp %3, %%"FF_REG_a" \n\t"
                    " jb 1b \n\t"
                    ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
                    : "memory", "%"FF_REG_a
                );
        }
        for (int w = (width & (~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}

/*
 RGB15->RGB16 original by Strepto/Astral
 ported to gcc & bugfixed : A'rpi
 MMXEXT, 3DNOW optimization by Nick Kurshev
 32-bit C version, and and&add trick by Michael Niedermayer
*/

#endif /* HAVE_INLINE_ASM */

void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size);

#if ARCH_X86_64
void ff_shuffle_bytes_2103_avx2(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_0321_avx2(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_1230_avx2(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t *dst, int src_size);

void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                          const uint8_t *src, int width, int height,
                          int lumStride, int chromStride, int srcStride);
void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         const uint8_t *src, int width, int height,
                         int lumStride, int chromStride, int srcStride);
void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                          const uint8_t *src, int width, int height,
                          int lumStride, int chromStride, int srcStride);
#endif
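
/* Wrap the external ff_nv12ToUV_* row deinterleavers into the deinterleaveBytes
 * signature: each interleaved row of src is split into dst1 and dst2 (the even
 * and odd bytes of the row, as with NV12 chroma). */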
#define DEINTERLEAVE_BYTES(cpuext)                                            \
void ff_nv12ToUV_ ## cpuext(uint8_t *dstU, uint8_t *dstV,                     \
                            const uint8_t *unused,                            \
                            const uint8_t *src1,                              \
                            const uint8_t *src2,                              \
                            int w,                                            \
                            uint32_t *unused2,                                \
                            void *opq);                                       \
static void deinterleave_bytes_ ## cpuext(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, \
                                          int width, int height, int srcStride, \
                                          int dst1Stride, int dst2Stride)     \
{                                                                             \
    for (int h = 0; h < height; h++) {                                        \
        ff_nv12ToUV_ ## cpuext(dst1, dst2, NULL, src, NULL, width, NULL, NULL); \
        src  += srcStride;                                                    \
        dst1 += dst1Stride;                                                   \
        dst2 += dst2Stride;                                                   \
    }                                                                         \
}

#if HAVE_SSE2_EXTERNAL
DEINTERLEAVE_BYTES(sse2)
#endif
#if HAVE_AVX_EXTERNAL
DEINTERLEAVE_BYTES(avx)
#endif

av_cold void rgb2rgb_init_x86(void)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (INLINE_MMXEXT(cpu_flags))
        rgb2rgb_init_mmxext();
    if (INLINE_SSE2(cpu_flags))
        interleaveBytes = interleave_bytes_sse2;
#endif /* HAVE_INLINE_ASM */

#if HAVE_SSE2_EXTERNAL
    if (EXTERNAL_SSE2(cpu_flags)) {
#if ARCH_X86_64
        uyvytoyuv422 = ff_uyvytoyuv422_sse2;
#endif
        deinterleaveBytes = deinterleave_bytes_sse2;
    }
#endif
    if (EXTERNAL_SSSE3(cpu_flags)) {
        shuffle_bytes_0321 = ff_shuffle_bytes_0321_ssse3;
        shuffle_bytes_2103 = ff_shuffle_bytes_2103_ssse3;
        shuffle_bytes_1230 = ff_shuffle_bytes_1230_ssse3;
        shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3;
        shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3;
    }
#if HAVE_AVX_EXTERNAL
    if (EXTERNAL_AVX(cpu_flags)) {
        deinterleaveBytes = deinterleave_bytes_avx;
#if ARCH_X86_64
        uyvytoyuv422 = ff_uyvytoyuv422_avx;
    }
    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx2;
        shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx2;
        shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx2;
        shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2;
        shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2;
    }
    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        uyvytoyuv422 = ff_uyvytoyuv422_avx2;
#endif
    }
#endif
}