rgb2rgb.c 103 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542
05520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483
  1. /*
  2. * software RGB to RGB converter
  3. * pluralize by software PAL8 to RGB converter
  4. * software YUV to YUV converter
  5. * software YUV to RGB converter
  6. * Written by Nick Kurshev.
  7. * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  8. *
  9. * This file is part of FFmpeg.
  10. *
  11. * FFmpeg is free software; you can redistribute it and/or
  12. * modify it under the terms of the GNU Lesser General Public
  13. * License as published by the Free Software Foundation; either
  14. * version 2.1 of the License, or (at your option) any later version.
  15. *
  16. * FFmpeg is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  19. * Lesser General Public License for more details.
  20. *
  21. * You should have received a copy of the GNU Lesser General Public
  22. * License along with FFmpeg; if not, write to the Free Software
  23. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24. */
  25. #include <stdint.h>
  26. #include "config.h"
  27. #include "libavutil/attributes.h"
  28. #include "libavutil/x86/cpu.h"
  29. #include "libavutil/cpu.h"
  30. #include "libavutil/bswap.h"
  31. #include "libavutil/mem_internal.h"
  32. #include "libswscale/rgb2rgb.h"
  33. #include "libswscale/swscale.h"
  34. #include "libswscale/swscale_internal.h"
  35. #if HAVE_INLINE_ASM
  36. #include "libavutil/x86/asm.h"
  37. DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL;
  38. DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL;
  39. DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL;
  40. DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL;
  41. DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL;
  42. DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL;
  43. DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL;
  44. DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL;
  45. DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL;
  46. DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL;
  47. DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL;
  48. DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL;
  49. DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL;
  50. DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
  51. DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
  52. DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL;
  53. DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL;
  54. DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL;
  55. #define mask16b mask15b
  56. DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL;
  57. DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL;
  58. DECLARE_ASM_CONST(8, uint64_t, red_16mask) = 0x0000f8000000f800ULL;
  59. DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
  60. DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL;
  61. DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL;
  62. DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
  63. DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL;
  64. DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL;
  65. DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL;
  66. DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL;
  67. #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
  68. #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
  69. #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
  70. #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
  71. #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
  72. #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
  73. #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
  74. #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
  75. #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
  76. // MMXEXT versions
  77. #define PREFETCH "prefetchnta"
  78. #define PAVGB "pavgb"
  79. #define MOVNTQ "movntq"
  80. #define SFENCE "sfence"
  81. #define EMMS "emms"
  82. static inline void rgb24tobgr32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  83. {
  84. uint8_t *dest = dst;
  85. const uint8_t *s = src;
  86. const uint8_t *end;
  87. const uint8_t *mm_end;
  88. end = s + src_size;
  89. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  90. mm_end = end - 23;
  91. __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
  92. while (s < mm_end) {
  93. __asm__ volatile(
  94. PREFETCH" 32(%1) \n\t"
  95. "movd (%1), %%mm0 \n\t"
  96. "punpckldq 3(%1), %%mm0 \n\t"
  97. "movd 6(%1), %%mm1 \n\t"
  98. "punpckldq 9(%1), %%mm1 \n\t"
  99. "movd 12(%1), %%mm2 \n\t"
  100. "punpckldq 15(%1), %%mm2 \n\t"
  101. "movd 18(%1), %%mm3 \n\t"
  102. "punpckldq 21(%1), %%mm3 \n\t"
  103. "por %%mm7, %%mm0 \n\t"
  104. "por %%mm7, %%mm1 \n\t"
  105. "por %%mm7, %%mm2 \n\t"
  106. "por %%mm7, %%mm3 \n\t"
  107. MOVNTQ" %%mm0, (%0) \n\t"
  108. MOVNTQ" %%mm1, 8(%0) \n\t"
  109. MOVNTQ" %%mm2, 16(%0) \n\t"
  110. MOVNTQ" %%mm3, 24(%0)"
  111. :: "r"(dest), "r"(s)
  112. :"memory");
  113. dest += 32;
  114. s += 24;
  115. }
  116. __asm__ volatile(SFENCE:::"memory");
  117. __asm__ volatile(EMMS:::"memory");
  118. while (s < end) {
  119. *dest++ = *s++;
  120. *dest++ = *s++;
  121. *dest++ = *s++;
  122. *dest++ = 255;
  123. }
  124. }
  125. #define STORE_BGR24_MMX \
  126. "psrlq $8, %%mm2 \n\t" \
  127. "psrlq $8, %%mm3 \n\t" \
  128. "psrlq $8, %%mm6 \n\t" \
  129. "psrlq $8, %%mm7 \n\t" \
  130. "pand "MANGLE(mask24l)", %%mm0\n\t" \
  131. "pand "MANGLE(mask24l)", %%mm1\n\t" \
  132. "pand "MANGLE(mask24l)", %%mm4\n\t" \
  133. "pand "MANGLE(mask24l)", %%mm5\n\t" \
  134. "pand "MANGLE(mask24h)", %%mm2\n\t" \
  135. "pand "MANGLE(mask24h)", %%mm3\n\t" \
  136. "pand "MANGLE(mask24h)", %%mm6\n\t" \
  137. "pand "MANGLE(mask24h)", %%mm7\n\t" \
  138. "por %%mm2, %%mm0 \n\t" \
  139. "por %%mm3, %%mm1 \n\t" \
  140. "por %%mm6, %%mm4 \n\t" \
  141. "por %%mm7, %%mm5 \n\t" \
  142. \
  143. "movq %%mm1, %%mm2 \n\t" \
  144. "movq %%mm4, %%mm3 \n\t" \
  145. "psllq $48, %%mm2 \n\t" \
  146. "psllq $32, %%mm3 \n\t" \
  147. "por %%mm2, %%mm0 \n\t" \
  148. "psrlq $16, %%mm1 \n\t" \
  149. "psrlq $32, %%mm4 \n\t" \
  150. "psllq $16, %%mm5 \n\t" \
  151. "por %%mm3, %%mm1 \n\t" \
  152. "por %%mm5, %%mm4 \n\t" \
  153. \
  154. MOVNTQ" %%mm0, (%0) \n\t" \
  155. MOVNTQ" %%mm1, 8(%0) \n\t" \
  156. MOVNTQ" %%mm4, 16(%0)"
  157. static inline void rgb32tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  158. {
  159. uint8_t *dest = dst;
  160. const uint8_t *s = src;
  161. const uint8_t *end;
  162. const uint8_t *mm_end;
  163. end = s + src_size;
  164. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  165. mm_end = end - 31;
  166. while (s < mm_end) {
  167. __asm__ volatile(
  168. PREFETCH" 32(%1) \n\t"
  169. "movq (%1), %%mm0 \n\t"
  170. "movq 8(%1), %%mm1 \n\t"
  171. "movq 16(%1), %%mm4 \n\t"
  172. "movq 24(%1), %%mm5 \n\t"
  173. "movq %%mm0, %%mm2 \n\t"
  174. "movq %%mm1, %%mm3 \n\t"
  175. "movq %%mm4, %%mm6 \n\t"
  176. "movq %%mm5, %%mm7 \n\t"
  177. STORE_BGR24_MMX
  178. :: "r"(dest), "r"(s)
  179. NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
  180. :"memory");
  181. dest += 24;
  182. s += 32;
  183. }
  184. __asm__ volatile(SFENCE:::"memory");
  185. __asm__ volatile(EMMS:::"memory");
  186. while (s < end) {
  187. *dest++ = *s++;
  188. *dest++ = *s++;
  189. *dest++ = *s++;
  190. s++;
  191. }
  192. }
  193. /*
  194. original by Strepto/Astral
  195. ported to gcc & bugfixed: A'rpi
  196. MMXEXT, 3DNOW optimization by Nick Kurshev
  197. 32-bit C version, and and&add trick by Michael Niedermayer
  198. */
  199. static inline void rgb15to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  200. {
  201. register const uint8_t* s=src;
  202. register uint8_t* d=dst;
  203. register const uint8_t *end;
  204. const uint8_t *mm_end;
  205. end = s + src_size;
  206. __asm__ volatile(PREFETCH" %0"::"m"(*s));
  207. __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
  208. mm_end = end - 15;
  209. while (s<mm_end) {
  210. __asm__ volatile(
  211. PREFETCH" 32(%1) \n\t"
  212. "movq (%1), %%mm0 \n\t"
  213. "movq 8(%1), %%mm2 \n\t"
  214. "movq %%mm0, %%mm1 \n\t"
  215. "movq %%mm2, %%mm3 \n\t"
  216. "pand %%mm4, %%mm0 \n\t"
  217. "pand %%mm4, %%mm2 \n\t"
  218. "paddw %%mm1, %%mm0 \n\t"
  219. "paddw %%mm3, %%mm2 \n\t"
  220. MOVNTQ" %%mm0, (%0) \n\t"
  221. MOVNTQ" %%mm2, 8(%0)"
  222. :: "r"(d), "r"(s)
  223. );
  224. d+=16;
  225. s+=16;
  226. }
  227. __asm__ volatile(SFENCE:::"memory");
  228. __asm__ volatile(EMMS:::"memory");
  229. mm_end = end - 3;
  230. while (s < mm_end) {
  231. register unsigned x= *((const uint32_t *)s);
  232. *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
  233. d+=4;
  234. s+=4;
  235. }
  236. if (s < end) {
  237. register unsigned short x= *((const uint16_t *)s);
  238. *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
  239. }
  240. }
  241. static inline void rgb16to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  242. {
  243. register const uint8_t* s=src;
  244. register uint8_t* d=dst;
  245. register const uint8_t *end;
  246. const uint8_t *mm_end;
  247. end = s + src_size;
  248. __asm__ volatile(PREFETCH" %0"::"m"(*s));
  249. __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
  250. __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
  251. mm_end = end - 15;
  252. while (s<mm_end) {
  253. __asm__ volatile(
  254. PREFETCH" 32(%1) \n\t"
  255. "movq (%1), %%mm0 \n\t"
  256. "movq 8(%1), %%mm2 \n\t"
  257. "movq %%mm0, %%mm1 \n\t"
  258. "movq %%mm2, %%mm3 \n\t"
  259. "psrlq $1, %%mm0 \n\t"
  260. "psrlq $1, %%mm2 \n\t"
  261. "pand %%mm7, %%mm0 \n\t"
  262. "pand %%mm7, %%mm2 \n\t"
  263. "pand %%mm6, %%mm1 \n\t"
  264. "pand %%mm6, %%mm3 \n\t"
  265. "por %%mm1, %%mm0 \n\t"
  266. "por %%mm3, %%mm2 \n\t"
  267. MOVNTQ" %%mm0, (%0) \n\t"
  268. MOVNTQ" %%mm2, 8(%0)"
  269. :: "r"(d), "r"(s)
  270. );
  271. d+=16;
  272. s+=16;
  273. }
  274. __asm__ volatile(SFENCE:::"memory");
  275. __asm__ volatile(EMMS:::"memory");
  276. mm_end = end - 3;
  277. while (s < mm_end) {
  278. register uint32_t x= *((const uint32_t*)s);
  279. *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
  280. s+=4;
  281. d+=4;
  282. }
  283. if (s < end) {
  284. register uint16_t x= *((const uint16_t*)s);
  285. *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
  286. }
  287. }
  288. static inline void rgb32to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  289. {
  290. const uint8_t *s = src;
  291. const uint8_t *end;
  292. const uint8_t *mm_end;
  293. uint16_t *d = (uint16_t *)dst;
  294. end = s + src_size;
  295. mm_end = end - 15;
  296. __asm__ volatile(
  297. "movq %3, %%mm5 \n\t"
  298. "movq %4, %%mm6 \n\t"
  299. "movq %5, %%mm7 \n\t"
  300. "jmp 2f \n\t"
  301. ".p2align 4 \n\t"
  302. "1: \n\t"
  303. PREFETCH" 32(%1) \n\t"
  304. "movd (%1), %%mm0 \n\t"
  305. "movd 4(%1), %%mm3 \n\t"
  306. "punpckldq 8(%1), %%mm0 \n\t"
  307. "punpckldq 12(%1), %%mm3 \n\t"
  308. "movq %%mm0, %%mm1 \n\t"
  309. "movq %%mm3, %%mm4 \n\t"
  310. "pand %%mm6, %%mm0 \n\t"
  311. "pand %%mm6, %%mm3 \n\t"
  312. "pmaddwd %%mm7, %%mm0 \n\t"
  313. "pmaddwd %%mm7, %%mm3 \n\t"
  314. "pand %%mm5, %%mm1 \n\t"
  315. "pand %%mm5, %%mm4 \n\t"
  316. "por %%mm1, %%mm0 \n\t"
  317. "por %%mm4, %%mm3 \n\t"
  318. "psrld $5, %%mm0 \n\t"
  319. "pslld $11, %%mm3 \n\t"
  320. "por %%mm3, %%mm0 \n\t"
  321. MOVNTQ" %%mm0, (%0) \n\t"
  322. "add $16, %1 \n\t"
  323. "add $8, %0 \n\t"
  324. "2: \n\t"
  325. "cmp %2, %1 \n\t"
  326. " jb 1b \n\t"
  327. : "+r" (d), "+r"(s)
  328. : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
  329. );
  330. __asm__ volatile(SFENCE:::"memory");
  331. __asm__ volatile(EMMS:::"memory");
  332. while (s < end) {
  333. register int rgb = *(const uint32_t*)s; s += 4;
  334. *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
  335. }
  336. }
  337. static inline void rgb32tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  338. {
  339. const uint8_t *s = src;
  340. const uint8_t *end;
  341. const uint8_t *mm_end;
  342. uint16_t *d = (uint16_t *)dst;
  343. end = s + src_size;
  344. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  345. __asm__ volatile(
  346. "movq %0, %%mm7 \n\t"
  347. "movq %1, %%mm6 \n\t"
  348. ::"m"(red_16mask),"m"(green_16mask));
  349. mm_end = end - 15;
  350. while (s < mm_end) {
  351. __asm__ volatile(
  352. PREFETCH" 32(%1) \n\t"
  353. "movd (%1), %%mm0 \n\t"
  354. "movd 4(%1), %%mm3 \n\t"
  355. "punpckldq 8(%1), %%mm0 \n\t"
  356. "punpckldq 12(%1), %%mm3 \n\t"
  357. "movq %%mm0, %%mm1 \n\t"
  358. "movq %%mm0, %%mm2 \n\t"
  359. "movq %%mm3, %%mm4 \n\t"
  360. "movq %%mm3, %%mm5 \n\t"
  361. "psllq $8, %%mm0 \n\t"
  362. "psllq $8, %%mm3 \n\t"
  363. "pand %%mm7, %%mm0 \n\t"
  364. "pand %%mm7, %%mm3 \n\t"
  365. "psrlq $5, %%mm1 \n\t"
  366. "psrlq $5, %%mm4 \n\t"
  367. "pand %%mm6, %%mm1 \n\t"
  368. "pand %%mm6, %%mm4 \n\t"
  369. "psrlq $19, %%mm2 \n\t"
  370. "psrlq $19, %%mm5 \n\t"
  371. "pand %2, %%mm2 \n\t"
  372. "pand %2, %%mm5 \n\t"
  373. "por %%mm1, %%mm0 \n\t"
  374. "por %%mm4, %%mm3 \n\t"
  375. "por %%mm2, %%mm0 \n\t"
  376. "por %%mm5, %%mm3 \n\t"
  377. "psllq $16, %%mm3 \n\t"
  378. "por %%mm3, %%mm0 \n\t"
  379. MOVNTQ" %%mm0, (%0) \n\t"
  380. :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
  381. d += 4;
  382. s += 16;
  383. }
  384. __asm__ volatile(SFENCE:::"memory");
  385. __asm__ volatile(EMMS:::"memory");
  386. while (s < end) {
  387. register int rgb = *(const uint32_t*)s; s += 4;
  388. *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
  389. }
  390. }
  391. static inline void rgb32to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  392. {
  393. const uint8_t *s = src;
  394. const uint8_t *end;
  395. const uint8_t *mm_end;
  396. uint16_t *d = (uint16_t *)dst;
  397. end = s + src_size;
  398. mm_end = end - 15;
  399. __asm__ volatile(
  400. "movq %3, %%mm5 \n\t"
  401. "movq %4, %%mm6 \n\t"
  402. "movq %5, %%mm7 \n\t"
  403. "jmp 2f \n\t"
  404. ".p2align 4 \n\t"
  405. "1: \n\t"
  406. PREFETCH" 32(%1) \n\t"
  407. "movd (%1), %%mm0 \n\t"
  408. "movd 4(%1), %%mm3 \n\t"
  409. "punpckldq 8(%1), %%mm0 \n\t"
  410. "punpckldq 12(%1), %%mm3 \n\t"
  411. "movq %%mm0, %%mm1 \n\t"
  412. "movq %%mm3, %%mm4 \n\t"
  413. "pand %%mm6, %%mm0 \n\t"
  414. "pand %%mm6, %%mm3 \n\t"
  415. "pmaddwd %%mm7, %%mm0 \n\t"
  416. "pmaddwd %%mm7, %%mm3 \n\t"
  417. "pand %%mm5, %%mm1 \n\t"
  418. "pand %%mm5, %%mm4 \n\t"
  419. "por %%mm1, %%mm0 \n\t"
  420. "por %%mm4, %%mm3 \n\t"
  421. "psrld $6, %%mm0 \n\t"
  422. "pslld $10, %%mm3 \n\t"
  423. "por %%mm3, %%mm0 \n\t"
  424. MOVNTQ" %%mm0, (%0) \n\t"
  425. "add $16, %1 \n\t"
  426. "add $8, %0 \n\t"
  427. "2: \n\t"
  428. "cmp %2, %1 \n\t"
  429. " jb 1b \n\t"
  430. : "+r" (d), "+r"(s)
  431. : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
  432. );
  433. __asm__ volatile(SFENCE:::"memory");
  434. __asm__ volatile(EMMS:::"memory");
  435. while (s < end) {
  436. register int rgb = *(const uint32_t*)s; s += 4;
  437. *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
  438. }
  439. }
  440. static inline void rgb32tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  441. {
  442. const uint8_t *s = src;
  443. const uint8_t *end;
  444. const uint8_t *mm_end;
  445. uint16_t *d = (uint16_t *)dst;
  446. end = s + src_size;
  447. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  448. __asm__ volatile(
  449. "movq %0, %%mm7 \n\t"
  450. "movq %1, %%mm6 \n\t"
  451. ::"m"(red_15mask),"m"(green_15mask));
  452. mm_end = end - 15;
  453. while (s < mm_end) {
  454. __asm__ volatile(
  455. PREFETCH" 32(%1) \n\t"
  456. "movd (%1), %%mm0 \n\t"
  457. "movd 4(%1), %%mm3 \n\t"
  458. "punpckldq 8(%1), %%mm0 \n\t"
  459. "punpckldq 12(%1), %%mm3 \n\t"
  460. "movq %%mm0, %%mm1 \n\t"
  461. "movq %%mm0, %%mm2 \n\t"
  462. "movq %%mm3, %%mm4 \n\t"
  463. "movq %%mm3, %%mm5 \n\t"
  464. "psllq $7, %%mm0 \n\t"
  465. "psllq $7, %%mm3 \n\t"
  466. "pand %%mm7, %%mm0 \n\t"
  467. "pand %%mm7, %%mm3 \n\t"
  468. "psrlq $6, %%mm1 \n\t"
  469. "psrlq $6, %%mm4 \n\t"
  470. "pand %%mm6, %%mm1 \n\t"
  471. "pand %%mm6, %%mm4 \n\t"
  472. "psrlq $19, %%mm2 \n\t"
  473. "psrlq $19, %%mm5 \n\t"
  474. "pand %2, %%mm2 \n\t"
  475. "pand %2, %%mm5 \n\t"
  476. "por %%mm1, %%mm0 \n\t"
  477. "por %%mm4, %%mm3 \n\t"
  478. "por %%mm2, %%mm0 \n\t"
  479. "por %%mm5, %%mm3 \n\t"
  480. "psllq $16, %%mm3 \n\t"
  481. "por %%mm3, %%mm0 \n\t"
  482. MOVNTQ" %%mm0, (%0) \n\t"
  483. ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
  484. d += 4;
  485. s += 16;
  486. }
  487. __asm__ volatile(SFENCE:::"memory");
  488. __asm__ volatile(EMMS:::"memory");
  489. while (s < end) {
  490. register int rgb = *(const uint32_t*)s; s += 4;
  491. *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
  492. }
  493. }
  494. static inline void rgb24tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  495. {
  496. const uint8_t *s = src;
  497. const uint8_t *end;
  498. const uint8_t *mm_end;
  499. uint16_t *d = (uint16_t *)dst;
  500. end = s + src_size;
  501. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  502. __asm__ volatile(
  503. "movq %0, %%mm7 \n\t"
  504. "movq %1, %%mm6 \n\t"
  505. ::"m"(red_16mask),"m"(green_16mask));
  506. mm_end = end - 11;
  507. while (s < mm_end) {
  508. __asm__ volatile(
  509. PREFETCH" 32(%1) \n\t"
  510. "movd (%1), %%mm0 \n\t"
  511. "movd 3(%1), %%mm3 \n\t"
  512. "punpckldq 6(%1), %%mm0 \n\t"
  513. "punpckldq 9(%1), %%mm3 \n\t"
  514. "movq %%mm0, %%mm1 \n\t"
  515. "movq %%mm0, %%mm2 \n\t"
  516. "movq %%mm3, %%mm4 \n\t"
  517. "movq %%mm3, %%mm5 \n\t"
  518. "psrlq $3, %%mm0 \n\t"
  519. "psrlq $3, %%mm3 \n\t"
  520. "pand %2, %%mm0 \n\t"
  521. "pand %2, %%mm3 \n\t"
  522. "psrlq $5, %%mm1 \n\t"
  523. "psrlq $5, %%mm4 \n\t"
  524. "pand %%mm6, %%mm1 \n\t"
  525. "pand %%mm6, %%mm4 \n\t"
  526. "psrlq $8, %%mm2 \n\t"
  527. "psrlq $8, %%mm5 \n\t"
  528. "pand %%mm7, %%mm2 \n\t"
  529. "pand %%mm7, %%mm5 \n\t"
  530. "por %%mm1, %%mm0 \n\t"
  531. "por %%mm4, %%mm3 \n\t"
  532. "por %%mm2, %%mm0 \n\t"
  533. "por %%mm5, %%mm3 \n\t"
  534. "psllq $16, %%mm3 \n\t"
  535. "por %%mm3, %%mm0 \n\t"
  536. MOVNTQ" %%mm0, (%0) \n\t"
  537. ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
  538. d += 4;
  539. s += 12;
  540. }
  541. __asm__ volatile(SFENCE:::"memory");
  542. __asm__ volatile(EMMS:::"memory");
  543. while (s < end) {
  544. const int b = *s++;
  545. const int g = *s++;
  546. const int r = *s++;
  547. *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
  548. }
  549. }
  550. static inline void rgb24to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  551. {
  552. const uint8_t *s = src;
  553. const uint8_t *end;
  554. const uint8_t *mm_end;
  555. uint16_t *d = (uint16_t *)dst;
  556. end = s + src_size;
  557. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  558. __asm__ volatile(
  559. "movq %0, %%mm7 \n\t"
  560. "movq %1, %%mm6 \n\t"
  561. ::"m"(red_16mask),"m"(green_16mask));
  562. mm_end = end - 15;
  563. while (s < mm_end) {
  564. __asm__ volatile(
  565. PREFETCH" 32(%1) \n\t"
  566. "movd (%1), %%mm0 \n\t"
  567. "movd 3(%1), %%mm3 \n\t"
  568. "punpckldq 6(%1), %%mm0 \n\t"
  569. "punpckldq 9(%1), %%mm3 \n\t"
  570. "movq %%mm0, %%mm1 \n\t"
  571. "movq %%mm0, %%mm2 \n\t"
  572. "movq %%mm3, %%mm4 \n\t"
  573. "movq %%mm3, %%mm5 \n\t"
  574. "psllq $8, %%mm0 \n\t"
  575. "psllq $8, %%mm3 \n\t"
  576. "pand %%mm7, %%mm0 \n\t"
  577. "pand %%mm7, %%mm3 \n\t"
  578. "psrlq $5, %%mm1 \n\t"
  579. "psrlq $5, %%mm4 \n\t"
  580. "pand %%mm6, %%mm1 \n\t"
  581. "pand %%mm6, %%mm4 \n\t"
  582. "psrlq $19, %%mm2 \n\t"
  583. "psrlq $19, %%mm5 \n\t"
  584. "pand %2, %%mm2 \n\t"
  585. "pand %2, %%mm5 \n\t"
  586. "por %%mm1, %%mm0 \n\t"
  587. "por %%mm4, %%mm3 \n\t"
  588. "por %%mm2, %%mm0 \n\t"
  589. "por %%mm5, %%mm3 \n\t"
  590. "psllq $16, %%mm3 \n\t"
  591. "por %%mm3, %%mm0 \n\t"
  592. MOVNTQ" %%mm0, (%0) \n\t"
  593. ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
  594. d += 4;
  595. s += 12;
  596. }
  597. __asm__ volatile(SFENCE:::"memory");
  598. __asm__ volatile(EMMS:::"memory");
  599. while (s < end) {
  600. const int r = *s++;
  601. const int g = *s++;
  602. const int b = *s++;
  603. *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
  604. }
  605. }
  606. static inline void rgb24tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  607. {
  608. const uint8_t *s = src;
  609. const uint8_t *end;
  610. const uint8_t *mm_end;
  611. uint16_t *d = (uint16_t *)dst;
  612. end = s + src_size;
  613. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  614. __asm__ volatile(
  615. "movq %0, %%mm7 \n\t"
  616. "movq %1, %%mm6 \n\t"
  617. ::"m"(red_15mask),"m"(green_15mask));
  618. mm_end = end - 11;
  619. while (s < mm_end) {
  620. __asm__ volatile(
  621. PREFETCH" 32(%1) \n\t"
  622. "movd (%1), %%mm0 \n\t"
  623. "movd 3(%1), %%mm3 \n\t"
  624. "punpckldq 6(%1), %%mm0 \n\t"
  625. "punpckldq 9(%1), %%mm3 \n\t"
  626. "movq %%mm0, %%mm1 \n\t"
  627. "movq %%mm0, %%mm2 \n\t"
  628. "movq %%mm3, %%mm4 \n\t"
  629. "movq %%mm3, %%mm5 \n\t"
  630. "psrlq $3, %%mm0 \n\t"
  631. "psrlq $3, %%mm3 \n\t"
  632. "pand %2, %%mm0 \n\t"
  633. "pand %2, %%mm3 \n\t"
  634. "psrlq $6, %%mm1 \n\t"
  635. "psrlq $6, %%mm4 \n\t"
  636. "pand %%mm6, %%mm1 \n\t"
  637. "pand %%mm6, %%mm4 \n\t"
  638. "psrlq $9, %%mm2 \n\t"
  639. "psrlq $9, %%mm5 \n\t"
  640. "pand %%mm7, %%mm2 \n\t"
  641. "pand %%mm7, %%mm5 \n\t"
  642. "por %%mm1, %%mm0 \n\t"
  643. "por %%mm4, %%mm3 \n\t"
  644. "por %%mm2, %%mm0 \n\t"
  645. "por %%mm5, %%mm3 \n\t"
  646. "psllq $16, %%mm3 \n\t"
  647. "por %%mm3, %%mm0 \n\t"
  648. MOVNTQ" %%mm0, (%0) \n\t"
  649. ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
  650. d += 4;
  651. s += 12;
  652. }
  653. __asm__ volatile(SFENCE:::"memory");
  654. __asm__ volatile(EMMS:::"memory");
  655. while (s < end) {
  656. const int b = *s++;
  657. const int g = *s++;
  658. const int r = *s++;
  659. *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
  660. }
  661. }
  662. static inline void rgb24to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  663. {
  664. const uint8_t *s = src;
  665. const uint8_t *end;
  666. const uint8_t *mm_end;
  667. uint16_t *d = (uint16_t *)dst;
  668. end = s + src_size;
  669. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  670. __asm__ volatile(
  671. "movq %0, %%mm7 \n\t"
  672. "movq %1, %%mm6 \n\t"
  673. ::"m"(red_15mask),"m"(green_15mask));
  674. mm_end = end - 15;
  675. while (s < mm_end) {
  676. __asm__ volatile(
  677. PREFETCH" 32(%1) \n\t"
  678. "movd (%1), %%mm0 \n\t"
  679. "movd 3(%1), %%mm3 \n\t"
  680. "punpckldq 6(%1), %%mm0 \n\t"
  681. "punpckldq 9(%1), %%mm3 \n\t"
  682. "movq %%mm0, %%mm1 \n\t"
  683. "movq %%mm0, %%mm2 \n\t"
  684. "movq %%mm3, %%mm4 \n\t"
  685. "movq %%mm3, %%mm5 \n\t"
  686. "psllq $7, %%mm0 \n\t"
  687. "psllq $7, %%mm3 \n\t"
  688. "pand %%mm7, %%mm0 \n\t"
  689. "pand %%mm7, %%mm3 \n\t"
  690. "psrlq $6, %%mm1 \n\t"
  691. "psrlq $6, %%mm4 \n\t"
  692. "pand %%mm6, %%mm1 \n\t"
  693. "pand %%mm6, %%mm4 \n\t"
  694. "psrlq $19, %%mm2 \n\t"
  695. "psrlq $19, %%mm5 \n\t"
  696. "pand %2, %%mm2 \n\t"
  697. "pand %2, %%mm5 \n\t"
  698. "por %%mm1, %%mm0 \n\t"
  699. "por %%mm4, %%mm3 \n\t"
  700. "por %%mm2, %%mm0 \n\t"
  701. "por %%mm5, %%mm3 \n\t"
  702. "psllq $16, %%mm3 \n\t"
  703. "por %%mm3, %%mm0 \n\t"
  704. MOVNTQ" %%mm0, (%0) \n\t"
  705. ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
  706. d += 4;
  707. s += 12;
  708. }
  709. __asm__ volatile(SFENCE:::"memory");
  710. __asm__ volatile(EMMS:::"memory");
  711. while (s < end) {
  712. const int r = *s++;
  713. const int g = *s++;
  714. const int b = *s++;
  715. *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
  716. }
  717. }
  718. static inline void rgb15tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  719. {
  720. const uint16_t *end;
  721. const uint16_t *mm_end;
  722. uint8_t *d = dst;
  723. const uint16_t *s = (const uint16_t*)src;
  724. end = s + src_size/2;
  725. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  726. mm_end = end - 7;
  727. while (s < mm_end) {
  728. __asm__ volatile(
  729. PREFETCH" 32(%1) \n\t"
  730. "movq (%1), %%mm0 \n\t"
  731. "movq (%1), %%mm1 \n\t"
  732. "movq (%1), %%mm2 \n\t"
  733. "pand %2, %%mm0 \n\t"
  734. "pand %3, %%mm1 \n\t"
  735. "pand %4, %%mm2 \n\t"
  736. "psllq $5, %%mm0 \n\t"
  737. "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
  738. "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
  739. "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
  740. "movq %%mm0, %%mm3 \n\t"
  741. "movq %%mm1, %%mm4 \n\t"
  742. "movq %%mm2, %%mm5 \n\t"
  743. "punpcklwd %5, %%mm0 \n\t"
  744. "punpcklwd %5, %%mm1 \n\t"
  745. "punpcklwd %5, %%mm2 \n\t"
  746. "punpckhwd %5, %%mm3 \n\t"
  747. "punpckhwd %5, %%mm4 \n\t"
  748. "punpckhwd %5, %%mm5 \n\t"
  749. "psllq $8, %%mm1 \n\t"
  750. "psllq $16, %%mm2 \n\t"
  751. "por %%mm1, %%mm0 \n\t"
  752. "por %%mm2, %%mm0 \n\t"
  753. "psllq $8, %%mm4 \n\t"
  754. "psllq $16, %%mm5 \n\t"
  755. "por %%mm4, %%mm3 \n\t"
  756. "por %%mm5, %%mm3 \n\t"
  757. "movq %%mm0, %%mm6 \n\t"
  758. "movq %%mm3, %%mm7 \n\t"
  759. "movq 8(%1), %%mm0 \n\t"
  760. "movq 8(%1), %%mm1 \n\t"
  761. "movq 8(%1), %%mm2 \n\t"
  762. "pand %2, %%mm0 \n\t"
  763. "pand %3, %%mm1 \n\t"
  764. "pand %4, %%mm2 \n\t"
  765. "psllq $5, %%mm0 \n\t"
  766. "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
  767. "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
  768. "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
  769. "movq %%mm0, %%mm3 \n\t"
  770. "movq %%mm1, %%mm4 \n\t"
  771. "movq %%mm2, %%mm5 \n\t"
  772. "punpcklwd %5, %%mm0 \n\t"
  773. "punpcklwd %5, %%mm1 \n\t"
  774. "punpcklwd %5, %%mm2 \n\t"
  775. "punpckhwd %5, %%mm3 \n\t"
  776. "punpckhwd %5, %%mm4 \n\t"
  777. "punpckhwd %5, %%mm5 \n\t"
  778. "psllq $8, %%mm1 \n\t"
  779. "psllq $16, %%mm2 \n\t"
  780. "por %%mm1, %%mm0 \n\t"
  781. "por %%mm2, %%mm0 \n\t"
  782. "psllq $8, %%mm4 \n\t"
  783. "psllq $16, %%mm5 \n\t"
  784. "por %%mm4, %%mm3 \n\t"
  785. "por %%mm5, %%mm3 \n\t"
  786. :"=m"(*d)
  787. :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
  788. NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi)
  789. :"memory");
  790. /* borrowed 32 to 24 */
  791. __asm__ volatile(
  792. "movq %%mm0, %%mm4 \n\t"
  793. "movq %%mm3, %%mm5 \n\t"
  794. "movq %%mm6, %%mm0 \n\t"
  795. "movq %%mm7, %%mm1 \n\t"
  796. "movq %%mm4, %%mm6 \n\t"
  797. "movq %%mm5, %%mm7 \n\t"
  798. "movq %%mm0, %%mm2 \n\t"
  799. "movq %%mm1, %%mm3 \n\t"
  800. STORE_BGR24_MMX
  801. :: "r"(d), "m"(*s)
  802. NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
  803. :"memory");
  804. d += 24;
  805. s += 8;
  806. }
  807. __asm__ volatile(SFENCE:::"memory");
  808. __asm__ volatile(EMMS:::"memory");
  809. while (s < end) {
  810. register uint16_t bgr;
  811. bgr = *s++;
  812. *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
  813. *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
  814. *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
  815. }
  816. }
  817. static inline void rgb16tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  818. {
  819. const uint16_t *end;
  820. const uint16_t *mm_end;
  821. uint8_t *d = (uint8_t *)dst;
  822. const uint16_t *s = (const uint16_t *)src;
  823. end = s + src_size/2;
  824. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  825. mm_end = end - 7;
  826. while (s < mm_end) {
  827. __asm__ volatile(
  828. PREFETCH" 32(%1) \n\t"
  829. "movq (%1), %%mm0 \n\t"
  830. "movq (%1), %%mm1 \n\t"
  831. "movq (%1), %%mm2 \n\t"
  832. "pand %2, %%mm0 \n\t"
  833. "pand %3, %%mm1 \n\t"
  834. "pand %4, %%mm2 \n\t"
  835. "psllq $5, %%mm0 \n\t"
  836. "psrlq $1, %%mm2 \n\t"
  837. "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
  838. "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
  839. "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
  840. "movq %%mm0, %%mm3 \n\t"
  841. "movq %%mm1, %%mm4 \n\t"
  842. "movq %%mm2, %%mm5 \n\t"
  843. "punpcklwd %5, %%mm0 \n\t"
  844. "punpcklwd %5, %%mm1 \n\t"
  845. "punpcklwd %5, %%mm2 \n\t"
  846. "punpckhwd %5, %%mm3 \n\t"
  847. "punpckhwd %5, %%mm4 \n\t"
  848. "punpckhwd %5, %%mm5 \n\t"
  849. "psllq $8, %%mm1 \n\t"
  850. "psllq $16, %%mm2 \n\t"
  851. "por %%mm1, %%mm0 \n\t"
  852. "por %%mm2, %%mm0 \n\t"
  853. "psllq $8, %%mm4 \n\t"
  854. "psllq $16, %%mm5 \n\t"
  855. "por %%mm4, %%mm3 \n\t"
  856. "por %%mm5, %%mm3 \n\t"
  857. "movq %%mm0, %%mm6 \n\t"
  858. "movq %%mm3, %%mm7 \n\t"
  859. "movq 8(%1), %%mm0 \n\t"
  860. "movq 8(%1), %%mm1 \n\t"
  861. "movq 8(%1), %%mm2 \n\t"
  862. "pand %2, %%mm0 \n\t"
  863. "pand %3, %%mm1 \n\t"
  864. "pand %4, %%mm2 \n\t"
  865. "psllq $5, %%mm0 \n\t"
  866. "psrlq $1, %%mm2 \n\t"
  867. "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
  868. "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
  869. "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
  870. "movq %%mm0, %%mm3 \n\t"
  871. "movq %%mm1, %%mm4 \n\t"
  872. "movq %%mm2, %%mm5 \n\t"
  873. "punpcklwd %5, %%mm0 \n\t"
  874. "punpcklwd %5, %%mm1 \n\t"
  875. "punpcklwd %5, %%mm2 \n\t"
  876. "punpckhwd %5, %%mm3 \n\t"
  877. "punpckhwd %5, %%mm4 \n\t"
  878. "punpckhwd %5, %%mm5 \n\t"
  879. "psllq $8, %%mm1 \n\t"
  880. "psllq $16, %%mm2 \n\t"
  881. "por %%mm1, %%mm0 \n\t"
  882. "por %%mm2, %%mm0 \n\t"
  883. "psllq $8, %%mm4 \n\t"
  884. "psllq $16, %%mm5 \n\t"
  885. "por %%mm4, %%mm3 \n\t"
  886. "por %%mm5, %%mm3 \n\t"
  887. :"=m"(*d)
  888. :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
  889. NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi)
  890. :"memory");
  891. /* borrowed 32 to 24 */
  892. __asm__ volatile(
  893. "movq %%mm0, %%mm4 \n\t"
  894. "movq %%mm3, %%mm5 \n\t"
  895. "movq %%mm6, %%mm0 \n\t"
  896. "movq %%mm7, %%mm1 \n\t"
  897. "movq %%mm4, %%mm6 \n\t"
  898. "movq %%mm5, %%mm7 \n\t"
  899. "movq %%mm0, %%mm2 \n\t"
  900. "movq %%mm1, %%mm3 \n\t"
  901. STORE_BGR24_MMX
  902. :: "r"(d), "m"(*s)
  903. NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
  904. :"memory");
  905. d += 24;
  906. s += 8;
  907. }
  908. __asm__ volatile(SFENCE:::"memory");
  909. __asm__ volatile(EMMS:::"memory");
  910. while (s < end) {
  911. register uint16_t bgr;
  912. bgr = *s++;
  913. *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
  914. *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
  915. *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
  916. }
  917. }
  918. /*
  919. * mm0 = 00 B3 00 B2 00 B1 00 B0
  920. * mm1 = 00 G3 00 G2 00 G1 00 G0
  921. * mm2 = 00 R3 00 R2 00 R1 00 R0
  922. * mm6 = FF FF FF FF FF FF FF FF
  923. * mm7 = 00 00 00 00 00 00 00 00
  924. */
  925. #define PACK_RGB32 \
  926. "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
  927. "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
  928. "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
  929. "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
  930. "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
  931. "movq %%mm0, %%mm3 \n\t" \
  932. "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
  933. "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
  934. MOVNTQ" %%mm0, (%0) \n\t" \
  935. MOVNTQ" %%mm3, 8(%0) \n\t" \
  936. static inline void rgb15to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  937. {
  938. const uint16_t *end;
  939. const uint16_t *mm_end;
  940. uint8_t *d = dst;
  941. const uint16_t *s = (const uint16_t *)src;
  942. end = s + src_size/2;
  943. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  944. __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
  945. __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
  946. mm_end = end - 3;
  947. while (s < mm_end) {
  948. __asm__ volatile(
  949. PREFETCH" 32(%1) \n\t"
  950. "movq (%1), %%mm0 \n\t"
  951. "movq (%1), %%mm1 \n\t"
  952. "movq (%1), %%mm2 \n\t"
  953. "pand %2, %%mm0 \n\t"
  954. "pand %3, %%mm1 \n\t"
  955. "pand %4, %%mm2 \n\t"
  956. "psllq $5, %%mm0 \n\t"
  957. "pmulhw %5, %%mm0 \n\t"
  958. "pmulhw %5, %%mm1 \n\t"
  959. "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
  960. PACK_RGB32
  961. ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
  962. NAMED_CONSTRAINTS_ADD(mul15_hi)
  963. :"memory");
  964. d += 16;
  965. s += 4;
  966. }
  967. __asm__ volatile(SFENCE:::"memory");
  968. __asm__ volatile(EMMS:::"memory");
  969. while (s < end) {
  970. register uint16_t bgr;
  971. bgr = *s++;
  972. *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
  973. *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
  974. *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
  975. *d++ = 255;
  976. }
  977. }
  978. static inline void rgb16to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  979. {
  980. const uint16_t *end;
  981. const uint16_t *mm_end;
  982. uint8_t *d = dst;
  983. const uint16_t *s = (const uint16_t*)src;
  984. end = s + src_size/2;
  985. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  986. __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
  987. __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
  988. mm_end = end - 3;
  989. while (s < mm_end) {
  990. __asm__ volatile(
  991. PREFETCH" 32(%1) \n\t"
  992. "movq (%1), %%mm0 \n\t"
  993. "movq (%1), %%mm1 \n\t"
  994. "movq (%1), %%mm2 \n\t"
  995. "pand %2, %%mm0 \n\t"
  996. "pand %3, %%mm1 \n\t"
  997. "pand %4, %%mm2 \n\t"
  998. "psllq $5, %%mm0 \n\t"
  999. "psrlq $1, %%mm2 \n\t"
  1000. "pmulhw %5, %%mm0 \n\t"
  1001. "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
  1002. "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
  1003. PACK_RGB32
  1004. ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
  1005. NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi)
  1006. :"memory");
  1007. d += 16;
  1008. s += 4;
  1009. }
  1010. __asm__ volatile(SFENCE:::"memory");
  1011. __asm__ volatile(EMMS:::"memory");
  1012. while (s < end) {
  1013. register uint16_t bgr;
  1014. bgr = *s++;
  1015. *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
  1016. *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
  1017. *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
  1018. *d++ = 255;
  1019. }
  1020. }
  1021. static inline void rgb24tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
  1022. {
  1023. x86_reg mmx_size= 23 - src_size;
  1024. __asm__ volatile (
  1025. "test %%"FF_REG_a", %%"FF_REG_a" \n\t"
  1026. "jns 2f \n\t"
  1027. "movq "MANGLE(mask24r)", %%mm5 \n\t"
  1028. "movq "MANGLE(mask24g)", %%mm6 \n\t"
  1029. "movq "MANGLE(mask24b)", %%mm7 \n\t"
  1030. ".p2align 4 \n\t"
  1031. "1: \n\t"
  1032. PREFETCH" 32(%1, %%"FF_REG_a") \n\t"
  1033. "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG
  1034. "movq (%1, %%"FF_REG_a"), %%mm1 \n\t" // BGR BGR BG
  1035. "movq 2(%1, %%"FF_REG_a"), %%mm2 \n\t" // R BGR BGR B
  1036. "psllq $16, %%mm0 \n\t" // 00 BGR BGR
  1037. "pand %%mm5, %%mm0 \n\t"
  1038. "pand %%mm6, %%mm1 \n\t"
  1039. "pand %%mm7, %%mm2 \n\t"
  1040. "por %%mm0, %%mm1 \n\t"
  1041. "por %%mm2, %%mm1 \n\t"
  1042. "movq 6(%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG
  1043. MOVNTQ" %%mm1,(%2, %%"FF_REG_a") \n\t" // RGB RGB RG
  1044. "movq 8(%1, %%"FF_REG_a"), %%mm1 \n\t" // R BGR BGR B
  1045. "movq 10(%1, %%"FF_REG_a"), %%mm2 \n\t" // GR BGR BGR
  1046. "pand %%mm7, %%mm0 \n\t"
  1047. "pand %%mm5, %%mm1 \n\t"
  1048. "pand %%mm6, %%mm2 \n\t"
  1049. "por %%mm0, %%mm1 \n\t"
  1050. "por %%mm2, %%mm1 \n\t"
  1051. "movq 14(%1, %%"FF_REG_a"), %%mm0 \n\t" // R BGR BGR B
  1052. MOVNTQ" %%mm1, 8(%2, %%"FF_REG_a")\n\t" // B RGB RGB R
  1053. "movq 16(%1, %%"FF_REG_a"), %%mm1 \n\t" // GR BGR BGR
  1054. "movq 18(%1, %%"FF_REG_a"), %%mm2 \n\t" // BGR BGR BG
  1055. "pand %%mm6, %%mm0 \n\t"
  1056. "pand %%mm7, %%mm1 \n\t"
  1057. "pand %%mm5, %%mm2 \n\t"
  1058. "por %%mm0, %%mm1 \n\t"
  1059. "por %%mm2, %%mm1 \n\t"
  1060. MOVNTQ" %%mm1, 16(%2, %%"FF_REG_a") \n\t"
  1061. "add $24, %%"FF_REG_a" \n\t"
  1062. " js 1b \n\t"
  1063. "2: \n\t"
  1064. : "+a" (mmx_size)
  1065. : "r" (src-mmx_size), "r"(dst-mmx_size)
  1066. NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b)
  1067. );
  1068. __asm__ volatile(SFENCE:::"memory");
  1069. __asm__ volatile(EMMS:::"memory");
  1070. if (mmx_size==23) return; //finished, was multiple of 8
  1071. src+= src_size;
  1072. dst+= src_size;
  1073. src_size= 23-mmx_size;
  1074. src-= src_size;
  1075. dst-= src_size;
  1076. for (unsigned i = 0; i < src_size; i +=3) {
  1077. register uint8_t x;
  1078. x = src[i + 2];
  1079. dst[i + 1] = src[i + 1];
  1080. dst[i + 2] = src[i + 0];
  1081. dst[i + 0] = x;
  1082. }
  1083. }
  1084. static inline void yuvPlanartoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1085. int width, int height,
  1086. int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
  1087. {
  1088. const x86_reg chromWidth= width>>1;
  1089. for (int y = 0; y < height; y++) {
  1090. //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
  1091. __asm__ volatile(
  1092. "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
  1093. ".p2align 4 \n\t"
  1094. "1: \n\t"
  1095. PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t"
  1096. PREFETCH" 32(%2, %%"FF_REG_a") \n\t"
  1097. PREFETCH" 32(%3, %%"FF_REG_a") \n\t"
  1098. "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0)
  1099. "movq %%mm0, %%mm2 \n\t" // U(0)
  1100. "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0)
  1101. "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1102. "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
  1103. "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0)
  1104. "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8)
  1105. "movq %%mm3, %%mm4 \n\t" // Y(0)
  1106. "movq %%mm5, %%mm6 \n\t" // Y(8)
  1107. "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
  1108. "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
  1109. "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
  1110. "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
  1111. MOVNTQ" %%mm3, (%0, %%"FF_REG_a", 4) \n\t"
  1112. MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t"
  1113. MOVNTQ" %%mm5, 16(%0, %%"FF_REG_a", 4) \n\t"
  1114. MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t"
  1115. "add $8, %%"FF_REG_a" \n\t"
  1116. "cmp %4, %%"FF_REG_a" \n\t"
  1117. " jb 1b \n\t"
  1118. ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
  1119. : "%"FF_REG_a
  1120. );
  1121. if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
  1122. usrc += chromStride;
  1123. vsrc += chromStride;
  1124. }
  1125. ysrc += lumStride;
  1126. dst += dstStride;
  1127. }
  1128. __asm__(EMMS" \n\t"
  1129. SFENCE" \n\t"
  1130. :::"memory");
  1131. }
  1132. /**
  1133. * Height should be a multiple of 2 and width should be a multiple of 16.
  1134. * (If this is a problem for anyone then tell me, and I will fix it.)
  1135. */
  1136. static inline void yv12toyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1137. int width, int height,
  1138. int lumStride, int chromStride, int dstStride)
  1139. {
  1140. //FIXME interpolate chroma
  1141. yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1142. }
  1143. static inline void yuvPlanartouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1144. int width, int height,
  1145. int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
  1146. {
  1147. const x86_reg chromWidth= width>>1;
  1148. for (int y = 0; y < height; y++) {
  1149. //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
  1150. __asm__ volatile(
  1151. "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
  1152. ".p2align 4 \n\t"
  1153. "1: \n\t"
  1154. PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t"
  1155. PREFETCH" 32(%2, %%"FF_REG_a") \n\t"
  1156. PREFETCH" 32(%3, %%"FF_REG_a") \n\t"
  1157. "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0)
  1158. "movq %%mm0, %%mm2 \n\t" // U(0)
  1159. "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0)
  1160. "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1161. "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
  1162. "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0)
  1163. "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8)
  1164. "movq %%mm0, %%mm4 \n\t" // Y(0)
  1165. "movq %%mm2, %%mm6 \n\t" // Y(8)
  1166. "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
  1167. "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
  1168. "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
  1169. "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
  1170. MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 4) \n\t"
  1171. MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t"
  1172. MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 4) \n\t"
  1173. MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t"
  1174. "add $8, %%"FF_REG_a" \n\t"
  1175. "cmp %4, %%"FF_REG_a" \n\t"
  1176. " jb 1b \n\t"
  1177. ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
  1178. : "%"FF_REG_a
  1179. );
  1180. if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
  1181. usrc += chromStride;
  1182. vsrc += chromStride;
  1183. }
  1184. ysrc += lumStride;
  1185. dst += dstStride;
  1186. }
  1187. __asm__(EMMS" \n\t"
  1188. SFENCE" \n\t"
  1189. :::"memory");
  1190. }
  1191. /**
  1192. * Height should be a multiple of 2 and width should be a multiple of 16
  1193. * (If this is a problem for anyone then tell me, and I will fix it.)
  1194. */
  1195. static inline void yv12touyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1196. int width, int height,
  1197. int lumStride, int chromStride, int dstStride)
  1198. {
  1199. //FIXME interpolate chroma
  1200. yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1201. }
  1202. /**
  1203. * Width should be a multiple of 16.
  1204. */
  1205. static inline void yuv422ptouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1206. int width, int height,
  1207. int lumStride, int chromStride, int dstStride)
  1208. {
  1209. yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
  1210. }
  1211. /**
  1212. * Width should be a multiple of 16.
  1213. */
  1214. static inline void yuv422ptoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1215. int width, int height,
  1216. int lumStride, int chromStride, int dstStride)
  1217. {
  1218. yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
  1219. }
  1220. /**
  1221. * Height should be a multiple of 2 and width should be a multiple of 16.
  1222. * (If this is a problem for anyone then tell me, and I will fix it.)
  1223. */
  1224. static inline void yuy2toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1225. int width, int height,
  1226. int lumStride, int chromStride, int srcStride)
  1227. {
  1228. const x86_reg chromWidth= width>>1;
  1229. for (int y = 0; y < height; y += 2) {
  1230. __asm__ volatile(
  1231. "xor %%"FF_REG_a", %%"FF_REG_a"\n\t"
  1232. "pcmpeqw %%mm7, %%mm7 \n\t"
  1233. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1234. ".p2align 4 \n\t"
  1235. "1: \n\t"
  1236. PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t"
  1237. "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1238. "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
  1239. "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
  1240. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
  1241. "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
  1242. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
  1243. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1244. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1245. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1246. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1247. MOVNTQ" %%mm2, (%1, %%"FF_REG_a", 2) \n\t"
  1248. "movq 16(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
  1249. "movq 24(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
  1250. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
  1251. "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
  1252. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
  1253. "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
  1254. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1255. "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1256. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1257. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1258. MOVNTQ" %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t"
  1259. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1260. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1261. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1262. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1263. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1264. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1265. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1266. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1267. MOVNTQ" %%mm0, (%3, %%"FF_REG_a") \n\t"
  1268. MOVNTQ" %%mm2, (%2, %%"FF_REG_a") \n\t"
  1269. "add $8, %%"FF_REG_a" \n\t"
  1270. "cmp %4, %%"FF_REG_a" \n\t"
  1271. " jb 1b \n\t"
  1272. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1273. : "memory", "%"FF_REG_a
  1274. );
  1275. ydst += lumStride;
  1276. src += srcStride;
  1277. __asm__ volatile(
  1278. "xor %%"FF_REG_a", %%"FF_REG_a"\n\t"
  1279. ".p2align 4 \n\t"
  1280. "1: \n\t"
  1281. PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t"
  1282. "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1283. "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
  1284. "movq 16(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
  1285. "movq 24(%0, %%"FF_REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
  1286. "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1287. "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1288. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1289. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1290. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1291. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1292. MOVNTQ" %%mm0, (%1, %%"FF_REG_a", 2) \n\t"
  1293. MOVNTQ" %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t"
  1294. "add $8, %%"FF_REG_a"\n\t"
  1295. "cmp %4, %%"FF_REG_a"\n\t"
  1296. " jb 1b \n\t"
  1297. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1298. : "memory", "%"FF_REG_a
  1299. );
  1300. udst += chromStride;
  1301. vdst += chromStride;
  1302. ydst += lumStride;
  1303. src += srcStride;
  1304. }
  1305. __asm__ volatile(EMMS" \n\t"
  1306. SFENCE" \n\t"
  1307. :::"memory");
  1308. }
  1309. static inline void planar2x_mmxext(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
  1310. {
  1311. dst[0]= src[0];
  1312. // first line
  1313. for (int x = 0; x < srcWidth - 1; x++) {
  1314. dst[2*x+1]= (3*src[x] + src[x+1])>>2;
  1315. dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
  1316. }
  1317. dst[2*srcWidth-1]= src[srcWidth-1];
  1318. dst+= dstStride;
  1319. for (int y = 1; y < srcHeight; y++) {
  1320. x86_reg mmxSize= srcWidth&~15;
  1321. if (mmxSize) {
  1322. __asm__ volatile(
  1323. "mov %4, %%"FF_REG_a" \n\t"
  1324. "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
  1325. "movq (%0, %%"FF_REG_a"), %%mm4 \n\t"
  1326. "movq %%mm4, %%mm2 \n\t"
  1327. "psllq $8, %%mm4 \n\t"
  1328. "pand %%mm0, %%mm2 \n\t"
  1329. "por %%mm2, %%mm4 \n\t"
  1330. "movq (%1, %%"FF_REG_a"), %%mm5 \n\t"
  1331. "movq %%mm5, %%mm3 \n\t"
  1332. "psllq $8, %%mm5 \n\t"
  1333. "pand %%mm0, %%mm3 \n\t"
  1334. "por %%mm3, %%mm5 \n\t"
  1335. "1: \n\t"
  1336. "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
  1337. "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
  1338. "movq 1(%0, %%"FF_REG_a"), %%mm2 \n\t"
  1339. "movq 1(%1, %%"FF_REG_a"), %%mm3 \n\t"
  1340. PAVGB" %%mm0, %%mm5 \n\t"
  1341. PAVGB" %%mm0, %%mm3 \n\t"
  1342. PAVGB" %%mm0, %%mm5 \n\t"
  1343. PAVGB" %%mm0, %%mm3 \n\t"
  1344. PAVGB" %%mm1, %%mm4 \n\t"
  1345. PAVGB" %%mm1, %%mm2 \n\t"
  1346. PAVGB" %%mm1, %%mm4 \n\t"
  1347. PAVGB" %%mm1, %%mm2 \n\t"
  1348. "movq %%mm5, %%mm7 \n\t"
  1349. "movq %%mm4, %%mm6 \n\t"
  1350. "punpcklbw %%mm3, %%mm5 \n\t"
  1351. "punpckhbw %%mm3, %%mm7 \n\t"
  1352. "punpcklbw %%mm2, %%mm4 \n\t"
  1353. "punpckhbw %%mm2, %%mm6 \n\t"
  1354. MOVNTQ" %%mm5, (%2, %%"FF_REG_a", 2) \n\t"
  1355. MOVNTQ" %%mm7, 8(%2, %%"FF_REG_a", 2) \n\t"
  1356. MOVNTQ" %%mm4, (%3, %%"FF_REG_a", 2) \n\t"
  1357. MOVNTQ" %%mm6, 8(%3, %%"FF_REG_a", 2) \n\t"
  1358. "add $8, %%"FF_REG_a" \n\t"
  1359. "movq -1(%0, %%"FF_REG_a"), %%mm4 \n\t"
  1360. "movq -1(%1, %%"FF_REG_a"), %%mm5 \n\t"
  1361. " js 1b \n\t"
  1362. :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
  1363. "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
  1364. "g" (-mmxSize)
  1365. NAMED_CONSTRAINTS_ADD(mmx_ff)
  1366. : "%"FF_REG_a
  1367. );
  1368. } else {
  1369. mmxSize = 1;
  1370. dst[0] = (src[0] * 3 + src[srcStride]) >> 2;
  1371. dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
  1372. }
  1373. for (int x = mmxSize - 1; x < srcWidth - 1; x++) {
  1374. dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
  1375. dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
  1376. dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
  1377. dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
  1378. }
  1379. dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
  1380. dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
  1381. dst+=dstStride*2;
  1382. src+=srcStride;
  1383. }
  1384. // last line
  1385. dst[0]= src[0];
  1386. for (int x = 0; x < srcWidth - 1; x++) {
  1387. dst[2*x+1]= (3*src[x] + src[x+1])>>2;
  1388. dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
  1389. }
  1390. dst[2*srcWidth-1]= src[srcWidth-1];
  1391. __asm__ volatile(EMMS" \n\t"
  1392. SFENCE" \n\t"
  1393. :::"memory");
  1394. }
/**
 * Both width and height should be multiples of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line;
 * the other lines are ignored in the C version.
 * FIXME: Write HQ version.
 */
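/*
 * Rough sketch of the sampling pattern described above (illustration only, not
 * the path used below; RGB2Y/RGB2U/RGB2V are hypothetical per-pixel conversion
 * helpers): luma is computed for every pixel, while one U/V pair per 2x2 block
 * comes from that block's top-left pixel.
 *
 *     for (int y = 0; y < height; y += 2) {
 *         for (int x = 0; x < width; x++) {
 *             ydst[ y   *lumStride + x] = RGB2Y(src +  y   *srcStride + 3*x);
 *             ydst[(y+1)*lumStride + x] = RGB2Y(src + (y+1)*srcStride + 3*x);
 *         }
 *         for (int x = 0; x < width; x += 2) {
 *             udst[(y/2)*chromStride + x/2] = RGB2U(src + y*srcStride + 3*x);
 *             vdst[(y/2)*chromStride + x/2] = RGB2V(src + y*srcStride + 3*x);
 *         }
 *     }
 */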
  1402. #if ARCH_X86_32 && HAVE_7REGS
  1403. DECLARE_ASM_CONST(8, uint64_t, bgr2YOffset) = 0x1010101010101010ULL;
  1404. DECLARE_ASM_CONST(8, uint64_t, bgr2UVOffset) = 0x8080808080808080ULL;
  1405. DECLARE_ASM_CONST(8, uint64_t, w1111) = 0x0001000100010001ULL;
  1406. static inline void rgb24toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1407. int width, int height,
  1408. int lumStride, int chromStride, int srcStride,
  1409. const int32_t *rgb2yuv)
  1410. {
  1411. #define BGR2Y_IDX "16*4+16*32"
  1412. #define BGR2U_IDX "16*4+16*33"
  1413. #define BGR2V_IDX "16*4+16*34"
  1414. int y;
  1415. const x86_reg chromWidth= width>>1;
  1416. if (height > 2) {
  1417. ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv);
  1418. src += 2*srcStride;
  1419. ydst += 2*lumStride;
  1420. udst += chromStride;
  1421. vdst += chromStride;
  1422. height -= 2;
  1423. }
  1424. for (y = 0; y < height - 2; y += 2) {
  1425. for (int i = 0; i < 2; i++) {
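/* luma pass: each of the two iterations converts one source line of the pair to Y */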
  1426. __asm__ volatile(
  1427. "mov %2, %%"FF_REG_a"\n\t"
  1428. "movq "BGR2Y_IDX"(%3), %%mm6 \n\t"
  1429. "movq "MANGLE(w1111)", %%mm5 \n\t"
  1430. "pxor %%mm7, %%mm7 \n\t"
  1431. "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
  1432. ".p2align 4 \n\t"
  1433. "1: \n\t"
  1434. PREFETCH" 64(%0, %%"FF_REG_d") \n\t"
  1435. "movd (%0, %%"FF_REG_d"), %%mm0 \n\t"
  1436. "movd 3(%0, %%"FF_REG_d"), %%mm1 \n\t"
  1437. "punpcklbw %%mm7, %%mm0 \n\t"
  1438. "punpcklbw %%mm7, %%mm1 \n\t"
  1439. "movd 6(%0, %%"FF_REG_d"), %%mm2 \n\t"
  1440. "movd 9(%0, %%"FF_REG_d"), %%mm3 \n\t"
  1441. "punpcklbw %%mm7, %%mm2 \n\t"
  1442. "punpcklbw %%mm7, %%mm3 \n\t"
  1443. "pmaddwd %%mm6, %%mm0 \n\t"
  1444. "pmaddwd %%mm6, %%mm1 \n\t"
  1445. "pmaddwd %%mm6, %%mm2 \n\t"
  1446. "pmaddwd %%mm6, %%mm3 \n\t"
  1447. "psrad $8, %%mm0 \n\t"
  1448. "psrad $8, %%mm1 \n\t"
  1449. "psrad $8, %%mm2 \n\t"
  1450. "psrad $8, %%mm3 \n\t"
  1451. "packssdw %%mm1, %%mm0 \n\t"
  1452. "packssdw %%mm3, %%mm2 \n\t"
  1453. "pmaddwd %%mm5, %%mm0 \n\t"
  1454. "pmaddwd %%mm5, %%mm2 \n\t"
  1455. "packssdw %%mm2, %%mm0 \n\t"
  1456. "psraw $7, %%mm0 \n\t"
  1457. "movd 12(%0, %%"FF_REG_d"), %%mm4 \n\t"
  1458. "movd 15(%0, %%"FF_REG_d"), %%mm1 \n\t"
  1459. "punpcklbw %%mm7, %%mm4 \n\t"
  1460. "punpcklbw %%mm7, %%mm1 \n\t"
  1461. "movd 18(%0, %%"FF_REG_d"), %%mm2 \n\t"
  1462. "movd 21(%0, %%"FF_REG_d"), %%mm3 \n\t"
  1463. "punpcklbw %%mm7, %%mm2 \n\t"
  1464. "punpcklbw %%mm7, %%mm3 \n\t"
  1465. "pmaddwd %%mm6, %%mm4 \n\t"
  1466. "pmaddwd %%mm6, %%mm1 \n\t"
  1467. "pmaddwd %%mm6, %%mm2 \n\t"
  1468. "pmaddwd %%mm6, %%mm3 \n\t"
  1469. "psrad $8, %%mm4 \n\t"
  1470. "psrad $8, %%mm1 \n\t"
  1471. "psrad $8, %%mm2 \n\t"
  1472. "psrad $8, %%mm3 \n\t"
  1473. "packssdw %%mm1, %%mm4 \n\t"
  1474. "packssdw %%mm3, %%mm2 \n\t"
  1475. "pmaddwd %%mm5, %%mm4 \n\t"
  1476. "pmaddwd %%mm5, %%mm2 \n\t"
  1477. "add $24, %%"FF_REG_d"\n\t"
  1478. "packssdw %%mm2, %%mm4 \n\t"
  1479. "psraw $7, %%mm4 \n\t"
  1480. "packuswb %%mm4, %%mm0 \n\t"
  1481. "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
  1482. MOVNTQ" %%mm0, (%1, %%"FF_REG_a") \n\t"
  1483. "add $8, %%"FF_REG_a" \n\t"
  1484. " js 1b \n\t"
  1485. : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
  1486. NAMED_CONSTRAINTS_ADD(w1111,bgr2YOffset)
  1487. : "%"FF_REG_a, "%"FF_REG_d
  1488. );
  1489. ydst += lumStride;
  1490. src += srcStride;
  1491. }
  1492. src -= srcStride*2;
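/* chroma pass: PAVGB averages the two source lines, then a second PAVGB after psrlq $24
   (one full RGB24 pixel) averages horizontally adjacent pixels before the U/V matrix multiplies */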
  1493. __asm__ volatile(
  1494. "mov %4, %%"FF_REG_a"\n\t"
  1495. "movq "MANGLE(w1111)", %%mm5 \n\t"
  1496. "movq "BGR2U_IDX"(%5), %%mm6 \n\t"
  1497. "pxor %%mm7, %%mm7 \n\t"
  1498. "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
  1499. "add %%"FF_REG_d", %%"FF_REG_d"\n\t"
  1500. ".p2align 4 \n\t"
  1501. "1: \n\t"
  1502. PREFETCH" 64(%0, %%"FF_REG_d") \n\t"
  1503. PREFETCH" 64(%1, %%"FF_REG_d") \n\t"
  1504. "movq (%0, %%"FF_REG_d"), %%mm0 \n\t"
  1505. "movq (%1, %%"FF_REG_d"), %%mm1 \n\t"
  1506. "movq 6(%0, %%"FF_REG_d"), %%mm2 \n\t"
  1507. "movq 6(%1, %%"FF_REG_d"), %%mm3 \n\t"
  1508. PAVGB" %%mm1, %%mm0 \n\t"
  1509. PAVGB" %%mm3, %%mm2 \n\t"
  1510. "movq %%mm0, %%mm1 \n\t"
  1511. "movq %%mm2, %%mm3 \n\t"
  1512. "psrlq $24, %%mm0 \n\t"
  1513. "psrlq $24, %%mm2 \n\t"
  1514. PAVGB" %%mm1, %%mm0 \n\t"
  1515. PAVGB" %%mm3, %%mm2 \n\t"
  1516. "punpcklbw %%mm7, %%mm0 \n\t"
  1517. "punpcklbw %%mm7, %%mm2 \n\t"
  1518. "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
  1519. "movq "BGR2V_IDX"(%5), %%mm3 \n\t"
  1520. "pmaddwd %%mm0, %%mm1 \n\t"
  1521. "pmaddwd %%mm2, %%mm3 \n\t"
  1522. "pmaddwd %%mm6, %%mm0 \n\t"
  1523. "pmaddwd %%mm6, %%mm2 \n\t"
  1524. "psrad $8, %%mm0 \n\t"
  1525. "psrad $8, %%mm1 \n\t"
  1526. "psrad $8, %%mm2 \n\t"
  1527. "psrad $8, %%mm3 \n\t"
  1528. "packssdw %%mm2, %%mm0 \n\t"
  1529. "packssdw %%mm3, %%mm1 \n\t"
  1530. "pmaddwd %%mm5, %%mm0 \n\t"
  1531. "pmaddwd %%mm5, %%mm1 \n\t"
  1532. "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
  1533. "psraw $7, %%mm0 \n\t"
  1534. "movq 12(%0, %%"FF_REG_d"), %%mm4 \n\t"
  1535. "movq 12(%1, %%"FF_REG_d"), %%mm1 \n\t"
  1536. "movq 18(%0, %%"FF_REG_d"), %%mm2 \n\t"
  1537. "movq 18(%1, %%"FF_REG_d"), %%mm3 \n\t"
  1538. PAVGB" %%mm1, %%mm4 \n\t"
  1539. PAVGB" %%mm3, %%mm2 \n\t"
  1540. "movq %%mm4, %%mm1 \n\t"
  1541. "movq %%mm2, %%mm3 \n\t"
  1542. "psrlq $24, %%mm4 \n\t"
  1543. "psrlq $24, %%mm2 \n\t"
  1544. PAVGB" %%mm1, %%mm4 \n\t"
  1545. PAVGB" %%mm3, %%mm2 \n\t"
  1546. "punpcklbw %%mm7, %%mm4 \n\t"
  1547. "punpcklbw %%mm7, %%mm2 \n\t"
  1548. "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
  1549. "movq "BGR2V_IDX"(%5), %%mm3 \n\t"
  1550. "pmaddwd %%mm4, %%mm1 \n\t"
  1551. "pmaddwd %%mm2, %%mm3 \n\t"
  1552. "pmaddwd %%mm6, %%mm4 \n\t"
  1553. "pmaddwd %%mm6, %%mm2 \n\t"
  1554. "psrad $8, %%mm4 \n\t"
  1555. "psrad $8, %%mm1 \n\t"
  1556. "psrad $8, %%mm2 \n\t"
  1557. "psrad $8, %%mm3 \n\t"
  1558. "packssdw %%mm2, %%mm4 \n\t"
  1559. "packssdw %%mm3, %%mm1 \n\t"
  1560. "pmaddwd %%mm5, %%mm4 \n\t"
  1561. "pmaddwd %%mm5, %%mm1 \n\t"
  1562. "add $24, %%"FF_REG_d"\n\t"
  1563. "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
  1564. "psraw $7, %%mm4 \n\t"
  1565. "movq %%mm0, %%mm1 \n\t"
  1566. "punpckldq %%mm4, %%mm0 \n\t"
  1567. "punpckhdq %%mm4, %%mm1 \n\t"
  1568. "packsswb %%mm1, %%mm0 \n\t"
  1569. "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
  1570. "movd %%mm0, (%2, %%"FF_REG_a") \n\t"
  1571. "punpckhdq %%mm0, %%mm0 \n\t"
  1572. "movd %%mm0, (%3, %%"FF_REG_a") \n\t"
  1573. "add $4, %%"FF_REG_a" \n\t"
  1574. " js 1b \n\t"
  1575. : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
  1576. NAMED_CONSTRAINTS_ADD(w1111,bgr2UVOffset)
  1577. : "%"FF_REG_a, "%"FF_REG_d
  1578. );
  1579. udst += chromStride;
  1580. vdst += chromStride;
  1581. src += srcStride*2;
  1582. }
  1583. __asm__ volatile(EMMS" \n\t"
  1584. SFENCE" \n\t"
  1585. :::"memory");
  1586. ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
  1587. }
#endif /* ARCH_X86_32 && HAVE_7REGS */
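/*
 * vu9_to_vu12: upsample two chroma planes by 2x in each direction using
 * nearest-neighbour duplication: every source byte is written twice per output
 * row (punpcklbw/punpckhbw of a register with itself) and every source row
 * feeds two consecutive output rows (y>>1).
 */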
  1589. static inline void vu9_to_vu12_mmxext(const uint8_t *src1, const uint8_t *src2,
  1590. uint8_t *dst1, uint8_t *dst2,
  1591. int width, int height,
  1592. int srcStride1, int srcStride2,
  1593. int dstStride1, int dstStride2)
  1594. {
  1595. int w,h;
  1596. w=width/2; h=height/2;
  1597. __asm__ volatile(
  1598. PREFETCH" %0 \n\t"
  1599. PREFETCH" %1 \n\t"
  1600. ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
  1601. for (x86_reg y = 0; y < h; y++) {
  1602. const uint8_t* s1=src1+srcStride1*(y>>1);
  1603. uint8_t* d=dst1+dstStride1*y;
  1604. x86_reg x = 0;
  1605. for (;x<w-31;x+=32) {
  1606. __asm__ volatile(
  1607. PREFETCH" 32(%1,%2) \n\t"
  1608. "movq (%1,%2), %%mm0 \n\t"
  1609. "movq 8(%1,%2), %%mm2 \n\t"
  1610. "movq 16(%1,%2), %%mm4 \n\t"
  1611. "movq 24(%1,%2), %%mm6 \n\t"
  1612. "movq %%mm0, %%mm1 \n\t"
  1613. "movq %%mm2, %%mm3 \n\t"
  1614. "movq %%mm4, %%mm5 \n\t"
  1615. "movq %%mm6, %%mm7 \n\t"
  1616. "punpcklbw %%mm0, %%mm0 \n\t"
  1617. "punpckhbw %%mm1, %%mm1 \n\t"
  1618. "punpcklbw %%mm2, %%mm2 \n\t"
  1619. "punpckhbw %%mm3, %%mm3 \n\t"
  1620. "punpcklbw %%mm4, %%mm4 \n\t"
  1621. "punpckhbw %%mm5, %%mm5 \n\t"
  1622. "punpcklbw %%mm6, %%mm6 \n\t"
  1623. "punpckhbw %%mm7, %%mm7 \n\t"
  1624. MOVNTQ" %%mm0, (%0,%2,2) \n\t"
  1625. MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
  1626. MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
  1627. MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
  1628. MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
  1629. MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
  1630. MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
  1631. MOVNTQ" %%mm7, 56(%0,%2,2)"
  1632. :: "r"(d), "r"(s1), "r"(x)
  1633. :"memory");
  1634. }
  1635. for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
  1636. }
  1637. for (x86_reg y = 0; y < h; y++) {
  1638. const uint8_t* s2=src2+srcStride2*(y>>1);
  1639. uint8_t* d=dst2+dstStride2*y;
  1640. x86_reg x = 0;
  1641. for (;x<w-31;x+=32) {
  1642. __asm__ volatile(
  1643. PREFETCH" 32(%1,%2) \n\t"
  1644. "movq (%1,%2), %%mm0 \n\t"
  1645. "movq 8(%1,%2), %%mm2 \n\t"
  1646. "movq 16(%1,%2), %%mm4 \n\t"
  1647. "movq 24(%1,%2), %%mm6 \n\t"
  1648. "movq %%mm0, %%mm1 \n\t"
  1649. "movq %%mm2, %%mm3 \n\t"
  1650. "movq %%mm4, %%mm5 \n\t"
  1651. "movq %%mm6, %%mm7 \n\t"
  1652. "punpcklbw %%mm0, %%mm0 \n\t"
  1653. "punpckhbw %%mm1, %%mm1 \n\t"
  1654. "punpcklbw %%mm2, %%mm2 \n\t"
  1655. "punpckhbw %%mm3, %%mm3 \n\t"
  1656. "punpcklbw %%mm4, %%mm4 \n\t"
  1657. "punpckhbw %%mm5, %%mm5 \n\t"
  1658. "punpcklbw %%mm6, %%mm6 \n\t"
  1659. "punpckhbw %%mm7, %%mm7 \n\t"
  1660. MOVNTQ" %%mm0, (%0,%2,2) \n\t"
  1661. MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
  1662. MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
  1663. MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
  1664. MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
  1665. MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
  1666. MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
  1667. MOVNTQ" %%mm7, 56(%0,%2,2)"
  1668. :: "r"(d), "r"(s2), "r"(x)
  1669. :"memory");
  1670. }
  1671. for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
  1672. }
  1673. __asm__(
  1674. EMMS" \n\t"
  1675. SFENCE" \n\t"
  1676. ::: "memory"
  1677. );
  1678. }
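/*
 * yvu9_to_yuy2: interleave YUV410-style planes (chroma subsampled 4x in both
 * directions) into packed YUY2. Each U/V sample is repeated across four luma
 * samples horizontally and reused for four consecutive rows (y>>2).
 */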
  1679. static inline void yvu9_to_yuy2_mmxext(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
  1680. uint8_t *dst,
  1681. int width, int height,
  1682. int srcStride1, int srcStride2,
  1683. int srcStride3, int dstStride)
  1684. {
  1685. int w,h;
  1686. w=width/2; h=height;
  1687. for (int y = 0; y < h; y++) {
  1688. const uint8_t* yp=src1+srcStride1*y;
  1689. const uint8_t* up=src2+srcStride2*(y>>2);
  1690. const uint8_t* vp=src3+srcStride3*(y>>2);
  1691. uint8_t* d=dst+dstStride*y;
  1692. x86_reg x = 0;
  1693. for (;x<w-7;x+=8) {
  1694. __asm__ volatile(
  1695. PREFETCH" 32(%1, %0) \n\t"
  1696. PREFETCH" 32(%2, %0) \n\t"
  1697. PREFETCH" 32(%3, %0) \n\t"
  1698. "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
  1699. "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
  1700. "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
  1701. "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
  1702. "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
  1703. "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
  1704. "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
  1705. "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
  1706. "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
  1707. "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
  1708. "movq %%mm1, %%mm6 \n\t"
  1709. "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
  1710. "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
  1711. "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
  1712. MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
  1713. MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
  1714. "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
  1715. "movq 8(%1, %0, 4), %%mm0 \n\t"
  1716. "movq %%mm0, %%mm3 \n\t"
  1717. "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
  1718. "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
  1719. MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
  1720. MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
  1721. "movq %%mm4, %%mm6 \n\t"
  1722. "movq 16(%1, %0, 4), %%mm0 \n\t"
  1723. "movq %%mm0, %%mm3 \n\t"
  1724. "punpcklbw %%mm5, %%mm4 \n\t"
  1725. "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
  1726. "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
  1727. MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
  1728. MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
  1729. "punpckhbw %%mm5, %%mm6 \n\t"
  1730. "movq 24(%1, %0, 4), %%mm0 \n\t"
  1731. "movq %%mm0, %%mm3 \n\t"
  1732. "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
  1733. "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
  1734. MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
  1735. MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
  1736. : "+r" (x)
  1737. : "r"(yp), "r" (up), "r"(vp), "r"(d)
  1738. :"memory");
  1739. }
  1740. for (; x<w; x++) {
  1741. const int x2 = x<<2;
  1742. d[8*x+0] = yp[x2];
  1743. d[8*x+1] = up[x];
  1744. d[8*x+2] = yp[x2+1];
  1745. d[8*x+3] = vp[x];
  1746. d[8*x+4] = yp[x2+2];
  1747. d[8*x+5] = up[x];
  1748. d[8*x+6] = yp[x2+3];
  1749. d[8*x+7] = vp[x];
  1750. }
  1751. }
  1752. __asm__(
  1753. EMMS" \n\t"
  1754. SFENCE" \n\t"
  1755. ::: "memory"
  1756. );
  1757. }
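/*
 * The extract_* helpers below unpack YUYV/UYVY rows: extract_even copies the
 * bytes at even offsets (src[2*i]), extract_odd those at odd offsets; the *2
 * variants split the remaining chroma bytes into separate U and V planes, and
 * the *2avg variants additionally average two source rows with PAVGB for
 * vertically subsampled (4:2:0) output.
 */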
  1758. static void extract_even_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count)
  1759. {
  1760. dst += count;
  1761. src += 2*count;
  1762. count= - count;
  1763. if(count <= -16) {
  1764. count += 15;
  1765. __asm__ volatile(
  1766. "pcmpeqw %%mm7, %%mm7 \n\t"
  1767. "psrlw $8, %%mm7 \n\t"
  1768. "1: \n\t"
  1769. "movq -30(%1, %0, 2), %%mm0 \n\t"
  1770. "movq -22(%1, %0, 2), %%mm1 \n\t"
  1771. "movq -14(%1, %0, 2), %%mm2 \n\t"
  1772. "movq -6(%1, %0, 2), %%mm3 \n\t"
  1773. "pand %%mm7, %%mm0 \n\t"
  1774. "pand %%mm7, %%mm1 \n\t"
  1775. "pand %%mm7, %%mm2 \n\t"
  1776. "pand %%mm7, %%mm3 \n\t"
  1777. "packuswb %%mm1, %%mm0 \n\t"
  1778. "packuswb %%mm3, %%mm2 \n\t"
  1779. MOVNTQ" %%mm0,-15(%2, %0) \n\t"
  1780. MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
  1781. "add $16, %0 \n\t"
  1782. " js 1b \n\t"
  1783. : "+r"(count)
  1784. : "r"(src), "r"(dst)
  1785. );
  1786. count -= 15;
  1787. }
  1788. while(count<0) {
  1789. dst[count]= src[2*count];
  1790. count++;
  1791. }
  1792. }
  1793. static void extract_odd_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count)
  1794. {
  1795. src ++;
  1796. dst += count;
  1797. src += 2*count;
  1798. count= - count;
  1799. if(count < -16) {
  1800. count += 16;
  1801. __asm__ volatile(
  1802. "pcmpeqw %%mm7, %%mm7 \n\t"
  1803. "psrlw $8, %%mm7 \n\t"
  1804. "1: \n\t"
  1805. "movq -32(%1, %0, 2), %%mm0 \n\t"
  1806. "movq -24(%1, %0, 2), %%mm1 \n\t"
  1807. "movq -16(%1, %0, 2), %%mm2 \n\t"
  1808. "movq -8(%1, %0, 2), %%mm3 \n\t"
  1809. "pand %%mm7, %%mm0 \n\t"
  1810. "pand %%mm7, %%mm1 \n\t"
  1811. "pand %%mm7, %%mm2 \n\t"
  1812. "pand %%mm7, %%mm3 \n\t"
  1813. "packuswb %%mm1, %%mm0 \n\t"
  1814. "packuswb %%mm3, %%mm2 \n\t"
  1815. MOVNTQ" %%mm0,-16(%2, %0) \n\t"
  1816. MOVNTQ" %%mm2,- 8(%2, %0) \n\t"
  1817. "add $16, %0 \n\t"
  1818. " js 1b \n\t"
  1819. : "+r"(count)
  1820. : "r"(src), "r"(dst)
  1821. );
  1822. count -= 16;
  1823. }
  1824. while(count<0) {
  1825. dst[count]= src[2*count];
  1826. count++;
  1827. }
  1828. }
  1829. #if ARCH_X86_32
  1830. static void extract_even2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
  1831. {
  1832. dst0+= count;
  1833. dst1+= count;
  1834. src += 4*count;
  1835. count= - count;
  1836. if(count <= -8) {
  1837. count += 7;
  1838. __asm__ volatile(
  1839. "pcmpeqw %%mm7, %%mm7 \n\t"
  1840. "psrlw $8, %%mm7 \n\t"
  1841. "1: \n\t"
  1842. "movq -28(%1, %0, 4), %%mm0 \n\t"
  1843. "movq -20(%1, %0, 4), %%mm1 \n\t"
  1844. "movq -12(%1, %0, 4), %%mm2 \n\t"
  1845. "movq -4(%1, %0, 4), %%mm3 \n\t"
  1846. "pand %%mm7, %%mm0 \n\t"
  1847. "pand %%mm7, %%mm1 \n\t"
  1848. "pand %%mm7, %%mm2 \n\t"
  1849. "pand %%mm7, %%mm3 \n\t"
  1850. "packuswb %%mm1, %%mm0 \n\t"
  1851. "packuswb %%mm3, %%mm2 \n\t"
  1852. "movq %%mm0, %%mm1 \n\t"
  1853. "movq %%mm2, %%mm3 \n\t"
  1854. "psrlw $8, %%mm0 \n\t"
  1855. "psrlw $8, %%mm2 \n\t"
  1856. "pand %%mm7, %%mm1 \n\t"
  1857. "pand %%mm7, %%mm3 \n\t"
  1858. "packuswb %%mm2, %%mm0 \n\t"
  1859. "packuswb %%mm3, %%mm1 \n\t"
  1860. MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
  1861. MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
  1862. "add $8, %0 \n\t"
  1863. " js 1b \n\t"
  1864. : "+r"(count)
  1865. : "r"(src), "r"(dst0), "r"(dst1)
  1866. );
  1867. count -= 7;
  1868. }
  1869. while(count<0) {
  1870. dst0[count]= src[4*count+0];
  1871. dst1[count]= src[4*count+2];
  1872. count++;
  1873. }
  1874. }
  1875. #endif /* ARCH_X86_32 */
  1876. static void extract_even2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
  1877. {
  1878. dst0 += count;
  1879. dst1 += count;
  1880. src0 += 4*count;
  1881. src1 += 4*count;
  1882. count= - count;
  1883. #ifdef PAVGB
  1884. if(count <= -8) {
  1885. count += 7;
  1886. __asm__ volatile(
  1887. "pcmpeqw %%mm7, %%mm7 \n\t"
  1888. "psrlw $8, %%mm7 \n\t"
  1889. "1: \n\t"
  1890. "movq -28(%1, %0, 4), %%mm0 \n\t"
  1891. "movq -20(%1, %0, 4), %%mm1 \n\t"
  1892. "movq -12(%1, %0, 4), %%mm2 \n\t"
  1893. "movq -4(%1, %0, 4), %%mm3 \n\t"
  1894. PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
  1895. PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
  1896. PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
  1897. PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
  1898. "pand %%mm7, %%mm0 \n\t"
  1899. "pand %%mm7, %%mm1 \n\t"
  1900. "pand %%mm7, %%mm2 \n\t"
  1901. "pand %%mm7, %%mm3 \n\t"
  1902. "packuswb %%mm1, %%mm0 \n\t"
  1903. "packuswb %%mm3, %%mm2 \n\t"
  1904. "movq %%mm0, %%mm1 \n\t"
  1905. "movq %%mm2, %%mm3 \n\t"
  1906. "psrlw $8, %%mm0 \n\t"
  1907. "psrlw $8, %%mm2 \n\t"
  1908. "pand %%mm7, %%mm1 \n\t"
  1909. "pand %%mm7, %%mm3 \n\t"
  1910. "packuswb %%mm2, %%mm0 \n\t"
  1911. "packuswb %%mm3, %%mm1 \n\t"
  1912. MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
  1913. MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
  1914. "add $8, %0 \n\t"
  1915. " js 1b \n\t"
  1916. : "+r"(count)
  1917. : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
  1918. );
  1919. count -= 7;
  1920. }
  1921. #endif
  1922. while(count<0) {
  1923. dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
  1924. dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
  1925. count++;
  1926. }
  1927. }
  1928. static void extract_odd2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
  1929. {
  1930. dst0+= count;
  1931. dst1+= count;
  1932. src += 4*count;
  1933. count= - count;
  1934. if(count <= -8) {
  1935. count += 7;
  1936. __asm__ volatile(
  1937. "pcmpeqw %%mm7, %%mm7 \n\t"
  1938. "psrlw $8, %%mm7 \n\t"
  1939. "1: \n\t"
  1940. "movq -28(%1, %0, 4), %%mm0 \n\t"
  1941. "movq -20(%1, %0, 4), %%mm1 \n\t"
  1942. "movq -12(%1, %0, 4), %%mm2 \n\t"
  1943. "movq -4(%1, %0, 4), %%mm3 \n\t"
  1944. "psrlw $8, %%mm0 \n\t"
  1945. "psrlw $8, %%mm1 \n\t"
  1946. "psrlw $8, %%mm2 \n\t"
  1947. "psrlw $8, %%mm3 \n\t"
  1948. "packuswb %%mm1, %%mm0 \n\t"
  1949. "packuswb %%mm3, %%mm2 \n\t"
  1950. "movq %%mm0, %%mm1 \n\t"
  1951. "movq %%mm2, %%mm3 \n\t"
  1952. "psrlw $8, %%mm0 \n\t"
  1953. "psrlw $8, %%mm2 \n\t"
  1954. "pand %%mm7, %%mm1 \n\t"
  1955. "pand %%mm7, %%mm3 \n\t"
  1956. "packuswb %%mm2, %%mm0 \n\t"
  1957. "packuswb %%mm3, %%mm1 \n\t"
  1958. MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
  1959. MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
  1960. "add $8, %0 \n\t"
  1961. " js 1b \n\t"
  1962. : "+r"(count)
  1963. : "r"(src), "r"(dst0), "r"(dst1)
  1964. );
  1965. count -= 7;
  1966. }
  1967. src++;
  1968. while(count<0) {
  1969. dst0[count]= src[4*count+0];
  1970. dst1[count]= src[4*count+2];
  1971. count++;
  1972. }
  1973. }
  1974. static void extract_odd2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
  1975. {
  1976. dst0 += count;
  1977. dst1 += count;
  1978. src0 += 4*count;
  1979. src1 += 4*count;
  1980. count= - count;
  1981. #ifdef PAVGB
  1982. if(count <= -8) {
  1983. count += 7;
  1984. __asm__ volatile(
  1985. "pcmpeqw %%mm7, %%mm7 \n\t"
  1986. "psrlw $8, %%mm7 \n\t"
  1987. "1: \n\t"
  1988. "movq -28(%1, %0, 4), %%mm0 \n\t"
  1989. "movq -20(%1, %0, 4), %%mm1 \n\t"
  1990. "movq -12(%1, %0, 4), %%mm2 \n\t"
  1991. "movq -4(%1, %0, 4), %%mm3 \n\t"
  1992. PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
  1993. PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
  1994. PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
  1995. PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
  1996. "psrlw $8, %%mm0 \n\t"
  1997. "psrlw $8, %%mm1 \n\t"
  1998. "psrlw $8, %%mm2 \n\t"
  1999. "psrlw $8, %%mm3 \n\t"
  2000. "packuswb %%mm1, %%mm0 \n\t"
  2001. "packuswb %%mm3, %%mm2 \n\t"
  2002. "movq %%mm0, %%mm1 \n\t"
  2003. "movq %%mm2, %%mm3 \n\t"
  2004. "psrlw $8, %%mm0 \n\t"
  2005. "psrlw $8, %%mm2 \n\t"
  2006. "pand %%mm7, %%mm1 \n\t"
  2007. "pand %%mm7, %%mm3 \n\t"
  2008. "packuswb %%mm2, %%mm0 \n\t"
  2009. "packuswb %%mm3, %%mm1 \n\t"
  2010. MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
  2011. MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
  2012. "add $8, %0 \n\t"
  2013. " js 1b \n\t"
  2014. : "+r"(count)
  2015. : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
  2016. );
  2017. count -= 7;
  2018. }
  2019. #endif
  2020. src0++;
  2021. src1++;
  2022. while(count<0) {
  2023. dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
  2024. dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
  2025. count++;
  2026. }
  2027. }
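/*
 * Packed-to-planar wrappers built on the extract_* helpers: the *420 versions
 * emit chroma only on odd rows, averaging the current row with the previous
 * one, while the *422 versions take chroma from every row.
 */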
  2028. static void yuyvtoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2029. int width, int height,
  2030. int lumStride, int chromStride, int srcStride)
  2031. {
  2032. const int chromWidth = AV_CEIL_RSHIFT(width, 1);
  2033. for (int y = 0; y < height; y++) {
  2034. extract_even_mmxext(src, ydst, width);
  2035. if(y&1) {
  2036. extract_odd2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth);
  2037. udst+= chromStride;
  2038. vdst+= chromStride;
  2039. }
  2040. src += srcStride;
  2041. ydst+= lumStride;
  2042. }
  2043. __asm__(
  2044. EMMS" \n\t"
  2045. SFENCE" \n\t"
  2046. ::: "memory"
  2047. );
  2048. }
  2049. static void yuyvtoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2050. int width, int height,
  2051. int lumStride, int chromStride, int srcStride)
  2052. {
  2053. const int chromWidth = AV_CEIL_RSHIFT(width, 1);
  2054. for (int y = 0; y < height; y++) {
  2055. extract_even_mmxext(src, ydst, width);
  2056. extract_odd2_mmxext(src, udst, vdst, chromWidth);
  2057. src += srcStride;
  2058. ydst+= lumStride;
  2059. udst+= chromStride;
  2060. vdst+= chromStride;
  2061. }
  2062. __asm__(
  2063. EMMS" \n\t"
  2064. SFENCE" \n\t"
  2065. ::: "memory"
  2066. );
  2067. }
  2068. static void uyvytoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2069. int width, int height,
  2070. int lumStride, int chromStride, int srcStride)
  2071. {
  2072. const int chromWidth = AV_CEIL_RSHIFT(width, 1);
  2073. for (int y = 0; y < height; y++) {
  2074. extract_odd_mmxext(src, ydst, width);
  2075. if(y&1) {
  2076. extract_even2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth);
  2077. udst+= chromStride;
  2078. vdst+= chromStride;
  2079. }
  2080. src += srcStride;
  2081. ydst+= lumStride;
  2082. }
  2083. __asm__(
  2084. EMMS" \n\t"
  2085. SFENCE" \n\t"
  2086. ::: "memory"
  2087. );
  2088. }
  2089. #if ARCH_X86_32
  2090. static void uyvytoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2091. int width, int height,
  2092. int lumStride, int chromStride, int srcStride)
  2093. {
  2094. const int chromWidth = AV_CEIL_RSHIFT(width, 1);
  2095. for (int y = 0; y < height; y++) {
  2096. extract_odd_mmxext(src, ydst, width);
  2097. extract_even2_mmxext(src, udst, vdst, chromWidth);
  2098. src += srcStride;
  2099. ydst+= lumStride;
  2100. udst+= chromStride;
  2101. vdst+= chromStride;
  2102. }
  2103. __asm__(
  2104. EMMS" \n\t"
  2105. SFENCE" \n\t"
  2106. ::: "memory"
  2107. );
  2108. }
  2109. #endif /* ARCH_X86_32 */
  2110. static av_cold void rgb2rgb_init_mmxext(void)
  2111. {
  2112. rgb15to16 = rgb15to16_mmxext;
  2113. rgb15tobgr24 = rgb15tobgr24_mmxext;
  2114. rgb15to32 = rgb15to32_mmxext;
  2115. rgb16tobgr24 = rgb16tobgr24_mmxext;
  2116. rgb16to32 = rgb16to32_mmxext;
  2117. rgb16to15 = rgb16to15_mmxext;
  2118. rgb24tobgr16 = rgb24tobgr16_mmxext;
  2119. rgb24tobgr15 = rgb24tobgr15_mmxext;
  2120. rgb24tobgr32 = rgb24tobgr32_mmxext;
  2121. rgb32to16 = rgb32to16_mmxext;
  2122. rgb32to15 = rgb32to15_mmxext;
  2123. rgb32tobgr24 = rgb32tobgr24_mmxext;
  2124. rgb24to15 = rgb24to15_mmxext;
  2125. rgb24to16 = rgb24to16_mmxext;
  2126. rgb24tobgr24 = rgb24tobgr24_mmxext;
  2127. rgb32tobgr16 = rgb32tobgr16_mmxext;
  2128. rgb32tobgr15 = rgb32tobgr15_mmxext;
  2129. yv12toyuy2 = yv12toyuy2_mmxext;
  2130. yv12touyvy = yv12touyvy_mmxext;
  2131. yuv422ptoyuy2 = yuv422ptoyuy2_mmxext;
  2132. yuv422ptouyvy = yuv422ptouyvy_mmxext;
  2133. yuy2toyv12 = yuy2toyv12_mmxext;
  2134. vu9_to_vu12 = vu9_to_vu12_mmxext;
  2135. yvu9_to_yuy2 = yvu9_to_yuy2_mmxext;
  2136. #if ARCH_X86_32
  2137. uyvytoyuv422 = uyvytoyuv422_mmxext;
  2138. #endif
  2139. yuyvtoyuv422 = yuyvtoyuv422_mmxext;
  2140. planar2x = planar2x_mmxext;
  2141. #if ARCH_X86_32 && HAVE_7REGS
  2142. ff_rgb24toyv12 = rgb24toyv12_mmxext;
  2143. #endif /* ARCH_X86_32 && HAVE_7REGS */
  2144. yuyvtoyuv420 = yuyvtoyuv420_mmxext;
  2145. uyvytoyuv420 = uyvytoyuv420_mmxext;
  2146. }
// SSE2 versions
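/*
 * Interleave two byte planes into one: dest[2*i] = src1[i], dest[2*i+1] = src2[i].
 * The SSE2 path with non-temporal stores is only taken when src1, src2 and dest
 * are all 16-byte aligned; otherwise an MMX fallback is used, and the scalar
 * loop finishes the last width & 15 columns of each row.
 */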
  2148. static void interleave_bytes_sse2(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
  2149. int width, int height, int src1Stride,
  2150. int src2Stride, int dstStride)
  2151. {
  2152. for (int h = 0; h < height; h++) {
  2153. if (width >= 16) {
  2154. if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) {
  2155. __asm__(
  2156. "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
  2157. "1: \n\t"
  2158. PREFETCH" 64(%1, %%"FF_REG_a") \n\t"
  2159. PREFETCH" 64(%2, %%"FF_REG_a") \n\t"
  2160. "movdqa (%1, %%"FF_REG_a"), %%xmm0 \n\t"
  2161. "movdqa (%1, %%"FF_REG_a"), %%xmm1 \n\t"
  2162. "movdqa (%2, %%"FF_REG_a"), %%xmm2 \n\t"
  2163. "punpcklbw %%xmm2, %%xmm0 \n\t"
  2164. "punpckhbw %%xmm2, %%xmm1 \n\t"
  2165. "movntdq %%xmm0, (%0, %%"FF_REG_a", 2) \n\t"
  2166. "movntdq %%xmm1, 16(%0, %%"FF_REG_a", 2) \n\t"
  2167. "add $16, %%"FF_REG_a" \n\t"
  2168. "cmp %3, %%"FF_REG_a" \n\t"
  2169. " jb 1b \n\t"
  2170. ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
  2171. : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"FF_REG_a
  2172. );
  2173. } else
  2174. __asm__(
  2175. "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
  2176. "1: \n\t"
  2177. PREFETCH" 64(%1, %%"FF_REG_a") \n\t"
  2178. PREFETCH" 64(%2, %%"FF_REG_a") \n\t"
  2179. "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
  2180. "movq 8(%1, %%"FF_REG_a"), %%mm2 \n\t"
  2181. "movq %%mm0, %%mm1 \n\t"
  2182. "movq %%mm2, %%mm3 \n\t"
  2183. "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
  2184. "movq 8(%2, %%"FF_REG_a"), %%mm5 \n\t"
  2185. "punpcklbw %%mm4, %%mm0 \n\t"
  2186. "punpckhbw %%mm4, %%mm1 \n\t"
  2187. "punpcklbw %%mm5, %%mm2 \n\t"
  2188. "punpckhbw %%mm5, %%mm3 \n\t"
  2189. MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 2) \n\t"
  2190. MOVNTQ" %%mm1, 8(%0, %%"FF_REG_a", 2) \n\t"
  2191. MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 2) \n\t"
  2192. MOVNTQ" %%mm3, 24(%0, %%"FF_REG_a", 2) \n\t"
  2193. "add $16, %%"FF_REG_a" \n\t"
  2194. "cmp %3, %%"FF_REG_a" \n\t"
  2195. " jb 1b \n\t"
  2196. ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
  2197. : "memory", "%"FF_REG_a
  2198. );
  2199. }
  2200. for (int w = (width & (~15)); w < width; w++) {
  2201. dest[2*w+0] = src1[w];
  2202. dest[2*w+1] = src2[w];
  2203. }
  2204. dest += dstStride;
  2205. src1 += src1Stride;
  2206. src2 += src2Stride;
  2207. }
  2208. __asm__(
  2209. EMMS" \n\t"
  2210. SFENCE" \n\t"
  2211. ::: "memory"
  2212. );
  2213. }
/*
  RGB15->RGB16: original by Strepto/Astral,
  ported to GCC and bugfixed by A'rpi,
  MMXEXT and 3DNow! optimizations by Nick Kurshev,
  32-bit C version and the and&add trick by Michael Niedermayer.
*/
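/*
 * Illustration of the and & add trick mentioned above (a sketch, not the code
 * used in this file). For one RGB555 pixel, converting to RGB565 moves R and G
 * up by one bit and keeps B, leaving the new green LSB at zero:
 *
 *     uint16_t px16 = (uint16_t)(((px15 & 0x7FE0) << 1) | (px15 & 0x001F));
 *
 * Since ((x & 0x7FE0) << 1) | (x & 0x001F) == (x & 0x7FFF) + (x & 0x7FE0) and
 * the sum of the two masked values never carries out of 16 bits, two pixels
 * packed into a 32-bit word can be converted in one go:
 *
 *     uint32_t two16 = (two15 & 0x7FFF7FFF) + (two15 & 0x7FE07FE0);
 */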
  2220. #endif /* HAVE_INLINE_ASM */
  2221. void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
  2222. void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
  2223. void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
  2224. void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
  2225. void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
  2226. void ff_shuffle_bytes_3102_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
  2227. void ff_shuffle_bytes_2013_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
  2228. void ff_shuffle_bytes_2130_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
  2229. void ff_shuffle_bytes_1203_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
  2230. #if ARCH_X86_64
  2231. void ff_shuffle_bytes_2103_avx2(const uint8_t *src, uint8_t *dst, int src_size);
  2232. void ff_shuffle_bytes_0321_avx2(const uint8_t *src, uint8_t *dst, int src_size);
  2233. void ff_shuffle_bytes_1230_avx2(const uint8_t *src, uint8_t *dst, int src_size);
  2234. void ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int src_size);
  2235. void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t *dst, int src_size);
  2236. void ff_shuffle_bytes_3102_avx2(const uint8_t *src, uint8_t *dst, int src_size);
  2237. void ff_shuffle_bytes_2013_avx2(const uint8_t *src, uint8_t *dst, int src_size);
  2238. void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size);
  2239. void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size);
  2240. void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
  2241. void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
  2242. void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
  2243. void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
  2244. void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
  2245. void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
  2246. void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
  2247. void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
  2248. void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
  2249. void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  2250. const uint8_t *src, int width, int height,
  2251. int lumStride, int chromStride, int srcStride);
  2252. void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  2253. const uint8_t *src, int width, int height,
  2254. int lumStride, int chromStride, int srcStride);
  2255. void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  2256. const uint8_t *src, int width, int height,
  2257. int lumStride, int chromStride, int srcStride);
  2258. #endif
  2259. #define DEINTERLEAVE_BYTES(cpuext) \
  2260. void ff_nv12ToUV_ ## cpuext(uint8_t *dstU, uint8_t *dstV, \
  2261. const uint8_t *unused, \
  2262. const uint8_t *src1, \
  2263. const uint8_t *src2, \
  2264. int w, \
  2265. uint32_t *unused2, \
  2266. void *opq); \
  2267. static void deinterleave_bytes_ ## cpuext(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, \
  2268. int width, int height, int srcStride, \
  2269. int dst1Stride, int dst2Stride) \
  2270. { \
  2271. for (int h = 0; h < height; h++) { \
  2272. if (width >= 16) \
  2273. ff_nv12ToUV_ ## cpuext(dst1, dst2, NULL, src, NULL, width - 15, NULL, NULL); \
  2274. for (int w = (width & (~15)); w < width; w++) { \
  2275. dst1[w] = src[2*w+0]; \
  2276. dst2[w] = src[2*w+1]; \
  2277. } \
  2278. src += srcStride; \
  2279. dst1 += dst1Stride; \
  2280. dst2 += dst2Stride; \
  2281. } \
  2282. }
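/* The ff_nv12ToUV_* kernels above are handed width - 15 so that their vector
 * loop, which consumes a full register's worth of byte pairs per iteration,
 * never writes past the end of the row; the scalar loop in the macro then
 * covers the remaining width & 15 columns. */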
  2283. #if HAVE_SSE2_EXTERNAL
  2284. DEINTERLEAVE_BYTES(sse2)
  2285. #endif
  2286. #if HAVE_AVX_EXTERNAL
  2287. DEINTERLEAVE_BYTES(avx)
  2288. #endif
  2289. av_cold void rgb2rgb_init_x86(void)
  2290. {
  2291. int cpu_flags = av_get_cpu_flags();
  2292. #if HAVE_INLINE_ASM
  2293. if (INLINE_MMXEXT(cpu_flags))
  2294. rgb2rgb_init_mmxext();
  2295. if (INLINE_SSE2(cpu_flags))
  2296. interleaveBytes = interleave_bytes_sse2;
  2297. #endif /* HAVE_INLINE_ASM */
  2298. #if HAVE_SSE2_EXTERNAL
  2299. if (EXTERNAL_SSE2(cpu_flags)) {
  2300. #if ARCH_X86_64
  2301. uyvytoyuv422 = ff_uyvytoyuv422_sse2;
  2302. #endif
  2303. deinterleaveBytes = deinterleave_bytes_sse2;
  2304. }
  2305. #endif
  2306. if (EXTERNAL_SSSE3(cpu_flags)) {
  2307. shuffle_bytes_0321 = ff_shuffle_bytes_0321_ssse3;
  2308. shuffle_bytes_2103 = ff_shuffle_bytes_2103_ssse3;
  2309. shuffle_bytes_1230 = ff_shuffle_bytes_1230_ssse3;
  2310. shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3;
  2311. shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3;
  2312. shuffle_bytes_3102 = ff_shuffle_bytes_3102_ssse3;
  2313. shuffle_bytes_2013 = ff_shuffle_bytes_2013_ssse3;
  2314. shuffle_bytes_2130 = ff_shuffle_bytes_2130_ssse3;
  2315. shuffle_bytes_1203 = ff_shuffle_bytes_1203_ssse3;
  2316. }
  2317. #if HAVE_AVX_EXTERNAL
  2318. if (EXTERNAL_AVX(cpu_flags)) {
  2319. deinterleaveBytes = deinterleave_bytes_avx;
  2320. #if ARCH_X86_64
  2321. uyvytoyuv422 = ff_uyvytoyuv422_avx;
  2322. }
  2323. if (EXTERNAL_AVX2_FAST(cpu_flags)) {
  2324. shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx2;
  2325. shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx2;
  2326. shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx2;
  2327. shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2;
  2328. shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2;
  2329. shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx2;
  2330. shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx2;
  2331. shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
  2332. shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
  2333. }
  2334. if (EXTERNAL_AVX512ICL(cpu_flags)) {
  2335. shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl;
  2336. shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl;
  2337. shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl;
  2338. shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl;
  2339. shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl;
  2340. shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl;
  2341. shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl;
  2342. shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl;
  2343. shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl;
  2344. }
  2345. if (EXTERNAL_AVX2_FAST(cpu_flags)) {
  2346. uyvytoyuv422 = ff_uyvytoyuv422_avx2;
  2347. #endif
  2348. }
  2349. #endif
  2350. }