/* rgb2rgb_template.c */
  1. /*
  2. * software RGB to RGB converter
  3. * pluralize by software PAL8 to RGB converter
  4. * software YUV to YUV converter
  5. * software YUV to RGB converter
  6. * Written by Nick Kurshev.
  7. * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  8. * lot of big-endian byte order fixes by Alex Beregszaszi
  9. *
  10. * This file is part of FFmpeg.
  11. *
  12. * FFmpeg is free software; you can redistribute it and/or modify
  13. * it under the terms of the GNU General Public License as published by
  14. * the Free Software Foundation; either version 2 of the License, or
  15. * (at your option) any later version.
  16. *
  17. * FFmpeg is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20. * GNU General Public License for more details.
  21. *
  22. * You should have received a copy of the GNU General Public License
  23. * along with FFmpeg; if not, write to the Free Software
  24. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25. *
  26. * The C code (not assembly, MMX, ...) of this file can be used
  27. * under the LGPL license.
  28. */
#include <stddef.h>

/*
 * Undefine every CPU-feature-dependent macro first: this template is
 * designed to be included several times with different HAVE_* settings,
 * so stale definitions from a previous inclusion must be cleared.
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PAVGB

/* SIMD register width in bytes: 16 with SSE2 (xmm), otherwise 8 (MMX mm). */
#if HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Pick the prefetch and packed byte-average instructions per CPU capability;
 * without either extension, PREFETCH expands to an asm comment (no-op) and
 * PAVGB stays undefined. */
#if HAVE_AMD3DNOW
#define PREFETCH "prefetch"
#define PAVGB "pavgusb"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PAVGB "pavgb"
#else
#define PREFETCH " # nop"
#endif

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

/* Non-temporal store and store fence with MMX2; otherwise fall back to a
 * plain movq and make SFENCE a no-op asm comment. */
#if HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
/*
 * Convert packed 24-bit pixels (3 bytes each) to packed 32-bit pixels
 * (4 bytes each), filling the extra byte with 255 (opaque alpha).
 * The three colour bytes are copied in source order on little-endian;
 * on big-endian the output dword is written as 255,s[2],s[1],s[0].
 *
 * src      source buffer, 3 bytes per pixel
 * dst      destination buffer, 4 bytes per pixel
 * src_size number of source bytes to convert
 */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* Each MMX iteration consumes 24 src bytes (8 pixels); stop 23 bytes
     * before the end so every read stays inside the buffer. */
    mm_end = end - 23;
    /* mm7 = mask32a: presumably the constant that sets the alpha byte of
     * each output dword — defined outside this template, confirm there. */
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            /* Load eight 3-byte pixels, two per register, each widened to
             * a dword slot via punpckldq on 3-byte-offset loads. */
            PREFETCH"  32%1           \n\t"
            "movd      %1, %%mm0      \n\t"
            "punpckldq 3%1, %%mm0     \n\t"
            "movd      6%1, %%mm1     \n\t"
            "punpckldq 9%1, %%mm1     \n\t"
            "movd      12%1, %%mm2    \n\t"
            "punpckldq 15%1, %%mm2    \n\t"
            "movd      18%1, %%mm3    \n\t"
            "punpckldq 21%1, %%mm3    \n\t"
            /* OR in the alpha mask, then store 32 output bytes. */
            "por       %%mm7, %%mm0   \n\t"
            "por       %%mm7, %%mm1   \n\t"
            "por       %%mm7, %%mm2   \n\t"
            "por       %%mm7, %%mm3   \n\t"
            MOVNTQ"    %%mm0, %0      \n\t"
            MOVNTQ"    %%mm1, 8%0     \n\t"
            MOVNTQ"    %%mm2, 16%0    \n\t"
            MOVNTQ"    %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and the whole conversion when MMX is unavailable). */
    while (s < end) {
#if HAVE_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 255;
        *dest++ = s[2];
        *dest++ = s[1];
        *dest++ = s[0];
        s+=3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
#endif
    }
}
/*
 * Convert packed 32-bit pixels (4 bytes each) to packed 24-bit pixels
 * (3 bytes each) by dropping the fourth (alpha) byte of every pixel.
 * On little-endian the first three bytes are kept in order; on big-endian
 * the alpha byte is skipped first and the remaining bytes are reversed.
 *
 * src      source buffer, 4 bytes per pixel
 * dst      destination buffer, 3 bytes per pixel
 * src_size number of source bytes to convert
 */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* Each MMX iteration consumes 32 src bytes (8 pixels) and emits 24. */
    mm_end = end - 31;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1           \n\t"
            /* Load 8 pixels into mm0/mm1/mm4/mm5 and keep shifted copies
             * in mm2/mm3/mm6/mm7 so each dword can be compacted from
             * 4 bytes down to 3 (masks mask24l/mask24h select the kept
             * low/high parts — constants defined outside this template). */
            "movq      %1, %%mm0      \n\t"
            "movq      8%1, %%mm1     \n\t"
            "movq      16%1, %%mm4    \n\t"
            "movq      24%1, %%mm5    \n\t"
            "movq      %%mm0, %%mm2   \n\t"
            "movq      %%mm1, %%mm3   \n\t"
            "movq      %%mm4, %%mm6   \n\t"
            "movq      %%mm5, %%mm7   \n\t"
            "psrlq     $8, %%mm2      \n\t"
            "psrlq     $8, %%mm3      \n\t"
            "psrlq     $8, %%mm6      \n\t"
            "psrlq     $8, %%mm7      \n\t"
            "pand      %2, %%mm0      \n\t"
            "pand      %2, %%mm1      \n\t"
            "pand      %2, %%mm4      \n\t"
            "pand      %2, %%mm5      \n\t"
            "pand      %3, %%mm2      \n\t"
            "pand      %3, %%mm3      \n\t"
            "pand      %3, %%mm6      \n\t"
            "pand      %3, %%mm7      \n\t"
            "por       %%mm2, %%mm0   \n\t"
            "por       %%mm3, %%mm1   \n\t"
            "por       %%mm6, %%mm4   \n\t"
            "por       %%mm7, %%mm5   \n\t"
            /* Splice the 4 compacted 6-byte groups into three contiguous
             * 8-byte output quadwords using the mask24hh.. constants. */
            "movq      %%mm1, %%mm2   \n\t"
            "movq      %%mm4, %%mm3   \n\t"
            "psllq     $48, %%mm2     \n\t"
            "psllq     $32, %%mm3     \n\t"
            "pand      %4, %%mm2      \n\t"
            "pand      %5, %%mm3      \n\t"
            "por       %%mm2, %%mm0   \n\t"
            "psrlq     $16, %%mm1     \n\t"
            "psrlq     $32, %%mm4     \n\t"
            "psllq     $16, %%mm5     \n\t"
            "por       %%mm3, %%mm1   \n\t"
            "pand      %6, %%mm5      \n\t"
            "por       %%mm5, %%mm4   \n\t"
            MOVNTQ"    %%mm0, %0      \n\t"
            MOVNTQ"    %%mm1, 8%0     \n\t"
            MOVNTQ"    %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and the whole conversion when MMX is unavailable). */
    while (s < end) {
#if HAVE_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}
  201. /*
  202. original by Strepto/Astral
  203. ported to gcc & bugfixed: A'rpi
  204. MMX2, 3DNOW optimization by Nick Kurshev
  205. 32-bit C version, and and&add trick by Michael Niedermayer
  206. */
/*
 * Convert 15-bit (x555) pixels to 16-bit (565) pixels.
 * Uses the and&add trick: adding (x & 0x7FE07FE0) to x shifts the top
 * two 5-bit fields of each 16-bit word up by one position while the
 * low 5-bit field stays in place, turning 0RRRRRGGGGGBBBBB into
 * RRRRRGGGGG0BBBBB-style layout (the new LSB of the widened field is 0).
 *
 * src      source buffer, 2 bytes per pixel
 * dst      destination buffer, 2 bytes per pixel
 * src_size number of source bytes to convert
 */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    /* mm4 = mask15s: presumably 0x7FE07FE0.. replicated — defined outside
     * this template, confirm there. */
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1         \n\t"
            "movq    %1, %%mm0     \n\t"
            "movq    8%1, %%mm2    \n\t"
            "movq    %%mm0, %%mm1  \n\t"
            "movq    %%mm2, %%mm3  \n\t"
            /* x + (x & mask): same and&add trick as the C fallback. */
            "pand    %%mm4, %%mm0  \n\t"
            "pand    %%mm4, %%mm2  \n\t"
            "paddw   %%mm1, %%mm0  \n\t"
            "paddw   %%mm3, %%mm2  \n\t"
            MOVNTQ"  %%mm0, %0     \n\t"
            MOVNTQ"  %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* 32-bit C path: two pixels at a time. */
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* At most one 16-bit pixel can remain; a trailing odd byte is ignored. */
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
/*
 * Convert 16-bit (565) pixels to 15-bit (x555) pixels: the top two fields
 * of each 16-bit word are shifted down one bit (dropping the green LSB)
 * while the low 5-bit field is kept in place —
 * ((x>>1) & 0x7FE07FE0) | (x & 0x001F001F).
 *
 * src      source buffer, 2 bytes per pixel
 * dst      destination buffer, 2 bytes per pixel
 * src_size number of source bytes to convert
 */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    /* mm7 = mask15rg, mm6 = mask15b: presumably 0x7FE07FE0../0x001F001F..
     * — defined outside this template, confirm there. */
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1         \n\t"
            "movq    %1, %%mm0     \n\t"
            "movq    8%1, %%mm2    \n\t"
            "movq    %%mm0, %%mm1  \n\t"
            "movq    %%mm2, %%mm3  \n\t"
            /* ((x>>1) & rg-mask) | (x & b-mask), as in the C fallback. */
            "psrlq   $1, %%mm0     \n\t"
            "psrlq   $1, %%mm2     \n\t"
            "pand    %%mm7, %%mm0  \n\t"
            "pand    %%mm7, %%mm2  \n\t"
            "pand    %%mm6, %%mm1  \n\t"
            "pand    %%mm6, %%mm3  \n\t"
            "por     %%mm1, %%mm0  \n\t"
            "por     %%mm3, %%mm2  \n\t"
            MOVNTQ"  %%mm0, %0     \n\t"
            MOVNTQ"  %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* 32-bit C path: two pixels at a time. */
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* At most one 16-bit pixel can remain; a trailing odd byte is ignored. */
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}
/*
 * Convert 32-bit (8:8:8:x) pixels to 16-bit 565 pixels.
 * C fallback packs as ((b&0xFF)>>3) | green 6 bits | red 5 bits, where the
 * source dword's low byte becomes the output's low 5-bit field.
 *
 * Two MMX variants exist: the active one (#if 1) uses pmaddwd with the
 * mul3216 constant to combine two fields per multiply; the disabled one
 * does it with plain shift+mask steps.
 *
 * src      source buffer, 4 bytes per pixel
 * dst      destination buffer, 2 bytes per pixel
 * src_size number of source bytes to convert
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    /* 16 src bytes (4 pixels) per iteration. */
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        /* mm5/mm6/mm7 = mask3216g / mask3216br / mul3216 — constants
         * defined outside this template. */
        "movq      %3, %%mm5        \n\t"
        "movq      %4, %%mm6        \n\t"
        "movq      %5, %%mm7        \n\t"
        "jmp       2f               \n\t"
        ASMALIGN(4)
        "1:                         \n\t"
        PREFETCH"  32(%1)           \n\t"
        "movd      (%1), %%mm0      \n\t"
        "movd      4(%1), %%mm3     \n\t"
        "punpckldq 8(%1), %%mm0     \n\t"
        "punpckldq 12(%1), %%mm3    \n\t"
        "movq      %%mm0, %%mm1     \n\t"
        "movq      %%mm3, %%mm4     \n\t"
        /* Blue+red are masked and merged via pmaddwd; green is masked
         * separately, then everything is aligned with psrld/pslld. */
        "pand      %%mm6, %%mm0     \n\t"
        "pand      %%mm6, %%mm3     \n\t"
        "pmaddwd   %%mm7, %%mm0     \n\t"
        "pmaddwd   %%mm7, %%mm3     \n\t"
        "pand      %%mm5, %%mm1     \n\t"
        "pand      %%mm5, %%mm4     \n\t"
        "por       %%mm1, %%mm0     \n\t"
        "por       %%mm4, %%mm3     \n\t"
        "psrld     $5, %%mm0        \n\t"
        "pslld     $11, %%mm3       \n\t"
        "por       %%mm3, %%mm0     \n\t"
        MOVNTQ"    %%mm0, (%0)      \n\t"
        "add       $16, %1          \n\t"
        "add       $8, %0           \n\t"
        "2:                         \n\t"
        "cmp       %2, %1           \n\t"
        " jb       1b               \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    /* Alternative shift/mask implementation (disabled). */
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq    %0, %%mm7    \n\t"
        "movq    %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1           \n\t"
            "movd      %1, %%mm0      \n\t"
            "movd      4%1, %%mm3     \n\t"
            "punpckldq 8%1, %%mm0     \n\t"
            "punpckldq 12%1, %%mm3    \n\t"
            "movq      %%mm0, %%mm1   \n\t"
            "movq      %%mm0, %%mm2   \n\t"
            "movq      %%mm3, %%mm4   \n\t"
            "movq      %%mm3, %%mm5   \n\t"
            "psrlq     $3, %%mm0      \n\t"
            "psrlq     $3, %%mm3      \n\t"
            "pand      %2, %%mm0      \n\t"
            "pand      %2, %%mm3      \n\t"
            "psrlq     $5, %%mm1      \n\t"
            "psrlq     $5, %%mm4      \n\t"
            "pand      %%mm6, %%mm1   \n\t"
            "pand      %%mm6, %%mm4   \n\t"
            "psrlq     $8, %%mm2      \n\t"
            "psrlq     $8, %%mm5      \n\t"
            "pand      %%mm7, %%mm2   \n\t"
            "pand      %%mm7, %%mm5   \n\t"
            "por       %%mm1, %%mm0   \n\t"
            "por       %%mm4, %%mm3   \n\t"
            "por       %%mm2, %%mm0   \n\t"
            "por       %%mm5, %%mm3   \n\t"
            "psllq     $16, %%mm3     \n\t"
            "por       %%mm3, %%mm0   \n\t"
            MOVNTQ"    %%mm0, %0      \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and the whole conversion when MMX is unavailable). */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
/*
 * Convert 32-bit (8:8:8:x) pixels to 16-bit 565 pixels with the outer
 * channels swapped relative to rgb32to16: the source dword's low byte
 * lands in the output's TOP 5-bit field —
 * ((rgb&0xF8)<<8) | green 6 bits | ((rgb&0xF80000)>>19).
 *
 * src      source buffer, 4 bytes per pixel
 * dst      destination buffer, 2 bytes per pixel
 * src_size number of source bytes to convert
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 = red_16mask/green_16mask — constants defined outside
     * this template. */
    __asm__ volatile(
        "movq    %0, %%mm7    \n\t"
        "movq    %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /* 16 src bytes (4 pixels) per iteration. */
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1           \n\t"
            "movd      %1, %%mm0      \n\t"
            "movd      4%1, %%mm3     \n\t"
            "punpckldq 8%1, %%mm0     \n\t"
            "punpckldq 12%1, %%mm3    \n\t"
            "movq      %%mm0, %%mm1   \n\t"
            "movq      %%mm0, %%mm2   \n\t"
            "movq      %%mm3, %%mm4   \n\t"
            "movq      %%mm3, %%mm5   \n\t"
            /* low byte << 8 into the red field (vs >>3 in rgb32to16). */
            "psllq     $8, %%mm0      \n\t"
            "psllq     $8, %%mm3      \n\t"
            "pand      %%mm7, %%mm0   \n\t"
            "pand      %%mm7, %%mm3   \n\t"
            "psrlq     $5, %%mm1      \n\t"
            "psrlq     $5, %%mm4      \n\t"
            "pand      %%mm6, %%mm1   \n\t"
            "pand      %%mm6, %%mm4   \n\t"
            "psrlq     $19, %%mm2     \n\t"
            "psrlq     $19, %%mm5     \n\t"
            "pand      %2, %%mm2      \n\t"
            "pand      %2, %%mm5      \n\t"
            "por       %%mm1, %%mm0   \n\t"
            "por       %%mm4, %%mm3   \n\t"
            "por       %%mm2, %%mm0   \n\t"
            "por       %%mm5, %%mm3   \n\t"
            /* Merge the two pixel pairs into one quadword and store. */
            "psllq     $16, %%mm3     \n\t"
            "por       %%mm3, %%mm0   \n\t"
            MOVNTQ"    %%mm0, %0      \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and the whole conversion when MMX is unavailable). */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}
/*
 * Convert 32-bit (8:8:8:x) pixels to 15-bit (x555) pixels.
 * C fallback packs as ((b&0xFF)>>3) | 5 green bits | 5 red bits, with the
 * source dword's low byte in the output's low 5-bit field.
 *
 * As in rgb32to16, two MMX variants exist: the active one (#if 1) uses
 * pmaddwd with mul3215; the disabled one uses shift+mask steps.
 *
 * src      source buffer, 4 bytes per pixel
 * dst      destination buffer, 2 bytes per pixel
 * src_size number of source bytes to convert
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    /* 16 src bytes (4 pixels) per iteration. */
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        /* mm5/mm6/mm7 = mask3215g / mask3216br / mul3215 — constants
         * defined outside this template. */
        "movq      %3, %%mm5        \n\t"
        "movq      %4, %%mm6        \n\t"
        "movq      %5, %%mm7        \n\t"
        "jmp       2f               \n\t"
        ASMALIGN(4)
        "1:                         \n\t"
        PREFETCH"  32(%1)           \n\t"
        "movd      (%1), %%mm0      \n\t"
        "movd      4(%1), %%mm3     \n\t"
        "punpckldq 8(%1), %%mm0     \n\t"
        "punpckldq 12(%1), %%mm3    \n\t"
        "movq      %%mm0, %%mm1     \n\t"
        "movq      %%mm3, %%mm4     \n\t"
        "pand      %%mm6, %%mm0     \n\t"
        "pand      %%mm6, %%mm3     \n\t"
        "pmaddwd   %%mm7, %%mm0     \n\t"
        "pmaddwd   %%mm7, %%mm3     \n\t"
        "pand      %%mm5, %%mm1     \n\t"
        "pand      %%mm5, %%mm4     \n\t"
        "por       %%mm1, %%mm0     \n\t"
        "por       %%mm4, %%mm3     \n\t"
        /* Shift counts differ from the 565 variant: 6/10 vs 5/11, one
         * bit less for green. */
        "psrld     $6, %%mm0        \n\t"
        "pslld     $10, %%mm3       \n\t"
        "por       %%mm3, %%mm0     \n\t"
        MOVNTQ"    %%mm0, (%0)      \n\t"
        "add       $16, %1          \n\t"
        "add       $8, %0           \n\t"
        "2:                         \n\t"
        "cmp       %2, %1           \n\t"
        " jb       1b               \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    /* Alternative shift/mask implementation (disabled). */
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq    %0, %%mm7    \n\t"
        "movq    %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1           \n\t"
            "movd      %1, %%mm0      \n\t"
            "movd      4%1, %%mm3     \n\t"
            "punpckldq 8%1, %%mm0     \n\t"
            "punpckldq 12%1, %%mm3    \n\t"
            "movq      %%mm0, %%mm1   \n\t"
            "movq      %%mm0, %%mm2   \n\t"
            "movq      %%mm3, %%mm4   \n\t"
            "movq      %%mm3, %%mm5   \n\t"
            "psrlq     $3, %%mm0      \n\t"
            "psrlq     $3, %%mm3      \n\t"
            "pand      %2, %%mm0      \n\t"
            "pand      %2, %%mm3      \n\t"
            "psrlq     $6, %%mm1      \n\t"
            "psrlq     $6, %%mm4      \n\t"
            "pand      %%mm6, %%mm1   \n\t"
            "pand      %%mm6, %%mm4   \n\t"
            "psrlq     $9, %%mm2      \n\t"
            "psrlq     $9, %%mm5      \n\t"
            "pand      %%mm7, %%mm2   \n\t"
            "pand      %%mm7, %%mm5   \n\t"
            "por       %%mm1, %%mm0   \n\t"
            "por       %%mm4, %%mm3   \n\t"
            "por       %%mm2, %%mm0   \n\t"
            "por       %%mm5, %%mm3   \n\t"
            "psllq     $16, %%mm3     \n\t"
            "por       %%mm3, %%mm0   \n\t"
            MOVNTQ"    %%mm0, %0      \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and the whole conversion when MMX is unavailable). */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}
/*
 * Convert 32-bit (8:8:8:x) pixels to 15-bit (x555) pixels with the outer
 * channels swapped relative to rgb32to15: the source dword's low byte
 * lands in the output's TOP 5-bit field —
 * ((rgb&0xF8)<<7) | 5 green bits | ((rgb&0xF80000)>>19).
 *
 * src      source buffer, 4 bytes per pixel
 * dst      destination buffer, 2 bytes per pixel
 * src_size number of source bytes to convert
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 = red_15mask/green_15mask — constants defined outside
     * this template. */
    __asm__ volatile(
        "movq    %0, %%mm7    \n\t"
        "movq    %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* 16 src bytes (4 pixels) per iteration. */
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1           \n\t"
            "movd      %1, %%mm0      \n\t"
            "movd      4%1, %%mm3     \n\t"
            "punpckldq 8%1, %%mm0     \n\t"
            "punpckldq 12%1, %%mm3    \n\t"
            "movq      %%mm0, %%mm1   \n\t"
            "movq      %%mm0, %%mm2   \n\t"
            "movq      %%mm3, %%mm4   \n\t"
            "movq      %%mm3, %%mm5   \n\t"
            /* low byte << 7 into the top field (vs >>3 in rgb32to15). */
            "psllq     $7, %%mm0      \n\t"
            "psllq     $7, %%mm3      \n\t"
            "pand      %%mm7, %%mm0   \n\t"
            "pand      %%mm7, %%mm3   \n\t"
            "psrlq     $6, %%mm1      \n\t"
            "psrlq     $6, %%mm4      \n\t"
            "pand      %%mm6, %%mm1   \n\t"
            "pand      %%mm6, %%mm4   \n\t"
            "psrlq     $19, %%mm2     \n\t"
            "psrlq     $19, %%mm5     \n\t"
            "pand      %2, %%mm2      \n\t"
            "pand      %2, %%mm5      \n\t"
            "por       %%mm1, %%mm0   \n\t"
            "por       %%mm4, %%mm3   \n\t"
            "por       %%mm2, %%mm0   \n\t"
            "por       %%mm5, %%mm3   \n\t"
            /* Merge the two pixel pairs into one quadword and store. */
            "psllq     $16, %%mm3     \n\t"
            "por       %%mm3, %%mm0   \n\t"
            MOVNTQ"    %%mm0, %0      \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and the whole conversion when MMX is unavailable). */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}
/*
 * Convert packed 24-bit pixels (3 bytes each) to 16-bit 565 pixels.
 * C fallback: first byte -> low 5-bit field, second -> 6-bit green field,
 * third -> top 5-bit field.
 *
 * src      source buffer, 3 bytes per pixel
 * dst      destination buffer, 2 bytes per pixel
 * src_size number of source bytes to convert
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 = red_16mask/green_16mask — constants defined outside
     * this template. */
    __asm__ volatile(
        "movq    %0, %%mm7    \n\t"
        "movq    %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /* 12 src bytes (4 pixels) per iteration; stop 11 bytes early so the
     * 3-byte-offset loads stay inside the buffer. */
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1           \n\t"
            /* Load four 3-byte pixels, two per register. */
            "movd      %1, %%mm0      \n\t"
            "movd      3%1, %%mm3     \n\t"
            "punpckldq 6%1, %%mm0     \n\t"
            "punpckldq 9%1, %%mm3     \n\t"
            "movq      %%mm0, %%mm1   \n\t"
            "movq      %%mm0, %%mm2   \n\t"
            "movq      %%mm3, %%mm4   \n\t"
            "movq      %%mm3, %%mm5   \n\t"
            /* Extract the three fields by shift+mask, then recombine. */
            "psrlq     $3, %%mm0      \n\t"
            "psrlq     $3, %%mm3      \n\t"
            "pand      %2, %%mm0      \n\t"
            "pand      %2, %%mm3      \n\t"
            "psrlq     $5, %%mm1      \n\t"
            "psrlq     $5, %%mm4      \n\t"
            "pand      %%mm6, %%mm1   \n\t"
            "pand      %%mm6, %%mm4   \n\t"
            "psrlq     $8, %%mm2      \n\t"
            "psrlq     $8, %%mm5      \n\t"
            "pand      %%mm7, %%mm2   \n\t"
            "pand      %%mm7, %%mm5   \n\t"
            "por       %%mm1, %%mm0   \n\t"
            "por       %%mm4, %%mm3   \n\t"
            "por       %%mm2, %%mm0   \n\t"
            "por       %%mm5, %%mm3   \n\t"
            "psllq     $16, %%mm3     \n\t"
            "por       %%mm3, %%mm0   \n\t"
            MOVNTQ"    %%mm0, %0      \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and the whole conversion when MMX is unavailable). */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
/*
 * Convert packed 24-bit pixels (3 bytes each) to 16-bit 565 pixels with
 * the outer channels swapped relative to rgb24tobgr16: the C fallback
 * names the first byte r and the third b, sending the FIRST byte to the
 * top 5-bit field (compare the psllq $8 vs psrlq $3 in the asm paths).
 *
 * src      source buffer, 3 bytes per pixel
 * dst      destination buffer, 2 bytes per pixel
 * src_size number of source bytes to convert
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 = red_16mask/green_16mask — constants defined outside
     * this template. */
    __asm__ volatile(
        "movq    %0, %%mm7    \n\t"
        "movq    %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /* NOTE(review): loop consumes 12 src bytes per iteration but stops at
     * end-15, unlike rgb24tobgr16's end-11 — looks conservative/inconsistent,
     * confirm against upstream. */
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1           \n\t"
            /* Load four 3-byte pixels, two per register. */
            "movd      %1, %%mm0      \n\t"
            "movd      3%1, %%mm3     \n\t"
            "punpckldq 6%1, %%mm0     \n\t"
            "punpckldq 9%1, %%mm3     \n\t"
            "movq      %%mm0, %%mm1   \n\t"
            "movq      %%mm0, %%mm2   \n\t"
            "movq      %%mm3, %%mm4   \n\t"
            "movq      %%mm3, %%mm5   \n\t"
            /* First byte << 8 into the top field (swapped variant). */
            "psllq     $8, %%mm0      \n\t"
            "psllq     $8, %%mm3      \n\t"
            "pand      %%mm7, %%mm0   \n\t"
            "pand      %%mm7, %%mm3   \n\t"
            "psrlq     $5, %%mm1      \n\t"
            "psrlq     $5, %%mm4      \n\t"
            "pand      %%mm6, %%mm1   \n\t"
            "pand      %%mm6, %%mm4   \n\t"
            "psrlq     $19, %%mm2     \n\t"
            "psrlq     $19, %%mm5     \n\t"
            "pand      %2, %%mm2      \n\t"
            "pand      %2, %%mm5      \n\t"
            "por       %%mm1, %%mm0   \n\t"
            "por       %%mm4, %%mm3   \n\t"
            "por       %%mm2, %%mm0   \n\t"
            "por       %%mm5, %%mm3   \n\t"
            "psllq     $16, %%mm3     \n\t"
            "por       %%mm3, %%mm0   \n\t"
            MOVNTQ"    %%mm0, %0      \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and the whole conversion when MMX is unavailable). */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
/*
 * Convert packed 24-bit pixel data (read as b,g,r in the scalar loop
 * below) to 15-bit 1-5-5-5 words.  Same structure as RENAME(rgb24to15)
 * but with the opposite channel order (note the different shift
 * amounts: $3/$6/$9 here vs $7/$6/$19 there).
 * src_size is the input size in bytes.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7 = red_15mask, mm6 = green_15mask; blue_15mask comes in as %2. */
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* Stop 11 bytes early: each iteration reads up to 13 bytes while
       consuming 12 (4 pixels). */
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"        /* pixels 0-1 */
            "movd 3%1, %%mm3 \n\t"       /* pixels 2-3 */
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"       /* first channel -> 5 bits */
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"       /* green -> 5 bits */
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"       /* last channel -> 5 bits */
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"      /* merge the three fields */
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"      /* pack both pixel pairs into one qword */
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: bytes are taken in b,g,r memory order. */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
/*
 * Convert packed 24-bit pixel data (read as r,g,b in the scalar loop
 * below) to 15-bit 1-5-5-5 words.  Mirror of RENAME(rgb24tobgr15) with
 * the opposite channel order.
 * src_size is the input size in bytes.
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7 = red_15mask, mm6 = green_15mask; blue_15mask comes in as %2. */
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* Stop 15 bytes early: reads run past the 12 bytes consumed. */
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"        /* pixels 0-1 */
            "movd 3%1, %%mm3 \n\t"       /* pixels 2-3 */
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"       /* first channel -> 5 bits */
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"       /* green -> 5 bits */
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"      /* last channel -> 5 bits */
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"      /* merge the three fields */
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"      /* pack both pixel pairs into one qword */
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: bytes are taken in r,g,b memory order. */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
  848. /*
  849. I use less accurate approximation here by simply left-shifting the input
  850. value and filling the low order bits with zeroes. This method improves PNG
  851. compression but this scheme cannot reproduce white exactly, since it does
  852. not generate an all-ones maximum value; the net effect is to darken the
  853. image slightly.
A better method would be "left bit replication":
  855. 4 3 2 1 0
  856. ---------
  857. 1 1 0 1 1
  858. 7 6 5 4 3 2 1 0
  859. ----------------
  860. 1 1 0 1 1 1 1 0
  861. |=======| |===|
  862. | leftmost bits repeated to fill open bits
  863. |
  864. original bits
  865. */
/*
 * Expand 15-bit 1-5-5-5 pixels to packed 24-bit pixels, swapping the
 * R/B channel order.  Each 5-bit field is shifted left so it fills the
 * top of an 8-bit byte, leaving the low bits zero (see the accuracy
 * note earlier in this file).  src_size is the input size in bytes.
 * The MMX path handles 8 input pixels (16 bytes -> 24 output bytes)
 * per iteration in two stages: first split/expand into byte lanes,
 * then repack 32-bit-style lanes down to 24-bit (the "borrowed 32 to
 * 24" asm below).
 */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;   /* 8 pixels per iteration */
    while (s < mm_end) {
        /* Stage 1: isolate the three channel fields of 8 pixels
           (masks %2..%4), scale each to 8 bits (<<3 net), and
           interleave them into byte-per-channel lanes.  Results for
           the first 4 pixels end up in mm6/mm7, the next 4 in mm0/mm3. */
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"       /* blue field */
            "pand %3, %%mm1 \n\t"       /* green field */
            "pand %4, %%mm2 \n\t"       /* red field */
            "psllq $3, %%mm0 \n\t"      /* 5 -> 8 bits */
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"  /* widen low 4 pixels to dwords */
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"  /* widen high 4 pixels */
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"      /* place G at byte 1, R at byte 2 */
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm6 \n\t"    /* stash pixels 0-1 */
            "movq %%mm3, %%mm7 \n\t"    /* stash pixels 2-3 */
            "movq 8%1, %%mm0 \n\t"      /* same again for pixels 4-7 */
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        /* Stage 2: compress the four dword-per-pixel qwords held in
           mm0/mm3/mm6/mm7 into 24 contiguous output bytes, using the
           mask24* constants to splice pixel fragments across the three
           8-byte stores. */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "psrlq $8, %%mm2 \n\t"      /* drop the unused 4th byte of each pixel */
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"
            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"     /* shift fragments into the store lanes */
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: emit the low field first, then green, then the
       high field, each scaled 5 -> 8 bits by plain shifting. */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
/*
 * Expand 16-bit 5-6-5 pixels to packed 24-bit pixels, swapping the
 * R/B channel order.  Identical structure to RENAME(rgb15tobgr24) but
 * with the 5-6-5 masks/shifts (green is 6 bits here).
 * src_size is the input size in bytes.
 */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;   /* 8 pixels per iteration */
    while (s < mm_end) {
        /* Stage 1: isolate the channel fields of 8 pixels, scale to
           8 bits, and build byte-per-channel dword lanes; first 4
           pixels parked in mm6/mm7, next 4 left in mm0/mm3. */
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"       /* blue field */
            "pand %3, %%mm1 \n\t"       /* green field (6 bits) */
            "pand %4, %%mm2 \n\t"       /* red field */
            "psllq $3, %%mm0 \n\t"      /* 5 -> 8 bits */
            "psrlq $3, %%mm1 \n\t"      /* 6 -> 8 bits */
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"
            "movq 8%1, %%mm0 \n\t"      /* same again for pixels 4-7 */
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        /* Stage 2: compress mm0/mm3/mm6/mm7 into 24 contiguous output
           bytes using the mask24* splicing constants. */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "psrlq $8, %%mm2 \n\t"      /* drop the unused 4th byte of each pixel */
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"
            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: low field, 6-bit green, high field, each widened to
       8 bits by shifting. */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
  1128. /*
  1129. * mm0 = 00 B3 00 B2 00 B1 00 B0
  1130. * mm1 = 00 G3 00 G2 00 G1 00 G0
  1131. * mm2 = 00 R3 00 R2 00 R1 00 R0
  1132. * mm6 = FF FF FF FF FF FF FF FF
  1133. * mm7 = 00 00 00 00 00 00 00 00
  1134. */
  1135. #define PACK_RGB32 \
  1136. "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
  1137. "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
  1138. "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
  1139. "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
  1140. "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
  1141. "movq %%mm0, %%mm3 \n\t" \
  1142. "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
  1143. "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
  1144. MOVNTQ" %%mm0, %0 \n\t" \
  1145. MOVNTQ" %%mm3, 8%0 \n\t" \
/*
 * Expand 15-bit 1-5-5-5 pixels to 32-bit pixels with an opaque alpha
 * byte (255).  Each 5-bit field is widened to 8 bits by shifting (low
 * bits zero).  src_size is the input size in bytes; the MMX path does
 * 4 pixels per iteration via PACK_RGB32.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* PACK_RGB32 expects mm7 = zero and mm6 = all ones (alpha source). */
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;   /* 4 pixels per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"   /* blue field */
            "pand %3, %%mm1 \n\t"   /* green field */
            "pand %4, %%mm2 \n\t"   /* red field */
            "psllq $3, %%mm0 \n\t"  /* widen each 5-bit field to 8 bits */
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: byte order depends on endianness; alpha is 255. */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
#endif
    }
}
/*
 * Expand 16-bit 5-6-5 pixels to 32-bit pixels with an opaque alpha
 * byte (255).  Same structure as RENAME(rgb15to32) but with the 5-6-5
 * masks/shifts.  src_size is the input size in bytes.
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* PACK_RGB32 expects mm7 = zero and mm6 = all ones (alpha source). */
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;   /* 4 pixels per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"   /* blue field */
            "pand %3, %%mm1 \n\t"   /* green field (6 bits) */
            "pand %4, %%mm2 \n\t"   /* red field */
            "psllq $3, %%mm0 \n\t"  /* widen to 8 bits */
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: byte order depends on endianness; alpha is 255. */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
#endif
    }
}
/*
 * Swap bytes 0 and 2 of every 32-bit pixel (RGB32 <-> BGR32); bytes 1
 * and 3 are preserved.  src_size is the buffer size in bytes.
 * idx counts upward from 15-src_size toward 15 and s/d are biased by
 * -idx, so s[idx]/d[idx] address the data; the MMX loop handles 16
 * bytes (4 pixels) per iteration and leaves idx positioned for the
 * scalar tail below.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
#if HAVE_MMX
    /* mm7/mm6 are built from mask32b/mask32r/mmx_one so one mask keeps
       the stationary bytes and the other selects the pair to swap. */
    __asm__ volatile(
        "test %0, %0 \n\t"
        "jns 2f \n\t"               /* fewer than 16 bytes: skip to scalar */
        PREFETCH" (%1, %0) \n\t"
        "movq %3, %%mm7 \n\t"
        "pxor %4, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "pxor %5, %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1, %0) \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
# if HAVE_MMX2
        /* pshufw $177 (0b10110001) swaps the words within each dword,
           i.e. exchanges bytes 0/1 with 2/3; masking then recombines
           so only bytes 0 and 2 end up exchanged. */
        "pshufw $177, %%mm0, %%mm3 \n\t"
        "pshufw $177, %%mm1, %%mm5 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# else
        /* Plain-MMX fallback: shift the selected bytes 16 bits each
           way and merge. */
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "pslld $16, %%mm2 \n\t"
        "psrld $16, %%mm3 \n\t"
        "pslld $16, %%mm4 \n\t"
        "psrld $16, %%mm5 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# endif
        MOVNTQ" %%mm0, (%2, %0) \n\t"
        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "js 1b \n\t"
        SFENCE" \n\t"
        EMMS" \n\t"
        "2: \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
#endif
    /* Scalar tail (full fallback without MMX): rotate the two masked
       0x00FF00FF bytes around the preserved 0xFF00FF00 bytes. */
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
  1313. static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
  1314. {
  1315. unsigned i;
  1316. #if HAVE_MMX
  1317. x86_reg mmx_size= 23 - src_size;
  1318. __asm__ volatile (
  1319. "test %%"REG_a", %%"REG_a" \n\t"
  1320. "jns 2f \n\t"
  1321. "movq "MANGLE(mask24r)", %%mm5 \n\t"
  1322. "movq "MANGLE(mask24g)", %%mm6 \n\t"
  1323. "movq "MANGLE(mask24b)", %%mm7 \n\t"
  1324. ASMALIGN(4)
  1325. "1: \n\t"
  1326. PREFETCH" 32(%1, %%"REG_a") \n\t"
  1327. "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
  1328. "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
  1329. "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
  1330. "psllq $16, %%mm0 \n\t" // 00 BGR BGR
  1331. "pand %%mm5, %%mm0 \n\t"
  1332. "pand %%mm6, %%mm1 \n\t"
  1333. "pand %%mm7, %%mm2 \n\t"
  1334. "por %%mm0, %%mm1 \n\t"
  1335. "por %%mm2, %%mm1 \n\t"
  1336. "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
  1337. MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
  1338. "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
  1339. "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
  1340. "pand %%mm7, %%mm0 \n\t"
  1341. "pand %%mm5, %%mm1 \n\t"
  1342. "pand %%mm6, %%mm2 \n\t"
  1343. "por %%mm0, %%mm1 \n\t"
  1344. "por %%mm2, %%mm1 \n\t"
  1345. "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
  1346. MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
  1347. "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
  1348. "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
  1349. "pand %%mm6, %%mm0 \n\t"
  1350. "pand %%mm7, %%mm1 \n\t"
  1351. "pand %%mm5, %%mm2 \n\t"
  1352. "por %%mm0, %%mm1 \n\t"
  1353. "por %%mm2, %%mm1 \n\t"
  1354. MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
  1355. "add $24, %%"REG_a" \n\t"
  1356. " js 1b \n\t"
  1357. "2: \n\t"
  1358. : "+a" (mmx_size)
  1359. : "r" (src-mmx_size), "r"(dst-mmx_size)
  1360. );
  1361. __asm__ volatile(SFENCE:::"memory");
  1362. __asm__ volatile(EMMS:::"memory");
  1363. if (mmx_size==23) return; //finished, was multiple of 8
  1364. src+= src_size;
  1365. dst+= src_size;
  1366. src_size= 23-mmx_size;
  1367. src-= src_size;
  1368. dst-= src_size;
  1369. #endif
  1370. for (i=0; i<src_size; i+=3) {
  1371. register uint8_t x;
  1372. x = src[i + 2];
  1373. dst[i + 1] = src[i + 1];
  1374. dst[i + 2] = src[i + 0];
  1375. dst[i + 0] = x;
  1376. }
  1377. }
/*
 * Interleave planar YUV (separate Y, U, V planes) into packed YUY2
 * (Y0 U0 Y1 V0 ...).  One U/V sample covers 2 horizontal luma samples;
 * vertLumPerChroma gives how many luma lines share one chroma line
 * (2 for YV12 input, 1 for 4:2:2 planar) and must be a power of two
 * (the advance test uses a bitmask).
 * lumStride/chromStride/dstStride are the per-line byte strides.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;    /* chroma samples per line */
    for (y=0; y<height; y++) {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        /* Interleave 16 luma + 8 U + 8 V bytes into 32 output bytes per
           iteration; REG_a indexes chroma, REG_a*2 luma, REG_a*4 dst. */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else
#if ARCH_ALPHA && HAVE_MVI
        /* Alpha/MVI path: processes TWO luma lines per outer-loop pass
           (note the extra y++/ysrc/dst advance at the end), reusing the
           same chroma line for both via unpkbw/unpkbl byte expansion. */
#define pl2yuy2(n) \
    y1 = yc[n]; \
    y2 = yc2[n]; \
    u = uc[n]; \
    v = vc[n]; \
    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
    yuv1 = (u << 8) + (v << 24); \
    yuv2 = yuv1 + y2; \
    yuv1 += y1; \
    qdst[n] = yuv1; \
    qdst2[n] = yuv2;
        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for (i = 0; i < chromWidth; i += 8) {
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            __asm__("ldq $31,64(%0)" :: "r"(yc));
            __asm__("ldq $31,64(%0)" :: "r"(yc2));
            __asm__("ldq $31,64(%0)" :: "r"(uc));
            __asm__("ldq $31,64(%0)" :: "r"(vc));
            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);
            yc += 4;
            yc2 += 4;
            uc += 4;
            vc += 4;
            qdst += 4;
            qdst2 += 4;
        }
        /* Second of the two lines handled above: advance an extra step
           so the outer loop's own advance covers it. */
        y++;
        ysrc += lumStride;
        dst += dstStride;
#elif HAVE_FAST_64BIT
        /* Generic 64-bit C path: build two packed pixels (4 bytes each)
           and store them as one 64-bit word (little-endian layout). */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2) {
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }
#else
        /* 32-bit C path: one packed Y U Y V group per iteration. */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                      (yc[1] << 8) + (vc[0] << 0);
#else
            *idst++ = yc[0] + (uc[0] << 8) +
                      (yc[1] << 16) + (vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* Advance chroma only every vertLumPerChroma-th luma line. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#if HAVE_MMX
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
#endif
}
  1506. /**
  1507. * Height should be a multiple of 2 and width should be a multiple of 16.
  1508. * (If this is a problem for anyone then tell me, and I will fix it.)
  1509. */
  1510. static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1511. long width, long height,
  1512. long lumStride, long chromStride, long dstStride)
  1513. {
  1514. //FIXME interpolate chroma
  1515. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1516. }
/*
 * Interleave planar YUV into packed UYVY (U0 Y0 V0 Y1 ...) — the same
 * job as RENAME(yuvPlanartoyuy2) with the chroma/luma byte positions
 * swapped in the output.  vertLumPerChroma gives how many luma lines
 * share one chroma line (power of two; 2 for YV12, 1 for 4:2:2).
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;    /* chroma samples per line */
    for (y=0; y<height; y++) {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        /* Same interleave as the yuy2 variant, but chroma is the first
           punpck operand so the output order is UYVY. */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // Y(0)
            "movq %%mm2, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
            MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else
//FIXME adapt the Alpha ASM code from yv12->yuy2
#if HAVE_FAST_64BIT
        /* 64-bit C path: two packed UYVY pixels per 64-bit store. */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2) {
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
            l = uc[1] + (yc[2] << 8) +
                (vc[1] << 16) + (yc[3] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }
#else
        /* 32-bit C path: one packed U Y V Y group per iteration. */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                      (vc[0] << 8) + (yc[1] << 0);
#else
            *idst++ = uc[0] + (yc[0] << 8) +
                      (vc[0] << 16) + (yc[1] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* Advance chroma only every vertLumPerChroma-th luma line. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#if HAVE_MMX
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
#endif
}
  1603. /**
  1604. * Height should be a multiple of 2 and width should be a multiple of 16
  1605. * (If this is a problem for anyone then tell me, and I will fix it.)
  1606. */
  1607. static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1608. long width, long height,
  1609. long lumStride, long chromStride, long dstStride)
  1610. {
  1611. //FIXME interpolate chroma
  1612. RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1613. }
  1614. /**
  1615. * Width should be a multiple of 16.
  1616. */
  1617. static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1618. long width, long height,
  1619. long lumStride, long chromStride, long dstStride)
  1620. {
  1621. RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
  1622. }
  1623. /**
  1624. * Width should be a multiple of 16.
  1625. */
  1626. static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1627. long width, long height,
  1628. long lumStride, long chromStride, long dstStride)
  1629. {
  1630. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
  1631. }
/**
 * Convert packed YUY2 (Y U Y V byte order) to planar YV12.
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chroma is taken from the even lines only; odd-line chroma is discarded
 * (no averaging), as the per-iteration structure below shows.
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;  // one U and one V sample per 2 luma pixels

    for (y=0; y<height; y+=2) {
#if HAVE_MMX
        /* Even line: extract Y to the luma plane and de-interleave U/V to
         * the chroma planes. %%mm7 = 0x00FF word mask built once here. */
        __asm__ volatile(
            "xor                %%"REG_a", %%"REG_a"    \n\t"
            "pcmpeqw            %%mm7, %%mm7            \n\t"
            "psrlw              $8, %%mm7               \n\t" // FF,00,FF,00...
            ASMALIGN(4)
            "1:                                         \n\t"
            PREFETCH"           64(%0, %%"REG_a", 4)    \n\t"
            "movq               (%0, %%"REG_a", 4), %%mm0   \n\t" // YUYV YUYV(0)
            "movq               8(%0, %%"REG_a", 4), %%mm1  \n\t" // YUYV YUYV(4)
            "movq               %%mm0, %%mm2            \n\t" // YUYV YUYV(0)
            "movq               %%mm1, %%mm3            \n\t" // YUYV YUYV(4)
            "psrlw              $8, %%mm0               \n\t" // U0V0 U0V0(0)
            "psrlw              $8, %%mm1               \n\t" // U0V0 U0V0(4)
            "pand               %%mm7, %%mm2            \n\t" // Y0Y0 Y0Y0(0)
            "pand               %%mm7, %%mm3            \n\t" // Y0Y0 Y0Y0(4)
            "packuswb           %%mm1, %%mm0            \n\t" // UVUV UVUV(0)
            "packuswb           %%mm3, %%mm2            \n\t" // YYYY YYYY(0)
            MOVNTQ"             %%mm2, (%1, %%"REG_a", 2)   \n\t"
            "movq               16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq               24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq               %%mm1, %%mm3            \n\t" // YUYV YUYV(8)
            "movq               %%mm2, %%mm4            \n\t" // YUYV YUYV(12)
            "psrlw              $8, %%mm1               \n\t" // U0V0 U0V0(8)
            "psrlw              $8, %%mm2               \n\t" // U0V0 U0V0(12)
            "pand               %%mm7, %%mm3            \n\t" // Y0Y0 Y0Y0(8)
            "pand               %%mm7, %%mm4            \n\t" // Y0Y0 Y0Y0(12)
            "packuswb           %%mm2, %%mm1            \n\t" // UVUV UVUV(8)
            "packuswb           %%mm4, %%mm3            \n\t" // YYYY YYYY(8)
            MOVNTQ"             %%mm3, 8(%1, %%"REG_a", 2)  \n\t"
            /* Split the packed UV words into separate U and V planes. */
            "movq               %%mm0, %%mm2            \n\t" // UVUV UVUV(0)
            "movq               %%mm1, %%mm3            \n\t" // UVUV UVUV(8)
            "psrlw              $8, %%mm0               \n\t" // V0V0 V0V0(0)
            "psrlw              $8, %%mm1               \n\t" // V0V0 V0V0(8)
            "pand               %%mm7, %%mm2            \n\t" // U0U0 U0U0(0)
            "pand               %%mm7, %%mm3            \n\t" // U0U0 U0U0(8)
            "packuswb           %%mm1, %%mm0            \n\t" // VVVV VVVV(0)
            "packuswb           %%mm3, %%mm2            \n\t" // UUUU UUUU(0)
            MOVNTQ"             %%mm0, (%3, %%"REG_a")  \n\t"
            MOVNTQ"             %%mm2, (%2, %%"REG_a")  \n\t"
            "add                $8, %%"REG_a"           \n\t"
            "cmp                %4, %%"REG_a"           \n\t"
            " jb                1b                      \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        ydst += lumStride;
        src  += srcStride;
        /* Odd line: luma only.
         * NOTE(review): this asm reuses the 0x00FF mask in %%mm7 set up by
         * the PREVIOUS asm statement without re-initializing it -- relying
         * on register state surviving across separate asm blocks is fragile;
         * verify this assumption holds for the targeted compilers. */
        __asm__ volatile(
            "xor                %%"REG_a", %%"REG_a"    \n\t"
            ASMALIGN(4)
            "1:                                         \n\t"
            PREFETCH"           64(%0, %%"REG_a", 4)    \n\t"
            "movq               (%0, %%"REG_a", 4), %%mm0   \n\t" // YUYV YUYV(0)
            "movq               8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq               16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq               24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand               %%mm7, %%mm0            \n\t" // Y0Y0 Y0Y0(0)
            "pand               %%mm7, %%mm1            \n\t" // Y0Y0 Y0Y0(4)
            "pand               %%mm7, %%mm2            \n\t" // Y0Y0 Y0Y0(8)
            "pand               %%mm7, %%mm3            \n\t" // Y0Y0 Y0Y0(12)
            "packuswb           %%mm1, %%mm0            \n\t" // YYYY YYYY(0)
            "packuswb           %%mm3, %%mm2            \n\t" // YYYY YYYY(8)
            MOVNTQ"             %%mm0, (%1, %%"REG_a", 2)   \n\t"
            MOVNTQ"             %%mm2, 8(%1, %%"REG_a", 2)  \n\t"
            "add                $8, %%"REG_a"           \n\t"
            "cmp                %4, %%"REG_a"           \n\t"
            " jb                1b                      \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
#else
        long i;
        /* C fallback: even line gives Y + chroma, odd line Y only. */
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0] = src[4*i+0];
            udst[i]     = src[4*i+1];
            ydst[2*i+1] = src[4*i+2];
            vdst[i]     = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0] = src[4*i+0];
            ydst[2*i+1] = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if HAVE_MMX
    /* Leave MMX state and flush the non-temporal (MOVNTQ) stores. */
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#endif
}
  1740. static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
  1741. uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1742. long width, long height, long lumStride, long chromStride)
  1743. {
  1744. /* Y Plane */
  1745. memcpy(ydst, ysrc, width*height);
  1746. /* XXX: implement upscaling for U,V */
  1747. }
/*
 * Upscale one plane by 2x in both directions using bilinear-style 1:3/3:1
 * weighting. First and last output lines are horizontal-only interpolations
 * of the first/last source lines.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    long x,y;

    dst[0]= src[0];

    // first line: horizontal interpolation only (weights 3:1 and 1:3)
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++) {
#if HAVE_MMX2 || HAVE_AMD3DNOW
        /* Vectorized interior: process srcWidth rounded down to 16, using a
         * negative index that counts up to 0 ("js 1b" loops while negative).
         * Two chained PAVGBs against the same register produce the 3:1
         * weighted average (avg(a, avg(a,b)) ~ (3a+b)/4, with pavgb
         * rounding). */
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
            "mov            %4, %%"REG_a"               \n\t"
            "1:                                         \n\t"
            "movq           (%0, %%"REG_a"), %%mm0      \n\t" // current line
            "movq           (%1, %%"REG_a"), %%mm1      \n\t" // next line
            "movq           1(%0, %%"REG_a"), %%mm2     \n\t" // current line, shifted left
            "movq           1(%1, %%"REG_a"), %%mm3     \n\t" // next line, shifted left
            "movq           -1(%0, %%"REG_a"), %%mm4    \n\t" // current line, shifted right
            "movq           -1(%1, %%"REG_a"), %%mm5    \n\t" // next line, shifted right
            PAVGB"          %%mm0, %%mm5                \n\t"
            PAVGB"          %%mm0, %%mm3                \n\t"
            PAVGB"          %%mm0, %%mm5                \n\t" // 3:1 toward mm0
            PAVGB"          %%mm0, %%mm3                \n\t" // 3:1 toward mm0
            PAVGB"          %%mm1, %%mm4                \n\t"
            PAVGB"          %%mm1, %%mm2                \n\t"
            PAVGB"          %%mm1, %%mm4                \n\t" // 3:1 toward mm1
            PAVGB"          %%mm1, %%mm2                \n\t" // 3:1 toward mm1
            "movq           %%mm5, %%mm7                \n\t"
            "movq           %%mm4, %%mm6                \n\t"
            "punpcklbw      %%mm3, %%mm5                \n\t" // interleave to 2x width
            "punpckhbw      %%mm3, %%mm7                \n\t"
            "punpcklbw      %%mm2, %%mm4                \n\t"
            "punpckhbw      %%mm2, %%mm6                \n\t"
#if 1
            MOVNTQ"         %%mm5, (%2, %%"REG_a", 2)   \n\t"
            MOVNTQ"         %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
            MOVNTQ"         %%mm4, (%3, %%"REG_a", 2)   \n\t"
            MOVNTQ"         %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#else
            "movq           %%mm5, (%2, %%"REG_a", 2)   \n\t"
            "movq           %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
            "movq           %%mm4, (%3, %%"REG_a", 2)   \n\t"
            "movq           %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#endif
            "add            $8, %%"REG_a"               \n\t"
            " js            1b                          \n\t"
            :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%"REG_a
        );
#else
        const x86_reg mmxSize=1;  // C-only: scalar loop below starts at x=0
#endif
        /* Column 0 (vertical-only interpolation) ... */
        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

        /* ... then the scalar tail (or everything when no MMX). */
        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        /* Last column (vertical-only interpolation). */
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line: horizontal interpolation only, like the first line
#if 1
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    for (x=0; x<srcWidth; x++) {
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif

#if HAVE_MMX
    /* Leave MMX state and flush the non-temporal stores. */
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#endif
}
/**
 * Convert packed UYVY (U Y V Y byte order) to planar YV12.
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;  // one U and one V sample per 2 luma pixels

    for (y=0; y<height; y+=2) {
#if HAVE_MMX
        /* Even line: mirror image of yuy2toyv12 -- for UYVY the chroma sits
         * in the LOW bytes (pand) and luma in the HIGH bytes (psrlw $8). */
        __asm__ volatile(
            "xor                %%"REG_a", %%"REG_a"    \n\t"
            "pcmpeqw            %%mm7, %%mm7            \n\t"
            "psrlw              $8, %%mm7               \n\t" // FF,00,FF,00...
            ASMALIGN(4)
            "1:                                         \n\t"
            PREFETCH"           64(%0, %%"REG_a", 4)    \n\t"
            "movq               (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
            "movq               8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq               %%mm0, %%mm2            \n\t" // UYVY UYVY(0)
            "movq               %%mm1, %%mm3            \n\t" // UYVY UYVY(4)
            "pand               %%mm7, %%mm0            \n\t" // U0V0 U0V0(0)
            "pand               %%mm7, %%mm1            \n\t" // U0V0 U0V0(4)
            "psrlw              $8, %%mm2               \n\t" // Y0Y0 Y0Y0(0)
            "psrlw              $8, %%mm3               \n\t" // Y0Y0 Y0Y0(4)
            "packuswb           %%mm1, %%mm0            \n\t" // UVUV UVUV(0)
            "packuswb           %%mm3, %%mm2            \n\t" // YYYY YYYY(0)
            MOVNTQ"             %%mm2, (%1, %%"REG_a", 2)   \n\t"
            "movq               16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
            "movq               24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
            "movq               %%mm1, %%mm3            \n\t" // UYVY UYVY(8)
            "movq               %%mm2, %%mm4            \n\t" // UYVY UYVY(12)
            "pand               %%mm7, %%mm1            \n\t" // U0V0 U0V0(8)
            "pand               %%mm7, %%mm2            \n\t" // U0V0 U0V0(12)
            "psrlw              $8, %%mm3               \n\t" // Y0Y0 Y0Y0(8)
            "psrlw              $8, %%mm4               \n\t" // Y0Y0 Y0Y0(12)
            "packuswb           %%mm2, %%mm1            \n\t" // UVUV UVUV(8)
            "packuswb           %%mm4, %%mm3            \n\t" // YYYY YYYY(8)
            MOVNTQ"             %%mm3, 8(%1, %%"REG_a", 2)  \n\t"
            /* Split the packed UV words into separate U and V planes. */
            "movq               %%mm0, %%mm2            \n\t" // UVUV UVUV(0)
            "movq               %%mm1, %%mm3            \n\t" // UVUV UVUV(8)
            "psrlw              $8, %%mm0               \n\t" // V0V0 V0V0(0)
            "psrlw              $8, %%mm1               \n\t" // V0V0 V0V0(8)
            "pand               %%mm7, %%mm2            \n\t" // U0U0 U0U0(0)
            "pand               %%mm7, %%mm3            \n\t" // U0U0 U0U0(8)
            "packuswb           %%mm1, %%mm0            \n\t" // VVVV VVVV(0)
            "packuswb           %%mm3, %%mm2            \n\t" // UUUU UUUU(0)
            MOVNTQ"             %%mm0, (%3, %%"REG_a")  \n\t"
            MOVNTQ"             %%mm2, (%2, %%"REG_a")  \n\t"
            "add                $8, %%"REG_a"           \n\t"
            "cmp                %4, %%"REG_a"           \n\t"
            " jb                1b                      \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        ydst += lumStride;
        src  += srcStride;
        /* Odd line: luma only (psrlw $8 keeps the Y bytes of UYVY). */
        __asm__ volatile(
            "xor                %%"REG_a", %%"REG_a"    \n\t"
            ASMALIGN(4)
            "1:                                         \n\t"
            PREFETCH"           64(%0, %%"REG_a", 4)    \n\t"
            "movq               (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
            "movq               8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq               16(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(8)
            "movq               24(%0, %%"REG_a", 4), %%mm3 \n\t" // UYVY UYVY(12)
            "psrlw              $8, %%mm0               \n\t" // Y0Y0 Y0Y0(0)
            "psrlw              $8, %%mm1               \n\t" // Y0Y0 Y0Y0(4)
            "psrlw              $8, %%mm2               \n\t" // Y0Y0 Y0Y0(8)
            "psrlw              $8, %%mm3               \n\t" // Y0Y0 Y0Y0(12)
            "packuswb           %%mm1, %%mm0            \n\t" // YYYY YYYY(0)
            "packuswb           %%mm3, %%mm2            \n\t" // YYYY YYYY(8)
            MOVNTQ"             %%mm0, (%1, %%"REG_a", 2)   \n\t"
            MOVNTQ"             %%mm2, 8(%1, %%"REG_a", 2)  \n\t"
            "add                $8, %%"REG_a"           \n\t"
            "cmp                %4, %%"REG_a"           \n\t"
            " jb                1b                      \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
#else
        long i;
        /* C fallback: U Y V Y byte order. */
        for (i=0; i<chromWidth; i++) {
            udst[i]     = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i]     = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if HAVE_MMX
    /* Leave MMX state and flush the non-temporal stores. */
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#endif
}
/**
 * Convert BGR24 to planar YV12 (despite the name; the coefficient tables are
 * the ff_bgr2* ones and the C path reads b,g,r in that byte order).
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line,
 * others are ignored in the C version.
 * FIXME: Write HQ version.
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                       long width, long height,
                                       long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;
#if HAVE_MMX
    /* MMX path handles pairs of lines up to height-2; the tail (and the
     * whole image on non-MMX builds) is done by the C loop below.
     * NOTE(review): the two asm statements in this path store to memory via
     * MOVNTQ but declare no "memory" clobber -- this looks fragile w.r.t.
     * compiler reordering/caching; confirm against the project's asm
     * conventions before relying on it. */
    for (y=0; y<height-2; y+=2) {
        long i;
        /* Luma: two passes (i=0,1), one per source line. Each iteration
         * converts 8 BGR pixels to 8 Y bytes using pmaddwd with the
         * ff_bgr2YCoeff table, then adds the Y offset (16). */
        for (i=0; i<2; i++) {
            __asm__ volatile(
                "mov                        %2, %%"REG_a"   \n\t"
                "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
                "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
                "pxor                    %%mm7, %%mm7       \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t" // REG_d = 3*x (byte offset into BGR24)
                ASMALIGN(4)
                "1:                                         \n\t"
                PREFETCH"    64(%0, %%"REG_d")              \n\t"
                "movd          (%0, %%"REG_d"), %%mm0       \n\t" // pixel 0
                "movd         3(%0, %%"REG_d"), %%mm1       \n\t" // pixel 1
                "punpcklbw              %%mm7, %%mm0        \n\t" // bytes -> words
                "punpcklbw              %%mm7, %%mm1        \n\t"
                "movd         6(%0, %%"REG_d"), %%mm2       \n\t" // pixel 2
                "movd         9(%0, %%"REG_d"), %%mm3       \n\t" // pixel 3
                "punpcklbw              %%mm7, %%mm2        \n\t"
                "punpcklbw              %%mm7, %%mm3        \n\t"
                "pmaddwd                %%mm6, %%mm0        \n\t" // dot with Y coefficients
                "pmaddwd                %%mm6, %%mm1        \n\t"
                "pmaddwd                %%mm6, %%mm2        \n\t"
                "pmaddwd                %%mm6, %%mm3        \n\t"
#ifndef FAST_BGR2YV12
                "psrad                     $8, %%mm0        \n\t" // extra precision unless FAST mode
                "psrad                     $8, %%mm1        \n\t"
                "psrad                     $8, %%mm2        \n\t"
                "psrad                     $8, %%mm3        \n\t"
#endif
                "packssdw               %%mm1, %%mm0        \n\t"
                "packssdw               %%mm3, %%mm2        \n\t"
                "pmaddwd                %%mm5, %%mm0        \n\t" // horizontal add via w1111
                "pmaddwd                %%mm5, %%mm2        \n\t"
                "packssdw               %%mm2, %%mm0        \n\t" // Y3 Y2 Y1 Y0
                "psraw                     $7, %%mm0        \n\t"

                /* Pixels 4..7 of this 8-pixel group. */
                "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
                "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
                "punpcklbw              %%mm7, %%mm4        \n\t"
                "punpcklbw              %%mm7, %%mm1        \n\t"
                "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
                "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
                "punpcklbw              %%mm7, %%mm2        \n\t"
                "punpcklbw              %%mm7, %%mm3        \n\t"
                "pmaddwd                %%mm6, %%mm4        \n\t"
                "pmaddwd                %%mm6, %%mm1        \n\t"
                "pmaddwd                %%mm6, %%mm2        \n\t"
                "pmaddwd                %%mm6, %%mm3        \n\t"
#ifndef FAST_BGR2YV12
                "psrad                     $8, %%mm4        \n\t"
                "psrad                     $8, %%mm1        \n\t"
                "psrad                     $8, %%mm2        \n\t"
                "psrad                     $8, %%mm3        \n\t"
#endif
                "packssdw               %%mm1, %%mm4        \n\t"
                "packssdw               %%mm3, %%mm2        \n\t"
                "pmaddwd                %%mm5, %%mm4        \n\t"
                "pmaddwd                %%mm5, %%mm2        \n\t"
                "add                      $24, %%"REG_d"    \n\t" // advance 8 BGR24 pixels
                "packssdw               %%mm2, %%mm4        \n\t" // Y7 Y6 Y5 Y4
                "psraw                     $7, %%mm4        \n\t"

                "packuswb               %%mm4, %%mm0        \n\t" // 8 Y bytes
                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t" // + 16
                MOVNTQ"                 %%mm0, (%1, %%"REG_a")  \n\t"
                "add                       $8, %%"REG_a"    \n\t"
                " js                       1b               \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
                : "%"REG_a, "%"REG_d
            );
            ydst += lumStride;
            src  += srcStride;
        }
        src -= srcStride*2;  // rewind: chroma pass re-reads both lines
        /* Chroma: average 2x2 blocks of BGR (PAVGB between the two lines,
         * then between horizontal neighbors), convert with ff_bgr2UCoeff /
         * ff_bgr2VCoeff, add the UV offset (128) and store 4 U + 4 V bytes
         * per iteration. */
        __asm__ volatile(
            "mov                        %4, %%"REG_a"   \n\t"
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
            "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
            "pxor                    %%mm7, %%mm7       \n\t"
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
            "add                %%"REG_d", %%"REG_d"    \n\t" // REG_d = 6*x (2 BGR pixels per chroma sample)
            ASMALIGN(4)
            "1:                                         \n\t"
            PREFETCH"    64(%0, %%"REG_d")              \n\t"
            PREFETCH"    64(%1, %%"REG_d")              \n\t"
#if HAVE_MMX2 || HAVE_AMD3DNOW
            /* 2x2 average via PAVGB: vertical first, then the psrlq $24
             * aligns the horizontally-neighboring pixel for the second
             * PAVGB. */
            "movq          (%0, %%"REG_d"), %%mm0       \n\t"
            "movq          (%1, %%"REG_d"), %%mm1       \n\t"
            "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
            "movq         6(%1, %%"REG_d"), %%mm3       \n\t"
            PAVGB"                   %%mm1, %%mm0       \n\t"
            PAVGB"                   %%mm3, %%mm2       \n\t"
            "movq                    %%mm0, %%mm1       \n\t"
            "movq                    %%mm2, %%mm3       \n\t"
            "psrlq                     $24, %%mm0       \n\t"
            "psrlq                     $24, %%mm2       \n\t"
            PAVGB"                   %%mm1, %%mm0       \n\t"
            PAVGB"                   %%mm3, %%mm2       \n\t"
            "punpcklbw               %%mm7, %%mm0       \n\t"
            "punpcklbw               %%mm7, %%mm2       \n\t"
#else
            /* No PAVGB: widen to words, sum the 4 pixels, then >>2. */
            "movd          (%0, %%"REG_d"), %%mm0       \n\t"
            "movd          (%1, %%"REG_d"), %%mm1       \n\t"
            "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
            "movd         3(%1, %%"REG_d"), %%mm3       \n\t"
            "punpcklbw               %%mm7, %%mm0       \n\t"
            "punpcklbw               %%mm7, %%mm1       \n\t"
            "punpcklbw               %%mm7, %%mm2       \n\t"
            "punpcklbw               %%mm7, %%mm3       \n\t"
            "paddw                   %%mm1, %%mm0       \n\t"
            "paddw                   %%mm3, %%mm2       \n\t"
            "paddw                   %%mm2, %%mm0       \n\t"
            "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
            "movd         6(%1, %%"REG_d"), %%mm1       \n\t"
            "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
            "movd         9(%1, %%"REG_d"), %%mm3       \n\t"
            "punpcklbw               %%mm7, %%mm4       \n\t"
            "punpcklbw               %%mm7, %%mm1       \n\t"
            "punpcklbw               %%mm7, %%mm2       \n\t"
            "punpcklbw               %%mm7, %%mm3       \n\t"
            "paddw                   %%mm1, %%mm4       \n\t"
            "paddw                   %%mm3, %%mm2       \n\t"
            "paddw                   %%mm4, %%mm2       \n\t"
            "psrlw                      $2, %%mm0       \n\t"
            "psrlw                      $2, %%mm2       \n\t"
#endif
            /* Dot the averaged pixels with both U (mm6) and V coefficient
             * tables for chroma samples 0 and 1. */
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
            "pmaddwd                 %%mm0, %%mm1       \n\t"
            "pmaddwd                 %%mm2, %%mm3       \n\t"
            "pmaddwd                 %%mm6, %%mm0       \n\t"
            "pmaddwd                 %%mm6, %%mm2       \n\t"
#ifndef FAST_BGR2YV12
            "psrad                      $8, %%mm0       \n\t"
            "psrad                      $8, %%mm1       \n\t"
            "psrad                      $8, %%mm2       \n\t"
            "psrad                      $8, %%mm3       \n\t"
#endif
            "packssdw                %%mm2, %%mm0       \n\t"
            "packssdw                %%mm3, %%mm1       \n\t"
            "pmaddwd                 %%mm5, %%mm0       \n\t"
            "pmaddwd                 %%mm5, %%mm1       \n\t"
            "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
            "psraw                      $7, %%mm0       \n\t"

#if HAVE_MMX2 || HAVE_AMD3DNOW
            /* Same 2x2 averaging for chroma samples 2 and 3. */
            "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
            "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
            "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
            "movq        18(%1, %%"REG_d"), %%mm3       \n\t"
            PAVGB"                   %%mm1, %%mm4       \n\t"
            PAVGB"                   %%mm3, %%mm2       \n\t"
            "movq                    %%mm4, %%mm1       \n\t"
            "movq                    %%mm2, %%mm3       \n\t"
            "psrlq                     $24, %%mm4       \n\t"
            "psrlq                     $24, %%mm2       \n\t"
            PAVGB"                   %%mm1, %%mm4       \n\t"
            PAVGB"                   %%mm3, %%mm2       \n\t"
            "punpcklbw               %%mm7, %%mm4       \n\t"
            "punpcklbw               %%mm7, %%mm2       \n\t"
#else
            "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
            "movd        12(%1, %%"REG_d"), %%mm1       \n\t"
            "movd        15(%0, %%"REG_d"), %%mm2       \n\t"
            "movd        15(%1, %%"REG_d"), %%mm3       \n\t"
            "punpcklbw               %%mm7, %%mm4       \n\t"
            "punpcklbw               %%mm7, %%mm1       \n\t"
            "punpcklbw               %%mm7, %%mm2       \n\t"
            "punpcklbw               %%mm7, %%mm3       \n\t"
            "paddw                   %%mm1, %%mm4       \n\t"
            "paddw                   %%mm3, %%mm2       \n\t"
            "paddw                   %%mm2, %%mm4       \n\t"
            "movd        18(%0, %%"REG_d"), %%mm5       \n\t" // mm5 is scratch here ...
            "movd        18(%1, %%"REG_d"), %%mm1       \n\t"
            "movd        21(%0, %%"REG_d"), %%mm2       \n\t"
            "movd        21(%1, %%"REG_d"), %%mm3       \n\t"
            "punpcklbw               %%mm7, %%mm5       \n\t"
            "punpcklbw               %%mm7, %%mm1       \n\t"
            "punpcklbw               %%mm7, %%mm2       \n\t"
            "punpcklbw               %%mm7, %%mm3       \n\t"
            "paddw                   %%mm1, %%mm5       \n\t"
            "paddw                   %%mm3, %%mm2       \n\t"
            "paddw                   %%mm5, %%mm2       \n\t"
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t" // ... and is reloaded with w1111
            "psrlw                      $2, %%mm4       \n\t"
            "psrlw                      $2, %%mm2       \n\t"
#endif
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
            "pmaddwd                 %%mm4, %%mm1       \n\t"
            "pmaddwd                 %%mm2, %%mm3       \n\t"
            "pmaddwd                 %%mm6, %%mm4       \n\t"
            "pmaddwd                 %%mm6, %%mm2       \n\t"
#ifndef FAST_BGR2YV12
            "psrad                      $8, %%mm4       \n\t"
            "psrad                      $8, %%mm1       \n\t"
            "psrad                      $8, %%mm2       \n\t"
            "psrad                      $8, %%mm3       \n\t"
#endif
            "packssdw                %%mm2, %%mm4       \n\t"
            "packssdw                %%mm3, %%mm1       \n\t"
            "pmaddwd                 %%mm5, %%mm4       \n\t"
            "pmaddwd                 %%mm5, %%mm1       \n\t"
            "add                       $24, %%"REG_d"   \n\t"
            "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
            "psraw                      $7, %%mm4       \n\t"

            /* Regroup (U0..U3, V0..V3), bias by 128 and store 4+4 bytes. */
            "movq                    %%mm0, %%mm1       \n\t"
            "punpckldq               %%mm4, %%mm0       \n\t"
            "punpckhdq               %%mm4, %%mm1       \n\t"
            "packsswb                %%mm1, %%mm0       \n\t"
            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0     \n\t"
            "movd                    %%mm0, (%2, %%"REG_a") \n\t"
            "punpckhdq               %%mm0, %%mm0       \n\t"
            "movd                    %%mm0, (%3, %%"REG_a") \n\t"
            "add                        $4, %%"REG_a"   \n\t"
            " js                        1b              \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
            : "%"REG_a, "%"REG_d
        );

        udst += chromStride;
        vdst += chromStride;
        src  += srcStride*2;
    }

    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#else
    y=0;   // no MMX: the C loop below does the whole image
#endif
    /* C path (tail lines for MMX builds, everything otherwise).
     * Chroma is taken from the even lines only (no 2x2 averaging here,
     * unlike the MMX path -- hence the comment "others are ignored in the
     * C version"). */
    for (; y<height; y+=2) {
        long i;
        for (i=0; i<chromWidth; i++) {
            unsigned int b = src[6*i+0];
            unsigned int g = src[6*i+1];
            unsigned int r = src[6*i+2];

            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
            unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;

            udst[i]     = U;
            vdst[i]     = V;
            ydst[2*i]   = Y;

            b = src[6*i+3];
            g = src[6*i+4];
            r = src[6*i+5];

            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        ydst += lumStride;
        src  += srcStride;

        for (i=0; i<chromWidth; i++) {
            unsigned int b = src[6*i+0];
            unsigned int g = src[6*i+1];
            unsigned int r = src[6*i+2];

            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;

            ydst[2*i]   = Y;

            b = src[6*i+3];
            g = src[6*i+4];
            r = src[6*i+5];

            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
}
/*
 * Interleave two byte planes: dest[2*i] = src1[i], dest[2*i+1] = src2[i],
 * line by line.
 * NOTE(review): the SIMD paths process width&~15 bytes and assume
 * width >= 16 -- for width < 16 the loop bound (width-15) underflows and the
 * unsigned "jb" comparison would over-run; callers presumably guarantee a
 * sufficiently large width. The SSE2 path also uses movdqa, i.e. assumes
 * 16-byte-aligned src1/src2 -- confirm against callers.
 */
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                                    long width, long height, long src1Stride,
                                    long src2Stride, long dstStride)
{
    long h;

    for (h=0; h < height; h++) {
        long w;
#if HAVE_MMX
#if HAVE_SSE2
        /* 16 bytes of each source per iteration; punpcklbw/punpckhbw do the
         * interleave, movntdq streams 32 output bytes. */
        __asm__(
            "xor                %%"REG_a", %%"REG_a"    \n\t"
            "1:                                         \n\t"
            PREFETCH"           64(%1, %%"REG_a")       \n\t"
            PREFETCH"           64(%2, %%"REG_a")       \n\t"
            "movdqa             (%1, %%"REG_a"), %%xmm0 \n\t"
            "movdqa             (%1, %%"REG_a"), %%xmm1 \n\t"
            "movdqa             (%2, %%"REG_a"), %%xmm2 \n\t"
            "punpcklbw          %%xmm2, %%xmm0          \n\t"
            "punpckhbw          %%xmm2, %%xmm1          \n\t"
            "movntdq            %%xmm0, (%0, %%"REG_a", 2)  \n\t"
            "movntdq            %%xmm1, 16(%0, %%"REG_a", 2)    \n\t"
            "add                $16, %%"REG_a"          \n\t"
            "cmp                %3, %%"REG_a"           \n\t"
            " jb                1b                      \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a""
        );
#else
        /* MMX variant: 16 bytes of each source per iteration using two
         * 8-byte registers. */
        __asm__(
            "xor                %%"REG_a", %%"REG_a"    \n\t"
            "1:                                         \n\t"
            PREFETCH"           64(%1, %%"REG_a")       \n\t"
            PREFETCH"           64(%2, %%"REG_a")       \n\t"
            "movq               (%1, %%"REG_a"), %%mm0  \n\t"
            "movq               8(%1, %%"REG_a"), %%mm2 \n\t"
            "movq               %%mm0, %%mm1            \n\t"
            "movq               %%mm2, %%mm3            \n\t"
            "movq               (%2, %%"REG_a"), %%mm4  \n\t"
            "movq               8(%2, %%"REG_a"), %%mm5 \n\t"
            "punpcklbw          %%mm4, %%mm0            \n\t"
            "punpckhbw          %%mm4, %%mm1            \n\t"
            "punpcklbw          %%mm5, %%mm2            \n\t"
            "punpckhbw          %%mm5, %%mm3            \n\t"
            MOVNTQ"             %%mm0, (%0, %%"REG_a", 2)   \n\t"
            MOVNTQ"             %%mm1, 8(%0, %%"REG_a", 2)  \n\t"
            MOVNTQ"             %%mm2, 16(%0, %%"REG_a", 2) \n\t"
            MOVNTQ"             %%mm3, 24(%0, %%"REG_a", 2) \n\t"
            "add                $16, %%"REG_a"          \n\t"
            "cmp                %3, %%"REG_a"           \n\t"
            " jb                1b                      \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
        );
#endif
        /* Scalar tail for the last width%16 bytes. */
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        for (w=0; w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
#if HAVE_MMX
    /* Leave MMX state and flush the non-temporal stores. */
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}
/*
 * Upsample two chroma planes by 2x in each direction: every source byte is
 * doubled horizontally (punpcklbw/punpckhbw with itself) and every source
 * line is emitted twice vertically (srcStride*(y>>1)).
 * Note: the asm uses old-style "8%1"-form memory-operand displacements.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
{
    x86_reg y;
    long x,w,h;
    w=width/2; h=height/2;   // output dimensions are half the given ones
#if HAVE_MMX
    __asm__ volatile(
        PREFETCH" %0    \n\t"
        PREFETCH" %1    \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    /* First plane. */
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);   // each source line used for two output lines
        uint8_t* d=dst1+dstStride1*y;
        x=0;
#if HAVE_MMX
        /* 32 input bytes -> 64 output bytes per iteration. */
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH"   32%1        \n\t"
                "movq         %1, %%mm0 \n\t"
                "movq        8%1, %%mm2 \n\t"
                "movq       16%1, %%mm4 \n\t"
                "movq       24%1, %%mm6 \n\t"
                "movq      %%mm0, %%mm1 \n\t"
                "movq      %%mm2, %%mm3 \n\t"
                "movq      %%mm4, %%mm5 \n\t"
                "movq      %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t" // double each byte
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ"    %%mm0,   %0  \n\t"
                MOVNTQ"    %%mm1,  8%0  \n\t"
                MOVNTQ"    %%mm2, 16%0  \n\t"
                MOVNTQ"    %%mm3, 24%0  \n\t"
                MOVNTQ"    %%mm4, 32%0  \n\t"
                MOVNTQ"    %%mm5, 40%0  \n\t"
                MOVNTQ"    %%mm6, 48%0  \n\t"
                MOVNTQ"    %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
#endif
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];   // scalar tail
    }
    /* Second plane: identical logic. */
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
#if HAVE_MMX
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH"   32%1        \n\t"
                "movq         %1, %%mm0 \n\t"
                "movq        8%1, %%mm2 \n\t"
                "movq       16%1, %%mm4 \n\t"
                "movq       24%1, %%mm6 \n\t"
                "movq      %%mm0, %%mm1 \n\t"
                "movq      %%mm2, %%mm3 \n\t"
                "movq      %%mm4, %%mm5 \n\t"
                "movq      %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ"    %%mm0,   %0  \n\t"
                MOVNTQ"    %%mm1,  8%0  \n\t"
                MOVNTQ"    %%mm2, 16%0  \n\t"
                MOVNTQ"    %%mm3, 24%0  \n\t"
                MOVNTQ"    %%mm4, 32%0  \n\t"
                MOVNTQ"    %%mm5, 40%0  \n\t"
                MOVNTQ"    %%mm6, 48%0  \n\t"
                MOVNTQ"    %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
#endif
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#if HAVE_MMX
    /* Leave MMX state and flush the non-temporal stores. */
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}
/*
 * Pack planar YVU9 into YUY2. Chroma lines are reused for 4 output lines
 * (srcStride*(y>>2)); horizontally each U/V sample covers 4 luma pixels
 * (the C tail repeats up[x]/vp[x] for both pixel pairs).
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
{
    x86_reg x;
    long y,w,h;
    w=width/2; h=height;   // w counts U/V samples, i.e. 4 luma pixels each... NOTE(review): presumably width here is in chroma units; confirm against callers
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);  // chroma line reused for 4 luma lines
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
#if HAVE_MMX
        /* 32 luma + 8 U + 8 V input bytes -> 64 output bytes per iteration. */
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH"   32(%1, %0)          \n\t"
                PREFETCH"   32(%2, %0)          \n\t"
                PREFETCH"   32(%3, %0)          \n\t"
                "movq       (%1, %0, 4), %%mm0  \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq       (%2, %0), %%mm1     \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq       (%3, %0), %%mm2     \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq       %%mm0, %%mm3        \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq       %%mm1, %%mm4        \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq       %%mm2, %%mm5        \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw  %%mm1, %%mm1        \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw  %%mm2, %%mm2        \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw  %%mm4, %%mm4        \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw  %%mm5, %%mm5        \n\t" /* V4V4 V5V5 V6V6 V7V7 */
                "movq       %%mm1, %%mm6        \n\t"
                "punpcklbw  %%mm2, %%mm1        \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw  %%mm1, %%mm0        \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw  %%mm1, %%mm3        \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ"     %%mm0, (%4, %0, 8)  \n\t"
                MOVNTQ"     %%mm3, 8(%4, %0, 8) \n\t"
                "punpckhbw  %%mm2, %%mm6        \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq       8(%1, %0, 4), %%mm0 \n\t"
                "movq       %%mm0, %%mm3        \n\t"
                "punpcklbw  %%mm6, %%mm0        \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw  %%mm6, %%mm3        \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ"     %%mm0, 16(%4, %0, 8)    \n\t"
                MOVNTQ"     %%mm3, 24(%4, %0, 8)    \n\t"
                "movq       %%mm4, %%mm6        \n\t"
                "movq       16(%1, %0, 4), %%mm0    \n\t"
                "movq       %%mm0, %%mm3        \n\t"
                "punpcklbw  %%mm5, %%mm4        \n\t"
                "punpcklbw  %%mm4, %%mm0        \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw  %%mm4, %%mm3        \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ"     %%mm0, 32(%4, %0, 8)    \n\t"
                MOVNTQ"     %%mm3, 40(%4, %0, 8)    \n\t"
                "punpckhbw  %%mm5, %%mm6        \n\t"
                "movq       24(%1, %0, 4), %%mm0    \n\t"
                "movq       %%mm0, %%mm3        \n\t"
                "punpcklbw  %%mm6, %%mm0        \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw  %%mm6, %%mm3        \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ"     %%mm0, 48(%4, %0, 8)    \n\t"
                MOVNTQ"     %%mm3, 56(%4, %0, 8)    \n\t"
                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
#endif
        /* Scalar tail: 4 luma pixels share one U and one V sample. */
        for (; x<w; x++) {
            const long x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
#if HAVE_MMX
    /* Leave MMX state and flush the non-temporal stores. */
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}
/*
 * Copy every second byte: dst[i] = src[2*i] for i in [0, count).
 * Works with a negative index counting up to 0 so the pointers can be
 * pre-advanced to the end of the buffers.
 * NOTE(review): the asm stores via MOVNTQ but declares no "memory" clobber
 * and no EMMS/SFENCE here -- presumably the callers handle MMX state and
 * store ordering; confirm before reusing this helper standalone.
 */
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst +=   count;
    src += 2*count;
    count= - count;

#if HAVE_MMX
    if(count <= -16) {
        count += 15;   // the -15/-7 store offsets below compensate for this bias
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t" // 0x00FF word mask: keep even bytes
            "1:                                \n\t"
            "movq -30(%1, %0, 2), %%mm0        \n\t"
            "movq -22(%1, %0, 2), %%mm1        \n\t"
            "movq -14(%1, %0, 2), %%mm2        \n\t"
            "movq  -6(%1, %0, 2), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            MOVNTQ"        %%mm0,-15(%2, %0)   \n\t"
            MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
            "add             $16, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;   // undo the bias; scalar loop finishes the remainder
    }
#endif
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
/*
 * De-interleave the even bytes of 4-byte groups into two planes:
 * dst0[i] = src[4*i+0], dst1[i] = src[4*i+2] for i in [0, count).
 * Same negative-index / pre-advanced-pointer scheme as extract_even;
 * the same NOTE about the missing "memory" clobber and EMMS applies.
 */
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8) {
        count += 7;   // -7 store offsets below compensate for this bias
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t" // 0x00FF word mask
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t" // keep bytes 0 and 2 of each group
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t" // now byte0/byte2 interleaved
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t" // byte2 stream (-> dst1)
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t" // byte0 stream (-> dst0)
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;   // undo the bias; scalar loop finishes the remainder
    }
#endif
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
/*
 * Like extract_even2, but averages two source rows:
 * dst0[i] = avg(src0[4*i],   src1[4*i])
 * dst1[i] = avg(src0[4*i+2], src1[4*i+2])
 * uyvytoyuv420 uses this to vertically subsample UYVY chroma.
 *
 * NOTE(review): the scalar tail truncates ((a+b)>>1) while PAVGB is
 * presumably pavgb/pavgusb, which rounds up ((a+b+1)>>1) — results may
 * differ by 1 LSB between the asm and scalar paths; TODO confirm intended.
 */
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        /* asm handles 8 output bytes per destination per iteration */
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"            /* mm7 = 0x00FF.. low-byte mask */
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t" /* load 32 bytes from src0 */
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t" /* average with the matching src1 bytes */
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"          /* keep bytes 0 and 2 of each group */
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"            /* mm0/mm2: byte-2 (dst1) samples */
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"          /* mm1/mm3: byte-0 (dst0) samples */
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"   /* 8 bytes to dst1 */
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"   /* 8 bytes to dst0 */
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;                           /* undo the bias before the scalar tail */
    }
#endif
    /* scalar tail (or the whole job without PAVGB) */
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
/*
 * De-interleave the odd-indexed bytes of a 4-byte-per-group packed stream:
 * dst0[i] = src[4*i+1], dst1[i] = src[4*i+3].
 * yuyvtoyuv422 below uses this to split U (byte 1) and V (byte 3)
 * samples out of YUYV data.
 */
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8) {
        /* asm handles 8 output bytes per destination per iteration */
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"            /* mm7 = 0x00FF.. low-byte mask */
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t" /* load 32 source bytes (8 groups) */
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"            /* keep bytes 1 and 3 of each group */
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"      /* mm0/mm2 now interleave dst0,dst1 samples */
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"            /* mm0/mm2: the byte-3 (dst1) samples */
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"          /* mm1/mm3: the byte-1 (dst0) samples */
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"   /* 8 bytes to dst1 */
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"   /* 8 bytes to dst0 */
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;                           /* undo the bias before the scalar tail */
    }
#endif
    /* shift to the odd bytes so the tail can reuse even offsets (+0/+2) */
    src++;
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
/*
 * Like extract_odd2, but averages two source rows:
 * dst0[i] = avg(src0[4*i+1], src1[4*i+1])
 * dst1[i] = avg(src0[4*i+3], src1[4*i+3])
 * yuyvtoyuv420 uses this to vertically subsample YUYV chroma.
 *
 * NOTE(review): the scalar tail truncates ((a+b)>>1) while PAVGB is
 * presumably a rounding average ((a+b+1)>>1) — 1 LSB mismatch between
 * paths is possible; TODO confirm intended.
 */
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        /* asm handles 8 output bytes per destination per iteration */
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"            /* mm7 = 0x00FF.. low-byte mask */
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t" /* load 32 bytes from src0 */
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t" /* average with the matching src1 bytes */
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"            /* keep bytes 1 and 3 of each group */
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"            /* mm0/mm2: byte-3 (dst1) samples */
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"          /* mm1/mm3: byte-1 (dst0) samples */
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"   /* 8 bytes to dst1 */
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"   /* 8 bytes to dst0 */
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;                           /* undo the bias before the scalar tail */
    }
#endif
    /* shift to the odd bytes so the tail can reuse even offsets (+0/+2) */
    src0++;
    src1++;
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
  2727. static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2728. long width, long height,
  2729. long lumStride, long chromStride, long srcStride)
  2730. {
  2731. long y;
  2732. const long chromWidth= -((-width)>>1);
  2733. for (y=0; y<height; y++) {
  2734. RENAME(extract_even)(src, ydst, width);
  2735. if(y&1) {
  2736. RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
  2737. udst+= chromStride;
  2738. vdst+= chromStride;
  2739. }
  2740. src += srcStride;
  2741. ydst+= lumStride;
  2742. }
  2743. #if HAVE_MMX
  2744. __asm__(
  2745. EMMS" \n\t"
  2746. SFENCE" \n\t"
  2747. ::: "memory"
  2748. );
  2749. #endif
  2750. }
  2751. static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2752. long width, long height,
  2753. long lumStride, long chromStride, long srcStride)
  2754. {
  2755. long y;
  2756. const long chromWidth= -((-width)>>1);
  2757. for (y=0; y<height; y++) {
  2758. RENAME(extract_even)(src, ydst, width);
  2759. RENAME(extract_odd2)(src, udst, vdst, chromWidth);
  2760. src += srcStride;
  2761. ydst+= lumStride;
  2762. udst+= chromStride;
  2763. vdst+= chromStride;
  2764. }
  2765. #if HAVE_MMX
  2766. __asm__(
  2767. EMMS" \n\t"
  2768. SFENCE" \n\t"
  2769. ::: "memory"
  2770. );
  2771. #endif
  2772. }
  2773. static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2774. long width, long height,
  2775. long lumStride, long chromStride, long srcStride)
  2776. {
  2777. long y;
  2778. const long chromWidth= -((-width)>>1);
  2779. for (y=0; y<height; y++) {
  2780. RENAME(extract_even)(src+1, ydst, width);
  2781. if(y&1) {
  2782. RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
  2783. udst+= chromStride;
  2784. vdst+= chromStride;
  2785. }
  2786. src += srcStride;
  2787. ydst+= lumStride;
  2788. }
  2789. #if HAVE_MMX
  2790. __asm__(
  2791. EMMS" \n\t"
  2792. SFENCE" \n\t"
  2793. ::: "memory"
  2794. );
  2795. #endif
  2796. }
  2797. static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2798. long width, long height,
  2799. long lumStride, long chromStride, long srcStride)
  2800. {
  2801. long y;
  2802. const long chromWidth= -((-width)>>1);
  2803. for (y=0; y<height; y++) {
  2804. RENAME(extract_even)(src+1, ydst, width);
  2805. RENAME(extract_even2)(src, udst, vdst, chromWidth);
  2806. src += srcStride;
  2807. ydst+= lumStride;
  2808. udst+= chromStride;
  2809. vdst+= chromStride;
  2810. }
  2811. #if HAVE_MMX
  2812. __asm__(
  2813. EMMS" \n\t"
  2814. SFENCE" \n\t"
  2815. ::: "memory"
  2816. );
  2817. #endif
  2818. }
  2819. static inline void RENAME(rgb2rgb_init)(void)
  2820. {
  2821. rgb15to16 = RENAME(rgb15to16);
  2822. rgb15tobgr24 = RENAME(rgb15tobgr24);
  2823. rgb15to32 = RENAME(rgb15to32);
  2824. rgb16tobgr24 = RENAME(rgb16tobgr24);
  2825. rgb16to32 = RENAME(rgb16to32);
  2826. rgb16to15 = RENAME(rgb16to15);
  2827. rgb24tobgr16 = RENAME(rgb24tobgr16);
  2828. rgb24tobgr15 = RENAME(rgb24tobgr15);
  2829. rgb24tobgr32 = RENAME(rgb24tobgr32);
  2830. rgb32to16 = RENAME(rgb32to16);
  2831. rgb32to15 = RENAME(rgb32to15);
  2832. rgb32tobgr24 = RENAME(rgb32tobgr24);
  2833. rgb24to15 = RENAME(rgb24to15);
  2834. rgb24to16 = RENAME(rgb24to16);
  2835. rgb24tobgr24 = RENAME(rgb24tobgr24);
  2836. rgb32tobgr32 = RENAME(rgb32tobgr32);
  2837. rgb32tobgr16 = RENAME(rgb32tobgr16);
  2838. rgb32tobgr15 = RENAME(rgb32tobgr15);
  2839. yv12toyuy2 = RENAME(yv12toyuy2);
  2840. yv12touyvy = RENAME(yv12touyvy);
  2841. yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
  2842. yuv422ptouyvy = RENAME(yuv422ptouyvy);
  2843. yuy2toyv12 = RENAME(yuy2toyv12);
  2844. // yvu9toyv12 = RENAME(yvu9toyv12);
  2845. planar2x = RENAME(planar2x);
  2846. rgb24toyv12 = RENAME(rgb24toyv12);
  2847. interleaveBytes = RENAME(interleaveBytes);
  2848. vu9_to_vu12 = RENAME(vu9_to_vu12);
  2849. yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
  2850. uyvytoyuv420 = RENAME(uyvytoyuv420);
  2851. uyvytoyuv422 = RENAME(uyvytoyuv422);
  2852. yuyvtoyuv420 = RENAME(yuyvtoyuv420);
  2853. yuyvtoyuv422 = RENAME(yuyvtoyuv422);
  2854. }