  1. /*
  2. * rgb2rgb.c, Software RGB to RGB convertor
  3. * pluralize by Software PAL8 to RGB convertor
  4. * Software YUV to YUV convertor
  5. * Software YUV to RGB convertor
  6. * Written by Nick Kurshev.
  7. * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  8. * lot of big-endian byteorder fixes by Alex Beregszaszi
  9. *
  10. * This file is part of FFmpeg.
  11. *
  12. * FFmpeg is free software; you can redistribute it and/or modify
  13. * it under the terms of the GNU General Public License as published by
  14. * the Free Software Foundation; either version 2 of the License, or
  15. * (at your option) any later version.
  16. *
  17. * FFmpeg is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20. * GNU General Public License for more details.
  21. *
  22. * You should have received a copy of the GNU General Public License
  23. * along with FFmpeg; if not, write to the Free Software
  24. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25. *
  26. * The C code (not assembly, mmx, ...) of this file can be used
  27. * under the LGPL license.
  28. */
  29. #include <stddef.h>
  30. #include <inttypes.h> /* for __WORDSIZE */
  31. #ifndef __WORDSIZE
  32. // #warning You have a misconfigured system and will probably lose performance!
  33. #define __WORDSIZE MP_WORDSIZE
  34. #endif
  35. #undef PREFETCH
  36. #undef MOVNTQ
  37. #undef EMMS
  38. #undef SFENCE
  39. #undef MMREG_SIZE
  40. #undef PREFETCHW
  41. #undef PAVGB
  42. #ifdef HAVE_SSE2
  43. #define MMREG_SIZE 16
  44. #else
  45. #define MMREG_SIZE 8
  46. #endif
  47. #ifdef HAVE_3DNOW
  48. #define PREFETCH "prefetch"
  49. #define PREFETCHW "prefetchw"
  50. #define PAVGB "pavgusb"
  51. #elif defined (HAVE_MMX2)
  52. #define PREFETCH "prefetchnta"
  53. #define PREFETCHW "prefetcht0"
  54. #define PAVGB "pavgb"
  55. #else
  56. #ifdef __APPLE__
  57. #define PREFETCH "#"
  58. #define PREFETCHW "#"
  59. #else
  60. #define PREFETCH " # nop"
  61. #define PREFETCHW " # nop"
  62. #endif
  63. #endif
  64. #ifdef HAVE_3DNOW
  65. /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
  66. #define EMMS "femms"
  67. #else
  68. #define EMMS "emms"
  69. #endif
  70. #ifdef HAVE_MMX2
  71. #define MOVNTQ "movntq"
  72. #define SFENCE "sfence"
  73. #else
  74. #define MOVNTQ "movq"
  75. #define SFENCE " # nop"
  76. #endif
/*
 * Convert packed 24-bit RGB to 32-bit RGB by inserting a zero filler byte
 * per pixel.  src_size is the number of SOURCE bytes (must be a multiple
 * of 3 for a whole number of pixels).  MMX fast path processes 8 pixels
 * (24 src bytes -> 32 dst bytes) per iteration; the C loop handles the tail.
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
{
uint8_t *dest = dst;
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* stop 23 bytes early: each MMX iteration reads 24 source bytes */
mm_end = end - 23;
/* mm7 = mask32 (defined in the including file): clears the filler byte lane */
__asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
while (s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1 \n\t"
/* load 8 packed RGB24 pixels as 4 qwords of two pixels each;
   each movd/punpckldq pair picks up two 3-byte pixels at 3-byte offsets */
"movd %1, %%mm0 \n\t"
"punpckldq 3%1, %%mm0 \n\t"
"movd 6%1, %%mm1 \n\t"
"punpckldq 9%1, %%mm1 \n\t"
"movd 12%1, %%mm2 \n\t"
"punpckldq 15%1, %%mm2 \n\t"
"movd 18%1, %%mm3 \n\t"
"punpckldq 21%1, %%mm3 \n\t"
/* zero the 4th byte of every pixel */
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pand %%mm7, %%mm2 \n\t"
"pand %%mm7, %%mm3 \n\t"
MOVNTQ" %%mm0, %0 \n\t"
MOVNTQ" %%mm1, 8%0 \n\t"
MOVNTQ" %%mm2, 16%0 \n\t"
MOVNTQ" %%mm3, 24%0"
:"=m"(*dest)
:"m"(*s)
:"memory");
dest += 32;
s += 24;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
/* scalar tail (and whole conversion when MMX is unavailable) */
while (s < end)
{
#ifdef WORDS_BIGENDIAN
/* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
*dest++ = 0;
*dest++ = s[2];
*dest++ = s[1];
*dest++ = s[0];
s+=3;
#else
*dest++ = *s++;
*dest++ = *s++;
*dest++ = *s++;
*dest++ = 0;
#endif
}
}
/*
 * Convert packed 32-bit RGB to 24-bit RGB by dropping the filler byte of
 * each pixel.  src_size is the number of SOURCE bytes (multiple of 4 for
 * whole pixels).  MMX path compresses 8 pixels (32 src bytes) into 24 dst
 * bytes per iteration using the mask24* constants from the including file.
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
{
uint8_t *dest = dst;
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* stop 31 bytes early: each MMX iteration reads 32 source bytes */
mm_end = end - 31;
while (s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1 \n\t"
/* load 8 RGBX pixels into mm0/mm1/mm4/mm5 (2 pixels per register) */
"movq %1, %%mm0 \n\t"
"movq 8%1, %%mm1 \n\t"
"movq 16%1, %%mm4 \n\t"
"movq 24%1, %%mm5 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm1, %%mm3 \n\t"
"movq %%mm4, %%mm6 \n\t"
"movq %%mm5, %%mm7 \n\t"
/* shift a copy right by 8 so the high pixel's bytes close the gap
   left by the discarded filler byte of the low pixel */
"psrlq $8, %%mm2 \n\t"
"psrlq $8, %%mm3 \n\t"
"psrlq $8, %%mm6 \n\t"
"psrlq $8, %%mm7 \n\t"
/* keep the low pixel's 3 bytes (mask24l) / high pixel's 3 bytes (mask24h) */
"pand %2, %%mm0 \n\t"
"pand %2, %%mm1 \n\t"
"pand %2, %%mm4 \n\t"
"pand %2, %%mm5 \n\t"
"pand %3, %%mm2 \n\t"
"pand %3, %%mm3 \n\t"
"pand %3, %%mm6 \n\t"
"pand %3, %%mm7 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm3, %%mm1 \n\t"
"por %%mm6, %%mm4 \n\t"
"por %%mm7, %%mm5 \n\t"
/* now repack the four 6-byte groups into three contiguous qwords */
"movq %%mm1, %%mm2 \n\t"
"movq %%mm4, %%mm3 \n\t"
"psllq $48, %%mm2 \n\t"
"psllq $32, %%mm3 \n\t"
"pand %4, %%mm2 \n\t"
"pand %5, %%mm3 \n\t"
"por %%mm2, %%mm0 \n\t"
"psrlq $16, %%mm1 \n\t"
"psrlq $32, %%mm4 \n\t"
"psllq $16, %%mm5 \n\t"
"por %%mm3, %%mm1 \n\t"
"pand %6, %%mm5 \n\t"
"por %%mm5, %%mm4 \n\t"
MOVNTQ" %%mm0, %0 \n\t"
MOVNTQ" %%mm1, 8%0 \n\t"
MOVNTQ" %%mm4, 16%0"
:"=m"(*dest)
:"m"(*s),"m"(mask24l),
"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
:"memory");
dest += 24;
s += 32;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
/* scalar tail (and whole conversion when MMX is unavailable) */
while (s < end)
{
#ifdef WORDS_BIGENDIAN
/* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
s++;
dest[2] = *s++;
dest[1] = *s++;
dest[0] = *s++;
dest += 3;
#else
*dest++ = *s++;
*dest++ = *s++;
*dest++ = *s++;
s++;
#endif
}
}
  219. /*
  220. Original by Strepto/Astral
  221. ported to gcc & bugfixed : A'rpi
  222. MMX2, 3DNOW optimization by Nick Kurshev
  223. 32 bit C version, and and&add trick by Michael Niedermayer
  224. */
/*
 * Convert RGB15 (xRRRRRGG GGGBBBBB) to RGB16 (RRRRRGGG GGGBBBBB):
 * blue stays put, red+green move up one bit, and green gains a low bit
 * of 0.  Uses the and&add trick: (x & 0x7FFF) + (x & 0x7FE0) duplicates
 * the red/green field on top of itself, shifting it left by one without
 * touching blue.  src_size is in bytes.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
{
register const uint8_t* s=src;
register uint8_t* d=dst;
register const uint8_t *end;
const uint8_t *mm_end;
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*s));
/* mm4 = mask15s: selects the red+green bits to be added onto themselves */
__asm __volatile("movq %0, %%mm4"::"m"(mask15s));
mm_end = end - 15;
while (s<mm_end)
{
__asm __volatile(
PREFETCH" 32%1 \n\t"
"movq %1, %%mm0 \n\t"
"movq 8%1, %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"pand %%mm4, %%mm0 \n\t"
"pand %%mm4, %%mm2 \n\t"
/* x + (x & mask15s): shifts red/green up one bit */
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm3, %%mm2 \n\t"
MOVNTQ" %%mm0, %0 \n\t"
MOVNTQ" %%mm2, 8%0"
:"=m"(*d)
:"m"(*s)
);
d+=16;
s+=16;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
/* scalar path: two pixels at a time via 32-bit loads */
mm_end = end - 3;
while (s < mm_end)
{
register unsigned x= *((uint32_t *)s);
*((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
d+=4;
s+=4;
}
/* at most one 16-bit pixel can remain */
if (s < end)
{
register unsigned short x= *((uint16_t *)s);
*((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
}
}
/*
 * Convert RGB16 (RRRRRGGG GGGBBBBB) to RGB15 (xRRRRRGG GGGBBBBB):
 * blue stays put, red+green shift down one bit (dropping green's LSB).
 * Computed as ((x >> 1) & red+green mask) | (x & blue mask).
 * src_size is in bytes.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
{
register const uint8_t* s=src;
register uint8_t* d=dst;
register const uint8_t *end;
const uint8_t *mm_end;
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*s));
/* mm7 = mask15rg (red+green after >>1), mm6 = mask15b (blue bits) */
__asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
__asm __volatile("movq %0, %%mm6"::"m"(mask15b));
mm_end = end - 15;
while (s<mm_end)
{
__asm __volatile(
PREFETCH" 32%1 \n\t"
"movq %1, %%mm0 \n\t"
"movq 8%1, %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlq $1, %%mm0 \n\t"
"psrlq $1, %%mm2 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm2 \n\t"
"pand %%mm6, %%mm1 \n\t"
"pand %%mm6, %%mm3 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm3, %%mm2 \n\t"
MOVNTQ" %%mm0, %0 \n\t"
MOVNTQ" %%mm2, 8%0"
:"=m"(*d)
:"m"(*s)
);
d+=16;
s+=16;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
/* scalar path: two pixels at a time via 32-bit loads */
mm_end = end - 3;
while (s < mm_end)
{
register uint32_t x= *((uint32_t *)s);
*((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
s+=4;
d+=4;
}
/* at most one 16-bit pixel can remain */
if (s < end)
{
register uint16_t x= *((uint16_t *)s);
*((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
s+=2;
d+=2;
}
}
/*
 * Convert 32-bit RGB (B,G,R,X byte order on little-endian) to RGB565.
 * src_size is the number of SOURCE bytes.  Two MMX variants exist:
 * the enabled one packs via pmaddwd with precomputed multipliers
 * (mask3216g/mask3216br/mul3216, defined in the including file); the
 * disabled one uses the classic shift-and-mask sequence.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
#ifdef HAVE_MMX
/* stop 15 bytes early: each MMX iteration reads 16 source bytes */
mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
asm volatile(
"movq %3, %%mm5 \n\t"
"movq %4, %%mm6 \n\t"
"movq %5, %%mm7 \n\t"
"jmp 2f \n\t"
ASMALIGN(4)
"1: \n\t"
PREFETCH" 32(%1) \n\t"
"movd (%1), %%mm0 \n\t"
"movd 4(%1), %%mm3 \n\t"
"punpckldq 8(%1), %%mm0 \n\t"
"punpckldq 12(%1), %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm3, %%mm4 \n\t"
/* pmaddwd with mul3216 folds blue and red into their 565 positions
   in one multiply-add; green is merged separately via mask3216g */
"pand %%mm6, %%mm0 \n\t"
"pand %%mm6, %%mm3 \n\t"
"pmaddwd %%mm7, %%mm0 \n\t"
"pmaddwd %%mm7, %%mm3 \n\t"
"pand %%mm5, %%mm1 \n\t"
"pand %%mm5, %%mm4 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"psrld $5, %%mm0 \n\t"
"pslld $11, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, (%0) \n\t"
"add $16, %1 \n\t"
"add $8, %0 \n\t"
"2: \n\t"
"cmp %2, %1 \n\t"
" jb 1b \n\t"
: "+r" (d), "+r"(s)
: "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
);
#else
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm __volatile(
"movq %0, %%mm7 \n\t"
"movq %1, %%mm6 \n\t"
::"m"(red_16mask),"m"(green_16mask));
while (s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1 \n\t"
"movd %1, %%mm0 \n\t"
"movd 4%1, %%mm3 \n\t"
"punpckldq 8%1, %%mm0 \n\t"
"punpckldq 12%1, %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm3, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
/* blue: >>3, green: >>5, red: >>8, then OR the three fields together */
"psrlq $3, %%mm0 \n\t"
"psrlq $3, %%mm3 \n\t"
"pand %2, %%mm0 \n\t"
"pand %2, %%mm3 \n\t"
"psrlq $5, %%mm1 \n\t"
"psrlq $5, %%mm4 \n\t"
"pand %%mm6, %%mm1 \n\t"
"pand %%mm6, %%mm4 \n\t"
"psrlq $8, %%mm2 \n\t"
"psrlq $8, %%mm5 \n\t"
"pand %%mm7, %%mm2 \n\t"
"pand %%mm7, %%mm5 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm5, %%mm3 \n\t"
"psllq $16, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, %0 \n\t"
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
d += 4;
s += 16;
}
#endif
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
/* scalar tail: 5 bits blue | 6 bits green | 5 bits red */
while (s < end)
{
register int rgb = *(uint32_t*)s; s += 4;
*d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
}
}
/*
 * Convert 32-bit RGB to BGR565 (red and blue swapped relative to
 * rgb32to16).  src_size is the number of SOURCE bytes.  MMX path handles
 * 4 pixels per iteration; the C loop converts the remainder.
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
/* mm7 = red_16mask, mm6 = green_16mask (from the including file) */
__asm __volatile(
"movq %0, %%mm7 \n\t"
"movq %1, %%mm6 \n\t"
::"m"(red_16mask),"m"(green_16mask));
mm_end = end - 15;
while (s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1 \n\t"
"movd %1, %%mm0 \n\t"
"movd 4%1, %%mm3 \n\t"
"punpckldq 8%1, %%mm0 \n\t"
"punpckldq 12%1, %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm3, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
/* swapped layout: low byte <<8 into the red slot, high byte >>19 into blue */
"psllq $8, %%mm0 \n\t"
"psllq $8, %%mm3 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm3 \n\t"
"psrlq $5, %%mm1 \n\t"
"psrlq $5, %%mm4 \n\t"
"pand %%mm6, %%mm1 \n\t"
"pand %%mm6, %%mm4 \n\t"
"psrlq $19, %%mm2 \n\t"
"psrlq $19, %%mm5 \n\t"
"pand %2, %%mm2 \n\t"
"pand %2, %%mm5 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm5, %%mm3 \n\t"
"psllq $16, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, %0 \n\t"
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
d += 4;
s += 16;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
/* scalar tail: same channel swap as the MMX path */
while (s < end)
{
register int rgb = *(uint32_t*)s; s += 4;
*d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
}
}
/*
 * Convert 32-bit RGB to RGB555.  Mirrors rgb32to16 but produces 5 bits
 * of green (shift counts 6/9/10 instead of 5/8/11) and uses the 15-bit
 * mask/multiplier constants (mask3215g, mul3215, red_15mask, ...).
 * src_size is the number of SOURCE bytes.
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
#ifdef HAVE_MMX
/* stop 15 bytes early: each MMX iteration reads 16 source bytes */
mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
asm volatile(
"movq %3, %%mm5 \n\t"
"movq %4, %%mm6 \n\t"
"movq %5, %%mm7 \n\t"
"jmp 2f \n\t"
ASMALIGN(4)
"1: \n\t"
PREFETCH" 32(%1) \n\t"
"movd (%1), %%mm0 \n\t"
"movd 4(%1), %%mm3 \n\t"
"punpckldq 8(%1), %%mm0 \n\t"
"punpckldq 12(%1), %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm3, %%mm4 \n\t"
/* pmaddwd folds blue/red into 555 positions; green merged via mask3215g */
"pand %%mm6, %%mm0 \n\t"
"pand %%mm6, %%mm3 \n\t"
"pmaddwd %%mm7, %%mm0 \n\t"
"pmaddwd %%mm7, %%mm3 \n\t"
"pand %%mm5, %%mm1 \n\t"
"pand %%mm5, %%mm4 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"psrld $6, %%mm0 \n\t"
"pslld $10, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, (%0) \n\t"
"add $16, %1 \n\t"
"add $8, %0 \n\t"
"2: \n\t"
"cmp %2, %1 \n\t"
" jb 1b \n\t"
: "+r" (d), "+r"(s)
: "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
);
#else
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm __volatile(
"movq %0, %%mm7 \n\t"
"movq %1, %%mm6 \n\t"
::"m"(red_15mask),"m"(green_15mask));
while (s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1 \n\t"
"movd %1, %%mm0 \n\t"
"movd 4%1, %%mm3 \n\t"
"punpckldq 8%1, %%mm0 \n\t"
"punpckldq 12%1, %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm3, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
/* blue: >>3, green: >>6, red: >>9 into 555 positions */
"psrlq $3, %%mm0 \n\t"
"psrlq $3, %%mm3 \n\t"
"pand %2, %%mm0 \n\t"
"pand %2, %%mm3 \n\t"
"psrlq $6, %%mm1 \n\t"
"psrlq $6, %%mm4 \n\t"
"pand %%mm6, %%mm1 \n\t"
"pand %%mm6, %%mm4 \n\t"
"psrlq $9, %%mm2 \n\t"
"psrlq $9, %%mm5 \n\t"
"pand %%mm7, %%mm2 \n\t"
"pand %%mm7, %%mm5 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm5, %%mm3 \n\t"
"psllq $16, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, %0 \n\t"
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
d += 4;
s += 16;
}
#endif
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
/* scalar tail: 5 bits blue | 5 bits green | 5 bits red */
while (s < end)
{
register int rgb = *(uint32_t*)s; s += 4;
*d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
}
}
/*
 * Convert 32-bit RGB to BGR555 (red and blue swapped relative to
 * rgb32to15).  src_size is the number of SOURCE bytes.  MMX path handles
 * 4 pixels per iteration; the C loop converts the remainder.
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
/* mm7 = red_15mask, mm6 = green_15mask (from the including file) */
__asm __volatile(
"movq %0, %%mm7 \n\t"
"movq %1, %%mm6 \n\t"
::"m"(red_15mask),"m"(green_15mask));
mm_end = end - 15;
while (s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1 \n\t"
"movd %1, %%mm0 \n\t"
"movd 4%1, %%mm3 \n\t"
"punpckldq 8%1, %%mm0 \n\t"
"punpckldq 12%1, %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm3, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
/* swapped layout: low byte <<7 into the red slot, high byte >>19 into blue */
"psllq $7, %%mm0 \n\t"
"psllq $7, %%mm3 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm3 \n\t"
"psrlq $6, %%mm1 \n\t"
"psrlq $6, %%mm4 \n\t"
"pand %%mm6, %%mm1 \n\t"
"pand %%mm6, %%mm4 \n\t"
"psrlq $19, %%mm2 \n\t"
"psrlq $19, %%mm5 \n\t"
"pand %2, %%mm2 \n\t"
"pand %2, %%mm5 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm5, %%mm3 \n\t"
"psllq $16, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, %0 \n\t"
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
d += 4;
s += 16;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
/* scalar tail: same channel swap as the MMX path */
while (s < end)
{
register int rgb = *(uint32_t*)s; s += 4;
*d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
}
}
/*
 * Convert packed 24-bit RGB (B,G,R byte order in memory) to RGB565.
 * src_size is the number of SOURCE bytes (multiple of 3 for whole
 * pixels).  MMX path converts 4 pixels (12 src bytes) per iteration;
 * the C loop handles the tail byte-by-byte.
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
/* mm7 = red_16mask, mm6 = green_16mask (from the including file) */
__asm __volatile(
"movq %0, %%mm7 \n\t"
"movq %1, %%mm6 \n\t"
::"m"(red_16mask),"m"(green_16mask));
/* stop 11 bytes early: each MMX iteration reads 12 source bytes */
mm_end = end - 11;
while (s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1 \n\t"
/* gather 4 pixels from 3-byte strides into two 2-pixel qwords */
"movd %1, %%mm0 \n\t"
"movd 3%1, %%mm3 \n\t"
"punpckldq 6%1, %%mm0 \n\t"
"punpckldq 9%1, %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm3, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
/* blue: >>3, green: >>5, red: >>8 into 565 positions */
"psrlq $3, %%mm0 \n\t"
"psrlq $3, %%mm3 \n\t"
"pand %2, %%mm0 \n\t"
"pand %2, %%mm3 \n\t"
"psrlq $5, %%mm1 \n\t"
"psrlq $5, %%mm4 \n\t"
"pand %%mm6, %%mm1 \n\t"
"pand %%mm6, %%mm4 \n\t"
"psrlq $8, %%mm2 \n\t"
"psrlq $8, %%mm5 \n\t"
"pand %%mm7, %%mm2 \n\t"
"pand %%mm7, %%mm5 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm5, %%mm3 \n\t"
"psllq $16, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, %0 \n\t"
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
d += 4;
s += 12;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
/* scalar tail: bytes arrive in b,g,r order */
while (s < end)
{
const int b = *s++;
const int g = *s++;
const int r = *s++;
*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
}
}
/*
 * Convert packed 24-bit RGB to BGR565 (red/blue swapped relative to
 * rgb24to16): bytes are read in r,g,b order in the scalar loop, and the
 * MMX path shifts the low byte up into the red field.  src_size is the
 * number of SOURCE bytes.
 * NOTE(review): mm_end is computed as end - 15 although each iteration
 * reads only 12 bytes (rgb24to16 uses end - 11) — presumably a
 * conservative bound kept from the original; verify before changing.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
/* mm7 = red_16mask, mm6 = green_16mask (from the including file) */
__asm __volatile(
"movq %0, %%mm7 \n\t"
"movq %1, %%mm6 \n\t"
::"m"(red_16mask),"m"(green_16mask));
mm_end = end - 15;
while (s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1 \n\t"
/* gather 4 pixels from 3-byte strides into two 2-pixel qwords */
"movd %1, %%mm0 \n\t"
"movd 3%1, %%mm3 \n\t"
"punpckldq 6%1, %%mm0 \n\t"
"punpckldq 9%1, %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm3, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
/* swapped layout: low byte <<8 into the red slot, high byte >>19 into blue */
"psllq $8, %%mm0 \n\t"
"psllq $8, %%mm3 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm3 \n\t"
"psrlq $5, %%mm1 \n\t"
"psrlq $5, %%mm4 \n\t"
"pand %%mm6, %%mm1 \n\t"
"pand %%mm6, %%mm4 \n\t"
"psrlq $19, %%mm2 \n\t"
"psrlq $19, %%mm5 \n\t"
"pand %2, %%mm2 \n\t"
"pand %2, %%mm5 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm5, %%mm3 \n\t"
"psllq $16, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, %0 \n\t"
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
d += 4;
s += 12;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
/* scalar tail: bytes arrive in r,g,b order (swapped vs rgb24to16) */
while (s < end)
{
const int r = *s++;
const int g = *s++;
const int b = *s++;
*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
}
}
/*
 * Pack 24-bit pixels into 15-bit 5:5:5 words. The scalar loop reads the
 * source bytes as b,g,r and places the third byte in the top (<<7) field.
 * src_size is the source size in bytes; 3 input bytes yield one uint16_t.
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7 = red_15mask, mm6 = green_15mask; blue_15mask arrives as %2. */
    __asm __volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11; /* each iteration consumes exactly 12 bytes */
    while (s < mm_end)
    {
        /* Convert 4 pixels (12 source bytes) into 4 packed 5:5:5 words. */
        __asm __volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and the whole conversion when MMX is unavailable). */
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
/*
 * Pack 24-bit pixels into 15-bit 5:5:5 words with the FIRST source byte
 * placed in the top (<<7) field — the byte-order mirror of rgb24to15
 * (scalar loop reads r,g,b instead of b,g,r).
 * src_size is the source size in bytes; 3 input bytes yield one uint16_t.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7 = red_15mask, mm6 = green_15mask; blue_15mask arrives as %2. */
    __asm __volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15; /* conservative bound; each iteration consumes 12 bytes */
    while (s < mm_end)
    {
        /* Convert 4 pixels (12 source bytes) into 4 packed 5:5:5 words. */
        __asm __volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and the whole conversion when MMX is unavailable). */
    while (s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
  890. /*
  891. I use less accurate approximation here by simply left-shifting the input
  892. value and filling the low order bits with zeroes. This method improves PNG
  893. compression but this scheme cannot reproduce white exactly, since it does
  894. not generate an all-ones maximum value; the net effect is to darken the
  895. image slightly.
  896. The better method should be "left bit replication":
  897. 4 3 2 1 0
  898. ---------
  899. 1 1 0 1 1
  900. 7 6 5 4 3 2 1 0
  901. ----------------
  902. 1 1 0 1 1 1 1 0
  903. |=======| |===|
  904. | Leftmost Bits Repeated to Fill Open Bits
  905. |
  906. Original Bits
  907. */
/*
 * Expand 15-bit 5:5:5 pixels to packed 24-bit pixels, 3 bytes per pixel,
 * low field first (see the scalar loop: bits 0-4 -> byte 0, bits 5-9 ->
 * byte 1, bits 10-14 -> byte 2). Each 5-bit value is left-shifted by 3
 * with zero fill — see the accuracy note in the comment above this
 * function. src_size is the source size in bytes (2 bytes per pixel).
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7; /* each iteration consumes 8 pixels (16 bytes) */
    while (s < mm_end)
    {
        /* Stage 1: unpack 8 pixels into byte-per-channel dwords.
           Pixels 0-3 end up in mm6/mm7, pixels 4-7 in mm0/mm3. */
        __asm __volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"
            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* Stage 2: squeeze the 32 bytes (4-byte pixels) down to 24 bytes,
           reusing the file's 32->24 repack sequence. Register state from
           stage 1 is carried across the asm boundary deliberately. */
        /* Borrowed 32 to 24 */
        __asm __volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"
            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and the whole conversion when MMX is unavailable). */
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
/*
 * Expand 16-bit 5:6:5 pixels to packed 24-bit pixels, 3 bytes per pixel,
 * low field first (scalar loop: bits 0-4 -> byte 0, bits 5-10 -> byte 1,
 * bits 11-15 -> byte 2). Channel values are left-shifted with zero fill,
 * as in rgb15to24. src_size is the source size in bytes (2 per pixel).
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7; /* each iteration consumes 8 pixels (16 bytes) */
    while (s < mm_end)
    {
        /* Stage 1: unpack 8 pixels into byte-per-channel dwords.
           Pixels 0-3 end up in mm6/mm7, pixels 4-7 in mm0/mm3.
           Note the $3/$8 shifts for the 6-bit green / 5-bit red fields,
           versus $2/$7 in the 15-bit variant. */
        __asm __volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"
            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* Stage 2: squeeze 32 bytes (4-byte pixels) down to 24 bytes.
           MMX register state is carried across the asm boundary. */
        /* Borrowed 32 to 24 */
        __asm __volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"
            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and the whole conversion when MMX is unavailable). */
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
/*
 * Expand 15-bit 5:5:5 pixels to 32-bit pixels (4 bytes per pixel, the
 * fourth byte written as 0; byte order flips under WORDS_BIGENDIAN).
 * Channel values are left-shifted with zero fill, as in rgb15to24.
 * src_size is the source size in bytes (2 per pixel).
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); /* mm7 = 0, used for zero-extension */
    mm_end = end - 3; /* each iteration consumes 4 pixels (8 bytes) */
    while (s < mm_end)
    {
        /* Isolate the three 5-bit fields, scale each to 8 bits, then
           interleave with zeros to produce 4-byte pixels. */
        __asm __volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %%mm7, %%mm0 \n\t"
            "punpcklwd %%mm7, %%mm1 \n\t"
            "punpcklwd %%mm7, %%mm2 \n\t"
            "punpckhwd %%mm7, %%mm3 \n\t"
            "punpckhwd %%mm7, %%mm4 \n\t"
            "punpckhwd %%mm7, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm3, 8%0 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and the whole conversion when MMX is unavailable). */
    while (s < end)
    {
#if 0 //slightly slower on Athlon
        int bgr= *s++;
        *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
        register uint16_t bgr;
        bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 0;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 0;
#endif
#endif
    }
}
/*
 * Expand 16-bit 5:6:5 pixels to 32-bit pixels (4 bytes per pixel, the
 * fourth byte written as 0; byte order flips under WORDS_BIGENDIAN).
 * Channel values are left-shifted with zero fill.
 * src_size is the source size in bytes (2 per pixel).
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); /* mm7 = 0, used for zero-extension */
    mm_end = end - 3; /* each iteration consumes 4 pixels (8 bytes) */
    while (s < mm_end)
    {
        /* Isolate the 5/6/5-bit fields, scale to 8 bits, then interleave
           with zeros to produce 4-byte pixels. Shifts $3/$8 differ from
           the 15-bit variant because green is 6 bits here. */
        __asm __volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %%mm7, %%mm0 \n\t"
            "punpcklwd %%mm7, %%mm1 \n\t"
            "punpcklwd %%mm7, %%mm2 \n\t"
            "punpckhwd %%mm7, %%mm3 \n\t"
            "punpckhwd %%mm7, %%mm4 \n\t"
            "punpckhwd %%mm7, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm3, 8%0 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and the whole conversion when MMX is unavailable). */
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 0;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 0;
#endif
    }
}
/*
 * Swap the first and third byte of every 32-bit pixel (R<->B), leaving
 * bytes 1 and 3 in place. Uses a negative-index loop: idx starts at
 * 15 - src_size (negative for src_size > 15) and both pointers are
 * pre-biased by -idx, so the asm loop runs until idx becomes
 * non-negative; the C loop below finishes whatever remains.
 * NOTE(review): the 15 (not 16) bias appears to tolerate sizes that are
 * not a multiple of 16 — confirm against callers before changing.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    long idx = 15 - src_size;
    uint8_t *s = (uint8_t *) src-idx, *d = dst-idx;
#ifdef HAVE_MMX
    __asm __volatile(
        /* Skip the SIMD loop entirely when there are fewer than 16 bytes. */
        "test %0, %0 \n\t"
        "jns 2f \n\t"
        PREFETCH" (%1, %0) \n\t"
        /* Build the two complementary byte masks in mm6/mm7 from
           mask32b/mask32r/mmx_one. */
        "movq %3, %%mm7 \n\t"
        "pxor %4, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "pxor %5, %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1, %0) \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
# ifdef HAVE_MMX2
        /* pshufw $177 swaps bytes 0<->2 within each 32-bit half. */
        "pshufw $177, %%mm0, %%mm3 \n\t"
        "pshufw $177, %%mm1, %%mm5 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# else
        /* Plain-MMX fallback: isolate R/B bytes and shift them past
           each other with 16-bit dword shifts. */
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "pslld $16, %%mm2 \n\t"
        "psrld $16, %%mm3 \n\t"
        "pslld $16, %%mm4 \n\t"
        "psrld $16, %%mm5 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# endif
        MOVNTQ" %%mm0, (%2, %0) \n\t"
        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "js 1b \n\t"
        SFENCE" \n\t"
        EMMS" \n\t"
        "2: \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
#endif
    /* Scalar tail: swap R and B of one pixel per iteration via masking. */
    for (; idx<15; idx+=4) {
        register int v = *(uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
/*
 * Swap the first and third byte of every 24-bit pixel (R<->B) in place
 * order: dst[i] = src[i+2], dst[i+1] = src[i+1], dst[i+2] = src[i].
 * The MMX path handles 8 pixels (24 bytes) per iteration using the
 * mask24r/g/b constants and a negative loop counter in REG_a; the C
 * loop below handles the remainder (or everything without MMX).
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
#ifdef HAVE_MMX
    long mmx_size= 23 - src_size; /* negative while >= 24 bytes remain */
    asm volatile (
        "test %%"REG_a", %%"REG_a" \n\t"
        "jns 2f \n\t"
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1, %%"REG_a") \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
        "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
        "add $24, %%"REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
    if (mmx_size==23) return; //finished, was multiple of 8 pixels (24 bytes)
    /* Rewind both pointers to the first unprocessed pixel for the tail. */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    /* Scalar tail: swap bytes 0 and 2 of each 3-byte pixel. */
    for (i=0; i<src_size; i+=3)
    {
        register uint8_t x;
        x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
/*
 * Interleave planar YUV into packed YUYV (yuy2). Each output line holds
 * width luma samples and chromWidth (= width/2) U/V pairs. The chroma
 * planes advance only every vertLumPerChroma luma lines (2 for yv12,
 * 1 for yuv422p). Strides are in bytes.
 *
 * Paths: MMX (16 pixels per inner iteration), Alpha/MVI (processes TWO
 * lines per y iteration — note the extra y++/pointer advances inside),
 * generic 64-bit, and a plain 32-bit fallback.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const long chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        /* punpcklbw/punpckhbw interleave 8 Y with 4 U + 4 V into YUYV. */
        asm volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else
#if defined ARCH_ALPHA && defined HAVE_MVI
        /* Alpha MVI: unpkbw/unpkbl widen bytes so Y, U, V can be merged
           with shifts and adds; two lines are emitted per macro batch. */
#define pl2yuy2(n) \
    y1 = yc[n]; \
    y2 = yc2[n]; \
    u = uc[n]; \
    v = vc[n]; \
    asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
    asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
    asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
    asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
    yuv1 = (u << 8) + (v << 24); \
    yuv2 = yuv1 + y2; \
    yuv1 += y1; \
    qdst[n] = yuv1; \
    qdst2[n] = yuv2;
        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for (i = 0; i < chromWidth; i += 8){
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            asm("ldq $31,64(%0)" :: "r"(yc));
            asm("ldq $31,64(%0)" :: "r"(yc2));
            asm("ldq $31,64(%0)" :: "r"(uc));
            asm("ldq $31,64(%0)" :: "r"(vc));
            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);
            yc += 4;
            yc2 += 4;
            uc += 4;
            vc += 4;
            qdst += 4;
            qdst2 += 4;
        }
        /* This path wrote two lines, so skip the second one here. */
        y++;
        ysrc += lumStride;
        dst += dstStride;
#elif __WORDSIZE >= 64
        /* Build one 64-bit word (two YUYV pairs) per iteration. */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }
#else
        /* Portable 32-bit path: one Y,U,Y,V quad per iteration. */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                (yc[1] << 8) + (vc[0] << 0);
#else
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* Advance chroma only on the last luma line of each chroma row. */
        if ((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}
  1579. /**
  1580. * Height should be a multiple of 2 and width should be a multiple of 16 (if
  1581. * this is a problem for anyone then tell me, and I will fix it).
  1582. */
  1583. static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1584. long width, long height,
  1585. long lumStride, long chromStride, long dstStride)
  1586. {
  1587. //FIXME interpolate chroma
  1588. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1589. }
/*
 * Interleave planar YUV into packed UYVY — identical structure to
 * yuvPlanartoyuy2 but with chroma bytes first in each pair (compare the
 * scalar loops: uc[0] occupies the low byte here instead of yc[0]).
 * The chroma planes advance only every vertLumPerChroma luma lines.
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const long chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        /* Same interleave as the yuy2 variant, with the punpck operand
           order reversed so UV bytes come first in each pair. */
        asm volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // Y(0)
            "movq %%mm2, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
            MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else
//FIXME adapt the Alpha ASM code from yv12->yuy2
#if __WORDSIZE >= 64
        /* Build one 64-bit word (two UYVY pairs) per iteration. */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
            l = uc[1] + (yc[2] << 8) +
                (vc[1] << 16) + (yc[3] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }
#else
        /* Portable 32-bit path: one U,Y,V,Y quad per iteration. */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                (vc[0] << 8) + (yc[1] << 0);
#else
            *idst++ = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* Advance chroma only on the last luma line of each chroma row. */
        if ((y&(vertLumPerChroma-1))==(vertLumPerChroma-1))
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}
  1678. /**
  1679. * Height should be a multiple of 2 and width should be a multiple of 16 (if
  1680. * this is a problem for anyone then tell me, and I will fix it).
  1681. */
  1682. static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1683. long width, long height,
  1684. long lumStride, long chromStride, long dstStride)
  1685. {
  1686. //FIXME interpolate chroma
  1687. RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1688. }
  1689. /**
  1690. * Width should be a multiple of 16.
  1691. */
  1692. static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1693. long width, long height,
  1694. long lumStride, long chromStride, long dstStride)
  1695. {
  1696. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
  1697. }
  1698. /**
  1699. * Height should be a multiple of 2 and width should be a multiple of 16 (if
  1700. * this is a problem for anyone then tell me, and I will fix it).
  1701. */
  1702. static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1703. long width, long height,
  1704. long lumStride, long chromStride, long srcStride)
  1705. {
  1706. long y;
  1707. const long chromWidth= width>>1;
  1708. for (y=0; y<height; y+=2)
  1709. {
  1710. #ifdef HAVE_MMX
  1711. asm volatile(
  1712. "xor %%"REG_a", %%"REG_a" \n\t"
  1713. "pcmpeqw %%mm7, %%mm7 \n\t"
  1714. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1715. ASMALIGN(4)
  1716. "1: \n\t"
  1717. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1718. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1719. "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
  1720. "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
  1721. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
  1722. "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
  1723. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
  1724. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1725. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1726. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1727. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1728. MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
  1729. "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
  1730. "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
  1731. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
  1732. "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
  1733. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
  1734. "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
  1735. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1736. "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1737. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1738. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1739. MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
  1740. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1741. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1742. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1743. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1744. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1745. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1746. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1747. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1748. MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
  1749. MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
  1750. "add $8, %%"REG_a" \n\t"
  1751. "cmp %4, %%"REG_a" \n\t"
  1752. " jb 1b \n\t"
  1753. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1754. : "memory", "%"REG_a
  1755. );
  1756. ydst += lumStride;
  1757. src += srcStride;
  1758. asm volatile(
  1759. "xor %%"REG_a", %%"REG_a" \n\t"
  1760. ASMALIGN(4)
  1761. "1: \n\t"
  1762. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1763. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1764. "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
  1765. "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
  1766. "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
  1767. "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1768. "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1769. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1770. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1771. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1772. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1773. MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
  1774. MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
  1775. "add $8, %%"REG_a" \n\t"
  1776. "cmp %4, %%"REG_a" \n\t"
  1777. " jb 1b \n\t"
  1778. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1779. : "memory", "%"REG_a
  1780. );
  1781. #else
  1782. long i;
  1783. for (i=0; i<chromWidth; i++)
  1784. {
  1785. ydst[2*i+0] = src[4*i+0];
  1786. udst[i] = src[4*i+1];
  1787. ydst[2*i+1] = src[4*i+2];
  1788. vdst[i] = src[4*i+3];
  1789. }
  1790. ydst += lumStride;
  1791. src += srcStride;
  1792. for (i=0; i<chromWidth; i++)
  1793. {
  1794. ydst[2*i+0] = src[4*i+0];
  1795. ydst[2*i+1] = src[4*i+2];
  1796. }
  1797. #endif
  1798. udst += chromStride;
  1799. vdst += chromStride;
  1800. ydst += lumStride;
  1801. src += srcStride;
  1802. }
  1803. #ifdef HAVE_MMX
  1804. asm volatile( EMMS" \n\t"
  1805. SFENCE" \n\t"
  1806. :::"memory");
  1807. #endif
  1808. }
  1809. static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
  1810. uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1811. long width, long height, long lumStride, long chromStride)
  1812. {
  1813. /* Y Plane */
  1814. memcpy(ydst, ysrc, width*height);
  1815. /* XXX: implement upscaling for U,V */
  1816. }
/**
 * Upscale a single plane by 2x in both directions using bilinear-style
 * 3:1 weighting ((3*a + b) / 4) between neighbouring samples.
 *
 * First and last output lines are produced from a single source line;
 * interior line pairs blend two source lines.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    long x,y;

    dst[0]= src[0];

    // first line: horizontal-only interpolation
    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        const long mmxSize= srcWidth&~15;  // SIMD handles multiples of 16
        /* Loop runs with a negative index (REG_a = -mmxSize) counting up to 0.
           Two chained PAVGBs approximate the (3*a + b + 2)/4 weighting. */
        asm volatile(
            "mov %4, %%"REG_a"              \n\t"
            "1:                             \n\t"
            "movq (%0, %%"REG_a"), %%mm0    \n\t"
            "movq (%1, %%"REG_a"), %%mm1    \n\t"
            "movq 1(%0, %%"REG_a"), %%mm2   \n\t"
            "movq 1(%1, %%"REG_a"), %%mm3   \n\t"
            "movq -1(%0, %%"REG_a"), %%mm4  \n\t"
            "movq -1(%1, %%"REG_a"), %%mm5  \n\t"
            PAVGB" %%mm0, %%mm5             \n\t"
            PAVGB" %%mm0, %%mm3             \n\t"
            PAVGB" %%mm0, %%mm5             \n\t"
            PAVGB" %%mm0, %%mm3             \n\t"
            PAVGB" %%mm1, %%mm4             \n\t"
            PAVGB" %%mm1, %%mm2             \n\t"
            PAVGB" %%mm1, %%mm4             \n\t"
            PAVGB" %%mm1, %%mm2             \n\t"
            "movq %%mm5, %%mm7              \n\t"
            "movq %%mm4, %%mm6              \n\t"
            // interleave the two horizontal phases into the output rows
            "punpcklbw %%mm3, %%mm5         \n\t"
            "punpckhbw %%mm3, %%mm7         \n\t"
            "punpcklbw %%mm2, %%mm4         \n\t"
            "punpckhbw %%mm2, %%mm6         \n\t"
#if 1
            MOVNTQ" %%mm5, (%2, %%"REG_a", 2)   \n\t"
            MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
            MOVNTQ" %%mm4, (%3, %%"REG_a", 2)   \n\t"
            MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#else
            "movq %%mm5, (%2, %%"REG_a", 2)     \n\t"
            "movq %%mm7, 8(%2, %%"REG_a", 2)    \n\t"
            "movq %%mm4, (%3, %%"REG_a", 2)     \n\t"
            "movq %%mm6, 8(%3, %%"REG_a", 2)    \n\t"
#endif
            "add $8, %%"REG_a"              \n\t"
            " js 1b                         \n\t"
            :: "r" (src + mmxSize      ), "r" (src + srcStride + mmxSize      ),
               "r" (dst + mmxSize*2    ), "r" (dst + dstStride + mmxSize*2    ),
               "g" (-mmxSize)
            : "%"REG_a
        );
#else
        const long mmxSize=1;  // scalar path covers the whole row below
#endif
        // left edge: vertical-only interpolation
        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

        /* Scalar tail (and edge columns for the SIMD path).
           NOTE(review): the pairing of x / x+1 with srcStride / srcStride+1
           across these four taps is asymmetric; it matches the historical
           behaviour of this file, so it is documented, not altered. */
        for (x=mmxSize-1; x<srcWidth-1; x++){
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        // right edge: vertical-only interpolation
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line: horizontal-only interpolation (same as the first line)
#if 1
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    // simpler alternative: pixel doubling, kept for reference
    for (x=0; x<srcWidth; x++){
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif

#ifdef HAVE_MMX
    /* Leave MMX state and flush non-temporal stores. */
    asm volatile( EMMS"   \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}
  1908. /**
  1909. * Height should be a multiple of 2 and width should be a multiple of 16 (if
  1910. * this is a problem for anyone then tell me, and I will fix it).
  1911. * Chrominance data is only taken from every secound line, others are ignored.
  1912. * FIXME: Write HQ version.
  1913. */
  1914. static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1915. long width, long height,
  1916. long lumStride, long chromStride, long srcStride)
  1917. {
  1918. long y;
  1919. const long chromWidth= width>>1;
  1920. for (y=0; y<height; y+=2)
  1921. {
  1922. #ifdef HAVE_MMX
  1923. asm volatile(
  1924. "xorl %%eax, %%eax \n\t"
  1925. "pcmpeqw %%mm7, %%mm7 \n\t"
  1926. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1927. ASMALIGN(4)
  1928. "1: \n\t"
  1929. PREFETCH" 64(%0, %%eax, 4) \n\t"
  1930. "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
  1931. "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
  1932. "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
  1933. "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
  1934. "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
  1935. "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
  1936. "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1937. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1938. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1939. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1940. MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
  1941. "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
  1942. "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
  1943. "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
  1944. "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
  1945. "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
  1946. "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
  1947. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1948. "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1949. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1950. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1951. MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
  1952. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1953. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1954. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1955. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1956. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1957. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1958. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1959. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1960. MOVNTQ" %%mm0, (%3, %%eax) \n\t"
  1961. MOVNTQ" %%mm2, (%2, %%eax) \n\t"
  1962. "addl $8, %%eax \n\t"
  1963. "cmpl %4, %%eax \n\t"
  1964. " jb 1b \n\t"
  1965. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1966. : "memory", "%eax"
  1967. );
  1968. ydst += lumStride;
  1969. src += srcStride;
  1970. asm volatile(
  1971. "xorl %%eax, %%eax \n\t"
  1972. ASMALIGN(4)
  1973. "1: \n\t"
  1974. PREFETCH" 64(%0, %%eax, 4) \n\t"
  1975. "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
  1976. "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
  1977. "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
  1978. "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
  1979. "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1980. "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1981. "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1982. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1983. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1984. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1985. MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
  1986. MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
  1987. "addl $8, %%eax \n\t"
  1988. "cmpl %4, %%eax \n\t"
  1989. " jb 1b \n\t"
  1990. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1991. : "memory", "%eax"
  1992. );
  1993. #else
  1994. long i;
  1995. for (i=0; i<chromWidth; i++)
  1996. {
  1997. udst[i] = src[4*i+0];
  1998. ydst[2*i+0] = src[4*i+1];
  1999. vdst[i] = src[4*i+2];
  2000. ydst[2*i+1] = src[4*i+3];
  2001. }
  2002. ydst += lumStride;
  2003. src += srcStride;
  2004. for (i=0; i<chromWidth; i++)
  2005. {
  2006. ydst[2*i+0] = src[4*i+1];
  2007. ydst[2*i+1] = src[4*i+3];
  2008. }
  2009. #endif
  2010. udst += chromStride;
  2011. vdst += chromStride;
  2012. ydst += lumStride;
  2013. src += srcStride;
  2014. }
  2015. #ifdef HAVE_MMX
  2016. asm volatile( EMMS" \n\t"
  2017. SFENCE" \n\t"
  2018. :::"memory");
  2019. #endif
  2020. }
  2021. /**
  2022. * Height should be a multiple of 2 and width should be a multiple of 2 (if
  2023. * this is a problem for anyone then tell me, and I will fix it).
  2024. * Chrominance data is only taken from every secound line,
  2025. * others are ignored in the C version.
  2026. * FIXME: Write HQ version.
  2027. */
  2028. static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  2029. long width, long height,
  2030. long lumStride, long chromStride, long srcStride)
  2031. {
  2032. long y;
  2033. const long chromWidth= width>>1;
  2034. #ifdef HAVE_MMX
  2035. for (y=0; y<height-2; y+=2)
  2036. {
  2037. long i;
  2038. for (i=0; i<2; i++)
  2039. {
  2040. asm volatile(
  2041. "mov %2, %%"REG_a" \n\t"
  2042. "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
  2043. "movq "MANGLE(w1111)", %%mm5 \n\t"
  2044. "pxor %%mm7, %%mm7 \n\t"
  2045. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
  2046. ASMALIGN(4)
  2047. "1: \n\t"
  2048. PREFETCH" 64(%0, %%"REG_d") \n\t"
  2049. "movd (%0, %%"REG_d"), %%mm0 \n\t"
  2050. "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
  2051. "punpcklbw %%mm7, %%mm0 \n\t"
  2052. "punpcklbw %%mm7, %%mm1 \n\t"
  2053. "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
  2054. "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
  2055. "punpcklbw %%mm7, %%mm2 \n\t"
  2056. "punpcklbw %%mm7, %%mm3 \n\t"
  2057. "pmaddwd %%mm6, %%mm0 \n\t"
  2058. "pmaddwd %%mm6, %%mm1 \n\t"
  2059. "pmaddwd %%mm6, %%mm2 \n\t"
  2060. "pmaddwd %%mm6, %%mm3 \n\t"
  2061. #ifndef FAST_BGR2YV12
  2062. "psrad $8, %%mm0 \n\t"
  2063. "psrad $8, %%mm1 \n\t"
  2064. "psrad $8, %%mm2 \n\t"
  2065. "psrad $8, %%mm3 \n\t"
  2066. #endif
  2067. "packssdw %%mm1, %%mm0 \n\t"
  2068. "packssdw %%mm3, %%mm2 \n\t"
  2069. "pmaddwd %%mm5, %%mm0 \n\t"
  2070. "pmaddwd %%mm5, %%mm2 \n\t"
  2071. "packssdw %%mm2, %%mm0 \n\t"
  2072. "psraw $7, %%mm0 \n\t"
  2073. "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
  2074. "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
  2075. "punpcklbw %%mm7, %%mm4 \n\t"
  2076. "punpcklbw %%mm7, %%mm1 \n\t"
  2077. "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
  2078. "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
  2079. "punpcklbw %%mm7, %%mm2 \n\t"
  2080. "punpcklbw %%mm7, %%mm3 \n\t"
  2081. "pmaddwd %%mm6, %%mm4 \n\t"
  2082. "pmaddwd %%mm6, %%mm1 \n\t"
  2083. "pmaddwd %%mm6, %%mm2 \n\t"
  2084. "pmaddwd %%mm6, %%mm3 \n\t"
  2085. #ifndef FAST_BGR2YV12
  2086. "psrad $8, %%mm4 \n\t"
  2087. "psrad $8, %%mm1 \n\t"
  2088. "psrad $8, %%mm2 \n\t"
  2089. "psrad $8, %%mm3 \n\t"
  2090. #endif
  2091. "packssdw %%mm1, %%mm4 \n\t"
  2092. "packssdw %%mm3, %%mm2 \n\t"
  2093. "pmaddwd %%mm5, %%mm4 \n\t"
  2094. "pmaddwd %%mm5, %%mm2 \n\t"
  2095. "add $24, %%"REG_d" \n\t"
  2096. "packssdw %%mm2, %%mm4 \n\t"
  2097. "psraw $7, %%mm4 \n\t"
  2098. "packuswb %%mm4, %%mm0 \n\t"
  2099. "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
  2100. MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
  2101. "add $8, %%"REG_a" \n\t"
  2102. " js 1b \n\t"
  2103. : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
  2104. : "%"REG_a, "%"REG_d
  2105. );
  2106. ydst += lumStride;
  2107. src += srcStride;
  2108. }
  2109. src -= srcStride*2;
  2110. asm volatile(
  2111. "mov %4, %%"REG_a" \n\t"
  2112. "movq "MANGLE(w1111)", %%mm5 \n\t"
  2113. "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
  2114. "pxor %%mm7, %%mm7 \n\t"
  2115. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
  2116. "add %%"REG_d", %%"REG_d" \n\t"
  2117. ASMALIGN(4)
  2118. "1: \n\t"
  2119. PREFETCH" 64(%0, %%"REG_d") \n\t"
  2120. PREFETCH" 64(%1, %%"REG_d") \n\t"
  2121. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  2122. "movq (%0, %%"REG_d"), %%mm0 \n\t"
  2123. "movq (%1, %%"REG_d"), %%mm1 \n\t"
  2124. "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
  2125. "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
  2126. PAVGB" %%mm1, %%mm0 \n\t"
  2127. PAVGB" %%mm3, %%mm2 \n\t"
  2128. "movq %%mm0, %%mm1 \n\t"
  2129. "movq %%mm2, %%mm3 \n\t"
  2130. "psrlq $24, %%mm0 \n\t"
  2131. "psrlq $24, %%mm2 \n\t"
  2132. PAVGB" %%mm1, %%mm0 \n\t"
  2133. PAVGB" %%mm3, %%mm2 \n\t"
  2134. "punpcklbw %%mm7, %%mm0 \n\t"
  2135. "punpcklbw %%mm7, %%mm2 \n\t"
  2136. #else
  2137. "movd (%0, %%"REG_d"), %%mm0 \n\t"
  2138. "movd (%1, %%"REG_d"), %%mm1 \n\t"
  2139. "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
  2140. "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
  2141. "punpcklbw %%mm7, %%mm0 \n\t"
  2142. "punpcklbw %%mm7, %%mm1 \n\t"
  2143. "punpcklbw %%mm7, %%mm2 \n\t"
  2144. "punpcklbw %%mm7, %%mm3 \n\t"
  2145. "paddw %%mm1, %%mm0 \n\t"
  2146. "paddw %%mm3, %%mm2 \n\t"
  2147. "paddw %%mm2, %%mm0 \n\t"
  2148. "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
  2149. "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
  2150. "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
  2151. "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
  2152. "punpcklbw %%mm7, %%mm4 \n\t"
  2153. "punpcklbw %%mm7, %%mm1 \n\t"
  2154. "punpcklbw %%mm7, %%mm2 \n\t"
  2155. "punpcklbw %%mm7, %%mm3 \n\t"
  2156. "paddw %%mm1, %%mm4 \n\t"
  2157. "paddw %%mm3, %%mm2 \n\t"
  2158. "paddw %%mm4, %%mm2 \n\t"
  2159. "psrlw $2, %%mm0 \n\t"
  2160. "psrlw $2, %%mm2 \n\t"
  2161. #endif
  2162. "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
  2163. "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
  2164. "pmaddwd %%mm0, %%mm1 \n\t"
  2165. "pmaddwd %%mm2, %%mm3 \n\t"
  2166. "pmaddwd %%mm6, %%mm0 \n\t"
  2167. "pmaddwd %%mm6, %%mm2 \n\t"
  2168. #ifndef FAST_BGR2YV12
  2169. "psrad $8, %%mm0 \n\t"
  2170. "psrad $8, %%mm1 \n\t"
  2171. "psrad $8, %%mm2 \n\t"
  2172. "psrad $8, %%mm3 \n\t"
  2173. #endif
  2174. "packssdw %%mm2, %%mm0 \n\t"
  2175. "packssdw %%mm3, %%mm1 \n\t"
  2176. "pmaddwd %%mm5, %%mm0 \n\t"
  2177. "pmaddwd %%mm5, %%mm1 \n\t"
  2178. "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
  2179. "psraw $7, %%mm0 \n\t"
  2180. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  2181. "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
  2182. "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
  2183. "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
  2184. "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
  2185. PAVGB" %%mm1, %%mm4 \n\t"
  2186. PAVGB" %%mm3, %%mm2 \n\t"
  2187. "movq %%mm4, %%mm1 \n\t"
  2188. "movq %%mm2, %%mm3 \n\t"
  2189. "psrlq $24, %%mm4 \n\t"
  2190. "psrlq $24, %%mm2 \n\t"
  2191. PAVGB" %%mm1, %%mm4 \n\t"
  2192. PAVGB" %%mm3, %%mm2 \n\t"
  2193. "punpcklbw %%mm7, %%mm4 \n\t"
  2194. "punpcklbw %%mm7, %%mm2 \n\t"
  2195. #else
  2196. "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
  2197. "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
  2198. "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
  2199. "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
  2200. "punpcklbw %%mm7, %%mm4 \n\t"
  2201. "punpcklbw %%mm7, %%mm1 \n\t"
  2202. "punpcklbw %%mm7, %%mm2 \n\t"
  2203. "punpcklbw %%mm7, %%mm3 \n\t"
  2204. "paddw %%mm1, %%mm4 \n\t"
  2205. "paddw %%mm3, %%mm2 \n\t"
  2206. "paddw %%mm2, %%mm4 \n\t"
  2207. "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
  2208. "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
  2209. "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
  2210. "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
  2211. "punpcklbw %%mm7, %%mm5 \n\t"
  2212. "punpcklbw %%mm7, %%mm1 \n\t"
  2213. "punpcklbw %%mm7, %%mm2 \n\t"
  2214. "punpcklbw %%mm7, %%mm3 \n\t"
  2215. "paddw %%mm1, %%mm5 \n\t"
  2216. "paddw %%mm3, %%mm2 \n\t"
  2217. "paddw %%mm5, %%mm2 \n\t"
  2218. "movq "MANGLE(w1111)", %%mm5 \n\t"
  2219. "psrlw $2, %%mm4 \n\t"
  2220. "psrlw $2, %%mm2 \n\t"
  2221. #endif
  2222. "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
  2223. "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
  2224. "pmaddwd %%mm4, %%mm1 \n\t"
  2225. "pmaddwd %%mm2, %%mm3 \n\t"
  2226. "pmaddwd %%mm6, %%mm4 \n\t"
  2227. "pmaddwd %%mm6, %%mm2 \n\t"
  2228. #ifndef FAST_BGR2YV12
  2229. "psrad $8, %%mm4 \n\t"
  2230. "psrad $8, %%mm1 \n\t"
  2231. "psrad $8, %%mm2 \n\t"
  2232. "psrad $8, %%mm3 \n\t"
  2233. #endif
  2234. "packssdw %%mm2, %%mm4 \n\t"
  2235. "packssdw %%mm3, %%mm1 \n\t"
  2236. "pmaddwd %%mm5, %%mm4 \n\t"
  2237. "pmaddwd %%mm5, %%mm1 \n\t"
  2238. "add $24, %%"REG_d" \n\t"
  2239. "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
  2240. "psraw $7, %%mm4 \n\t"
  2241. "movq %%mm0, %%mm1 \n\t"
  2242. "punpckldq %%mm4, %%mm0 \n\t"
  2243. "punpckhdq %%mm4, %%mm1 \n\t"
  2244. "packsswb %%mm1, %%mm0 \n\t"
  2245. "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
  2246. "movd %%mm0, (%2, %%"REG_a") \n\t"
  2247. "punpckhdq %%mm0, %%mm0 \n\t"
  2248. "movd %%mm0, (%3, %%"REG_a") \n\t"
  2249. "add $4, %%"REG_a" \n\t"
  2250. " js 1b \n\t"
  2251. : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
  2252. : "%"REG_a, "%"REG_d
  2253. );
  2254. udst += chromStride;
  2255. vdst += chromStride;
  2256. src += srcStride*2;
  2257. }
  2258. asm volatile( EMMS" \n\t"
  2259. SFENCE" \n\t"
  2260. :::"memory");
  2261. #else
  2262. y=0;
  2263. #endif
  2264. for (; y<height; y+=2)
  2265. {
  2266. long i;
  2267. for (i=0; i<chromWidth; i++)
  2268. {
  2269. unsigned int b = src[6*i+0];
  2270. unsigned int g = src[6*i+1];
  2271. unsigned int r = src[6*i+2];
  2272. unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2273. unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
  2274. unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
  2275. udst[i] = U;
  2276. vdst[i] = V;
  2277. ydst[2*i] = Y;
  2278. b = src[6*i+3];
  2279. g = src[6*i+4];
  2280. r = src[6*i+5];
  2281. Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2282. ydst[2*i+1] = Y;
  2283. }
  2284. ydst += lumStride;
  2285. src += srcStride;
  2286. for (i=0; i<chromWidth; i++)
  2287. {
  2288. unsigned int b = src[6*i+0];
  2289. unsigned int g = src[6*i+1];
  2290. unsigned int r = src[6*i+2];
  2291. unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2292. ydst[2*i] = Y;
  2293. b = src[6*i+3];
  2294. g = src[6*i+4];
  2295. r = src[6*i+5];
  2296. Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2297. ydst[2*i+1] = Y;
  2298. }
  2299. udst += chromStride;
  2300. vdst += chromStride;
  2301. ydst += lumStride;
  2302. src += srcStride;
  2303. }
  2304. }
/**
 * Interleave two byte planes: dest[2i] = src1[i], dest[2i+1] = src2[i],
 * line by line (e.g. merging separate U and V planes into a UV plane).
 *
 * NOTE(review): the SIMD paths process floor(width/16)*16 bytes (loop bound
 * width-15) and assume width >= 16 — for width < 16 the unsigned `jb`
 * comparison against a negative bound would misbehave; confirm callers
 * guarantee this. The SSE2 path uses movdqa/movntdq, which require 16-byte
 * aligned src1 and dest — TODO confirm alignment guarantees at call sites.
 */
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                             long width, long height, long src1Stride,
                             long src2Stride, long dstStride){
    long h;

    for (h=0; h < height; h++)
    {
        long w;
#ifdef HAVE_MMX
#ifdef HAVE_SSE2
        /* 16 bytes of each source per iteration; the same 16 source bytes
           are loaded into xmm0 and xmm1 so punpckl/punpckh can produce the
           low and high interleaved halves. */
        asm(
            "xor %%"REG_a", %%"REG_a"       \n\t"
            "1:                             \n\t"
            PREFETCH" 64(%1, %%"REG_a")     \n\t"
            PREFETCH" 64(%2, %%"REG_a")     \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
            "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
            "punpcklbw %%xmm2, %%xmm0       \n\t"
            "punpckhbw %%xmm2, %%xmm1       \n\t"
            "movntdq %%xmm0, (%0, %%"REG_a", 2)     \n\t"
            "movntdq %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
            "add $16, %%"REG_a"             \n\t"
            "cmp %3, %%"REG_a"              \n\t"
            " jb 1b                         \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
            : "memory", "%"REG_a""
        );
#else
        /* MMX: 16 bytes of each source per iteration via two 8-byte pairs. */
        asm(
            "xor %%"REG_a", %%"REG_a"       \n\t"
            "1:                             \n\t"
            PREFETCH" 64(%1, %%"REG_a")     \n\t"
            PREFETCH" 64(%2, %%"REG_a")     \n\t"
            "movq (%1, %%"REG_a"), %%mm0    \n\t"
            "movq 8(%1, %%"REG_a"), %%mm2   \n\t"
            "movq %%mm0, %%mm1              \n\t"
            "movq %%mm2, %%mm3              \n\t"
            "movq (%2, %%"REG_a"), %%mm4    \n\t"
            "movq 8(%2, %%"REG_a"), %%mm5   \n\t"
            "punpcklbw %%mm4, %%mm0         \n\t"
            "punpckhbw %%mm4, %%mm1         \n\t"
            "punpcklbw %%mm5, %%mm2         \n\t"
            "punpckhbw %%mm5, %%mm3         \n\t"
            MOVNTQ" %%mm0, (%0, %%"REG_a", 2)   \n\t"
            MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)  \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a"             \n\t"
            "cmp %3, %%"REG_a"              \n\t"
            " jb 1b                         \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
            : "memory", "%"REG_a
        );
#endif
        // scalar tail for the last width % 16 bytes
        for (w= (width&(~15)); w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        // pure C fallback
        for (w=0; w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
#ifdef HAVE_MMX
    /* Leave MMX state and flush non-temporal stores. */
    asm(
        EMMS"   \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
#endif
}
/**
 * Upsample two quarter-size chroma planes (as in YVU9) to half-size
 * (as in YVU12/YV12) by pixel doubling: each sample is duplicated
 * horizontally (punpcklbw/punpckhbw with itself) and each source line is
 * used for two output lines (index y>>1).
 *
 * @param src1/src2   the two source chroma planes
 * @param dst1/dst2   the corresponding upscaled destination planes
 * @param width/height  dimensions of the *luma* plane; internally halved
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
{
    long y,x,w,h;
    w=width/2; h=height/2;  // output (half-size) dimensions
#ifdef HAVE_MMX
    // warm the cache for both planes before the loops
    asm volatile(
        PREFETCH" %0    \n\t"
        PREFETCH" %1    \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    // first plane
    for (y=0;y<h;y++){
        const uint8_t* s1=src1+srcStride1*(y>>1);  // each src line used twice
        uint8_t* d=dst1+dstStride1*y;
        x=0;
#ifdef HAVE_MMX
        /* 32 input bytes -> 64 output bytes per iteration: each of the four
           quadwords is split into a low and high half duplicated byte-wise. */
        for (;x<w-31;x+=32)
        {
            asm volatile(
                PREFETCH" 32%1          \n\t"
                "movq %1, %%mm0         \n\t"
                "movq 8%1, %%mm2        \n\t"
                "movq 16%1, %%mm4       \n\t"
                "movq 24%1, %%mm6       \n\t"
                "movq %%mm0, %%mm1      \n\t"
                "movq %%mm2, %%mm3      \n\t"
                "movq %%mm4, %%mm5      \n\t"
                "movq %%mm6, %%mm7      \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, %0       \n\t"
                MOVNTQ" %%mm1, 8%0      \n\t"
                MOVNTQ" %%mm2, 16%0     \n\t"
                MOVNTQ" %%mm3, 24%0     \n\t"
                MOVNTQ" %%mm4, 32%0     \n\t"
                MOVNTQ" %%mm5, 40%0     \n\t"
                MOVNTQ" %%mm6, 48%0     \n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
#endif
        // scalar tail: duplicate each byte
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    // second plane — identical processing
    for (y=0;y<h;y++){
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
#ifdef HAVE_MMX
        for (;x<w-31;x+=32)
        {
            asm volatile(
                PREFETCH" 32%1          \n\t"
                "movq %1, %%mm0         \n\t"
                "movq 8%1, %%mm2        \n\t"
                "movq 16%1, %%mm4       \n\t"
                "movq 24%1, %%mm6       \n\t"
                "movq %%mm0, %%mm1      \n\t"
                "movq %%mm2, %%mm3      \n\t"
                "movq %%mm4, %%mm5      \n\t"
                "movq %%mm6, %%mm7      \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, %0       \n\t"
                MOVNTQ" %%mm1, 8%0      \n\t"
                MOVNTQ" %%mm2, 16%0     \n\t"
                MOVNTQ" %%mm3, 24%0     \n\t"
                MOVNTQ" %%mm4, 32%0     \n\t"
                MOVNTQ" %%mm5, 40%0     \n\t"
                MOVNTQ" %%mm6, 48%0     \n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
#endif
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#ifdef HAVE_MMX
    /* Leave MMX state and flush non-temporal stores. */
    asm(
        EMMS"   \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
#endif
}
/**
 * Convert planar YVU9 (chroma subsampled 4x horizontally and 4x vertically —
 * note the y>>2 line index and the 4-luma-per-chroma packing below) to
 * packed YUY2. Each chroma sample is repeated for 4 consecutive luma pixels.
 *
 * @param src1 Y plane, src2 U plane, src3 V plane
 * @param dst  packed YUY2 output
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
{
    long y,x,w,h;
    w=width/2; h=height;  // w counts chroma samples per line
    for (y=0;y<h;y++){
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);  // chroma line reused for 4 luma lines
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
#ifdef HAVE_MMX
        /* 8 chroma samples (32 luma pixels, 64 output bytes) per iteration:
           duplicate each U/V byte, interleave U with V, then interleave the
           result with the corresponding luma quadwords. */
        for (;x<w-7;x+=8)
        {
            asm volatile(
                PREFETCH" 32(%1, %0)            \n\t"
                PREFETCH" 32(%2, %0)            \n\t"
                PREFETCH" 32(%3, %0)            \n\t"
                "movq (%1, %0, 4), %%mm0        \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq (%2, %0), %%mm1           \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq (%3, %0), %%mm2           \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq %%mm0, %%mm3              \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq %%mm1, %%mm4              \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq %%mm2, %%mm5              \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1         \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2         \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4         \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5         \n\t" /* V4V4 V5V5 V6V6 V7V7 */
                "movq %%mm1, %%mm6              \n\t"
                "punpcklbw %%mm2, %%mm1         \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0         \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3         \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ" %%mm0, (%4, %0, 8)      \n\t"
                MOVNTQ" %%mm3, 8(%4, %0, 8)     \n\t"
                "punpckhbw %%mm2, %%mm6         \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq 8(%1, %0, 4), %%mm0       \n\t"
                "movq %%mm0, %%mm3              \n\t"
                "punpcklbw %%mm6, %%mm0         \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3         \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ" %%mm0, 16(%4, %0, 8)    \n\t"
                MOVNTQ" %%mm3, 24(%4, %0, 8)    \n\t"
                "movq %%mm4, %%mm6              \n\t"
                "movq 16(%1, %0, 4), %%mm0      \n\t"
                "movq %%mm0, %%mm3              \n\t"
                "punpcklbw %%mm5, %%mm4         \n\t"
                "punpcklbw %%mm4, %%mm0         \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3         \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ" %%mm0, 32(%4, %0, 8)    \n\t"
                MOVNTQ" %%mm3, 40(%4, %0, 8)    \n\t"
                "punpckhbw %%mm5, %%mm6         \n\t"
                "movq 24(%1, %0, 4), %%mm0      \n\t"
                "movq %%mm0, %%mm3              \n\t"
                "punpcklbw %%mm6, %%mm0         \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3         \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ" %%mm0, 48(%4, %0, 8)    \n\t"
                MOVNTQ" %%mm3, 56(%4, %0, 8)    \n\t"
                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
#endif
        /* Scalar path: one chroma sample (up[x], vp[x]) serves 4 luma
           pixels yp[4x .. 4x+3], producing 8 output bytes. */
        for (; x<w; x++)
        {
            const long x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
#ifdef HAVE_MMX
    /* Leave MMX state and flush non-temporal stores. */
    asm(
        EMMS"   \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
#endif
}
  2570. static inline void RENAME(rgb2rgb_init)(void){
  2571. rgb15to16 = RENAME(rgb15to16);
  2572. rgb15to24 = RENAME(rgb15to24);
  2573. rgb15to32 = RENAME(rgb15to32);
  2574. rgb16to24 = RENAME(rgb16to24);
  2575. rgb16to32 = RENAME(rgb16to32);
  2576. rgb16to15 = RENAME(rgb16to15);
  2577. rgb24to16 = RENAME(rgb24to16);
  2578. rgb24to15 = RENAME(rgb24to15);
  2579. rgb24to32 = RENAME(rgb24to32);
  2580. rgb32to16 = RENAME(rgb32to16);
  2581. rgb32to15 = RENAME(rgb32to15);
  2582. rgb32to24 = RENAME(rgb32to24);
  2583. rgb24tobgr15 = RENAME(rgb24tobgr15);
  2584. rgb24tobgr16 = RENAME(rgb24tobgr16);
  2585. rgb24tobgr24 = RENAME(rgb24tobgr24);
  2586. rgb32tobgr32 = RENAME(rgb32tobgr32);
  2587. rgb32tobgr16 = RENAME(rgb32tobgr16);
  2588. rgb32tobgr15 = RENAME(rgb32tobgr15);
  2589. yv12toyuy2 = RENAME(yv12toyuy2);
  2590. yv12touyvy = RENAME(yv12touyvy);
  2591. yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
  2592. yuy2toyv12 = RENAME(yuy2toyv12);
  2593. // uyvytoyv12 = RENAME(uyvytoyv12);
  2594. // yvu9toyv12 = RENAME(yvu9toyv12);
  2595. planar2x = RENAME(planar2x);
  2596. rgb24toyv12 = RENAME(rgb24toyv12);
  2597. interleaveBytes = RENAME(interleaveBytes);
  2598. vu9_to_vu12 = RENAME(vu9_to_vu12);
  2599. yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
  2600. }