vf_fspp.c 71 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118
  1. /*
  2. * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
  3. * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
  4. *
  5. * This file is part of MPlayer.
  6. *
  7. * MPlayer is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation; either version 2 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * MPlayer is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License along
  18. * with MPlayer; if not, write to the Free Software Foundation, Inc.,
  19. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20. */
  21. /*
  22. * This implementation is based on an algorithm described in
  23. * "Aria Nosratinia Embedded Post-Processing for
  24. * Enhancement of Compressed Images (1999)"
  25. * (http://citeseer.nj.nec.com/nosratinia99embedded.html)
  26. * Futher, with splitting (i)dct into hor/ver passes, one of them can be
  27. * performed once per block, not pixel. This allows for much better speed.
  28. */
  29. /*
  30. Heavily optimized version of SPP filter by Nikolaj
  31. */
  32. #include <stdio.h>
  33. #include <stdlib.h>
  34. #include <string.h>
  35. #include <inttypes.h>
  36. #include <math.h>
  37. #include "config.h"
  38. #include "mp_msg.h"
  39. #include "cpudetect.h"
  40. #include "img_format.h"
  41. #include "mp_image.h"
  42. #include "vf.h"
  43. #include "av_helpers.h"
  44. #include "libvo/fastmemcpy.h"
  45. #include "libavutil/internal.h"
  46. #include "libavutil/intreadwrite.h"
  47. #include "libavutil/mem.h"
  48. #include "libavutil/x86/asm.h"
  49. #include "libavcodec/avcodec.h"
  50. #include "libavcodec/dsputil.h"
  51. #undef free
  52. #undef malloc
  53. //===========================================================================//
// Number of 8x8 blocks processed per sliding-window step in filter().
#define BLOCKSZ 12
// Per-coefficient base thresholds for the 8x8 DCT, later scaled by the
// quantizer in mul_thrmat_*() to build threshold_mtx.
static const short custom_threshold[64]=
// values (296) can't be too high
// -it causes too big quant dependence
// or maybe overflow(check), which results in some flashing
{ 71, 296, 295, 237, 71, 40, 38, 19,
245, 193, 185, 121, 102, 73, 53, 27,
158, 129, 141, 107, 97, 73, 50, 26,
102, 116, 109, 98, 82, 66, 45, 23,
71, 94, 95, 81, 70, 56, 38, 20,
56, 77, 74, 66, 56, 44, 30, 15,
38, 53, 50, 45, 38, 30, 21, 11,
20, 27, 26, 23, 20, 15, 11, 5
};
// 8x8 ordered-dither matrix (values 0..63) added before the final descale
// in store_slice_*(); aligned for direct use from the MMX code.
static const uint8_t __attribute__((aligned(32))) dither[8][8]={
{ 0, 48, 12, 60, 3, 51, 15, 63, },
{ 32, 16, 44, 28, 35, 19, 47, 31, },
{ 8, 56, 4, 52, 11, 59, 7, 55, },
{ 40, 24, 36, 20, 43, 27, 39, 23, },
{ 2, 50, 14, 62, 1, 49, 13, 61, },
{ 34, 18, 46, 30, 33, 17, 45, 29, },
{ 10, 58, 6, 54, 9, 57, 5, 53, },
{ 42, 26, 38, 22, 41, 25, 37, 21, },
};
// Per-instance filter state. The MMX code indexes threshold_mtx relative to
// threshold_mtx_noq (offset 8*8*2 bytes), so field order matters here.
struct vf_priv_s { //align 16 !
uint64_t threshold_mtx_noq[8*2]; // unscaled thresholds, packed for MMX lane order (see vf_open)
uint64_t threshold_mtx[8*2];//used in both C & MMX (& later SSE2) versions
int log2_count;    // filter strength (4..5); step = 6 - log2_count
int temp_stride;   // padded stride of the temp/src work buffers
int qp;            // forced quantizer (0 = use per-MB QP table)
int mpeg2;         // qscale_type from the decoder (passed to norm_qscale)
int prev_q;        // last quantizer the threshold matrix was scaled for
uint8_t *src;      // padded copy of the source plane (8-pixel mirrored border)
int16_t *temp;     // 16-row ring buffer of accumulated IDCT output
int bframes;       // nonzero: use B-frame QP tables directly
char *non_b_qp;    // cached QP table of the last non-B frame
};
  91. #if !HAVE_MMX
  92. //This func reads from 1 slice, 1 and clears 0 & 1
  93. static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
  94. {int y, x;
  95. #define STORE(pos) \
  96. temp= (src[x + pos] + (d[pos]>>log2_scale))>>(6-log2_scale); \
  97. src[x + pos]=src[x + pos - 8*src_stride]=0; \
  98. if(temp & 0x100) temp= ~(temp>>31); \
  99. dst[x + pos]= temp;
  100. for(y=0; y<height; y++){
  101. const uint8_t *d= dither[y];
  102. for(x=0; x<width; x+=8){
  103. int temp;
  104. STORE(0);
  105. STORE(1);
  106. STORE(2);
  107. STORE(3);
  108. STORE(4);
  109. STORE(5);
  110. STORE(6);
  111. STORE(7);
  112. }
  113. src+=src_stride;
  114. dst+=dst_stride;
  115. }
  116. }
  117. //This func reads from 2 slices, 0 & 2 and clears 2-nd
  118. static void store_slice2_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
  119. {int y, x;
  120. #define STORE2(pos) \
  121. temp= (src[x + pos] + src[x + pos + 16*src_stride] + (d[pos]>>log2_scale))>>(6-log2_scale); \
  122. src[x + pos + 16*src_stride]=0; \
  123. if(temp & 0x100) temp= ~(temp>>31); \
  124. dst[x + pos]= temp;
  125. for(y=0; y<height; y++){
  126. const uint8_t *d= dither[y];
  127. for(x=0; x<width; x+=8){
  128. int temp;
  129. STORE2(0);
  130. STORE2(1);
  131. STORE2(2);
  132. STORE2(3);
  133. STORE2(4);
  134. STORE2(5);
  135. STORE2(6);
  136. STORE2(7);
  137. }
  138. src+=src_stride;
  139. dst+=dst_stride;
  140. }
  141. }
  142. static void mul_thrmat_c(struct vf_priv_s *p,int q)
  143. {
  144. int a;
  145. for(a=0;a<64;a++)
  146. ((short*)p->threshold_mtx)[a]=q * ((short*)p->threshold_mtx_noq)[a];//ints faster in C
  147. }
// Forward declarations of the scalar DCT/threshold/IDCT kernels (defined below).
static void column_fidct_c(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt);
static void row_idct_c(int16_t* workspace,
int16_t* output_adr, int output_stride, int cnt);
static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt);
//this is rather ugly, but there is no need for function pointers
// Map the generic *_s names used by filter() onto the C implementations.
#define store_slice_s store_slice_c
#define store_slice2_s store_slice2_c
#define mul_thrmat_s mul_thrmat_c
#define column_fidct_s column_fidct_c
#define row_idct_s row_idct_c
#define row_fdct_s row_fdct_c
  159. #else /* HAVE_MMX */
//This func reads from 1 slice, 1 and clears 0 & 1
// MMX counterpart of store_slice_c: dither, descale, clamp (packuswb) one
// 8-row slice to 8-bit pixels, zeroing the consumed temp rows via mm7.
// Register roles: REG_S=src, REG_D=dst, REG_a=-16*src_stride (byte offset to
// the slice 8 int16 rows back), REG_c=inner width counter, REG_d=dither row
// pointer (outer loop runs until it reaches 'end').
// NOTE(review): &dither[height][0] is only a one-past-the-end loop bound for
// height==8; the address is compared, never dereferenced — confirm.
static void store_slice_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
{
const uint8_t *od=&dither[0][0];
const uint8_t *end=&dither[height][0];
width = (width+7)&~7;   // round width up to a whole 8-pixel group
dst_stride-=width;
//src_stride=(src_stride-width)*2;
__asm__ volatile(
"mov %5, %%"REG_d" \n\t"
"mov %6, %%"REG_S" \n\t"
"mov %7, %%"REG_D" \n\t"
"mov %1, %%"REG_a" \n\t"
"movd %%"REG_d", %%mm5 \n\t"   // mm5 = log2_scale (dither pre-shift)
"xor $-1, %%"REG_d" \n\t"
"mov %%"REG_a", %%"REG_c" \n\t"
"add $7, %%"REG_d" \n\t"       // REG_d = 6 - log2_scale
"neg %%"REG_a" \n\t"
"sub %0, %%"REG_c" \n\t"
"add %%"REG_c", %%"REG_c" \n\t" // row advance = (src_stride-width)*2 bytes
"movd %%"REG_d", %%mm2 \n\t"   // mm2 = final descale shift
"mov %%"REG_c", %1 \n\t"
"mov %2, %%"REG_d" \n\t"
"shl $4, %%"REG_a" \n\t"       // REG_a = -16*src_stride (8 int16 rows back)
"2: \n\t"                      // per-row loop: load & pre-shift dither row
"movq (%%"REG_d"), %%mm3 \n\t"
"movq %%mm3, %%mm4 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm4 \n\t"
"mov %0, %%"REG_c" \n\t"
"psraw %%mm5, %%mm3 \n\t"
"psraw %%mm5, %%mm4 \n\t"
"1: \n\t"                      // per-8-pixel loop
"movq %%mm7, (%%"REG_S",%%"REG_a",) \n\t"   // clear slice 8 rows back
"movq (%%"REG_S"), %%mm0 \n\t"
"movq 8(%%"REG_S"), %%mm1 \n\t"
"movq %%mm7, 8(%%"REG_S",%%"REG_a",) \n\t"
"paddw %%mm3, %%mm0 \n\t"
"paddw %%mm4, %%mm1 \n\t"
"movq %%mm7, (%%"REG_S") \n\t"              // clear current row
"psraw %%mm2, %%mm0 \n\t"
"psraw %%mm2, %%mm1 \n\t"
"movq %%mm7, 8(%%"REG_S") \n\t"
"packuswb %%mm1, %%mm0 \n\t"                // saturate to 0..255
"add $16, %%"REG_S" \n\t"
"movq %%mm0, (%%"REG_D") \n\t"
"add $8, %%"REG_D" \n\t"
"sub $8, %%"REG_c" \n\t"
"jg 1b \n\t"
"add %1, %%"REG_S" \n\t"
"add $8, %%"REG_d" \n\t"                    // next dither row
"add %3, %%"REG_D" \n\t"
"cmp %4, %%"REG_d" \n\t"
"jl 2b \n\t"
:
: "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
"m" (log2_scale), "m" (src), "m" (dst) //input
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
);
}
//This func reads from 2 slices, 0 & 2 and clears 2-nd
// MMX counterpart of store_slice2_c: each output pixel sums the current
// coefficient and the one 16 int16 rows ahead (REG_a = +32*src_stride via
// shl $5 on negated-then-unnegated stride? see note), then dither/descale/
// clamp; only the far slice is cleared.
// NOTE(review): unlike store_slice_mmx there is no "neg", so REG_a here is
// +32*src_stride bytes, i.e. the slice 16 int16 rows AHEAD — confirm.
static void store_slice2_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
{
const uint8_t *od=&dither[0][0];
const uint8_t *end=&dither[height][0];
width = (width+7)&~7;   // round width up to a whole 8-pixel group
dst_stride-=width;
//src_stride=(src_stride-width)*2;
__asm__ volatile(
"mov %5, %%"REG_d" \n\t"
"mov %6, %%"REG_S" \n\t"
"mov %7, %%"REG_D" \n\t"
"mov %1, %%"REG_a" \n\t"
"movd %%"REG_d", %%mm5 \n\t"   // mm5 = log2_scale
"xor $-1, %%"REG_d" \n\t"
"mov %%"REG_a", %%"REG_c" \n\t"
"add $7, %%"REG_d" \n\t"       // REG_d = 6 - log2_scale
"sub %0, %%"REG_c" \n\t"
"add %%"REG_c", %%"REG_c" \n\t" // row advance in bytes
"movd %%"REG_d", %%mm2 \n\t"   // mm2 = final descale shift
"mov %%"REG_c", %1 \n\t"
"mov %2, %%"REG_d" \n\t"
"shl $5, %%"REG_a" \n\t"       // REG_a = 32*src_stride (16 int16 rows)
"2: \n\t"                      // per-row loop: load & pre-shift dither row
"movq (%%"REG_d"), %%mm3 \n\t"
"movq %%mm3, %%mm4 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm4 \n\t"
"mov %0, %%"REG_c" \n\t"
"psraw %%mm5, %%mm3 \n\t"
"psraw %%mm5, %%mm4 \n\t"
"1: \n\t"                      // per-8-pixel loop
"movq (%%"REG_S"), %%mm0 \n\t"
"movq 8(%%"REG_S"), %%mm1 \n\t"
"paddw %%mm3, %%mm0 \n\t"
"paddw (%%"REG_S",%%"REG_a",), %%mm0 \n\t"  // add far slice
"paddw %%mm4, %%mm1 \n\t"
"movq 8(%%"REG_S",%%"REG_a",), %%mm6 \n\t"
"movq %%mm7, (%%"REG_S",%%"REG_a",) \n\t"   // clear far slice
"psraw %%mm2, %%mm0 \n\t"
"paddw %%mm6, %%mm1 \n\t"
"movq %%mm7, 8(%%"REG_S",%%"REG_a",) \n\t"
"psraw %%mm2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"                // saturate to 0..255
"movq %%mm0, (%%"REG_D") \n\t"
"add $16, %%"REG_S" \n\t"
"add $8, %%"REG_D" \n\t"
"sub $8, %%"REG_c" \n\t"
"jg 1b \n\t"
"add %1, %%"REG_S" \n\t"
"add $8, %%"REG_d" \n\t"                    // next dither row
"add %3, %%"REG_D" \n\t"
"cmp %4, %%"REG_d" \n\t"
"jl 2b \n\t"
:
: "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
"m" (log2_scale), "m" (src), "m" (dst) //input
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S
);
}
// MMX counterpart of mul_thrmat_c: multiplies all 64 (16 quadwords of four
// int16 each) base thresholds by q. REG_S starts at threshold_mtx_noq and
// REG_D is advanced by 8*8*2 bytes so stores land in threshold_mtx, relying
// on the two arrays being adjacent in struct vf_priv_s.
// NOTE(review): "+S" and "+D" both bind the same variable 'adr'; the asm
// immediately offsets REG_D — fragile but apparently intentional.
static void mul_thrmat_mmx(struct vf_priv_s *p, int q)
{
uint64_t *adr=&p->threshold_mtx_noq[0];
__asm__ volatile(
"movd %0, %%mm7 \n\t"                  // mm7 = q
"add $8*8*2, %%"REG_D" \n\t"           // REG_D -> threshold_mtx
"movq 0*8(%%"REG_S"), %%mm0 \n\t"
"punpcklwd %%mm7, %%mm7 \n\t"          // broadcast q to all 4 lanes
"movq 1*8(%%"REG_S"), %%mm1 \n\t"
"punpckldq %%mm7, %%mm7 \n\t"
"pmullw %%mm7, %%mm0 \n\t"
"movq 2*8(%%"REG_S"), %%mm2 \n\t"
"pmullw %%mm7, %%mm1 \n\t"
"movq 3*8(%%"REG_S"), %%mm3 \n\t"
"pmullw %%mm7, %%mm2 \n\t"
"movq %%mm0, 0*8(%%"REG_D") \n\t"
"movq 4*8(%%"REG_S"), %%mm4 \n\t"
"pmullw %%mm7, %%mm3 \n\t"
"movq %%mm1, 1*8(%%"REG_D") \n\t"
"movq 5*8(%%"REG_S"), %%mm5 \n\t"
"pmullw %%mm7, %%mm4 \n\t"
"movq %%mm2, 2*8(%%"REG_D") \n\t"
"movq 6*8(%%"REG_S"), %%mm6 \n\t"
"pmullw %%mm7, %%mm5 \n\t"
"movq %%mm3, 3*8(%%"REG_D") \n\t"
"movq 7*8+0*8(%%"REG_S"), %%mm0 \n\t"
"pmullw %%mm7, %%mm6 \n\t"
"movq %%mm4, 4*8(%%"REG_D") \n\t"
"movq 7*8+1*8(%%"REG_S"), %%mm1 \n\t"
"pmullw %%mm7, %%mm0 \n\t"
"movq %%mm5, 5*8(%%"REG_D") \n\t"
"movq 7*8+2*8(%%"REG_S"), %%mm2 \n\t"
"pmullw %%mm7, %%mm1 \n\t"
"movq %%mm6, 6*8(%%"REG_D") \n\t"
"movq 7*8+3*8(%%"REG_S"), %%mm3 \n\t"
"pmullw %%mm7, %%mm2 \n\t"
"movq %%mm0, 7*8+0*8(%%"REG_D") \n\t"
"movq 7*8+4*8(%%"REG_S"), %%mm4 \n\t"
"pmullw %%mm7, %%mm3 \n\t"
"movq %%mm1, 7*8+1*8(%%"REG_D") \n\t"
"movq 7*8+5*8(%%"REG_S"), %%mm5 \n\t"
"pmullw %%mm7, %%mm4 \n\t"
"movq %%mm2, 7*8+2*8(%%"REG_D") \n\t"
"movq 7*8+6*8(%%"REG_S"), %%mm6 \n\t"
"pmullw %%mm7, %%mm5 \n\t"
"movq %%mm3, 7*8+3*8(%%"REG_D") \n\t"
"movq 14*8+0*8(%%"REG_S"), %%mm0 \n\t"
"pmullw %%mm7, %%mm6 \n\t"
"movq %%mm4, 7*8+4*8(%%"REG_D") \n\t"
"movq 14*8+1*8(%%"REG_S"), %%mm1 \n\t"
"pmullw %%mm7, %%mm0 \n\t"
"movq %%mm5, 7*8+5*8(%%"REG_D") \n\t"
"pmullw %%mm7, %%mm1 \n\t"
"movq %%mm6, 7*8+6*8(%%"REG_D") \n\t"
"movq %%mm0, 14*8+0*8(%%"REG_D") \n\t"
"movq %%mm1, 14*8+1*8(%%"REG_D") \n\t"
: "+g" (q), "+S" (adr), "+D" (adr)
:
);
}
// Forward declarations of the MMX DCT/threshold/IDCT kernels (defined below).
static void column_fidct_mmx(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt);
static void row_idct_mmx(int16_t* workspace,
int16_t* output_adr, int output_stride, int cnt);
static void row_fdct_mmx(int16_t *data, const uint8_t *pixels, int line_size, int cnt);
// Map the generic *_s names used by filter() onto the MMX implementations.
#define store_slice_s store_slice_mmx
#define store_slice2_s store_slice2_mmx
#define mul_thrmat_s mul_thrmat_mmx
#define column_fidct_s column_fidct_mmx
#define row_idct_s row_idct_mmx
#define row_fdct_s row_fdct_mmx
#endif // HAVE_MMX
// Core FSPP pass over one plane: copies the source into a padded buffer with
// mirrored 8-pixel borders, then slides a window of BLOCKSZ 8x8 blocks across
// each row band doing row FDCT -> per-block threshold (column_fidct_s, scaled
// by the local quantizer) -> row IDCT, accumulating shifted results into the
// 16-row ring buffer p->temp, which store_slice_s / store_slice2_s drain into
// dst every 8 finished rows.
static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src,
int dst_stride, int src_stride,
int width, int height,
uint8_t *qp_store, int qp_stride, int is_luma)
{
int x, x0, y, es, qy, t;
const int stride= is_luma ? p->temp_stride : (width+16);//((width+16+15)&(~15))
const int step=6-p->log2_count;  // 1 or 2: how many shifted passes overlap
const int qps= 3 + is_luma;      // pixel->macroblock shift for the QP table
int32_t __attribute__((aligned(32))) block_align[4*8*BLOCKSZ+ 4*8*BLOCKSZ];
int16_t *block= (int16_t *)block_align;
int16_t *block3=(int16_t *)(block_align+4*8*BLOCKSZ);
// NOTE(review): size is 4*8*BLOCKSZ BYTES, not elements — clears only the
// leading part of block3; appears sufficient for the cycling below, confirm.
memset(block3, 0, 4*8*BLOCKSZ);
//p->src=src-src_stride*8-8;//!
if (!src || !dst) return; // HACK avoid crash for Y8 colourspace
// Copy the plane into p->src with an 8-pixel mirrored border on each side.
for(y=0; y<height; y++){
int index= 8 + 8*stride + y*stride;
fast_memcpy(p->src + index, src + y*src_stride, width);//this line can be avoided by using DR & user fr.buffers
for(x=0; x<8; x++){
p->src[index - x - 1]= p->src[index + x ];
p->src[index + width + x ]= p->src[index + width - x - 1];
}
}
// Mirror 8 rows above and below the plane.
for(y=0; y<8; y++){
fast_memcpy(p->src + ( 7-y)*stride, p->src + ( y+8)*stride, stride);
fast_memcpy(p->src + (height+8+y)*stride, p->src + (height-y+7)*stride, stride);
}
//FIXME (try edge emu)
// Zero the accumulation ring buffer rows the first passes will read.
for(y=8; y<24; y++)
memset(p->temp+ 8 +y*stride, 0,width*sizeof(int16_t));
for(y=step; y<height+8; y+=step){ //step= 1,2
// Map this row band to a row of the QP table (clamped to the plane).
qy=y-4;
if (qy>height-1) qy=height-1;
if (qy<0) qy=0;
qy=(qy>>qps)*qp_stride;
row_fdct_s(block, p->src + y*stride +2-(y&1), stride, 2);
for(x0=0; x0<width+8-8*(BLOCKSZ-1); x0+=8*(BLOCKSZ-1)){
row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, 2*(BLOCKSZ-1));
if(p->qp)
// Fixed quantizer: threshold the whole window in one call.
column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+0*8, block3+0*8, 8*(BLOCKSZ-1)); //yes, this is a HOTSPOT
else
// Per-MB quantizer: rescale the threshold matrix whenever it changes.
for (x=0; x<8*(BLOCKSZ-1); x+=8) {
t=x+x0-2; //correct t=x+x0-2-(y&1), but its the same
if (t<0) t=0;//t always < width-2
t=qp_store[qy+(t>>qps)];
t=norm_qscale(t, p->mpeg2); // presumably maps qscale_type -> linear QP; defined elsewhere
if (t!=p->prev_q) p->prev_q=t, mul_thrmat_s(p, t);
column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+x*8, block3+x*8, 8); //yes, this is a HOTSPOT
}
row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, 2*(BLOCKSZ-1));
memmove(block, block+(BLOCKSZ-1)*64, 8*8*sizeof(int16_t)); //cycling
memmove(block3, block3+(BLOCKSZ-1)*64, 6*8*sizeof(int16_t));
}
//
// Tail: process the remaining partial window at the right edge.
es=width+8-x0; // 8, ...
if (es>8)
row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, (es-4)>>2);
column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block, block3, es&(~1));
row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, es>>2);
// Every 8 finished rows, flush one slice of the ring buffer to dst.
{const int y1=y-8+step;//l5-7 l4-6
if (!(y1&7) && y1) {
if (y1&8) store_slice_s(dst + (y1-8)*dst_stride, p->temp+ 8 +8*stride,
dst_stride, stride, width, 8, 5-p->log2_count);
else store_slice2_s(dst + (y1-8)*dst_stride, p->temp+ 8 +0*stride,
dst_stride, stride, width, 8, 5-p->log2_count);
} }
}
// Flush the final partial slice when height is not a multiple of 8.
if (y&7) { // == height & 7
if (y&8) store_slice_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +8*stride,
dst_stride, stride, width, y&7, 5-p->log2_count);
else store_slice2_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +0*stride,
dst_stride, stride, width, y&7, 5-p->log2_count);
}
}
  427. static int config(struct vf_instance *vf,
  428. int width, int height, int d_width, int d_height,
  429. unsigned int flags, unsigned int outfmt)
  430. {
  431. int h= (height+16+15)&(~15);
  432. vf->priv->temp_stride= (width+16+15)&(~15);
  433. vf->priv->temp= (int16_t*)av_mallocz(vf->priv->temp_stride*3*8*sizeof(int16_t));
  434. //this can also be avoided, see above
  435. vf->priv->src = (uint8_t*)av_malloc(vf->priv->temp_stride*h*sizeof(uint8_t));
  436. return ff_vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
  437. }
  438. static void get_image(struct vf_instance *vf, mp_image_t *mpi)
  439. {
  440. if(mpi->flags&MP_IMGFLAG_PRESERVE) return; // don't change
  441. // ok, we can do pp in-place (or pp disabled):
  442. vf->dmpi=ff_vf_get_image(vf->next,mpi->imgfmt,
  443. mpi->type, mpi->flags, mpi->width, mpi->height);
  444. mpi->planes[0]=vf->dmpi->planes[0];
  445. mpi->stride[0]=vf->dmpi->stride[0];
  446. mpi->width=vf->dmpi->width;
  447. if(mpi->flags&MP_IMGFLAG_PLANAR){
  448. mpi->planes[1]=vf->dmpi->planes[1];
  449. mpi->planes[2]=vf->dmpi->planes[2];
  450. mpi->stride[1]=vf->dmpi->stride[1];
  451. mpi->stride[2]=vf->dmpi->stride[2];
  452. }
  453. mpi->flags|=MP_IMGFLAG_DIRECT;
  454. }
  455. static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts)
  456. {
  457. mp_image_t *dmpi;
  458. if(!(mpi->flags&MP_IMGFLAG_DIRECT)){
  459. // no DR, so get a new image! hope we'll get DR buffer:
  460. dmpi=ff_vf_get_image(vf->next,mpi->imgfmt,
  461. MP_IMGTYPE_TEMP,
  462. MP_IMGFLAG_ACCEPT_STRIDE|MP_IMGFLAG_PREFER_ALIGNED_STRIDE,
  463. mpi->width,mpi->height);
  464. ff_vf_clone_mpi_attributes(dmpi, mpi);
  465. }else{
  466. dmpi=vf->dmpi;
  467. }
  468. vf->priv->mpeg2= mpi->qscale_type;
  469. if(mpi->pict_type != 3 && mpi->qscale && !vf->priv->qp){
  470. int w = mpi->qstride;
  471. int h = (mpi->h + 15) >> 4;
  472. if (!w) {
  473. w = (mpi->w + 15) >> 4;
  474. h = 1;
  475. }
  476. if(!vf->priv->non_b_qp)
  477. vf->priv->non_b_qp= malloc(w*h);
  478. fast_memcpy(vf->priv->non_b_qp, mpi->qscale, w*h);
  479. }
  480. if(vf->priv->log2_count || !(mpi->flags&MP_IMGFLAG_DIRECT)){
  481. char *qp_tab= vf->priv->non_b_qp;
  482. if(vf->priv->bframes || !qp_tab)
  483. qp_tab= mpi->qscale;
  484. if(qp_tab || vf->priv->qp){
  485. filter(vf->priv, dmpi->planes[0], mpi->planes[0], dmpi->stride[0], mpi->stride[0],
  486. mpi->w, mpi->h, qp_tab, mpi->qstride, 1);
  487. filter(vf->priv, dmpi->planes[1], mpi->planes[1], dmpi->stride[1], mpi->stride[1],
  488. mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
  489. filter(vf->priv, dmpi->planes[2], mpi->planes[2], dmpi->stride[2], mpi->stride[2],
  490. mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
  491. }else{
  492. memcpy_pic(dmpi->planes[0], mpi->planes[0], mpi->w, mpi->h, dmpi->stride[0], mpi->stride[0]);
  493. memcpy_pic(dmpi->planes[1], mpi->planes[1], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[1], mpi->stride[1]);
  494. memcpy_pic(dmpi->planes[2], mpi->planes[2], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[2], mpi->stride[2]);
  495. }
  496. }
  497. #if HAVE_MMX
  498. if(ff_gCpuCaps.hasMMX) __asm__ volatile ("emms\n\t");
  499. #endif
  500. #if HAVE_MMX2
  501. if(ff_gCpuCaps.hasMMX2) __asm__ volatile ("sfence\n\t");
  502. #endif
  503. return ff_vf_next_put_image(vf,dmpi, pts);
  504. }
  505. static void uninit(struct vf_instance *vf)
  506. {
  507. if(!vf->priv) return;
  508. av_free(vf->priv->temp);
  509. vf->priv->temp= NULL;
  510. av_free(vf->priv->src);
  511. vf->priv->src= NULL;
  512. //free(vf->priv->avctx);
  513. //vf->priv->avctx= NULL;
  514. free(vf->priv->non_b_qp);
  515. vf->priv->non_b_qp= NULL;
  516. av_free(vf->priv);
  517. vf->priv=NULL;
  518. }
  519. //===========================================================================//
  520. static int query_format(struct vf_instance *vf, unsigned int fmt)
  521. {
  522. switch(fmt){
  523. case IMGFMT_YVU9:
  524. case IMGFMT_IF09:
  525. case IMGFMT_YV12:
  526. case IMGFMT_I420:
  527. case IMGFMT_IYUV:
  528. case IMGFMT_CLPL:
  529. case IMGFMT_Y800:
  530. case IMGFMT_Y8:
  531. case IMGFMT_444P:
  532. case IMGFMT_422P:
  533. case IMGFMT_411P:
  534. return ff_vf_next_query_format(vf,fmt);
  535. }
  536. return 0;
  537. }
  538. static int control(struct vf_instance *vf, int request, void* data)
  539. {
  540. switch(request){
  541. case VFCTRL_QUERY_MAX_PP_LEVEL:
  542. return 5;
  543. case VFCTRL_SET_PP_LEVEL:
  544. vf->priv->log2_count= *((unsigned int*)data);
  545. if (vf->priv->log2_count < 4) vf->priv->log2_count=4;
  546. return CONTROL_TRUE;
  547. }
  548. return ff_vf_next_control(vf,request,data);
  549. }
  550. static int vf_open(vf_instance_t *vf, char *args)
  551. {
  552. int i=0, bias;
  553. int custom_threshold_m[64];
  554. int log2c=-1;
  555. vf->config=config;
  556. vf->put_image=put_image;
  557. vf->get_image=get_image;
  558. vf->query_format=query_format;
  559. vf->uninit=uninit;
  560. vf->control= control;
  561. vf->priv=av_mallocz(sizeof(struct vf_priv_s));//assumes align 16 !
  562. ff_init_avcodec();
  563. //vf->priv->avctx= avcodec_alloc_context();
  564. //dsputil_init(&vf->priv->dsp, vf->priv->avctx);
  565. vf->priv->log2_count= 4;
  566. vf->priv->bframes = 0;
  567. if (args) sscanf(args, "%d:%d:%d:%d", &log2c, &vf->priv->qp, &i, &vf->priv->bframes);
  568. if( log2c >=4 && log2c <=5 )
  569. vf->priv->log2_count = log2c;
  570. else if( log2c >= 6 )
  571. vf->priv->log2_count = 5;
  572. if(vf->priv->qp < 0)
  573. vf->priv->qp = 0;
  574. if (i < -15) i = -15;
  575. if (i > 32) i = 32;
  576. bias= (1<<4)+i; //regulable
  577. vf->priv->prev_q=0;
  578. //
  579. for(i=0;i<64;i++) //FIXME: tune custom_threshold[] and remove this !
  580. custom_threshold_m[i]=(int)(custom_threshold[i]*(bias/71.)+ 0.5);
  581. for(i=0;i<8;i++){
  582. vf->priv->threshold_mtx_noq[2*i]=(uint64_t)custom_threshold_m[i*8+2]
  583. |(((uint64_t)custom_threshold_m[i*8+6])<<16)
  584. |(((uint64_t)custom_threshold_m[i*8+0])<<32)
  585. |(((uint64_t)custom_threshold_m[i*8+4])<<48);
  586. vf->priv->threshold_mtx_noq[2*i+1]=(uint64_t)custom_threshold_m[i*8+5]
  587. |(((uint64_t)custom_threshold_m[i*8+3])<<16)
  588. |(((uint64_t)custom_threshold_m[i*8+1])<<32)
  589. |(((uint64_t)custom_threshold_m[i*8+7])<<48);
  590. }
  591. if (vf->priv->qp) vf->priv->prev_q=vf->priv->qp, mul_thrmat_s(vf->priv, vf->priv->qp);
  592. return 1;
  593. }
// Filter registration descriptor: description, short name, authors,
// extra comment, constructor, and (unused) options table.
const vf_info_t ff_vf_info_fspp = {
"fast simple postprocess",
"fspp",
"Michael Niedermayer, Nikolaj Poroshin",
"",
vf_open,
NULL
};
//====================================================================
//Specific spp's dct, idct and threshold functions
//I'd prefer to have them in the separate file.
//#define MANGLE(a) #a
//typedef int16_t int16_t; //! only int16_t
#define DCTSIZE 8
#define DCTSIZE_S "8"
// Float constant -> s-bit fixed point, truncated to 16 bits.
#define FIX(x,s) ((int) ((x) * (1<<s) + 0.5)&0xffff)
// Replicate a 16-bit value into all four lanes of a 64-bit (MMX) word.
#define C64(x) ((uint64_t)((x)|(x)<<16))<<32 | (uint64_t)(x) | (uint64_t)(x)<<16
#define FIX64(x,s) C64(FIX(x,s))
// High half of a 16x16 multiply (scalar equivalent of pmulhw).
#define MULTIPLY16H(x,k) (((x)*(k))>>16)
// Dead-zone threshold: keep x only when it falls outside [-t, t].
#define THRESHOLD(r,x,t) if(((unsigned)((x)+t))>t*2) r=(x);else r=0;
// Rounding arithmetic right shift by n.
#define DESCALE(x,n) (((x) + (1 << ((n)-1))) >> n)
#if HAVE_MMX
// AAN DCT constants replicated across MMX lanes (14- or 13-bit fixed point).
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433)=FIX64(0.382683433, 14);
DECLARE_ALIGNED(8, uint64_t, ff_MM_FIX_0_541196100)=FIX64(0.541196100, 14);
DECLARE_ALIGNED(8, uint64_t, ff_MM_FIX_0_707106781)=FIX64(0.707106781, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965)=FIX64(1.306562965, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A)=FIX64(1.414213562, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065)=FIX64(1.847759065, 13);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930)=FIX64(-2.613125930, 13); //-
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562)=FIX64(1.414213562, 13);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200)=FIX64(1.082392200, 13);
//for t3,t5,t7 == 0 shortcut
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065)=FIX64(0.847759065, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497)=FIX64(0.566454497, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367)=FIX64(0.198912367, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND)=C64(4);
DECLARE_ASM_CONST(8, uint64_t, MM_2)=C64(2);
#else /* !HAVE_MMX */
// Scalar fallbacks: 32-bit intermediate type and the same AAN constants.
typedef int32_t int_simd16_t;
static const int16_t FIX_0_382683433=FIX(0.382683433, 14);
static const int16_t FIX_0_541196100=FIX(0.541196100, 14);
static const int16_t FIX_0_707106781=FIX(0.707106781, 14);
static const int16_t FIX_1_306562965=FIX(1.306562965, 14);
static const int16_t FIX_1_414213562_A=FIX(1.414213562, 14);
static const int16_t FIX_1_847759065=FIX(1.847759065, 13);
static const int16_t FIX_2_613125930=FIX(-2.613125930, 13); //-
static const int16_t FIX_1_414213562=FIX(1.414213562, 13);
static const int16_t FIX_1_082392200=FIX(1.082392200, 13);
#endif
#if !HAVE_MMX
// Scalar path: for each 8-sample column, run a forward 8-point DCT
// (factored butterfly), hard-threshold every coefficient against the
// per-position threshold matrix, then run the inverse transform and
// accumulate into `output`.
//   thr_adr : 8x8 threshold matrix; stepped by column in the inner loop
//   data    : input samples, 8 rows with stride DCTSIZE
//   output  : accumulator; rows 0..5 use `+=`, rows 6 and 7 overwrite (`=`)
//   cnt     : column-group count; decremented by 2 per outer pass, and the
//             pointers skip 8 extra columns after each pass (every second
//             start position is handled elsewhere)
// NOTE(review): the arithmetic relies on exact shift placement and on
// wrap-around when results are stored into int16_t — do not reorder.
static void column_fidct_c(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
    int16_t* dataptr;
    int16_t* wsptr;
    int16_t *threshold;
    int ctr;
    dataptr = data;
    wsptr = output;
    for (; cnt > 0; cnt-=2) { //start positions
        threshold=(int16_t*)thr_adr;//threshold_mtx
        for (ctr = DCTSIZE; ctr > 0; ctr--) {
            // Process columns from input, add to output.
            tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
            tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
            tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
            tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
            tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
            tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
            tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
            tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
            // Even part of FDCT
            tmp10 = tmp0 + tmp3;
            tmp13 = tmp0 - tmp3;
            tmp11 = tmp1 + tmp2;
            tmp12 = tmp1 - tmp2;
            d0 = tmp10 + tmp11;
            d4 = tmp10 - tmp11;
            z1 = MULTIPLY16H((tmp12 + tmp13) <<2, FIX_0_707106781);
            d2 = tmp13 + z1;
            d6 = tmp13 - z1;
            // Even part of IDCT
            // Zero out even coefficients that fall inside their thresholds.
            THRESHOLD(tmp0, d0, threshold[0*8]);
            THRESHOLD(tmp1, d2, threshold[2*8]);
            THRESHOLD(tmp2, d4, threshold[4*8]);
            THRESHOLD(tmp3, d6, threshold[6*8]);
            tmp0+=2;  // rounding bias for the >>2 descales below
            tmp10 = (tmp0 + tmp2)>>2;
            tmp11 = (tmp0 - tmp2)>>2;
            tmp13 = (tmp1 + tmp3)>>2; //+2 ! (psnr decides)
            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
            tmp0 = tmp10 + tmp13; //->temps
            tmp3 = tmp10 - tmp13; //->temps
            tmp1 = tmp11 + tmp12; //->temps
            tmp2 = tmp11 - tmp12; //->temps
            // Odd part of FDCT
            tmp10 = tmp4 + tmp5;
            tmp11 = tmp5 + tmp6;
            tmp12 = tmp6 + tmp7;
            z5 = MULTIPLY16H((tmp10 - tmp12)<<2, FIX_0_382683433);
            z2 = MULTIPLY16H(tmp10 <<2, FIX_0_541196100) + z5;
            z4 = MULTIPLY16H(tmp12 <<2, FIX_1_306562965) + z5;
            z3 = MULTIPLY16H(tmp11 <<2, FIX_0_707106781);
            z11 = tmp7 + z3;
            z13 = tmp7 - z3;
            d5 = z13 + z2;
            d3 = z13 - z2;
            d1 = z11 + z4;
            d7 = z11 - z4;
            // Odd part of IDCT
            THRESHOLD(tmp4, d1, threshold[1*8]);
            THRESHOLD(tmp5, d3, threshold[3*8]);
            THRESHOLD(tmp6, d5, threshold[5*8]);
            THRESHOLD(tmp7, d7, threshold[7*8]);
            //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
            z13 = tmp6 + tmp5;
            z10 = (tmp6 - tmp5)<<1;
            z11 = tmp4 + tmp7;
            z12 = (tmp4 - tmp7)<<1;
            tmp7 = (z11 + z13)>>2; //+2 !
            tmp11 = MULTIPLY16H((z11 - z13)<<1, FIX_1_414213562);
            z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
            tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
            tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!
            tmp6 = tmp12 - tmp7;
            tmp5 = tmp11 - tmp6;
            tmp4 = tmp10 + tmp5;
            // Rows 0..5 accumulate; rows 6 and 7 overwrite.
            wsptr[DCTSIZE*0]+= (tmp0 + tmp7);
            wsptr[DCTSIZE*1]+= (tmp1 + tmp6);
            wsptr[DCTSIZE*2]+= (tmp2 + tmp5);
            wsptr[DCTSIZE*3]+= (tmp3 - tmp4);
            wsptr[DCTSIZE*4]+= (tmp3 + tmp4);
            wsptr[DCTSIZE*5]+= (tmp2 - tmp5);
            wsptr[DCTSIZE*6]= (tmp1 - tmp6);
            wsptr[DCTSIZE*7]= (tmp0 - tmp7);
            //
            dataptr++; //next column
            wsptr++;
            threshold++;
        }
        dataptr+=8; //skip each second start pos
        wsptr +=8;
    }
}
#else /* HAVE_MMX */
//--------------------------------------------------------------------
// MMX version of column_fidct_c: forward DCT + per-coefficient
// thresholding + inverse DCT on 8-sample columns, four 16-bit lanes at
// a time, two 4-lane groups per loop iteration.
// Register roles (from the constraint list at the bottom):
//   REG_S = data      (input, advanced in place)
//   REG_D = output    (accumulated into, advanced in place)
//   REG_d = thr_adr   (threshold matrix, 16 bytes per coefficient row)
//   REG_c = cnt       (decremented by 2 per iteration)
//   REG_a = scratch for the all-thresholded-to-zero test (clobbered)
//   %3    = `temps`, a 4-qword spill area for the even-part t0..t3
// Labels 2/3 handle the general ("non DC") odd part; the fall-through
// paths after the `jnz` tests take a shortcut when the odd coefficients
// all thresholded to zero (see the `packssdw`/`movd`/`or` sequence).
// NOTE(review): the "=o"(temps) output constraint on an array is unusual
// legacy GCC usage — confirm before changing the constraint style.
static void column_fidct_mmx(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt)
{
    uint64_t __attribute__((aligned(8))) temps[4];
    __asm__ volatile(
        ASMALIGN(4)
        "1: \n\t"
        // -- part 1: forward DCT even part, threshold, IDCT even part --
        "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
        //
        "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
        "movq %%mm1, %%mm0 \n\t"
        "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0
        "movq %%mm7, %%mm3 \n\t"
        "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
        "movq %%mm1, %%mm5 \n\t"
        "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
        "psubw %%mm7, %%mm1 \n\t" //t13
        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
        "movq %%mm6, %%mm4 \n\t"
        "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
        "paddw %%mm7, %%mm5 \n\t" //t10
        "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
        "movq %%mm6, %%mm7 \n\t"
        "paddw %%mm2, %%mm6 \n\t" //t11
        "psubw %%mm2, %%mm7 \n\t" //t12
        "movq %%mm5, %%mm2 \n\t"
        "paddw %%mm6, %%mm5 \n\t" //d0
        // i0 t13 t12 i3 i1 d0 - d4
        "psubw %%mm6, %%mm2 \n\t" //d4
        "paddw %%mm1, %%mm7 \n\t"
        "movq 4*16(%%"REG_d"), %%mm6 \n\t"
        "psllw $2, %%mm7 \n\t"
        // saturating add/sub pairs implement the threshold test in SIMD
        "psubw 0*16(%%"REG_d"), %%mm5 \n\t"
        "psubw %%mm6, %%mm2 \n\t"
        "paddusw 0*16(%%"REG_d"), %%mm5 \n\t"
        "paddusw %%mm6, %%mm2 \n\t"
        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm7 \n\t"
        //
        "paddw 0*16(%%"REG_d"), %%mm5 \n\t"
        "paddw %%mm6, %%mm2 \n\t"
        "psubusw 0*16(%%"REG_d"), %%mm5 \n\t"
        "psubusw %%mm6, %%mm2 \n\t"
        //This func is totally compute-bound, operates at huge speed. So, DC shortcut
        // at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
        //However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare.
        "paddw "MANGLE(MM_2)", %%mm5 \n\t"
        "movq %%mm2, %%mm6 \n\t"
        "paddw %%mm5, %%mm2 \n\t"
        "psubw %%mm6, %%mm5 \n\t"
        "movq %%mm1, %%mm6 \n\t"
        "paddw %%mm7, %%mm1 \n\t" //d2
        "psubw 2*16(%%"REG_d"), %%mm1 \n\t"
        "psubw %%mm7, %%mm6 \n\t" //d6
        "movq 6*16(%%"REG_d"), %%mm7 \n\t"
        "psraw $2, %%mm5 \n\t"
        "paddusw 2*16(%%"REG_d"), %%mm1 \n\t"
        "psubw %%mm7, %%mm6 \n\t"
        // t7 d2 /t11 t4 t6 - d6 /t10
        "paddw 2*16(%%"REG_d"), %%mm1 \n\t"
        "paddusw %%mm7, %%mm6 \n\t"
        "psubusw 2*16(%%"REG_d"), %%mm1 \n\t"
        "paddw %%mm7, %%mm6 \n\t"
        "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
        "psubusw %%mm7, %%mm6 \n\t"
        //movq [edi+"DCTSIZE_S"*2*2], mm1
        //movq [edi+"DCTSIZE_S"*6*2], mm6
        "movq %%mm1, %%mm7 \n\t"
        "psraw $2, %%mm2 \n\t"
        "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
        "psubw %%mm6, %%mm1 \n\t"
        "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
        "paddw %%mm7, %%mm6 \n\t" //'t13
        "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! ---
        "movq %%mm2, %%mm7 \n\t"
        "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
        "paddw %%mm6, %%mm2 \n\t" //'t0
        "movq %%mm2, 0*8+%3 \n\t" //! spill 't0 to temps
        "psubw %%mm6, %%mm7 \n\t" //'t3
        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
        "psubw %%mm6, %%mm1 \n\t" //'t12
        "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
        "movq %%mm5, %%mm6 \n\t"
        "movq %%mm7, 3*8+%3 \n\t"
        "paddw %%mm2, %%mm3 \n\t" //t10
        "paddw %%mm4, %%mm2 \n\t" //t11
        "paddw %%mm0, %%mm4 \n\t" //t12
        "movq %%mm3, %%mm7 \n\t"
        "psubw %%mm4, %%mm3 \n\t"
        "psllw $2, %%mm3 \n\t"
        "psllw $2, %%mm7 \n\t" //opt for P6
        "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
        "psllw $2, %%mm4 \n\t"
        "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm7 \n\t"
        "psllw $2, %%mm2 \n\t"
        "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
        "paddw %%mm1, %%mm5 \n\t" //'t1
        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm2 \n\t"
        "psubw %%mm1, %%mm6 \n\t" //'t2
        // t7 't12 't11 t4 t6 - 't13 't10 ---
        "paddw %%mm3, %%mm7 \n\t" //z2
        "movq %%mm5, 1*8+%3 \n\t"
        "paddw %%mm3, %%mm4 \n\t" //z4
        "movq 3*16(%%"REG_d"), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm6, 2*8+%3 \n\t"
        "psubw %%mm2, %%mm1 \n\t" //z13
        //===
        "paddw %%mm2, %%mm0 \n\t" //z11
        "movq %%mm1, %%mm5 \n\t"
        "movq 5*16(%%"REG_d"), %%mm2 \n\t"
        "psubw %%mm7, %%mm1 \n\t" //d3
        "paddw %%mm7, %%mm5 \n\t" //d5
        "psubw %%mm3, %%mm1 \n\t"
        "movq 1*16(%%"REG_d"), %%mm7 \n\t"
        "psubw %%mm2, %%mm5 \n\t"
        "movq %%mm0, %%mm6 \n\t"
        "paddw %%mm4, %%mm0 \n\t" //d1
        "paddusw %%mm3, %%mm1 \n\t"
        "psubw %%mm4, %%mm6 \n\t" //d7
        // d1 d3 - - - d5 d7 -
        "movq 7*16(%%"REG_d"), %%mm4 \n\t"
        "psubw %%mm7, %%mm0 \n\t"
        "psubw %%mm4, %%mm6 \n\t"
        "paddusw %%mm2, %%mm5 \n\t"
        "paddusw %%mm4, %%mm6 \n\t"
        "paddw %%mm3, %%mm1 \n\t"
        "paddw %%mm2, %%mm5 \n\t"
        "paddw %%mm4, %%mm6 \n\t"
        "psubusw %%mm3, %%mm1 \n\t"
        "psubusw %%mm2, %%mm5 \n\t"
        "psubusw %%mm4, %%mm6 \n\t"
        // fold mm1|mm5|mm6 into REG_a: zero means all odd coeffs thresholded
        "movq %%mm1, %%mm4 \n\t"
        "por %%mm5, %%mm4 \n\t"
        "paddusw %%mm7, %%mm0 \n\t"
        "por %%mm6, %%mm4 \n\t"
        "paddw %%mm7, %%mm0 \n\t"
        "packssdw %%mm4, %%mm4 \n\t"
        "psubusw %%mm7, %%mm0 \n\t"
        "movd %%mm4, %%"REG_a" \n\t"
        "or %%"REG_a", %%"REG_a" \n\t"
        "jnz 2f \n\t"
        //movq [edi+"DCTSIZE_S"*3*2], mm1
        //movq [edi+"DCTSIZE_S"*5*2], mm5
        //movq [edi+"DCTSIZE_S"*1*2], mm0
        //movq [edi+"DCTSIZE_S"*7*2], mm6
        // t4 t5 - - - t6 t7 -
        //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
        //Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile
        "movq 0*8+%3, %%mm4 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
        "movq %%mm1, %%mm2 \n\t"
        "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
        "paddw %%mm4, %%mm5 \n\t"
        "movq 1*8+%3, %%mm6 \n\t"
        //paddw mm3, MM_2
        "psraw $2, %%mm3 \n\t" //tmp7
        "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
        "psubw %%mm3, %%mm4 \n\t"
        "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
        "paddw %%mm3, %%mm5 \n\t"
        "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
        "paddw %%mm6, %%mm7 \n\t"
        "movq 2*8+%3, %%mm3 \n\t"
        "psubw %%mm0, %%mm6 \n\t"
        "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
        "paddw %%mm0, %%mm7 \n\t"
        "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
        "paddw %%mm3, %%mm4 \n\t"
        "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
        "psubw %%mm1, %%mm3 \n\t"
        "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
        "paddw %%mm1, %%mm4 \n\t"
        "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
        "paddw %%mm3, %%mm5 \n\t"
        "movq 3*8+%3, %%mm0 \n\t"
        "add $8, %%"REG_S" \n\t"
        "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
        "psubw %%mm2, %%mm0 \n\t"
        "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
        "paddw %%mm2, %%mm6 \n\t"
        "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
        "paddw %%mm0, %%mm7 \n\t"
        "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
        "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
        "add $8, %%"REG_D" \n\t"
        "jmp 4f \n\t"
        "2: \n\t"
        //--- non DC2
        //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1)
        //psraw mm5, 2
        //psraw mm0, 2
        //psraw mm6, 2
        "movq %%mm5, %%mm3 \n\t"
        "psubw %%mm1, %%mm5 \n\t"
        "psllw $1, %%mm5 \n\t" //'z10
        "paddw %%mm1, %%mm3 \n\t" //'z13
        "movq %%mm0, %%mm2 \n\t"
        "psubw %%mm6, %%mm0 \n\t"
        "movq %%mm5, %%mm1 \n\t"
        "psllw $1, %%mm0 \n\t" //'z12
        "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
        "paddw %%mm0, %%mm5 \n\t"
        "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
        "paddw %%mm6, %%mm2 \n\t" //'z11
        "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
        "movq %%mm2, %%mm7 \n\t"
        //---
        "movq 0*8+%3, %%mm4 \n\t"
        "psubw %%mm3, %%mm2 \n\t"
        "psllw $1, %%mm2 \n\t"
        "paddw %%mm3, %%mm7 \n\t" //'t7
        "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
        "movq %%mm4, %%mm6 \n\t"
        //paddw mm7, MM_2
        "psraw $2, %%mm7 \n\t"
        "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
        "psubw %%mm7, %%mm6 \n\t"
        "movq 1*8+%3, %%mm3 \n\t"
        "paddw %%mm7, %%mm4 \n\t"
        "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
        "paddw %%mm5, %%mm1 \n\t" //'t12
        "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
        "psubw %%mm7, %%mm1 \n\t" //'t6
        "movq 2*8+%3, %%mm7 \n\t"
        "psubw %%mm5, %%mm0 \n\t" //'t10
        "movq 3*8+%3, %%mm6 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
        "psubw %%mm1, %%mm5 \n\t"
        "psubw %%mm1, %%mm2 \n\t" //'t5
        "paddw %%mm1, %%mm3 \n\t"
        "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
        "movq %%mm7, %%mm4 \n\t"
        "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
        "psubw %%mm2, %%mm4 \n\t"
        "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
        "paddw %%mm2, %%mm7 \n\t"
        "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
        "paddw %%mm2, %%mm0 \n\t" //'t4
        // 't4 't6 't5 - - - - 't7
        "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
        "movq %%mm6, %%mm1 \n\t"
        "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
        "add $8, %%"REG_S" \n\t"
        "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
        "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
        "add $8, %%"REG_D" \n\t"
        "4: \n\t"
        //=part 2 (the same)===========================================================
        // Second 4-lane group; threshold offsets carry an extra 1*8 byte bias.
        "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
        //
        "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
        "movq %%mm1, %%mm0 \n\t"
        "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0
        "movq %%mm7, %%mm3 \n\t"
        "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
        "movq %%mm1, %%mm5 \n\t"
        "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
        "psubw %%mm7, %%mm1 \n\t" //t13
        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
        "movq %%mm6, %%mm4 \n\t"
        "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
        "paddw %%mm7, %%mm5 \n\t" //t10
        "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
        "movq %%mm6, %%mm7 \n\t"
        "paddw %%mm2, %%mm6 \n\t" //t11
        "psubw %%mm2, %%mm7 \n\t" //t12
        "movq %%mm5, %%mm2 \n\t"
        "paddw %%mm6, %%mm5 \n\t" //d0
        // i0 t13 t12 i3 i1 d0 - d4
        "psubw %%mm6, %%mm2 \n\t" //d4
        "paddw %%mm1, %%mm7 \n\t"
        "movq 1*8+4*16(%%"REG_d"), %%mm6 \n\t"
        "psllw $2, %%mm7 \n\t"
        "psubw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
        "psubw %%mm6, %%mm2 \n\t"
        "paddusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
        "paddusw %%mm6, %%mm2 \n\t"
        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm7 \n\t"
        //
        "paddw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
        "paddw %%mm6, %%mm2 \n\t"
        "psubusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
        "psubusw %%mm6, %%mm2 \n\t"
        //This func is totally compute-bound, operates at huge speed. So, DC shortcut
        // at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
        //However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare.
        "paddw "MANGLE(MM_2)", %%mm5 \n\t"
        "movq %%mm2, %%mm6 \n\t"
        "paddw %%mm5, %%mm2 \n\t"
        "psubw %%mm6, %%mm5 \n\t"
        "movq %%mm1, %%mm6 \n\t"
        "paddw %%mm7, %%mm1 \n\t" //d2
        "psubw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
        "psubw %%mm7, %%mm6 \n\t" //d6
        "movq 1*8+6*16(%%"REG_d"), %%mm7 \n\t"
        "psraw $2, %%mm5 \n\t"
        "paddusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
        "psubw %%mm7, %%mm6 \n\t"
        // t7 d2 /t11 t4 t6 - d6 /t10
        "paddw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
        "paddusw %%mm7, %%mm6 \n\t"
        "psubusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
        "paddw %%mm7, %%mm6 \n\t"
        "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
        "psubusw %%mm7, %%mm6 \n\t"
        //movq [edi+"DCTSIZE_S"*2*2], mm1
        //movq [edi+"DCTSIZE_S"*6*2], mm6
        "movq %%mm1, %%mm7 \n\t"
        "psraw $2, %%mm2 \n\t"
        "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
        "psubw %%mm6, %%mm1 \n\t"
        "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
        "paddw %%mm7, %%mm6 \n\t" //'t13
        "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! ---
        "movq %%mm2, %%mm7 \n\t"
        "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
        "paddw %%mm6, %%mm2 \n\t" //'t0
        "movq %%mm2, 0*8+%3 \n\t" //!
        "psubw %%mm6, %%mm7 \n\t" //'t3
        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
        "psubw %%mm6, %%mm1 \n\t" //'t12
        "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
        "movq %%mm5, %%mm6 \n\t"
        "movq %%mm7, 3*8+%3 \n\t"
        "paddw %%mm2, %%mm3 \n\t" //t10
        "paddw %%mm4, %%mm2 \n\t" //t11
        "paddw %%mm0, %%mm4 \n\t" //t12
        "movq %%mm3, %%mm7 \n\t"
        "psubw %%mm4, %%mm3 \n\t"
        "psllw $2, %%mm3 \n\t"
        "psllw $2, %%mm7 \n\t" //opt for P6
        "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
        "psllw $2, %%mm4 \n\t"
        "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm7 \n\t"
        "psllw $2, %%mm2 \n\t"
        "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
        "paddw %%mm1, %%mm5 \n\t" //'t1
        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm2 \n\t"
        "psubw %%mm1, %%mm6 \n\t" //'t2
        // t7 't12 't11 t4 t6 - 't13 't10 ---
        "paddw %%mm3, %%mm7 \n\t" //z2
        "movq %%mm5, 1*8+%3 \n\t"
        "paddw %%mm3, %%mm4 \n\t" //z4
        "movq 1*8+3*16(%%"REG_d"), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm6, 2*8+%3 \n\t"
        "psubw %%mm2, %%mm1 \n\t" //z13
        //===
        "paddw %%mm2, %%mm0 \n\t" //z11
        "movq %%mm1, %%mm5 \n\t"
        "movq 1*8+5*16(%%"REG_d"), %%mm2 \n\t"
        "psubw %%mm7, %%mm1 \n\t" //d3
        "paddw %%mm7, %%mm5 \n\t" //d5
        "psubw %%mm3, %%mm1 \n\t"
        "movq 1*8+1*16(%%"REG_d"), %%mm7 \n\t"
        "psubw %%mm2, %%mm5 \n\t"
        "movq %%mm0, %%mm6 \n\t"
        "paddw %%mm4, %%mm0 \n\t" //d1
        "paddusw %%mm3, %%mm1 \n\t"
        "psubw %%mm4, %%mm6 \n\t" //d7
        // d1 d3 - - - d5 d7 -
        "movq 1*8+7*16(%%"REG_d"), %%mm4 \n\t"
        "psubw %%mm7, %%mm0 \n\t"
        "psubw %%mm4, %%mm6 \n\t"
        "paddusw %%mm2, %%mm5 \n\t"
        "paddusw %%mm4, %%mm6 \n\t"
        "paddw %%mm3, %%mm1 \n\t"
        "paddw %%mm2, %%mm5 \n\t"
        "paddw %%mm4, %%mm6 \n\t"
        "psubusw %%mm3, %%mm1 \n\t"
        "psubusw %%mm2, %%mm5 \n\t"
        "psubusw %%mm4, %%mm6 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "por %%mm5, %%mm4 \n\t"
        "paddusw %%mm7, %%mm0 \n\t"
        "por %%mm6, %%mm4 \n\t"
        "paddw %%mm7, %%mm0 \n\t"
        "packssdw %%mm4, %%mm4 \n\t"
        "psubusw %%mm7, %%mm0 \n\t"
        "movd %%mm4, %%"REG_a" \n\t"
        "or %%"REG_a", %%"REG_a" \n\t"
        "jnz 3f \n\t"
        //movq [edi+"DCTSIZE_S"*3*2], mm1
        //movq [edi+"DCTSIZE_S"*5*2], mm5
        //movq [edi+"DCTSIZE_S"*1*2], mm0
        //movq [edi+"DCTSIZE_S"*7*2], mm6
        // t4 t5 - - - t6 t7 -
        //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
        //Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile
        "movq 0*8+%3, %%mm4 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
        "movq %%mm1, %%mm2 \n\t"
        "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
        "paddw %%mm4, %%mm5 \n\t"
        "movq 1*8+%3, %%mm6 \n\t"
        //paddw mm3, MM_2
        "psraw $2, %%mm3 \n\t" //tmp7
        "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
        "psubw %%mm3, %%mm4 \n\t"
        "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
        "paddw %%mm3, %%mm5 \n\t"
        "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
        "paddw %%mm6, %%mm7 \n\t"
        "movq 2*8+%3, %%mm3 \n\t"
        "psubw %%mm0, %%mm6 \n\t"
        "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
        "paddw %%mm0, %%mm7 \n\t"
        "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
        "paddw %%mm3, %%mm4 \n\t"
        "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
        "psubw %%mm1, %%mm3 \n\t"
        "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
        "paddw %%mm1, %%mm4 \n\t"
        "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
        "paddw %%mm3, %%mm5 \n\t"
        "movq 3*8+%3, %%mm0 \n\t"
        "add $24, %%"REG_S" \n\t"
        "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
        "psubw %%mm2, %%mm0 \n\t"
        "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
        "paddw %%mm2, %%mm6 \n\t"
        "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
        "paddw %%mm0, %%mm7 \n\t"
        "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
        "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
        "add $24, %%"REG_D" \n\t"
        "sub $2, %%"REG_c" \n\t"
        "jnz 1b \n\t"
        "jmp 5f \n\t"
        "3: \n\t"
        //--- non DC2
        //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1)
        //psraw mm5, 2
        //psraw mm0, 2
        //psraw mm6, 2
        "movq %%mm5, %%mm3 \n\t"
        "psubw %%mm1, %%mm5 \n\t"
        "psllw $1, %%mm5 \n\t" //'z10
        "paddw %%mm1, %%mm3 \n\t" //'z13
        "movq %%mm0, %%mm2 \n\t"
        "psubw %%mm6, %%mm0 \n\t"
        "movq %%mm5, %%mm1 \n\t"
        "psllw $1, %%mm0 \n\t" //'z12
        "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
        "paddw %%mm0, %%mm5 \n\t"
        "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
        "paddw %%mm6, %%mm2 \n\t" //'z11
        "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
        "movq %%mm2, %%mm7 \n\t"
        //---
        "movq 0*8+%3, %%mm4 \n\t"
        "psubw %%mm3, %%mm2 \n\t"
        "psllw $1, %%mm2 \n\t"
        "paddw %%mm3, %%mm7 \n\t" //'t7
        "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
        "movq %%mm4, %%mm6 \n\t"
        //paddw mm7, MM_2
        "psraw $2, %%mm7 \n\t"
        "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
        "psubw %%mm7, %%mm6 \n\t"
        "movq 1*8+%3, %%mm3 \n\t"
        "paddw %%mm7, %%mm4 \n\t"
        "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
        "paddw %%mm5, %%mm1 \n\t" //'t12
        "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
        "psubw %%mm7, %%mm1 \n\t" //'t6
        "movq 2*8+%3, %%mm7 \n\t"
        "psubw %%mm5, %%mm0 \n\t" //'t10
        "movq 3*8+%3, %%mm6 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
        "psubw %%mm1, %%mm5 \n\t"
        "psubw %%mm1, %%mm2 \n\t" //'t5
        "paddw %%mm1, %%mm3 \n\t"
        "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
        "movq %%mm7, %%mm4 \n\t"
        "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
        "psubw %%mm2, %%mm4 \n\t"
        "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
        "paddw %%mm2, %%mm7 \n\t"
        "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
        "paddw %%mm2, %%mm0 \n\t" //'t4
        // 't4 't6 't5 - - - - 't7
        "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
        "movq %%mm6, %%mm1 \n\t"
        "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
        "add $24, %%"REG_S" \n\t"
        "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
        "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
        "add $24, %%"REG_D" \n\t"
        "sub $2, %%"REG_c" \n\t"
        "jnz 1b \n\t"
        "5: \n\t"
        : "+S"(data), "+D"(output), "+c"(cnt), "=o"(temps)
        : "d"(thr_adr)
        : "%"REG_a
        );
}
#endif // HAVE_MMX
#if !HAVE_MMX
// Scalar path: inverse 8-point DCT across rows of the workspace.
//   workspace     : thresholded coefficients; one 8-entry row is consumed
//                   per loop iteration (wsptr advances by DCTSIZE)
//   output_adr    : destination; each iteration accumulates 8 descaled
//                   values into a column with stride `output_stride`
//   cnt           : arrives in units of 4; scaled by 4 so each iteration
//                   handles a single row
// The SIMD counterpart reads 4x4 sub-blocks and transposes in registers;
// here the even/odd coefficient halves are simply wsptr[0..3] / wsptr[4..7].
// NOTE(review): relies on exact shift placement (see the "<<2"/"<<3"
// comments) to stay inside 16-bit range — do not reorder.
static void row_idct_c(int16_t* workspace,
                       int16_t* output_adr, int output_stride, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z5, z10, z11, z12, z13;
    int16_t* outptr;
    int16_t* wsptr;
    cnt*=4;
    wsptr = workspace;
    outptr = output_adr;
    for (; cnt > 0; cnt--) {
        // Even part
        //Simd version reads 4x4 block and transposes it
        tmp10 = ( wsptr[2] + wsptr[3]);
        tmp11 = ( wsptr[2] - wsptr[3]);
        tmp13 = ( wsptr[0] + wsptr[1]);
        tmp12 = (MULTIPLY16H( wsptr[0] - wsptr[1], FIX_1_414213562_A)<<2) - tmp13;//this shift order to avoid overflow
        tmp0 = tmp10 + tmp13; //->temps
        tmp3 = tmp10 - tmp13; //->temps
        tmp1 = tmp11 + tmp12;
        tmp2 = tmp11 - tmp12;
        // Odd part
        //Also transpose, with previous:
        // ---- ----      ||||
        // ---- ---- idct ||||
        // ---- ---- ---> ||||
        // ---- ----      ||||
        z13 = wsptr[4] + wsptr[5];
        z10 = wsptr[4] - wsptr[5];
        z11 = wsptr[6] + wsptr[7];
        z12 = wsptr[6] - wsptr[7];
        tmp7 = z11 + z13;
        tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
        z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
        tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
        tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
        tmp6 = (tmp12<<3) - tmp7;
        tmp5 = (tmp11<<3) - tmp6;
        tmp4 = (tmp10<<3) + tmp5;
        // Final output stage: descale and write column
        outptr[0*output_stride]+= DESCALE(tmp0 + tmp7, 3);
        outptr[1*output_stride]+= DESCALE(tmp1 + tmp6, 3);
        outptr[2*output_stride]+= DESCALE(tmp2 + tmp5, 3);
        outptr[3*output_stride]+= DESCALE(tmp3 - tmp4, 3);
        outptr[4*output_stride]+= DESCALE(tmp3 + tmp4, 3);
        outptr[5*output_stride]+= DESCALE(tmp2 - tmp5, 3);
        outptr[6*output_stride]+= DESCALE(tmp1 - tmp6, 3); //no += ?
        outptr[7*output_stride]+= DESCALE(tmp0 - tmp7, 3); //no += ?
        outptr++;
        wsptr += DCTSIZE; // advance pointer to next row
    }
}
#else /* HAVE_MMX */
//--------------------------------------------------------------------
// MMX version of row_idct_c: inverse DCT over rows with the 4x4
// transposes folded into the butterflies; each iteration processes
// four rows and accumulates the descaled results into 8-pixel columns.
// Register roles (from the constraint list at the bottom):
//   REG_S = workspace              (advanced in place)
//   REG_D = output_adr             (advanced in place)
//   REG_c = cnt                    (decremented once per iteration)
//   REG_a = output_stride in bytes (input operand "a")
//   REG_d = 3*stride, precomputed by the leading `lea` (clobbered)
//   %3    = `temps`, spill area for the even-part t0/t3
static void row_idct_mmx (int16_t* workspace,
                          int16_t* output_adr, int output_stride, int cnt)
{
    uint64_t __attribute__((aligned(8))) temps[4];
    __asm__ volatile(
        "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t"
        "1: \n\t"
        "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm0 \n\t"
        //
        "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm1 \n\t"
        "movq %%mm0, %%mm4 \n\t"
        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
        "punpcklwd %%mm1, %%mm0 \n\t"
        "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm3 \n\t"
        "punpckhwd %%mm1, %%mm4 \n\t"
        //transpose 4x4
        "movq %%mm2, %%mm7 \n\t"
        "punpcklwd %%mm3, %%mm2 \n\t"
        "movq %%mm0, %%mm6 \n\t"
        "punpckldq %%mm2, %%mm0 \n\t" //0
        "punpckhdq %%mm2, %%mm6 \n\t" //1
        "movq %%mm0, %%mm5 \n\t"
        "punpckhwd %%mm3, %%mm7 \n\t"
        "psubw %%mm6, %%mm0 \n\t"
        "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm0 \n\t"
        "movq %%mm4, %%mm2 \n\t"
        "punpckldq %%mm7, %%mm4 \n\t" //2
        "paddw %%mm6, %%mm5 \n\t"
        "punpckhdq %%mm7, %%mm2 \n\t" //3
        "movq %%mm4, %%mm1 \n\t"
        "psllw $2, %%mm0 \n\t"
        "paddw %%mm2, %%mm4 \n\t" //t10
        "movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_S"), %%mm3 \n\t"
        "psubw %%mm2, %%mm1 \n\t" //t11
        "movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_S"), %%mm2 \n\t"
        "psubw %%mm5, %%mm0 \n\t"
        "movq %%mm4, %%mm6 \n\t"
        "paddw %%mm5, %%mm4 \n\t" //t0
        "psubw %%mm5, %%mm6 \n\t" //t3
        "movq %%mm1, %%mm7 \n\t"
        "movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_S"), %%mm5 \n\t"
        "paddw %%mm0, %%mm1 \n\t" //t1
        "movq %%mm4, 0*8+%3 \n\t" //t0
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm6, 1*8+%3 \n\t" //t3
        "punpcklwd %%mm2, %%mm3 \n\t"
        //transpose 4x4 (odd half)
        "movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_S"), %%mm6 \n\t"
        "punpckhwd %%mm2, %%mm4 \n\t"
        "movq %%mm5, %%mm2 \n\t"
        "punpcklwd %%mm6, %%mm5 \n\t"
        "psubw %%mm0, %%mm7 \n\t" //t2
        "punpckhwd %%mm6, %%mm2 \n\t"
        "movq %%mm3, %%mm0 \n\t"
        "punpckldq %%mm5, %%mm3 \n\t" //4
        "punpckhdq %%mm5, %%mm0 \n\t" //5
        "movq %%mm4, %%mm5 \n\t"
        //
        "movq %%mm3, %%mm6 \n\t"
        "punpckldq %%mm2, %%mm4 \n\t" //6
        "psubw %%mm0, %%mm3 \n\t" //z10
        "punpckhdq %%mm2, %%mm5 \n\t" //7
        "paddw %%mm0, %%mm6 \n\t" //z13
        "movq %%mm4, %%mm2 \n\t"
        "movq %%mm3, %%mm0 \n\t"
        "psubw %%mm5, %%mm4 \n\t" //z12
        "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm0 \n\t" //-
        "paddw %%mm4, %%mm3 \n\t"
        "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm3 \n\t" //z5
        "paddw %%mm5, %%mm2 \n\t" //z11 >
        "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm4 \n\t"
        "movq %%mm2, %%mm5 \n\t"
        "psubw %%mm6, %%mm2 \n\t"
        "paddw %%mm6, %%mm5 \n\t" //t7
        "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //t11
        "paddw %%mm3, %%mm0 \n\t" //t12
        "psllw $3, %%mm0 \n\t"
        "psubw %%mm3, %%mm4 \n\t" //t10
        "movq 0*8+%3, %%mm6 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "psllw $3, %%mm4 \n\t"
        "psubw %%mm5, %%mm0 \n\t" //t6
        "psllw $3, %%mm2 \n\t"
        "paddw %%mm0, %%mm1 \n\t" //d1
        "psubw %%mm0, %%mm2 \n\t" //t5
        "psubw %%mm0, %%mm3 \n\t" //d6
        "paddw %%mm2, %%mm4 \n\t" //t4
        "movq %%mm7, %%mm0 \n\t"
        "paddw %%mm2, %%mm7 \n\t" //d2
        "psubw %%mm2, %%mm0 \n\t" //d5
        "movq "MANGLE(MM_DESCALE_RND)", %%mm2 \n\t" //4 = rounding bias for >>3
        "psubw %%mm5, %%mm6 \n\t" //d7
        "paddw 0*8+%3, %%mm5 \n\t" //d0
        "paddw %%mm2, %%mm1 \n\t"
        "paddw %%mm2, %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"
        "paddw %%mm2, %%mm7 \n\t"
        "psraw $3, %%mm5 \n\t"
        // accumulate descaled results into the strided output columns
        "paddw (%%"REG_D"), %%mm5 \n\t"
        "psraw $3, %%mm7 \n\t"
        "paddw (%%"REG_D",%%"REG_a",), %%mm1 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw (%%"REG_D",%%"REG_a",2), %%mm7 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "movq %%mm5, (%%"REG_D") \n\t"
        "paddw %%mm2, %%mm6 \n\t"
        "movq %%mm1, (%%"REG_D",%%"REG_a",) \n\t"
        "psraw $3, %%mm0 \n\t"
        "movq %%mm7, (%%"REG_D",%%"REG_a",2) \n\t"
        "add %%"REG_d", %%"REG_D" \n\t" //3*ls
        "movq 1*8+%3, %%mm5 \n\t" //t3
        "psraw $3, %%mm3 \n\t"
        "paddw (%%"REG_D",%%"REG_a",2), %%mm0 \n\t"
        "psubw %%mm4, %%mm5 \n\t" //d3
        "paddw (%%"REG_D",%%"REG_d",), %%mm3 \n\t"
        "psraw $3, %%mm6 \n\t"
        "paddw 1*8+%3, %%mm4 \n\t" //d4
        "paddw %%mm2, %%mm5 \n\t"
        "paddw (%%"REG_D",%%"REG_a",4), %%mm6 \n\t"
        "paddw %%mm2, %%mm4 \n\t"
        "movq %%mm0, (%%"REG_D",%%"REG_a",2) \n\t"
        "psraw $3, %%mm5 \n\t"
        "paddw (%%"REG_D"), %%mm5 \n\t"
        "psraw $3, %%mm4 \n\t"
        "paddw (%%"REG_D",%%"REG_a",), %%mm4 \n\t"
        "add $"DCTSIZE_S"*2*4, %%"REG_S" \n\t" //4 rows
        "movq %%mm3, (%%"REG_D",%%"REG_d",) \n\t"
        "movq %%mm6, (%%"REG_D",%%"REG_a",4) \n\t"
        "movq %%mm5, (%%"REG_D") \n\t"
        "movq %%mm4, (%%"REG_D",%%"REG_a",) \n\t"
        "sub %%"REG_d", %%"REG_D" \n\t"
        "add $8, %%"REG_D" \n\t"
        "dec %%"REG_c" \n\t"
        "jnz 1b \n\t"
        : "+S"(workspace), "+D"(output_adr), "+c"(cnt), "=o"(temps)
        : "a"(output_stride*sizeof(short))
        : "%"REG_d
        );
}
#endif // HAVE_MMX
  1454. #if !HAVE_MMX
  1455. static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt)
  1456. {
  1457. int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  1458. int_simd16_t tmp10, tmp11, tmp12, tmp13;
  1459. int_simd16_t z1, z2, z3, z4, z5, z11, z13;
  1460. int16_t *dataptr;
  1461. cnt*=4;
  1462. // Pass 1: process rows.
  1463. dataptr = data;
  1464. for (; cnt > 0; cnt--) {
  1465. tmp0 = pixels[line_size*0] + pixels[line_size*7];
  1466. tmp7 = pixels[line_size*0] - pixels[line_size*7];
  1467. tmp1 = pixels[line_size*1] + pixels[line_size*6];
  1468. tmp6 = pixels[line_size*1] - pixels[line_size*6];
  1469. tmp2 = pixels[line_size*2] + pixels[line_size*5];
  1470. tmp5 = pixels[line_size*2] - pixels[line_size*5];
  1471. tmp3 = pixels[line_size*3] + pixels[line_size*4];
  1472. tmp4 = pixels[line_size*3] - pixels[line_size*4];
  1473. // Even part
  1474. tmp10 = tmp0 + tmp3;
  1475. tmp13 = tmp0 - tmp3;
  1476. tmp11 = tmp1 + tmp2;
  1477. tmp12 = tmp1 - tmp2;
  1478. //Even columns are written first, this leads to different order of columns
  1479. //in column_fidct(), but they are processed independently, so all ok.
  1480. //Later in the row_idct() columns readed at the same order.
  1481. dataptr[2] = tmp10 + tmp11;
  1482. dataptr[3] = tmp10 - tmp11;
  1483. z1 = MULTIPLY16H((tmp12 + tmp13)<<2, FIX_0_707106781);
  1484. dataptr[0] = tmp13 + z1;
  1485. dataptr[1] = tmp13 - z1;
  1486. // Odd part
  1487. tmp10 = (tmp4 + tmp5) <<2;
  1488. tmp11 = (tmp5 + tmp6) <<2;
  1489. tmp12 = (tmp6 + tmp7) <<2;
  1490. z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
  1491. z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
  1492. z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
  1493. z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
  1494. z11 = tmp7 + z3;
  1495. z13 = tmp7 - z3;
  1496. dataptr[4] = z13 + z2;
  1497. dataptr[5] = z13 - z2;
  1498. dataptr[6] = z11 + z4;
  1499. dataptr[7] = z11 - z4;
  1500. pixels++; // advance pointer to next column
  1501. dataptr += DCTSIZE;
  1502. }
  1503. }
  1504. #else /* HAVE_MMX */
/*
 * MMX version of the horizontal forward-DCT pass (same fixed-point AAN
 * factorization as row_fdct_c).  Each iteration of loop "6:" loads
 * 4-byte groups (movd) from 8 input rows, runs the butterfly/rotation
 * network on four columns in parallel (one 16-bit lane per column),
 * transposes the two 4x4 halves, and stores 4 coefficient rows.
 *
 * Register / operand roles:
 *   REG_S = pixels    (advanced by 4 bytes per iteration)
 *   REG_D = data      (advanced by DCTSIZE_S*2*4 bytes = 4 rows)
 *   REG_a = line_size (input row stride in bytes)
 *   REG_c = cnt       (loop counter)
 *   REG_d = 3*line_size (computed by the lea below; declared clobbered)
 *   %3    = temps     (spill slots: t7 at 0*8, t6 at 1*8)
 *
 * NOTE(review): mm0-mm7 are left dirty and are not in the clobber list;
 * presumably the caller issues emms, as elsewhere in this file — confirm.
 */
static void row_fdct_mmx(int16_t *data, const uint8_t *pixels, int line_size, int cnt)
{
    /* 8-byte-aligned spill area, accessed with movq through the "=o" operand */
    uint64_t __attribute__((aligned(8))) temps[4];
    __asm__ volatile(
        "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t" /* REG_d = 3*line_size */
        "6: \n\t"
        /* load rows 0..2 of four columns; widen bytes to words with zero mm7 */
        "movd (%%"REG_S"), %%mm0 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "movd (%%"REG_S",%%"REG_a",), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "movd (%%"REG_S",%%"REG_a",2), %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "add %%"REG_d", %%"REG_S" \n\t" /* advance to row 3 */
        "movq %%mm0, %%mm5 \n\t"
        //
        /* load remaining rows (7,6,5,3,4) and form the sums t0..t3 and
           differences; t7/t6 are spilled to the temps slots */
        "movd (%%"REG_S",%%"REG_a",4), %%mm3 \n\t" //7 ;prefetch!
        "movq %%mm1, %%mm6 \n\t"
        "movd (%%"REG_S",%%"REG_d",), %%mm4 \n\t" //6
        "punpcklbw %%mm7, %%mm3 \n\t"
        "psubw %%mm3, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "paddw %%mm3, %%mm0 \n\t"
        "psubw %%mm4, %%mm6 \n\t"
        "movd (%%"REG_S",%%"REG_a",2), %%mm3 \n\t" //5
        "paddw %%mm4, %%mm1 \n\t"
        "movq %%mm5, 0*8+%3 \n\t" //t7
        "punpcklbw %%mm7, %%mm3 \n\t"
        "movq %%mm6, 1*8+%3 \n\t" //t6
        "movq %%mm2, %%mm4 \n\t"
        "movd (%%"REG_S"), %%mm5 \n\t" //3
        "paddw %%mm3, %%mm2 \n\t"
        "movd (%%"REG_S",%%"REG_a",), %%mm6 \n\t" //4
        "punpcklbw %%mm7, %%mm5 \n\t"
        "psubw %%mm3, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm6 \n\t"
        "movq %%mm5, %%mm3 \n\t"
        "paddw %%mm6, %%mm5 \n\t" //t3
        "psubw %%mm6, %%mm3 \n\t" //t4 ; t0 t1 t2 t4 t5 t3 - -
        /* even part: produce d0..d3 (coefficients 0..3) */
        "movq %%mm0, %%mm6 \n\t"
        "movq %%mm1, %%mm7 \n\t"
        "psubw %%mm5, %%mm0 \n\t" //t13
        "psubw %%mm2, %%mm1 \n\t"
        "paddw %%mm2, %%mm7 \n\t" //t11
        "paddw %%mm0, %%mm1 \n\t"
        "movq %%mm7, %%mm2 \n\t"
        "psllw $2, %%mm1 \n\t" /* pre-scale before pmulhw (keeps high 16 bits) */
        "paddw %%mm5, %%mm6 \n\t" //t10
        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm1 \n\t"
        "paddw %%mm6, %%mm7 \n\t" //d2
        "psubw %%mm2, %%mm6 \n\t" //d3
        "movq %%mm0, %%mm5 \n\t"
        //transpose 4x4
        "movq %%mm7, %%mm2 \n\t"
        "punpcklwd %%mm6, %%mm7 \n\t"
        "paddw %%mm1, %%mm0 \n\t" //d0
        "punpckhwd %%mm6, %%mm2 \n\t"
        "psubw %%mm1, %%mm5 \n\t" //d1
        "movq %%mm0, %%mm6 \n\t"
        "movq 1*8+%3, %%mm1 \n\t" /* reload spilled t6 */
        "punpcklwd %%mm5, %%mm0 \n\t"
        "punpckhwd %%mm5, %%mm6 \n\t"
        "movq %%mm0, %%mm5 \n\t"
        "punpckldq %%mm7, %%mm0 \n\t" //0
        "paddw %%mm4, %%mm3 \n\t"
        "punpckhdq %%mm7, %%mm5 \n\t" //1
        "movq %%mm6, %%mm7 \n\t"
        /* store the even 4x4 half (coefficients 0..3 of 4 rows) */
        "movq %%mm0, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
        "punpckldq %%mm2, %%mm6 \n\t" //2
        "movq %%mm5, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
        "punpckhdq %%mm2, %%mm7 \n\t" //3
        "movq %%mm6, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
        "paddw %%mm1, %%mm4 \n\t"
        "movq %%mm7, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
        /* odd part: rotations via pmulhw with the AAN constants */
        "psllw $2, %%mm3 \n\t" //t10
        "movq 0*8+%3, %%mm2 \n\t" /* reload spilled t7 */
        "psllw $2, %%mm4 \n\t" //t11
        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm4 \n\t" //z3
        "paddw %%mm2, %%mm1 \n\t"
        "psllw $2, %%mm1 \n\t" //t12
        "movq %%mm3, %%mm0 \n\t"
        "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm0 \n\t"
        "psubw %%mm1, %%mm3 \n\t"
        "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" //z5
        "movq %%mm2, %%mm5 \n\t"
        "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm1 \n\t"
        "psubw %%mm4, %%mm2 \n\t" //z13
        "paddw %%mm4, %%mm5 \n\t" //z11
        "movq %%mm2, %%mm6 \n\t"
        "paddw %%mm3, %%mm0 \n\t" //z2
        "movq %%mm5, %%mm7 \n\t"
        "paddw %%mm0, %%mm2 \n\t" //d4
        "psubw %%mm0, %%mm6 \n\t" //d5
        "movq %%mm2, %%mm4 \n\t"
        "paddw %%mm3, %%mm1 \n\t" //z4
        //transpose 4x4
        "punpcklwd %%mm6, %%mm2 \n\t"
        "paddw %%mm1, %%mm5 \n\t" //d6
        "punpckhwd %%mm6, %%mm4 \n\t"
        "psubw %%mm1, %%mm7 \n\t" //d7
        "movq %%mm5, %%mm6 \n\t"
        "punpcklwd %%mm7, %%mm5 \n\t"
        "punpckhwd %%mm7, %%mm6 \n\t"
        "movq %%mm2, %%mm7 \n\t"
        "punpckldq %%mm5, %%mm2 \n\t" //4
        "sub %%"REG_d", %%"REG_S" \n\t" /* rewind to row 0 of the input */
        "punpckhdq %%mm5, %%mm7 \n\t" //5
        "movq %%mm4, %%mm5 \n\t"
        /* store the odd 4x4 half at column offset DCTSIZE_S */
        "movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_D") \n\t"
        "punpckldq %%mm6, %%mm4 \n\t" //6
        "movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_D") \n\t"
        "punpckhdq %%mm6, %%mm5 \n\t" //7
        "movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_D") \n\t"
        "add $4, %%"REG_S" \n\t" /* next 4 input columns */
        "movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_D") \n\t"
        "add $"DCTSIZE_S"*2*4, %%"REG_D" \n\t" //4 rows
        "dec %%"REG_c" \n\t"
        "jnz 6b \n\t"
        : "+S"(pixels), "+D"(data), "+c"(cnt), "=o"(temps)
        : "a"(line_size)
        : "%"REG_d);
}
  1627. #endif // HAVE_MMX