/* vf_fspp.c */
  1. /*
  2. * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
  3. * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
  4. *
  5. * This file is part of MPlayer.
  6. *
  7. * MPlayer is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation; either version 2 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * MPlayer is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License along
  18. * with MPlayer; if not, write to the Free Software Foundation, Inc.,
  19. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20. */
  21. /*
  22. * This implementation is based on an algorithm described in
  23. * "Aria Nosratinia Embedded Post-Processing for
  24. * Enhancement of Compressed Images (1999)"
  25. * (http://citeseer.nj.nec.com/nosratinia99embedded.html)
  26. * Futher, with splitting (i)dct into hor/ver passes, one of them can be
  27. * performed once per block, not pixel. This allows for much better speed.
  28. */
  29. /*
  30. Heavily optimized version of SPP filter by Nikolaj
  31. */
  32. #include <stdio.h>
  33. #include <stdlib.h>
  34. #include <string.h>
  35. #include <inttypes.h>
  36. #include <math.h>
  37. #include "config.h"
  38. #include "mp_msg.h"
  39. #include "cpudetect.h"
  40. #include "img_format.h"
  41. #include "mp_image.h"
  42. #include "vf.h"
  43. #include "vd_ffmpeg.h"
  44. #include "libvo/fastmemcpy.h"
  45. #include "libavutil/internal.h"
  46. #include "libavutil/intreadwrite.h"
  47. #include "libavutil/mem.h"
  48. #include "libavcodec/avcodec.h"
  49. #include "libavcodec/dsputil.h"
  50. #undef free
  51. #undef malloc
//===========================================================================//
// Number of 8x8 blocks processed per inner-loop batch in filter().
#define BLOCKSZ 12
// Base (quantizer-independent) 8x8 threshold matrix, scanned row-major.
// It is scaled by the bias in vf_open() and by the quantizer in mul_thrmat_*().
static const short custom_threshold[64]=
// values (296) can't be too high
// -it causes too big quant dependence
// or maybe overflow(check), which results in some flashing
{ 71, 296, 295, 237, 71, 40, 38, 19,
245, 193, 185, 121, 102, 73, 53, 27,
158, 129, 141, 107, 97, 73, 50, 26,
102, 116, 109, 98, 82, 66, 45, 23,
 71, 94, 95, 81, 70, 56, 38, 20,
 56, 77, 74, 66, 56, 44, 30, 15,
 38, 53, 50, 45, 38, 30, 21, 11,
 20, 27, 26, 23, 20, 15, 11, 5
};
// 8x8 ordered-dither matrix (values 0..63) used when rounding the filtered
// 16-bit intermediate back to 8-bit in store_slice*_c/_mmx.
// 32-byte alignment lets the MMX code load rows with movq.
static const uint8_t __attribute__((aligned(32))) dither[8][8]={
{  0, 48, 12, 60,  3, 51, 15, 63, },
{ 32, 16, 44, 28, 35, 19, 47, 31, },
{  8, 56,  4, 52, 11, 59,  7, 55, },
{ 40, 24, 36, 20, 43, 27, 39, 23, },
{  2, 50, 14, 62,  1, 49, 13, 61, },
{ 34, 18, 46, 30, 33, 17, 45, 29, },
{ 10, 58,  6, 54,  9, 57,  5, 53, },
{ 42, 26, 38, 22, 41, 25, 37, 21, },
};
// Per-instance filter state. Field order matters: the MMX code addresses
// threshold_mtx via a fixed +8*8*2 byte offset from threshold_mtx_noq,
// so these two arrays must stay adjacent and first.
struct vf_priv_s { //align 16 !
uint64_t threshold_mtx_noq[8*2];   // base thresholds, packed 4 int16 per uint64 (see vf_open)
uint64_t threshold_mtx[8*2];//used in both C & MMX (& later SSE2) versions
int log2_count;    // filter strength, clamped to 4..5 (see control/vf_open)
int temp_stride;   // padded luma stride, (width+16+15)&~15 (set in config)
int qp;            // forced constant quantizer; 0 = use per-MB qscale
int mpeg2;         // qscale_type passed to norm_qscale()
int prev_q;        // last quantizer the threshold matrix was scaled for
uint8_t *src;      // padded copy of the source plane (config allocates)
int16_t *temp;     // 16-bit intermediate ring buffer, 3*8 rows of temp_stride
int bframes;       // nonzero: don't reuse non-B qp table (see put_image)
char *non_b_qp;    // cached qp table from the last non-B frame
};
  90. #if !HAVE_MMX
  91. //This func reads from 1 slice, 1 and clears 0 & 1
  92. static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
  93. {int y, x;
  94. #define STORE(pos) \
  95. temp= (src[x + pos] + (d[pos]>>log2_scale))>>(6-log2_scale); \
  96. src[x + pos]=src[x + pos - 8*src_stride]=0; \
  97. if(temp & 0x100) temp= ~(temp>>31); \
  98. dst[x + pos]= temp;
  99. for(y=0; y<height; y++){
  100. const uint8_t *d= dither[y];
  101. for(x=0; x<width; x+=8){
  102. int temp;
  103. STORE(0);
  104. STORE(1);
  105. STORE(2);
  106. STORE(3);
  107. STORE(4);
  108. STORE(5);
  109. STORE(6);
  110. STORE(7);
  111. }
  112. src+=src_stride;
  113. dst+=dst_stride;
  114. }
  115. }
  116. //This func reads from 2 slices, 0 & 2 and clears 2-nd
  117. static void store_slice2_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
  118. {int y, x;
  119. #define STORE2(pos) \
  120. temp= (src[x + pos] + src[x + pos + 16*src_stride] + (d[pos]>>log2_scale))>>(6-log2_scale); \
  121. src[x + pos + 16*src_stride]=0; \
  122. if(temp & 0x100) temp= ~(temp>>31); \
  123. dst[x + pos]= temp;
  124. for(y=0; y<height; y++){
  125. const uint8_t *d= dither[y];
  126. for(x=0; x<width; x+=8){
  127. int temp;
  128. STORE2(0);
  129. STORE2(1);
  130. STORE2(2);
  131. STORE2(3);
  132. STORE2(4);
  133. STORE2(5);
  134. STORE2(6);
  135. STORE2(7);
  136. }
  137. src+=src_stride;
  138. dst+=dst_stride;
  139. }
  140. }
  141. static void mul_thrmat_c(struct vf_priv_s *p,int q)
  142. {
  143. int a;
  144. for(a=0;a<64;a++)
  145. ((short*)p->threshold_mtx)[a]=q * ((short*)p->threshold_mtx_noq)[a];//ints faster in C
  146. }
// Forward declarations of the C DCT kernels (defined below).
static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt);
static void row_idct_c(DCTELEM* workspace,
int16_t* output_adr, int output_stride, int cnt);
static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt);
//this is rather ugly, but there is no need for function pointers
// Compile-time dispatch: *_s names resolve to the C implementations here.
#define store_slice_s store_slice_c
#define store_slice2_s store_slice2_c
#define mul_thrmat_s mul_thrmat_c
#define column_fidct_s column_fidct_c
#define row_idct_s row_idct_c
#define row_fdct_s row_fdct_c
  158. #else /* HAVE_MMX */
  159. //This func reads from 1 slice, 1 and clears 0 & 1
//This func reads from 1 slice, 1 and clears 0 & 1
// MMX version of store_slice_c: dithers one band of int16 back to uint8.
// Register roles (from the constraint list): %0 width, %1 src_stride (replaced
// by the back-step after the prologue), %2 od (dither base), %3 dst_stride,
// %4 end (dither row past the band), %5 log2_scale, %6 src, %7 dst.
// mm5 = log2_scale (dither pre-shift), mm2 = 6-log2_scale (final shift),
// REG_a = -16*src_stride so (%REG_S,%REG_a) addresses the slice 8 rows up,
// which is cleared together with the current slice (mm7 is kept zero).
static void store_slice_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
{
const uint8_t *od=&dither[0][0];
const uint8_t *end=&dither[height][0];
width = (width+7)&~7;   // round width up to a multiple of 8
dst_stride-=width;
//src_stride=(src_stride-width)*2;
__asm__ volatile(
"mov %5, %%"REG_d" \n\t"
"mov %6, %%"REG_S" \n\t"
"mov %7, %%"REG_D" \n\t"
"mov %1, %%"REG_a" \n\t"
"movd %%"REG_d", %%mm5 \n\t"
"xor $-1, %%"REG_d" \n\t"
"mov %%"REG_a", %%"REG_c" \n\t"
"add $7, %%"REG_d" \n\t"
"neg %%"REG_a" \n\t"
"sub %0, %%"REG_c" \n\t"
"add %%"REG_c", %%"REG_c" \n\t"
"movd %%"REG_d", %%mm2 \n\t"
"mov %%"REG_c", %1 \n\t"
"mov %2, %%"REG_d" \n\t"
"shl $4, %%"REG_a" \n\t"
"2: \n\t"
"movq (%%"REG_d"), %%mm3 \n\t"
"movq %%mm3, %%mm4 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm4 \n\t"
"mov %0, %%"REG_c" \n\t"
"psraw %%mm5, %%mm3 \n\t"
"psraw %%mm5, %%mm4 \n\t"
"1: \n\t"
"movq %%mm7, (%%"REG_S",%%"REG_a",) \n\t"
"movq (%%"REG_S"), %%mm0 \n\t"
"movq 8(%%"REG_S"), %%mm1 \n\t"
"movq %%mm7, 8(%%"REG_S",%%"REG_a",) \n\t"
"paddw %%mm3, %%mm0 \n\t"
"paddw %%mm4, %%mm1 \n\t"
"movq %%mm7, (%%"REG_S") \n\t"
"psraw %%mm2, %%mm0 \n\t"
"psraw %%mm2, %%mm1 \n\t"
"movq %%mm7, 8(%%"REG_S") \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"add $16, %%"REG_S" \n\t"
"movq %%mm0, (%%"REG_D") \n\t"
"add $8, %%"REG_D" \n\t"
"sub $8, %%"REG_c" \n\t"
"jg 1b \n\t"
"add %1, %%"REG_S" \n\t"
"add $8, %%"REG_d" \n\t"
"add %3, %%"REG_D" \n\t"
"cmp %4, %%"REG_d" \n\t"
"jl 2b \n\t"
:
: "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
"m" (log2_scale), "m" (src), "m" (dst) //input
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
);
}
  220. //This func reads from 2 slices, 0 & 2 and clears 2-nd
//This func reads from 2 slices, 0 & 2 and clears 2-nd
// MMX version of store_slice2_c: sums the current slice with the one
// 16 rows below (REG_a = 32*src_stride bytes via shl $5), dithers, packs
// to uint8, and clears only the lower slice. Operand/register layout is
// identical to store_slice_mmx above except REG_a is positive here.
static void store_slice2_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
{
const uint8_t *od=&dither[0][0];
const uint8_t *end=&dither[height][0];
width = (width+7)&~7;   // round width up to a multiple of 8
dst_stride-=width;
//src_stride=(src_stride-width)*2;
__asm__ volatile(
"mov %5, %%"REG_d" \n\t"
"mov %6, %%"REG_S" \n\t"
"mov %7, %%"REG_D" \n\t"
"mov %1, %%"REG_a" \n\t"
"movd %%"REG_d", %%mm5 \n\t"
"xor $-1, %%"REG_d" \n\t"
"mov %%"REG_a", %%"REG_c" \n\t"
"add $7, %%"REG_d" \n\t"
"sub %0, %%"REG_c" \n\t"
"add %%"REG_c", %%"REG_c" \n\t"
"movd %%"REG_d", %%mm2 \n\t"
"mov %%"REG_c", %1 \n\t"
"mov %2, %%"REG_d" \n\t"
"shl $5, %%"REG_a" \n\t"
"2: \n\t"
"movq (%%"REG_d"), %%mm3 \n\t"
"movq %%mm3, %%mm4 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm4 \n\t"
"mov %0, %%"REG_c" \n\t"
"psraw %%mm5, %%mm3 \n\t"
"psraw %%mm5, %%mm4 \n\t"
"1: \n\t"
"movq (%%"REG_S"), %%mm0 \n\t"
"movq 8(%%"REG_S"), %%mm1 \n\t"
"paddw %%mm3, %%mm0 \n\t"
"paddw (%%"REG_S",%%"REG_a",), %%mm0 \n\t"
"paddw %%mm4, %%mm1 \n\t"
"movq 8(%%"REG_S",%%"REG_a",), %%mm6 \n\t"
"movq %%mm7, (%%"REG_S",%%"REG_a",) \n\t"
"psraw %%mm2, %%mm0 \n\t"
"paddw %%mm6, %%mm1 \n\t"
"movq %%mm7, 8(%%"REG_S",%%"REG_a",) \n\t"
"psraw %%mm2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"movq %%mm0, (%%"REG_D") \n\t"
"add $16, %%"REG_S" \n\t"
"add $8, %%"REG_D" \n\t"
"sub $8, %%"REG_c" \n\t"
"jg 1b \n\t"
"add %1, %%"REG_S" \n\t"
"add $8, %%"REG_d" \n\t"
"add %3, %%"REG_D" \n\t"
"cmp %4, %%"REG_d" \n\t"
"jl 2b \n\t"
:
: "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
"m" (log2_scale), "m" (src), "m" (dst) //input
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S
);
}
// MMX version of mul_thrmat_c: broadcasts q into mm7 (punpcklwd+punpckldq)
// and multiplies all 128 int16 thresholds, reading from threshold_mtx_noq
// (REG_S) and writing threshold_mtx (REG_D = REG_S + 8*8*2 bytes — this is
// why the two arrays must be adjacent in struct vf_priv_s).
// NOTE(review): adr is bound to both "+S" and "+D"; the destination offset
// is produced by the explicit "add $8*8*2" — fragile but intentional. Verify
// before touching the struct layout or the constraints.
static void mul_thrmat_mmx(struct vf_priv_s *p, int q)
{
uint64_t *adr=&p->threshold_mtx_noq[0];
__asm__ volatile(
"movd %0, %%mm7 \n\t"
"add $8*8*2, %%"REG_D" \n\t"
"movq 0*8(%%"REG_S"), %%mm0 \n\t"
"punpcklwd %%mm7, %%mm7 \n\t"
"movq 1*8(%%"REG_S"), %%mm1 \n\t"
"punpckldq %%mm7, %%mm7 \n\t"
"pmullw %%mm7, %%mm0 \n\t"
"movq 2*8(%%"REG_S"), %%mm2 \n\t"
"pmullw %%mm7, %%mm1 \n\t"
"movq 3*8(%%"REG_S"), %%mm3 \n\t"
"pmullw %%mm7, %%mm2 \n\t"
"movq %%mm0, 0*8(%%"REG_D") \n\t"
"movq 4*8(%%"REG_S"), %%mm4 \n\t"
"pmullw %%mm7, %%mm3 \n\t"
"movq %%mm1, 1*8(%%"REG_D") \n\t"
"movq 5*8(%%"REG_S"), %%mm5 \n\t"
"pmullw %%mm7, %%mm4 \n\t"
"movq %%mm2, 2*8(%%"REG_D") \n\t"
"movq 6*8(%%"REG_S"), %%mm6 \n\t"
"pmullw %%mm7, %%mm5 \n\t"
"movq %%mm3, 3*8(%%"REG_D") \n\t"
"movq 7*8+0*8(%%"REG_S"), %%mm0 \n\t"
"pmullw %%mm7, %%mm6 \n\t"
"movq %%mm4, 4*8(%%"REG_D") \n\t"
"movq 7*8+1*8(%%"REG_S"), %%mm1 \n\t"
"pmullw %%mm7, %%mm0 \n\t"
"movq %%mm5, 5*8(%%"REG_D") \n\t"
"movq 7*8+2*8(%%"REG_S"), %%mm2 \n\t"
"pmullw %%mm7, %%mm1 \n\t"
"movq %%mm6, 6*8(%%"REG_D") \n\t"
"movq 7*8+3*8(%%"REG_S"), %%mm3 \n\t"
"pmullw %%mm7, %%mm2 \n\t"
"movq %%mm0, 7*8+0*8(%%"REG_D") \n\t"
"movq 7*8+4*8(%%"REG_S"), %%mm4 \n\t"
"pmullw %%mm7, %%mm3 \n\t"
"movq %%mm1, 7*8+1*8(%%"REG_D") \n\t"
"movq 7*8+5*8(%%"REG_S"), %%mm5 \n\t"
"pmullw %%mm7, %%mm4 \n\t"
"movq %%mm2, 7*8+2*8(%%"REG_D") \n\t"
"movq 7*8+6*8(%%"REG_S"), %%mm6 \n\t"
"pmullw %%mm7, %%mm5 \n\t"
"movq %%mm3, 7*8+3*8(%%"REG_D") \n\t"
"movq 14*8+0*8(%%"REG_S"), %%mm0 \n\t"
"pmullw %%mm7, %%mm6 \n\t"
"movq %%mm4, 7*8+4*8(%%"REG_D") \n\t"
"movq 14*8+1*8(%%"REG_S"), %%mm1 \n\t"
"pmullw %%mm7, %%mm0 \n\t"
"movq %%mm5, 7*8+5*8(%%"REG_D") \n\t"
"pmullw %%mm7, %%mm1 \n\t"
"movq %%mm6, 7*8+6*8(%%"REG_D") \n\t"
"movq %%mm0, 14*8+0*8(%%"REG_D") \n\t"
"movq %%mm1, 14*8+1*8(%%"REG_D") \n\t"
: "+g" (q), "+S" (adr), "+D" (adr)
:
);
}
// Forward declarations of the MMX DCT kernels (defined below).
static void column_fidct_mmx(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt);
static void row_idct_mmx(DCTELEM* workspace,
int16_t* output_adr, int output_stride, int cnt);
static void row_fdct_mmx(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt);
// Compile-time dispatch: *_s names resolve to the MMX implementations here.
#define store_slice_s store_slice_mmx
#define store_slice2_s store_slice2_mmx
#define mul_thrmat_s mul_thrmat_mmx
#define column_fidct_s column_fidct_mmx
#define row_idct_s row_idct_mmx
#define row_fdct_s row_fdct_mmx
#endif // HAVE_MMX
// Core FSPP pass for one plane: copies the source into a mirrored-padded
// buffer (p->src), then for each band runs row fDCT -> column fDCT+threshold+
// column iDCT -> row iDCT, accumulating shifted results in the 16-row ring
// buffer p->temp, and flushes completed 8-row slices to dst with dithering.
// qp_store/qp_stride give the per-macroblock quantizer table (ignored when
// p->qp forces a constant quantizer); is_luma selects stride/qp subsampling.
static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src,
int dst_stride, int src_stride,
int width, int height,
uint8_t *qp_store, int qp_stride, int is_luma)
{
int x, x0, y, es, qy, t;
const int stride= is_luma ? p->temp_stride : (width+16);//((width+16+15)&(~15))
const int step=6-p->log2_count;   // rows advanced per iteration: 1 or 2
const int qps= 3 + is_luma;       // shift from pixel x/y to qp-table index
int32_t __attribute__((aligned(32))) block_align[4*8*BLOCKSZ+ 4*8*BLOCKSZ];
DCTELEM *block= (DCTELEM *)block_align;
DCTELEM *block3=(DCTELEM *)(block_align+4*8*BLOCKSZ);
// NOTE(review): this zeroes only 4*8*BLOCKSZ BYTES, a quarter of block3's
// int16 span — looks intentional (only the warm-up region needs clearing)
// but verify against the row_idct accumulation before changing it.
memset(block3, 0, 4*8*BLOCKSZ);
//p->src=src-src_stride*8-8;//!
if (!src || !dst) return; // HACK avoid crash for Y8 colourspace
// Copy the plane into p->src with 8 mirrored columns on each side...
for(y=0; y<height; y++){
int index= 8 + 8*stride + y*stride;
fast_memcpy(p->src + index, src + y*src_stride, width);//this line can be avoided by using DR & user fr.buffers
for(x=0; x<8; x++){
p->src[index - x - 1]= p->src[index + x ];
p->src[index + width + x ]= p->src[index + width - x - 1];
}
}
// ...and 8 mirrored rows above and below.
for(y=0; y<8; y++){
fast_memcpy(p->src + ( 7-y)*stride, p->src + ( y+8)*stride, stride);
fast_memcpy(p->src + (height+8+y)*stride, p->src + (height-y+7)*stride, stride);
}
//FIXME (try edge emu)
// Clear the ring-buffer rows the first iterations will accumulate into.
for(y=8; y<24; y++)
memset(p->temp+ 8 +y*stride, 0,width*sizeof(int16_t));
for(y=step; y<height+8; y+=step){ //step= 1,2
// Map the row to a clamped qp-table row offset.
qy=y-4;
if (qy>height-1) qy=height-1;
if (qy<0) qy=0;
qy=(qy>>qps)*qp_stride;
// Prime the first 2 block columns; x offset alternates by row parity.
row_fdct_s(block, p->src + y*stride +2-(y&1), stride, 2);
for(x0=0; x0<width+8-8*(BLOCKSZ-1); x0+=8*(BLOCKSZ-1)){
row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, 2*(BLOCKSZ-1));
if(p->qp)
column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+0*8, block3+0*8, 8*(BLOCKSZ-1)); //yes, this is a HOTSPOT
else
for (x=0; x<8*(BLOCKSZ-1); x+=8) {
t=x+x0-2; //correct t=x+x0-2-(y&1), but its the same
if (t<0) t=0;//t always < width-2
t=qp_store[qy+(t>>qps)];
t=norm_qscale(t, p->mpeg2);
// Rescale the threshold matrix only when the quantizer changes.
if (t!=p->prev_q) p->prev_q=t, mul_thrmat_s(p, t);
column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+x*8, block3+x*8, 8); //yes, this is a HOTSPOT
}
row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, 2*(BLOCKSZ-1));
memmove(block, block+(BLOCKSZ-1)*64, 8*8*sizeof(DCTELEM)); //cycling
memmove(block3, block3+(BLOCKSZ-1)*64, 6*8*sizeof(DCTELEM));
}
//
// Tail: process the remaining (<= 8*(BLOCKSZ-1)) columns of this row.
es=width+8-x0; // 8, ...
if (es>8)
row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, (es-4)>>2);
column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block, block3, es&(~1));
row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, es>>2);
// Flush a completed 8-row slice from the ring buffer to dst.
{const int y1=y-8+step;//l5-7 l4-6
if (!(y1&7) && y1) {
if (y1&8) store_slice_s(dst + (y1-8)*dst_stride, p->temp+ 8 +8*stride,
dst_stride, stride, width, 8, 5-p->log2_count);
else store_slice2_s(dst + (y1-8)*dst_stride, p->temp+ 8 +0*stride,
dst_stride, stride, width, 8, 5-p->log2_count);
} }
}
// Flush the final partial slice, if any.
if (y&7) { // == height & 7
if (y&8) store_slice_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +8*stride,
dst_stride, stride, width, y&7, 5-p->log2_count);
else store_slice2_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +0*stride,
dst_stride, stride, width, y&7, 5-p->log2_count);
}
}
  426. static int config(struct vf_instance *vf,
  427. int width, int height, int d_width, int d_height,
  428. unsigned int flags, unsigned int outfmt)
  429. {
  430. int h= (height+16+15)&(~15);
  431. vf->priv->temp_stride= (width+16+15)&(~15);
  432. vf->priv->temp= (int16_t*)av_mallocz(vf->priv->temp_stride*3*8*sizeof(int16_t));
  433. //this can also be avoided, see above
  434. vf->priv->src = (uint8_t*)av_malloc(vf->priv->temp_stride*h*sizeof(uint8_t));
  435. return vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
  436. }
  437. static void get_image(struct vf_instance *vf, mp_image_t *mpi)
  438. {
  439. if(mpi->flags&MP_IMGFLAG_PRESERVE) return; // don't change
  440. // ok, we can do pp in-place (or pp disabled):
  441. vf->dmpi=vf_get_image(vf->next,mpi->imgfmt,
  442. mpi->type, mpi->flags, mpi->width, mpi->height);
  443. mpi->planes[0]=vf->dmpi->planes[0];
  444. mpi->stride[0]=vf->dmpi->stride[0];
  445. mpi->width=vf->dmpi->width;
  446. if(mpi->flags&MP_IMGFLAG_PLANAR){
  447. mpi->planes[1]=vf->dmpi->planes[1];
  448. mpi->planes[2]=vf->dmpi->planes[2];
  449. mpi->stride[1]=vf->dmpi->stride[1];
  450. mpi->stride[2]=vf->dmpi->stride[2];
  451. }
  452. mpi->flags|=MP_IMGFLAG_DIRECT;
  453. }
  454. static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts)
  455. {
  456. mp_image_t *dmpi;
  457. if(!(mpi->flags&MP_IMGFLAG_DIRECT)){
  458. // no DR, so get a new image! hope we'll get DR buffer:
  459. dmpi=vf_get_image(vf->next,mpi->imgfmt,
  460. MP_IMGTYPE_TEMP,
  461. MP_IMGFLAG_ACCEPT_STRIDE|MP_IMGFLAG_PREFER_ALIGNED_STRIDE,
  462. mpi->width,mpi->height);
  463. vf_clone_mpi_attributes(dmpi, mpi);
  464. }else{
  465. dmpi=vf->dmpi;
  466. }
  467. vf->priv->mpeg2= mpi->qscale_type;
  468. if(mpi->pict_type != 3 && mpi->qscale && !vf->priv->qp){
  469. int w = mpi->qstride;
  470. int h = (mpi->h + 15) >> 4;
  471. if (!w) {
  472. w = (mpi->w + 15) >> 4;
  473. h = 1;
  474. }
  475. if(!vf->priv->non_b_qp)
  476. vf->priv->non_b_qp= malloc(w*h);
  477. fast_memcpy(vf->priv->non_b_qp, mpi->qscale, w*h);
  478. }
  479. if(vf->priv->log2_count || !(mpi->flags&MP_IMGFLAG_DIRECT)){
  480. char *qp_tab= vf->priv->non_b_qp;
  481. if(vf->priv->bframes || !qp_tab)
  482. qp_tab= mpi->qscale;
  483. if(qp_tab || vf->priv->qp){
  484. filter(vf->priv, dmpi->planes[0], mpi->planes[0], dmpi->stride[0], mpi->stride[0],
  485. mpi->w, mpi->h, qp_tab, mpi->qstride, 1);
  486. filter(vf->priv, dmpi->planes[1], mpi->planes[1], dmpi->stride[1], mpi->stride[1],
  487. mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
  488. filter(vf->priv, dmpi->planes[2], mpi->planes[2], dmpi->stride[2], mpi->stride[2],
  489. mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
  490. }else{
  491. memcpy_pic(dmpi->planes[0], mpi->planes[0], mpi->w, mpi->h, dmpi->stride[0], mpi->stride[0]);
  492. memcpy_pic(dmpi->planes[1], mpi->planes[1], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[1], mpi->stride[1]);
  493. memcpy_pic(dmpi->planes[2], mpi->planes[2], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[2], mpi->stride[2]);
  494. }
  495. }
  496. #if HAVE_MMX
  497. if(gCpuCaps.hasMMX) __asm__ volatile ("emms\n\t");
  498. #endif
  499. #if HAVE_MMX2
  500. if(gCpuCaps.hasMMX2) __asm__ volatile ("sfence\n\t");
  501. #endif
  502. return vf_next_put_image(vf,dmpi, pts);
  503. }
  504. static void uninit(struct vf_instance *vf)
  505. {
  506. if(!vf->priv) return;
  507. av_free(vf->priv->temp);
  508. vf->priv->temp= NULL;
  509. av_free(vf->priv->src);
  510. vf->priv->src= NULL;
  511. //free(vf->priv->avctx);
  512. //vf->priv->avctx= NULL;
  513. free(vf->priv->non_b_qp);
  514. vf->priv->non_b_qp= NULL;
  515. av_free(vf->priv);
  516. vf->priv=NULL;
  517. }
  518. //===========================================================================//
  519. static int query_format(struct vf_instance *vf, unsigned int fmt)
  520. {
  521. switch(fmt){
  522. case IMGFMT_YVU9:
  523. case IMGFMT_IF09:
  524. case IMGFMT_YV12:
  525. case IMGFMT_I420:
  526. case IMGFMT_IYUV:
  527. case IMGFMT_CLPL:
  528. case IMGFMT_Y800:
  529. case IMGFMT_Y8:
  530. case IMGFMT_444P:
  531. case IMGFMT_422P:
  532. case IMGFMT_411P:
  533. return vf_next_query_format(vf,fmt);
  534. }
  535. return 0;
  536. }
  537. static int control(struct vf_instance *vf, int request, void* data)
  538. {
  539. switch(request){
  540. case VFCTRL_QUERY_MAX_PP_LEVEL:
  541. return 5;
  542. case VFCTRL_SET_PP_LEVEL:
  543. vf->priv->log2_count= *((unsigned int*)data);
  544. if (vf->priv->log2_count < 4) vf->priv->log2_count=4;
  545. return CONTROL_TRUE;
  546. }
  547. return vf_next_control(vf,request,data);
  548. }
// Instance constructor. Parses "log2c:qp:bias:bframes" from args, builds the
// bias-scaled base threshold matrix, and packs it into threshold_mtx_noq in
// the interleaved int16-in-uint64 order the DCT kernels expect.
// Returns 1 (success) as the vf_open convention requires.
static int vf_open(vf_instance_t *vf, char *args)
{
int i=0, bias;
int custom_threshold_m[64];
int log2c=-1;
vf->config=config;
vf->put_image=put_image;
vf->get_image=get_image;
vf->query_format=query_format;
vf->uninit=uninit;
vf->control= control;
vf->priv=av_mallocz(sizeof(struct vf_priv_s));//assumes align 16 !
init_avcodec();
//vf->priv->avctx= avcodec_alloc_context();
//dsputil_init(&vf->priv->dsp, vf->priv->avctx);
vf->priv->log2_count= 4;
vf->priv->bframes = 0;
// args: log2c (strength 4..5), qp (forced quantizer), i (bias), bframes
if (args) sscanf(args, "%d:%d:%d:%d", &log2c, &vf->priv->qp, &i, &vf->priv->bframes);
if( log2c >=4 && log2c <=5 )
vf->priv->log2_count = log2c;
else if( log2c >= 6 )
vf->priv->log2_count = 5;
if(vf->priv->qp < 0)
vf->priv->qp = 0;
// clamp the user bias to [-15, 32]; 0 maps to the neutral factor 16/16
if (i < -15) i = -15;
if (i > 32) i = 32;
bias= (1<<4)+i; //regulable
vf->priv->prev_q=0;
//
for(i=0;i<64;i++) //FIXME: tune custom_threshold[] and remove this !
custom_threshold_m[i]=(int)(custom_threshold[i]*(bias/71.)+ 0.5);
// Pack each row's 8 thresholds into two uint64s in the column order
// (2,6,0,4) and (5,3,1,7) consumed by the (MMX) column_fidct kernels.
for(i=0;i<8;i++){
vf->priv->threshold_mtx_noq[2*i]=(uint64_t)custom_threshold_m[i*8+2]
|(((uint64_t)custom_threshold_m[i*8+6])<<16)
|(((uint64_t)custom_threshold_m[i*8+0])<<32)
|(((uint64_t)custom_threshold_m[i*8+4])<<48);
vf->priv->threshold_mtx_noq[2*i+1]=(uint64_t)custom_threshold_m[i*8+5]
|(((uint64_t)custom_threshold_m[i*8+3])<<16)
|(((uint64_t)custom_threshold_m[i*8+1])<<32)
|(((uint64_t)custom_threshold_m[i*8+7])<<48);
}
// A forced qp lets us scale the matrix once, up front.
if (vf->priv->qp) vf->priv->prev_q=vf->priv->qp, mul_thrmat_s(vf->priv, vf->priv->qp);
return 1;
}
// Filter registration record: name, short name, authors, comment, open().
const vf_info_t vf_info_fspp = {
"fast simple postprocess",
"fspp",
"Michael Niedermayer, Nikolaj Poroshin",
"",
vf_open,
NULL
};
//====================================================================
//Specific spp's dct, idct and threshold functions
//I'd prefer to have them in the separate file.
//#define MANGLE(a) #a
//typedef int16_t DCTELEM; //! only int16_t
#define DCTSIZE 8
#define DCTSIZE_S "8"
// FIX: float -> s-bit fixed point, truncated to 16 bits.
#define FIX(x,s) ((int) ((x) * (1<<s) + 0.5)&0xffff)
// C64: replicate a 16-bit value into all four lanes of a uint64.
#define C64(x) ((uint64_t)((x)|(x)<<16))<<32 | (uint64_t)(x) | (uint64_t)(x)<<16
#define FIX64(x,s) C64(FIX(x,s))
// Multiply by a Q16 constant, keeping the high half.
#define MULTIPLY16H(x,k) (((x)*(k))>>16)
// Soft threshold: zero x when |x| <= t (implemented with one unsigned compare).
#define THRESHOLD(r,x,t) if(((unsigned)((x)+t))>t*2) r=(x);else r=0;
// Rounding right shift by n.
#define DESCALE(x,n) (((x) + (1 << ((n)-1))) >> n)
#if HAVE_MMX
// AAN DCT rotation constants as lane-replicated fixed point (Q14 or Q13)
// for the MMX kernels.
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433)=FIX64(0.382683433, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_541196100)=FIX64(0.541196100, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_707106781)=FIX64(0.707106781, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965)=FIX64(1.306562965, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A)=FIX64(1.414213562, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065)=FIX64(1.847759065, 13);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930)=FIX64(-2.613125930, 13); //-
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562)=FIX64(1.414213562, 13);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200)=FIX64(1.082392200, 13);
//for t3,t5,t7 == 0 shortcut
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065)=FIX64(0.847759065, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497)=FIX64(0.566454497, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367)=FIX64(0.198912367, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND)=C64(4);
DECLARE_ASM_CONST(8, uint64_t, MM_2)=C64(2);
#else /* !HAVE_MMX */
// Scalar fallbacks: same constants as scalar int16, with 32-bit intermediates.
typedef int32_t int_simd16_t;
static const int16_t FIX_0_382683433=FIX(0.382683433, 14);
static const int16_t FIX_0_541196100=FIX(0.541196100, 14);
static const int16_t FIX_0_707106781=FIX(0.707106781, 14);
static const int16_t FIX_1_306562965=FIX(1.306562965, 14);
static const int16_t FIX_1_414213562_A=FIX(1.414213562, 14);
static const int16_t FIX_1_847759065=FIX(1.847759065, 13);
static const int16_t FIX_2_613125930=FIX(-2.613125930, 13); //-
static const int16_t FIX_1_414213562=FIX(1.414213562, 13);
static const int16_t FIX_1_082392200=FIX(1.082392200, 13);
#endif
#if !HAVE_MMX
// Scalar fused column FDCT -> coefficient threshold -> column IDCT
// (AAN factorization).  For each column: forward-transform it, zero
// coefficients that fall inside the per-frequency dead zone given by
// thr_adr (8x8 threshold matrix, indexed [freq*8 + column]), then
// inverse-transform and accumulate into 'output'.
//
// thr_adr  - threshold matrix for the current quantizer
// data     - input coefficients, 8 interleaved columns per stripe
// output   - accumulator; rows 0..5 are added to, rows 6..7 overwritten
//            (they belong to the next overlapping block position)
// cnt      - number of 8-column stripes * 2; each outer iteration
//            consumes one stripe and skips the next start position
// NOTE(review): exact shift placement and the +2 rounding bias are
// tuned ("psnr decides") to match the MMX path — do not reorder.
static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
    DCTELEM* dataptr;
    DCTELEM* wsptr;
    int16_t *threshold;
    int ctr;

    dataptr = data;
    wsptr = output;

    for (; cnt > 0; cnt-=2) { //start positions
        threshold=(int16_t*)thr_adr;//threshold_mtx
        for (ctr = DCTSIZE; ctr > 0; ctr--) {
            // Process columns from input, add to output.
            tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
            tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
            tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
            tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
            tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
            tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
            tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
            tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];

            // Even part of FDCT
            tmp10 = tmp0 + tmp3;
            tmp13 = tmp0 - tmp3;
            tmp11 = tmp1 + tmp2;
            tmp12 = tmp1 - tmp2;

            d0 = tmp10 + tmp11;
            d4 = tmp10 - tmp11;

            z1 = MULTIPLY16H((tmp12 + tmp13) <<2, FIX_0_707106781);
            d2 = tmp13 + z1;
            d6 = tmp13 - z1;

            // Even part of IDCT
            // Threshold the even DCT coefficients, then immediately
            // start inverting them (fused FDCT/IDCT, no full store).
            THRESHOLD(tmp0, d0, threshold[0*8]);
            THRESHOLD(tmp1, d2, threshold[2*8]);
            THRESHOLD(tmp2, d4, threshold[4*8]);
            THRESHOLD(tmp3, d6, threshold[6*8]);
            tmp0+=2;  // rounding bias before the >>2 descale below
            tmp10 = (tmp0 + tmp2)>>2;
            tmp11 = (tmp0 - tmp2)>>2;

            tmp13 = (tmp1 + tmp3)>>2; //+2 ! (psnr decides)
            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2

            tmp0 = tmp10 + tmp13; //->temps
            tmp3 = tmp10 - tmp13; //->temps
            tmp1 = tmp11 + tmp12; //->temps
            tmp2 = tmp11 - tmp12; //->temps

            // Odd part of FDCT
            tmp10 = tmp4 + tmp5;
            tmp11 = tmp5 + tmp6;
            tmp12 = tmp6 + tmp7;

            z5 = MULTIPLY16H((tmp10 - tmp12)<<2, FIX_0_382683433);
            z2 = MULTIPLY16H(tmp10 <<2, FIX_0_541196100) + z5;
            z4 = MULTIPLY16H(tmp12 <<2, FIX_1_306562965) + z5;
            z3 = MULTIPLY16H(tmp11 <<2, FIX_0_707106781);

            z11 = tmp7 + z3;
            z13 = tmp7 - z3;

            d5 = z13 + z2;
            d3 = z13 - z2;
            d1 = z11 + z4;
            d7 = z11 - z4;

            // Odd part of IDCT
            THRESHOLD(tmp4, d1, threshold[1*8]);
            THRESHOLD(tmp5, d3, threshold[3*8]);
            THRESHOLD(tmp6, d5, threshold[5*8]);
            THRESHOLD(tmp7, d7, threshold[7*8]);

            //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
            z13 = tmp6 + tmp5;
            z10 = (tmp6 - tmp5)<<1;
            z11 = tmp4 + tmp7;
            z12 = (tmp4 - tmp7)<<1;

            tmp7 = (z11 + z13)>>2; //+2 !
            tmp11 = MULTIPLY16H((z11 - z13)<<1, FIX_1_414213562);
            z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
            tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
            tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!

            tmp6 = tmp12 - tmp7;
            tmp5 = tmp11 - tmp6;
            tmp4 = tmp10 + tmp5;

            // Accumulate; rows 6 and 7 are plain stores ('='), they start
            // a fresh accumulation for the next overlapping block.
            wsptr[DCTSIZE*0]+= (tmp0 + tmp7);
            wsptr[DCTSIZE*1]+= (tmp1 + tmp6);
            wsptr[DCTSIZE*2]+= (tmp2 + tmp5);
            wsptr[DCTSIZE*3]+= (tmp3 - tmp4);
            wsptr[DCTSIZE*4]+= (tmp3 + tmp4);
            wsptr[DCTSIZE*5]+= (tmp2 - tmp5);
            wsptr[DCTSIZE*6]= (tmp1 - tmp6);
            wsptr[DCTSIZE*7]= (tmp0 - tmp7);
            //
            dataptr++; //next column
            wsptr++;
            threshold++;
        }
        dataptr+=8; //skip each second start pos
        wsptr  +=8;
    }
}
#else /* HAVE_MMX */
// MMX version of column_fidct_c: fused column FDCT -> threshold -> IDCT
// on four 16-bit columns per MMX register.
//
// Register / operand roles inside the asm:
//   REG_S (%0)  = data     - input coefficients, advanced as consumed
//   REG_D (%1)  = output   - accumulator, advanced in lock-step
//   REG_c (%2)  = cnt      - stripe counter, decremented by 2 per loop
//   %3 (temps)  = four qword spill slots holding 't0..'t3 between the
//                 even and odd half of the inverse transform
//   REG_d       = thr_adr  - threshold matrix; N*16 offsets select the
//                 per-frequency threshold row, +1*8 selects lanes 4..7
//
// Thresholding is done branch-free with saturating adds/subs
// (paddusw/psubusw around paddw/psubw), matching the C THRESHOLD macro.
// After the odd coefficients are thresholded, por/packssdw/movd tests
// whether they all became zero: if so execution falls through to the
// cheap shortcut (only t4/mm0 may be nonzero); "jnz 2f"/"jnz 3f" take
// the full non-DC IDCT path.  The loop body is unrolled twice ("part 2"
// repeats the same code for the next four lanes), then jumps back to
// "1:" or exits at "5:".
static void column_fidct_mmx(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt)
{
    uint64_t __attribute__((aligned(8))) temps[4];
    __asm__ volatile(
        ASMALIGN(4)
        "1: \n\t"
        "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
        //
        "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
        "movq %%mm1, %%mm0 \n\t"

        "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0
        "movq %%mm7, %%mm3 \n\t"

        "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
        "movq %%mm1, %%mm5 \n\t"

        "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
        "psubw %%mm7, %%mm1 \n\t" //t13

        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
        "movq %%mm6, %%mm4 \n\t"

        "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
        "paddw %%mm7, %%mm5 \n\t" //t10

        "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
        "movq %%mm6, %%mm7 \n\t"

        "paddw %%mm2, %%mm6 \n\t" //t11
        "psubw %%mm2, %%mm7 \n\t" //t12

        "movq %%mm5, %%mm2 \n\t"
        "paddw %%mm6, %%mm5 \n\t" //d0
        // i0 t13 t12 i3 i1 d0 - d4
        "psubw %%mm6, %%mm2 \n\t" //d4
        "paddw %%mm1, %%mm7 \n\t"

        "movq 4*16(%%"REG_d"), %%mm6 \n\t"
        "psllw $2, %%mm7 \n\t"

        // threshold d0 against row 0 and d4 against row 4 (branch-free)
        "psubw 0*16(%%"REG_d"), %%mm5 \n\t"
        "psubw %%mm6, %%mm2 \n\t"

        "paddusw 0*16(%%"REG_d"), %%mm5 \n\t"
        "paddusw %%mm6, %%mm2 \n\t"

        "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t"
        //
        "paddw 0*16(%%"REG_d"), %%mm5 \n\t"
        "paddw %%mm6, %%mm2 \n\t"

        "psubusw 0*16(%%"REG_d"), %%mm5 \n\t"
        "psubusw %%mm6, %%mm2 \n\t"

        //This func is totally compute-bound, operates at huge speed. So, DC shortcut
        // at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
        //However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare.
        "paddw "MANGLE(MM_2)", %%mm5 \n\t"
        "movq %%mm2, %%mm6 \n\t"

        "paddw %%mm5, %%mm2 \n\t"
        "psubw %%mm6, %%mm5 \n\t"

        "movq %%mm1, %%mm6 \n\t"
        "paddw %%mm7, %%mm1 \n\t" //d2

        "psubw 2*16(%%"REG_d"), %%mm1 \n\t"
        "psubw %%mm7, %%mm6 \n\t" //d6

        "movq 6*16(%%"REG_d"), %%mm7 \n\t"
        "psraw $2, %%mm5 \n\t"

        "paddusw 2*16(%%"REG_d"), %%mm1 \n\t"
        "psubw %%mm7, %%mm6 \n\t"
        // t7 d2 /t11 t4 t6 - d6 /t10

        "paddw 2*16(%%"REG_d"), %%mm1 \n\t"
        "paddusw %%mm7, %%mm6 \n\t"

        "psubusw 2*16(%%"REG_d"), %%mm1 \n\t"
        "paddw %%mm7, %%mm6 \n\t"

        "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
        "psubusw %%mm7, %%mm6 \n\t"

        //movq [edi+"DCTSIZE_S"*2*2], mm1
        //movq [edi+"DCTSIZE_S"*6*2], mm6
        "movq %%mm1, %%mm7 \n\t"
        "psraw $2, %%mm2 \n\t"

        "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
        "psubw %%mm6, %%mm1 \n\t"

        "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
        "paddw %%mm7, %%mm6 \n\t" //'t13

        "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !!  ---
        "movq %%mm2, %%mm7 \n\t"

        "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
        "paddw %%mm6, %%mm2 \n\t" //'t0

        "movq %%mm2, 0*8+%3 \n\t" //!
        "psubw %%mm6, %%mm7 \n\t" //'t3

        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
        "psubw %%mm6, %%mm1 \n\t" //'t12

        "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
        "movq %%mm5, %%mm6 \n\t"

        "movq %%mm7, 3*8+%3 \n\t"
        "paddw %%mm2, %%mm3 \n\t" //t10

        "paddw %%mm4, %%mm2 \n\t" //t11
        "paddw %%mm0, %%mm4 \n\t" //t12

        "movq %%mm3, %%mm7 \n\t"
        "psubw %%mm4, %%mm3 \n\t"

        "psllw $2, %%mm3 \n\t"
        "psllw $2, %%mm7 \n\t" //opt for P6

        "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
        "psllw $2, %%mm4 \n\t"

        "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm7 \n\t"
        "psllw $2, %%mm2 \n\t"

        "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
        "paddw %%mm1, %%mm5 \n\t" //'t1

        "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm2 \n\t"
        "psubw %%mm1, %%mm6 \n\t" //'t2
        // t7 't12 't11 t4 t6 - 't13 't10   ---

        "paddw %%mm3, %%mm7 \n\t" //z2

        "movq %%mm5, 1*8+%3 \n\t"
        "paddw %%mm3, %%mm4 \n\t" //z4

        "movq 3*16(%%"REG_d"), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"

        "movq %%mm6, 2*8+%3 \n\t"
        "psubw %%mm2, %%mm1 \n\t" //z13

        //===
        "paddw %%mm2, %%mm0 \n\t" //z11
        "movq %%mm1, %%mm5 \n\t"

        "movq 5*16(%%"REG_d"), %%mm2 \n\t"
        "psubw %%mm7, %%mm1 \n\t" //d3

        "paddw %%mm7, %%mm5 \n\t" //d5
        "psubw %%mm3, %%mm1 \n\t"

        "movq 1*16(%%"REG_d"), %%mm7 \n\t"
        "psubw %%mm2, %%mm5 \n\t"

        "movq %%mm0, %%mm6 \n\t"
        "paddw %%mm4, %%mm0 \n\t" //d1

        "paddusw %%mm3, %%mm1 \n\t"
        "psubw %%mm4, %%mm6 \n\t" //d7
        // d1 d3 - - - d5 d7 -

        "movq 7*16(%%"REG_d"), %%mm4 \n\t"
        "psubw %%mm7, %%mm0 \n\t"

        "psubw %%mm4, %%mm6 \n\t"
        "paddusw %%mm2, %%mm5 \n\t"

        "paddusw %%mm4, %%mm6 \n\t"
        "paddw %%mm3, %%mm1 \n\t"

        "paddw %%mm2, %%mm5 \n\t"
        "paddw %%mm4, %%mm6 \n\t"

        "psubusw %%mm3, %%mm1 \n\t"
        "psubusw %%mm2, %%mm5 \n\t"

        "psubusw %%mm4, %%mm6 \n\t"
        "movq %%mm1, %%mm4 \n\t"

        // OR the thresholded odd coefficients together; a zero result
        // means the cheap shortcut below can be taken.
        "por %%mm5, %%mm4 \n\t"
        "paddusw %%mm7, %%mm0 \n\t"

        "por %%mm6, %%mm4 \n\t"
        "paddw %%mm7, %%mm0 \n\t"

        "packssdw %%mm4, %%mm4 \n\t"
        "psubusw %%mm7, %%mm0 \n\t"

        "movd %%mm4, %%"REG_a" \n\t"
        "or %%"REG_a", %%"REG_a" \n\t"
        "jnz 2f \n\t"
        //movq [edi+"DCTSIZE_S"*3*2], mm1
        //movq [edi+"DCTSIZE_S"*5*2], mm5
        //movq [edi+"DCTSIZE_S"*1*2], mm0
        //movq [edi+"DCTSIZE_S"*7*2], mm6
        // t4 t5 - - - t6 t7 -
        //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
        //Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile
        "movq 0*8+%3, %%mm4 \n\t"
        "movq %%mm0, %%mm1 \n\t"

        "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
        "movq %%mm1, %%mm2 \n\t"

        "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
        "movq %%mm2, %%mm3 \n\t"

        "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
        "paddw %%mm4, %%mm5 \n\t"

        "movq 1*8+%3, %%mm6 \n\t"
        //paddw mm3, MM_2
        "psraw $2, %%mm3 \n\t" //tmp7

        "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
        "psubw %%mm3, %%mm4 \n\t"

        "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
        "paddw %%mm3, %%mm5 \n\t"

        "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
        "paddw %%mm6, %%mm7 \n\t"

        "movq 2*8+%3, %%mm3 \n\t"
        "psubw %%mm0, %%mm6 \n\t"

        "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
        "paddw %%mm0, %%mm7 \n\t"

        "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
        "paddw %%mm3, %%mm4 \n\t"

        "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
        "psubw %%mm1, %%mm3 \n\t"

        "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
        "paddw %%mm1, %%mm4 \n\t"

        "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
        "paddw %%mm3, %%mm5 \n\t"

        "movq 3*8+%3, %%mm0 \n\t"
        "add $8, %%"REG_S" \n\t"

        "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
        "paddw %%mm0, %%mm6 \n\t"

        "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
        "psubw %%mm2, %%mm0 \n\t"

        "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
        "paddw %%mm2, %%mm6 \n\t"

        "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
        "paddw %%mm0, %%mm7 \n\t"

        "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"

        "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
        "add $8, %%"REG_D" \n\t"
        "jmp 4f \n\t"

        "2: \n\t"
        //--- non DC2
        //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1)
        //psraw mm5, 2
        //psraw mm0, 2
        //psraw mm6, 2
        "movq %%mm5, %%mm3 \n\t"
        "psubw %%mm1, %%mm5 \n\t"

        "psllw $1, %%mm5 \n\t" //'z10
        "paddw %%mm1, %%mm3 \n\t" //'z13

        "movq %%mm0, %%mm2 \n\t"
        "psubw %%mm6, %%mm0 \n\t"

        "movq %%mm5, %%mm1 \n\t"
        "psllw $1, %%mm0 \n\t" //'z12

        "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
        "paddw %%mm0, %%mm5 \n\t"

        "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
        "paddw %%mm6, %%mm2 \n\t" //'z11

        "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
        "movq %%mm2, %%mm7 \n\t"

        //---
        "movq 0*8+%3, %%mm4 \n\t"
        "psubw %%mm3, %%mm2 \n\t"

        "psllw $1, %%mm2 \n\t"
        "paddw %%mm3, %%mm7 \n\t" //'t7

        "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
        "movq %%mm4, %%mm6 \n\t"
        //paddw mm7, MM_2
        "psraw $2, %%mm7 \n\t"

        "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
        "psubw %%mm7, %%mm6 \n\t"

        "movq 1*8+%3, %%mm3 \n\t"
        "paddw %%mm7, %%mm4 \n\t"

        "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
        "paddw %%mm5, %%mm1 \n\t" //'t12

        "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
        "psubw %%mm7, %%mm1 \n\t" //'t6

        "movq 2*8+%3, %%mm7 \n\t"
        "psubw %%mm5, %%mm0 \n\t" //'t10

        "movq 3*8+%3, %%mm6 \n\t"
        "movq %%mm3, %%mm5 \n\t"

        "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
        "psubw %%mm1, %%mm5 \n\t"

        "psubw %%mm1, %%mm2 \n\t" //'t5
        "paddw %%mm1, %%mm3 \n\t"

        "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
        "movq %%mm7, %%mm4 \n\t"

        "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
        "psubw %%mm2, %%mm4 \n\t"

        "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
        "paddw %%mm2, %%mm7 \n\t"

        "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
        "paddw %%mm2, %%mm0 \n\t" //'t4

        // 't4 't6 't5 - - - - 't7
        "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
        "movq %%mm6, %%mm1 \n\t"

        "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
        "psubw %%mm0, %%mm1 \n\t"

        "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
        "paddw %%mm0, %%mm6 \n\t"

        "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
        "add $8, %%"REG_S" \n\t"

        "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"

        "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
        "add $8, %%"REG_D" \n\t"

        "4: \n\t"
        //=part 2 (the same)===========================================================
        "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
        //
        "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
        "movq %%mm1, %%mm0 \n\t"

        "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0
        "movq %%mm7, %%mm3 \n\t"

        "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
        "movq %%mm1, %%mm5 \n\t"

        "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
        "psubw %%mm7, %%mm1 \n\t" //t13

        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
        "movq %%mm6, %%mm4 \n\t"

        "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
        "paddw %%mm7, %%mm5 \n\t" //t10

        "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
        "movq %%mm6, %%mm7 \n\t"

        "paddw %%mm2, %%mm6 \n\t" //t11
        "psubw %%mm2, %%mm7 \n\t" //t12

        "movq %%mm5, %%mm2 \n\t"
        "paddw %%mm6, %%mm5 \n\t" //d0
        // i0 t13 t12 i3 i1 d0 - d4
        "psubw %%mm6, %%mm2 \n\t" //d4
        "paddw %%mm1, %%mm7 \n\t"

        "movq 1*8+4*16(%%"REG_d"), %%mm6 \n\t"
        "psllw $2, %%mm7 \n\t"

        "psubw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
        "psubw %%mm6, %%mm2 \n\t"

        "paddusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
        "paddusw %%mm6, %%mm2 \n\t"

        "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t"
        //
        "paddw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
        "paddw %%mm6, %%mm2 \n\t"

        "psubusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
        "psubusw %%mm6, %%mm2 \n\t"

        //This func is totally compute-bound, operates at huge speed. So, DC shortcut
        // at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
        //However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare.
        "paddw "MANGLE(MM_2)", %%mm5 \n\t"
        "movq %%mm2, %%mm6 \n\t"

        "paddw %%mm5, %%mm2 \n\t"
        "psubw %%mm6, %%mm5 \n\t"

        "movq %%mm1, %%mm6 \n\t"
        "paddw %%mm7, %%mm1 \n\t" //d2

        "psubw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
        "psubw %%mm7, %%mm6 \n\t" //d6

        "movq 1*8+6*16(%%"REG_d"), %%mm7 \n\t"
        "psraw $2, %%mm5 \n\t"

        "paddusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
        "psubw %%mm7, %%mm6 \n\t"
        // t7 d2 /t11 t4 t6 - d6 /t10

        "paddw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
        "paddusw %%mm7, %%mm6 \n\t"

        "psubusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
        "paddw %%mm7, %%mm6 \n\t"

        "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
        "psubusw %%mm7, %%mm6 \n\t"

        //movq [edi+"DCTSIZE_S"*2*2], mm1
        //movq [edi+"DCTSIZE_S"*6*2], mm6
        "movq %%mm1, %%mm7 \n\t"
        "psraw $2, %%mm2 \n\t"

        "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
        "psubw %%mm6, %%mm1 \n\t"

        "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
        "paddw %%mm7, %%mm6 \n\t" //'t13

        "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! ---
        "movq %%mm2, %%mm7 \n\t"

        "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
        "paddw %%mm6, %%mm2 \n\t" //'t0

        "movq %%mm2, 0*8+%3 \n\t" //!
        "psubw %%mm6, %%mm7 \n\t" //'t3

        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
        "psubw %%mm6, %%mm1 \n\t" //'t12

        "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
        "movq %%mm5, %%mm6 \n\t"

        "movq %%mm7, 3*8+%3 \n\t"
        "paddw %%mm2, %%mm3 \n\t" //t10

        "paddw %%mm4, %%mm2 \n\t" //t11
        "paddw %%mm0, %%mm4 \n\t" //t12

        "movq %%mm3, %%mm7 \n\t"
        "psubw %%mm4, %%mm3 \n\t"

        "psllw $2, %%mm3 \n\t"
        "psllw $2, %%mm7 \n\t" //opt for P6

        "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
        "psllw $2, %%mm4 \n\t"

        "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm7 \n\t"
        "psllw $2, %%mm2 \n\t"

        "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
        "paddw %%mm1, %%mm5 \n\t" //'t1

        "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm2 \n\t"
        "psubw %%mm1, %%mm6 \n\t" //'t2
        // t7 't12 't11 t4 t6 - 't13 't10   ---

        "paddw %%mm3, %%mm7 \n\t" //z2

        "movq %%mm5, 1*8+%3 \n\t"
        "paddw %%mm3, %%mm4 \n\t" //z4

        "movq 1*8+3*16(%%"REG_d"), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"

        "movq %%mm6, 2*8+%3 \n\t"
        "psubw %%mm2, %%mm1 \n\t" //z13

        //===
        "paddw %%mm2, %%mm0 \n\t" //z11
        "movq %%mm1, %%mm5 \n\t"

        "movq 1*8+5*16(%%"REG_d"), %%mm2 \n\t"
        "psubw %%mm7, %%mm1 \n\t" //d3

        "paddw %%mm7, %%mm5 \n\t" //d5
        "psubw %%mm3, %%mm1 \n\t"

        "movq 1*8+1*16(%%"REG_d"), %%mm7 \n\t"
        "psubw %%mm2, %%mm5 \n\t"

        "movq %%mm0, %%mm6 \n\t"
        "paddw %%mm4, %%mm0 \n\t" //d1

        "paddusw %%mm3, %%mm1 \n\t"
        "psubw %%mm4, %%mm6 \n\t" //d7
        // d1 d3 - - - d5 d7 -

        "movq 1*8+7*16(%%"REG_d"), %%mm4 \n\t"
        "psubw %%mm7, %%mm0 \n\t"

        "psubw %%mm4, %%mm6 \n\t"
        "paddusw %%mm2, %%mm5 \n\t"

        "paddusw %%mm4, %%mm6 \n\t"
        "paddw %%mm3, %%mm1 \n\t"

        "paddw %%mm2, %%mm5 \n\t"
        "paddw %%mm4, %%mm6 \n\t"

        "psubusw %%mm3, %%mm1 \n\t"
        "psubusw %%mm2, %%mm5 \n\t"

        "psubusw %%mm4, %%mm6 \n\t"
        "movq %%mm1, %%mm4 \n\t"

        "por %%mm5, %%mm4 \n\t"
        "paddusw %%mm7, %%mm0 \n\t"

        "por %%mm6, %%mm4 \n\t"
        "paddw %%mm7, %%mm0 \n\t"

        "packssdw %%mm4, %%mm4 \n\t"
        "psubusw %%mm7, %%mm0 \n\t"

        "movd %%mm4, %%"REG_a" \n\t"
        "or %%"REG_a", %%"REG_a" \n\t"
        "jnz 3f \n\t"
        //movq [edi+"DCTSIZE_S"*3*2], mm1
        //movq [edi+"DCTSIZE_S"*5*2], mm5
        //movq [edi+"DCTSIZE_S"*1*2], mm0
        //movq [edi+"DCTSIZE_S"*7*2], mm6
        // t4 t5 - - - t6 t7 -
        //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
        //Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile
        "movq 0*8+%3, %%mm4 \n\t"
        "movq %%mm0, %%mm1 \n\t"

        "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
        "movq %%mm1, %%mm2 \n\t"

        "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
        "movq %%mm2, %%mm3 \n\t"

        "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
        "paddw %%mm4, %%mm5 \n\t"

        "movq 1*8+%3, %%mm6 \n\t"
        //paddw mm3, MM_2
        "psraw $2, %%mm3 \n\t" //tmp7

        "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
        "psubw %%mm3, %%mm4 \n\t"

        "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
        "paddw %%mm3, %%mm5 \n\t"

        "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
        "paddw %%mm6, %%mm7 \n\t"

        "movq 2*8+%3, %%mm3 \n\t"
        "psubw %%mm0, %%mm6 \n\t"

        "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
        "paddw %%mm0, %%mm7 \n\t"

        "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
        "paddw %%mm3, %%mm4 \n\t"

        "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
        "psubw %%mm1, %%mm3 \n\t"

        "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
        "paddw %%mm1, %%mm4 \n\t"

        "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
        "paddw %%mm3, %%mm5 \n\t"

        "movq 3*8+%3, %%mm0 \n\t"
        "add $24, %%"REG_S" \n\t"

        "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
        "paddw %%mm0, %%mm6 \n\t"

        "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
        "psubw %%mm2, %%mm0 \n\t"

        "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
        "paddw %%mm2, %%mm6 \n\t"

        "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
        "paddw %%mm0, %%mm7 \n\t"

        "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"

        "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
        "add $24, %%"REG_D" \n\t"
        "sub $2, %%"REG_c" \n\t"
        "jnz 1b \n\t"
        "jmp 5f \n\t"

        "3: \n\t"
        //--- non DC2
        //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1)
        //psraw mm5, 2
        //psraw mm0, 2
        //psraw mm6, 2
        "movq %%mm5, %%mm3 \n\t"
        "psubw %%mm1, %%mm5 \n\t"

        "psllw $1, %%mm5 \n\t" //'z10
        "paddw %%mm1, %%mm3 \n\t" //'z13

        "movq %%mm0, %%mm2 \n\t"
        "psubw %%mm6, %%mm0 \n\t"

        "movq %%mm5, %%mm1 \n\t"
        "psllw $1, %%mm0 \n\t" //'z12

        "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
        "paddw %%mm0, %%mm5 \n\t"

        "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
        "paddw %%mm6, %%mm2 \n\t" //'z11

        "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
        "movq %%mm2, %%mm7 \n\t"

        //---
        "movq 0*8+%3, %%mm4 \n\t"
        "psubw %%mm3, %%mm2 \n\t"

        "psllw $1, %%mm2 \n\t"
        "paddw %%mm3, %%mm7 \n\t" //'t7

        "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
        "movq %%mm4, %%mm6 \n\t"
        //paddw mm7, MM_2
        "psraw $2, %%mm7 \n\t"

        "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
        "psubw %%mm7, %%mm6 \n\t"

        "movq 1*8+%3, %%mm3 \n\t"
        "paddw %%mm7, %%mm4 \n\t"

        "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
        "paddw %%mm5, %%mm1 \n\t" //'t12

        "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
        "psubw %%mm7, %%mm1 \n\t" //'t6

        "movq 2*8+%3, %%mm7 \n\t"
        "psubw %%mm5, %%mm0 \n\t" //'t10

        "movq 3*8+%3, %%mm6 \n\t"
        "movq %%mm3, %%mm5 \n\t"

        "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
        "psubw %%mm1, %%mm5 \n\t"

        "psubw %%mm1, %%mm2 \n\t" //'t5
        "paddw %%mm1, %%mm3 \n\t"

        "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
        "movq %%mm7, %%mm4 \n\t"

        "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
        "psubw %%mm2, %%mm4 \n\t"

        "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
        "paddw %%mm2, %%mm7 \n\t"

        "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
        "paddw %%mm2, %%mm0 \n\t" //'t4

        // 't4 't6 't5 - - - - 't7
        "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
        "movq %%mm6, %%mm1 \n\t"

        "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
        "psubw %%mm0, %%mm1 \n\t"

        "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
        "paddw %%mm0, %%mm6 \n\t"

        "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
        "add $24, %%"REG_S" \n\t"

        "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"

        "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
        "add $24, %%"REG_D" \n\t"
        "sub $2, %%"REG_c" \n\t"
        "jnz 1b \n\t"
        "5: \n\t"

        : "+S"(data), "+D"(output), "+c"(cnt), "=o"(temps)
        : "d"(thr_adr)
        : "%"REG_a
        );
}
#endif // HAVE_MMX
#if !HAVE_MMX
// Scalar row IDCT (AAN): inverse-transform the rows of 'workspace' and
// accumulate the descaled results into the picture at 'output_adr',
// one value per output row, 'output_stride' elements apart.
//
// workspace    - DCT coefficient rows (DCTSIZE values per row)
// output_adr   - destination, accumulated into with += after DESCALE by 3
// output_stride- distance in int16_t elements between output rows
// cnt          - block count; multiplied by 4 here to get the column count
//                (the SIMD version works on 4x4 sub-blocks, see comments)
static void row_idct_c(DCTELEM* workspace,
                       int16_t* output_adr, int output_stride, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z5, z10, z11, z12, z13;
    int16_t* outptr;
    DCTELEM* wsptr;

    cnt*=4;
    wsptr = workspace;
    outptr = output_adr;
    for (; cnt > 0; cnt--) {
        // Even part
        //Simd version reads 4x4 block and transposes it
        tmp10 = ( wsptr[2] +  wsptr[3]);
        tmp11 = ( wsptr[2] -  wsptr[3]);

        tmp13 = ( wsptr[0] +  wsptr[1]);
        tmp12 = (MULTIPLY16H( wsptr[0] - wsptr[1], FIX_1_414213562_A)<<2) - tmp13;//this shift order to avoid overflow

        tmp0 = tmp10 + tmp13; //->temps
        tmp3 = tmp10 - tmp13; //->temps
        tmp1 = tmp11 + tmp12;
        tmp2 = tmp11 - tmp12;

        // Odd part
        //Also transpose, with previous:
        // ---- ----      ||||
        // ---- ---- idct ||||
        // ---- ---- ---> ||||
        // ---- ----      ||||
        z13 = wsptr[4] + wsptr[5];
        z10 = wsptr[4] - wsptr[5];
        z11 = wsptr[6] + wsptr[7];
        z12 = wsptr[6] - wsptr[7];

        tmp7 = z11 + z13;
        tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);

        z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
        tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
        tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_

        tmp6 = (tmp12<<3) - tmp7;
        tmp5 = (tmp11<<3) - tmp6;
        tmp4 = (tmp10<<3) + tmp5;

        // Final output stage: descale and write column
        outptr[0*output_stride]+= DESCALE(tmp0 + tmp7, 3);
        outptr[1*output_stride]+= DESCALE(tmp1 + tmp6, 3);
        outptr[2*output_stride]+= DESCALE(tmp2 + tmp5, 3);
        outptr[3*output_stride]+= DESCALE(tmp3 - tmp4, 3);
        outptr[4*output_stride]+= DESCALE(tmp3 + tmp4, 3);
        outptr[5*output_stride]+= DESCALE(tmp2 - tmp5, 3);
        outptr[6*output_stride]+= DESCALE(tmp1 - tmp6, 3); //no += ?
        outptr[7*output_stride]+= DESCALE(tmp0 - tmp7, 3); //no += ?
        outptr++;

        wsptr += DCTSIZE;       // advance pointer to next row
    }
}
#else /* HAVE_MMX */
// MMX version of row_idct_c.  Works on two 4x4 coefficient sub-blocks
// at a time, transposing them with punpck* while performing the AAN
// inverse transform, then descales (via MM_DESCALE_RND and psraw $3)
// and accumulates into the output picture.
//
// Register / operand roles inside the asm:
//   REG_S (%0) = workspace   - coefficient rows, advanced 4 rows per loop
//   REG_D (%1) = output_adr  - destination, stride-addressed
//   REG_c (%2) = cnt         - loop counter (one iteration per 8 output
//                              columns); decremented with dec/jnz
//   REG_a      = output_stride in bytes (input "a": stride*sizeof(short))
//   REG_d      = 3*stride, precomputed by the lea for row addressing
//   %3 (temps) = two qword spill slots for the even-part t0/t3 values
static void row_idct_mmx (DCTELEM* workspace,
                          int16_t* output_adr,  int output_stride,  int cnt)
{
    uint64_t __attribute__((aligned(8))) temps[4];
    __asm__ volatile(
        "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t"
        "1: \n\t"
        "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm0 \n\t"
        //
        "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm1 \n\t"
        "movq %%mm0, %%mm4 \n\t"

        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
        "punpcklwd %%mm1, %%mm0 \n\t"

        "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm3 \n\t"
        "punpckhwd %%mm1, %%mm4 \n\t"

        //transpose 4x4
        "movq %%mm2, %%mm7 \n\t"
        "punpcklwd %%mm3, %%mm2 \n\t"

        "movq %%mm0, %%mm6 \n\t"
        "punpckldq %%mm2, %%mm0 \n\t" //0

        "punpckhdq %%mm2, %%mm6 \n\t" //1
        "movq %%mm0, %%mm5 \n\t"

        "punpckhwd %%mm3, %%mm7 \n\t"
        "psubw %%mm6, %%mm0 \n\t"

        "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm0 \n\t"
        "movq %%mm4, %%mm2 \n\t"

        "punpckldq %%mm7, %%mm4 \n\t" //2
        "paddw %%mm6, %%mm5 \n\t"

        "punpckhdq %%mm7, %%mm2 \n\t" //3
        "movq %%mm4, %%mm1 \n\t"

        "psllw $2, %%mm0 \n\t"
        "paddw %%mm2, %%mm4 \n\t" //t10

        "movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_S"), %%mm3 \n\t"
        "psubw %%mm2, %%mm1 \n\t" //t11

        "movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_S"), %%mm2 \n\t"
        "psubw %%mm5, %%mm0 \n\t"

        "movq %%mm4, %%mm6 \n\t"
        "paddw %%mm5, %%mm4 \n\t" //t0

        "psubw %%mm5, %%mm6 \n\t" //t3
        "movq %%mm1, %%mm7 \n\t"

        "movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_S"), %%mm5 \n\t"
        "paddw %%mm0, %%mm1 \n\t" //t1

        "movq %%mm4, 0*8+%3 \n\t" //t0
        "movq %%mm3, %%mm4 \n\t"

        "movq %%mm6, 1*8+%3 \n\t" //t3
        "punpcklwd %%mm2, %%mm3 \n\t"

        //transpose 4x4
        "movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_S"), %%mm6 \n\t"
        "punpckhwd %%mm2, %%mm4 \n\t"

        "movq %%mm5, %%mm2 \n\t"
        "punpcklwd %%mm6, %%mm5 \n\t"

        "psubw %%mm0, %%mm7 \n\t" //t2
        "punpckhwd %%mm6, %%mm2 \n\t"

        "movq %%mm3, %%mm0 \n\t"
        "punpckldq %%mm5, %%mm3 \n\t" //4

        "punpckhdq %%mm5, %%mm0 \n\t" //5
        "movq %%mm4, %%mm5 \n\t"

        //
        "movq %%mm3, %%mm6 \n\t"
        "punpckldq %%mm2, %%mm4 \n\t" //6

        "psubw %%mm0, %%mm3 \n\t" //z10
        "punpckhdq %%mm2, %%mm5 \n\t" //7

        "paddw %%mm0, %%mm6 \n\t" //z13
        "movq %%mm4, %%mm2 \n\t"

        "movq %%mm3, %%mm0 \n\t"
        "psubw %%mm5, %%mm4 \n\t" //z12

        "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm0 \n\t" //-
        "paddw %%mm4, %%mm3 \n\t"

        "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm3 \n\t" //z5
        "paddw %%mm5, %%mm2 \n\t" //z11  >

        "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm4 \n\t"
        "movq %%mm2, %%mm5 \n\t"

        "psubw %%mm6, %%mm2 \n\t"
        "paddw %%mm6, %%mm5 \n\t" //t7

        "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //t11
        "paddw %%mm3, %%mm0 \n\t" //t12

        "psllw $3, %%mm0 \n\t"
        "psubw %%mm3, %%mm4 \n\t" //t10

        "movq 0*8+%3, %%mm6 \n\t"
        "movq %%mm1, %%mm3 \n\t"

        "psllw $3, %%mm4 \n\t"
        "psubw %%mm5, %%mm0 \n\t" //t6

        "psllw $3, %%mm2 \n\t"
        "paddw %%mm0, %%mm1 \n\t" //d1

        "psubw %%mm0, %%mm2 \n\t" //t5
        "psubw %%mm0, %%mm3 \n\t" //d6

        "paddw %%mm2, %%mm4 \n\t" //t4
        "movq %%mm7, %%mm0 \n\t"

        "paddw %%mm2, %%mm7 \n\t" //d2
        "psubw %%mm2, %%mm0 \n\t" //d5

        "movq "MANGLE(MM_DESCALE_RND)", %%mm2 \n\t" //4
        "psubw %%mm5, %%mm6 \n\t" //d7

        "paddw 0*8+%3, %%mm5 \n\t" //d0
        "paddw %%mm2, %%mm1 \n\t"

        "paddw %%mm2, %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"

        "paddw %%mm2, %%mm7 \n\t"
        "psraw $3, %%mm5 \n\t"

        "paddw (%%"REG_D"), %%mm5 \n\t"
        "psraw $3, %%mm7 \n\t"

        "paddw (%%"REG_D",%%"REG_a",), %%mm1 \n\t"
        "paddw %%mm2, %%mm0 \n\t"

        "paddw (%%"REG_D",%%"REG_a",2), %%mm7 \n\t"
        "paddw %%mm2, %%mm3 \n\t"

        "movq %%mm5, (%%"REG_D") \n\t"
        "paddw %%mm2, %%mm6 \n\t"

        "movq %%mm1, (%%"REG_D",%%"REG_a",) \n\t"
        "psraw $3, %%mm0 \n\t"

        "movq %%mm7, (%%"REG_D",%%"REG_a",2) \n\t"
        "add %%"REG_d", %%"REG_D" \n\t" //3*ls

        "movq 1*8+%3, %%mm5 \n\t" //t3
        "psraw $3, %%mm3 \n\t"

        "paddw (%%"REG_D",%%"REG_a",2), %%mm0 \n\t"
        "psubw %%mm4, %%mm5 \n\t" //d3

        "paddw (%%"REG_D",%%"REG_d",), %%mm3 \n\t"
        "psraw $3, %%mm6 \n\t"

        "paddw 1*8+%3, %%mm4 \n\t" //d4
        "paddw %%mm2, %%mm5 \n\t"

        "paddw (%%"REG_D",%%"REG_a",4), %%mm6 \n\t"
        "paddw %%mm2, %%mm4 \n\t"

        "movq %%mm0, (%%"REG_D",%%"REG_a",2) \n\t"
        "psraw $3, %%mm5 \n\t"

        "paddw (%%"REG_D"), %%mm5 \n\t"
        "psraw $3, %%mm4 \n\t"

        "paddw (%%"REG_D",%%"REG_a",), %%mm4 \n\t"
        "add $"DCTSIZE_S"*2*4, %%"REG_S" \n\t" //4 rows

        "movq %%mm3, (%%"REG_D",%%"REG_d",) \n\t"
        "movq %%mm6, (%%"REG_D",%%"REG_a",4) \n\t"
        "movq %%mm5, (%%"REG_D") \n\t"
        "movq %%mm4, (%%"REG_D",%%"REG_a",) \n\t"

        "sub %%"REG_d", %%"REG_D" \n\t"
        "add $8, %%"REG_D" \n\t"
        "dec %%"REG_c" \n\t"
        "jnz 1b \n\t"

        : "+S"(workspace), "+D"(output_adr), "+c"(cnt), "=o"(temps)
        : "a"(output_stride*sizeof(short))
        : "%"REG_d
        );
}
#endif // HAVE_MMX
  1453. #if !HAVE_MMX
  1454. static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt)
  1455. {
  1456. int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  1457. int_simd16_t tmp10, tmp11, tmp12, tmp13;
  1458. int_simd16_t z1, z2, z3, z4, z5, z11, z13;
  1459. DCTELEM *dataptr;
  1460. cnt*=4;
  1461. // Pass 1: process rows.
  1462. dataptr = data;
  1463. for (; cnt > 0; cnt--) {
  1464. tmp0 = pixels[line_size*0] + pixels[line_size*7];
  1465. tmp7 = pixels[line_size*0] - pixels[line_size*7];
  1466. tmp1 = pixels[line_size*1] + pixels[line_size*6];
  1467. tmp6 = pixels[line_size*1] - pixels[line_size*6];
  1468. tmp2 = pixels[line_size*2] + pixels[line_size*5];
  1469. tmp5 = pixels[line_size*2] - pixels[line_size*5];
  1470. tmp3 = pixels[line_size*3] + pixels[line_size*4];
  1471. tmp4 = pixels[line_size*3] - pixels[line_size*4];
  1472. // Even part
  1473. tmp10 = tmp0 + tmp3;
  1474. tmp13 = tmp0 - tmp3;
  1475. tmp11 = tmp1 + tmp2;
  1476. tmp12 = tmp1 - tmp2;
  1477. //Even columns are written first, this leads to different order of columns
  1478. //in column_fidct(), but they are processed independently, so all ok.
  1479. //Later in the row_idct() columns readed at the same order.
  1480. dataptr[2] = tmp10 + tmp11;
  1481. dataptr[3] = tmp10 - tmp11;
  1482. z1 = MULTIPLY16H((tmp12 + tmp13)<<2, FIX_0_707106781);
  1483. dataptr[0] = tmp13 + z1;
  1484. dataptr[1] = tmp13 - z1;
  1485. // Odd part
  1486. tmp10 = (tmp4 + tmp5) <<2;
  1487. tmp11 = (tmp5 + tmp6) <<2;
  1488. tmp12 = (tmp6 + tmp7) <<2;
  1489. z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
  1490. z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
  1491. z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
  1492. z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
  1493. z11 = tmp7 + z3;
  1494. z13 = tmp7 - z3;
  1495. dataptr[4] = z13 + z2;
  1496. dataptr[5] = z13 - z2;
  1497. dataptr[6] = z11 + z4;
  1498. dataptr[7] = z11 - z4;
  1499. pixels++; // advance pointer to next column
  1500. dataptr += DCTSIZE;
  1501. }
  1502. }
  1503. #else /* HAVE_MMX */
/* MMX implementation of the forward-DCT row pass.  Mirrors row_fdct_c():
 * each loop iteration loads a 4-column strip of 8-bit pixels, runs the
 * scaled AAN butterfly in 16-bit packed arithmetic, transposes the two
 * 4x4 result halves, and stores 8 coefficient rows into the workspace.
 *
 * Register assignment (from the asm constraints below):
 *   REG_S (esi/rsi) = pixels   (advanced by 4 per iteration -> 4 columns)
 *   REG_D (edi/rdi) = data     (advanced by DCTSIZE_S*2*4 bytes -> 4 rows)
 *   REG_c (ecx/rcx) = cnt      (loop counter for label "6:")
 *   REG_a (eax/rax) = line_size (source stride; REG_d holds 3*line_size)
 *   %3              = temps[]  (4 aligned qwords of spill space for the
 *                               t7/t6 values that do not fit in mm0-mm7)
 * NOTE(review): the //tN, //dN, //zN tags follow the AAN temporary naming
 * used by the C fallback; the 0..7 tags mark transposed output rows.
 */
static void row_fdct_mmx(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt)
{
uint64_t __attribute__((aligned(8))) temps[4];
__asm__ volatile(
/* REG_d = 3*line_size, used to step between row groups */
"lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t"
"6: \n\t"
/* load rows 0..2 and zero-extend bytes to words (mm7 = 0) */
"movd (%%"REG_S"), %%mm0 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"movd (%%"REG_S",%%"REG_a",), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"movd (%%"REG_S",%%"REG_a",2), %%mm2 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"add %%"REG_d", %%"REG_S" \n\t"
"movq %%mm0, %%mm5 \n\t"
//
/* load rows 3..7 and form the mirrored sums/differences */
"movd (%%"REG_S",%%"REG_a",4), %%mm3 \n\t" //7 ;prefetch!
"movq %%mm1, %%mm6 \n\t"
"movd (%%"REG_S",%%"REG_d",), %%mm4 \n\t" //6
"punpcklbw %%mm7, %%mm3 \n\t"
"psubw %%mm3, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"paddw %%mm3, %%mm0 \n\t"
"psubw %%mm4, %%mm6 \n\t"
"movd (%%"REG_S",%%"REG_a",2), %%mm3 \n\t" //5
"paddw %%mm4, %%mm1 \n\t"
/* spill t7/t6 to the aligned scratch area (%3 = temps) */
"movq %%mm5, 0*8+%3 \n\t" //t7
"punpcklbw %%mm7, %%mm3 \n\t"
"movq %%mm6, 1*8+%3 \n\t" //t6
"movq %%mm2, %%mm4 \n\t"
"movd (%%"REG_S"), %%mm5 \n\t" //3
"paddw %%mm3, %%mm2 \n\t"
"movd (%%"REG_S",%%"REG_a",), %%mm6 \n\t" //4
"punpcklbw %%mm7, %%mm5 \n\t"
"psubw %%mm3, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm6 \n\t"
"movq %%mm5, %%mm3 \n\t"
"paddw %%mm6, %%mm5 \n\t" //t3
"psubw %%mm6, %%mm3 \n\t" //t4 ; t0 t1 t2 t4 t5 t3 - -
/* even part of the AAN butterfly */
"movq %%mm0, %%mm6 \n\t"
"movq %%mm1, %%mm7 \n\t"
"psubw %%mm5, %%mm0 \n\t" //t13
"psubw %%mm2, %%mm1 \n\t"
"paddw %%mm2, %%mm7 \n\t" //t11
"paddw %%mm0, %%mm1 \n\t"
"movq %%mm7, %%mm2 \n\t"
"psllw $2, %%mm1 \n\t"
"paddw %%mm5, %%mm6 \n\t" //t10
"pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm1 \n\t"
"paddw %%mm6, %%mm7 \n\t" //d2
"psubw %%mm2, %%mm6 \n\t" //d3
"movq %%mm0, %%mm5 \n\t"
//transpose 4x4
"movq %%mm7, %%mm2 \n\t"
"punpcklwd %%mm6, %%mm7 \n\t"
"paddw %%mm1, %%mm0 \n\t" //d0
"punpckhwd %%mm6, %%mm2 \n\t"
"psubw %%mm1, %%mm5 \n\t" //d1
"movq %%mm0, %%mm6 \n\t"
"movq 1*8+%3, %%mm1 \n\t"
"punpcklwd %%mm5, %%mm0 \n\t"
"punpckhwd %%mm5, %%mm6 \n\t"
"movq %%mm0, %%mm5 \n\t"
"punpckldq %%mm7, %%mm0 \n\t" //0
"paddw %%mm4, %%mm3 \n\t"
"punpckhdq %%mm7, %%mm5 \n\t" //1
"movq %%mm6, %%mm7 \n\t"
/* store the even half (workspace lines 0..3) */
"movq %%mm0, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
"punpckldq %%mm2, %%mm6 \n\t" //2
"movq %%mm5, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
"punpckhdq %%mm2, %%mm7 \n\t" //3
"movq %%mm6, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
"paddw %%mm1, %%mm4 \n\t"
"movq %%mm7, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
/* odd part: rotations by the fixed-point AAN constants */
"psllw $2, %%mm3 \n\t" //t10
"movq 0*8+%3, %%mm2 \n\t"
"psllw $2, %%mm4 \n\t" //t11
"pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm4 \n\t" //z3
"paddw %%mm2, %%mm1 \n\t"
"psllw $2, %%mm1 \n\t" //t12
"movq %%mm3, %%mm0 \n\t"
"pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm0 \n\t"
"psubw %%mm1, %%mm3 \n\t"
"pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" //z5
"movq %%mm2, %%mm5 \n\t"
"pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm1 \n\t"
"psubw %%mm4, %%mm2 \n\t" //z13
"paddw %%mm4, %%mm5 \n\t" //z11
"movq %%mm2, %%mm6 \n\t"
"paddw %%mm3, %%mm0 \n\t" //z2
"movq %%mm5, %%mm7 \n\t"
"paddw %%mm0, %%mm2 \n\t" //d4
"psubw %%mm0, %%mm6 \n\t" //d5
"movq %%mm2, %%mm4 \n\t"
"paddw %%mm3, %%mm1 \n\t" //z4
//transpose 4x4
"punpcklwd %%mm6, %%mm2 \n\t"
"paddw %%mm1, %%mm5 \n\t" //d6
"punpckhwd %%mm6, %%mm4 \n\t"
"psubw %%mm1, %%mm7 \n\t" //d7
"movq %%mm5, %%mm6 \n\t"
"punpcklwd %%mm7, %%mm5 \n\t"
"punpckhwd %%mm7, %%mm6 \n\t"
"movq %%mm2, %%mm7 \n\t"
"punpckldq %%mm5, %%mm2 \n\t" //4
/* rewind the 3*line_size advance, then step 4 pixel columns right */
"sub %%"REG_d", %%"REG_S" \n\t"
"punpckhdq %%mm5, %%mm7 \n\t" //5
"movq %%mm4, %%mm5 \n\t"
/* store the odd half (second DCTSIZE_S-word half of lines 0..3) */
"movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_D") \n\t"
"punpckldq %%mm6, %%mm4 \n\t" //6
"movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_D") \n\t"
"punpckhdq %%mm6, %%mm5 \n\t" //7
"movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_D") \n\t"
"add $4, %%"REG_S" \n\t"
"movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_D") \n\t"
"add $"DCTSIZE_S"*2*4, %%"REG_D" \n\t" //4 rows
"dec %%"REG_c" \n\t"
"jnz 6b \n\t"
: "+S"(pixels), "+D"(data), "+c"(cnt), "=o"(temps)
: "a"(line_size)
: "%"REG_d);
}
  1626. #endif // HAVE_MMX