/* simple_idct_mmx.c */
  1. /*
  2. * Simple IDCT MMX
  3. *
  4. * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
  5. *
  6. * This file is part of FFmpeg.
  7. *
  8. * FFmpeg is free software; you can redistribute it and/or
  9. * modify it under the terms of the GNU Lesser General Public
  10. * License as published by the Free Software Foundation; either
  11. * version 2.1 of the License, or (at your option) any later version.
  12. *
  13. * FFmpeg is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. * Lesser General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with FFmpeg; if not, write to the Free Software
  20. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. */
  22. #include "libavcodec/dsputil.h"
  23. #include "libavcodec/simple_idct.h"
  24. /*
  25. 23170.475006
  26. 22725.260826
  27. 21406.727617
  28. 19265.545870
  29. 16384.000000
  30. 12872.826198
  31. 8866.956905
  32. 4520.335430
  33. */
  34. #define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  35. #define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  36. #define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  37. #define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  38. #if 0
  39. #define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  40. #else
  41. #define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
  42. #endif
  43. #define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  44. #define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  45. #define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  46. #define ROW_SHIFT 11
  47. #define COL_SHIFT 20 // 6
/* 64-bit constants referenced from the inline asm below via MANGLE(). */
DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; // word mask: keeps words 1 and 3 of a 4x16-bit vector (used in the DC-only test)
DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; // rounding constant for the DC-only shortcut path
/* Packed 16-bit coefficient table for the IDCT asm.  It is passed as asm
 * operand %2 and read with movq at fixed byte offsets (0, 8, 16, ..., 104),
 * so the order and size of these rows must not change.  Each 8-byte row
 * stores a coefficient pair twice so a single pmaddwd can process two
 * columns at once. */
DECLARE_ALIGNED(8, static const int16_t, coeffs[])= {
1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,            // offset   0: row rounder
// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,            // offset   8: row rounder with column-rounding bias folded in
// the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
// 0, 0, 0, 0,
// 0, 0, 0, 0,
C4, C4, C4, C4,                                       // offset  16
C4, -C4, C4, -C4,                                     // offset  24
C2, C6, C2, C6,                                       // offset  32
C6, -C2, C6, -C2,                                     // offset  40
C1, C3, C1, C3,                                       // offset  48
C5, C7, C5, C7,                                       // offset  56
C3, -C7, C3, -C7,                                     // offset  64
-C1, -C5, -C1, -C5,                                   // offset  72
C5, -C1, C5, -C1,                                     // offset  80
C7, C3, C7, C3,                                       // offset  88
C7, -C5, C7, -C5,                                     // offset  96
C3, -C1, C3, -C1                                      // offset 104
};
  71. #if 0
/* Dead code (this whole region is under #if 0): references the asm
 * constants so the compiler would not warn about them being unused.
 * NOTE(review): 'temp' is not declared anywhere in this function or at
 * file scope — this would fail to compile if the region were enabled. */
static void unused_var_killer(void)
{
int a= wm1010 + d40000;
temp[0]=a;
}
  77. static void inline idctCol (int16_t * col, int16_t *input)
  78. {
  79. #undef C0
  80. #undef C1
  81. #undef C2
  82. #undef C3
  83. #undef C4
  84. #undef C5
  85. #undef C6
  86. #undef C7
  87. int a0, a1, a2, a3, b0, b1, b2, b3;
  88. const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  89. const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  90. const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  91. const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  92. const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  93. const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  94. const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  95. const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  96. /*
  97. if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
  98. col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
  99. col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
  100. return;
  101. }*/
  102. col[8*0] = input[8*0 + 0];
  103. col[8*1] = input[8*2 + 0];
  104. col[8*2] = input[8*0 + 1];
  105. col[8*3] = input[8*2 + 1];
  106. col[8*4] = input[8*4 + 0];
  107. col[8*5] = input[8*6 + 0];
  108. col[8*6] = input[8*4 + 1];
  109. col[8*7] = input[8*6 + 1];
  110. a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
  111. a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
  112. a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
  113. a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
  114. b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
  115. b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
  116. b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
  117. b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
  118. col[8*0] = (a0 + b0) >> COL_SHIFT;
  119. col[8*1] = (a1 + b1) >> COL_SHIFT;
  120. col[8*2] = (a2 + b2) >> COL_SHIFT;
  121. col[8*3] = (a3 + b3) >> COL_SHIFT;
  122. col[8*4] = (a3 - b3) >> COL_SHIFT;
  123. col[8*5] = (a2 - b2) >> COL_SHIFT;
  124. col[8*6] = (a1 - b1) >> COL_SHIFT;
  125. col[8*7] = (a0 - b0) >> COL_SHIFT;
  126. }
  127. static void inline idctRow (int16_t * output, int16_t * input)
  128. {
  129. int16_t row[8];
  130. int a0, a1, a2, a3, b0, b1, b2, b3;
  131. const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  132. const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  133. const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  134. const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  135. const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  136. const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  137. const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  138. const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  139. row[0] = input[0];
  140. row[2] = input[1];
  141. row[4] = input[4];
  142. row[6] = input[5];
  143. row[1] = input[8];
  144. row[3] = input[9];
  145. row[5] = input[12];
  146. row[7] = input[13];
  147. if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
  148. row[0] = row[1] = row[2] = row[3] = row[4] =
  149. row[5] = row[6] = row[7] = row[0]<<3;
  150. output[0] = row[0];
  151. output[2] = row[1];
  152. output[4] = row[2];
  153. output[6] = row[3];
  154. output[8] = row[4];
  155. output[10] = row[5];
  156. output[12] = row[6];
  157. output[14] = row[7];
  158. return;
  159. }
  160. a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
  161. a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
  162. a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
  163. a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
  164. b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
  165. b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
  166. b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
  167. b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
  168. row[0] = (a0 + b0) >> ROW_SHIFT;
  169. row[1] = (a1 + b1) >> ROW_SHIFT;
  170. row[2] = (a2 + b2) >> ROW_SHIFT;
  171. row[3] = (a3 + b3) >> ROW_SHIFT;
  172. row[4] = (a3 - b3) >> ROW_SHIFT;
  173. row[5] = (a2 - b2) >> ROW_SHIFT;
  174. row[6] = (a1 - b1) >> ROW_SHIFT;
  175. row[7] = (a0 - b0) >> ROW_SHIFT;
  176. output[0] = row[0];
  177. output[2] = row[1];
  178. output[4] = row[2];
  179. output[6] = row[3];
  180. output[8] = row[4];
  181. output[10] = row[5];
  182. output[12] = row[6];
  183. output[14] = row[7];
  184. }
  185. #endif
  186. static inline void idct(int16_t *block)
  187. {
  188. DECLARE_ALIGNED(8, int64_t, align_tmp[16]);
  189. int16_t * const temp= (int16_t*)align_tmp;
  190. __asm__ volatile(
  191. #if 0 //Alternative, simpler variant
  192. #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  193. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  194. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  195. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  196. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  197. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  198. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  199. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  200. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  201. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  202. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  203. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  204. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  205. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  206. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  207. #rounder ", %%mm4 \n\t"\
  208. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  209. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  210. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  211. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  212. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  213. #rounder ", %%mm0 \n\t"\
  214. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  215. "paddd %%mm0, %%mm0 \n\t" \
  216. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  217. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  218. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  219. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  220. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  221. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  222. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  223. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  224. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  225. "psrad $" #shift ", %%mm7 \n\t"\
  226. "psrad $" #shift ", %%mm4 \n\t"\
  227. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  228. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  229. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  230. "psrad $" #shift ", %%mm1 \n\t"\
  231. "psrad $" #shift ", %%mm2 \n\t"\
  232. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  233. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  234. "movq %%mm7, " #dst " \n\t"\
  235. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  236. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  237. "movq %%mm2, 24+" #dst " \n\t"\
  238. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  239. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  240. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  241. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  242. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  243. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  244. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  245. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  246. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  247. "psrad $" #shift ", %%mm2 \n\t"\
  248. "psrad $" #shift ", %%mm0 \n\t"\
  249. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  250. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  251. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  252. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  253. "psrad $" #shift ", %%mm6 \n\t"\
  254. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  255. "movq %%mm2, 8+" #dst " \n\t"\
  256. "psrad $" #shift ", %%mm4 \n\t"\
  257. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  258. "movq %%mm4, 16+" #dst " \n\t"\
  259. #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
  260. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  261. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  262. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  263. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  264. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  265. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  266. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  267. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  268. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  269. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  270. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  271. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  272. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  273. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  274. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  275. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  276. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  277. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  278. "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
  279. "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
  280. "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
  281. "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  282. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  283. "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
  284. "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
  285. "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  286. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  287. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  288. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  289. "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
  290. "psrad $" #shift ", %%mm7 \n\t"\
  291. "psrad $" #shift ", %%mm4 \n\t"\
  292. "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
  293. "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  294. "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  295. "psrad $" #shift ", %%mm0 \n\t"\
  296. "psrad $" #shift ", %%mm2 \n\t"\
  297. "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  298. "movd %%mm7, " #dst " \n\t"\
  299. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  300. "movd %%mm0, 16+" #dst " \n\t"\
  301. "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  302. "movd %%mm2, 96+" #dst " \n\t"\
  303. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  304. "movd %%mm4, 112+" #dst " \n\t"\
  305. "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
  306. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  307. "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  308. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  309. "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  310. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  311. "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
  312. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  313. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  314. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  315. "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  316. "psrad $" #shift ", %%mm2 \n\t"\
  317. "psrad $" #shift ", %%mm5 \n\t"\
  318. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  319. "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
  320. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  321. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  322. "psrad $" #shift ", %%mm6 \n\t"\
  323. "psrad $" #shift ", %%mm4 \n\t"\
  324. "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  325. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  326. "movd %%mm2, 32+" #dst " \n\t"\
  327. "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
  328. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  329. "movd %%mm6, 48+" #dst " \n\t"\
  330. "movd %%mm4, 64+" #dst " \n\t"\
  331. "movd %%mm5, 80+" #dst " \n\t"\
  332. #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  333. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  334. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  335. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  336. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  337. "movq "MANGLE(wm1010)", %%mm4 \n\t"\
  338. "pand %%mm0, %%mm4 \n\t"\
  339. "por %%mm1, %%mm4 \n\t"\
  340. "por %%mm2, %%mm4 \n\t"\
  341. "por %%mm3, %%mm4 \n\t"\
  342. "packssdw %%mm4,%%mm4 \n\t"\
  343. "movd %%mm4, %%eax \n\t"\
  344. "orl %%eax, %%eax \n\t"\
  345. "jz 1f \n\t"\
  346. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  347. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  348. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  349. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  350. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  351. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  352. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  353. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  354. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  355. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  356. #rounder ", %%mm4 \n\t"\
  357. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  358. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  359. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  360. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  361. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  362. #rounder ", %%mm0 \n\t"\
  363. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  364. "paddd %%mm0, %%mm0 \n\t" \
  365. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  366. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  367. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  368. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  369. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  370. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  371. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  372. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  373. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  374. "psrad $" #shift ", %%mm7 \n\t"\
  375. "psrad $" #shift ", %%mm4 \n\t"\
  376. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  377. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  378. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  379. "psrad $" #shift ", %%mm1 \n\t"\
  380. "psrad $" #shift ", %%mm2 \n\t"\
  381. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  382. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  383. "movq %%mm7, " #dst " \n\t"\
  384. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  385. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  386. "movq %%mm2, 24+" #dst " \n\t"\
  387. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  388. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  389. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  390. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  391. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  392. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  393. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  394. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  395. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  396. "psrad $" #shift ", %%mm2 \n\t"\
  397. "psrad $" #shift ", %%mm0 \n\t"\
  398. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  399. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  400. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  401. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  402. "psrad $" #shift ", %%mm6 \n\t"\
  403. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  404. "movq %%mm2, 8+" #dst " \n\t"\
  405. "psrad $" #shift ", %%mm4 \n\t"\
  406. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  407. "movq %%mm4, 16+" #dst " \n\t"\
  408. "jmp 2f \n\t"\
  409. "1: \n\t"\
  410. "pslld $16, %%mm0 \n\t"\
  411. "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
  412. "psrad $13, %%mm0 \n\t"\
  413. "packssdw %%mm0, %%mm0 \n\t"\
  414. "movq %%mm0, " #dst " \n\t"\
  415. "movq %%mm0, 8+" #dst " \n\t"\
  416. "movq %%mm0, 16+" #dst " \n\t"\
  417. "movq %%mm0, 24+" #dst " \n\t"\
  418. "2: \n\t"
  419. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  420. ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
  421. /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
  422. ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
  423. ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
  424. DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
  425. DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
  426. DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
  427. //IDCT( src0, src4, src1, src5, dst, shift)
  428. COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  429. COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  430. COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  431. COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  432. #else
  433. #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  434. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  435. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  436. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  437. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  438. "movq "MANGLE(wm1010)", %%mm4 \n\t"\
  439. "pand %%mm0, %%mm4 \n\t"\
  440. "por %%mm1, %%mm4 \n\t"\
  441. "por %%mm2, %%mm4 \n\t"\
  442. "por %%mm3, %%mm4 \n\t"\
  443. "packssdw %%mm4,%%mm4 \n\t"\
  444. "movd %%mm4, %%eax \n\t"\
  445. "orl %%eax, %%eax \n\t"\
  446. "jz 1f \n\t"\
  447. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  448. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  449. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  450. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  451. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  452. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  453. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  454. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  455. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  456. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  457. #rounder ", %%mm4 \n\t"\
  458. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  459. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  460. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  461. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  462. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  463. #rounder ", %%mm0 \n\t"\
  464. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  465. "paddd %%mm0, %%mm0 \n\t" \
  466. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  467. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  468. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  469. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  470. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  471. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  472. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  473. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  474. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  475. "psrad $" #shift ", %%mm7 \n\t"\
  476. "psrad $" #shift ", %%mm4 \n\t"\
  477. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  478. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  479. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  480. "psrad $" #shift ", %%mm1 \n\t"\
  481. "psrad $" #shift ", %%mm2 \n\t"\
  482. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  483. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  484. "movq %%mm7, " #dst " \n\t"\
  485. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  486. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  487. "movq %%mm2, 24+" #dst " \n\t"\
  488. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  489. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  490. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  491. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  492. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  493. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  494. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  495. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  496. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  497. "psrad $" #shift ", %%mm2 \n\t"\
  498. "psrad $" #shift ", %%mm0 \n\t"\
  499. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  500. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  501. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  502. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  503. "psrad $" #shift ", %%mm6 \n\t"\
  504. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  505. "movq %%mm2, 8+" #dst " \n\t"\
  506. "psrad $" #shift ", %%mm4 \n\t"\
  507. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  508. "movq %%mm4, 16+" #dst " \n\t"\
  509. "jmp 2f \n\t"\
  510. "1: \n\t"\
  511. "pslld $16, %%mm0 \n\t"\
  512. "paddd "MANGLE(d40000)", %%mm0 \n\t"\
  513. "psrad $13, %%mm0 \n\t"\
  514. "packssdw %%mm0, %%mm0 \n\t"\
  515. "movq %%mm0, " #dst " \n\t"\
  516. "movq %%mm0, 8+" #dst " \n\t"\
  517. "movq %%mm0, 16+" #dst " \n\t"\
  518. "movq %%mm0, 24+" #dst " \n\t"\
  519. "2: \n\t"
  520. #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
  521. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  522. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  523. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  524. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  525. "movq %%mm0, %%mm4 \n\t"\
  526. "por %%mm1, %%mm4 \n\t"\
  527. "por %%mm2, %%mm4 \n\t"\
  528. "por %%mm3, %%mm4 \n\t"\
  529. "packssdw %%mm4,%%mm4 \n\t"\
  530. "movd %%mm4, %%eax \n\t"\
  531. "orl %%eax, %%eax \n\t"\
  532. "jz " #bt " \n\t"\
  533. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  534. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  535. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  536. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  537. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  538. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  539. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  540. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  541. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  542. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  543. #rounder ", %%mm4 \n\t"\
  544. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  545. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  546. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  547. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  548. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  549. #rounder ", %%mm0 \n\t"\
  550. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  551. "paddd %%mm0, %%mm0 \n\t" \
  552. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  553. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  554. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  555. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  556. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  557. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  558. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  559. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  560. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  561. "psrad $" #shift ", %%mm7 \n\t"\
  562. "psrad $" #shift ", %%mm4 \n\t"\
  563. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  564. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  565. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  566. "psrad $" #shift ", %%mm1 \n\t"\
  567. "psrad $" #shift ", %%mm2 \n\t"\
  568. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  569. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  570. "movq %%mm7, " #dst " \n\t"\
  571. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  572. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  573. "movq %%mm2, 24+" #dst " \n\t"\
  574. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  575. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  576. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  577. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  578. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  579. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  580. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  581. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  582. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  583. "psrad $" #shift ", %%mm2 \n\t"\
  584. "psrad $" #shift ", %%mm0 \n\t"\
  585. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  586. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  587. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  588. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  589. "psrad $" #shift ", %%mm6 \n\t"\
  590. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  591. "movq %%mm2, 8+" #dst " \n\t"\
  592. "psrad $" #shift ", %%mm4 \n\t"\
  593. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  594. "movq %%mm4, 16+" #dst " \n\t"\
  595. #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  596. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  597. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  598. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  599. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  600. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  601. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  602. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  603. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  604. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  605. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  606. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  607. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  608. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  609. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  610. #rounder ", %%mm4 \n\t"\
  611. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  612. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  613. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  614. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  615. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  616. #rounder ", %%mm0 \n\t"\
  617. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  618. "paddd %%mm0, %%mm0 \n\t" \
  619. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  620. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  621. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  622. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  623. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  624. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  625. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  626. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  627. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  628. "psrad $" #shift ", %%mm7 \n\t"\
  629. "psrad $" #shift ", %%mm4 \n\t"\
  630. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  631. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  632. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  633. "psrad $" #shift ", %%mm1 \n\t"\
  634. "psrad $" #shift ", %%mm2 \n\t"\
  635. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  636. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  637. "movq %%mm7, " #dst " \n\t"\
  638. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  639. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  640. "movq %%mm2, 24+" #dst " \n\t"\
  641. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  642. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  643. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  644. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  645. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  646. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  647. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  648. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  649. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  650. "psrad $" #shift ", %%mm2 \n\t"\
  651. "psrad $" #shift ", %%mm0 \n\t"\
  652. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  653. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  654. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  655. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  656. "psrad $" #shift ", %%mm6 \n\t"\
  657. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  658. "movq %%mm2, 8+" #dst " \n\t"\
  659. "psrad $" #shift ", %%mm4 \n\t"\
  660. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  661. "movq %%mm4, 16+" #dst " \n\t"\
  662. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  663. DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
  664. Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
  665. Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
  666. Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
  667. #undef IDCT
  668. #define IDCT(src0, src4, src1, src5, dst, shift) \
  669. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  670. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  671. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  672. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  673. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  674. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  675. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  676. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  677. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  678. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  679. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  680. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  681. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  682. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  683. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  684. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  685. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  686. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  687. "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
  688. "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
  689. "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
  690. "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  691. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  692. "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
  693. "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
  694. "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  695. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  696. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  697. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  698. "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
  699. "psrad $" #shift ", %%mm7 \n\t"\
  700. "psrad $" #shift ", %%mm4 \n\t"\
  701. "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
  702. "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  703. "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  704. "psrad $" #shift ", %%mm0 \n\t"\
  705. "psrad $" #shift ", %%mm2 \n\t"\
  706. "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  707. "movd %%mm7, " #dst " \n\t"\
  708. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  709. "movd %%mm0, 16+" #dst " \n\t"\
  710. "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  711. "movd %%mm2, 96+" #dst " \n\t"\
  712. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  713. "movd %%mm4, 112+" #dst " \n\t"\
  714. "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
  715. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  716. "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  717. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  718. "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  719. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  720. "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
  721. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  722. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  723. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  724. "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  725. "psrad $" #shift ", %%mm2 \n\t"\
  726. "psrad $" #shift ", %%mm5 \n\t"\
  727. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  728. "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
  729. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  730. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  731. "psrad $" #shift ", %%mm6 \n\t"\
  732. "psrad $" #shift ", %%mm4 \n\t"\
  733. "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  734. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  735. "movd %%mm2, 32+" #dst " \n\t"\
  736. "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
  737. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  738. "movd %%mm6, 48+" #dst " \n\t"\
  739. "movd %%mm4, 64+" #dst " \n\t"\
  740. "movd %%mm5, 80+" #dst " \n\t"
  741. //IDCT( src0, src4, src1, src5, dst, shift)
  742. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  743. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  744. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  745. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  746. "jmp 9f \n\t"
  747. "#" ASMALIGN(4) \
  748. "4: \n\t"
  749. Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
  750. Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
  751. #undef IDCT
  752. #define IDCT(src0, src4, src1, src5, dst, shift) \
  753. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  754. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  755. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  756. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  757. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  758. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  759. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  760. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  761. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  762. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  763. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  764. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  765. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  766. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  767. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  768. "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
  769. "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
  770. "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
  771. "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  772. "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
  773. "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  774. "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
  775. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  776. "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  777. "psrad $" #shift ", %%mm1 \n\t"\
  778. "psrad $" #shift ", %%mm4 \n\t"\
  779. "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
  780. "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  781. "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  782. "psrad $" #shift ", %%mm0 \n\t"\
  783. "psrad $" #shift ", %%mm2 \n\t"\
  784. "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
  785. "movd %%mm1, " #dst " \n\t"\
  786. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  787. "movd %%mm0, 16+" #dst " \n\t"\
  788. "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  789. "movd %%mm2, 96+" #dst " \n\t"\
  790. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  791. "movd %%mm4, 112+" #dst " \n\t"\
  792. "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
  793. "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  794. "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
  795. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  796. "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  797. "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  798. "psrad $" #shift ", %%mm2 \n\t"\
  799. "psrad $" #shift ", %%mm5 \n\t"\
  800. "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
  801. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  802. "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
  803. "psrad $" #shift ", %%mm6 \n\t"\
  804. "psrad $" #shift ", %%mm1 \n\t"\
  805. "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  806. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  807. "movd %%mm2, 32+" #dst " \n\t"\
  808. "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
  809. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  810. "movd %%mm6, 48+" #dst " \n\t"\
  811. "movd %%mm1, 64+" #dst " \n\t"\
  812. "movd %%mm5, 80+" #dst " \n\t"
  813. //IDCT( src0, src4, src1, src5, dst, shift)
  814. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  815. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  816. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  817. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  818. "jmp 9f \n\t"
  819. "#" ASMALIGN(4) \
  820. "6: \n\t"
  821. Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
  822. #undef IDCT
  823. #define IDCT(src0, src4, src1, src5, dst, shift) \
  824. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  825. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  826. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  827. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  828. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  829. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  830. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  831. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  832. "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
  833. "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  834. "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
  835. "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  836. "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
  837. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  838. "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  839. "psrad $" #shift ", %%mm1 \n\t"\
  840. "psrad $" #shift ", %%mm4 \n\t"\
  841. "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
  842. "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  843. "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  844. "psrad $" #shift ", %%mm0 \n\t"\
  845. "psrad $" #shift ", %%mm2 \n\t"\
  846. "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
  847. "movd %%mm1, " #dst " \n\t"\
  848. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  849. "movd %%mm0, 16+" #dst " \n\t"\
  850. "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  851. "movd %%mm2, 96+" #dst " \n\t"\
  852. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  853. "movd %%mm4, 112+" #dst " \n\t"\
  854. "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
  855. "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  856. "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
  857. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  858. "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  859. "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  860. "psrad $" #shift ", %%mm2 \n\t"\
  861. "psrad $" #shift ", %%mm5 \n\t"\
  862. "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
  863. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  864. "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
  865. "psrad $" #shift ", %%mm6 \n\t"\
  866. "psrad $" #shift ", %%mm1 \n\t"\
  867. "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  868. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  869. "movd %%mm2, 32+" #dst " \n\t"\
  870. "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
  871. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  872. "movd %%mm6, 48+" #dst " \n\t"\
  873. "movd %%mm1, 64+" #dst " \n\t"\
  874. "movd %%mm5, 80+" #dst " \n\t"
  875. //IDCT( src0, src4, src1, src5, dst, shift)
  876. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  877. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  878. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  879. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  880. "jmp 9f \n\t"
  881. "#" ASMALIGN(4) \
  882. "2: \n\t"
  883. Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
  884. #undef IDCT
  885. #define IDCT(src0, src4, src1, src5, dst, shift) \
  886. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  887. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  888. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  889. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  890. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  891. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  892. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  893. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  894. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  895. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  896. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  897. "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
  898. "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  899. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  900. "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
  901. "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
  902. "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  903. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  904. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  905. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  906. "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
  907. "psrad $" #shift ", %%mm7 \n\t"\
  908. "psrad $" #shift ", %%mm4 \n\t"\
  909. "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
  910. "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  911. "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  912. "psrad $" #shift ", %%mm0 \n\t"\
  913. "psrad $" #shift ", %%mm2 \n\t"\
  914. "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  915. "movd %%mm7, " #dst " \n\t"\
  916. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  917. "movd %%mm0, 16+" #dst " \n\t"\
  918. "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  919. "movd %%mm2, 96+" #dst " \n\t"\
  920. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  921. "movd %%mm4, 112+" #dst " \n\t"\
  922. "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
  923. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  924. "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  925. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  926. "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  927. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  928. "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
  929. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  930. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  931. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  932. "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  933. "psrad $" #shift ", %%mm2 \n\t"\
  934. "psrad $" #shift ", %%mm5 \n\t"\
  935. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  936. "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
  937. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  938. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  939. "psrad $" #shift ", %%mm6 \n\t"\
  940. "psrad $" #shift ", %%mm4 \n\t"\
  941. "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  942. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  943. "movd %%mm2, 32+" #dst " \n\t"\
  944. "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
  945. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  946. "movd %%mm6, 48+" #dst " \n\t"\
  947. "movd %%mm4, 64+" #dst " \n\t"\
  948. "movd %%mm5, 80+" #dst " \n\t"
  949. //IDCT( src0, src4, src1, src5, dst, shift)
  950. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  951. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  952. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  953. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  954. "jmp 9f \n\t"
  955. "#" ASMALIGN(4) \
  956. "3: \n\t"
  957. #undef IDCT
  958. #define IDCT(src0, src4, src1, src5, dst, shift) \
  959. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  960. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  961. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  962. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  963. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  964. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  965. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  966. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  967. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  968. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  969. "movq 64(%2), %%mm3 \n\t"\
  970. "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  971. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  972. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  973. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  974. "psrad $" #shift ", %%mm7 \n\t"\
  975. "psrad $" #shift ", %%mm4 \n\t"\
  976. "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
  977. "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  978. "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
  979. "psrad $" #shift ", %%mm0 \n\t"\
  980. "psrad $" #shift ", %%mm1 \n\t"\
  981. "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  982. "movd %%mm7, " #dst " \n\t"\
  983. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  984. "movd %%mm0, 16+" #dst " \n\t"\
  985. "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
  986. "movd %%mm1, 96+" #dst " \n\t"\
  987. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  988. "movd %%mm4, 112+" #dst " \n\t"\
  989. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  990. "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  991. "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  992. "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
  993. "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
  994. "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  995. "psrad $" #shift ", %%mm1 \n\t"\
  996. "psrad $" #shift ", %%mm5 \n\t"\
  997. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  998. "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  999. "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  1000. "psrad $" #shift ", %%mm6 \n\t"\
  1001. "psrad $" #shift ", %%mm4 \n\t"\
  1002. "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
  1003. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  1004. "movd %%mm1, 32+" #dst " \n\t"\
  1005. "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
  1006. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  1007. "movd %%mm6, 48+" #dst " \n\t"\
  1008. "movd %%mm4, 64+" #dst " \n\t"\
  1009. "movd %%mm5, 80+" #dst " \n\t"
  1010. //IDCT( src0, src4, src1, src5, dst, shift)
  1011. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  1012. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  1013. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  1014. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  1015. "jmp 9f \n\t"
  1016. "#" ASMALIGN(4) \
  1017. "5: \n\t"
  1018. #undef IDCT
  1019. #define IDCT(src0, src4, src1, src5, dst, shift) \
  1020. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  1021. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  1022. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  1023. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  1024. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  1025. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  1026. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  1027. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  1028. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  1029. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  1030. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  1031. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  1032. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  1033. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  1034. "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
  1035. "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
  1036. "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
  1037. "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
  1038. "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
  1039. "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  1040. "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
  1041. "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  1042. "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
  1043. "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  1044. "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  1045. "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
  1046. "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
  1047. "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
  1048. "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
  1049. "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
  1050. "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
  1051. "psrad $" #shift ", %%mm4 \n\t"\
  1052. "psrad $" #shift ", %%mm7 \n\t"\
  1053. "psrad $" #shift ", %%mm3 \n\t"\
  1054. "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
  1055. "movq %%mm4, " #dst " \n\t"\
  1056. "psrad $" #shift ", %%mm0 \n\t"\
  1057. "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
  1058. "movq %%mm0, 16+" #dst " \n\t"\
  1059. "movq %%mm0, 96+" #dst " \n\t"\
  1060. "movq %%mm4, 112+" #dst " \n\t"\
  1061. "psrad $" #shift ", %%mm5 \n\t"\
  1062. "psrad $" #shift ", %%mm6 \n\t"\
  1063. "psrad $" #shift ", %%mm2 \n\t"\
  1064. "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  1065. "movq %%mm5, 32+" #dst " \n\t"\
  1066. "psrad $" #shift ", %%mm1 \n\t"\
  1067. "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  1068. "movq %%mm6, 48+" #dst " \n\t"\
  1069. "movq %%mm6, 64+" #dst " \n\t"\
  1070. "movq %%mm5, 80+" #dst " \n\t"
  1071. //IDCT( src0, src4, src1, src5, dst, shift)
  1072. IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  1073. //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  1074. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  1075. //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  1076. "jmp 9f \n\t"
  1077. "#" ASMALIGN(4) \
  1078. "1: \n\t"
  1079. #undef IDCT
  1080. #define IDCT(src0, src4, src1, src5, dst, shift) \
  1081. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  1082. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  1083. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  1084. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  1085. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  1086. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  1087. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  1088. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  1089. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  1090. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  1091. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  1092. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  1093. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  1094. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  1095. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  1096. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  1097. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  1098. "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
  1099. "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
  1100. "movq 64(%2), %%mm1 \n\t"\
  1101. "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  1102. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  1103. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  1104. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  1105. "psrad $" #shift ", %%mm7 \n\t"\
  1106. "psrad $" #shift ", %%mm4 \n\t"\
  1107. "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
  1108. "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  1109. "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
  1110. "psrad $" #shift ", %%mm0 \n\t"\
  1111. "psrad $" #shift ", %%mm3 \n\t"\
  1112. "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  1113. "movd %%mm7, " #dst " \n\t"\
  1114. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  1115. "movd %%mm0, 16+" #dst " \n\t"\
  1116. "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
  1117. "movd %%mm3, 96+" #dst " \n\t"\
  1118. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  1119. "movd %%mm4, 112+" #dst " \n\t"\
  1120. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  1121. "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  1122. "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  1123. "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
  1124. "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
  1125. "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  1126. "psrad $" #shift ", %%mm3 \n\t"\
  1127. "psrad $" #shift ", %%mm5 \n\t"\
  1128. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  1129. "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  1130. "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  1131. "psrad $" #shift ", %%mm6 \n\t"\
  1132. "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
  1133. "movd %%mm3, 32+" #dst " \n\t"\
  1134. "psrad $" #shift ", %%mm4 \n\t"\
  1135. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  1136. "movd %%mm6, 48+" #dst " \n\t"\
  1137. "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
  1138. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  1139. "movd %%mm4, 64+" #dst " \n\t"\
  1140. "movd %%mm5, 80+" #dst " \n\t"
  1141. //IDCT( src0, src4, src1, src5, dst, shift)
  1142. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  1143. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  1144. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  1145. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  1146. "jmp 9f \n\t"
  1147. "#" ASMALIGN(4)
  1148. "7: \n\t"
  1149. #undef IDCT
  1150. #define IDCT(src0, src4, src1, src5, dst, shift) \
  1151. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  1152. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  1153. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  1154. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  1155. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  1156. "psrad $" #shift ", %%mm4 \n\t"\
  1157. "psrad $" #shift ", %%mm0 \n\t"\
  1158. "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
  1159. "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
  1160. "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  1161. "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
  1162. "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  1163. "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
  1164. "psrad $" #shift ", %%mm1 \n\t"\
  1165. "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
  1166. "movq %%mm4, " #dst " \n\t"\
  1167. "psrad $" #shift ", %%mm2 \n\t"\
  1168. "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
  1169. "movq %%mm0, 16+" #dst " \n\t"\
  1170. "movq %%mm0, 96+" #dst " \n\t"\
  1171. "movq %%mm4, 112+" #dst " \n\t"\
  1172. "movq %%mm0, 32+" #dst " \n\t"\
  1173. "movq %%mm4, 48+" #dst " \n\t"\
  1174. "movq %%mm4, 64+" #dst " \n\t"\
  1175. "movq %%mm0, 80+" #dst " \n\t"
  1176. //IDCT( src0, src4, src1, src5, dst, shift)
  1177. IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  1178. //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  1179. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  1180. //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  1181. #endif
  1182. /*
  1183. Input
  1184. 00 40 04 44 20 60 24 64
  1185. 10 30 14 34 50 70 54 74
  1186. 01 41 03 43 21 61 23 63
  1187. 11 31 13 33 51 71 53 73
  1188. 02 42 06 46 22 62 26 66
  1189. 12 32 16 36 52 72 56 76
  1190. 05 45 07 47 25 65 27 67
  1191. 15 35 17 37 55 75 57 77
  1192. Temp
  1193. 00 04 10 14 20 24 30 34
  1194. 40 44 50 54 60 64 70 74
  1195. 01 03 11 13 21 23 31 33
  1196. 41 43 51 53 61 63 71 73
  1197. 02 06 12 16 22 26 32 36
  1198. 42 46 52 56 62 66 72 76
  1199. 05 07 15 17 25 27 35 37
  1200. 45 47 55 57 65 67 75 77
  1201. */
  1202. "9: \n\t"
  1203. :: "r" (block), "r" (temp), "r" (coeffs)
  1204. : "%eax"
  1205. );
  1206. }
/**
 * Public entry point: perform the simple inverse DCT (MMX version) on an
 * 8x8 block of coefficients, in place.  Thin wrapper around the file-local
 * idct() implementation.
 *
 * @param block  pointer to 64 int16_t DCT coefficients; overwritten with
 *               the transformed samples
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
  1211. //FIXME merge add/put into the idct
/**
 * Inverse-transform a coefficient block, then store the result into a
 * picture plane via put_pixels_clamped_mmx() (which, per its name,
 * presumably clamps samples to the valid pixel range -- defined elsewhere).
 *
 * @param dest      destination of the 8x8 output area
 * @param line_size byte stride between consecutive lines of dest
 * @param block     8x8 DCT coefficient block; modified in place by idct()
 */
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    idct(block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
/**
 * Inverse-transform a coefficient block, then accumulate the result onto an
 * existing picture plane via add_pixels_clamped_mmx() (which, per its name,
 * presumably adds with clamping -- defined elsewhere).
 *
 * @param dest      destination of the 8x8 output area to add into
 * @param line_size byte stride between consecutive lines of dest
 * @param block     8x8 DCT coefficient block; modified in place by idct()
 */
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    idct(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}