h264dsp_mmx.c 92 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542
0552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296
  1. /*
  2. * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #include "dsputil_mmx.h"
  21. DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;
  22. DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;
  23. /***********************************/
  24. /* IDCT */
  25. #define SUMSUB_BADC( a, b, c, d ) \
  26. "paddw "#b", "#a" \n\t"\
  27. "paddw "#d", "#c" \n\t"\
  28. "paddw "#b", "#b" \n\t"\
  29. "paddw "#d", "#d" \n\t"\
  30. "psubw "#a", "#b" \n\t"\
  31. "psubw "#c", "#d" \n\t"
  32. #define SUMSUBD2_AB( a, b, t ) \
  33. "movq "#b", "#t" \n\t"\
  34. "psraw $1 , "#b" \n\t"\
  35. "paddw "#a", "#b" \n\t"\
  36. "psraw $1 , "#a" \n\t"\
  37. "psubw "#t", "#a" \n\t"
  38. #define IDCT4_1D( s02, s13, d02, d13, t ) \
  39. SUMSUB_BA ( s02, d02 )\
  40. SUMSUBD2_AB( s13, d13, t )\
  41. SUMSUB_BADC( d13, s02, s13, d02 )
  42. #define STORE_DIFF_4P( p, t, z ) \
  43. "psraw $6, "#p" \n\t"\
  44. "movd (%0), "#t" \n\t"\
  45. "punpcklbw "#z", "#t" \n\t"\
  46. "paddsw "#t", "#p" \n\t"\
  47. "packuswb "#z", "#p" \n\t"\
  48. "movd "#p", (%0) \n\t"
  49. static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
  50. {
  51. /* Load dct coeffs */
  52. __asm__ volatile(
  53. "movq (%0), %%mm0 \n\t"
  54. "movq 8(%0), %%mm1 \n\t"
  55. "movq 16(%0), %%mm2 \n\t"
  56. "movq 24(%0), %%mm3 \n\t"
  57. :: "r"(block) );
  58. __asm__ volatile(
  59. /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */
  60. IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
  61. "movq %0, %%mm6 \n\t"
  62. /* in: 1,4,0,2 out: 1,2,3,0 */
  63. TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
  64. "paddw %%mm6, %%mm3 \n\t"
  65. /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */
  66. IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
  67. "pxor %%mm7, %%mm7 \n\t"
  68. :: "m"(ff_pw_32));
  69. __asm__ volatile(
  70. STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
  71. "add %1, %0 \n\t"
  72. STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
  73. "add %1, %0 \n\t"
  74. STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
  75. "add %1, %0 \n\t"
  76. STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
  77. : "+r"(dst)
  78. : "r" ((x86_reg)stride)
  79. );
  80. }
  81. static inline void h264_idct8_1d(int16_t *block)
  82. {
  83. __asm__ volatile(
  84. "movq 112(%0), %%mm7 \n\t"
  85. "movq 80(%0), %%mm0 \n\t"
  86. "movq 48(%0), %%mm3 \n\t"
  87. "movq 16(%0), %%mm5 \n\t"
  88. "movq %%mm0, %%mm4 \n\t"
  89. "movq %%mm5, %%mm1 \n\t"
  90. "psraw $1, %%mm4 \n\t"
  91. "psraw $1, %%mm1 \n\t"
  92. "paddw %%mm0, %%mm4 \n\t"
  93. "paddw %%mm5, %%mm1 \n\t"
  94. "paddw %%mm7, %%mm4 \n\t"
  95. "paddw %%mm0, %%mm1 \n\t"
  96. "psubw %%mm5, %%mm4 \n\t"
  97. "paddw %%mm3, %%mm1 \n\t"
  98. "psubw %%mm3, %%mm5 \n\t"
  99. "psubw %%mm3, %%mm0 \n\t"
  100. "paddw %%mm7, %%mm5 \n\t"
  101. "psubw %%mm7, %%mm0 \n\t"
  102. "psraw $1, %%mm3 \n\t"
  103. "psraw $1, %%mm7 \n\t"
  104. "psubw %%mm3, %%mm5 \n\t"
  105. "psubw %%mm7, %%mm0 \n\t"
  106. "movq %%mm4, %%mm3 \n\t"
  107. "movq %%mm1, %%mm7 \n\t"
  108. "psraw $2, %%mm1 \n\t"
  109. "psraw $2, %%mm3 \n\t"
  110. "paddw %%mm5, %%mm3 \n\t"
  111. "psraw $2, %%mm5 \n\t"
  112. "paddw %%mm0, %%mm1 \n\t"
  113. "psraw $2, %%mm0 \n\t"
  114. "psubw %%mm4, %%mm5 \n\t"
  115. "psubw %%mm0, %%mm7 \n\t"
  116. "movq 32(%0), %%mm2 \n\t"
  117. "movq 96(%0), %%mm6 \n\t"
  118. "movq %%mm2, %%mm4 \n\t"
  119. "movq %%mm6, %%mm0 \n\t"
  120. "psraw $1, %%mm4 \n\t"
  121. "psraw $1, %%mm6 \n\t"
  122. "psubw %%mm0, %%mm4 \n\t"
  123. "paddw %%mm2, %%mm6 \n\t"
  124. "movq (%0), %%mm2 \n\t"
  125. "movq 64(%0), %%mm0 \n\t"
  126. SUMSUB_BA( %%mm0, %%mm2 )
  127. SUMSUB_BA( %%mm6, %%mm0 )
  128. SUMSUB_BA( %%mm4, %%mm2 )
  129. SUMSUB_BA( %%mm7, %%mm6 )
  130. SUMSUB_BA( %%mm5, %%mm4 )
  131. SUMSUB_BA( %%mm3, %%mm2 )
  132. SUMSUB_BA( %%mm1, %%mm0 )
  133. :: "r"(block)
  134. );
  135. }
  136. static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
  137. {
  138. int i;
  139. int16_t __attribute__ ((aligned(8))) b2[64];
  140. block[0] += 32;
  141. for(i=0; i<2; i++){
  142. DECLARE_ALIGNED_8(uint64_t, tmp);
  143. h264_idct8_1d(block+4*i);
  144. __asm__ volatile(
  145. "movq %%mm7, %0 \n\t"
  146. TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
  147. "movq %%mm0, 8(%1) \n\t"
  148. "movq %%mm6, 24(%1) \n\t"
  149. "movq %%mm7, 40(%1) \n\t"
  150. "movq %%mm4, 56(%1) \n\t"
  151. "movq %0, %%mm7 \n\t"
  152. TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
  153. "movq %%mm7, (%1) \n\t"
  154. "movq %%mm1, 16(%1) \n\t"
  155. "movq %%mm0, 32(%1) \n\t"
  156. "movq %%mm3, 48(%1) \n\t"
  157. : "=m"(tmp)
  158. : "r"(b2+32*i)
  159. : "memory"
  160. );
  161. }
  162. for(i=0; i<2; i++){
  163. h264_idct8_1d(b2+4*i);
  164. __asm__ volatile(
  165. "psraw $6, %%mm7 \n\t"
  166. "psraw $6, %%mm6 \n\t"
  167. "psraw $6, %%mm5 \n\t"
  168. "psraw $6, %%mm4 \n\t"
  169. "psraw $6, %%mm3 \n\t"
  170. "psraw $6, %%mm2 \n\t"
  171. "psraw $6, %%mm1 \n\t"
  172. "psraw $6, %%mm0 \n\t"
  173. "movq %%mm7, (%0) \n\t"
  174. "movq %%mm5, 16(%0) \n\t"
  175. "movq %%mm3, 32(%0) \n\t"
  176. "movq %%mm1, 48(%0) \n\t"
  177. "movq %%mm0, 64(%0) \n\t"
  178. "movq %%mm2, 80(%0) \n\t"
  179. "movq %%mm4, 96(%0) \n\t"
  180. "movq %%mm6, 112(%0) \n\t"
  181. :: "r"(b2+4*i)
  182. : "memory"
  183. );
  184. }
  185. add_pixels_clamped_mmx(b2, dst, stride);
  186. }
  187. #define STORE_DIFF_8P( p, d, t, z )\
  188. "movq "#d", "#t" \n"\
  189. "psraw $6, "#p" \n"\
  190. "punpcklbw "#z", "#t" \n"\
  191. "paddsw "#t", "#p" \n"\
  192. "packuswb "#p", "#p" \n"\
  193. "movq "#p", "#d" \n"
  194. #define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
  195. "movdqa "#c", "#a" \n"\
  196. "movdqa "#g", "#e" \n"\
  197. "psraw $1, "#c" \n"\
  198. "psraw $1, "#g" \n"\
  199. "psubw "#e", "#c" \n"\
  200. "paddw "#a", "#g" \n"\
  201. "movdqa "#b", "#e" \n"\
  202. "psraw $1, "#e" \n"\
  203. "paddw "#b", "#e" \n"\
  204. "paddw "#d", "#e" \n"\
  205. "paddw "#f", "#e" \n"\
  206. "movdqa "#f", "#a" \n"\
  207. "psraw $1, "#a" \n"\
  208. "paddw "#f", "#a" \n"\
  209. "paddw "#h", "#a" \n"\
  210. "psubw "#b", "#a" \n"\
  211. "psubw "#d", "#b" \n"\
  212. "psubw "#d", "#f" \n"\
  213. "paddw "#h", "#b" \n"\
  214. "psubw "#h", "#f" \n"\
  215. "psraw $1, "#d" \n"\
  216. "psraw $1, "#h" \n"\
  217. "psubw "#d", "#b" \n"\
  218. "psubw "#h", "#f" \n"\
  219. "movdqa "#e", "#d" \n"\
  220. "movdqa "#a", "#h" \n"\
  221. "psraw $2, "#d" \n"\
  222. "psraw $2, "#h" \n"\
  223. "paddw "#f", "#d" \n"\
  224. "paddw "#b", "#h" \n"\
  225. "psraw $2, "#f" \n"\
  226. "psraw $2, "#b" \n"\
  227. "psubw "#f", "#e" \n"\
  228. "psubw "#a", "#b" \n"\
  229. "movdqa 0x00(%1), "#a" \n"\
  230. "movdqa 0x40(%1), "#f" \n"\
  231. SUMSUB_BA(f, a)\
  232. SUMSUB_BA(g, f)\
  233. SUMSUB_BA(c, a)\
  234. SUMSUB_BA(e, g)\
  235. SUMSUB_BA(b, c)\
  236. SUMSUB_BA(h, a)\
  237. SUMSUB_BA(d, f)
  238. static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
  239. {
  240. __asm__ volatile(
  241. "movdqa 0x10(%1), %%xmm1 \n"
  242. "movdqa 0x20(%1), %%xmm2 \n"
  243. "movdqa 0x30(%1), %%xmm3 \n"
  244. "movdqa 0x50(%1), %%xmm5 \n"
  245. "movdqa 0x60(%1), %%xmm6 \n"
  246. "movdqa 0x70(%1), %%xmm7 \n"
  247. H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
  248. TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
  249. "paddw %4, %%xmm4 \n"
  250. "movdqa %%xmm4, 0x00(%1) \n"
  251. "movdqa %%xmm2, 0x40(%1) \n"
  252. H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
  253. "movdqa %%xmm6, 0x60(%1) \n"
  254. "movdqa %%xmm7, 0x70(%1) \n"
  255. "pxor %%xmm7, %%xmm7 \n"
  256. STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7)
  257. STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7)
  258. STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
  259. STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7)
  260. "lea (%0,%2,4), %0 \n"
  261. STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7)
  262. STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7)
  263. "movdqa 0x60(%1), %%xmm0 \n"
  264. "movdqa 0x70(%1), %%xmm1 \n"
  265. STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
  266. STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7)
  267. :"+r"(dst)
  268. :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
  269. );
  270. }
  271. static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
  272. {
  273. int dc = (block[0] + 32) >> 6;
  274. __asm__ volatile(
  275. "movd %0, %%mm0 \n\t"
  276. "pshufw $0, %%mm0, %%mm0 \n\t"
  277. "pxor %%mm1, %%mm1 \n\t"
  278. "psubw %%mm0, %%mm1 \n\t"
  279. "packuswb %%mm0, %%mm0 \n\t"
  280. "packuswb %%mm1, %%mm1 \n\t"
  281. ::"r"(dc)
  282. );
  283. __asm__ volatile(
  284. "movd %0, %%mm2 \n\t"
  285. "movd %1, %%mm3 \n\t"
  286. "movd %2, %%mm4 \n\t"
  287. "movd %3, %%mm5 \n\t"
  288. "paddusb %%mm0, %%mm2 \n\t"
  289. "paddusb %%mm0, %%mm3 \n\t"
  290. "paddusb %%mm0, %%mm4 \n\t"
  291. "paddusb %%mm0, %%mm5 \n\t"
  292. "psubusb %%mm1, %%mm2 \n\t"
  293. "psubusb %%mm1, %%mm3 \n\t"
  294. "psubusb %%mm1, %%mm4 \n\t"
  295. "psubusb %%mm1, %%mm5 \n\t"
  296. "movd %%mm2, %0 \n\t"
  297. "movd %%mm3, %1 \n\t"
  298. "movd %%mm4, %2 \n\t"
  299. "movd %%mm5, %3 \n\t"
  300. :"+m"(*(uint32_t*)(dst+0*stride)),
  301. "+m"(*(uint32_t*)(dst+1*stride)),
  302. "+m"(*(uint32_t*)(dst+2*stride)),
  303. "+m"(*(uint32_t*)(dst+3*stride))
  304. );
  305. }
  306. static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
  307. {
  308. int dc = (block[0] + 32) >> 6;
  309. int y;
  310. __asm__ volatile(
  311. "movd %0, %%mm0 \n\t"
  312. "pshufw $0, %%mm0, %%mm0 \n\t"
  313. "pxor %%mm1, %%mm1 \n\t"
  314. "psubw %%mm0, %%mm1 \n\t"
  315. "packuswb %%mm0, %%mm0 \n\t"
  316. "packuswb %%mm1, %%mm1 \n\t"
  317. ::"r"(dc)
  318. );
  319. for(y=2; y--; dst += 4*stride){
  320. __asm__ volatile(
  321. "movq %0, %%mm2 \n\t"
  322. "movq %1, %%mm3 \n\t"
  323. "movq %2, %%mm4 \n\t"
  324. "movq %3, %%mm5 \n\t"
  325. "paddusb %%mm0, %%mm2 \n\t"
  326. "paddusb %%mm0, %%mm3 \n\t"
  327. "paddusb %%mm0, %%mm4 \n\t"
  328. "paddusb %%mm0, %%mm5 \n\t"
  329. "psubusb %%mm1, %%mm2 \n\t"
  330. "psubusb %%mm1, %%mm3 \n\t"
  331. "psubusb %%mm1, %%mm4 \n\t"
  332. "psubusb %%mm1, %%mm5 \n\t"
  333. "movq %%mm2, %0 \n\t"
  334. "movq %%mm3, %1 \n\t"
  335. "movq %%mm4, %2 \n\t"
  336. "movq %%mm5, %3 \n\t"
  337. :"+m"(*(uint64_t*)(dst+0*stride)),
  338. "+m"(*(uint64_t*)(dst+1*stride)),
  339. "+m"(*(uint64_t*)(dst+2*stride)),
  340. "+m"(*(uint64_t*)(dst+3*stride))
  341. );
  342. }
  343. }
  344. //FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
  345. static const uint8_t scan8[16 + 2*4]={
  346. 4+1*8, 5+1*8, 4+2*8, 5+2*8,
  347. 6+1*8, 7+1*8, 6+2*8, 7+2*8,
  348. 4+3*8, 5+3*8, 4+4*8, 5+4*8,
  349. 6+3*8, 7+3*8, 6+4*8, 7+4*8,
  350. 1+1*8, 2+1*8,
  351. 1+2*8, 2+2*8,
  352. 1+4*8, 2+4*8,
  353. 1+5*8, 2+5*8,
  354. };
  355. static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
  356. int i;
  357. for(i=0; i<16; i++){
  358. if(nnzc[ scan8[i] ])
  359. ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
  360. }
  361. }
  362. static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
  363. int i;
  364. for(i=0; i<16; i+=4){
  365. if(nnzc[ scan8[i] ])
  366. ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride);
  367. }
  368. }
  369. static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
  370. int i;
  371. for(i=0; i<16; i++){
  372. int nnz = nnzc[ scan8[i] ];
  373. if(nnz){
  374. if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
  375. else ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
  376. }
  377. }
  378. }
  379. static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
  380. int i;
  381. for(i=0; i<16; i++){
  382. if(nnzc[ scan8[i] ] || block[i*16])
  383. ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
  384. }
  385. }
  386. static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
  387. int i;
  388. for(i=0; i<16; i++){
  389. if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
  390. else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
  391. }
  392. }
  393. static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
  394. int i;
  395. for(i=0; i<16; i+=4){
  396. int nnz = nnzc[ scan8[i] ];
  397. if(nnz){
  398. if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
  399. else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride);
  400. }
  401. }
  402. }
  403. static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
  404. int i;
  405. for(i=0; i<16; i+=4){
  406. int nnz = nnzc[ scan8[i] ];
  407. if(nnz){
  408. if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
  409. else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride);
  410. }
  411. }
  412. }
  413. static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
  414. int i;
  415. for(i=16; i<16+8; i++){
  416. if(nnzc[ scan8[i] ] || block[i*16])
  417. ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
  418. }
  419. }
  420. static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
  421. int i;
  422. for(i=16; i<16+8; i++){
  423. if(nnzc[ scan8[i] ])
  424. ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
  425. else if(block[i*16])
  426. ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
  427. }
  428. }
  429. #if CONFIG_GPL && HAVE_YASM
  430. static void ff_h264_idct_dc_add8_mmx2(uint8_t *dst, int16_t *block, int stride)
  431. {
  432. __asm__ volatile(
  433. "movd %0, %%mm0 \n\t" // 0 0 X D
  434. "punpcklwd %1, %%mm0 \n\t" // x X d D
  435. "paddsw %2, %%mm0 \n\t"
  436. "psraw $6, %%mm0 \n\t"
  437. "punpcklwd %%mm0, %%mm0 \n\t" // d d D D
  438. "pxor %%mm1, %%mm1 \n\t" // 0 0 0 0
  439. "psubw %%mm0, %%mm1 \n\t" // -d-d-D-D
  440. "packuswb %%mm1, %%mm0 \n\t" // -d-d-D-D d d D D
  441. "pshufw $0xFA, %%mm0, %%mm1 \n\t" // -d-d-d-d-D-D-D-D
  442. "punpcklwd %%mm0, %%mm0 \n\t" // d d d d D D D D
  443. ::"m"(block[ 0]),
  444. "m"(block[16]),
  445. "m"(ff_pw_32)
  446. );
  447. __asm__ volatile(
  448. "movq %0, %%mm2 \n\t"
  449. "movq %1, %%mm3 \n\t"
  450. "movq %2, %%mm4 \n\t"
  451. "movq %3, %%mm5 \n\t"
  452. "paddusb %%mm0, %%mm2 \n\t"
  453. "paddusb %%mm0, %%mm3 \n\t"
  454. "paddusb %%mm0, %%mm4 \n\t"
  455. "paddusb %%mm0, %%mm5 \n\t"
  456. "psubusb %%mm1, %%mm2 \n\t"
  457. "psubusb %%mm1, %%mm3 \n\t"
  458. "psubusb %%mm1, %%mm4 \n\t"
  459. "psubusb %%mm1, %%mm5 \n\t"
  460. "movq %%mm2, %0 \n\t"
  461. "movq %%mm3, %1 \n\t"
  462. "movq %%mm4, %2 \n\t"
  463. "movq %%mm5, %3 \n\t"
  464. :"+m"(*(uint64_t*)(dst+0*stride)),
  465. "+m"(*(uint64_t*)(dst+1*stride)),
  466. "+m"(*(uint64_t*)(dst+2*stride)),
  467. "+m"(*(uint64_t*)(dst+3*stride))
  468. );
  469. }
  470. extern void ff_x264_add8x4_idct_sse2(uint8_t *dst, int16_t *block, int stride);
  471. static void ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
  472. int i;
  473. for(i=0; i<16; i+=2)
  474. if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
  475. ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
  476. }
  477. static void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
  478. int i;
  479. for(i=0; i<16; i+=2){
  480. if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
  481. ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
  482. else if(block[i*16]|block[i*16+16])
  483. ff_h264_idct_dc_add8_mmx2(dst + block_offset[i], block + i*16, stride);
  484. }
  485. }
  486. static void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
  487. int i;
  488. for(i=16; i<16+8; i+=2){
  489. if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
  490. ff_x264_add8x4_idct_sse2 (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
  491. else if(block[i*16]|block[i*16+16])
  492. ff_h264_idct_dc_add8_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
  493. }
  494. }
  495. #endif
  496. /***********************************/
  497. /* deblocking */
  498. // out: o = |x-y|>a
  499. // clobbers: t
  500. #define DIFF_GT_MMX(x,y,a,o,t)\
  501. "movq "#y", "#t" \n\t"\
  502. "movq "#x", "#o" \n\t"\
  503. "psubusb "#x", "#t" \n\t"\
  504. "psubusb "#y", "#o" \n\t"\
  505. "por "#t", "#o" \n\t"\
  506. "psubusb "#a", "#o" \n\t"
  507. // out: o = |x-y|>a
  508. // clobbers: t
  509. #define DIFF_GT2_MMX(x,y,a,o,t)\
  510. "movq "#y", "#t" \n\t"\
  511. "movq "#x", "#o" \n\t"\
  512. "psubusb "#x", "#t" \n\t"\
  513. "psubusb "#y", "#o" \n\t"\
  514. "psubusb "#a", "#t" \n\t"\
  515. "psubusb "#a", "#o" \n\t"\
  516. "pcmpeqb "#t", "#o" \n\t"\
  517. // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1
  518. // out: mm5=beta-1, mm7=mask
  519. // clobbers: mm4,mm6
  520. #define H264_DEBLOCK_MASK(alpha1, beta1) \
  521. "pshufw $0, "#alpha1", %%mm4 \n\t"\
  522. "pshufw $0, "#beta1 ", %%mm5 \n\t"\
  523. "packuswb %%mm4, %%mm4 \n\t"\
  524. "packuswb %%mm5, %%mm5 \n\t"\
  525. DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
  526. DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
  527. "por %%mm4, %%mm7 \n\t"\
  528. DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
  529. "por %%mm4, %%mm7 \n\t"\
  530. "pxor %%mm6, %%mm6 \n\t"\
  531. "pcmpeqb %%mm6, %%mm7 \n\t"
  532. // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
  533. // out: mm1=p0' mm2=q0'
  534. // clobbers: mm0,3-6
  535. #define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
  536. "movq %%mm1 , %%mm5 \n\t"\
  537. "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\
  538. "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\
  539. "pcmpeqb %%mm4 , %%mm4 \n\t"\
  540. "pxor %%mm4 , %%mm3 \n\t"\
  541. "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\
  542. "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\
  543. "pxor %%mm1 , %%mm4 \n\t"\
  544. "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\
  545. "pavgb %%mm5 , %%mm3 \n\t"\
  546. "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\
  547. "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
  548. "psubusb %%mm3 , %%mm6 \n\t"\
  549. "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
  550. "pminub %%mm7 , %%mm6 \n\t"\
  551. "pminub %%mm7 , %%mm3 \n\t"\
  552. "psubusb %%mm6 , %%mm1 \n\t"\
  553. "psubusb %%mm3 , %%mm2 \n\t"\
  554. "paddusb %%mm3 , %%mm1 \n\t"\
  555. "paddusb %%mm6 , %%mm2 \n\t"
  556. // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone
  557. // out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
  558. // clobbers: q2, tmp, tc0
  559. #define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
  560. "movq %%mm1, "#tmp" \n\t"\
  561. "pavgb %%mm2, "#tmp" \n\t"\
  562. "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\
  563. "pxor "q2addr", "#tmp" \n\t"\
  564. "pand %8, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\
  565. "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
  566. "movq "#p1", "#tmp" \n\t"\
  567. "psubusb "#tc0", "#tmp" \n\t"\
  568. "paddusb "#p1", "#tc0" \n\t"\
  569. "pmaxub "#tmp", "#q2" \n\t"\
  570. "pminub "#tc0", "#q2" \n\t"\
  571. "movq "#q2", "q1addr" \n\t"
  572. static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
  573. {
  574. DECLARE_ALIGNED_8(uint64_t, tmp0[2]);
  575. __asm__ volatile(
  576. "movq (%1,%3), %%mm0 \n\t" //p1
  577. "movq (%1,%3,2), %%mm1 \n\t" //p0
  578. "movq (%2), %%mm2 \n\t" //q0
  579. "movq (%2,%3), %%mm3 \n\t" //q1
  580. H264_DEBLOCK_MASK(%6, %7)
  581. "movd %5, %%mm4 \n\t"
  582. "punpcklbw %%mm4, %%mm4 \n\t"
  583. "punpcklwd %%mm4, %%mm4 \n\t"
  584. "pcmpeqb %%mm3, %%mm3 \n\t"
  585. "movq %%mm4, %%mm6 \n\t"
  586. "pcmpgtb %%mm3, %%mm4 \n\t"
  587. "movq %%mm6, 8+%0 \n\t"
  588. "pand %%mm4, %%mm7 \n\t"
  589. "movq %%mm7, %0 \n\t"
  590. /* filter p1 */
  591. "movq (%1), %%mm3 \n\t" //p2
  592. DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
  593. "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta
  594. "pand 8+%0, %%mm7 \n\t" // mask & tc0
  595. "movq %%mm7, %%mm4 \n\t"
  596. "psubb %%mm6, %%mm7 \n\t"
  597. "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0
  598. H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4)
  599. /* filter q1 */
  600. "movq (%2,%3,2), %%mm4 \n\t" //q2
  601. DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
  602. "pand %0, %%mm6 \n\t"
  603. "movq 8+%0, %%mm5 \n\t" // can be merged with the and below but is slower then
  604. "pand %%mm6, %%mm5 \n\t"
  605. "psubb %%mm6, %%mm7 \n\t"
  606. "movq (%2,%3), %%mm3 \n\t"
  607. H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)
  608. /* filter p0, q0 */
  609. H264_DEBLOCK_P0_Q0(%8, unused)
  610. "movq %%mm1, (%1,%3,2) \n\t"
  611. "movq %%mm2, (%2) \n\t"
  612. : "=m"(*tmp0)
  613. : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
  614. "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
  615. "m"(ff_bone)
  616. );
  617. }
  618. static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
  619. {
  620. if((tc0[0] & tc0[1]) >= 0)
  621. h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
  622. if((tc0[2] & tc0[3]) >= 0)
  623. h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
  624. }
  625. static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
  626. {
  627. //FIXME: could cut some load/stores by merging transpose with filter
  628. // also, it only needs to transpose 6x8
  629. DECLARE_ALIGNED_8(uint8_t, trans[8*8]);
  630. int i;
  631. for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
  632. if((tc0[0] & tc0[1]) < 0)
  633. continue;
  634. transpose4x4(trans, pix-4, 8, stride);
  635. transpose4x4(trans +4*8, pix, 8, stride);
  636. transpose4x4(trans+4, pix-4+4*stride, 8, stride);
  637. transpose4x4(trans+4+4*8, pix +4*stride, 8, stride);
  638. h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
  639. transpose4x4(pix-2, trans +2*8, stride, 8);
  640. transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
  641. }
  642. }
  643. static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
  644. {
  645. __asm__ volatile(
  646. "movq (%0), %%mm0 \n\t" //p1
  647. "movq (%0,%2), %%mm1 \n\t" //p0
  648. "movq (%1), %%mm2 \n\t" //q0
  649. "movq (%1,%2), %%mm3 \n\t" //q1
  650. H264_DEBLOCK_MASK(%4, %5)
  651. "movd %3, %%mm6 \n\t"
  652. "punpcklbw %%mm6, %%mm6 \n\t"
  653. "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask
  654. H264_DEBLOCK_P0_Q0(%6, %7)
  655. "movq %%mm1, (%0,%2) \n\t"
  656. "movq %%mm2, (%1) \n\t"
  657. :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
  658. "r"(*(uint32_t*)tc0),
  659. "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
  660. );
  661. }
  662. static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
  663. {
  664. h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
  665. }
  666. static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
  667. {
  668. //FIXME: could cut some load/stores by merging transpose with filter
  669. DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
  670. transpose4x4(trans, pix-2, 8, stride);
  671. transpose4x4(trans+4, pix-2+4*stride, 8, stride);
  672. h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
  673. transpose4x4(pix-2, trans, stride, 8);
  674. transpose4x4(pix-2+4*stride, trans+4, stride, 8);
  675. }
  676. // p0 = (p0 + q1 + 2*p1 + 2) >> 2
  677. #define H264_FILTER_CHROMA4(p0, p1, q1, one) \
  678. "movq "#p0", %%mm4 \n\t"\
  679. "pxor "#q1", %%mm4 \n\t"\
  680. "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\
  681. "pavgb "#q1", "#p0" \n\t"\
  682. "psubusb %%mm4, "#p0" \n\t"\
  683. "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\
  684. static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
  685. {
  686. __asm__ volatile(
  687. "movq (%0), %%mm0 \n\t"
  688. "movq (%0,%2), %%mm1 \n\t"
  689. "movq (%1), %%mm2 \n\t"
  690. "movq (%1,%2), %%mm3 \n\t"
  691. H264_DEBLOCK_MASK(%3, %4)
  692. "movq %%mm1, %%mm5 \n\t"
  693. "movq %%mm2, %%mm6 \n\t"
  694. H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
  695. H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
  696. "psubb %%mm5, %%mm1 \n\t"
  697. "psubb %%mm6, %%mm2 \n\t"
  698. "pand %%mm7, %%mm1 \n\t"
  699. "pand %%mm7, %%mm2 \n\t"
  700. "paddb %%mm5, %%mm1 \n\t"
  701. "paddb %%mm6, %%mm2 \n\t"
  702. "movq %%mm1, (%0,%2) \n\t"
  703. "movq %%mm2, (%1) \n\t"
  704. :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
  705. "m"(alpha1), "m"(beta1), "m"(ff_bone)
  706. );
  707. }
  708. static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
  709. {
  710. h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
  711. }
  712. static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
  713. {
  714. //FIXME: could cut some load/stores by merging transpose with filter
  715. DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
  716. transpose4x4(trans, pix-2, 8, stride);
  717. transpose4x4(trans+4, pix-2+4*stride, 8, stride);
  718. h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
  719. transpose4x4(pix-2, trans, stride, 8);
  720. transpose4x4(pix-2+4*stride, trans+4, stride, 8);
  721. }
  722. static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
  723. int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
  724. int dir;
  725. __asm__ volatile(
  726. "pxor %%mm7, %%mm7 \n\t"
  727. "movq %0, %%mm6 \n\t"
  728. "movq %1, %%mm5 \n\t"
  729. "movq %2, %%mm4 \n\t"
  730. ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7)
  731. );
  732. if(field)
  733. __asm__ volatile(
  734. "movq %0, %%mm5 \n\t"
  735. "movq %1, %%mm4 \n\t"
  736. ::"m"(ff_pb_3_1), "m"(ff_pb_7_3)
  737. );
  738. // could do a special case for dir==0 && edges==1, but it only reduces the
  739. // average filter time by 1.2%
  740. for( dir=1; dir>=0; dir-- ) {
  741. const int d_idx = dir ? -8 : -1;
  742. const int mask_mv = dir ? mask_mv1 : mask_mv0;
  743. DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
  744. int b_idx, edge, l;
  745. for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
  746. __asm__ volatile(
  747. "pand %0, %%mm0 \n\t"
  748. ::"m"(mask_dir)
  749. );
  750. if(!(mask_mv & edge)) {
  751. __asm__ volatile("pxor %%mm0, %%mm0 \n\t":);
  752. for( l = bidir; l >= 0; l-- ) {
  753. __asm__ volatile(
  754. "movd %0, %%mm1 \n\t"
  755. "punpckldq %1, %%mm1 \n\t"
  756. "movq %%mm1, %%mm2 \n\t"
  757. "psrlw $7, %%mm2 \n\t"
  758. "pand %%mm6, %%mm2 \n\t"
  759. "por %%mm2, %%mm1 \n\t" // ref_cache with -2 mapped to -1
  760. "punpckldq %%mm1, %%mm2 \n\t"
  761. "pcmpeqb %%mm2, %%mm1 \n\t"
  762. "paddb %%mm6, %%mm1 \n\t"
  763. "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn]
  764. "por %%mm1, %%mm0 \n\t"
  765. "movq %2, %%mm1 \n\t"
  766. "movq %3, %%mm2 \n\t"
  767. "psubw %4, %%mm1 \n\t"
  768. "psubw %5, %%mm2 \n\t"
  769. "packsswb %%mm2, %%mm1 \n\t"
  770. "paddb %%mm5, %%mm1 \n\t"
  771. "pminub %%mm4, %%mm1 \n\t"
  772. "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit
  773. "por %%mm1, %%mm0 \n\t"
  774. ::"m"(ref[l][b_idx]),
  775. "m"(ref[l][b_idx+d_idx]),
  776. "m"(mv[l][b_idx][0]),
  777. "m"(mv[l][b_idx+2][0]),
  778. "m"(mv[l][b_idx+d_idx][0]),
  779. "m"(mv[l][b_idx+d_idx+2][0])
  780. );
  781. }
  782. }
  783. __asm__ volatile(
  784. "movd %0, %%mm1 \n\t"
  785. "por %1, %%mm1 \n\t"
  786. "punpcklbw %%mm7, %%mm1 \n\t"
  787. "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn]
  788. ::"m"(nnz[b_idx]),
  789. "m"(nnz[b_idx+d_idx])
  790. );
  791. __asm__ volatile(
  792. "pcmpeqw %%mm7, %%mm0 \n\t"
  793. "pcmpeqw %%mm7, %%mm0 \n\t"
  794. "psrlw $15, %%mm0 \n\t" // nonzero -> 1
  795. "psrlw $14, %%mm1 \n\t"
  796. "movq %%mm0, %%mm2 \n\t"
  797. "por %%mm1, %%mm2 \n\t"
  798. "psrlw $1, %%mm1 \n\t"
  799. "pandn %%mm2, %%mm1 \n\t"
  800. "movq %%mm1, %0 \n\t"
  801. :"=m"(*bS[dir][edge])
  802. ::"memory"
  803. );
  804. }
  805. edges = 4;
  806. step = 1;
  807. }
  808. __asm__ volatile(
  809. "movq (%0), %%mm0 \n\t"
  810. "movq 8(%0), %%mm1 \n\t"
  811. "movq 16(%0), %%mm2 \n\t"
  812. "movq 24(%0), %%mm3 \n\t"
  813. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
  814. "movq %%mm0, (%0) \n\t"
  815. "movq %%mm3, 8(%0) \n\t"
  816. "movq %%mm4, 16(%0) \n\t"
  817. "movq %%mm2, 24(%0) \n\t"
  818. ::"r"(bS[0])
  819. :"memory"
  820. );
  821. }
  822. /***********************************/
  823. /* motion compensation */
  824. #define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
  825. "mov"#q" "#C", "#T" \n\t"\
  826. "mov"#d" (%0), "#F" \n\t"\
  827. "paddw "#D", "#T" \n\t"\
  828. "psllw $2, "#T" \n\t"\
  829. "psubw "#B", "#T" \n\t"\
  830. "psubw "#E", "#T" \n\t"\
  831. "punpcklbw "#Z", "#F" \n\t"\
  832. "pmullw %4, "#T" \n\t"\
  833. "paddw %5, "#A" \n\t"\
  834. "add %2, %0 \n\t"\
  835. "paddw "#F", "#A" \n\t"\
  836. "paddw "#A", "#T" \n\t"\
  837. "psraw $5, "#T" \n\t"\
  838. "packuswb "#T", "#T" \n\t"\
  839. OP(T, (%1), A, d)\
  840. "add %3, %1 \n\t"
  841. #define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
  842. "mov"#q" "#C", "#T" \n\t"\
  843. "mov"#d" (%0), "#F" \n\t"\
  844. "paddw "#D", "#T" \n\t"\
  845. "psllw $2, "#T" \n\t"\
  846. "paddw %4, "#A" \n\t"\
  847. "psubw "#B", "#T" \n\t"\
  848. "psubw "#E", "#T" \n\t"\
  849. "punpcklbw "#Z", "#F" \n\t"\
  850. "pmullw %3, "#T" \n\t"\
  851. "paddw "#F", "#A" \n\t"\
  852. "add %2, %0 \n\t"\
  853. "paddw "#A", "#T" \n\t"\
  854. "mov"#q" "#T", "#OF"(%1) \n\t"
  855. #define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
  856. #define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
  857. #define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
  858. #define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
  859. #define QPEL_H264(OPNAME, OP, MMX)\
  860. static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  861. int h=4;\
  862. \
  863. __asm__ volatile(\
  864. "pxor %%mm7, %%mm7 \n\t"\
  865. "movq %5, %%mm4 \n\t"\
  866. "movq %6, %%mm5 \n\t"\
  867. "1: \n\t"\
  868. "movd -1(%0), %%mm1 \n\t"\
  869. "movd (%0), %%mm2 \n\t"\
  870. "movd 1(%0), %%mm3 \n\t"\
  871. "movd 2(%0), %%mm0 \n\t"\
  872. "punpcklbw %%mm7, %%mm1 \n\t"\
  873. "punpcklbw %%mm7, %%mm2 \n\t"\
  874. "punpcklbw %%mm7, %%mm3 \n\t"\
  875. "punpcklbw %%mm7, %%mm0 \n\t"\
  876. "paddw %%mm0, %%mm1 \n\t"\
  877. "paddw %%mm3, %%mm2 \n\t"\
  878. "movd -2(%0), %%mm0 \n\t"\
  879. "movd 3(%0), %%mm3 \n\t"\
  880. "punpcklbw %%mm7, %%mm0 \n\t"\
  881. "punpcklbw %%mm7, %%mm3 \n\t"\
  882. "paddw %%mm3, %%mm0 \n\t"\
  883. "psllw $2, %%mm2 \n\t"\
  884. "psubw %%mm1, %%mm2 \n\t"\
  885. "pmullw %%mm4, %%mm2 \n\t"\
  886. "paddw %%mm5, %%mm0 \n\t"\
  887. "paddw %%mm2, %%mm0 \n\t"\
  888. "psraw $5, %%mm0 \n\t"\
  889. "packuswb %%mm0, %%mm0 \n\t"\
  890. OP(%%mm0, (%1),%%mm6, d)\
  891. "add %3, %0 \n\t"\
  892. "add %4, %1 \n\t"\
  893. "decl %2 \n\t"\
  894. " jnz 1b \n\t"\
  895. : "+a"(src), "+c"(dst), "+g"(h)\
  896. : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
  897. : "memory"\
  898. );\
  899. }\
  900. static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
  901. int h=4;\
  902. __asm__ volatile(\
  903. "pxor %%mm7, %%mm7 \n\t"\
  904. "movq %0, %%mm4 \n\t"\
  905. "movq %1, %%mm5 \n\t"\
  906. :: "m"(ff_pw_5), "m"(ff_pw_16)\
  907. );\
  908. do{\
  909. __asm__ volatile(\
  910. "movd -1(%0), %%mm1 \n\t"\
  911. "movd (%0), %%mm2 \n\t"\
  912. "movd 1(%0), %%mm3 \n\t"\
  913. "movd 2(%0), %%mm0 \n\t"\
  914. "punpcklbw %%mm7, %%mm1 \n\t"\
  915. "punpcklbw %%mm7, %%mm2 \n\t"\
  916. "punpcklbw %%mm7, %%mm3 \n\t"\
  917. "punpcklbw %%mm7, %%mm0 \n\t"\
  918. "paddw %%mm0, %%mm1 \n\t"\
  919. "paddw %%mm3, %%mm2 \n\t"\
  920. "movd -2(%0), %%mm0 \n\t"\
  921. "movd 3(%0), %%mm3 \n\t"\
  922. "punpcklbw %%mm7, %%mm0 \n\t"\
  923. "punpcklbw %%mm7, %%mm3 \n\t"\
  924. "paddw %%mm3, %%mm0 \n\t"\
  925. "psllw $2, %%mm2 \n\t"\
  926. "psubw %%mm1, %%mm2 \n\t"\
  927. "pmullw %%mm4, %%mm2 \n\t"\
  928. "paddw %%mm5, %%mm0 \n\t"\
  929. "paddw %%mm2, %%mm0 \n\t"\
  930. "movd (%2), %%mm3 \n\t"\
  931. "psraw $5, %%mm0 \n\t"\
  932. "packuswb %%mm0, %%mm0 \n\t"\
  933. PAVGB" %%mm3, %%mm0 \n\t"\
  934. OP(%%mm0, (%1),%%mm6, d)\
  935. "add %4, %0 \n\t"\
  936. "add %4, %1 \n\t"\
  937. "add %3, %2 \n\t"\
  938. : "+a"(src), "+c"(dst), "+d"(src2)\
  939. : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
  940. : "memory"\
  941. );\
  942. }while(--h);\
  943. }\
  944. static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  945. src -= 2*srcStride;\
  946. __asm__ volatile(\
  947. "pxor %%mm7, %%mm7 \n\t"\
  948. "movd (%0), %%mm0 \n\t"\
  949. "add %2, %0 \n\t"\
  950. "movd (%0), %%mm1 \n\t"\
  951. "add %2, %0 \n\t"\
  952. "movd (%0), %%mm2 \n\t"\
  953. "add %2, %0 \n\t"\
  954. "movd (%0), %%mm3 \n\t"\
  955. "add %2, %0 \n\t"\
  956. "movd (%0), %%mm4 \n\t"\
  957. "add %2, %0 \n\t"\
  958. "punpcklbw %%mm7, %%mm0 \n\t"\
  959. "punpcklbw %%mm7, %%mm1 \n\t"\
  960. "punpcklbw %%mm7, %%mm2 \n\t"\
  961. "punpcklbw %%mm7, %%mm3 \n\t"\
  962. "punpcklbw %%mm7, %%mm4 \n\t"\
  963. QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
  964. QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
  965. QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
  966. QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
  967. \
  968. : "+a"(src), "+c"(dst)\
  969. : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
  970. : "memory"\
  971. );\
  972. }\
  973. static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
  974. int h=4;\
  975. int w=3;\
  976. src -= 2*srcStride+2;\
  977. while(w--){\
  978. __asm__ volatile(\
  979. "pxor %%mm7, %%mm7 \n\t"\
  980. "movd (%0), %%mm0 \n\t"\
  981. "add %2, %0 \n\t"\
  982. "movd (%0), %%mm1 \n\t"\
  983. "add %2, %0 \n\t"\
  984. "movd (%0), %%mm2 \n\t"\
  985. "add %2, %0 \n\t"\
  986. "movd (%0), %%mm3 \n\t"\
  987. "add %2, %0 \n\t"\
  988. "movd (%0), %%mm4 \n\t"\
  989. "add %2, %0 \n\t"\
  990. "punpcklbw %%mm7, %%mm0 \n\t"\
  991. "punpcklbw %%mm7, %%mm1 \n\t"\
  992. "punpcklbw %%mm7, %%mm2 \n\t"\
  993. "punpcklbw %%mm7, %%mm3 \n\t"\
  994. "punpcklbw %%mm7, %%mm4 \n\t"\
  995. QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
  996. QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
  997. QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
  998. QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
  999. \
  1000. : "+a"(src)\
  1001. : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
  1002. : "memory"\
  1003. );\
  1004. tmp += 4;\
  1005. src += 4 - 9*srcStride;\
  1006. }\
  1007. tmp -= 3*4;\
  1008. __asm__ volatile(\
  1009. "1: \n\t"\
  1010. "movq (%0), %%mm0 \n\t"\
  1011. "paddw 10(%0), %%mm0 \n\t"\
  1012. "movq 2(%0), %%mm1 \n\t"\
  1013. "paddw 8(%0), %%mm1 \n\t"\
  1014. "movq 4(%0), %%mm2 \n\t"\
  1015. "paddw 6(%0), %%mm2 \n\t"\
  1016. "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\
  1017. "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\
  1018. "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\
  1019. "paddsw %%mm2, %%mm0 \n\t"\
  1020. "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\
  1021. "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\
  1022. "psraw $6, %%mm0 \n\t"\
  1023. "packuswb %%mm0, %%mm0 \n\t"\
  1024. OP(%%mm0, (%1),%%mm7, d)\
  1025. "add $24, %0 \n\t"\
  1026. "add %3, %1 \n\t"\
  1027. "decl %2 \n\t"\
  1028. " jnz 1b \n\t"\
  1029. : "+a"(tmp), "+c"(dst), "+g"(h)\
  1030. : "S"((x86_reg)dstStride)\
  1031. : "memory"\
  1032. );\
  1033. }\
  1034. \
  1035. static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  1036. int h=8;\
  1037. __asm__ volatile(\
  1038. "pxor %%mm7, %%mm7 \n\t"\
  1039. "movq %5, %%mm6 \n\t"\
  1040. "1: \n\t"\
  1041. "movq (%0), %%mm0 \n\t"\
  1042. "movq 1(%0), %%mm2 \n\t"\
  1043. "movq %%mm0, %%mm1 \n\t"\
  1044. "movq %%mm2, %%mm3 \n\t"\
  1045. "punpcklbw %%mm7, %%mm0 \n\t"\
  1046. "punpckhbw %%mm7, %%mm1 \n\t"\
  1047. "punpcklbw %%mm7, %%mm2 \n\t"\
  1048. "punpckhbw %%mm7, %%mm3 \n\t"\
  1049. "paddw %%mm2, %%mm0 \n\t"\
  1050. "paddw %%mm3, %%mm1 \n\t"\
  1051. "psllw $2, %%mm0 \n\t"\
  1052. "psllw $2, %%mm1 \n\t"\
  1053. "movq -1(%0), %%mm2 \n\t"\
  1054. "movq 2(%0), %%mm4 \n\t"\
  1055. "movq %%mm2, %%mm3 \n\t"\
  1056. "movq %%mm4, %%mm5 \n\t"\
  1057. "punpcklbw %%mm7, %%mm2 \n\t"\
  1058. "punpckhbw %%mm7, %%mm3 \n\t"\
  1059. "punpcklbw %%mm7, %%mm4 \n\t"\
  1060. "punpckhbw %%mm7, %%mm5 \n\t"\
  1061. "paddw %%mm4, %%mm2 \n\t"\
  1062. "paddw %%mm3, %%mm5 \n\t"\
  1063. "psubw %%mm2, %%mm0 \n\t"\
  1064. "psubw %%mm5, %%mm1 \n\t"\
  1065. "pmullw %%mm6, %%mm0 \n\t"\
  1066. "pmullw %%mm6, %%mm1 \n\t"\
  1067. "movd -2(%0), %%mm2 \n\t"\
  1068. "movd 7(%0), %%mm5 \n\t"\
  1069. "punpcklbw %%mm7, %%mm2 \n\t"\
  1070. "punpcklbw %%mm7, %%mm5 \n\t"\
  1071. "paddw %%mm3, %%mm2 \n\t"\
  1072. "paddw %%mm5, %%mm4 \n\t"\
  1073. "movq %6, %%mm5 \n\t"\
  1074. "paddw %%mm5, %%mm2 \n\t"\
  1075. "paddw %%mm5, %%mm4 \n\t"\
  1076. "paddw %%mm2, %%mm0 \n\t"\
  1077. "paddw %%mm4, %%mm1 \n\t"\
  1078. "psraw $5, %%mm0 \n\t"\
  1079. "psraw $5, %%mm1 \n\t"\
  1080. "packuswb %%mm1, %%mm0 \n\t"\
  1081. OP(%%mm0, (%1),%%mm5, q)\
  1082. "add %3, %0 \n\t"\
  1083. "add %4, %1 \n\t"\
  1084. "decl %2 \n\t"\
  1085. " jnz 1b \n\t"\
  1086. : "+a"(src), "+c"(dst), "+g"(h)\
  1087. : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
  1088. : "memory"\
  1089. );\
  1090. }\
  1091. \
  1092. static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
  1093. int h=8;\
  1094. __asm__ volatile(\
  1095. "pxor %%mm7, %%mm7 \n\t"\
  1096. "movq %0, %%mm6 \n\t"\
  1097. :: "m"(ff_pw_5)\
  1098. );\
  1099. do{\
  1100. __asm__ volatile(\
  1101. "movq (%0), %%mm0 \n\t"\
  1102. "movq 1(%0), %%mm2 \n\t"\
  1103. "movq %%mm0, %%mm1 \n\t"\
  1104. "movq %%mm2, %%mm3 \n\t"\
  1105. "punpcklbw %%mm7, %%mm0 \n\t"\
  1106. "punpckhbw %%mm7, %%mm1 \n\t"\
  1107. "punpcklbw %%mm7, %%mm2 \n\t"\
  1108. "punpckhbw %%mm7, %%mm3 \n\t"\
  1109. "paddw %%mm2, %%mm0 \n\t"\
  1110. "paddw %%mm3, %%mm1 \n\t"\
  1111. "psllw $2, %%mm0 \n\t"\
  1112. "psllw $2, %%mm1 \n\t"\
  1113. "movq -1(%0), %%mm2 \n\t"\
  1114. "movq 2(%0), %%mm4 \n\t"\
  1115. "movq %%mm2, %%mm3 \n\t"\
  1116. "movq %%mm4, %%mm5 \n\t"\
  1117. "punpcklbw %%mm7, %%mm2 \n\t"\
  1118. "punpckhbw %%mm7, %%mm3 \n\t"\
  1119. "punpcklbw %%mm7, %%mm4 \n\t"\
  1120. "punpckhbw %%mm7, %%mm5 \n\t"\
  1121. "paddw %%mm4, %%mm2 \n\t"\
  1122. "paddw %%mm3, %%mm5 \n\t"\
  1123. "psubw %%mm2, %%mm0 \n\t"\
  1124. "psubw %%mm5, %%mm1 \n\t"\
  1125. "pmullw %%mm6, %%mm0 \n\t"\
  1126. "pmullw %%mm6, %%mm1 \n\t"\
  1127. "movd -2(%0), %%mm2 \n\t"\
  1128. "movd 7(%0), %%mm5 \n\t"\
  1129. "punpcklbw %%mm7, %%mm2 \n\t"\
  1130. "punpcklbw %%mm7, %%mm5 \n\t"\
  1131. "paddw %%mm3, %%mm2 \n\t"\
  1132. "paddw %%mm5, %%mm4 \n\t"\
  1133. "movq %5, %%mm5 \n\t"\
  1134. "paddw %%mm5, %%mm2 \n\t"\
  1135. "paddw %%mm5, %%mm4 \n\t"\
  1136. "paddw %%mm2, %%mm0 \n\t"\
  1137. "paddw %%mm4, %%mm1 \n\t"\
  1138. "psraw $5, %%mm0 \n\t"\
  1139. "psraw $5, %%mm1 \n\t"\
  1140. "movq (%2), %%mm4 \n\t"\
  1141. "packuswb %%mm1, %%mm0 \n\t"\
  1142. PAVGB" %%mm4, %%mm0 \n\t"\
  1143. OP(%%mm0, (%1),%%mm5, q)\
  1144. "add %4, %0 \n\t"\
  1145. "add %4, %1 \n\t"\
  1146. "add %3, %2 \n\t"\
  1147. : "+a"(src), "+c"(dst), "+d"(src2)\
  1148. : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
  1149. "m"(ff_pw_16)\
  1150. : "memory"\
  1151. );\
  1152. }while(--h);\
  1153. }\
  1154. \
  1155. static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  1156. int w= 2;\
  1157. src -= 2*srcStride;\
  1158. \
  1159. while(w--){\
  1160. __asm__ volatile(\
  1161. "pxor %%mm7, %%mm7 \n\t"\
  1162. "movd (%0), %%mm0 \n\t"\
  1163. "add %2, %0 \n\t"\
  1164. "movd (%0), %%mm1 \n\t"\
  1165. "add %2, %0 \n\t"\
  1166. "movd (%0), %%mm2 \n\t"\
  1167. "add %2, %0 \n\t"\
  1168. "movd (%0), %%mm3 \n\t"\
  1169. "add %2, %0 \n\t"\
  1170. "movd (%0), %%mm4 \n\t"\
  1171. "add %2, %0 \n\t"\
  1172. "punpcklbw %%mm7, %%mm0 \n\t"\
  1173. "punpcklbw %%mm7, %%mm1 \n\t"\
  1174. "punpcklbw %%mm7, %%mm2 \n\t"\
  1175. "punpcklbw %%mm7, %%mm3 \n\t"\
  1176. "punpcklbw %%mm7, %%mm4 \n\t"\
  1177. QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
  1178. QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
  1179. QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
  1180. QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
  1181. QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
  1182. QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
  1183. QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
  1184. QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
  1185. \
  1186. : "+a"(src), "+c"(dst)\
  1187. : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
  1188. : "memory"\
  1189. );\
  1190. if(h==16){\
  1191. __asm__ volatile(\
  1192. QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
  1193. QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
  1194. QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
  1195. QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
  1196. QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
  1197. QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
  1198. QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
  1199. QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
  1200. \
  1201. : "+a"(src), "+c"(dst)\
  1202. : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
  1203. : "memory"\
  1204. );\
  1205. }\
  1206. src += 4-(h+5)*srcStride;\
  1207. dst += 4-h*dstStride;\
  1208. }\
  1209. }\
  1210. static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
  1211. int w = (size+8)>>2;\
  1212. src -= 2*srcStride+2;\
  1213. while(w--){\
  1214. __asm__ volatile(\
  1215. "pxor %%mm7, %%mm7 \n\t"\
  1216. "movd (%0), %%mm0 \n\t"\
  1217. "add %2, %0 \n\t"\
  1218. "movd (%0), %%mm1 \n\t"\
  1219. "add %2, %0 \n\t"\
  1220. "movd (%0), %%mm2 \n\t"\
  1221. "add %2, %0 \n\t"\
  1222. "movd (%0), %%mm3 \n\t"\
  1223. "add %2, %0 \n\t"\
  1224. "movd (%0), %%mm4 \n\t"\
  1225. "add %2, %0 \n\t"\
  1226. "punpcklbw %%mm7, %%mm0 \n\t"\
  1227. "punpcklbw %%mm7, %%mm1 \n\t"\
  1228. "punpcklbw %%mm7, %%mm2 \n\t"\
  1229. "punpcklbw %%mm7, %%mm3 \n\t"\
  1230. "punpcklbw %%mm7, %%mm4 \n\t"\
  1231. QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
  1232. QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
  1233. QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
  1234. QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
  1235. QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
  1236. QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
  1237. QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
  1238. QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
  1239. : "+a"(src)\
  1240. : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
  1241. : "memory"\
  1242. );\
  1243. if(size==16){\
  1244. __asm__ volatile(\
  1245. QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
  1246. QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
  1247. QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
  1248. QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
  1249. QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
  1250. QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
  1251. QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
  1252. QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
  1253. : "+a"(src)\
  1254. : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
  1255. : "memory"\
  1256. );\
  1257. }\
  1258. tmp += 4;\
  1259. src += 4 - (size+5)*srcStride;\
  1260. }\
  1261. }\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int w = size>>4;\
    do{\
        int h = size;\
        __asm__ volatile(\
            "1: \n\t"\
            "movq (%0), %%mm0 \n\t"\
            "movq 8(%0), %%mm3 \n\t"\
            "movq 2(%0), %%mm1 \n\t"\
            "movq 10(%0), %%mm4 \n\t"\
            "paddw %%mm4, %%mm0 \n\t"\
            "paddw %%mm3, %%mm1 \n\t"\
            "paddw 18(%0), %%mm3 \n\t"\
            "paddw 16(%0), %%mm4 \n\t"\
            "movq 4(%0), %%mm2 \n\t"\
            "movq 12(%0), %%mm5 \n\t"\
            "paddw 6(%0), %%mm2 \n\t"\
            "paddw 14(%0), %%mm5 \n\t"\
            "psubw %%mm1, %%mm0 \n\t"\
            "psubw %%mm4, %%mm3 \n\t"\
            "psraw $2, %%mm0 \n\t"\
            "psraw $2, %%mm3 \n\t"\
            "psubw %%mm1, %%mm0 \n\t"\
            "psubw %%mm4, %%mm3 \n\t"\
            "paddsw %%mm2, %%mm0 \n\t"\
            "paddsw %%mm5, %%mm3 \n\t"\
            "psraw $2, %%mm0 \n\t"\
            "psraw $2, %%mm3 \n\t"\
            "paddw %%mm2, %%mm0 \n\t"\
            "paddw %%mm5, %%mm3 \n\t"\
            "psraw $6, %%mm0 \n\t"\
            "psraw $6, %%mm3 \n\t"\
            "packuswb %%mm3, %%mm0 \n\t"\
            OP(%%mm0, (%1),%%mm7, q)\
            "add $48, %0 \n\t"\
            "add %3, %1 \n\t"\
            "decl %2 \n\t"\
            " jnz 1b \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : "memory"\
        );\
        tmp += 8 - size*24;\
        dst += 8 - size*dstStride;\
    }while(w--);\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
}\
\
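/* pixels{4,8,16}_l2_shift5: scale the 16-bit hv first-pass intermediates  */\
/* down with >>5, clip them to 8 bits and average with an 8-bit plane;     */\
/* used by the mc12/mc32 cases further below.                              */\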
static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    __asm__ volatile(\
        "movq (%1), %%mm0 \n\t"\
        "movq 24(%1), %%mm1 \n\t"\
        "psraw $5, %%mm0 \n\t"\
        "psraw $5, %%mm1 \n\t"\
        "packuswb %%mm0, %%mm0 \n\t"\
        "packuswb %%mm1, %%mm1 \n\t"\
        PAVGB" (%0), %%mm0 \n\t"\
        PAVGB" (%0,%3), %%mm1 \n\t"\
        OP(%%mm0, (%2), %%mm4, d)\
        OP(%%mm1, (%2,%4), %%mm5, d)\
        "lea (%0,%3,2), %0 \n\t"\
        "lea (%2,%4,2), %2 \n\t"\
        "movq 48(%1), %%mm0 \n\t"\
        "movq 72(%1), %%mm1 \n\t"\
        "psraw $5, %%mm0 \n\t"\
        "psraw $5, %%mm1 \n\t"\
        "packuswb %%mm0, %%mm0 \n\t"\
        "packuswb %%mm1, %%mm1 \n\t"\
        PAVGB" (%0), %%mm0 \n\t"\
        PAVGB" (%0,%3), %%mm1 \n\t"\
        OP(%%mm0, (%2), %%mm4, d)\
        OP(%%mm1, (%2,%4), %%mm5, d)\
        :"+a"(src8), "+c"(src16), "+d"(dst)\
        :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
        :"memory");\
}\
static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    do{\
        __asm__ volatile(\
            "movq (%1), %%mm0 \n\t"\
            "movq 8(%1), %%mm1 \n\t"\
            "movq 48(%1), %%mm2 \n\t"\
            "movq 8+48(%1), %%mm3 \n\t"\
            "psraw $5, %%mm0 \n\t"\
            "psraw $5, %%mm1 \n\t"\
            "psraw $5, %%mm2 \n\t"\
            "psraw $5, %%mm3 \n\t"\
            "packuswb %%mm1, %%mm0 \n\t"\
            "packuswb %%mm3, %%mm2 \n\t"\
            PAVGB" (%0), %%mm0 \n\t"\
            PAVGB" (%0,%3), %%mm2 \n\t"\
            OP(%%mm0, (%2), %%mm5, q)\
            OP(%%mm2, (%2,%4), %%mm5, q)\
            ::"a"(src8), "c"(src16), "d"(dst),\
              "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
            :"memory");\
        src8 += 2L*src8Stride;\
        src16 += 48;\
        dst += 2L*dstStride;\
    }while(h-=2);\
}\
static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
    OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\

#if ARCH_X86_64
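/* With the extra xmm8-xmm15 registers of x86-64, the 16-wide horizontal+l2
 * filter below handles a whole 16-pixel row per loop iteration; 32-bit
 * builds fall back to two 8-wide calls instead (see the #else branch). */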
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=16;\
    __asm__ volatile(\
        "pxor %%xmm15, %%xmm15 \n\t"\
        "movdqa %6, %%xmm14 \n\t"\
        "movdqa %7, %%xmm13 \n\t"\
        "1: \n\t"\
        "lddqu 3(%0), %%xmm1 \n\t"\
        "lddqu -5(%0), %%xmm7 \n\t"\
        "movdqa %%xmm1, %%xmm0 \n\t"\
        "punpckhbw %%xmm15, %%xmm1 \n\t"\
        "punpcklbw %%xmm15, %%xmm0 \n\t"\
        "punpcklbw %%xmm15, %%xmm7 \n\t"\
        "movdqa %%xmm1, %%xmm2 \n\t"\
        "movdqa %%xmm0, %%xmm6 \n\t"\
        "movdqa %%xmm1, %%xmm3 \n\t"\
        "movdqa %%xmm0, %%xmm8 \n\t"\
        "movdqa %%xmm1, %%xmm4 \n\t"\
        "movdqa %%xmm0, %%xmm9 \n\t"\
        "movdqa %%xmm1, %%xmm5 \n\t"\
        "movdqa %%xmm0, %%xmm10 \n\t"\
        "palignr $6, %%xmm0, %%xmm5 \n\t"\
        "palignr $6, %%xmm7, %%xmm10\n\t"\
        "palignr $8, %%xmm0, %%xmm4 \n\t"\
        "palignr $8, %%xmm7, %%xmm9 \n\t"\
        "palignr $10,%%xmm0, %%xmm3 \n\t"\
        "palignr $10,%%xmm7, %%xmm8 \n\t"\
        "paddw %%xmm1, %%xmm5 \n\t"\
        "paddw %%xmm0, %%xmm10 \n\t"\
        "palignr $12,%%xmm0, %%xmm2 \n\t"\
        "palignr $12,%%xmm7, %%xmm6 \n\t"\
        "palignr $14,%%xmm0, %%xmm1 \n\t"\
        "palignr $14,%%xmm7, %%xmm0 \n\t"\
        "paddw %%xmm3, %%xmm2 \n\t"\
        "paddw %%xmm8, %%xmm6 \n\t"\
        "paddw %%xmm4, %%xmm1 \n\t"\
        "paddw %%xmm9, %%xmm0 \n\t"\
        "psllw $2, %%xmm2 \n\t"\
        "psllw $2, %%xmm6 \n\t"\
        "psubw %%xmm1, %%xmm2 \n\t"\
        "psubw %%xmm0, %%xmm6 \n\t"\
        "paddw %%xmm13,%%xmm5 \n\t"\
        "paddw %%xmm13,%%xmm10 \n\t"\
        "pmullw %%xmm14,%%xmm2 \n\t"\
        "pmullw %%xmm14,%%xmm6 \n\t"\
        "lddqu (%2), %%xmm3 \n\t"\
        "paddw %%xmm5, %%xmm2 \n\t"\
        "paddw %%xmm10,%%xmm6 \n\t"\
        "psraw $5, %%xmm2 \n\t"\
        "psraw $5, %%xmm6 \n\t"\
        "packuswb %%xmm2,%%xmm6 \n\t"\
        "pavgb %%xmm3, %%xmm6 \n\t"\
        OP(%%xmm6, (%1), %%xmm4, dqa)\
        "add %5, %0 \n\t"\
        "add %5, %1 \n\t"\
        "add %4, %2 \n\t"\
        "decl %3 \n\t"\
        "jg 1b \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
          "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
}
#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7 \n\t"\
        "movdqa %0, %%xmm6 \n\t"\
        :: "m"(ff_pw_5)\
    );\
    do{\
        __asm__ volatile(\
            "lddqu -5(%0), %%xmm1 \n\t"\
            "movdqa %%xmm1, %%xmm0 \n\t"\
            "punpckhbw %%xmm7, %%xmm1 \n\t"\
            "punpcklbw %%xmm7, %%xmm0 \n\t"\
            "movdqa %%xmm1, %%xmm2 \n\t"\
            "movdqa %%xmm1, %%xmm3 \n\t"\
            "movdqa %%xmm1, %%xmm4 \n\t"\
            "movdqa %%xmm1, %%xmm5 \n\t"\
            "palignr $6, %%xmm0, %%xmm5 \n\t"\
            "palignr $8, %%xmm0, %%xmm4 \n\t"\
            "palignr $10,%%xmm0, %%xmm3 \n\t"\
            "paddw %%xmm1, %%xmm5 \n\t"\
            "palignr $12,%%xmm0, %%xmm2 \n\t"\
            "palignr $14,%%xmm0, %%xmm1 \n\t"\
            "paddw %%xmm3, %%xmm2 \n\t"\
            "paddw %%xmm4, %%xmm1 \n\t"\
            "psllw $2, %%xmm2 \n\t"\
            "movq (%2), %%xmm3 \n\t"\
            "psubw %%xmm1, %%xmm2 \n\t"\
            "paddw %5, %%xmm5 \n\t"\
            "pmullw %%xmm6, %%xmm2 \n\t"\
            "paddw %%xmm5, %%xmm2 \n\t"\
            "psraw $5, %%xmm2 \n\t"\
            "packuswb %%xmm2, %%xmm2 \n\t"\
            "pavgb %%xmm3, %%xmm2 \n\t"\
            OP(%%xmm2, (%1), %%xmm4, q)\
            "add %4, %0 \n\t"\
            "add %4, %1 \n\t"\
            "add %3, %2 \n\t"\
            : "+a"(src), "+c"(dst), "+d"(src2)\
            : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
              "m"(ff_pw_16)\
            : "memory"\
        );\
    }while(--h);\
}\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7 \n\t"\
        "movdqa %5, %%xmm6 \n\t"\
        "1: \n\t"\
        "lddqu -5(%0), %%xmm1 \n\t"\
        "movdqa %%xmm1, %%xmm0 \n\t"\
        "punpckhbw %%xmm7, %%xmm1 \n\t"\
        "punpcklbw %%xmm7, %%xmm0 \n\t"\
        "movdqa %%xmm1, %%xmm2 \n\t"\
        "movdqa %%xmm1, %%xmm3 \n\t"\
        "movdqa %%xmm1, %%xmm4 \n\t"\
        "movdqa %%xmm1, %%xmm5 \n\t"\
        "palignr $6, %%xmm0, %%xmm5 \n\t"\
        "palignr $8, %%xmm0, %%xmm4 \n\t"\
        "palignr $10,%%xmm0, %%xmm3 \n\t"\
        "paddw %%xmm1, %%xmm5 \n\t"\
        "palignr $12,%%xmm0, %%xmm2 \n\t"\
        "palignr $14,%%xmm0, %%xmm1 \n\t"\
        "paddw %%xmm3, %%xmm2 \n\t"\
        "paddw %%xmm4, %%xmm1 \n\t"\
        "psllw $2, %%xmm2 \n\t"\
        "psubw %%xmm1, %%xmm2 \n\t"\
        "paddw %6, %%xmm5 \n\t"\
        "pmullw %%xmm6, %%xmm2 \n\t"\
        "paddw %%xmm5, %%xmm2 \n\t"\
        "psraw $5, %%xmm2 \n\t"\
        "packuswb %%xmm2, %%xmm2 \n\t"\
        OP(%%xmm2, (%1), %%xmm4, q)\
        "add %3, %0 \n\t"\
        "add %4, %1 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\
          "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
}\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    src -= 2*srcStride;\
    \
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7 \n\t"\
        "movq (%0), %%xmm0 \n\t"\
        "add %2, %0 \n\t"\
        "movq (%0), %%xmm1 \n\t"\
        "add %2, %0 \n\t"\
        "movq (%0), %%xmm2 \n\t"\
        "add %2, %0 \n\t"\
        "movq (%0), %%xmm3 \n\t"\
        "add %2, %0 \n\t"\
        "movq (%0), %%xmm4 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%xmm7, %%xmm0 \n\t"\
        "punpcklbw %%xmm7, %%xmm1 \n\t"\
        "punpcklbw %%xmm7, %%xmm2 \n\t"\
        "punpcklbw %%xmm7, %%xmm3 \n\t"\
        "punpcklbw %%xmm7, %%xmm4 \n\t"\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        \
        : "+a"(src), "+c"(dst)\
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
    if(h==16){\
        __asm__ volatile(\
            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
            QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
            QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
            QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
            QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
            \
            : "+a"(src), "+c"(dst)\
            : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
            : "memory"\
        );\
    }\
}\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}
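/* First pass of the SSE2 (half-pel, half-pel) case: run the vertical 6-tap
 * filter over 8 input columns at a time and store the unclamped 16-bit
 * results into tmp (one output row every 48 bytes); the matching
 * hv2_lowpass pass then filters those rows horizontally. */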
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
    int w = (size+8)>>3;
    src -= 2*srcStride+2;
    while(w--){
        __asm__ volatile(
            "pxor %%xmm7, %%xmm7 \n\t"
            "movq (%0), %%xmm0 \n\t"
            "add %2, %0 \n\t"
            "movq (%0), %%xmm1 \n\t"
            "add %2, %0 \n\t"
            "movq (%0), %%xmm2 \n\t"
            "add %2, %0 \n\t"
            "movq (%0), %%xmm3 \n\t"
            "add %2, %0 \n\t"
            "movq (%0), %%xmm4 \n\t"
            "add %2, %0 \n\t"
            "punpcklbw %%xmm7, %%xmm0 \n\t"
            "punpcklbw %%xmm7, %%xmm1 \n\t"
            "punpcklbw %%xmm7, %%xmm2 \n\t"
            "punpcklbw %%xmm7, %%xmm3 \n\t"
            "punpcklbw %%xmm7, %%xmm4 \n\t"
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
            : "+a"(src)
            : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
            : "memory"
        );
        if(size==16){
            __asm__ volatile(
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
                QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
                QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
                QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
                QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
                : "+a"(src)
                : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
                : "memory"
            );
        }
        tmp += 8;
        src += 8 - (size+5)*srcStride;
    }
}
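/* Second pass as used by the SSSE3 build: the same arithmetic as the MMX
 * hv2_lowpass above, but palignr builds the six shifted operand vectors
 * from whole 16-byte loads of each tmp row instead of overlapping
 * unaligned reloads. */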
#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int h = size;\
    if(size == 16){\
        __asm__ volatile(\
            "1: \n\t"\
            "movdqa 32(%0), %%xmm4 \n\t"\
            "movdqa 16(%0), %%xmm5 \n\t"\
            "movdqa (%0), %%xmm7 \n\t"\
            "movdqa %%xmm4, %%xmm3 \n\t"\
            "movdqa %%xmm4, %%xmm2 \n\t"\
            "movdqa %%xmm4, %%xmm1 \n\t"\
            "movdqa %%xmm4, %%xmm0 \n\t"\
            "palignr $10, %%xmm5, %%xmm0 \n\t"\
            "palignr $8, %%xmm5, %%xmm1 \n\t"\
            "palignr $6, %%xmm5, %%xmm2 \n\t"\
            "palignr $4, %%xmm5, %%xmm3 \n\t"\
            "palignr $2, %%xmm5, %%xmm4 \n\t"\
            "paddw %%xmm5, %%xmm0 \n\t"\
            "paddw %%xmm4, %%xmm1 \n\t"\
            "paddw %%xmm3, %%xmm2 \n\t"\
            "movdqa %%xmm5, %%xmm6 \n\t"\
            "movdqa %%xmm5, %%xmm4 \n\t"\
            "movdqa %%xmm5, %%xmm3 \n\t"\
            "palignr $8, %%xmm7, %%xmm4 \n\t"\
            "palignr $2, %%xmm7, %%xmm6 \n\t"\
            "palignr $10, %%xmm7, %%xmm3 \n\t"\
            "paddw %%xmm6, %%xmm4 \n\t"\
            "movdqa %%xmm5, %%xmm6 \n\t"\
            "palignr $6, %%xmm7, %%xmm5 \n\t"\
            "palignr $4, %%xmm7, %%xmm6 \n\t"\
            "paddw %%xmm7, %%xmm3 \n\t"\
            "paddw %%xmm6, %%xmm5 \n\t"\
            \
            "psubw %%xmm1, %%xmm0 \n\t"\
            "psubw %%xmm4, %%xmm3 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "psraw $2, %%xmm3 \n\t"\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "psubw %%xmm4, %%xmm3 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "paddw %%xmm5, %%xmm3 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "psraw $2, %%xmm3 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "paddw %%xmm5, %%xmm3 \n\t"\
            "psraw $6, %%xmm0 \n\t"\
            "psraw $6, %%xmm3 \n\t"\
            "packuswb %%xmm0, %%xmm3 \n\t"\
            OP(%%xmm3, (%1), %%xmm7, dqa)\
            "add $48, %0 \n\t"\
            "add %3, %1 \n\t"\
            "decl %2 \n\t"\
            " jnz 1b \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : "memory"\
        );\
    }else{\
        __asm__ volatile(\
            "1: \n\t"\
            "movdqa 16(%0), %%xmm1 \n\t"\
            "movdqa (%0), %%xmm0 \n\t"\
            "movdqa %%xmm1, %%xmm2 \n\t"\
            "movdqa %%xmm1, %%xmm3 \n\t"\
            "movdqa %%xmm1, %%xmm4 \n\t"\
            "movdqa %%xmm1, %%xmm5 \n\t"\
            "palignr $10, %%xmm0, %%xmm5 \n\t"\
            "palignr $8, %%xmm0, %%xmm4 \n\t"\
            "palignr $6, %%xmm0, %%xmm3 \n\t"\
            "palignr $4, %%xmm0, %%xmm2 \n\t"\
            "palignr $2, %%xmm0, %%xmm1 \n\t"\
            "paddw %%xmm5, %%xmm0 \n\t"\
            "paddw %%xmm4, %%xmm1 \n\t"\
            "paddw %%xmm3, %%xmm2 \n\t"\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "psraw $6, %%xmm0 \n\t"\
            "packuswb %%xmm0, %%xmm0 \n\t"\
            OP(%%xmm0, (%1), %%xmm7, q)\
            "add $48, %0 \n\t"\
            "add %3, %1 \n\t"\
            "decl %2 \n\t"\
            " jnz 1b \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : "memory"\
        );\
    }\
}
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\

#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
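/* The mcXY functions generated below cover the 16 quarter-pel positions:
 * X and Y are the horizontal and vertical quarter-sample offsets (0..3).
 * Half-sample positions come straight from the h/v/hv lowpass filters;
 * quarter-sample positions average a lowpass result with the nearest
 * integer- or half-sample plane via the _l2 and _l2_shift5 helpers. */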
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\

#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\

#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\

#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint16_t, temp[SIZE*(SIZE<8?12:24)]);\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\

#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\

#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\

#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#define PAVGB "pavgusb"
QPEL_H264(put_, PUT_OP, 3dnow)
QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
#undef PAVGB
#define PAVGB "pavgb"
QPEL_H264(put_, PUT_OP, mmx2)
QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2)
#if HAVE_SSSE3
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3)
QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3)
#endif
#undef PAVGB
H264_MC_4816(3dnow)
H264_MC_4816(mmx2)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
#if HAVE_SSSE3
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
#endif

/* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */
DECLARE_ALIGNED_8(static const uint64_t, h264_rnd_reg[4]) = {
    0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL
};
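/* The chroma MC templates below implement the H.264 bilinear interpolation
 * ((8-x)*(8-y)*A + x*(8-y)*B + (8-x)*y*C + x*y*D + 32) >> 6, taking their
 * rounding constants from h264_rnd_reg; the second constant pair gives the
 * biased-down rounding used by the *_nornd wrappers. */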
#define H264_CHROMA_OP(S,D)
#define H264_CHROMA_OP4(S,D,T)
#define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx
#define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx
#define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
#include "dsputil_h264_template_mmx.c"

static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
}
static void put_h264_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg+2);
}
static void put_h264_chroma_mc4_mmx(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    put_h264_chroma_generic_mc4_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
}
#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC2_TMPL
#undef H264_CHROMA_MC8_MV0

#define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
                               "pavgb " #T ", " #D " \n\t"
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_mmx2
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_mmx2
#define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
#include "dsputil_h264_template_mmx.c"

static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
}
static void avg_h264_chroma_mc4_mmx2(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    avg_h264_chroma_generic_mc4_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
}
#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC2_TMPL
#undef H264_CHROMA_MC8_MV0

#define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
                               "pavgusb " #T ", " #D " \n\t"
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_3dnow
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_3dnow
#define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
#include "dsputil_h264_template_mmx.c"

static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    avg_h264_chroma_generic_mc8_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
}
static void avg_h264_chroma_mc4_3dnow(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    avg_h264_chroma_generic_mc4_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
}
#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0

#if HAVE_SSSE3
#define AVG_OP(X)
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
#include "dsputil_h264_template_ssse3.c"

static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
}
static void put_h264_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0);
}
#undef AVG_OP
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0

#define AVG_OP(X) X
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
#include "dsputil_h264_template_ssse3.c"

static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
}
#undef AVG_OP
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0
#endif

/***********************************/
/* weighted prediction */
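/* The unidirectional version computes
 *     dst[i] = clip8((dst[i]*weight + rnd) >> log2_denom)
 * with rnd = (offset << log2_denom) + ((1 << log2_denom) >> 1) folded into
 * a register up front; the bidirectional version computes
 *     dst[i] = clip8((dst[i]*weightd + src[i]*weights + rnd2) >> (log2_denom+1))
 * with rnd2 = ((offset+1)|1) << log2_denom. */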
static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
{
    int x, y;
    offset <<= log2_denom;
    offset += (1 << log2_denom) >> 1;
    __asm__ volatile(
        "movd %0, %%mm4 \n\t"
        "movd %1, %%mm5 \n\t"
        "movd %2, %%mm6 \n\t"
        "pshufw $0, %%mm4, %%mm4 \n\t"
        "pshufw $0, %%mm5, %%mm5 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        :: "g"(weight), "g"(offset), "g"(log2_denom)
    );
    for(y=0; y<h; y+=2){
        for(x=0; x<w; x+=4){
            __asm__ volatile(
                "movd %0, %%mm0 \n\t"
                "movd %1, %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pmullw %%mm4, %%mm0 \n\t"
                "pmullw %%mm4, %%mm1 \n\t"
                "paddsw %%mm5, %%mm0 \n\t"
                "paddsw %%mm5, %%mm1 \n\t"
                "psraw %%mm6, %%mm0 \n\t"
                "psraw %%mm6, %%mm1 \n\t"
                "packuswb %%mm7, %%mm0 \n\t"
                "packuswb %%mm7, %%mm1 \n\t"
                "movd %%mm0, %0 \n\t"
                "movd %%mm1, %1 \n\t"
                : "+m"(*(uint32_t*)(dst+x)),
                  "+m"(*(uint32_t*)(dst+x+stride))
            );
        }
        dst += 2*stride;
    }
}

static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
{
    int x, y;
    offset = ((offset + 1) | 1) << log2_denom;
    __asm__ volatile(
        "movd %0, %%mm3 \n\t"
        "movd %1, %%mm4 \n\t"
        "movd %2, %%mm5 \n\t"
        "movd %3, %%mm6 \n\t"
        "pshufw $0, %%mm3, %%mm3 \n\t"
        "pshufw $0, %%mm4, %%mm4 \n\t"
        "pshufw $0, %%mm5, %%mm5 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
    );
    for(y=0; y<h; y++){
        for(x=0; x<w; x+=4){
            __asm__ volatile(
                "movd %0, %%mm0 \n\t"
                "movd %1, %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pmullw %%mm3, %%mm0 \n\t"
                "pmullw %%mm4, %%mm1 \n\t"
                "paddsw %%mm1, %%mm0 \n\t"
                "paddsw %%mm5, %%mm0 \n\t"
                "psraw %%mm6, %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd %%mm0, %0 \n\t"
                : "+m"(*(uint32_t*)(dst+x))
                : "m"(*(uint32_t*)(src+x))
            );
        }
        src += stride;
        dst += stride;
    }
}

#define H264_WEIGHT(W,H) \
static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
} \
static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
    ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16, 8)
H264_WEIGHT( 8,16)
H264_WEIGHT( 8, 8)
H264_WEIGHT( 8, 4)
H264_WEIGHT( 4, 8)
H264_WEIGHT( 4, 4)
H264_WEIGHT( 4, 2)