dsputil_mmx_avg_template.c 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896
  1. /*
  2. * DSP utils : average functions are compiled twice for 3dnow/mmx2
  3. * Copyright (c) 2000, 2001 Fabrice Bellard
  4. * Copyright (c) 2002-2004 Michael Niedermayer
  5. *
  6. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  7. * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  8. * and improved by Zdenek Kabelac <kabi@users.sf.net>
  9. *
  10. * This file is part of FFmpeg.
  11. *
  12. * FFmpeg is free software; you can redistribute it and/or
  13. * modify it under the terms of the GNU Lesser General Public
  14. * License as published by the Free Software Foundation; either
  15. * version 2.1 of the License, or (at your option) any later version.
  16. *
  17. * FFmpeg is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  20. * Lesser General Public License for more details.
  21. *
  22. * You should have received a copy of the GNU Lesser General Public
  23. * License along with FFmpeg; if not, write to the Free Software
  24. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25. */
  26. /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
  27. clobber bug - now it will work with 2.95.2 and also with -fPIC
  28. */
  /*
   * Horizontal two-tap "put" for an 8-byte-wide block: each stored quadword is
   * PAVGB of the 8 bytes at pixels and the 8 bytes at pixels + 1.
   *
   * Operands: %0 = h, %1 = pixels (source), %2 = block (destination),
   * %3 = line_size (used as the stride for both pointers).
   *
   * One pass of the loop handles four rows and subtracts 4 from h; the loop
   * only exits when h reaches exactly zero.
   * NOTE(review): PAVGB is a macro defined outside this file; presumably the
   * byte-average instruction of the selected instruction set - confirm.
   */
  29. static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  30. {
  31. __asm__ volatile(
  /* REG_a = 2 * line_size, the step used after every pair of rows */
  32. "lea (%3, %3), %%"REG_a" \n\t"
  33. "1: \n\t"
  /* rows 0 and 1 */
  34. "movq (%1), %%mm0 \n\t"
  35. "movq (%1, %3), %%mm1 \n\t"
  36. PAVGB" 1(%1), %%mm0 \n\t"
  37. PAVGB" 1(%1, %3), %%mm1 \n\t"
  38. "movq %%mm0, (%2) \n\t"
  39. "movq %%mm1, (%2, %3) \n\t"
  40. "add %%"REG_a", %1 \n\t"
  41. "add %%"REG_a", %2 \n\t"
  /* rows 2 and 3 */
  42. "movq (%1), %%mm0 \n\t"
  43. "movq (%1, %3), %%mm1 \n\t"
  44. PAVGB" 1(%1), %%mm0 \n\t"
  45. PAVGB" 1(%1, %3), %%mm1 \n\t"
  46. "add %%"REG_a", %1 \n\t"
  47. "movq %%mm0, (%2) \n\t"
  48. "movq %%mm1, (%2, %3) \n\t"
  49. "add %%"REG_a", %2 \n\t"
  50. "subl $4, %0 \n\t"
  51. "jnz 1b \n\t"
  52. :"+g"(h), "+S"(pixels), "+D"(block)
  53. :"r" ((x86_reg)line_size)
  54. :"%"REG_a, "memory");
  55. }
  /*
   * "put" of the PAVGB of two sources for a 4-byte-wide block.
   *
   * Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   * %4 = src1Stride, %5 = dstStride.
   *
   * src1 advances by src1Stride per row, dst by dstStride per row, and src2
   * is read as consecutive 4-byte rows (offsets 0, 4, 8, 12, then += 16).
   * If h is odd a single row is processed first; the main loop then handles
   * four rows per pass and exits only when h reaches exactly zero.
   */
  56. static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  57. {
  58. __asm__ volatile(
  /* odd h: emit one row up front so the remaining count is even */
  59. "testl $1, %0 \n\t"
  60. " jz 1f \n\t"
  61. "movd (%1), %%mm0 \n\t"
  62. "movd (%2), %%mm1 \n\t"
  63. "add %4, %1 \n\t"
  64. "add $4, %2 \n\t"
  65. PAVGB" %%mm1, %%mm0 \n\t"
  66. "movd %%mm0, (%3) \n\t"
  67. "add %5, %3 \n\t"
  68. "decl %0 \n\t"
  69. "1: \n\t"
  /* rows 0 and 1 of this pass */
  70. "movd (%1), %%mm0 \n\t"
  71. "add %4, %1 \n\t"
  72. "movd (%1), %%mm1 \n\t"
  73. "movd (%2), %%mm2 \n\t"
  74. "movd 4(%2), %%mm3 \n\t"
  75. "add %4, %1 \n\t"
  76. PAVGB" %%mm2, %%mm0 \n\t"
  77. PAVGB" %%mm3, %%mm1 \n\t"
  78. "movd %%mm0, (%3) \n\t"
  79. "add %5, %3 \n\t"
  80. "movd %%mm1, (%3) \n\t"
  81. "add %5, %3 \n\t"
  /* rows 2 and 3 of this pass */
  82. "movd (%1), %%mm0 \n\t"
  83. "add %4, %1 \n\t"
  84. "movd (%1), %%mm1 \n\t"
  85. "movd 8(%2), %%mm2 \n\t"
  86. "movd 12(%2), %%mm3 \n\t"
  87. "add %4, %1 \n\t"
  88. PAVGB" %%mm2, %%mm0 \n\t"
  89. PAVGB" %%mm3, %%mm1 \n\t"
  90. "movd %%mm0, (%3) \n\t"
  91. "add %5, %3 \n\t"
  92. "movd %%mm1, (%3) \n\t"
  93. "add %5, %3 \n\t"
  /* four packed 4-byte src2 rows consumed */
  94. "add $16, %2 \n\t"
  95. "subl $4, %0 \n\t"
  96. "jnz 1b \n\t"
  97. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  98. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  99. #else
  100. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  101. #endif
  102. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  103. :"memory");
  104. }
  /*
   * "put" of the PAVGB of two sources for an 8-byte-wide block.
   *
   * Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   * %4 = src1Stride, %5 = dstStride.
   *
   * src2 is read as consecutive 8-byte rows (offsets 0, 8, 16, 24, then
   * += 32). If h is odd one row is processed first; the main loop handles
   * four rows per pass and exits only when h reaches exactly zero.
   */
  105. static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  106. {
  107. __asm__ volatile(
  /* odd h: emit one row up front */
  108. "testl $1, %0 \n\t"
  109. " jz 1f \n\t"
  110. "movq (%1), %%mm0 \n\t"
  111. "movq (%2), %%mm1 \n\t"
  112. "add %4, %1 \n\t"
  113. "add $8, %2 \n\t"
  114. PAVGB" %%mm1, %%mm0 \n\t"
  115. "movq %%mm0, (%3) \n\t"
  116. "add %5, %3 \n\t"
  117. "decl %0 \n\t"
  118. "1: \n\t"
  /* rows 0 and 1 of this pass */
  119. "movq (%1), %%mm0 \n\t"
  120. "add %4, %1 \n\t"
  121. "movq (%1), %%mm1 \n\t"
  122. "add %4, %1 \n\t"
  123. PAVGB" (%2), %%mm0 \n\t"
  124. PAVGB" 8(%2), %%mm1 \n\t"
  125. "movq %%mm0, (%3) \n\t"
  126. "add %5, %3 \n\t"
  127. "movq %%mm1, (%3) \n\t"
  128. "add %5, %3 \n\t"
  /* rows 2 and 3 of this pass */
  129. "movq (%1), %%mm0 \n\t"
  130. "add %4, %1 \n\t"
  131. "movq (%1), %%mm1 \n\t"
  132. "add %4, %1 \n\t"
  133. PAVGB" 16(%2), %%mm0 \n\t"
  134. PAVGB" 24(%2), %%mm1 \n\t"
  135. "movq %%mm0, (%3) \n\t"
  136. "add %5, %3 \n\t"
  137. "movq %%mm1, (%3) \n\t"
  138. "add %5, %3 \n\t"
  /* four packed 8-byte src2 rows consumed */
  139. "add $32, %2 \n\t"
  140. "subl $4, %0 \n\t"
  141. "jnz 1b \n\t"
  142. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  143. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  144. #else
  145. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  146. #endif
  147. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  148. :"memory");
  149. //the following should be used, though better not with gcc ...
  150. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  151. :"r"(src1Stride), "r"(dstStride)
  152. :"memory");*/
  153. }
  /*
   * Variant of put_pixels8_l2 that bit-inverts both inputs before PAVGB and
   * bit-inverts the result afterwards (mm6 holds all-ones for the XORs).
   * NOTE(review): presumably this turns a round-up PAVGB into a round-down
   * average, as the "no_rnd" name suggests - confirm against the PAVGB
   * definition in use.
   *
   * Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   * %4 = src1Stride, %5 = dstStride.
   *
   * src2 is read as consecutive 8-byte rows (offsets 0, 8, 16, 24, then
   * += 32). If h is odd one row is processed first; the main loop handles
   * four rows per pass and exits only when h reaches exactly zero.
   */
  154. static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  155. {
  156. __asm__ volatile(
  /* mm6 = 0xFF in every byte (register compared equal to itself) */
  157. "pcmpeqb %%mm6, %%mm6 \n\t"
  /* odd h: emit one row up front */
  158. "testl $1, %0 \n\t"
  159. " jz 1f \n\t"
  160. "movq (%1), %%mm0 \n\t"
  161. "movq (%2), %%mm1 \n\t"
  162. "add %4, %1 \n\t"
  163. "add $8, %2 \n\t"
  164. "pxor %%mm6, %%mm0 \n\t"
  165. "pxor %%mm6, %%mm1 \n\t"
  166. PAVGB" %%mm1, %%mm0 \n\t"
  167. "pxor %%mm6, %%mm0 \n\t"
  168. "movq %%mm0, (%3) \n\t"
  169. "add %5, %3 \n\t"
  170. "decl %0 \n\t"
  171. "1: \n\t"
  /* rows 0 and 1 of this pass */
  172. "movq (%1), %%mm0 \n\t"
  173. "add %4, %1 \n\t"
  174. "movq (%1), %%mm1 \n\t"
  175. "add %4, %1 \n\t"
  176. "movq (%2), %%mm2 \n\t"
  177. "movq 8(%2), %%mm3 \n\t"
  178. "pxor %%mm6, %%mm0 \n\t"
  179. "pxor %%mm6, %%mm1 \n\t"
  180. "pxor %%mm6, %%mm2 \n\t"
  181. "pxor %%mm6, %%mm3 \n\t"
  182. PAVGB" %%mm2, %%mm0 \n\t"
  183. PAVGB" %%mm3, %%mm1 \n\t"
  184. "pxor %%mm6, %%mm0 \n\t"
  185. "pxor %%mm6, %%mm1 \n\t"
  186. "movq %%mm0, (%3) \n\t"
  187. "add %5, %3 \n\t"
  188. "movq %%mm1, (%3) \n\t"
  189. "add %5, %3 \n\t"
  /* rows 2 and 3 of this pass */
  190. "movq (%1), %%mm0 \n\t"
  191. "add %4, %1 \n\t"
  192. "movq (%1), %%mm1 \n\t"
  193. "add %4, %1 \n\t"
  194. "movq 16(%2), %%mm2 \n\t"
  195. "movq 24(%2), %%mm3 \n\t"
  196. "pxor %%mm6, %%mm0 \n\t"
  197. "pxor %%mm6, %%mm1 \n\t"
  198. "pxor %%mm6, %%mm2 \n\t"
  199. "pxor %%mm6, %%mm3 \n\t"
  200. PAVGB" %%mm2, %%mm0 \n\t"
  201. PAVGB" %%mm3, %%mm1 \n\t"
  202. "pxor %%mm6, %%mm0 \n\t"
  203. "pxor %%mm6, %%mm1 \n\t"
  204. "movq %%mm0, (%3) \n\t"
  205. "add %5, %3 \n\t"
  206. "movq %%mm1, (%3) \n\t"
  207. "add %5, %3 \n\t"
  208. "add $32, %2 \n\t"
  209. "subl $4, %0 \n\t"
  210. "jnz 1b \n\t"
  211. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  212. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  213. #else
  214. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  215. #endif
  216. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  217. :"memory");
  218. //the following should be used, though better not with gcc ...
  219. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  220. :"r"(src1Stride), "r"(dstStride)
  221. :"memory");*/
  222. }
  /*
   * "avg" counterpart of put_pixels4_l2: the PAVGB of src1 and src2 is itself
   * PAVGB-ed with the bytes already at dst before being stored (4 bytes per
   * row via movd).
   *
   * Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   * %4 = src1Stride, %5 = dstStride.
   *
   * src2 is read as consecutive 4-byte rows (offsets 0, 4, 8, 12, then
   * += 16). If h is odd one row is processed first; the main loop handles
   * four rows per pass and exits only when h reaches exactly zero.
   */
  223. static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  224. {
  225. __asm__ volatile(
  /* odd h: emit one row up front */
  226. "testl $1, %0 \n\t"
  227. " jz 1f \n\t"
  228. "movd (%1), %%mm0 \n\t"
  229. "movd (%2), %%mm1 \n\t"
  230. "add %4, %1 \n\t"
  231. "add $4, %2 \n\t"
  232. PAVGB" %%mm1, %%mm0 \n\t"
  233. PAVGB" (%3), %%mm0 \n\t"
  234. "movd %%mm0, (%3) \n\t"
  235. "add %5, %3 \n\t"
  236. "decl %0 \n\t"
  237. "1: \n\t"
  /* rows 0 and 1 of this pass */
  238. "movd (%1), %%mm0 \n\t"
  239. "add %4, %1 \n\t"
  240. "movd (%1), %%mm1 \n\t"
  241. "add %4, %1 \n\t"
  242. PAVGB" (%2), %%mm0 \n\t"
  243. PAVGB" 4(%2), %%mm1 \n\t"
  /* blend with the existing destination row, then overwrite it */
  244. PAVGB" (%3), %%mm0 \n\t"
  245. "movd %%mm0, (%3) \n\t"
  246. "add %5, %3 \n\t"
  247. PAVGB" (%3), %%mm1 \n\t"
  248. "movd %%mm1, (%3) \n\t"
  249. "add %5, %3 \n\t"
  /* rows 2 and 3 of this pass */
  250. "movd (%1), %%mm0 \n\t"
  251. "add %4, %1 \n\t"
  252. "movd (%1), %%mm1 \n\t"
  253. "add %4, %1 \n\t"
  254. PAVGB" 8(%2), %%mm0 \n\t"
  255. PAVGB" 12(%2), %%mm1 \n\t"
  256. PAVGB" (%3), %%mm0 \n\t"
  257. "movd %%mm0, (%3) \n\t"
  258. "add %5, %3 \n\t"
  259. PAVGB" (%3), %%mm1 \n\t"
  260. "movd %%mm1, (%3) \n\t"
  261. "add %5, %3 \n\t"
  262. "add $16, %2 \n\t"
  263. "subl $4, %0 \n\t"
  264. "jnz 1b \n\t"
  265. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  266. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  267. #else
  268. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  269. #endif
  270. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  271. :"memory");
  272. }
  /*
   * "avg" counterpart of put_pixels8_l2: the PAVGB of src1 and src2 is itself
   * PAVGB-ed with the 8 bytes already at dst before being stored.
   *
   * Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   * %4 = src1Stride, %5 = dstStride.
   *
   * src2 is read as consecutive 8-byte rows (offsets 0, 8, 16, 24, then
   * += 32). If h is odd one row is processed first; the main loop handles
   * four rows per pass and exits only when h reaches exactly zero.
   */
  273. static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  274. {
  275. __asm__ volatile(
  /* odd h: emit one row up front */
  276. "testl $1, %0 \n\t"
  277. " jz 1f \n\t"
  278. "movq (%1), %%mm0 \n\t"
  279. "movq (%2), %%mm1 \n\t"
  280. "add %4, %1 \n\t"
  281. "add $8, %2 \n\t"
  282. PAVGB" %%mm1, %%mm0 \n\t"
  283. PAVGB" (%3), %%mm0 \n\t"
  284. "movq %%mm0, (%3) \n\t"
  285. "add %5, %3 \n\t"
  286. "decl %0 \n\t"
  287. "1: \n\t"
  /* rows 0 and 1 of this pass */
  288. "movq (%1), %%mm0 \n\t"
  289. "add %4, %1 \n\t"
  290. "movq (%1), %%mm1 \n\t"
  291. "add %4, %1 \n\t"
  292. PAVGB" (%2), %%mm0 \n\t"
  293. PAVGB" 8(%2), %%mm1 \n\t"
  /* blend with the existing destination row, then overwrite it */
  294. PAVGB" (%3), %%mm0 \n\t"
  295. "movq %%mm0, (%3) \n\t"
  296. "add %5, %3 \n\t"
  297. PAVGB" (%3), %%mm1 \n\t"
  298. "movq %%mm1, (%3) \n\t"
  299. "add %5, %3 \n\t"
  /* rows 2 and 3 of this pass */
  300. "movq (%1), %%mm0 \n\t"
  301. "add %4, %1 \n\t"
  302. "movq (%1), %%mm1 \n\t"
  303. "add %4, %1 \n\t"
  304. PAVGB" 16(%2), %%mm0 \n\t"
  305. PAVGB" 24(%2), %%mm1 \n\t"
  306. PAVGB" (%3), %%mm0 \n\t"
  307. "movq %%mm0, (%3) \n\t"
  308. "add %5, %3 \n\t"
  309. PAVGB" (%3), %%mm1 \n\t"
  310. "movq %%mm1, (%3) \n\t"
  311. "add %5, %3 \n\t"
  312. "add $32, %2 \n\t"
  313. "subl $4, %0 \n\t"
  314. "jnz 1b \n\t"
  315. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  316. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  317. #else
  318. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  319. #endif
  320. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  321. :"memory");
  322. //the following should be used, though better not with gcc ...
  323. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  324. :"r"(src1Stride), "r"(dstStride)
  325. :"memory");*/
  326. }
  /*
   * Horizontal two-tap "put" for a 16-byte-wide block: each row is handled as
   * two quadwords (offsets 0 and 8), each stored as PAVGB of the source bytes
   * and the source bytes one position to the right (offsets 1 and 9).
   *
   * Operands: %0 = h, %1 = pixels (source), %2 = block (destination),
   * %3 = line_size (used as the stride for both pointers).
   *
   * One pass of the loop handles four rows and subtracts 4 from h; the loop
   * only exits when h reaches exactly zero.
   */
  327. static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  328. {
  329. __asm__ volatile(
  /* REG_a = 2 * line_size */
  330. "lea (%3, %3), %%"REG_a" \n\t"
  331. "1: \n\t"
  /* rows 0 and 1: mm0/mm1 = left halves, mm2/mm3 = right halves */
  332. "movq (%1), %%mm0 \n\t"
  333. "movq (%1, %3), %%mm1 \n\t"
  334. "movq 8(%1), %%mm2 \n\t"
  335. "movq 8(%1, %3), %%mm3 \n\t"
  336. PAVGB" 1(%1), %%mm0 \n\t"
  337. PAVGB" 1(%1, %3), %%mm1 \n\t"
  338. PAVGB" 9(%1), %%mm2 \n\t"
  339. PAVGB" 9(%1, %3), %%mm3 \n\t"
  340. "movq %%mm0, (%2) \n\t"
  341. "movq %%mm1, (%2, %3) \n\t"
  342. "movq %%mm2, 8(%2) \n\t"
  343. "movq %%mm3, 8(%2, %3) \n\t"
  344. "add %%"REG_a", %1 \n\t"
  345. "add %%"REG_a", %2 \n\t"
  /* rows 2 and 3 */
  346. "movq (%1), %%mm0 \n\t"
  347. "movq (%1, %3), %%mm1 \n\t"
  348. "movq 8(%1), %%mm2 \n\t"
  349. "movq 8(%1, %3), %%mm3 \n\t"
  350. PAVGB" 1(%1), %%mm0 \n\t"
  351. PAVGB" 1(%1, %3), %%mm1 \n\t"
  352. PAVGB" 9(%1), %%mm2 \n\t"
  353. PAVGB" 9(%1, %3), %%mm3 \n\t"
  354. "add %%"REG_a", %1 \n\t"
  355. "movq %%mm0, (%2) \n\t"
  356. "movq %%mm1, (%2, %3) \n\t"
  357. "movq %%mm2, 8(%2) \n\t"
  358. "movq %%mm3, 8(%2, %3) \n\t"
  359. "add %%"REG_a", %2 \n\t"
  360. "subl $4, %0 \n\t"
  361. "jnz 1b \n\t"
  362. :"+g"(h), "+S"(pixels), "+D"(block)
  363. :"r" ((x86_reg)line_size)
  364. :"%"REG_a, "memory");
  365. }
  /*
   * "put" of the PAVGB of two sources for a 16-byte-wide block (two quadwords
   * per row).
   *
   * Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   * %4 = src1Stride, %5 = dstStride.
   *
   * src2 is read as consecutive 16-byte rows (offsets 0/8 and 16/24, then
   * += 32). If h is odd one row is processed first; the main loop handles
   * two rows per pass and exits only when h reaches exactly zero.
   */
  366. static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  367. {
  368. __asm__ volatile(
  /* odd h: emit one row up front */
  369. "testl $1, %0 \n\t"
  370. " jz 1f \n\t"
  371. "movq (%1), %%mm0 \n\t"
  372. "movq 8(%1), %%mm1 \n\t"
  373. PAVGB" (%2), %%mm0 \n\t"
  374. PAVGB" 8(%2), %%mm1 \n\t"
  375. "add %4, %1 \n\t"
  376. "add $16, %2 \n\t"
  377. "movq %%mm0, (%3) \n\t"
  378. "movq %%mm1, 8(%3) \n\t"
  379. "add %5, %3 \n\t"
  380. "decl %0 \n\t"
  381. "1: \n\t"
  /* row 0 of this pass */
  382. "movq (%1), %%mm0 \n\t"
  383. "movq 8(%1), %%mm1 \n\t"
  384. "add %4, %1 \n\t"
  385. PAVGB" (%2), %%mm0 \n\t"
  386. PAVGB" 8(%2), %%mm1 \n\t"
  387. "movq %%mm0, (%3) \n\t"
  388. "movq %%mm1, 8(%3) \n\t"
  389. "add %5, %3 \n\t"
  /* row 1 of this pass */
  390. "movq (%1), %%mm0 \n\t"
  391. "movq 8(%1), %%mm1 \n\t"
  392. "add %4, %1 \n\t"
  393. PAVGB" 16(%2), %%mm0 \n\t"
  394. PAVGB" 24(%2), %%mm1 \n\t"
  395. "movq %%mm0, (%3) \n\t"
  396. "movq %%mm1, 8(%3) \n\t"
  397. "add %5, %3 \n\t"
  /* two packed 16-byte src2 rows consumed */
  398. "add $32, %2 \n\t"
  399. "subl $2, %0 \n\t"
  400. "jnz 1b \n\t"
  401. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  402. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  403. #else
  404. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  405. #endif
  406. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  407. :"memory");
  408. //the following should be used, though better not with gcc ...
  409. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  410. :"r"(src1Stride), "r"(dstStride)
  411. :"memory");*/
  412. }
  /*
   * "avg" counterpart of put_pixels16_l2: the PAVGB of src1 and src2 is
   * itself PAVGB-ed with the 16 bytes already at dst before being stored.
   *
   * Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   * %4 = src1Stride, %5 = dstStride.
   *
   * src2 is read as consecutive 16-byte rows (offsets 0/8 and 16/24, then
   * += 32). If h is odd one row is processed first; the main loop handles
   * two rows per pass and exits only when h reaches exactly zero.
   */
  413. static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  414. {
  415. __asm__ volatile(
  /* odd h: emit one row up front */
  416. "testl $1, %0 \n\t"
  417. " jz 1f \n\t"
  418. "movq (%1), %%mm0 \n\t"
  419. "movq 8(%1), %%mm1 \n\t"
  420. PAVGB" (%2), %%mm0 \n\t"
  421. PAVGB" 8(%2), %%mm1 \n\t"
  422. "add %4, %1 \n\t"
  423. "add $16, %2 \n\t"
  424. PAVGB" (%3), %%mm0 \n\t"
  425. PAVGB" 8(%3), %%mm1 \n\t"
  426. "movq %%mm0, (%3) \n\t"
  427. "movq %%mm1, 8(%3) \n\t"
  428. "add %5, %3 \n\t"
  429. "decl %0 \n\t"
  430. "1: \n\t"
  /* row 0 of this pass */
  431. "movq (%1), %%mm0 \n\t"
  432. "movq 8(%1), %%mm1 \n\t"
  433. "add %4, %1 \n\t"
  434. PAVGB" (%2), %%mm0 \n\t"
  435. PAVGB" 8(%2), %%mm1 \n\t"
  /* blend with the existing destination row, then overwrite it */
  436. PAVGB" (%3), %%mm0 \n\t"
  437. PAVGB" 8(%3), %%mm1 \n\t"
  438. "movq %%mm0, (%3) \n\t"
  439. "movq %%mm1, 8(%3) \n\t"
  440. "add %5, %3 \n\t"
  /* row 1 of this pass */
  441. "movq (%1), %%mm0 \n\t"
  442. "movq 8(%1), %%mm1 \n\t"
  443. "add %4, %1 \n\t"
  444. PAVGB" 16(%2), %%mm0 \n\t"
  445. PAVGB" 24(%2), %%mm1 \n\t"
  446. PAVGB" (%3), %%mm0 \n\t"
  447. PAVGB" 8(%3), %%mm1 \n\t"
  448. "movq %%mm0, (%3) \n\t"
  449. "movq %%mm1, 8(%3) \n\t"
  450. "add %5, %3 \n\t"
  451. "add $32, %2 \n\t"
  452. "subl $2, %0 \n\t"
  453. "jnz 1b \n\t"
  454. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  455. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  456. #else
  457. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  458. #endif
  459. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  460. :"memory");
  461. //the following should be used, though better not with gcc ...
  462. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  463. :"r"(src1Stride), "r"(dstStride)
  464. :"memory");*/
  465. }
  /*
   * Variant of put_pixels16_l2 that bit-inverts both inputs before PAVGB and
   * bit-inverts the result afterwards (mm6 holds all-ones for the XORs).
   * NOTE(review): presumably this turns a round-up PAVGB into a round-down
   * average, as the "no_rnd" name suggests - confirm against the PAVGB
   * definition in use.
   *
   * Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   * %4 = src1Stride, %5 = dstStride.
   *
   * src2 is read as consecutive 16-byte rows (offsets 0/8 and 16/24, then
   * += 32). If h is odd one row is processed first; the main loop handles
   * two rows per pass and exits only when h reaches exactly zero.
   */
  466. static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  467. {
  468. __asm__ volatile(
  /* mm6 = 0xFF in every byte (register compared equal to itself) */
  469. "pcmpeqb %%mm6, %%mm6 \n\t"
  /* odd h: emit one row up front */
  470. "testl $1, %0 \n\t"
  471. " jz 1f \n\t"
  472. "movq (%1), %%mm0 \n\t"
  473. "movq 8(%1), %%mm1 \n\t"
  474. "movq (%2), %%mm2 \n\t"
  475. "movq 8(%2), %%mm3 \n\t"
  476. "pxor %%mm6, %%mm0 \n\t"
  477. "pxor %%mm6, %%mm1 \n\t"
  478. "pxor %%mm6, %%mm2 \n\t"
  479. "pxor %%mm6, %%mm3 \n\t"
  480. PAVGB" %%mm2, %%mm0 \n\t"
  481. PAVGB" %%mm3, %%mm1 \n\t"
  482. "pxor %%mm6, %%mm0 \n\t"
  483. "pxor %%mm6, %%mm1 \n\t"
  484. "add %4, %1 \n\t"
  485. "add $16, %2 \n\t"
  486. "movq %%mm0, (%3) \n\t"
  487. "movq %%mm1, 8(%3) \n\t"
  488. "add %5, %3 \n\t"
  489. "decl %0 \n\t"
  490. "1: \n\t"
  /* row 0 of this pass */
  491. "movq (%1), %%mm0 \n\t"
  492. "movq 8(%1), %%mm1 \n\t"
  493. "add %4, %1 \n\t"
  494. "movq (%2), %%mm2 \n\t"
  495. "movq 8(%2), %%mm3 \n\t"
  496. "pxor %%mm6, %%mm0 \n\t"
  497. "pxor %%mm6, %%mm1 \n\t"
  498. "pxor %%mm6, %%mm2 \n\t"
  499. "pxor %%mm6, %%mm3 \n\t"
  500. PAVGB" %%mm2, %%mm0 \n\t"
  501. PAVGB" %%mm3, %%mm1 \n\t"
  502. "pxor %%mm6, %%mm0 \n\t"
  503. "pxor %%mm6, %%mm1 \n\t"
  504. "movq %%mm0, (%3) \n\t"
  505. "movq %%mm1, 8(%3) \n\t"
  506. "add %5, %3 \n\t"
  /* row 1 of this pass */
  507. "movq (%1), %%mm0 \n\t"
  508. "movq 8(%1), %%mm1 \n\t"
  509. "add %4, %1 \n\t"
  510. "movq 16(%2), %%mm2 \n\t"
  511. "movq 24(%2), %%mm3 \n\t"
  512. "pxor %%mm6, %%mm0 \n\t"
  513. "pxor %%mm6, %%mm1 \n\t"
  514. "pxor %%mm6, %%mm2 \n\t"
  515. "pxor %%mm6, %%mm3 \n\t"
  516. PAVGB" %%mm2, %%mm0 \n\t"
  517. PAVGB" %%mm3, %%mm1 \n\t"
  518. "pxor %%mm6, %%mm0 \n\t"
  519. "pxor %%mm6, %%mm1 \n\t"
  520. "movq %%mm0, (%3) \n\t"
  521. "movq %%mm1, 8(%3) \n\t"
  522. "add %5, %3 \n\t"
  523. "add $32, %2 \n\t"
  524. "subl $2, %0 \n\t"
  525. "jnz 1b \n\t"
  526. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  527. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  528. #else
  529. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  530. #endif
  531. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  532. :"memory");
  533. //the following should be used, though better not with gcc ...
  534. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  535. :"r"(src1Stride), "r"(dstStride)
  536. :"memory");*/
  537. }
  538. /* GL: this function does incorrect rounding if overflow */
  /*
   * Horizontal two-tap "put" for an 8-byte-wide block, "no_rnd" flavour: mm6
   * is subtracted with unsigned saturation (psubusb) from the bytes at pixels
   * before they are PAVGB-ed with the bytes at pixels + 1.
   * NOTE(review): MOVQ_BONE is defined outside this file; presumably it loads
   * 0x01 into every byte of mm6 - confirm. The asm below relies on mm6 still
   * holding that value when it starts.
   *
   * Operands: %0 = h, %1 = pixels (source), %2 = block (destination),
   * %3 = line_size (used as the stride for both pointers).
   *
   * One pass of the loop handles four rows and subtracts 4 from h; the loop
   * only exits when h reaches exactly zero.
   */
  539. static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  540. {
  541. MOVQ_BONE(mm6);
  542. __asm__ volatile(
  /* REG_a = 2 * line_size */
  543. "lea (%3, %3), %%"REG_a" \n\t"
  544. "1: \n\t"
  /* rows 0 and 1: mm0/mm2 = bytes at x, mm1/mm3 = bytes at x + 1 */
  545. "movq (%1), %%mm0 \n\t"
  546. "movq (%1, %3), %%mm2 \n\t"
  547. "movq 1(%1), %%mm1 \n\t"
  548. "movq 1(%1, %3), %%mm3 \n\t"
  549. "add %%"REG_a", %1 \n\t"
  550. "psubusb %%mm6, %%mm0 \n\t"
  551. "psubusb %%mm6, %%mm2 \n\t"
  552. PAVGB" %%mm1, %%mm0 \n\t"
  553. PAVGB" %%mm3, %%mm2 \n\t"
  554. "movq %%mm0, (%2) \n\t"
  555. "movq %%mm2, (%2, %3) \n\t"
  /* rows 2 and 3 */
  556. "movq (%1), %%mm0 \n\t"
  557. "movq 1(%1), %%mm1 \n\t"
  558. "movq (%1, %3), %%mm2 \n\t"
  559. "movq 1(%1, %3), %%mm3 \n\t"
  560. "add %%"REG_a", %2 \n\t"
  561. "add %%"REG_a", %1 \n\t"
  562. "psubusb %%mm6, %%mm0 \n\t"
  563. "psubusb %%mm6, %%mm2 \n\t"
  564. PAVGB" %%mm1, %%mm0 \n\t"
  565. PAVGB" %%mm3, %%mm2 \n\t"
  566. "movq %%mm0, (%2) \n\t"
  567. "movq %%mm2, (%2, %3) \n\t"
  568. "add %%"REG_a", %2 \n\t"
  569. "subl $4, %0 \n\t"
  570. "jnz 1b \n\t"
  571. :"+g"(h), "+S"(pixels), "+D"(block)
  572. :"r" ((x86_reg)line_size)
  573. :"%"REG_a, "memory");
  574. }
  /*
   * Vertical two-tap "put" for an 8-byte-wide block: output row r is PAVGB of
   * source rows r and r + 1, so h + 1 source rows are read in total.
   *
   * Operands: %0 = h, %1 = pixels (source), %2 = block (destination),
   * %3 = line_size (used as the stride for both pointers).
   *
   * block is moved back by one line before the loop so that stores can use
   * the (%2, %3) and (%2, REG_a) addressing forms. The last source row loaded
   * in a pass stays in mm0 and is reused as the first row of the next pass.
   * One pass handles four rows; the loop only exits when h reaches exactly
   * zero.
   */
  575. static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  576. {
  577. __asm__ volatile(
  /* REG_a = 2 * line_size */
  578. "lea (%3, %3), %%"REG_a" \n\t"
  /* mm0 = source row 0 */
  579. "movq (%1), %%mm0 \n\t"
  580. "sub %3, %2 \n\t"
  581. "1: \n\t"
  /* mm1 = row 1, mm2 = row 2; outputs 0 and 1 */
  582. "movq (%1, %3), %%mm1 \n\t"
  583. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  584. "add %%"REG_a", %1 \n\t"
  585. PAVGB" %%mm1, %%mm0 \n\t"
  586. PAVGB" %%mm2, %%mm1 \n\t"
  587. "movq %%mm0, (%2, %3) \n\t"
  588. "movq %%mm1, (%2, %%"REG_a") \n\t"
  /* mm1 = row 3, mm0 = row 4; outputs 2 and 3 */
  589. "movq (%1, %3), %%mm1 \n\t"
  590. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  591. "add %%"REG_a", %2 \n\t"
  592. "add %%"REG_a", %1 \n\t"
  593. PAVGB" %%mm1, %%mm2 \n\t"
  594. PAVGB" %%mm0, %%mm1 \n\t"
  595. "movq %%mm2, (%2, %3) \n\t"
  596. "movq %%mm1, (%2, %%"REG_a") \n\t"
  597. "add %%"REG_a", %2 \n\t"
  598. "subl $4, %0 \n\t"
  599. "jnz 1b \n\t"
  600. :"+g"(h), "+S"(pixels), "+D" (block)
  601. :"r" ((x86_reg)line_size)
  602. :"%"REG_a, "memory");
  603. }
  604. /* GL: this function does incorrect rounding if overflow */
  /*
   * Vertical two-tap "put" for an 8-byte-wide block, "no_rnd" flavour. Same
   * row pipeline as put_pixels8_y2, except that in each half of the loop body
   * mm6 is subtracted with unsigned saturation (psubusb) from the middle row
   * (mm1) before it takes part in the two PAVGB operations.
   * NOTE(review): MOVQ_BONE is defined outside this file; presumably it loads
   * 0x01 into every byte of mm6 - confirm.
   *
   * Operands: %0 = h, %1 = pixels (source), %2 = block (destination),
   * %3 = line_size (used as the stride for both pointers).
   *
   * One pass handles four rows; the loop only exits when h reaches exactly
   * zero.
   */
  605. static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  606. {
  607. MOVQ_BONE(mm6);
  608. __asm__ volatile(
  /* REG_a = 2 * line_size */
  609. "lea (%3, %3), %%"REG_a" \n\t"
  /* mm0 = source row 0; block is biased back by one line */
  610. "movq (%1), %%mm0 \n\t"
  611. "sub %3, %2 \n\t"
  612. "1: \n\t"
  /* mm1 = row 1, mm2 = row 2; outputs 0 and 1 */
  613. "movq (%1, %3), %%mm1 \n\t"
  614. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  615. "add %%"REG_a", %1 \n\t"
  616. "psubusb %%mm6, %%mm1 \n\t"
  617. PAVGB" %%mm1, %%mm0 \n\t"
  618. PAVGB" %%mm2, %%mm1 \n\t"
  619. "movq %%mm0, (%2, %3) \n\t"
  620. "movq %%mm1, (%2, %%"REG_a") \n\t"
  /* mm1 = row 3, mm0 = row 4; outputs 2 and 3 */
  621. "movq (%1, %3), %%mm1 \n\t"
  622. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  623. "add %%"REG_a", %2 \n\t"
  624. "add %%"REG_a", %1 \n\t"
  625. "psubusb %%mm6, %%mm1 \n\t"
  626. PAVGB" %%mm1, %%mm2 \n\t"
  627. PAVGB" %%mm0, %%mm1 \n\t"
  628. "movq %%mm2, (%2, %3) \n\t"
  629. "movq %%mm1, (%2, %%"REG_a") \n\t"
  630. "add %%"REG_a", %2 \n\t"
  631. "subl $4, %0 \n\t"
  632. "jnz 1b \n\t"
  633. :"+g"(h), "+S"(pixels), "+D" (block)
  634. :"r" ((x86_reg)line_size)
  635. :"%"REG_a, "memory");
  636. }
  /*
   * "avg" of an 8-byte-wide block: each destination row is replaced by PAVGB
   * of its current contents and the matching source row.
   *
   * Operands: %0 = h, %1 = pixels (source), %2 = block (destination),
   * %3 = line_size (used as the stride for both pointers).
   *
   * One pass of the loop handles four rows and subtracts 4 from h; the loop
   * only exits when h reaches exactly zero.
   */
  637. static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  638. {
  639. __asm__ volatile(
  /* REG_a = 2 * line_size */
  640. "lea (%3, %3), %%"REG_a" \n\t"
  641. "1: \n\t"
  /* rows 0 and 1: load destination, blend with source, store back */
  642. "movq (%2), %%mm0 \n\t"
  643. "movq (%2, %3), %%mm1 \n\t"
  644. PAVGB" (%1), %%mm0 \n\t"
  645. PAVGB" (%1, %3), %%mm1 \n\t"
  646. "movq %%mm0, (%2) \n\t"
  647. "movq %%mm1, (%2, %3) \n\t"
  648. "add %%"REG_a", %1 \n\t"
  649. "add %%"REG_a", %2 \n\t"
  /* rows 2 and 3 */
  650. "movq (%2), %%mm0 \n\t"
  651. "movq (%2, %3), %%mm1 \n\t"
  652. PAVGB" (%1), %%mm0 \n\t"
  653. PAVGB" (%1, %3), %%mm1 \n\t"
  654. "add %%"REG_a", %1 \n\t"
  655. "movq %%mm0, (%2) \n\t"
  656. "movq %%mm1, (%2, %3) \n\t"
  657. "add %%"REG_a", %2 \n\t"
  658. "subl $4, %0 \n\t"
  659. "jnz 1b \n\t"
  660. :"+g"(h), "+S"(pixels), "+D"(block)
  661. :"r" ((x86_reg)line_size)
  662. :"%"REG_a, "memory");
  663. }
  /*
   * Horizontal two-tap "avg" for an 8-byte-wide block: PAVGB of the bytes at
   * pixels and pixels + 1 is PAVGB-ed again with the current destination row
   * and stored back to block.
   *
   * Operands: %0 = h, %1 = pixels (source), %2 = block (destination),
   * %3 = line_size (used as the stride for both pointers).
   *
   * One pass of the loop handles four rows and subtracts 4 from h; the loop
   * only exits when h reaches exactly zero.
   */
  664. static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  665. {
  666. __asm__ volatile(
  /* REG_a = 2 * line_size */
  667. "lea (%3, %3), %%"REG_a" \n\t"
  668. "1: \n\t"
  /* rows 0 and 1 */
  669. "movq (%1), %%mm0 \n\t"
  670. "movq (%1, %3), %%mm2 \n\t"
  671. PAVGB" 1(%1), %%mm0 \n\t"
  672. PAVGB" 1(%1, %3), %%mm2 \n\t"
  673. PAVGB" (%2), %%mm0 \n\t"
  674. PAVGB" (%2, %3), %%mm2 \n\t"
  675. "add %%"REG_a", %1 \n\t"
  676. "movq %%mm0, (%2) \n\t"
  677. "movq %%mm2, (%2, %3) \n\t"
  /* rows 2 and 3 */
  678. "movq (%1), %%mm0 \n\t"
  679. "movq (%1, %3), %%mm2 \n\t"
  680. PAVGB" 1(%1), %%mm0 \n\t"
  681. PAVGB" 1(%1, %3), %%mm2 \n\t"
  682. "add %%"REG_a", %2 \n\t"
  683. "add %%"REG_a", %1 \n\t"
  684. PAVGB" (%2), %%mm0 \n\t"
  685. PAVGB" (%2, %3), %%mm2 \n\t"
  686. "movq %%mm0, (%2) \n\t"
  687. "movq %%mm2, (%2, %3) \n\t"
  688. "add %%"REG_a", %2 \n\t"
  689. "subl $4, %0 \n\t"
  690. "jnz 1b \n\t"
  691. :"+g"(h), "+S"(pixels), "+D"(block)
  692. :"r" ((x86_reg)line_size)
  693. :"%"REG_a, "memory");
  694. }
  /*
   * Vertical two-tap "avg" for an 8-byte-wide block: PAVGB of source rows r
   * and r + 1 is PAVGB-ed again with the current destination row r and stored
   * back to block. h + 1 source rows are read in total.
   *
   * Operands: %0 = h, %1 = pixels (source), %2 = block (destination),
   * %3 = line_size (used as the stride for both pointers).
   *
   * block is moved back by one line before the loop so that destination
   * accesses can use the (%2, %3) and (%2, REG_a) addressing forms. The last
   * source row loaded in a pass stays in mm0 for the next pass. One pass
   * handles four rows; the loop only exits when h reaches exactly zero.
   */
  695. static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  696. {
  697. __asm__ volatile(
  /* REG_a = 2 * line_size */
  698. "lea (%3, %3), %%"REG_a" \n\t"
  /* mm0 = source row 0 */
  699. "movq (%1), %%mm0 \n\t"
  700. "sub %3, %2 \n\t"
  701. "1: \n\t"
  /* mm1 = row 1, mm2 = row 2; outputs 0 and 1 */
  702. "movq (%1, %3), %%mm1 \n\t"
  703. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  704. "add %%"REG_a", %1 \n\t"
  705. PAVGB" %%mm1, %%mm0 \n\t"
  706. PAVGB" %%mm2, %%mm1 \n\t"
  /* mm3/mm4 = current destination rows to blend with */
  707. "movq (%2, %3), %%mm3 \n\t"
  708. "movq (%2, %%"REG_a"), %%mm4 \n\t"
  709. PAVGB" %%mm3, %%mm0 \n\t"
  710. PAVGB" %%mm4, %%mm1 \n\t"
  711. "movq %%mm0, (%2, %3) \n\t"
  712. "movq %%mm1, (%2, %%"REG_a") \n\t"
  /* mm1 = row 3, mm0 = row 4; outputs 2 and 3 */
  713. "movq (%1, %3), %%mm1 \n\t"
  714. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  715. PAVGB" %%mm1, %%mm2 \n\t"
  716. PAVGB" %%mm0, %%mm1 \n\t"
  717. "add %%"REG_a", %2 \n\t"
  718. "add %%"REG_a", %1 \n\t"
  719. "movq (%2, %3), %%mm3 \n\t"
  720. "movq (%2, %%"REG_a"), %%mm4 \n\t"
  721. PAVGB" %%mm3, %%mm2 \n\t"
  722. PAVGB" %%mm4, %%mm1 \n\t"
  723. "movq %%mm2, (%2, %3) \n\t"
  724. "movq %%mm1, (%2, %%"REG_a") \n\t"
  725. "add %%"REG_a", %2 \n\t"
  726. "subl $4, %0 \n\t"
  727. "jnz 1b \n\t"
  728. :"+g"(h), "+S"(pixels), "+D"(block)
  729. :"r" ((x86_reg)line_size)
  730. :"%"REG_a, "memory");
  731. }
  732. /* Note this is not correctly rounded, but this function is only
  733. * used for B-frames so it does not matter. */
  /*
   * Diagonal (x and y) "avg" for an 8-byte-wide block, built from cascaded
   * PAVGB operations: each source row is first averaged with itself shifted
   * by one byte, adjacent row results are then averaged with each other, and
   * that value is averaged with the current destination row and stored.
   * In the first half of the loop body mm6 is subtracted with unsigned
   * saturation from one of the rows before averaging; the second half has no
   * such subtraction.
   * NOTE(review): MOVQ_BONE and ASMALIGN are defined outside this file;
   * presumably MOVQ_BONE loads 0x01 into every byte of mm6 and ASMALIGN(3)
   * aligns the loop label - confirm.
   *
   * Operands: %0 = h, %1 = pixels (source), %2 = block (destination),
   * %3 = line_size (used as the stride for both pointers).
   *
   * The horizontally averaged last row of a pass stays in mm0 for the next
   * pass. One pass handles four rows; the loop only exits when h reaches
   * exactly zero.
   */
  734. static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  735. {
  736. MOVQ_BONE(mm6);
  737. __asm__ volatile(
  /* REG_a = 2 * line_size */
  738. "lea (%3, %3), %%"REG_a" \n\t"
  /* mm0 = horizontal average of source row 0 */
  739. "movq (%1), %%mm0 \n\t"
  740. PAVGB" 1(%1), %%mm0 \n\t"
  741. ASMALIGN(3)
  742. "1: \n\t"
  /* mm1 = row 1, mm2 = row 2 (mm6 subtracted), both averaged horizontally */
  743. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  744. "movq (%1, %3), %%mm1 \n\t"
  745. "psubusb %%mm6, %%mm2 \n\t"
  746. PAVGB" 1(%1, %3), %%mm1 \n\t"
  747. PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
  748. "add %%"REG_a", %1 \n\t"
  /* vertical step, then blend with destination rows 0 and 1 */
  749. PAVGB" %%mm1, %%mm0 \n\t"
  750. PAVGB" %%mm2, %%mm1 \n\t"
  751. PAVGB" (%2), %%mm0 \n\t"
  752. PAVGB" (%2, %3), %%mm1 \n\t"
  753. "movq %%mm0, (%2) \n\t"
  754. "movq %%mm1, (%2, %3) \n\t"
  /* mm1 = row 3, mm0 = row 4, both averaged horizontally */
  755. "movq (%1, %3), %%mm1 \n\t"
  756. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  757. PAVGB" 1(%1, %3), %%mm1 \n\t"
  758. PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
  759. "add %%"REG_a", %2 \n\t"
  760. "add %%"REG_a", %1 \n\t"
  /* vertical step, then blend with destination rows 2 and 3 */
  761. PAVGB" %%mm1, %%mm2 \n\t"
  762. PAVGB" %%mm0, %%mm1 \n\t"
  763. PAVGB" (%2), %%mm2 \n\t"
  764. PAVGB" (%2, %3), %%mm1 \n\t"
  765. "movq %%mm2, (%2) \n\t"
  766. "movq %%mm1, (%2, %3) \n\t"
  767. "add %%"REG_a", %2 \n\t"
  768. "subl $4, %0 \n\t"
  769. "jnz 1b \n\t"
  770. :"+g"(h), "+S"(pixels), "+D"(block)
  771. :"r" ((x86_reg)line_size)
  772. :"%"REG_a, "memory");
  773. }
  /*
   * "avg" of a 4-byte-wide block: each destination row is replaced by PAVGB
   * of its current contents and the matching source row.
   *
   * Unlike the other routines here, the row loop is written in C: each asm
   * statement handles four rows (offsets 0, line_size, 2 * line_size and
   * 3 * line_size), after which both pointers advance by four lines and h
   * drops by 4. The do/while form means at least four rows are always
   * processed, and the loop ends once h is no longer positive.
   *
   * Asm operands: %0 = pixels (source), %1 = block (destination),
   * %2 = line_size, %3 = 3 * line_size.
   */
  774. static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  775. {
  776. do {
  777. __asm__ volatile(
  /* load four 4-byte destination rows */
  778. "movd (%1), %%mm0 \n\t"
  779. "movd (%1, %2), %%mm1 \n\t"
  780. "movd (%1, %2, 2), %%mm2 \n\t"
  781. "movd (%1, %3), %%mm3 \n\t"
  /* blend each with the corresponding source row */
  782. PAVGB" (%0), %%mm0 \n\t"
  783. PAVGB" (%0, %2), %%mm1 \n\t"
  784. PAVGB" (%0, %2, 2), %%mm2 \n\t"
  785. PAVGB" (%0, %3), %%mm3 \n\t"
  /* write the four rows back */
  786. "movd %%mm0, (%1) \n\t"
  787. "movd %%mm1, (%1, %2) \n\t"
  788. "movd %%mm2, (%1, %2, 2) \n\t"
  789. "movd %%mm3, (%1, %3) \n\t"
  790. ::"S"(pixels), "D"(block),
  791. "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size)
  792. :"memory");
  793. block += 4*line_size;
  794. pixels += 4*line_size;
  795. h -= 4;
  796. } while(h > 0);
  797. }
  798. //FIXME the following could be optimized too ...
  799. static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  800. DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
  801. DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
  802. }
  803. static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  804. DEF(put_pixels8_y2)(block , pixels , line_size, h);
  805. DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
  806. }
  807. static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  808. DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
  809. DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
  810. }
  811. static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  812. DEF(avg_pixels8)(block , pixels , line_size, h);
  813. DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
  814. }
  815. static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  816. DEF(avg_pixels8_x2)(block , pixels , line_size, h);
  817. DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
  818. }
  819. static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  820. DEF(avg_pixels8_y2)(block , pixels , line_size, h);
  821. DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
  822. }
  823. static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  824. DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
  825. DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
  826. }
  /*
   * QPEL_2TAP_L3(OPNAME) generates two functions, OPNAME##2tap_qpel16_l3 and
   * OPNAME##2tap_qpel8_l3 (16 and 8 bytes per row respectively).
   *
   * Per row, the generated code computes
   *     PAVGB(PAVGB(src[off1], src[off2]), src[0])
   * applies STORE_OP to the result and the destination address, and then
   * stores the result to the destination. One row is handled per loop pass;
   * src advances by stride and h is decremented until it reaches zero.
   *
   * Asm operands: %0 = h, %1 = src, %2 = off1, %3 = off2,
   * %4 = dst - src (so the destination is addressed as (%1, %4)),
   * %5 = stride.
   *
   * STORE_OP is defined just before each instantiation below: for "avg_" it
   * expands to a PAVGB of the result with the bytes already at the
   * destination; for "put_" it expands to nothing.
   */
  827. #define QPEL_2TAP_L3(OPNAME) \
  828. static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
  829. __asm__ volatile(\
  830. "1: \n\t"\
  831. "movq (%1,%2), %%mm0 \n\t"\
  832. "movq 8(%1,%2), %%mm1 \n\t"\
  833. PAVGB" (%1,%3), %%mm0 \n\t"\
  834. PAVGB" 8(%1,%3), %%mm1 \n\t"\
  835. PAVGB" (%1), %%mm0 \n\t"\
  836. PAVGB" 8(%1), %%mm1 \n\t"\
  837. STORE_OP( (%1,%4),%%mm0)\
  838. STORE_OP(8(%1,%4),%%mm1)\
  839. "movq %%mm0, (%1,%4) \n\t"\
  840. "movq %%mm1, 8(%1,%4) \n\t"\
  841. "add %5, %1 \n\t"\
  842. "decl %0 \n\t"\
  843. "jnz 1b \n\t"\
  844. :"+g"(h), "+r"(src)\
  845. :"r"((x86_reg)off1), "r"((x86_reg)off2),\
  846. "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
  847. :"memory"\
  848. );\
  849. }\
  850. static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
  851. __asm__ volatile(\
  852. "1: \n\t"\
  853. "movq (%1,%2), %%mm0 \n\t"\
  854. PAVGB" (%1,%3), %%mm0 \n\t"\
  855. PAVGB" (%1), %%mm0 \n\t"\
  856. STORE_OP((%1,%4),%%mm0)\
  857. "movq %%mm0, (%1,%4) \n\t"\
  858. "add %5, %1 \n\t"\
  859. "decl %0 \n\t"\
  860. "jnz 1b \n\t"\
  861. :"+g"(h), "+r"(src)\
  862. :"r"((x86_reg)off1), "r"((x86_reg)off2),\
  863. "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
  864. :"memory"\
  865. );\
  866. }
  /* avg_ variants: blend the result with the existing destination bytes */
  867. #define STORE_OP(a,b) PAVGB" "#a","#b" \n\t"
  868. QPEL_2TAP_L3(avg_)
  869. #undef STORE_OP
  /* put_ variants: no blend, the result simply overwrites the destination */
  870. #define STORE_OP(a,b)
  871. QPEL_2TAP_L3(put_)
  872. #undef STORE_OP
  873. #undef QPEL_2TAP_L3