/* dsputil_mmx_avg.h */
  /*
   * DSP utils : average functions are compiled twice for 3dnow/mmx2
   * Copyright (c) 2000, 2001 Fabrice Bellard.
   * Copyright (c) 2002-2004 Michael Niedermayer
   *
   * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
   * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
   * and improved by Zdenek Kabelac <kabi@users.sf.net>
   *
   * This file is part of FFmpeg.
   *
   * FFmpeg is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
   * License as published by the Free Software Foundation; either
   * version 2.1 of the License, or (at your option) any later version.
   *
   * FFmpeg is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   * Lesser General Public License for more details.
   *
   * You should have received a copy of the GNU Lesser General Public
   * License along with FFmpeg; if not, write to the Free Software
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
  /* This header intentionally has no multiple inclusion guards. It is meant to
   * be included multiple times and generates different code depending on the
   * value of certain #defines. */
  /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
   * clobber bug - now it will work with 2.95.2 and also with -fPIC
   */
  32. static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  33. {
  34. asm volatile(
  35. "lea (%3, %3), %%"REG_a" \n\t"
  36. "1: \n\t"
  37. "movq (%1), %%mm0 \n\t"
  38. "movq (%1, %3), %%mm1 \n\t"
  39. PAVGB" 1(%1), %%mm0 \n\t"
  40. PAVGB" 1(%1, %3), %%mm1 \n\t"
  41. "movq %%mm0, (%2) \n\t"
  42. "movq %%mm1, (%2, %3) \n\t"
  43. "add %%"REG_a", %1 \n\t"
  44. "add %%"REG_a", %2 \n\t"
  45. "movq (%1), %%mm0 \n\t"
  46. "movq (%1, %3), %%mm1 \n\t"
  47. PAVGB" 1(%1), %%mm0 \n\t"
  48. PAVGB" 1(%1, %3), %%mm1 \n\t"
  49. "add %%"REG_a", %1 \n\t"
  50. "movq %%mm0, (%2) \n\t"
  51. "movq %%mm1, (%2, %3) \n\t"
  52. "add %%"REG_a", %2 \n\t"
  53. "subl $4, %0 \n\t"
  54. "jnz 1b \n\t"
  55. :"+g"(h), "+S"(pixels), "+D"(block)
  56. :"r" ((x86_reg)line_size)
  57. :"%"REG_a, "memory");
  58. }
  59. static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  60. {
  61. asm volatile(
  62. "testl $1, %0 \n\t"
  63. " jz 1f \n\t"
  64. "movd (%1), %%mm0 \n\t"
  65. "movd (%2), %%mm1 \n\t"
  66. "add %4, %1 \n\t"
  67. "add $4, %2 \n\t"
  68. PAVGB" %%mm1, %%mm0 \n\t"
  69. "movd %%mm0, (%3) \n\t"
  70. "add %5, %3 \n\t"
  71. "decl %0 \n\t"
  72. "1: \n\t"
  73. "movd (%1), %%mm0 \n\t"
  74. "add %4, %1 \n\t"
  75. "movd (%1), %%mm1 \n\t"
  76. "movd (%2), %%mm2 \n\t"
  77. "movd 4(%2), %%mm3 \n\t"
  78. "add %4, %1 \n\t"
  79. PAVGB" %%mm2, %%mm0 \n\t"
  80. PAVGB" %%mm3, %%mm1 \n\t"
  81. "movd %%mm0, (%3) \n\t"
  82. "add %5, %3 \n\t"
  83. "movd %%mm1, (%3) \n\t"
  84. "add %5, %3 \n\t"
  85. "movd (%1), %%mm0 \n\t"
  86. "add %4, %1 \n\t"
  87. "movd (%1), %%mm1 \n\t"
  88. "movd 8(%2), %%mm2 \n\t"
  89. "movd 12(%2), %%mm3 \n\t"
  90. "add %4, %1 \n\t"
  91. PAVGB" %%mm2, %%mm0 \n\t"
  92. PAVGB" %%mm3, %%mm1 \n\t"
  93. "movd %%mm0, (%3) \n\t"
  94. "add %5, %3 \n\t"
  95. "movd %%mm1, (%3) \n\t"
  96. "add %5, %3 \n\t"
  97. "add $16, %2 \n\t"
  98. "subl $4, %0 \n\t"
  99. "jnz 1b \n\t"
  100. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  101. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  102. #else
  103. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  104. #endif
  105. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  106. :"memory");
  107. }
  108. static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  109. {
  110. asm volatile(
  111. "testl $1, %0 \n\t"
  112. " jz 1f \n\t"
  113. "movq (%1), %%mm0 \n\t"
  114. "movq (%2), %%mm1 \n\t"
  115. "add %4, %1 \n\t"
  116. "add $8, %2 \n\t"
  117. PAVGB" %%mm1, %%mm0 \n\t"
  118. "movq %%mm0, (%3) \n\t"
  119. "add %5, %3 \n\t"
  120. "decl %0 \n\t"
  121. "1: \n\t"
  122. "movq (%1), %%mm0 \n\t"
  123. "add %4, %1 \n\t"
  124. "movq (%1), %%mm1 \n\t"
  125. "add %4, %1 \n\t"
  126. PAVGB" (%2), %%mm0 \n\t"
  127. PAVGB" 8(%2), %%mm1 \n\t"
  128. "movq %%mm0, (%3) \n\t"
  129. "add %5, %3 \n\t"
  130. "movq %%mm1, (%3) \n\t"
  131. "add %5, %3 \n\t"
  132. "movq (%1), %%mm0 \n\t"
  133. "add %4, %1 \n\t"
  134. "movq (%1), %%mm1 \n\t"
  135. "add %4, %1 \n\t"
  136. PAVGB" 16(%2), %%mm0 \n\t"
  137. PAVGB" 24(%2), %%mm1 \n\t"
  138. "movq %%mm0, (%3) \n\t"
  139. "add %5, %3 \n\t"
  140. "movq %%mm1, (%3) \n\t"
  141. "add %5, %3 \n\t"
  142. "add $32, %2 \n\t"
  143. "subl $4, %0 \n\t"
  144. "jnz 1b \n\t"
  145. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  146. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  147. #else
  148. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  149. #endif
  150. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  151. :"memory");
  152. //the following should be used, though better not with gcc ...
  153. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  154. :"r"(src1Stride), "r"(dstStride)
  155. :"memory");*/
  156. }
  157. static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  158. {
  159. asm volatile(
  160. "pcmpeqb %%mm6, %%mm6 \n\t"
  161. "testl $1, %0 \n\t"
  162. " jz 1f \n\t"
  163. "movq (%1), %%mm0 \n\t"
  164. "movq (%2), %%mm1 \n\t"
  165. "add %4, %1 \n\t"
  166. "add $8, %2 \n\t"
  167. "pxor %%mm6, %%mm0 \n\t"
  168. "pxor %%mm6, %%mm1 \n\t"
  169. PAVGB" %%mm1, %%mm0 \n\t"
  170. "pxor %%mm6, %%mm0 \n\t"
  171. "movq %%mm0, (%3) \n\t"
  172. "add %5, %3 \n\t"
  173. "decl %0 \n\t"
  174. "1: \n\t"
  175. "movq (%1), %%mm0 \n\t"
  176. "add %4, %1 \n\t"
  177. "movq (%1), %%mm1 \n\t"
  178. "add %4, %1 \n\t"
  179. "movq (%2), %%mm2 \n\t"
  180. "movq 8(%2), %%mm3 \n\t"
  181. "pxor %%mm6, %%mm0 \n\t"
  182. "pxor %%mm6, %%mm1 \n\t"
  183. "pxor %%mm6, %%mm2 \n\t"
  184. "pxor %%mm6, %%mm3 \n\t"
  185. PAVGB" %%mm2, %%mm0 \n\t"
  186. PAVGB" %%mm3, %%mm1 \n\t"
  187. "pxor %%mm6, %%mm0 \n\t"
  188. "pxor %%mm6, %%mm1 \n\t"
  189. "movq %%mm0, (%3) \n\t"
  190. "add %5, %3 \n\t"
  191. "movq %%mm1, (%3) \n\t"
  192. "add %5, %3 \n\t"
  193. "movq (%1), %%mm0 \n\t"
  194. "add %4, %1 \n\t"
  195. "movq (%1), %%mm1 \n\t"
  196. "add %4, %1 \n\t"
  197. "movq 16(%2), %%mm2 \n\t"
  198. "movq 24(%2), %%mm3 \n\t"
  199. "pxor %%mm6, %%mm0 \n\t"
  200. "pxor %%mm6, %%mm1 \n\t"
  201. "pxor %%mm6, %%mm2 \n\t"
  202. "pxor %%mm6, %%mm3 \n\t"
  203. PAVGB" %%mm2, %%mm0 \n\t"
  204. PAVGB" %%mm3, %%mm1 \n\t"
  205. "pxor %%mm6, %%mm0 \n\t"
  206. "pxor %%mm6, %%mm1 \n\t"
  207. "movq %%mm0, (%3) \n\t"
  208. "add %5, %3 \n\t"
  209. "movq %%mm1, (%3) \n\t"
  210. "add %5, %3 \n\t"
  211. "add $32, %2 \n\t"
  212. "subl $4, %0 \n\t"
  213. "jnz 1b \n\t"
  214. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  215. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  216. #else
  217. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  218. #endif
  219. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  220. :"memory");
  221. //the following should be used, though better not with gcc ...
  222. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  223. :"r"(src1Stride), "r"(dstStride)
  224. :"memory");*/
  225. }
  226. static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  227. {
  228. asm volatile(
  229. "testl $1, %0 \n\t"
  230. " jz 1f \n\t"
  231. "movd (%1), %%mm0 \n\t"
  232. "movd (%2), %%mm1 \n\t"
  233. "add %4, %1 \n\t"
  234. "add $4, %2 \n\t"
  235. PAVGB" %%mm1, %%mm0 \n\t"
  236. PAVGB" (%3), %%mm0 \n\t"
  237. "movd %%mm0, (%3) \n\t"
  238. "add %5, %3 \n\t"
  239. "decl %0 \n\t"
  240. "1: \n\t"
  241. "movd (%1), %%mm0 \n\t"
  242. "add %4, %1 \n\t"
  243. "movd (%1), %%mm1 \n\t"
  244. "add %4, %1 \n\t"
  245. PAVGB" (%2), %%mm0 \n\t"
  246. PAVGB" 4(%2), %%mm1 \n\t"
  247. PAVGB" (%3), %%mm0 \n\t"
  248. "movd %%mm0, (%3) \n\t"
  249. "add %5, %3 \n\t"
  250. PAVGB" (%3), %%mm1 \n\t"
  251. "movd %%mm1, (%3) \n\t"
  252. "add %5, %3 \n\t"
  253. "movd (%1), %%mm0 \n\t"
  254. "add %4, %1 \n\t"
  255. "movd (%1), %%mm1 \n\t"
  256. "add %4, %1 \n\t"
  257. PAVGB" 8(%2), %%mm0 \n\t"
  258. PAVGB" 12(%2), %%mm1 \n\t"
  259. PAVGB" (%3), %%mm0 \n\t"
  260. "movd %%mm0, (%3) \n\t"
  261. "add %5, %3 \n\t"
  262. PAVGB" (%3), %%mm1 \n\t"
  263. "movd %%mm1, (%3) \n\t"
  264. "add %5, %3 \n\t"
  265. "add $16, %2 \n\t"
  266. "subl $4, %0 \n\t"
  267. "jnz 1b \n\t"
  268. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  269. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  270. #else
  271. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  272. #endif
  273. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  274. :"memory");
  275. }
  276. static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  277. {
  278. asm volatile(
  279. "testl $1, %0 \n\t"
  280. " jz 1f \n\t"
  281. "movq (%1), %%mm0 \n\t"
  282. "movq (%2), %%mm1 \n\t"
  283. "add %4, %1 \n\t"
  284. "add $8, %2 \n\t"
  285. PAVGB" %%mm1, %%mm0 \n\t"
  286. PAVGB" (%3), %%mm0 \n\t"
  287. "movq %%mm0, (%3) \n\t"
  288. "add %5, %3 \n\t"
  289. "decl %0 \n\t"
  290. "1: \n\t"
  291. "movq (%1), %%mm0 \n\t"
  292. "add %4, %1 \n\t"
  293. "movq (%1), %%mm1 \n\t"
  294. "add %4, %1 \n\t"
  295. PAVGB" (%2), %%mm0 \n\t"
  296. PAVGB" 8(%2), %%mm1 \n\t"
  297. PAVGB" (%3), %%mm0 \n\t"
  298. "movq %%mm0, (%3) \n\t"
  299. "add %5, %3 \n\t"
  300. PAVGB" (%3), %%mm1 \n\t"
  301. "movq %%mm1, (%3) \n\t"
  302. "add %5, %3 \n\t"
  303. "movq (%1), %%mm0 \n\t"
  304. "add %4, %1 \n\t"
  305. "movq (%1), %%mm1 \n\t"
  306. "add %4, %1 \n\t"
  307. PAVGB" 16(%2), %%mm0 \n\t"
  308. PAVGB" 24(%2), %%mm1 \n\t"
  309. PAVGB" (%3), %%mm0 \n\t"
  310. "movq %%mm0, (%3) \n\t"
  311. "add %5, %3 \n\t"
  312. PAVGB" (%3), %%mm1 \n\t"
  313. "movq %%mm1, (%3) \n\t"
  314. "add %5, %3 \n\t"
  315. "add $32, %2 \n\t"
  316. "subl $4, %0 \n\t"
  317. "jnz 1b \n\t"
  318. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  319. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  320. #else
  321. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  322. #endif
  323. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  324. :"memory");
  325. //the following should be used, though better not with gcc ...
  326. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  327. :"r"(src1Stride), "r"(dstStride)
  328. :"memory");*/
  329. }
  330. static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  331. {
  332. asm volatile(
  333. "lea (%3, %3), %%"REG_a" \n\t"
  334. "1: \n\t"
  335. "movq (%1), %%mm0 \n\t"
  336. "movq (%1, %3), %%mm1 \n\t"
  337. "movq 8(%1), %%mm2 \n\t"
  338. "movq 8(%1, %3), %%mm3 \n\t"
  339. PAVGB" 1(%1), %%mm0 \n\t"
  340. PAVGB" 1(%1, %3), %%mm1 \n\t"
  341. PAVGB" 9(%1), %%mm2 \n\t"
  342. PAVGB" 9(%1, %3), %%mm3 \n\t"
  343. "movq %%mm0, (%2) \n\t"
  344. "movq %%mm1, (%2, %3) \n\t"
  345. "movq %%mm2, 8(%2) \n\t"
  346. "movq %%mm3, 8(%2, %3) \n\t"
  347. "add %%"REG_a", %1 \n\t"
  348. "add %%"REG_a", %2 \n\t"
  349. "movq (%1), %%mm0 \n\t"
  350. "movq (%1, %3), %%mm1 \n\t"
  351. "movq 8(%1), %%mm2 \n\t"
  352. "movq 8(%1, %3), %%mm3 \n\t"
  353. PAVGB" 1(%1), %%mm0 \n\t"
  354. PAVGB" 1(%1, %3), %%mm1 \n\t"
  355. PAVGB" 9(%1), %%mm2 \n\t"
  356. PAVGB" 9(%1, %3), %%mm3 \n\t"
  357. "add %%"REG_a", %1 \n\t"
  358. "movq %%mm0, (%2) \n\t"
  359. "movq %%mm1, (%2, %3) \n\t"
  360. "movq %%mm2, 8(%2) \n\t"
  361. "movq %%mm3, 8(%2, %3) \n\t"
  362. "add %%"REG_a", %2 \n\t"
  363. "subl $4, %0 \n\t"
  364. "jnz 1b \n\t"
  365. :"+g"(h), "+S"(pixels), "+D"(block)
  366. :"r" ((x86_reg)line_size)
  367. :"%"REG_a, "memory");
  368. }
  369. static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  370. {
  371. asm volatile(
  372. "testl $1, %0 \n\t"
  373. " jz 1f \n\t"
  374. "movq (%1), %%mm0 \n\t"
  375. "movq 8(%1), %%mm1 \n\t"
  376. PAVGB" (%2), %%mm0 \n\t"
  377. PAVGB" 8(%2), %%mm1 \n\t"
  378. "add %4, %1 \n\t"
  379. "add $16, %2 \n\t"
  380. "movq %%mm0, (%3) \n\t"
  381. "movq %%mm1, 8(%3) \n\t"
  382. "add %5, %3 \n\t"
  383. "decl %0 \n\t"
  384. "1: \n\t"
  385. "movq (%1), %%mm0 \n\t"
  386. "movq 8(%1), %%mm1 \n\t"
  387. "add %4, %1 \n\t"
  388. PAVGB" (%2), %%mm0 \n\t"
  389. PAVGB" 8(%2), %%mm1 \n\t"
  390. "movq %%mm0, (%3) \n\t"
  391. "movq %%mm1, 8(%3) \n\t"
  392. "add %5, %3 \n\t"
  393. "movq (%1), %%mm0 \n\t"
  394. "movq 8(%1), %%mm1 \n\t"
  395. "add %4, %1 \n\t"
  396. PAVGB" 16(%2), %%mm0 \n\t"
  397. PAVGB" 24(%2), %%mm1 \n\t"
  398. "movq %%mm0, (%3) \n\t"
  399. "movq %%mm1, 8(%3) \n\t"
  400. "add %5, %3 \n\t"
  401. "add $32, %2 \n\t"
  402. "subl $2, %0 \n\t"
  403. "jnz 1b \n\t"
  404. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  405. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  406. #else
  407. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  408. #endif
  409. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  410. :"memory");
  411. //the following should be used, though better not with gcc ...
  412. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  413. :"r"(src1Stride), "r"(dstStride)
  414. :"memory");*/
  415. }
  416. static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  417. {
  418. asm volatile(
  419. "testl $1, %0 \n\t"
  420. " jz 1f \n\t"
  421. "movq (%1), %%mm0 \n\t"
  422. "movq 8(%1), %%mm1 \n\t"
  423. PAVGB" (%2), %%mm0 \n\t"
  424. PAVGB" 8(%2), %%mm1 \n\t"
  425. "add %4, %1 \n\t"
  426. "add $16, %2 \n\t"
  427. PAVGB" (%3), %%mm0 \n\t"
  428. PAVGB" 8(%3), %%mm1 \n\t"
  429. "movq %%mm0, (%3) \n\t"
  430. "movq %%mm1, 8(%3) \n\t"
  431. "add %5, %3 \n\t"
  432. "decl %0 \n\t"
  433. "1: \n\t"
  434. "movq (%1), %%mm0 \n\t"
  435. "movq 8(%1), %%mm1 \n\t"
  436. "add %4, %1 \n\t"
  437. PAVGB" (%2), %%mm0 \n\t"
  438. PAVGB" 8(%2), %%mm1 \n\t"
  439. PAVGB" (%3), %%mm0 \n\t"
  440. PAVGB" 8(%3), %%mm1 \n\t"
  441. "movq %%mm0, (%3) \n\t"
  442. "movq %%mm1, 8(%3) \n\t"
  443. "add %5, %3 \n\t"
  444. "movq (%1), %%mm0 \n\t"
  445. "movq 8(%1), %%mm1 \n\t"
  446. "add %4, %1 \n\t"
  447. PAVGB" 16(%2), %%mm0 \n\t"
  448. PAVGB" 24(%2), %%mm1 \n\t"
  449. PAVGB" (%3), %%mm0 \n\t"
  450. PAVGB" 8(%3), %%mm1 \n\t"
  451. "movq %%mm0, (%3) \n\t"
  452. "movq %%mm1, 8(%3) \n\t"
  453. "add %5, %3 \n\t"
  454. "add $32, %2 \n\t"
  455. "subl $2, %0 \n\t"
  456. "jnz 1b \n\t"
  457. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  458. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  459. #else
  460. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  461. #endif
  462. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  463. :"memory");
  464. //the following should be used, though better not with gcc ...
  465. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  466. :"r"(src1Stride), "r"(dstStride)
  467. :"memory");*/
  468. }
  469. static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  470. {
  471. asm volatile(
  472. "pcmpeqb %%mm6, %%mm6 \n\t"
  473. "testl $1, %0 \n\t"
  474. " jz 1f \n\t"
  475. "movq (%1), %%mm0 \n\t"
  476. "movq 8(%1), %%mm1 \n\t"
  477. "movq (%2), %%mm2 \n\t"
  478. "movq 8(%2), %%mm3 \n\t"
  479. "pxor %%mm6, %%mm0 \n\t"
  480. "pxor %%mm6, %%mm1 \n\t"
  481. "pxor %%mm6, %%mm2 \n\t"
  482. "pxor %%mm6, %%mm3 \n\t"
  483. PAVGB" %%mm2, %%mm0 \n\t"
  484. PAVGB" %%mm3, %%mm1 \n\t"
  485. "pxor %%mm6, %%mm0 \n\t"
  486. "pxor %%mm6, %%mm1 \n\t"
  487. "add %4, %1 \n\t"
  488. "add $16, %2 \n\t"
  489. "movq %%mm0, (%3) \n\t"
  490. "movq %%mm1, 8(%3) \n\t"
  491. "add %5, %3 \n\t"
  492. "decl %0 \n\t"
  493. "1: \n\t"
  494. "movq (%1), %%mm0 \n\t"
  495. "movq 8(%1), %%mm1 \n\t"
  496. "add %4, %1 \n\t"
  497. "movq (%2), %%mm2 \n\t"
  498. "movq 8(%2), %%mm3 \n\t"
  499. "pxor %%mm6, %%mm0 \n\t"
  500. "pxor %%mm6, %%mm1 \n\t"
  501. "pxor %%mm6, %%mm2 \n\t"
  502. "pxor %%mm6, %%mm3 \n\t"
  503. PAVGB" %%mm2, %%mm0 \n\t"
  504. PAVGB" %%mm3, %%mm1 \n\t"
  505. "pxor %%mm6, %%mm0 \n\t"
  506. "pxor %%mm6, %%mm1 \n\t"
  507. "movq %%mm0, (%3) \n\t"
  508. "movq %%mm1, 8(%3) \n\t"
  509. "add %5, %3 \n\t"
  510. "movq (%1), %%mm0 \n\t"
  511. "movq 8(%1), %%mm1 \n\t"
  512. "add %4, %1 \n\t"
  513. "movq 16(%2), %%mm2 \n\t"
  514. "movq 24(%2), %%mm3 \n\t"
  515. "pxor %%mm6, %%mm0 \n\t"
  516. "pxor %%mm6, %%mm1 \n\t"
  517. "pxor %%mm6, %%mm2 \n\t"
  518. "pxor %%mm6, %%mm3 \n\t"
  519. PAVGB" %%mm2, %%mm0 \n\t"
  520. PAVGB" %%mm3, %%mm1 \n\t"
  521. "pxor %%mm6, %%mm0 \n\t"
  522. "pxor %%mm6, %%mm1 \n\t"
  523. "movq %%mm0, (%3) \n\t"
  524. "movq %%mm1, 8(%3) \n\t"
  525. "add %5, %3 \n\t"
  526. "add $32, %2 \n\t"
  527. "subl $2, %0 \n\t"
  528. "jnz 1b \n\t"
  529. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  530. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  531. #else
  532. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  533. #endif
  534. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  535. :"memory");
  536. //the following should be used, though better not with gcc ...
  537. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  538. :"r"(src1Stride), "r"(dstStride)
  539. :"memory");*/
  540. }
  541. /* GL: this function does incorrect rounding if overflow */
  542. static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  543. {
  544. MOVQ_BONE(mm6);
  545. asm volatile(
  546. "lea (%3, %3), %%"REG_a" \n\t"
  547. "1: \n\t"
  548. "movq (%1), %%mm0 \n\t"
  549. "movq (%1, %3), %%mm2 \n\t"
  550. "movq 1(%1), %%mm1 \n\t"
  551. "movq 1(%1, %3), %%mm3 \n\t"
  552. "add %%"REG_a", %1 \n\t"
  553. "psubusb %%mm6, %%mm0 \n\t"
  554. "psubusb %%mm6, %%mm2 \n\t"
  555. PAVGB" %%mm1, %%mm0 \n\t"
  556. PAVGB" %%mm3, %%mm2 \n\t"
  557. "movq %%mm0, (%2) \n\t"
  558. "movq %%mm2, (%2, %3) \n\t"
  559. "movq (%1), %%mm0 \n\t"
  560. "movq 1(%1), %%mm1 \n\t"
  561. "movq (%1, %3), %%mm2 \n\t"
  562. "movq 1(%1, %3), %%mm3 \n\t"
  563. "add %%"REG_a", %2 \n\t"
  564. "add %%"REG_a", %1 \n\t"
  565. "psubusb %%mm6, %%mm0 \n\t"
  566. "psubusb %%mm6, %%mm2 \n\t"
  567. PAVGB" %%mm1, %%mm0 \n\t"
  568. PAVGB" %%mm3, %%mm2 \n\t"
  569. "movq %%mm0, (%2) \n\t"
  570. "movq %%mm2, (%2, %3) \n\t"
  571. "add %%"REG_a", %2 \n\t"
  572. "subl $4, %0 \n\t"
  573. "jnz 1b \n\t"
  574. :"+g"(h), "+S"(pixels), "+D"(block)
  575. :"r" ((x86_reg)line_size)
  576. :"%"REG_a, "memory");
  577. }
  578. static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  579. {
  580. asm volatile(
  581. "lea (%3, %3), %%"REG_a" \n\t"
  582. "movq (%1), %%mm0 \n\t"
  583. "sub %3, %2 \n\t"
  584. "1: \n\t"
  585. "movq (%1, %3), %%mm1 \n\t"
  586. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  587. "add %%"REG_a", %1 \n\t"
  588. PAVGB" %%mm1, %%mm0 \n\t"
  589. PAVGB" %%mm2, %%mm1 \n\t"
  590. "movq %%mm0, (%2, %3) \n\t"
  591. "movq %%mm1, (%2, %%"REG_a") \n\t"
  592. "movq (%1, %3), %%mm1 \n\t"
  593. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  594. "add %%"REG_a", %2 \n\t"
  595. "add %%"REG_a", %1 \n\t"
  596. PAVGB" %%mm1, %%mm2 \n\t"
  597. PAVGB" %%mm0, %%mm1 \n\t"
  598. "movq %%mm2, (%2, %3) \n\t"
  599. "movq %%mm1, (%2, %%"REG_a") \n\t"
  600. "add %%"REG_a", %2 \n\t"
  601. "subl $4, %0 \n\t"
  602. "jnz 1b \n\t"
  603. :"+g"(h), "+S"(pixels), "+D" (block)
  604. :"r" ((x86_reg)line_size)
  605. :"%"REG_a, "memory");
  606. }
  607. /* GL: this function does incorrect rounding if overflow */
  608. static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  609. {
  610. MOVQ_BONE(mm6);
  611. asm volatile(
  612. "lea (%3, %3), %%"REG_a" \n\t"
  613. "movq (%1), %%mm0 \n\t"
  614. "sub %3, %2 \n\t"
  615. "1: \n\t"
  616. "movq (%1, %3), %%mm1 \n\t"
  617. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  618. "add %%"REG_a", %1 \n\t"
  619. "psubusb %%mm6, %%mm1 \n\t"
  620. PAVGB" %%mm1, %%mm0 \n\t"
  621. PAVGB" %%mm2, %%mm1 \n\t"
  622. "movq %%mm0, (%2, %3) \n\t"
  623. "movq %%mm1, (%2, %%"REG_a") \n\t"
  624. "movq (%1, %3), %%mm1 \n\t"
  625. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  626. "add %%"REG_a", %2 \n\t"
  627. "add %%"REG_a", %1 \n\t"
  628. "psubusb %%mm6, %%mm1 \n\t"
  629. PAVGB" %%mm1, %%mm2 \n\t"
  630. PAVGB" %%mm0, %%mm1 \n\t"
  631. "movq %%mm2, (%2, %3) \n\t"
  632. "movq %%mm1, (%2, %%"REG_a") \n\t"
  633. "add %%"REG_a", %2 \n\t"
  634. "subl $4, %0 \n\t"
  635. "jnz 1b \n\t"
  636. :"+g"(h), "+S"(pixels), "+D" (block)
  637. :"r" ((x86_reg)line_size)
  638. :"%"REG_a, "memory");
  639. }
  640. static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  641. {
  642. asm volatile(
  643. "lea (%3, %3), %%"REG_a" \n\t"
  644. "1: \n\t"
  645. "movq (%2), %%mm0 \n\t"
  646. "movq (%2, %3), %%mm1 \n\t"
  647. PAVGB" (%1), %%mm0 \n\t"
  648. PAVGB" (%1, %3), %%mm1 \n\t"
  649. "movq %%mm0, (%2) \n\t"
  650. "movq %%mm1, (%2, %3) \n\t"
  651. "add %%"REG_a", %1 \n\t"
  652. "add %%"REG_a", %2 \n\t"
  653. "movq (%2), %%mm0 \n\t"
  654. "movq (%2, %3), %%mm1 \n\t"
  655. PAVGB" (%1), %%mm0 \n\t"
  656. PAVGB" (%1, %3), %%mm1 \n\t"
  657. "add %%"REG_a", %1 \n\t"
  658. "movq %%mm0, (%2) \n\t"
  659. "movq %%mm1, (%2, %3) \n\t"
  660. "add %%"REG_a", %2 \n\t"
  661. "subl $4, %0 \n\t"
  662. "jnz 1b \n\t"
  663. :"+g"(h), "+S"(pixels), "+D"(block)
  664. :"r" ((x86_reg)line_size)
  665. :"%"REG_a, "memory");
  666. }
  667. static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  668. {
  669. asm volatile(
  670. "lea (%3, %3), %%"REG_a" \n\t"
  671. "1: \n\t"
  672. "movq (%1), %%mm0 \n\t"
  673. "movq (%1, %3), %%mm2 \n\t"
  674. PAVGB" 1(%1), %%mm0 \n\t"
  675. PAVGB" 1(%1, %3), %%mm2 \n\t"
  676. PAVGB" (%2), %%mm0 \n\t"
  677. PAVGB" (%2, %3), %%mm2 \n\t"
  678. "add %%"REG_a", %1 \n\t"
  679. "movq %%mm0, (%2) \n\t"
  680. "movq %%mm2, (%2, %3) \n\t"
  681. "movq (%1), %%mm0 \n\t"
  682. "movq (%1, %3), %%mm2 \n\t"
  683. PAVGB" 1(%1), %%mm0 \n\t"
  684. PAVGB" 1(%1, %3), %%mm2 \n\t"
  685. "add %%"REG_a", %2 \n\t"
  686. "add %%"REG_a", %1 \n\t"
  687. PAVGB" (%2), %%mm0 \n\t"
  688. PAVGB" (%2, %3), %%mm2 \n\t"
  689. "movq %%mm0, (%2) \n\t"
  690. "movq %%mm2, (%2, %3) \n\t"
  691. "add %%"REG_a", %2 \n\t"
  692. "subl $4, %0 \n\t"
  693. "jnz 1b \n\t"
  694. :"+g"(h), "+S"(pixels), "+D"(block)
  695. :"r" ((x86_reg)line_size)
  696. :"%"REG_a, "memory");
  697. }
  698. static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  699. {
  700. asm volatile(
  701. "lea (%3, %3), %%"REG_a" \n\t"
  702. "movq (%1), %%mm0 \n\t"
  703. "sub %3, %2 \n\t"
  704. "1: \n\t"
  705. "movq (%1, %3), %%mm1 \n\t"
  706. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  707. "add %%"REG_a", %1 \n\t"
  708. PAVGB" %%mm1, %%mm0 \n\t"
  709. PAVGB" %%mm2, %%mm1 \n\t"
  710. "movq (%2, %3), %%mm3 \n\t"
  711. "movq (%2, %%"REG_a"), %%mm4 \n\t"
  712. PAVGB" %%mm3, %%mm0 \n\t"
  713. PAVGB" %%mm4, %%mm1 \n\t"
  714. "movq %%mm0, (%2, %3) \n\t"
  715. "movq %%mm1, (%2, %%"REG_a") \n\t"
  716. "movq (%1, %3), %%mm1 \n\t"
  717. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  718. PAVGB" %%mm1, %%mm2 \n\t"
  719. PAVGB" %%mm0, %%mm1 \n\t"
  720. "add %%"REG_a", %2 \n\t"
  721. "add %%"REG_a", %1 \n\t"
  722. "movq (%2, %3), %%mm3 \n\t"
  723. "movq (%2, %%"REG_a"), %%mm4 \n\t"
  724. PAVGB" %%mm3, %%mm2 \n\t"
  725. PAVGB" %%mm4, %%mm1 \n\t"
  726. "movq %%mm2, (%2, %3) \n\t"
  727. "movq %%mm1, (%2, %%"REG_a") \n\t"
  728. "add %%"REG_a", %2 \n\t"
  729. "subl $4, %0 \n\t"
  730. "jnz 1b \n\t"
  731. :"+g"(h), "+S"(pixels), "+D"(block)
  732. :"r" ((x86_reg)line_size)
  733. :"%"REG_a, "memory");
  734. }
  735. /* Note this is not correctly rounded, but this function is only
  736. * used for B-frames so it does not matter. */
  737. static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  738. {
  739. MOVQ_BONE(mm6);
  740. asm volatile(
  741. "lea (%3, %3), %%"REG_a" \n\t"
  742. "movq (%1), %%mm0 \n\t"
  743. PAVGB" 1(%1), %%mm0 \n\t"
  744. ASMALIGN(3)
  745. "1: \n\t"
  746. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  747. "movq (%1, %3), %%mm1 \n\t"
  748. "psubusb %%mm6, %%mm2 \n\t"
  749. PAVGB" 1(%1, %3), %%mm1 \n\t"
  750. PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
  751. "add %%"REG_a", %1 \n\t"
  752. PAVGB" %%mm1, %%mm0 \n\t"
  753. PAVGB" %%mm2, %%mm1 \n\t"
  754. PAVGB" (%2), %%mm0 \n\t"
  755. PAVGB" (%2, %3), %%mm1 \n\t"
  756. "movq %%mm0, (%2) \n\t"
  757. "movq %%mm1, (%2, %3) \n\t"
  758. "movq (%1, %3), %%mm1 \n\t"
  759. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  760. PAVGB" 1(%1, %3), %%mm1 \n\t"
  761. PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
  762. "add %%"REG_a", %2 \n\t"
  763. "add %%"REG_a", %1 \n\t"
  764. PAVGB" %%mm1, %%mm2 \n\t"
  765. PAVGB" %%mm0, %%mm1 \n\t"
  766. PAVGB" (%2), %%mm2 \n\t"
  767. PAVGB" (%2, %3), %%mm1 \n\t"
  768. "movq %%mm2, (%2) \n\t"
  769. "movq %%mm1, (%2, %3) \n\t"
  770. "add %%"REG_a", %2 \n\t"
  771. "subl $4, %0 \n\t"
  772. "jnz 1b \n\t"
  773. :"+g"(h), "+S"(pixels), "+D"(block)
  774. :"r" ((x86_reg)line_size)
  775. :"%"REG_a, "memory");
  776. }
  777. static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  778. {
  779. do {
  780. asm volatile(
  781. "movd (%1), %%mm0 \n\t"
  782. "movd (%1, %2), %%mm1 \n\t"
  783. "movd (%1, %2, 2), %%mm2 \n\t"
  784. "movd (%1, %3), %%mm3 \n\t"
  785. PAVGB" (%0), %%mm0 \n\t"
  786. PAVGB" (%0, %2), %%mm1 \n\t"
  787. PAVGB" (%0, %2, 2), %%mm2 \n\t"
  788. PAVGB" (%0, %3), %%mm3 \n\t"
  789. "movd %%mm0, (%1) \n\t"
  790. "movd %%mm1, (%1, %2) \n\t"
  791. "movd %%mm2, (%1, %2, 2) \n\t"
  792. "movd %%mm3, (%1, %3) \n\t"
  793. ::"S"(pixels), "D"(block),
  794. "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size)
  795. :"memory");
  796. block += 4*line_size;
  797. pixels += 4*line_size;
  798. h -= 4;
  799. } while(h > 0);
  800. }
  801. //FIXME the following could be optimized too ...
  802. static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  803. DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
  804. DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
  805. }
  806. static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  807. DEF(put_pixels8_y2)(block , pixels , line_size, h);
  808. DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
  809. }
  810. static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  811. DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
  812. DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
  813. }
  814. static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  815. DEF(avg_pixels8)(block , pixels , line_size, h);
  816. DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
  817. }
  818. static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  819. DEF(avg_pixels8_x2)(block , pixels , line_size, h);
  820. DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
  821. }
  822. static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  823. DEF(avg_pixels8_y2)(block , pixels , line_size, h);
  824. DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
  825. }
  826. static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  827. DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
  828. DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
  829. }
  830. #define QPEL_2TAP_L3(OPNAME) \
  831. static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
  832. asm volatile(\
  833. "1: \n\t"\
  834. "movq (%1,%2), %%mm0 \n\t"\
  835. "movq 8(%1,%2), %%mm1 \n\t"\
  836. PAVGB" (%1,%3), %%mm0 \n\t"\
  837. PAVGB" 8(%1,%3), %%mm1 \n\t"\
  838. PAVGB" (%1), %%mm0 \n\t"\
  839. PAVGB" 8(%1), %%mm1 \n\t"\
  840. STORE_OP( (%1,%4),%%mm0)\
  841. STORE_OP(8(%1,%4),%%mm1)\
  842. "movq %%mm0, (%1,%4) \n\t"\
  843. "movq %%mm1, 8(%1,%4) \n\t"\
  844. "add %5, %1 \n\t"\
  845. "decl %0 \n\t"\
  846. "jnz 1b \n\t"\
  847. :"+g"(h), "+r"(src)\
  848. :"r"((x86_reg)off1), "r"((x86_reg)off2),\
  849. "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
  850. :"memory"\
  851. );\
  852. }\
  853. static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
  854. asm volatile(\
  855. "1: \n\t"\
  856. "movq (%1,%2), %%mm0 \n\t"\
  857. PAVGB" (%1,%3), %%mm0 \n\t"\
  858. PAVGB" (%1), %%mm0 \n\t"\
  859. STORE_OP((%1,%4),%%mm0)\
  860. "movq %%mm0, (%1,%4) \n\t"\
  861. "add %5, %1 \n\t"\
  862. "decl %0 \n\t"\
  863. "jnz 1b \n\t"\
  864. :"+g"(h), "+r"(src)\
  865. :"r"((x86_reg)off1), "r"((x86_reg)off2),\
  866. "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
  867. :"memory"\
  868. );\
  869. }
  870. #define STORE_OP(a,b) PAVGB" "#a","#b" \n\t"
  871. QPEL_2TAP_L3(avg_)
  872. #undef STORE_OP
  873. #define STORE_OP(a,b)
  874. QPEL_2TAP_L3(put_)
  875. #undef STORE_OP
  876. #undef QPEL_2TAP_L3