/*
 * DSP utils mmx functions are compiled twice for rnd/no_rnd
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
// put_pixels

/*
 * put, pixels8_x2: horizontal half-pel interpolation of an 8-wide block.
 * For every row, dst[x] = avg(src[x], src[x+1]); the rnd/no_rnd rounding
 * comes from the PAVGBP macro selected by the including file.
 * NOTE(review): MOVQ_BFE presumably loads the averaging bias/mask constant
 * into mm6 -- confirm in the mmx macro header.
 * The loop consumes 4 rows per pass, so h is assumed to be a multiple of
 * 4 -- TODO confirm against callers.
 */
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"      // REG_a = 2 * line_size
        ASMALIGN(3)
        "1: \n\t"
        // rows 0/1: each row and its one-pixel-right neighbour
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        // rows 2/3: identical unrolled copy
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"                  // four rows per iteration
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}
/*
 * put, pixels8_l2: dst[x] = avg(src1[x], src2[x]) for 8 pixels per row.
 * src1/dst advance by their own strides; src2 is read with a fixed
 * stride of 8 bytes per row (packed 8xh buffer).
 * An odd leading row is peeled off before the main loop; the loop then
 * processes 4 rows per pass, so the remaining h is presumably a multiple
 * of 4 -- TODO confirm against callers.
 */
static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "testl $1, %0 \n\t"                 // odd h: handle one row up front
        " jz 1f \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%2), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "add $8, %2 \n\t"
        PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
        "movq %%mm4, (%3) \n\t"
        "add %5, %3 \n\t"
        "decl %0 \n\t"
        ASMALIGN(3)
        "1: \n\t"
        // rows 0/1 of this pass
        "movq (%1), %%mm0 \n\t"
        "movq (%2), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "movq (%1), %%mm2 \n\t"
        "movq 8(%2), %%mm3 \n\t"
        "add %4, %1 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3) \n\t"
        "add %5, %3 \n\t"
        "movq %%mm5, (%3) \n\t"
        "add %5, %3 \n\t"
        // rows 2/3; src2 advances 4 rows * 8 bytes = 32 per pass
        "movq (%1), %%mm0 \n\t"
        "movq 16(%2), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "movq (%1), %%mm2 \n\t"
        "movq 24(%2), %%mm3 \n\t"
        "add %4, %1 \n\t"
        "add $32, %2 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3) \n\t"
        "add %5, %3 \n\t"
        "movq %%mm5, (%3) \n\t"
        "add %5, %3 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
}
/*
 * put, pixels16_x2: horizontal half-pel interpolation of a 16-wide block.
 * Same operation as pixels8_x2 but each row is processed as two 8-byte
 * halves (offsets 0 and 8). Four rows per loop pass, so h is assumed to
 * be a multiple of 4 -- TODO confirm against callers.
 */
static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"      // REG_a = 2 * line_size
        ASMALIGN(3)
        "1: \n\t"
        // rows 0/1, left 8 bytes
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        // rows 0/1, right 8 bytes
        "movq 8(%1), %%mm0 \n\t"
        "movq 9(%1), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm2 \n\t"
        "movq 9(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        // rows 2/3: identical unrolled copy
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "movq 8(%1), %%mm0 \n\t"
        "movq 9(%1), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm2 \n\t"
        "movq 9(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}
/*
 * put, pixels16_l2: dst[x] = avg(src1[x], src2[x]) for 16 pixels per row.
 * src2 is read with a fixed stride of 16 bytes per row (packed 16xh
 * buffer). An odd leading row is peeled off, then the loop processes 2
 * rows per pass (subl $2), so the remaining h is presumably even --
 * TODO confirm against callers.
 */
static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "testl $1, %0 \n\t"                 // odd h: handle one row up front
        " jz 1f \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%2), %%mm1 \n\t"
        "movq 8(%1), %%mm2 \n\t"
        "movq 8(%2), %%mm3 \n\t"
        "add %4, %1 \n\t"
        "add $16, %2 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3) \n\t"
        "movq %%mm5, 8(%3) \n\t"
        "add %5, %3 \n\t"
        "decl %0 \n\t"
        ASMALIGN(3)
        "1: \n\t"
        // row 0 of this pass (two 8-byte halves)
        "movq (%1), %%mm0 \n\t"
        "movq (%2), %%mm1 \n\t"
        "movq 8(%1), %%mm2 \n\t"
        "movq 8(%2), %%mm3 \n\t"
        "add %4, %1 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3) \n\t"
        "movq %%mm5, 8(%3) \n\t"
        "add %5, %3 \n\t"
        // row 1; src2 advances 2 rows * 16 bytes = 32 per pass
        "movq (%1), %%mm0 \n\t"
        "movq 16(%2), %%mm1 \n\t"
        "movq 8(%1), %%mm2 \n\t"
        "movq 24(%2), %%mm3 \n\t"
        "add %4, %1 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3) \n\t"
        "movq %%mm5, 8(%3) \n\t"
        "add %5, %3 \n\t"
        "add $32, %2 \n\t"
        "subl $2, %0 \n\t"
        "jnz 1b \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
}
/*
 * put, pixels8_y2: vertical half-pel interpolation of an 8-wide block.
 * dst row y = avg(src row y, src row y+1); reads h+1 source rows in
 * total. The previous source row is kept in mm0/mm2 across iterations so
 * each row is loaded only once. Four output rows per pass, so h is
 * assumed to be a multiple of 4 -- TODO confirm against callers.
 */
static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"      // REG_a = 2 * line_size
        "movq (%1), %%mm0 \n\t"             // prime with row 0
        ASMALIGN(3)
        "1: \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"),%%mm2 \n\t"
        PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        // next two rows; mm2 carries the last row of the previous pair
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"),%%mm0 \n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}
/*
 * put, pixels8_xy2: 2D (diagonal) half-pel interpolation of an 8-wide
 * block: dst[y][x] = (src[y][x] + src[y][x+1] + src[y+1][x] +
 * src[y+1][x+1] + rounder) >> 2, computed in 16-bit words (bytes
 * unpacked against the zero register mm7, summed with paddusw).
 * The horizontal pair-sum of the previous row is kept across iterations
 * (mm4/mm5 and mm0/mm1 swap roles every half-pass). Two output rows per
 * pass, so h is assumed to be even -- TODO confirm against callers.
 */
static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_ZERO(mm7);
    SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
    __asm__ volatile(
        // prologue: horizontal pair-sums of row 0 in mm4 (lo) / mm5 (hi)
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm4 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm0, %%mm4 \n\t"
        "paddusw %%mm1, %%mm5 \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t"     // REG_a = running row offset
        "add %3, %1 \n\t"
        ASMALIGN(3)
        "1: \n\t"
        // pair-sums of the current row in mm0/mm1
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddusw %%mm2, %%mm0 \n\t"
        "paddusw %%mm3, %%mm1 \n\t"
        // output = (prev sums + current sums + rounder) >> 2
        "paddusw %%mm6, %%mm4 \n\t"
        "paddusw %%mm6, %%mm5 \n\t"
        "paddusw %%mm0, %%mm4 \n\t"
        "paddusw %%mm1, %%mm5 \n\t"
        "psrlw $2, %%mm4 \n\t"
        "psrlw $2, %%mm5 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "movq %%mm4, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"
        // second half-pass with register roles swapped
        "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
        "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm2, %%mm4 \n\t"
        "paddusw %%mm3, %%mm5 \n\t"
        "paddusw %%mm6, %%mm0 \n\t"
        "paddusw %%mm6, %%mm1 \n\t"
        "paddusw %%mm4, %%mm0 \n\t"
        "paddusw %%mm5, %%mm1 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"
        "subl $2, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :REG_a, "memory");
}
// avg_pixels

/*
 * avg, pixels4: dst[x] = avg(dst[x], src[x]) for 4 pixels per row,
 * h rows. One movd-sized (32-bit) asm statement per row; the rnd/no_rnd
 * rounding comes from the PAVGB macro selected by the including file.
 */
static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movd %0, %%mm0 \n\t"
            "movd %1, %%mm1 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            "movd %%mm2, %0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    }
    while (--h);
}
// in case more speed is needed - unrolling would certainly help

/*
 * avg, pixels8: dst[x] = avg(dst[x], src[x]) for 8 pixels per row,
 * h rows. One 64-bit asm statement per row.
 */
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %0, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, %0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    }
    while (--h);
}
/*
 * avg, pixels16: dst[x] = avg(dst[x], src[x]) for 16 pixels per row,
 * h rows; each row handled as two 8-byte halves (offsets 0 and 8).
 */
static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %0, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, %0 \n\t"
            "movq 8%0, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, 8%0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    }
    while (--h);
}
/*
 * avg, pixels8_x2: horizontal half-pel + averaging with the destination:
 * dst[x] = avg(dst[x], avg(src[x], src[x+1])) for 8 pixels per row.
 */
static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %1, %%mm0 \n\t"
            "movq 1%1, %%mm1 \n\t"          // src shifted right by one pixel
            "movq %0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)   // mm2 = avg(src, src+1)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)   // mm0 = avg(dst, mm2)
            "movq %%mm0, %0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    } while (--h);
}
/*
 * avg, pixels8_l2: dst[x] = avg(dst[x], avg(src1[x], src2[x])) for
 * 8 pixels per row. src2 is read with a fixed stride of 8 bytes per row
 * (packed 8xh buffer); src1/dst use their own strides.
 */
static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %1, %%mm0 \n\t"
            "movq %2, %%mm1 \n\t"
            "movq %0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)   // mm2 = avg(src1, src2)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)   // mm0 = avg(dst, mm2)
            "movq %%mm0, %0 \n\t"
            :"+m"(*dst)
            :"m"(*src1), "m"(*src2)
            :"memory");
        dst += dstStride;
        src1 += src1Stride;
        src2 += 8;
    } while (--h);
}
/*
 * avg, pixels16_x2: same as pixels8_x2 but 16 pixels per row, handled
 * as two 8-byte halves (offsets 0/1 and 8/9).
 */
static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %1, %%mm0 \n\t"
            "movq 1%1, %%mm1 \n\t"
            "movq %0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0 \n\t"
            "movq 8%1, %%mm0 \n\t"
            "movq 9%1, %%mm1 \n\t"
            "movq 8%0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, 8%0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    } while (--h);
}
/*
 * avg, pixels16_l2: dst[x] = avg(dst[x], avg(src1[x], src2[x])) for
 * 16 pixels per row; src2 is a packed 16xh buffer (fixed stride 16).
 */
static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %1, %%mm0 \n\t"
            "movq %2, %%mm1 \n\t"
            "movq %0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0 \n\t"
            "movq 8%1, %%mm0 \n\t"
            "movq 8%2, %%mm1 \n\t"
            "movq 8%0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, 8%0 \n\t"
            :"+m"(*dst)
            :"m"(*src1), "m"(*src2)
            :"memory");
        dst += dstStride;
        src1 += src1Stride;
        src2 += 16;
    } while (--h);
}
/*
 * avg, pixels8_y2: vertical half-pel interpolation averaged with the
 * destination: dst row y = avg(dst row y, avg(src row y, src row y+1)).
 * The previous source row is carried in mm0/mm2 across iterations, as in
 * put pixels8_y2. Four output rows per pass, so h is assumed to be a
 * multiple of 4 -- TODO confirm against callers.
 */
static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"      // REG_a = 2 * line_size
        "movq (%1), %%mm0 \n\t"             // prime with row 0
        ASMALIGN(3)
        "1: \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm2 \n\t"
        PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
        "movq (%2), %%mm3 \n\t"
        PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)   // blend with existing dst row
        "movq (%2, %3), %%mm3 \n\t"
        PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        // next two rows; mm2 carries the last row of the previous pair
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
        "movq (%2), %%mm3 \n\t"
        PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
        "movq (%2, %3), %%mm3 \n\t"
        PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq %%mm2, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}
// this routine is 'slightly' suboptimal but mostly unused

/*
 * avg, pixels8_xy2: 2D half-pel interpolation averaged with the
 * destination. The interpolation itself is identical to put pixels8_xy2
 * (16-bit word sums of four neighbours, + rounder, >> 2); the result is
 * then blended with the existing dst row via PAVGB.
 * mm6 holds the xy2 rounder here, so the byte constant PAVGB normally
 * takes in mm6 is rebuilt in mm2 each time: pcmpeqd sets all bits, and
 * paddb doubles each 0xFF byte to 0xFE (wrapping add).
 * Two output rows per pass, so h is assumed to be even -- TODO confirm
 * against callers.
 */
static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_ZERO(mm7);
    SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
    __asm__ volatile(
        // prologue: horizontal pair-sums of row 0 in mm4 (lo) / mm5 (hi)
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm4 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm0, %%mm4 \n\t"
        "paddusw %%mm1, %%mm5 \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t"     // REG_a = running row offset
        "add %3, %1 \n\t"
        ASMALIGN(3)
        "1: \n\t"
        // pair-sums of the current row in mm0/mm1
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddusw %%mm2, %%mm0 \n\t"
        "paddusw %%mm3, %%mm1 \n\t"
        // interpolated row = (prev sums + current sums + rounder) >> 2
        "paddusw %%mm6, %%mm4 \n\t"
        "paddusw %%mm6, %%mm5 \n\t"
        "paddusw %%mm0, %%mm4 \n\t"
        "paddusw %%mm1, %%mm5 \n\t"
        "psrlw $2, %%mm4 \n\t"
        "psrlw $2, %%mm5 \n\t"
        "movq (%2, %%"REG_a"), %%mm3 \n\t"  // existing dst row
        "packuswb %%mm5, %%mm4 \n\t"
        "pcmpeqd %%mm2, %%mm2 \n\t"         // mm2 = all 1s
        "paddb %%mm2, %%mm2 \n\t"           // mm2 = 0xFE bytes (PAVGB const)
        PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
        "movq %%mm5, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"
        // second half-pass with register roles swapped
        "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
        "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm2, %%mm4 \n\t"
        "paddusw %%mm3, %%mm5 \n\t"
        "paddusw %%mm6, %%mm0 \n\t"
        "paddusw %%mm6, %%mm1 \n\t"
        "paddusw %%mm4, %%mm0 \n\t"
        "paddusw %%mm5, %%mm1 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm1 \n\t"
        "movq (%2, %%"REG_a"), %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "pcmpeqd %%mm2, %%mm2 \n\t"
        "paddb %%mm2, %%mm2 \n\t"
        PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
        "movq %%mm1, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"
        "subl $2, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :REG_a, "memory");
}
  549. //FIXME optimize
  550. static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  551. DEF(put, pixels8_y2)(block , pixels , line_size, h);
  552. DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
  553. }
  554. static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  555. DEF(put, pixels8_xy2)(block , pixels , line_size, h);
  556. DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
  557. }
  558. static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  559. DEF(avg, pixels8_y2)(block , pixels , line_size, h);
  560. DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
  561. }
  562. static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  563. DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
  564. DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
  565. }