dsputil_mmx_rnd.h 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594
  1. /*
  2. * DSP utils mmx functions are compiled twice for rnd/no_rnd
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
  5. *
  6. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  7. * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  8. * and improved by Zdenek Kabelac <kabi@users.sf.net>
  9. *
  10. * This file is part of FFmpeg.
  11. *
  12. * FFmpeg is free software; you can redistribute it and/or
  13. * modify it under the terms of the GNU Lesser General Public
  14. * License as published by the Free Software Foundation; either
  15. * version 2.1 of the License, or (at your option) any later version.
  16. *
  17. * FFmpeg is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  20. * Lesser General Public License for more details.
  21. *
  22. * You should have received a copy of the GNU Lesser General Public
  23. * License along with FFmpeg; if not, write to the Free Software
  24. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25. */
  26. /* This header intentionally has no multiple inclusion guards. It is meant to
  27. * be included multiple times and generates different code depending on the
  28. * value of certain #defines. */
  29. // put_pixels
  /*
   * put pixels8_x2: for each row, combine the 8 source bytes starting at
   * pixels with the 8 bytes starting at pixels + 1 and store the result
   * to block (horizontal half-pel "put").
   *
   * block     - destination, 8 bytes written per row
   * pixels    - source; bytes 0..8 of every row are read
   * line_size - byte stride used for both source and destination
   * h         - row count; the loop handles 4 rows per pass and only exits
   *             when the counter hits exactly 0, so h has to be a positive
   *             multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size,
   * REG_a = 2 * line_size.
   * NOTE(review): MOVQ_BFE and PAVGBP are defined outside this header;
   * presumably PAVGBP averages two register pairs at once with the rounding
   * selected by the rnd/no_rnd build and mm6 holds its mask - confirm.
   */
  30. static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  31. {
  32. MOVQ_BFE(mm6);
  33. asm volatile(
  34. "lea (%3, %3), %%"REG_a" \n\t" // REG_a = 2 * line_size
  35. ASMALIGN(3)
  36. "1: \n\t"
  // rows 0 and 1: load pixel and right neighbour for each row
  37. "movq (%1), %%mm0 \n\t"
  38. "movq 1(%1), %%mm1 \n\t"
  39. "movq (%1, %3), %%mm2 \n\t"
  40. "movq 1(%1, %3), %%mm3 \n\t"
  41. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  42. "movq %%mm4, (%2) \n\t"
  43. "movq %%mm5, (%2, %3) \n\t"
  44. "add %%"REG_a", %1 \n\t" // advance source by two rows
  45. "add %%"REG_a", %2 \n\t" // advance destination by two rows
  // rows 2 and 3: same sequence again
  46. "movq (%1), %%mm0 \n\t"
  47. "movq 1(%1), %%mm1 \n\t"
  48. "movq (%1, %3), %%mm2 \n\t"
  49. "movq 1(%1, %3), %%mm3 \n\t"
  50. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  51. "movq %%mm4, (%2) \n\t"
  52. "movq %%mm5, (%2, %3) \n\t"
  53. "add %%"REG_a", %1 \n\t"
  54. "add %%"REG_a", %2 \n\t"
  55. "subl $4, %0 \n\t" // four rows done per pass
  56. "jnz 1b \n\t"
  57. :"+g"(h), "+S"(pixels), "+D"(block)
  58. :"r"((x86_reg)line_size)
  59. :REG_a, "memory");
  60. }
  /*
   * put pixels8_l2: for each row, combine 8 bytes from src1 with 8 bytes from
   * src2 and store the result to dst.
   *
   * dst        - destination, advanced by dstStride per row
   * src1       - first source, advanced by src1Stride per row
   * src2       - second source, advanced by a fixed 8 bytes per row
   * h          - row count. If h is odd, one row is handled first and h is
   *              decremented; the main loop is then entered unconditionally,
   *              consumes 4 rows per pass and exits only when the counter is
   *              exactly 0. The remaining count therefore has to be a
   *              positive multiple of 4.
   *
   * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   * %4 = src1Stride, %5 = dstStride.
   * NOTE(review): MOVQ_BFE, PAVGB and PAVGBP are defined outside this header;
   * presumably they perform byte averaging with the rounding selected by the
   * rnd/no_rnd build - confirm.
   */
  61. static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  62. {
  63. MOVQ_BFE(mm6);
  64. asm volatile(
  65. "testl $1, %0 \n\t" // odd row count?
  66. " jz 1f \n\t"
  // single leading row for odd h
  67. "movq (%1), %%mm0 \n\t"
  68. "movq (%2), %%mm1 \n\t"
  69. "add %4, %1 \n\t"
  70. "add $8, %2 \n\t"
  71. PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
  72. "movq %%mm4, (%3) \n\t"
  73. "add %5, %3 \n\t"
  74. "decl %0 \n\t"
  75. ASMALIGN(3)
  76. "1: \n\t"
  // rows 0 and 1 of this pass (src2 offsets 0 and 8)
  77. "movq (%1), %%mm0 \n\t"
  78. "movq (%2), %%mm1 \n\t"
  79. "add %4, %1 \n\t"
  80. "movq (%1), %%mm2 \n\t"
  81. "movq 8(%2), %%mm3 \n\t"
  82. "add %4, %1 \n\t"
  83. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  84. "movq %%mm4, (%3) \n\t"
  85. "add %5, %3 \n\t"
  86. "movq %%mm5, (%3) \n\t"
  87. "add %5, %3 \n\t"
  // rows 2 and 3 of this pass (src2 offsets 16 and 24)
  88. "movq (%1), %%mm0 \n\t"
  89. "movq 16(%2), %%mm1 \n\t"
  90. "add %4, %1 \n\t"
  91. "movq (%1), %%mm2 \n\t"
  92. "movq 24(%2), %%mm3 \n\t"
  93. "add %4, %1 \n\t"
  94. "add $32, %2 \n\t" // src2 consumed 4 rows of 8 bytes
  95. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  96. "movq %%mm4, (%3) \n\t"
  97. "add %5, %3 \n\t"
  98. "movq %%mm5, (%3) \n\t"
  99. "add %5, %3 \n\t"
  100. "subl $4, %0 \n\t"
  101. "jnz 1b \n\t"
  // Under PIC the counter lives in memory instead of the b register.
  102. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  103. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  104. #else
  105. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  106. #endif
  107. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  108. :"memory");
  109. }
  /*
   * put pixels16_x2: 16-byte-wide variant of the horizontal half-pel "put".
   * For each row, bytes 0..15 of the source are combined with bytes 1..16 and
   * the 16 results are stored to block, processed as two 8-byte halves.
   *
   * block     - destination, 16 bytes written per row
   * pixels    - source; bytes 0..16 of every row are read
   * line_size - byte stride used for both source and destination
   * h         - row count; 4 rows per pass and the loop exits only on an
   *             exact 0, so h has to be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size,
   * REG_a = 2 * line_size.
   * NOTE(review): MOVQ_BFE and PAVGBP are defined outside this header;
   * presumably PAVGBP averages two register pairs with the rounding selected
   * by the rnd/no_rnd build - confirm.
   */
  110. static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  111. {
  112. MOVQ_BFE(mm6);
  113. asm volatile(
  114. "lea (%3, %3), %%"REG_a" \n\t" // REG_a = 2 * line_size
  115. ASMALIGN(3)
  116. "1: \n\t"
  // rows 0 and 1, left 8 bytes
  117. "movq (%1), %%mm0 \n\t"
  118. "movq 1(%1), %%mm1 \n\t"
  119. "movq (%1, %3), %%mm2 \n\t"
  120. "movq 1(%1, %3), %%mm3 \n\t"
  121. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  122. "movq %%mm4, (%2) \n\t"
  123. "movq %%mm5, (%2, %3) \n\t"
  // rows 0 and 1, right 8 bytes
  124. "movq 8(%1), %%mm0 \n\t"
  125. "movq 9(%1), %%mm1 \n\t"
  126. "movq 8(%1, %3), %%mm2 \n\t"
  127. "movq 9(%1, %3), %%mm3 \n\t"
  128. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  129. "movq %%mm4, 8(%2) \n\t"
  130. "movq %%mm5, 8(%2, %3) \n\t"
  131. "add %%"REG_a", %1 \n\t"
  132. "add %%"REG_a", %2 \n\t"
  // rows 2 and 3, left 8 bytes
  133. "movq (%1), %%mm0 \n\t"
  134. "movq 1(%1), %%mm1 \n\t"
  135. "movq (%1, %3), %%mm2 \n\t"
  136. "movq 1(%1, %3), %%mm3 \n\t"
  137. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  138. "movq %%mm4, (%2) \n\t"
  139. "movq %%mm5, (%2, %3) \n\t"
  // rows 2 and 3, right 8 bytes
  140. "movq 8(%1), %%mm0 \n\t"
  141. "movq 9(%1), %%mm1 \n\t"
  142. "movq 8(%1, %3), %%mm2 \n\t"
  143. "movq 9(%1, %3), %%mm3 \n\t"
  144. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  145. "movq %%mm4, 8(%2) \n\t"
  146. "movq %%mm5, 8(%2, %3) \n\t"
  147. "add %%"REG_a", %1 \n\t"
  148. "add %%"REG_a", %2 \n\t"
  149. "subl $4, %0 \n\t" // four rows done per pass
  150. "jnz 1b \n\t"
  151. :"+g"(h), "+S"(pixels), "+D"(block)
  152. :"r"((x86_reg)line_size)
  153. :REG_a, "memory");
  154. }
  /*
   * put pixels16_l2: for each row, combine 16 bytes from src1 with 16 bytes
   * from src2 and store the result to dst.
   *
   * dst        - destination, advanced by dstStride per row
   * src1       - first source, advanced by src1Stride per row
   * src2       - second source, advanced by a fixed 16 bytes per row
   * h          - row count. If h is odd, one row is handled first and h is
   *              decremented; the main loop is then entered unconditionally,
   *              consumes 2 rows per pass and exits only when the counter is
   *              exactly 0. The remaining count therefore has to be a
   *              positive even number.
   *
   * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   * %4 = src1Stride, %5 = dstStride.
   * NOTE(review): MOVQ_BFE and PAVGBP are defined outside this header;
   * presumably PAVGBP averages two register pairs with the rounding selected
   * by the rnd/no_rnd build - confirm.
   */
  155. static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  156. {
  157. MOVQ_BFE(mm6);
  158. asm volatile(
  159. "testl $1, %0 \n\t" // odd row count?
  160. " jz 1f \n\t"
  // single leading row for odd h (both 8-byte halves)
  161. "movq (%1), %%mm0 \n\t"
  162. "movq (%2), %%mm1 \n\t"
  163. "movq 8(%1), %%mm2 \n\t"
  164. "movq 8(%2), %%mm3 \n\t"
  165. "add %4, %1 \n\t"
  166. "add $16, %2 \n\t"
  167. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  168. "movq %%mm4, (%3) \n\t"
  169. "movq %%mm5, 8(%3) \n\t"
  170. "add %5, %3 \n\t"
  171. "decl %0 \n\t"
  172. ASMALIGN(3)
  173. "1: \n\t"
  // first row of this pass (src2 offsets 0 and 8)
  174. "movq (%1), %%mm0 \n\t"
  175. "movq (%2), %%mm1 \n\t"
  176. "movq 8(%1), %%mm2 \n\t"
  177. "movq 8(%2), %%mm3 \n\t"
  178. "add %4, %1 \n\t"
  179. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  180. "movq %%mm4, (%3) \n\t"
  181. "movq %%mm5, 8(%3) \n\t"
  182. "add %5, %3 \n\t"
  // second row of this pass (src2 offsets 16 and 24)
  183. "movq (%1), %%mm0 \n\t"
  184. "movq 16(%2), %%mm1 \n\t"
  185. "movq 8(%1), %%mm2 \n\t"
  186. "movq 24(%2), %%mm3 \n\t"
  187. "add %4, %1 \n\t"
  188. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  189. "movq %%mm4, (%3) \n\t"
  190. "movq %%mm5, 8(%3) \n\t"
  191. "add %5, %3 \n\t"
  192. "add $32, %2 \n\t" // src2 consumed 2 rows of 16 bytes
  193. "subl $2, %0 \n\t"
  194. "jnz 1b \n\t"
  // Under PIC the counter lives in memory instead of the b register.
  195. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  196. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  197. #else
  198. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  199. #endif
  200. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  201. :"memory");
  202. }
  /*
   * put pixels8_y2: for each output row, combine 8 source bytes of one row
   * with the 8 bytes directly below them and store the result to block
   * (vertical half-pel "put").
   *
   * block     - destination, 8 bytes written per row
   * pixels    - source; rows 0..h are read (one more row than is written)
   * line_size - byte stride used for both source and destination
   * h         - row count; 4 rows per pass and the loop exits only on an
   *             exact 0, so h has to be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size,
   * REG_a = 2 * line_size. The most recently loaded row is carried across
   * the two half-passes, alternating between mm0 and mm2, so every source row
   * is loaded only once.
   * NOTE(review): MOVQ_BFE and PAVGBP are defined outside this header;
   * presumably PAVGBP averages two register pairs with the rounding selected
   * by the rnd/no_rnd build - confirm.
   */
  203. static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  204. {
  205. MOVQ_BFE(mm6);
  206. asm volatile(
  207. "lea (%3, %3), %%"REG_a" \n\t" // REG_a = 2 * line_size
  208. "movq (%1), %%mm0 \n\t" // row 0, carried into the loop
  209. ASMALIGN(3)
  210. "1: \n\t"
  211. "movq (%1, %3), %%mm1 \n\t" // row +1
  212. "movq (%1, %%"REG_a"),%%mm2 \n\t" // row +2, carried to the next half
  213. PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
  214. "movq %%mm4, (%2) \n\t"
  215. "movq %%mm5, (%2, %3) \n\t"
  216. "add %%"REG_a", %1 \n\t"
  217. "add %%"REG_a", %2 \n\t"
  218. "movq (%1, %3), %%mm1 \n\t"
  219. "movq (%1, %%"REG_a"),%%mm0 \n\t" // carried to the next pass
  220. PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
  221. "movq %%mm4, (%2) \n\t"
  222. "movq %%mm5, (%2, %3) \n\t"
  223. "add %%"REG_a", %1 \n\t"
  224. "add %%"REG_a", %2 \n\t"
  225. "subl $4, %0 \n\t" // four rows done per pass
  226. "jnz 1b \n\t"
  227. :"+g"(h), "+S"(pixels), "+D"(block)
  228. :"r"((x86_reg)line_size)
  229. :REG_a, "memory");
  230. }
  /*
   * put pixels8_xy2: for each output byte, sum the four source bytes at
   * (x, y), (x + 1, y), (x, y + 1), (x + 1, y + 1) as 16-bit words, add the
   * constant held in mm6, shift right by 2 and store the packed result to
   * block (diagonal half-pel "put").
   *
   * block     - destination, 8 bytes written per row
   * pixels    - source; bytes 0..8 of rows 0..h are read
   * line_size - byte stride used for both source and destination
   * h         - row count; 2 rows per pass and the loop exits only on an
   *             exact 0, so h has to be a positive even number
   *
   * Asm operands: %0 = h, %1 = pixels (moved one row down before the loop),
   * %2 = block, %3 = line_size, REG_a = running byte offset of the current
   * row, shared by source and destination addressing.
   * The horizontal sums of the previous row are kept in mm4/mm5 or mm0/mm1
   * (the roles swap every row), so each source row is unpacked only once.
   * NOTE(review): MOVQ_ZERO and SET_RND are defined outside this header;
   * mm7 is used as the zero operand for unpacking - confirm MOVQ_ZERO
   * clears it.
   */
  231. static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  232. {
  233. MOVQ_ZERO(mm7);
  234. SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
  235. asm volatile(
  // row 0: widen bytes to words and form pixel + right-neighbour sums
  236. "movq (%1), %%mm0 \n\t"
  237. "movq 1(%1), %%mm4 \n\t"
  238. "movq %%mm0, %%mm1 \n\t"
  239. "movq %%mm4, %%mm5 \n\t"
  240. "punpcklbw %%mm7, %%mm0 \n\t"
  241. "punpcklbw %%mm7, %%mm4 \n\t"
  242. "punpckhbw %%mm7, %%mm1 \n\t"
  243. "punpckhbw %%mm7, %%mm5 \n\t"
  244. "paddusw %%mm0, %%mm4 \n\t"
  245. "paddusw %%mm1, %%mm5 \n\t"
  246. "xor %%"REG_a", %%"REG_a" \n\t" // row offset = 0
  247. "add %3, %1 \n\t" // %1 now addresses the row below
  248. ASMALIGN(3)
  249. "1: \n\t"
  // next row: horizontal sums into mm0/mm1
  250. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  251. "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
  252. "movq %%mm0, %%mm1 \n\t"
  253. "movq %%mm2, %%mm3 \n\t"
  254. "punpcklbw %%mm7, %%mm0 \n\t"
  255. "punpcklbw %%mm7, %%mm2 \n\t"
  256. "punpckhbw %%mm7, %%mm1 \n\t"
  257. "punpckhbw %%mm7, %%mm3 \n\t"
  258. "paddusw %%mm2, %%mm0 \n\t"
  259. "paddusw %%mm3, %%mm1 \n\t"
  // previous-row sums + constant + this-row sums, then divide by 4
  260. "paddusw %%mm6, %%mm4 \n\t"
  261. "paddusw %%mm6, %%mm5 \n\t"
  262. "paddusw %%mm0, %%mm4 \n\t"
  263. "paddusw %%mm1, %%mm5 \n\t"
  264. "psrlw $2, %%mm4 \n\t"
  265. "psrlw $2, %%mm5 \n\t"
  266. "packuswb %%mm5, %%mm4 \n\t"
  267. "movq %%mm4, (%2, %%"REG_a") \n\t"
  268. "add %3, %%"REG_a" \n\t"
  // same again with the register roles exchanged
  269. "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
  270. "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
  271. "movq %%mm2, %%mm3 \n\t"
  272. "movq %%mm4, %%mm5 \n\t"
  273. "punpcklbw %%mm7, %%mm2 \n\t"
  274. "punpcklbw %%mm7, %%mm4 \n\t"
  275. "punpckhbw %%mm7, %%mm3 \n\t"
  276. "punpckhbw %%mm7, %%mm5 \n\t"
  277. "paddusw %%mm2, %%mm4 \n\t"
  278. "paddusw %%mm3, %%mm5 \n\t"
  279. "paddusw %%mm6, %%mm0 \n\t"
  280. "paddusw %%mm6, %%mm1 \n\t"
  281. "paddusw %%mm4, %%mm0 \n\t"
  282. "paddusw %%mm5, %%mm1 \n\t"
  283. "psrlw $2, %%mm0 \n\t"
  284. "psrlw $2, %%mm1 \n\t"
  285. "packuswb %%mm1, %%mm0 \n\t"
  286. "movq %%mm0, (%2, %%"REG_a") \n\t"
  287. "add %3, %%"REG_a" \n\t"
  288. "subl $2, %0 \n\t" // two rows done per pass
  289. "jnz 1b \n\t"
  290. :"+g"(h), "+S"(pixels)
  291. :"D"(block), "r"((x86_reg)line_size)
  292. :REG_a, "memory");
  293. }
  294. // avg_pixels
  /*
   * avg pixels4: for each row, combine the 4 bytes already in block with the
   * 4 bytes at pixels via PAVGB and write the result back to block.
   *
   * block     - destination, read and then overwritten (4 bytes per row)
   * pixels    - source, 4 bytes read per row
   * line_size - byte stride used for both pointers
   * h         - row count; the do/while body runs before the first test, so
   *             h has to be at least 1
   *
   * NOTE(review): MOVQ_BFE, PAVGB and JUMPALIGN are defined outside this
   * header; presumably PAVGB is a byte average with the rounding selected by
   * the rnd/no_rnd build and mm6 holds its mask - confirm.
   * NOTE(review): the "m" operands name a single uint8_t although movd moves
   * 4 bytes; the "memory" clobber presumably covers the rest - confirm.
   */
  295. static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  296. {
  297. MOVQ_BFE(mm6);
  298. JUMPALIGN();
  299. do {
  300. asm volatile(
  301. "movd %0, %%mm0 \n\t" // current destination bytes
  302. "movd %1, %%mm1 \n\t" // source bytes
  303. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  304. "movd %%mm2, %0 \n\t"
  305. :"+m"(*block)
  306. :"m"(*pixels)
  307. :"memory");
  308. pixels += line_size;
  309. block += line_size;
  310. }
  311. while (--h);
  312. }
  313. // in case more speed is needed - unrolling would certainly help
  /*
   * avg pixels8: for each row, combine the 8 bytes already in block with the
   * 8 bytes at pixels via PAVGB and write the result back to block.
   *
   * block     - destination, read and then overwritten (8 bytes per row)
   * pixels    - source, 8 bytes read per row
   * line_size - byte stride used for both pointers
   * h         - row count; the do/while body runs before the first test, so
   *             h has to be at least 1
   *
   * NOTE(review): MOVQ_BFE, PAVGB and JUMPALIGN are defined outside this
   * header; presumably PAVGB is a byte average with the rounding selected by
   * the rnd/no_rnd build and mm6 holds its mask - confirm.
   * NOTE(review): the "m" operands name a single uint8_t although movq moves
   * 8 bytes; the "memory" clobber presumably covers the rest - confirm.
   */
  314. static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  315. {
  316. MOVQ_BFE(mm6);
  317. JUMPALIGN();
  318. do {
  319. asm volatile(
  320. "movq %0, %%mm0 \n\t" // current destination bytes
  321. "movq %1, %%mm1 \n\t" // source bytes
  322. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  323. "movq %%mm2, %0 \n\t"
  324. :"+m"(*block)
  325. :"m"(*pixels)
  326. :"memory");
  327. pixels += line_size;
  328. block += line_size;
  329. }
  330. while (--h);
  331. }
  /*
   * avg pixels16: for each row, combine the 16 bytes already in block with
   * the 16 bytes at pixels via PAVGB and write the result back to block. The
   * row is handled as two 8-byte halves.
   *
   * block     - destination, read and then overwritten (16 bytes per row)
   * pixels    - source, 16 bytes read per row
   * line_size - byte stride used for both pointers
   * h         - row count; the do/while body runs before the first test, so
   *             h has to be at least 1
   *
   * "8%0" / "8%1" paste a displacement of 8 in front of the memory operand
   * text to reach the second half of the row.
   * NOTE(review): that only assembles if the compiler prints the operand in
   * a form that accepts a leading displacement - confirm for the supported
   * toolchains.
   * NOTE(review): MOVQ_BFE, PAVGB and JUMPALIGN are defined outside this
   * header; presumably PAVGB is a byte average with the rounding selected by
   * the rnd/no_rnd build and mm6 holds its mask - confirm.
   */
  332. static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  333. {
  334. MOVQ_BFE(mm6);
  335. JUMPALIGN();
  336. do {
  337. asm volatile(
  // left 8 bytes
  338. "movq %0, %%mm0 \n\t"
  339. "movq %1, %%mm1 \n\t"
  340. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  341. "movq %%mm2, %0 \n\t"
  // right 8 bytes
  342. "movq 8%0, %%mm0 \n\t"
  343. "movq 8%1, %%mm1 \n\t"
  344. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  345. "movq %%mm2, 8%0 \n\t"
  346. :"+m"(*block)
  347. :"m"(*pixels)
  348. :"memory");
  349. pixels += line_size;
  350. block += line_size;
  351. }
  352. while (--h);
  353. }
  /*
   * avg pixels8_x2: for each row, first combine the 8 source bytes at pixels
   * with the 8 bytes at pixels + 1 via PAVGB, then combine that result with
   * the 8 bytes already in block via PAVGB and write it back to block.
   *
   * block     - destination, read and then overwritten (8 bytes per row)
   * pixels    - source; bytes 0..8 of every row are read
   * line_size - byte stride used for both pointers
   * h         - row count; the do/while body runs before the first test, so
   *             h has to be at least 1
   *
   * "1%1" pastes a displacement of 1 in front of the memory operand text.
   * NOTE(review): MOVQ_BFE, PAVGB and JUMPALIGN are defined outside this
   * header; presumably PAVGB is a byte average with the rounding selected by
   * the rnd/no_rnd build and mm6 holds its mask - confirm.
   */
  354. static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  355. {
  356. MOVQ_BFE(mm6);
  357. JUMPALIGN();
  358. do {
  359. asm volatile(
  360. "movq %1, %%mm0 \n\t" // source
  361. "movq 1%1, %%mm1 \n\t" // source shifted by one byte
  362. "movq %0, %%mm3 \n\t" // current destination
  363. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  364. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  365. "movq %%mm0, %0 \n\t"
  366. :"+m"(*block)
  367. :"m"(*pixels)
  368. :"memory");
  369. pixels += line_size;
  370. block += line_size;
  371. } while (--h);
  372. }
  /*
   * avg pixels8_l2: for each row, first combine 8 bytes from src1 with
   * 8 bytes from src2 via PAVGB, then combine that result with the 8 bytes
   * already in dst via PAVGB and write it back to dst.
   *
   * dst        - destination, read and then overwritten; advanced by
   *              dstStride per row
   * src1       - first source, advanced by src1Stride per row
   * src2       - second source, advanced by a fixed 8 bytes per row
   * h          - row count; the do/while body runs before the first test, so
   *              h has to be at least 1
   *
   * NOTE(review): MOVQ_BFE, PAVGB and JUMPALIGN are defined outside this
   * header; presumably PAVGB is a byte average with the rounding selected by
   * the rnd/no_rnd build and mm6 holds its mask - confirm.
   */
  373. static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  374. {
  375. MOVQ_BFE(mm6);
  376. JUMPALIGN();
  377. do {
  378. asm volatile(
  379. "movq %1, %%mm0 \n\t" // src1
  380. "movq %2, %%mm1 \n\t" // src2
  381. "movq %0, %%mm3 \n\t" // current destination
  382. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  383. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  384. "movq %%mm0, %0 \n\t"
  385. :"+m"(*dst)
  386. :"m"(*src1), "m"(*src2)
  387. :"memory");
  388. dst += dstStride;
  389. src1 += src1Stride;
  390. src2 += 8;
  391. } while (--h);
  392. }
  /*
   * avg pixels16_x2: 16-byte-wide variant of the horizontal half-pel "avg".
   * For each row and each 8-byte half, the source bytes are combined with
   * the source bytes one position to the right via PAVGB, and that result is
   * combined with the bytes already in block via PAVGB and written back.
   *
   * block     - destination, read and then overwritten (16 bytes per row)
   * pixels    - source; bytes 0..16 of every row are read
   * line_size - byte stride used for both pointers
   * h         - row count; the do/while body runs before the first test, so
   *             h has to be at least 1
   *
   * "1%1", "8%1", "9%1" and "8%0" paste a displacement in front of the
   * memory operand text.
   * NOTE(review): MOVQ_BFE, PAVGB and JUMPALIGN are defined outside this
   * header; presumably PAVGB is a byte average with the rounding selected by
   * the rnd/no_rnd build and mm6 holds its mask - confirm.
   */
  393. static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  394. {
  395. MOVQ_BFE(mm6);
  396. JUMPALIGN();
  397. do {
  398. asm volatile(
  // left 8 bytes
  399. "movq %1, %%mm0 \n\t"
  400. "movq 1%1, %%mm1 \n\t"
  401. "movq %0, %%mm3 \n\t"
  402. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  403. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  404. "movq %%mm0, %0 \n\t"
  // right 8 bytes
  405. "movq 8%1, %%mm0 \n\t"
  406. "movq 9%1, %%mm1 \n\t"
  407. "movq 8%0, %%mm3 \n\t"
  408. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  409. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  410. "movq %%mm0, 8%0 \n\t"
  411. :"+m"(*block)
  412. :"m"(*pixels)
  413. :"memory");
  414. pixels += line_size;
  415. block += line_size;
  416. } while (--h);
  417. }
  /*
   * avg pixels16_l2: for each row and each 8-byte half, first combine bytes
   * from src1 with bytes from src2 via PAVGB, then combine that result with
   * the bytes already in dst via PAVGB and write it back to dst.
   *
   * dst        - destination, read and then overwritten; advanced by
   *              dstStride per row
   * src1       - first source, advanced by src1Stride per row
   * src2       - second source, advanced by a fixed 16 bytes per row
   * h          - row count; the do/while body runs before the first test, so
   *              h has to be at least 1
   *
   * "8%0", "8%1" and "8%2" paste a displacement of 8 in front of the memory
   * operand text to reach the second half of the row.
   * NOTE(review): MOVQ_BFE, PAVGB and JUMPALIGN are defined outside this
   * header; presumably PAVGB is a byte average with the rounding selected by
   * the rnd/no_rnd build and mm6 holds its mask - confirm.
   */
  418. static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  419. {
  420. MOVQ_BFE(mm6);
  421. JUMPALIGN();
  422. do {
  423. asm volatile(
  // left 8 bytes
  424. "movq %1, %%mm0 \n\t"
  425. "movq %2, %%mm1 \n\t"
  426. "movq %0, %%mm3 \n\t"
  427. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  428. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  429. "movq %%mm0, %0 \n\t"
  // right 8 bytes
  430. "movq 8%1, %%mm0 \n\t"
  431. "movq 8%2, %%mm1 \n\t"
  432. "movq 8%0, %%mm3 \n\t"
  433. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  434. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  435. "movq %%mm0, 8%0 \n\t"
  436. :"+m"(*dst)
  437. :"m"(*src1), "m"(*src2)
  438. :"memory");
  439. dst += dstStride;
  440. src1 += src1Stride;
  441. src2 += 16;
  442. } while (--h);
  443. }
  /*
   * avg pixels8_y2: for each output row, combine 8 source bytes of one row
   * with the 8 bytes directly below them (PAVGBP), then combine that result
   * with the 8 bytes already in block (PAVGB) and write it back to block
   * (vertical half-pel "avg").
   *
   * block     - destination, read and then overwritten (8 bytes per row)
   * pixels    - source; rows 0..h are read (one more row than is written)
   * line_size - byte stride used for both source and destination
   * h         - row count; 4 rows per pass and the loop exits only on an
   *             exact 0, so h has to be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size,
   * REG_a = 2 * line_size. The most recently loaded source row is carried
   * across the two half-passes, alternating between mm0 and mm2.
   * NOTE(review): MOVQ_BFE, PAVGB and PAVGBP are defined outside this header;
   * presumably they are byte averages with the rounding selected by the
   * rnd/no_rnd build and mm6 holds their mask - confirm.
   */
  444. static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  445. {
  446. MOVQ_BFE(mm6);
  447. asm volatile(
  448. "lea (%3, %3), %%"REG_a" \n\t" // REG_a = 2 * line_size
  449. "movq (%1), %%mm0 \n\t" // row 0, carried into the loop
  450. ASMALIGN(3)
  451. "1: \n\t"
  452. "movq (%1, %3), %%mm1 \n\t" // row +1
  453. "movq (%1, %%"REG_a"), %%mm2 \n\t" // row +2, carried to the next half
  454. PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
  // blend both interpolated rows with what is already in the destination
  455. "movq (%2), %%mm3 \n\t"
  456. PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
  457. "movq (%2, %3), %%mm3 \n\t"
  458. PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
  459. "movq %%mm0, (%2) \n\t"
  460. "movq %%mm1, (%2, %3) \n\t"
  461. "add %%"REG_a", %1 \n\t"
  462. "add %%"REG_a", %2 \n\t"
  463. "movq (%1, %3), %%mm1 \n\t"
  464. "movq (%1, %%"REG_a"), %%mm0 \n\t" // carried to the next pass
  465. PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
  466. "movq (%2), %%mm3 \n\t"
  467. PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
  468. "movq (%2, %3), %%mm3 \n\t"
  469. PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
  470. "movq %%mm2, (%2) \n\t"
  471. "movq %%mm1, (%2, %3) \n\t"
  472. "add %%"REG_a", %1 \n\t"
  473. "add %%"REG_a", %2 \n\t"
  474. "subl $4, %0 \n\t" // four rows done per pass
  475. "jnz 1b \n\t"
  476. :"+g"(h), "+S"(pixels), "+D"(block)
  477. :"r"((x86_reg)line_size)
  478. :REG_a, "memory");
  479. }
  480. // this routine is 'slightly' suboptimal but mostly unused
  /*
   * avg pixels8_xy2: for each output byte, sum the four source bytes at
   * (x, y), (x + 1, y), (x, y + 1), (x + 1, y + 1) as 16-bit words, add the
   * constant held in mm6, shift right by 2 and pack; then combine that with
   * the byte already in block via PAVGB and write it back (diagonal half-pel
   * "avg").
   *
   * block     - destination, read and then overwritten (8 bytes per row)
   * pixels    - source; bytes 0..8 of rows 0..h are read
   * line_size - byte stride used for both source and destination
   * h         - row count; 2 rows per pass and the loop exits only on an
   *             exact 0, so h has to be a positive even number
   *
   * Asm operands: %0 = h, %1 = pixels (moved one row down before the loop),
   * %2 = block, %3 = line_size, REG_a = running byte offset of the current
   * row, shared by source and destination addressing.
   * mm6 is taken by the SET_RND constant here, so the mask argument for
   * PAVGB is rebuilt in mm2 for every row: pcmpeqd sets all bits, paddb
   * doubles each byte, leaving 0xFE in every byte.
   * NOTE(review): MOVQ_ZERO, SET_RND and PAVGB are defined outside this
   * header; mm7 is used as the zero operand for unpacking - confirm
   * MOVQ_ZERO clears it, and confirm PAVGB expects a 0xFE byte mask.
   */
  481. static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  482. {
  483. MOVQ_ZERO(mm7);
  484. SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
  485. asm volatile(
  // row 0: widen bytes to words and form pixel + right-neighbour sums
  486. "movq (%1), %%mm0 \n\t"
  487. "movq 1(%1), %%mm4 \n\t"
  488. "movq %%mm0, %%mm1 \n\t"
  489. "movq %%mm4, %%mm5 \n\t"
  490. "punpcklbw %%mm7, %%mm0 \n\t"
  491. "punpcklbw %%mm7, %%mm4 \n\t"
  492. "punpckhbw %%mm7, %%mm1 \n\t"
  493. "punpckhbw %%mm7, %%mm5 \n\t"
  494. "paddusw %%mm0, %%mm4 \n\t"
  495. "paddusw %%mm1, %%mm5 \n\t"
  496. "xor %%"REG_a", %%"REG_a" \n\t" // row offset = 0
  497. "add %3, %1 \n\t" // %1 now addresses the row below
  498. ASMALIGN(3)
  499. "1: \n\t"
  // next row: horizontal sums into mm0/mm1
  500. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  501. "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
  502. "movq %%mm0, %%mm1 \n\t"
  503. "movq %%mm2, %%mm3 \n\t"
  504. "punpcklbw %%mm7, %%mm0 \n\t"
  505. "punpcklbw %%mm7, %%mm2 \n\t"
  506. "punpckhbw %%mm7, %%mm1 \n\t"
  507. "punpckhbw %%mm7, %%mm3 \n\t"
  508. "paddusw %%mm2, %%mm0 \n\t"
  509. "paddusw %%mm3, %%mm1 \n\t"
  // previous-row sums + constant + this-row sums, then divide by 4
  510. "paddusw %%mm6, %%mm4 \n\t"
  511. "paddusw %%mm6, %%mm5 \n\t"
  512. "paddusw %%mm0, %%mm4 \n\t"
  513. "paddusw %%mm1, %%mm5 \n\t"
  514. "psrlw $2, %%mm4 \n\t"
  515. "psrlw $2, %%mm5 \n\t"
  516. "movq (%2, %%"REG_a"), %%mm3 \n\t" // current destination row
  517. "packuswb %%mm5, %%mm4 \n\t"
  518. "pcmpeqd %%mm2, %%mm2 \n\t" // mm2 = all ones
  519. "paddb %%mm2, %%mm2 \n\t" // mm2 = 0xFE in every byte
  520. PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
  521. "movq %%mm5, (%2, %%"REG_a") \n\t"
  522. "add %3, %%"REG_a" \n\t"
  // same again with the register roles exchanged
  523. "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
  524. "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
  525. "movq %%mm2, %%mm3 \n\t"
  526. "movq %%mm4, %%mm5 \n\t"
  527. "punpcklbw %%mm7, %%mm2 \n\t"
  528. "punpcklbw %%mm7, %%mm4 \n\t"
  529. "punpckhbw %%mm7, %%mm3 \n\t"
  530. "punpckhbw %%mm7, %%mm5 \n\t"
  531. "paddusw %%mm2, %%mm4 \n\t"
  532. "paddusw %%mm3, %%mm5 \n\t"
  533. "paddusw %%mm6, %%mm0 \n\t"
  534. "paddusw %%mm6, %%mm1 \n\t"
  535. "paddusw %%mm4, %%mm0 \n\t"
  536. "paddusw %%mm5, %%mm1 \n\t"
  537. "psrlw $2, %%mm0 \n\t"
  538. "psrlw $2, %%mm1 \n\t"
  539. "movq (%2, %%"REG_a"), %%mm3 \n\t" // current destination row
  540. "packuswb %%mm1, %%mm0 \n\t"
  541. "pcmpeqd %%mm2, %%mm2 \n\t"
  542. "paddb %%mm2, %%mm2 \n\t"
  543. PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
  544. "movq %%mm1, (%2, %%"REG_a") \n\t"
  545. "add %3, %%"REG_a" \n\t"
  546. "subl $2, %0 \n\t" // two rows done per pass
  547. "jnz 1b \n\t"
  548. :"+g"(h), "+S"(pixels)
  549. :"D"(block), "r"((x86_reg)line_size)
  550. :REG_a, "memory");
  551. }
  552. //FIXME optimize
  553. static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  554. DEF(put, pixels8_y2)(block , pixels , line_size, h);
  555. DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
  556. }
  557. static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  558. DEF(put, pixels8_xy2)(block , pixels , line_size, h);
  559. DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
  560. }
  561. static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  562. DEF(avg, pixels8_y2)(block , pixels , line_size, h);
  563. DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
  564. }
  565. static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  566. DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
  567. DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
  568. }