/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dct.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_x86.h"

void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                        int stride);
int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);

#if HAVE_INLINE_ASM

static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                    int line_size, int h)
{
    int tmp;

    __asm__ volatile (
        "movl %4, %%ecx \n"
        "shr $1, %%ecx \n"
        "pxor %%mm0, %%mm0 \n"      /* mm0 = 0 */
        "pxor %%mm7, %%mm7 \n"      /* mm7 holds the sum */
        "1: \n"
        "movq (%0), %%mm1 \n"       /* mm1 = pix1[0][0 - 7] */
        "movq (%1), %%mm2 \n"       /* mm2 = pix2[0][0 - 7] */
        "movq (%0, %3), %%mm3 \n"   /* mm3 = pix1[1][0 - 7] */
        "movq (%1, %3), %%mm4 \n"   /* mm4 = pix2[1][0 - 7] */
        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1, %%mm5 \n"
        "movq %%mm3, %%mm6 \n"
        "psubusb %%mm2, %%mm1 \n"
        "psubusb %%mm4, %%mm3 \n"
        "psubusb %%mm5, %%mm2 \n"
        "psubusb %%mm6, %%mm4 \n"
        "por %%mm1, %%mm2 \n"
        "por %%mm3, %%mm4 \n"
        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2, %%mm1 \n"
        "movq %%mm4, %%mm3 \n"
        "punpckhbw %%mm0, %%mm2 \n"
        "punpckhbw %%mm0, %%mm4 \n"
        "punpcklbw %%mm0, %%mm1 \n" /* mm1 now spread over (mm1, mm2) */
        "punpcklbw %%mm0, %%mm3 \n" /* mm4 now spread over (mm3, mm4) */
        "pmaddwd %%mm2, %%mm2 \n"
        "pmaddwd %%mm4, %%mm4 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm3, %%mm3 \n"
        "lea (%0, %3, 2), %0 \n"    /* pix1 += 2 * line_size */
        "lea (%1, %3, 2), %1 \n"    /* pix2 += 2 * line_size */
        "paddd %%mm2, %%mm1 \n"
        "paddd %%mm4, %%mm3 \n"
        "paddd %%mm1, %%mm7 \n"
        "paddd %%mm3, %%mm7 \n"
        "decl %%ecx \n"
        "jnz 1b \n"
        "movq %%mm7, %%mm1 \n"
        "psrlq $32, %%mm7 \n"       /* shift hi dword to lo */
        "paddd %%mm7, %%mm1 \n"
        "movd %%mm1, %2 \n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}

static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     int line_size, int h)
{
    int tmp;

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm0, %%mm0\n"       /* mm0 = 0 */
        "pxor %%mm7, %%mm7\n"       /* mm7 holds the sum */
        "1:\n"
        "movq (%0), %%mm1\n"        /* mm1 = pix1[0 - 7] */
        "movq (%1), %%mm2\n"        /* mm2 = pix2[0 - 7] */
        "movq 8(%0), %%mm3\n"       /* mm3 = pix1[8 - 15] */
        "movq 8(%1), %%mm4\n"       /* mm4 = pix2[8 - 15] */
        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1, %%mm5\n"
        "movq %%mm3, %%mm6\n"
        "psubusb %%mm2, %%mm1\n"
        "psubusb %%mm4, %%mm3\n"
        "psubusb %%mm5, %%mm2\n"
        "psubusb %%mm6, %%mm4\n"
        "por %%mm1, %%mm2\n"
        "por %%mm3, %%mm4\n"
        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2, %%mm1\n"
        "movq %%mm4, %%mm3\n"
        "punpckhbw %%mm0, %%mm2\n"
        "punpckhbw %%mm0, %%mm4\n"
        "punpcklbw %%mm0, %%mm1\n"  /* mm1 now spread over (mm1, mm2) */
        "punpcklbw %%mm0, %%mm3\n"  /* mm4 now spread over (mm3, mm4) */
        "pmaddwd %%mm2, %%mm2\n"
        "pmaddwd %%mm4, %%mm4\n"
        "pmaddwd %%mm1, %%mm1\n"
        "pmaddwd %%mm3, %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "paddd %%mm2, %%mm1\n"
        "paddd %%mm4, %%mm3\n"
        "paddd %%mm1, %%mm7\n"
        "paddd %%mm3, %%mm7\n"
        "decl %%ecx\n"
        "jnz 1b\n"
        "movq %%mm7, %%mm1\n"
        "psrlq $32, %%mm7\n"        /* shift hi dword to lo */
        "paddd %%mm7, %%mm1\n"
        "movd %%mm1, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
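
/* A scalar sketch of what sse8_mmx()/sse16_mmx() above compute: the sum of
 * squared byte differences over a w x h block (w = 8 or 16), matching the
 * generic sse[] semantics. Illustrative only (the _sketch name is not an
 * FFmpeg symbol); kept under #if 0 so it is not built. */
#if 0
static int sse_sketch(const uint8_t *pix1, const uint8_t *pix2,
                      int line_size, int w, int h)
{
    int x, y, sum = 0;

    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++) {
            int d = pix1[x] - pix2[x];
            sum += d * d; /* the MMX path squares pairs of words with pmaddwd */
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
#endif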

static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h)
{
    int tmp;

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm7, %%mm7\n"
        "pxor %%mm6, %%mm6\n"
        "movq (%0), %%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "add %2, %0\n"
        "movq (%0), %%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"
        "add %2, %0\n"
        "1:\n"
        "movq (%0), %%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"
        "add %2, %0\n"
        "movq (%0), %%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"
        "add %2, %0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"
        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7, %%mm0\n"
        "punpckhwd %%mm7, %%mm6\n"
        "paddd %%mm0, %%mm6\n"
        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix1), "=r" (tmp)
        : "r" ((x86_reg) line_size), "g" (h - 2)
        : "%ecx");

    return tmp;
}

static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h)
{
    int tmp;
    uint8_t *pix = pix1;

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm7, %%mm7\n"
        "pxor %%mm6, %%mm6\n"
        "movq (%0), %%mm0\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "add %2, %0\n"
        "movq (%0), %%mm4\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"
        "add %2, %0\n"
        "1:\n"
        "movq (%0), %%mm0\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"
        "add %2, %0\n"
        "movq (%0), %%mm4\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"
        "add %2, %0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"
        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7, %%mm0\n"
        "punpckhwd %%mm7, %%mm6\n"
        "paddd %%mm0, %%mm6\n"
        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix1), "=r" (tmp)
        : "r" ((x86_reg) line_size), "g" (h - 2)
        : "%ecx");

    return tmp + hf_noise8_mmx(pix + 8, line_size, h);
}

static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int score1, score2;

    if (c)
        score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else
        score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2 = hf_noise16_mmx(pix1, line_size, h) -
             hf_noise16_mmx(pix2, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     int line_size, int h)
{
    int score1 = sse8_mmx(c, pix1, pix2, line_size, h);
    int score2 = hf_noise8_mmx(pix1, line_size, h) -
                 hf_noise8_mmx(pix2, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}
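
/* A scalar sketch of the noise-preserving SSE computed above: hf_noise*()
 * sums the absolute second-order differences (a horizontal difference and
 * its change from one row to the next), and nsse*() biases the plain SSE by
 * how much that high-frequency energy differs between the two blocks,
 * roughly sse + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|.
 * Illustrative only (a sketch of the same idea as the generic C nsse path;
 * the _sketch name is not an FFmpeg symbol); not built. */
#if 0
static int hf_noise_sketch(const uint8_t *pix, int line_size, int w, int h)
{
    int x, y, score = 0;

    for (y = 0; y + 1 < h; y++) {
        for (x = 0; x + 1 < w; x++)
            score += FFABS( pix[x]             - pix[x + 1]
                          - pix[x + line_size] + pix[x + 1 + line_size]);
        pix += line_size;
    }
    return score;
}
#endif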

static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq 8(%0), %%mm3\n" \
    "add %2,%0\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "subl $2, %%ecx\n"
        "jnz 1b\n"
        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM
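
/* A scalar sketch of vsad_intra16_mmx() above (and of the MMXEXT variant
 * that follows): the sum of absolute differences between vertically adjacent
 * pixels of one block, a measure of vertical activity. The MMX version keeps
 * only the low 16 bits of the sum (the final "& 0xFFFF"); the MMXEXT variant
 * reaches the same result with psadbw. Illustrative only, not built. */
#if 0
static int vsad_intra16_sketch(const uint8_t *pix, int line_size, int h)
{
    int x, y, score = 0;

    for (y = 0; y + 1 < h; y++) {
        for (x = 0; x < 16; x++)
            score += FFABS(pix[x] - pix[x + line_size]);
        pix += line_size;
    }
    return score;
}
#endif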

static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                               int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n" \
    "movq 8(%0), " #out1 "\n" \
    "add %2, %0\n" \
    "psadbw " #out0 ", " #in0 "\n" \
    "psadbw " #out1 ", " #in1 "\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "subl $2, %%ecx\n"
        "jnz 1b\n"
        "movd %%mm6, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM

static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq (%1), " #out0 "\n" \
    "movq 8(%0), %%mm3\n" \
    "movq 8(%1), " #out1 "\n" \
    "add %3, %0\n" \
    "add %3, %1\n" \
    "psubb " #out0 ", %%mm2\n" \
    "psubb " #out1 ", %%mm3\n" \
    "pxor %%mm7, %%mm2\n" \
    "pxor %%mm7, %%mm3\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "subl $2, %%ecx\n"
        "jnz 1b\n"
        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n" \
    "movq (%1), %%mm2\n" \
    "movq 8(%0), " #out1 "\n" \
    "movq 8(%1), %%mm3\n" \
    "add %3, %0\n" \
    "add %3, %1\n" \
    "psubb %%mm2, " #out0 "\n" \
    "psubb %%mm3, " #out1 "\n" \
    "pxor %%mm7, " #out0 "\n" \
    "pxor %%mm7, " #out1 "\n" \
    "psadbw " #out0 ", " #in0 "\n" \
    "psadbw " #out1 ", " #in1 "\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "subl $2, %%ecx\n"
        "jnz 1b\n"
        "movd %%mm6, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM
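
/* A scalar sketch of vsad16_mmx()/vsad16_mmxext() above: the same vertical
 * activity measure applied to the difference between two planes. In the asm,
 * the pxor with the packed 0x80 constant built in %%mm7 re-biases the signed
 * byte differences so that the unsigned-saturating subtract/por trick (or
 * psadbw) can form the absolute value. Illustrative only, not built. */
#if 0
static int vsad16_sketch(const uint8_t *pix1, const uint8_t *pix2,
                         int line_size, int h)
{
    int x, y, score = 0;

    for (y = 0; y + 1 < h; y++) {
        for (x = 0; x < 16; x++)
            score += FFABS( pix1[x]             - pix2[x]
                          - pix1[x + line_size] + pix2[x + line_size]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return score;
}
#endif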

static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1,
                           const uint8_t *src2, int w)
{
    x86_reg i = 0;

    if (w >= 16)
        __asm__ volatile (
            "1: \n\t"
            "movq (%2, %0), %%mm0 \n\t"
            "movq (%1, %0), %%mm1 \n\t"
            "psubb %%mm0, %%mm1 \n\t"
            "movq %%mm1, (%3, %0) \n\t"
            "movq 8(%2, %0), %%mm0 \n\t"
            "movq 8(%1, %0), %%mm1 \n\t"
            "psubb %%mm0, %%mm1 \n\t"
            "movq %%mm1, 8(%3, %0) \n\t"
            "add $16, %0 \n\t"
            "cmp %4, %0 \n\t"
            " jb 1b \n\t"
            : "+r" (i)
            : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w - 15));

    for (; i < w; i++)
        dst[i + 0] = src1[i + 0] - src2[i + 0];
}

static void sub_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *src1,
                                              const uint8_t *src2, int w,
                                              int *left, int *left_top)
{
    x86_reg i = 0;
    uint8_t l, lt;

    __asm__ volatile (
        "movq (%1, %0), %%mm0 \n\t"    // LT
        "psllq $8, %%mm0 \n\t"
        "1: \n\t"
        "movq (%1, %0), %%mm1 \n\t"    // T
        "movq -1(%2, %0), %%mm2 \n\t"  // L
        "movq (%2, %0), %%mm3 \n\t"    // X
        "movq %%mm2, %%mm4 \n\t"       // L
        "psubb %%mm0, %%mm2 \n\t"
        "paddb %%mm1, %%mm2 \n\t"      // L + T - LT
        "movq %%mm4, %%mm5 \n\t"       // L
        "pmaxub %%mm1, %%mm4 \n\t"     // max(T, L)
        "pminub %%mm5, %%mm1 \n\t"     // min(T, L)
        "pminub %%mm2, %%mm4 \n\t"
        "pmaxub %%mm1, %%mm4 \n\t"
        "psubb %%mm4, %%mm3 \n\t"      // dst - pred
        "movq %%mm3, (%3, %0) \n\t"
        "add $8, %0 \n\t"
        "movq -1(%1, %0), %%mm0 \n\t"  // LT
        "cmp %4, %0 \n\t"
        " jb 1b \n\t"
        : "+r" (i)
        : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w));

    l  = *left;
    lt = *left_top;

    dst[0] = src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt) & 0xFF);

    *left_top = src1[w - 1];
    *left     = src2[w - 1];
}
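
/* A scalar sketch of the median prediction performed above (the predictor
 * used by HuffYUV's median mode): pred = median(L, T, L + T - LT) and
 * dst[i] = src2[i] - pred, with L and LT advancing along the row.
 * Illustrative only (the _sketch name is not an FFmpeg symbol); not built. */
#if 0
static void sub_hfyu_median_prediction_sketch(uint8_t *dst,
                                              const uint8_t *src1, /* top row */
                                              const uint8_t *src2, /* current row */
                                              int w, int *left, int *left_top)
{
    int i;
    uint8_t l = *left, lt = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
        dst[i] = src2[i] - pred;
        lt     = src1[i]; /* becomes top-left for the next column */
        l      = src2[i]; /* becomes left for the next column */
    }
    *left     = l;
    *left_top = lt;
}
#endif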

#define MMABS_MMX(a,z) \
    "pxor " #z ", " #z " \n\t" \
    "pcmpgtw " #a ", " #z " \n\t" \
    "pxor " #z ", " #a " \n\t" \
    "psubw " #z ", " #a " \n\t"

#define MMABS_MMXEXT(a, z) \
    "pxor " #z ", " #z " \n\t" \
    "psubw " #a ", " #z " \n\t" \
    "pmaxsw " #z ", " #a " \n\t"

#define MMABS_SSSE3(a,z) \
    "pabsw " #a ", " #a " \n\t"

#define MMABS_SUM(a,z, sum) \
    MMABS(a,z) \
    "paddusw " #a ", " #sum " \n\t"

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
 * up to about 100k on extreme inputs. But that's very unlikely to occur in
 * natural video, and it's even more unlikely to not have any alternative
 * mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst) \
    "movq " #a ", " #t " \n\t" \
    "psrlq $32, " #a " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movq " #a ", " #t " \n\t" \
    "psrlq $16, " #a " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movd " #a ", " #dst " \n\t"

#define HSUM_MMXEXT(a, t, dst) \
    "pshufw $0x0E, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "pshufw $0x01, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movd " #a ", " #dst " \n\t"

#define HSUM_SSE2(a, t, dst) \
    "movhlps " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "pshuflw $0x0E, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "pshuflw $0x01, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movd " #a ", " #dst " \n\t"

#define DCT_SAD4(m, mm, o) \
    "mov"#m" "#o" + 0(%1), " #mm "2 \n\t" \
    "mov"#m" "#o" + 16(%1), " #mm "3 \n\t" \
    "mov"#m" "#o" + 32(%1), " #mm "4 \n\t" \
    "mov"#m" "#o" + 48(%1), " #mm "5 \n\t" \
    MMABS_SUM(mm ## 2, mm ## 6, mm ## 0) \
    MMABS_SUM(mm ## 3, mm ## 7, mm ## 1) \
    MMABS_SUM(mm ## 4, mm ## 6, mm ## 0) \
    MMABS_SUM(mm ## 5, mm ## 7, mm ## 1)

#define DCT_SAD_MMX \
    "pxor %%mm0, %%mm0 \n\t" \
    "pxor %%mm1, %%mm1 \n\t" \
    DCT_SAD4(q, %%mm, 0) \
    DCT_SAD4(q, %%mm, 8) \
    DCT_SAD4(q, %%mm, 64) \
    DCT_SAD4(q, %%mm, 72) \
    "paddusw %%mm1, %%mm0 \n\t" \
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2 \
    "pxor %%xmm0, %%xmm0 \n\t" \
    "pxor %%xmm1, %%xmm1 \n\t" \
    DCT_SAD4(dqa, %%xmm, 0) \
    DCT_SAD4(dqa, %%xmm, 64) \
    "paddusw %%xmm1, %%xmm0 \n\t" \
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_ ## cpu(int16_t *block) \
{ \
    int sum; \
    __asm__ volatile ( \
        DCT_SAD \
        :"=r"(sum) \
        :"r"(block)); \
    return sum & 0xFFFF; \
}

#define DCT_SAD         DCT_SAD_MMX
#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
#define MMABS(a, z)     MMABS_MMX(a, z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
#define MMABS(a, z)     MMABS_MMXEXT(a, z)
DCT_SAD_FUNC(mmxext)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD         DCT_SAD_SSE2
#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3_INLINE
#define MMABS(a, z) MMABS_SSSE3(a, z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD
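
/* A scalar sketch of the sum_abs_dctelem_*() functions generated above: the
 * sum of the absolute values of the 64 DCT coefficients of one block, with
 * the caveat from the FIXME comment that the SIMD horizontal sums saturate
 * at 0xFFFF. Illustrative only, not built. */
#if 0
static int sum_abs_dctelem_sketch(int16_t *block)
{
    int i, sum = 0;

    for (i = 0; i < 64; i++)
        sum += FFABS(block[i]);
    return sum;
}
#endif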

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
                                 int size)
{
    int sum;
    x86_reg i = size;

    __asm__ volatile (
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2, %0), %%mm2 \n"
        "movq (%3, %0, 2), %%mm0 \n"
        "movq 8(%3, %0, 2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        : "+r" (i), "=r" (sum)
        : "r" (pix1), "r" (pix2));

    return sum;
}
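
/* A scalar sketch of ssd_int8_vs_int16_mmx() above: the sum of squared
 * differences between an int8_t array and an int16_t array of equal length.
 * Illustrative only, not built. */
#if 0
static int ssd_int8_vs_int16_sketch(const int8_t *pix1, const int16_t *pix2,
                                    int size)
{
    int i, sum = 0;

    for (i = 0; i < size; i++) {
        int d = pix1[i] - pix2[i];
        sum += d * d;
    }
    return sum;
}
#endif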

#define PHADDD(a, t) \
    "movq " #a ", " #t " \n\t" \
    "psrlq $32, " #a " \n\t" \
    "paddd " #t ", " #a " \n\t"

/*
 * pmulhw:   dst[0 - 15] = (src[0 - 15] * dst[0 - 15])[16 - 31]
 * pmulhrw:  dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x8000)[16 - 31]
 * pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
 */
#define PMULHRW(x, y, s, o) \
    "pmulhw " #s ", " #x " \n\t" \
    "pmulhw " #s ", " #y " \n\t" \
    "paddw " #o ", " #x " \n\t" \
    "paddw " #o ", " #y " \n\t" \
    "psraw $1, " #x " \n\t" \
    "psraw $1, " #y " \n\t"

#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o) \
    "pmulhrw " #s ", " #x " \n\t" \
    "pmulhrw " #s ", " #y " \n\t"

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3_INLINE
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1

#define PHADDD(a, t) \
    "pshufw $0x0E, " #a ", " #t " \n\t" \
    /* faster than phaddd on core2 */ \
    "paddd " #t ", " #a " \n\t"

#define PMULHRW(x, y, s, o) \
    "pmulhrsw " #s ", " #x " \n\t" \
    "pmulhrsw " #s ", " #y " \n\t"

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif /* HAVE_SSSE3_INLINE */

#endif /* HAVE_INLINE_ASM */

int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  int line_size, int h);

#define hadamard_func(cpu) \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                                  uint8_t *src2, int stride, int h); \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                                    uint8_t *src2, int stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                    unsigned high_bit_depth)
{
    int cpu_flags = av_get_cpu_flags();
    const int dct_algo = avctx->dct_algo;

    if (EXTERNAL_MMX(cpu_flags)) {
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_mmx;
        c->diff_pixels = ff_diff_pixels_mmx;
        c->pix_sum     = ff_pix_sum16_mmx;
        c->pix_norm1   = ff_pix_norm1_mmx;
    }

    if (EXTERNAL_SSE2(cpu_flags))
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_sse2;

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        if (!high_bit_depth &&
            (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
            c->fdct = ff_fdct_mmx;

        c->diff_bytes      = diff_bytes_mmx;
        c->sum_abs_dctelem = sum_abs_dctelem_mmx;

        c->sse[0]  = sse16_mmx;
        c->sse[1]  = sse8_mmx;
        c->vsad[4] = vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0]      = vsad16_mmx;
            c->try_8x8basis = try_8x8basis_mmx;
        }
        c->add_8x8basis = add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
    }

    if (INLINE_AMD3DNOW(cpu_flags)) {
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->try_8x8basis = try_8x8basis_3dnow;
        }
        c->add_8x8basis = add_8x8basis_3dnow;
    }

    if (INLINE_MMXEXT(cpu_flags)) {
        if (!high_bit_depth &&
            (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
            c->fdct = ff_fdct_mmxext;

        c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
        c->vsad[4]         = vsad_intra16_mmxext;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmxext;
        }

        c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_mmxext;
    }

    if (INLINE_SSE2(cpu_flags)) {
        if (!high_bit_depth &&
            (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
            c->fdct = ff_fdct_sse2;

        c->sum_abs_dctelem = sum_abs_dctelem_sse2;
    }

#if HAVE_SSSE3_INLINE
    if (INLINE_SSSE3(cpu_flags)) {
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->try_8x8basis = try_8x8basis_ssse3;
        }
        c->add_8x8basis    = add_8x8basis_ssse3;
        c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
    }
#endif
#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0] = ff_sse16_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
    }

    if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
    }

    ff_dsputil_init_pix_mmx(c, avctx);
}