; h264_idct_sse2.asm
;*****************************************************************************
;* SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
  25. %include "x86inc.asm"
  26. %include "x86util.asm"
  27. SECTION_RODATA
  28. pw_32: times 8 dw 32
  29. SECTION .text
  30. %macro IDCT4_1D 6
  31. SUMSUB_BA m%3, m%1
  32. SUMSUBD2_AB m%2, m%4, m%6, m%5
  33. SUMSUB_BADC m%2, m%3, m%5, m%1
  34. SWAP %1, %2, %5, %4, %3
  35. %endmacro
  36. INIT_XMM
  37. cglobal x264_add8x4_idct_sse2, 3,3
  38. movq m0, [r1+ 0]
  39. movq m1, [r1+ 8]
  40. movq m2, [r1+16]
  41. movq m3, [r1+24]
  42. movhps m0, [r1+32]
  43. movhps m1, [r1+40]
  44. movhps m2, [r1+48]
  45. movhps m3, [r1+56]
  46. IDCT4_1D 0,1,2,3,4,5
  47. TRANSPOSE2x4x4W 0,1,2,3,4
  48. paddw m0, [pw_32 GLOBAL]
  49. IDCT4_1D 0,1,2,3,4,5
  50. pxor m7, m7
  51. STORE_DIFF m0, m4, m7, [r0]
  52. STORE_DIFF m1, m4, m7, [r0+r2]
  53. lea r0, [r0+r2*2]
  54. STORE_DIFF m2, m4, m7, [r0]
  55. STORE_DIFF m3, m4, m7, [r0+r2]
  56. RET