vf_maskedmerge.asm 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. ;*****************************************************************************
  2. ;* x86-optimized functions for maskedmerge filter
  3. ;*
  4. ;* Copyright (C) 2015 Paul B Mahol
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;*****************************************************************************
  22. %include "libavutil/x86/x86util.asm"
  ; Read-only 16-bit constants for the maskedmerge8 kernel. Each label holds
  ; eight identical words (16 bytes), i.e. one full XMM register, and is
  ; fetched with an aligned load (mova) by the code.
  SECTION_RODATA

  pw_128:                                 ; added to each word sum before the >> 8
      times 8 dw 128
  pw_256:                                 ; the mask word is subtracted from this
      times 8 dw 256
  SECTION .text

  ;-----------------------------------------------------------------------------
  ; maskedmerge8 (SSE2): blend two 8-bit planes under a per-byte 8-bit mask.
  ;
  ; For every byte position, evaluated in unsigned 16-bit lanes:
  ;     dst = (bsrc * (256 - msrc) + osrc * msrc + 128) >> 8
  ; With all inputs in 0..255 the sum is at most 255*256 + 128 = 65408, so it
  ; never wraps a word and the closing packuswb never has to clip.
  ;
  ; Arguments, in cglobal order:
  ;     bsrc, osrc, msrc  - the three input row pointers (base / overlay / mask
  ;                         going by the names; confirm against the C caller)
  ;     dst               - output row pointer
  ;     blinesize, olinesize, mlinesize, dlinesize
  ;                       - added to the matching pointer after every row
  ;     w                 - bytes per row, read as a 32-bit value
  ;     h                 - row count, read as a 32-bit value
  ;
  ; Register roles inside the loops:
  ;     m0 = bsrc words      m1 = osrc words, then the result
  ;     m2 = 256 - mask, then bsrc * (256 - mask)
  ;     m3 = mask words      m4 = 256      m5 = 128      m6 = 0
  ;
  ; Both loops test their condition at the bottom: at least one row is
  ; processed, and each row is handled in whole groups of 8 bytes (at least
  ; one group) until x is no longer negative.
  ; NOTE(review): all four buffers therefore get accessed up to the next
  ; multiple of 8 at or past w - presumably the caller guarantees that
  ; padding; verify.
  ;-----------------------------------------------------------------------------
  INIT_XMM sse2
  %if ARCH_X86_64
  ; 64-bit: the first eight arguments are loaded by cglobal; w and h are
  ; fetched by hand as dwords from their argument slots.
  cglobal maskedmerge8, 8, 11, 7, bsrc, osrc, msrc, dst, blinesize, olinesize, mlinesize, dlinesize, w, h, x
      mov             hd, dword hm
      mov             wd, dword wm
  %else
  ; 32-bit: only seven GPRs are requested, so just the first five arguments
  ; are loaded. The other three line sizes and h are used directly from
  ; their argument slots; w is pulled into a register.
  cglobal maskedmerge8, 5, 7, 7, bsrc, osrc, msrc, dst, blinesize, w, x
      %define olinesizeq r5mp
      %define mlinesizeq r6mp
      %define dlinesizeq r7mp
      %define hd r9mp
      mov             wd, r8m
  %endif

      pxor            m6, m6                  ; zero, high half for byte->word unpack
      mova            m5, [pw_128]
      mova            m4, [pw_256]

      ; Point every buffer at the end of its row and keep -w in wq, so one
      ; negative index counting up towards zero addresses all four rows.
      add           dstq, wq
      add          msrcq, wq
      add          osrcq, wq
      add          bsrcq, wq
      neg             wq

  .row:
      mov             xq, wq                  ; x = -w

  .pixels:
          movh            m3, [msrcq + xq]    ; 8 mask bytes
          movh            m1, [osrcq + xq]    ; 8 osrc bytes
          movh            m0, [bsrcq + xq]    ; 8 bsrc bytes
          punpcklbw       m3, m6              ; widen all three to words
          punpcklbw       m1, m6
          punpcklbw       m0, m6
          mova            m2, m4
          psubw           m2, m3              ; 256 - mask
          pmullw          m1, m3              ; osrc * mask
          pmullw          m2, m0              ; bsrc * (256 - mask)
          paddw           m1, m2
          paddw           m1, m5              ; + 128
          psrlw           m1, 8
          packuswb        m1, m1              ; words back to bytes
          movh   [dstq + xq], m1
          add             xq, mmsize / 2      ; half a vector of bytes per pass
          jl              .pixels             ; flags from the add: loop while x < 0

      ; Step all four pointers to the next row.
      add           dstq, dlinesizeq
      add          msrcq, mlinesizeq
      add          osrcq, olinesizeq
      add          bsrcq, blinesizeq
      sub             hd, 1
      jg              .row
      REP_RET