|
@@ -73,7 +73,7 @@ SECTION .text
|
|
|
|
|
|
INIT_MMX
|
|
|
; ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
; This hunk only renames the entry point: h264_idct_add_mmx becomes
; h264_idct_add_8_mmx. The body is untouched: one IDCT4_ADD expansion on
; r0, r1, r2 followed by RET.
; NOTE(review): the prototype comment in the underlying file still used the
; old name; it is shown here with the post-rename symbol.
|
|
|
-cglobal h264_idct_add_mmx, 3, 3, 0
|
|
|
+cglobal h264_idct_add_8_mmx, 3, 3, 0
|
|
|
IDCT4_ADD r0, r1, r2
|
|
|
RET
|
|
|
|
|
@@ -125,7 +125,7 @@ cglobal h264_idct_add_mmx, 3, 3, 0
|
|
|
SUMSUB_BA w, 0, 4
|
|
|
SUMSUB_BA w, 3, 2
|
|
|
SUMSUB_BA w, 1, 5
|
|
|
- SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
|
|
|
+ SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
|
|
|
%endmacro
|
|
|
|
|
|
%macro IDCT8_1D_FULL 1
|
|
@@ -177,7 +177,7 @@ cglobal h264_idct_add_mmx, 3, 3, 0
|
|
|
|
|
|
INIT_MMX
|
|
|
; ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
|
|
|
-cglobal h264_idct8_add_mmx, 3, 4, 0
|
|
|
+cglobal h264_idct8_add_8_mmx, 3, 4, 0
|
|
|
%assign pad 128+4-(stack_offset&7)
|
|
|
SUB rsp, pad
|
|
|
|
|
@@ -237,7 +237,7 @@ cglobal h264_idct8_add_mmx, 3, 4, 0
|
|
|
|
|
|
INIT_XMM
|
|
|
; ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
; This hunk only renames the entry point: h264_idct8_add_sse2 becomes
; h264_idct8_add_8_sse2. The cglobal arguments (3, 4, 10) and the body are
; untouched: one IDCT8_ADD_SSE expansion on r0, r1, r2, r3 followed by RET.
; NOTE(review): the prototype comment in the underlying file still used the
; old name; it is shown here with the post-rename symbol.
|
|
|
-cglobal h264_idct8_add_sse2, 3, 4, 10
|
|
|
+cglobal h264_idct8_add_8_sse2, 3, 4, 10
|
|
|
IDCT8_ADD_SSE r0, r1, r2, r3
|
|
|
RET
|
|
|
|
|
@@ -261,7 +261,7 @@ cglobal h264_idct8_add_sse2, 3, 4, 10
|
|
|
packuswb m1, m1
|
|
|
%endmacro
|
|
|
|
|
|
-%macro DC_ADD_MMX2_OP 3-4
|
|
|
+%macro DC_ADD_MMX2_OP 4
|
|
|
%1 m2, [%2 ]
|
|
|
%1 m3, [%2+%3 ]
|
|
|
%1 m4, [%2+%3*2]
|
|
@@ -282,13 +282,13 @@ cglobal h264_idct8_add_sse2, 3, 4, 10
|
|
|
|
|
|
INIT_MMX
|
|
|
; ff_h264_idct_dc_add_8_mmx2(uint8_t *dst, int16_t *block, int stride)
; This hunk only renames the entry point: h264_idct_dc_add_mmx2 becomes
; h264_idct_dc_add_8_mmx2. The body is untouched: DC_ADD_MMX2_INIT on r1, r2,
; then DC_ADD_MMX2_OP with movh as the load/store op, then RET.
; The DC_ADD_MMX2_OP call passes all four arguments, so it is compatible with
; the macro now being declared with exactly 4 parameters instead of 3-4.
; NOTE(review): the prototype comment in the underlying file still used the
; old name; it is shown here with the post-rename symbol.
|
|
|
-cglobal h264_idct_dc_add_mmx2, 3, 3, 0
|
|
|
+cglobal h264_idct_dc_add_8_mmx2, 3, 3, 0
|
|
|
DC_ADD_MMX2_INIT r1, r2
|
|
|
DC_ADD_MMX2_OP movh, r0, r2, r1
|
|
|
RET
|
|
|
|
|
|
; ff_h264_idct8_dc_add_8_mmx2(uint8_t *dst, int16_t *block, int stride)
|
|
|
-cglobal h264_idct8_dc_add_mmx2, 3, 3, 0
|
|
|
+cglobal h264_idct8_dc_add_8_mmx2, 3, 3, 0
|
|
|
DC_ADD_MMX2_INIT r1, r2
|
|
|
DC_ADD_MMX2_OP mova, r0, r2, r1
|
|
|
lea r0, [r0+r2*4]
|
|
@@ -297,7 +297,7 @@ cglobal h264_idct8_dc_add_mmx2, 3, 3, 0
|
|
|
|
|
|
; ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
|
|
|
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
|
|
|
-cglobal h264_idct_add16_mmx, 5, 7, 0
|
|
|
+cglobal h264_idct_add16_8_mmx, 5, 7, 0
|
|
|
xor r5, r5
|
|
|
%ifdef PIC
|
|
|
lea r11, [scan8_mem]
|
|
@@ -319,7 +319,7 @@ cglobal h264_idct_add16_mmx, 5, 7, 0
|
|
|
|
|
|
; ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
|
|
|
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
|
|
|
-cglobal h264_idct8_add4_mmx, 5, 7, 0
|
|
|
+cglobal h264_idct8_add4_8_mmx, 5, 7, 0
|
|
|
%assign pad 128+4-(stack_offset&7)
|
|
|
SUB rsp, pad
|
|
|
|
|
@@ -351,7 +351,7 @@ cglobal h264_idct8_add4_mmx, 5, 7, 0
|
|
|
|
|
|
; ff_h264_idct_add16_8_mmx2(uint8_t *dst, const int *block_offset,
|
|
|
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
|
|
|
-cglobal h264_idct_add16_mmx2, 5, 7, 0
|
|
|
+cglobal h264_idct_add16_8_mmx2, 5, 7, 0
|
|
|
xor r5, r5
|
|
|
%ifdef PIC
|
|
|
lea r11, [scan8_mem]
|
|
@@ -398,7 +398,7 @@ cglobal h264_idct_add16_mmx2, 5, 7, 0
|
|
|
|
|
|
; ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
|
|
|
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
|
|
|
-cglobal h264_idct_add16intra_mmx, 5, 7, 0
|
|
|
+cglobal h264_idct_add16intra_8_mmx, 5, 7, 0
|
|
|
xor r5, r5
|
|
|
%ifdef PIC
|
|
|
lea r11, [scan8_mem]
|
|
@@ -421,7 +421,7 @@ cglobal h264_idct_add16intra_mmx, 5, 7, 0
|
|
|
|
|
|
; ff_h264_idct_add16intra_8_mmx2(uint8_t *dst, const int *block_offset,
|
|
|
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
|
|
|
-cglobal h264_idct_add16intra_mmx2, 5, 7, 0
|
|
|
+cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0
|
|
|
xor r5, r5
|
|
|
%ifdef PIC
|
|
|
lea r11, [scan8_mem]
|
|
@@ -466,7 +466,7 @@ cglobal h264_idct_add16intra_mmx2, 5, 7, 0
|
|
|
|
|
|
; ff_h264_idct8_add4_8_mmx2(uint8_t *dst, const int *block_offset,
|
|
|
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
|
|
|
-cglobal h264_idct8_add4_mmx2, 5, 7, 0
|
|
|
+cglobal h264_idct8_add4_8_mmx2, 5, 7, 0
|
|
|
%assign pad 128+4-(stack_offset&7)
|
|
|
SUB rsp, pad
|
|
|
|
|
@@ -529,7 +529,7 @@ cglobal h264_idct8_add4_mmx2, 5, 7, 0
|
|
|
INIT_XMM
|
|
|
; ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
|
|
|
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
|
|
|
-cglobal h264_idct8_add4_sse2, 5, 7, 10
|
|
|
+cglobal h264_idct8_add4_8_sse2, 5, 7, 10
|
|
|
xor r5, r5
|
|
|
%ifdef PIC
|
|
|
lea r11, [scan8_mem]
|
|
@@ -607,7 +607,7 @@ h264_idct_add8_mmx_plane:
|
|
|
|
|
|
; ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
|
|
|
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
|
|
|
-cglobal h264_idct_add8_mmx, 5, 7, 0
|
|
|
+cglobal h264_idct_add8_8_mmx, 5, 7, 0
|
|
|
mov r5, 16
|
|
|
add r2, 512
|
|
|
%ifdef PIC
|
|
@@ -668,7 +668,7 @@ h264_idct_add8_mmx2_plane
|
|
|
|
|
|
; ff_h264_idct_add8_8_mmx2(uint8_t **dest, const int *block_offset,
|
|
|
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
|
|
|
-cglobal h264_idct_add8_mmx2, 5, 7, 0
|
|
|
+cglobal h264_idct_add8_8_mmx2, 5, 7, 0
|
|
|
mov r5, 16
|
|
|
add r2, 512
|
|
|
%ifdef ARCH_X86_64
|
|
@@ -744,7 +744,7 @@ x264_add8x4_idct_sse2:
|
|
|
|
|
|
; ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
|
|
|
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
|
|
|
-cglobal h264_idct_add16_sse2, 5, 5, 8
|
|
|
+cglobal h264_idct_add16_8_sse2, 5, 5, 8
|
|
|
%ifdef ARCH_X86_64
|
|
|
mov r10, r0
|
|
|
%endif
|
|
@@ -791,7 +791,7 @@ cglobal h264_idct_add16_sse2, 5, 5, 8
|
|
|
|
|
|
; ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
|
|
|
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
|
|
|
-cglobal h264_idct_add16intra_sse2, 5, 7, 8
|
|
|
+cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
|
|
|
%ifdef ARCH_X86_64
|
|
|
mov r10, r0
|
|
|
%endif
|
|
@@ -840,7 +840,7 @@ cglobal h264_idct_add16intra_sse2, 5, 7, 8
|
|
|
|
|
|
; ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
|
|
|
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
|
|
|
-cglobal h264_idct_add8_sse2, 5, 7, 8
|
|
|
+cglobal h264_idct_add8_8_sse2, 5, 7, 8
|
|
|
add r2, 512
|
|
|
%ifdef ARCH_X86_64
|
|
|
mov r10, r0
|