@@ -35,6 +35,20 @@ pb_shuffle2013: db 2, 0, 1, 3, 6, 4, 5, 7, 10, 8, 9, 11, 14, 12, 13, 15
pb_shuffle2130: db 2, 1, 3, 0, 6, 5, 7, 4, 10, 9, 11, 8, 14, 13, 15, 12
pb_shuffle1203: db 1, 2, 0, 3, 5, 6, 4, 7, 9, 10, 8, 11, 13, 14, 12, 15

+%if HAVE_AVX512ICL_EXTERNAL
+; shuffle vector to rearrange packuswb result to be linear
+shuf_packus: db 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51,\
+             4, 5, 6, 7, 20, 21, 22, 23, 36, 37, 38, 39, 52, 53, 54, 55,\
+             8, 9, 10, 11, 24, 25, 26, 27, 40, 41, 42, 43, 56, 57, 58, 59,\
+             12, 13, 14, 15, 28, 29, 30, 31, 44, 45, 46, 47, 60, 61, 62, 63
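+; (packuswb packs within each 128-bit lane, so two packing passes leave the
+;  chroma bytes of the four source vectors lane-interleaved in groups of four;
+;  this dword-granular vpermb table gathers them back into source order)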
+
+; shuffle vector to combine odd elements from two vectors to extract Y
+shuf_perm2b: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,\
+             33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,\
+             65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,\
+             97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
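+; (bit 6 of a vpermi2b index selects the second source operand, so the odd
+;  indices 1..63 take the Y bytes of the first input and 65..127 the Y bytes
+;  of the second, yielding 64 consecutive luma samples)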
+%endif
+
SECTION .text

%macro RSHIFT_COPY 5
@@ -156,9 +170,20 @@ SHUFFLE_BYTES 1, 2, 0, 3
; int lumStride, int chromStride, int srcStride)
;-----------------------------------------------------------------------------------------------
%macro UYVY_TO_YUV422 0
+%if mmsize == 64
+; need two more registers to store shuffle vectors for AVX512ICL
+cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
+%else
cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
+%endif
pxor m0, m0
+%if mmsize == 64
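+    ; the EVEX-encoded vpcmpeqw writes a mask register rather than a vector,
+    ; so build the all-ones vector with a vpternlogd truth table instead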
+    vpternlogd m1, m1, m1, 0xff ; m1 = _mm512_set1_epi8(0xff)
+    movu m8, [shuf_packus]
+    movu m9, [shuf_perm2b]
+%else
pcmpeqw m1, m1
+%endif
psrlw m1, 8

movsxdifnidn wq, wd
@@ -188,6 +213,63 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
and xq, mmsize * 2 - 1
je .loop_simd

+%if mmsize == 64
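+    ; xq = leftover luma samples (pixels); halve it to count leftover UYVY
+    ; pixel pairs, then build a bitmask of that many low bits and split it
+    ; into the per-load read masks and per-store write masks used below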
+    shr xq, 1
+    mov tmpq, -1
+    shlx tmpq, tmpq, xq
+    not tmpq
+    kmovq k7, tmpq ; write mask for U/V
+    kmovd k1, tmpd ; write mask for 1st half of Y
+    kmovw k3, tmpd ; read mask for 1st vector
+    shr tmpq, 16
+    kmovw k4, tmpd ; read mask for 2nd vector
+    shr tmpq, 16
+    kmovd k2, tmpd ; write mask for 2nd half of Y
+    kmovw k5, tmpd ; read mask for 3rd vector
+    shr tmpd, 16
+    kmovw k6, tmpd ; read mask for 4th vector
+
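+    ; dword-granular masked loads of the remaining input; {z} zeroes the
+    ; elements beyond the tail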
+    vmovdqu32 m2{k3}{z}, [srcq + wtwoq]
+    vmovdqu32 m3{k4}{z}, [srcq + wtwoq + mmsize]
+    vmovdqu32 m4{k5}{z}, [srcq + wtwoq + mmsize * 2]
+    vmovdqu32 m5{k6}{z}, [srcq + wtwoq + mmsize * 3]
+
+    ; extract y part 1
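+    ; vpermi2b overwrites its index operand with the result, so the indices
+    ; are copied from m9 before each permute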
+    mova m6, m9
+    vpermi2b m6, m2, m3 ; UYVY UYVY -> YYYY using permute
+    vmovdqu16 [ydstq + wq]{k1}, m6
+
+    ; extract y part 2
+    mova m7, m9
+    vpermi2b m7, m4, m5 ; UYVY UYVY -> YYYY using permute
+    vmovdqu16 [ydstq + wq + mmsize]{k2}, m7
+
+    ; extract uv
+    pand m2, m1 ; UxVx...
+    pand m3, m1 ; UxVx...
+    pand m4, m1 ; UxVx...
+    pand m5, m1 ; UxVx...
+    packuswb m2, m3 ; UVUV...
+    packuswb m4, m5 ; UVUV...
+
+    ; U
+    pand m6, m2, m1 ; UxUx...
+    pand m7, m4, m1 ; UxUx...
+    packuswb m6, m7 ; UUUU
+    vpermb m6, m8, m6
+    vmovdqu8 [udstq + whalfq]{k7}, m6
+
+    ; V
+    psrlw m2, 8 ; VxVx...
+    psrlw m4, 8 ; VxVx...
+    packuswb m2, m4 ; VVVV
+    vpermb m2, m8, m2
+    vmovdqu8 [vdstq + whalfq]{k7}, m2
+
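+    ; advance the offsets past the tail: 2 luma bytes, 4 source bytes and
+    ; 1 chroma byte per leftover pixel pair counted in xq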
+    lea wq, [wq + 2 * xq]
+    lea wtwoq, [wtwoq + 4 * xq]
+    add whalfq, xq
+%else
.loop_scalar:
mov tmpb, [srcq + wtwoq + 0]
mov [udstq + whalfq], tmpb
@@ -206,6 +288,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
add whalfq, 1
sub xq, 2
jg .loop_scalar
+%endif

; check if simd loop is needed
cmp wq, 0
@@ -228,6 +311,17 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
movu m5, [srcq + wtwoq + mmsize * 3]
%endif

+%if mmsize == 64
+    ; extract y part 1
+    mova m6, m9
+    vpermi2b m6, m2, m3 ; UYVY UYVY -> YYYY using permute
+    movu [ydstq + wq], m6
+
+    ; extract y part 2
+    mova m7, m9
+    vpermi2b m7, m4, m5 ; UYVY UYVY -> YYYY using permute
+    movu [ydstq + wq + mmsize], m7
+%else
; extract y part 1
RSHIFT_COPY m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
pand m6, m1; YxYx YxYx...
@@ -247,6 +341,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s

packuswb m6, m7 ; YYYY YYYY...
movu [ydstq + wq + mmsize], m6
+%endif

; extract uv
pand m2, m1 ; UxVx...
@@ -262,6 +357,9 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
pand m7, m4, m1 ; UxUx...

packuswb m6, m7 ; UUUU
+%if mmsize == 64
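+    ; packuswb packed within 128-bit lanes; permute the result back into
+    ; linear order before the store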
+    vpermb m6, m8, m6
+%endif
movu [udstq + whalfq], m6


@@ -269,6 +367,9 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
psrlw m2, 8 ; VxVx...
psrlw m4, 8 ; VxVx...
packuswb m2, m4 ; VVVV
+%if mmsize == 64
+    vpermb m2, m8, m2
+%endif
movu [vdstq + whalfq], m2

add whalfq, mmsize
@@ -303,4 +404,8 @@ UYVY_TO_YUV422
INIT_YMM avx2
UYVY_TO_YUV422
%endif
+%if HAVE_AVX512ICL_EXTERNAL
+INIT_ZMM avx512icl
+UYVY_TO_YUV422
+%endif
%endif