Browse Source

Merge branch 'sws_32bit_integration'

* sws_32bit_integration:
  regtests/sws: update checksums for recent changes
  sws: dont mess with XInc when the code needing it isnt used
  sws: Fix chroma init for 32bit buffers.
  swscale: error dithering for 16/9/10-bit to 8-bit.
  swscale: fix overflow in 16-bit vertical scaling.
  swscale: fix crash in 8-bpc bilinear output without alpha.
  swscale: fix 16-bit scaling when output is 8-bits.
  sws: fix non native endian 9-15 bit input with 16bit out
  sws: disable scale16 when int32 is used
  sws: fix rgb -> 16bit
  sws: fix uv overwrite in 32bt
  sws: fix gray16_1
  sws:ix yuv2rgb48_1_c_template()
  sws: fix 16/32 bug from merge
  swscale: for >8bit scaling, read in native bit-depth.
  swscale: fix another yuv range conversion overflow in 16bit scaling. (cherry picked from commit 81cc7d0bd1eab0aa782ff8dd49e087025a42cdee)
  swscale: fix yuv range correction when using 16-bit scaling. (cherry picked from commit e0b8fff6c7a293e35079ba1931bd19372686b3f6)
  swscale: implement >8bit scaling support.

Merged-by: Michael Niedermayer <michaelni@gmx.at>
Michael Niedermayer 13 years ago
parent
commit
3b2d285afb

+ 4 - 2
libswscale/ppc/swscale_altivec.c

@@ -220,7 +220,7 @@ yuv2yuvX_altivec_real(SwsContext *c,
     }
 }
 
-static void hScale_altivec_real(int16_t *dst, int dstW,
+static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW,
                                 const uint8_t *src, const int16_t *filter,
                                 const int16_t *filterPos, int filterSize)
 {
@@ -406,7 +406,9 @@ void ff_sws_init_swScale_altivec(SwsContext *c)
     if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
         return;
 
-    c->hScale       = hScale_altivec_real;
+    if (c->scalingBpp == 8) {
+        c->hScale       = hScale_altivec_real;
+    }
     if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
         dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21 &&
         !c->alpPixBuf) {

File diff suppressed because it is too large
+ 396 - 205
libswscale/swscale.c


+ 16 - 5
libswscale/swscale_internal.h

@@ -77,8 +77,7 @@ typedef int (*SwsFunc)(struct SwsContext *context, const uint8_t* src[],
 typedef void (*yuv2planar1_fn) (struct SwsContext *c,
                                 const int16_t *lumSrc, const int16_t *chrUSrc,
                                 const int16_t *chrVSrc, const int16_t *alpSrc,
-                                uint8_t *dest[4], int dstW, int chrDstW,
-                                const uint8_t *lumDither, const uint8_t *chrDither);
+                                uint8_t *dest[4], int dstW, int chrDstW);
 /**
  * Write one line of horizontally scaled Y/U/V/A to planar output
  * with multi-point vertical scaling between input pixels.
@@ -101,7 +100,7 @@ typedef void (*yuv2planarX_fn) (struct SwsContext *c, const int16_t *lumFilter,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,  int chrFilterSize,
                                 const int16_t **alpSrc, uint8_t *dest[4],
-                                int dstW, int chrDstW, const uint8_t *lumDither, const uint8_t *chrDither);
+                                int dstW, int chrDstW);
 /**
  * Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
  * output without any additional vertical scaling (or point-scaling). Note
@@ -210,6 +209,7 @@ typedef struct SwsContext {
     enum PixelFormat srcFormat;   ///< Source      pixel format.
     int dstFormatBpp;             ///< Number of bits per pixel of the destination pixel format.
     int srcFormatBpp;             ///< Number of bits per pixel of the source      pixel format.
+    int scalingBpp;
     int chrSrcHSubSample;         ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in source      image.
     int chrSrcVSubSample;         ///< Binary logarithm of vertical   subsampling factor between luma/alpha and chroma planes in source      image.
     int chrDstHSubSample;         ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in destination image.
@@ -324,7 +324,7 @@ typedef struct SwsContext {
 #define UV_OFF                "11*8+4*4*256*3+48"
 #define UV_OFFx2              "11*8+4*4*256*3+56"
 #define DITHER16              "11*8+4*4*256*3+64"
-#define DITHER32              "11*8+4*4*256*3+64+16"
+#define DITHER32              "11*8+4*4*256*3+80"
 
     DECLARE_ALIGNED(8, uint64_t, redDither);
     DECLARE_ALIGNED(8, uint64_t, greenDither);
@@ -352,6 +352,8 @@ typedef struct SwsContext {
     uint16_t dither16[8];
     uint32_t dither32[8];
 
+    const uint8_t *chrDither8, *lumDither8;
+
 #if HAVE_ALTIVEC
     vector signed short   CY;
     vector signed short   CRV;
@@ -451,7 +453,7 @@ typedef struct SwsContext {
      *                   (and input coefficients thus padded with zeroes)
      *                   to simplify creating SIMD code.
      */
-    void (*hScale)(int16_t *dst, int dstW, const uint8_t *src,
+    void (*hScale)(struct SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
                    const int16_t *filter, const int16_t *filterPos,
                    int filterSize);
 
@@ -462,6 +464,15 @@ typedef struct SwsContext {
     void (*lumConvertRange)(int16_t *dst, int width); ///< Color range conversion function for luma plane if needed.
     void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width); ///< Color range conversion function for chroma planes if needed.
 
+    /**
+     * dst[..] = (src[..] << 8) | src[..];
+     */
+    void (*scale8To16Rv)(uint16_t *dst, const uint8_t *src, int len);
+    /**
+     * dst[..] = src[..] >> 4;
+     */
+    void (*scale19To15Fw)(int16_t *dst, const int32_t *src, int len);
+
     int needs_hcscale; ///< Set if there are chroma planes to be converted.
 
 } SwsContext;

+ 20 - 7
libswscale/utils.c

@@ -46,6 +46,7 @@
 #include "libavutil/bswap.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/avassert.h"
 
 unsigned swscale_version(void)
 {
@@ -777,7 +778,7 @@ SwsContext *sws_alloc_context(void)
 
 int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
 {
-    int i;
+    int i, j;
     int usesVFilter, usesHFilter;
     int unscaled;
     SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
@@ -785,7 +786,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
     int srcH= c->srcH;
     int dstW= c->dstW;
     int dstH= c->dstH;
-    int dst_stride = FFALIGN(dstW * sizeof(int16_t)+66, 16), dst_stride_px = dst_stride >> 1;
+    int dst_stride = FFALIGN(dstW * sizeof(int16_t)+66, 16);
     int flags, cpu_flags;
     enum PixelFormat srcFormat= c->srcFormat;
     enum PixelFormat dstFormat= c->dstFormat;
@@ -882,8 +883,14 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
         }
     }
 
+    c->scalingBpp = FFMAX(av_pix_fmt_descriptors[srcFormat].comp[0].depth_minus1,
+                          av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1) >= 15 ? 16 : 8;
+
+    if (c->scalingBpp == 16)
+        dst_stride <<= 1;
+    av_assert0(c->scalingBpp<=16);
     FF_ALLOC_OR_GOTO(c, c->formatConvBuffer, FFALIGN(srcW*2+78, 16) * 2, fail);
-    if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2) {
+    if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2 && c->scalingBpp == 8) {
         c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
         if (!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR)) {
             if (flags&SWS_PRINT_INFO)
@@ -909,7 +916,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
             c->chrXInc+= 20;
         }
         //we don't use the x86 asm scaler if MMX is available
-        else if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
+        else if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX && c->scalingBpp == 8) {
             c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
             c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
         }
@@ -1040,12 +1047,12 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
         FF_ALLOCZ_OR_GOTO(c, c->lumPixBuf[i+c->vLumBufSize], dst_stride+1, fail);
         c->lumPixBuf[i] = c->lumPixBuf[i+c->vLumBufSize];
     }
-    c->uv_off = dst_stride_px;
+    c->uv_off = dst_stride>>1;
     c->uv_offx2 = dst_stride;
     for (i=0; i<c->vChrBufSize; i++) {
         FF_ALLOC_OR_GOTO(c, c->chrUPixBuf[i+c->vChrBufSize], dst_stride*2+1, fail);
         c->chrUPixBuf[i] = c->chrUPixBuf[i+c->vChrBufSize];
-        c->chrVPixBuf[i] = c->chrVPixBuf[i+c->vChrBufSize] = c->chrUPixBuf[i] + dst_stride_px;
+        c->chrVPixBuf[i] = c->chrVPixBuf[i+c->vChrBufSize] = c->chrUPixBuf[i] + (dst_stride >> 1);
     }
     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
         for (i=0; i<c->vLumBufSize; i++) {
@@ -1055,7 +1062,13 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
 
     //try to avoid drawing green stuff between the right end and the stride end
     for (i=0; i<c->vChrBufSize; i++)
-        memset(c->chrUPixBuf[i], 64, dst_stride*2+1);
+        if(av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 == 15){
+            av_assert0(c->scalingBpp == 16);
+            for(j=0; j<dst_stride/2+1; j++)
+                ((int32_t*)(c->chrUPixBuf[i]))[j] = 1<<18;
+        } else
+            for(j=0; j<dst_stride+1; j++)
+                ((int16_t*)(c->chrUPixBuf[i]))[j] = 1<<14;
 
     assert(c->chrDstH <= dstH);
 

+ 136 - 98
libswscale/x86/swscale_template.c

@@ -70,26 +70,62 @@
         : "%"REG_d, "%"REG_S\
     );
 
+#if !COMPILE_TEMPLATE_MMX2
+static av_always_inline void
+dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot)
+{
+    if (rot) {
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
+                         "movq       (%0), %%mm3\n\t"
+                         "movq      %%mm3, %%mm4\n\t"
+                         "psrlq       $24, %%mm3\n\t"
+                         "psllq       $40, %%mm4\n\t"
+                         "por       %%mm4, %%mm3\n\t"
+                         "movq      %%mm3, %%mm4\n\t"
+                         "punpcklbw %%mm0, %%mm3\n\t"
+                         "punpckhbw %%mm0, %%mm4\n\t"
+                         "psraw        $4, %%mm3\n\t"
+                         "psraw        $4, %%mm4\n\t"
+                         "movq      %%mm3, "DITHER16"+0(%1)\n\t"
+                         "movq      %%mm4, "DITHER16"+8(%1)\n\t"
+                         :: "r"(srcDither), "r"(&c->redDither)
+                         );
+    } else {
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
+                         "movq       (%0), %%mm3\n\t"
+                         "movq      %%mm3, %%mm4\n\t"
+                         "punpcklbw %%mm0, %%mm3\n\t"
+                         "punpckhbw %%mm0, %%mm4\n\t"
+                         "psraw        $4, %%mm3\n\t"
+                         "psraw        $4, %%mm4\n\t"
+                         "movq      %%mm3, "DITHER16"+0(%1)\n\t"
+                         "movq      %%mm4, "DITHER16"+8(%1)\n\t"
+                         :: "r"(srcDither), "r"(&c->redDither)
+                         );
+    }
+}
+#endif
+
 static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc,
                              int chrFilterSize, const int16_t **alpSrc,
-                             uint8_t *dest[4], int dstW, int chrDstW,
-                             const uint8_t *lumDither, const uint8_t *chrDither)
+                             uint8_t *dest[4], int dstW, int chrDstW)
 {
     int i;
     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 
     if (uDest) {
-        x86_reg uv_off = c->uv_off;
-        for(i=0; i<8; i++) c->dither16[i] = chrDither[i]>>4;
+        x86_reg uv_off = c->uv_offx2 >> 1;
+        dither_8to16(c, chrDither, 0);
         YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
-        for(i=0; i<8; i++) c->dither16[i] = chrDither[(i+3)&7]>>4;
+        dither_8to16(c, chrDither, 1);
         YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
     }
-    for(i=0; i<8; i++) c->dither16[i] = lumDither[i]>>4;
+    dither_8to16(c, lumDither, 0);
     if (CONFIG_SWSCALE_ALPHA && aDest) {
         YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
     }
@@ -104,10 +140,6 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
         "movq                  "DITHER32"+8(%0), %%mm5      \n\t"\
         "movq                 "DITHER32"+16(%0), %%mm6      \n\t"\
         "movq                 "DITHER32"+24(%0), %%mm7      \n\t"\
-        "pxor                             %%mm4, %%mm4      \n\t"\
-        "pxor                             %%mm5, %%mm5      \n\t"\
-        "pxor                             %%mm6, %%mm6      \n\t"\
-        "pxor                             %%mm7, %%mm7      \n\t"\
         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
         ".p2align                             4             \n\t"\
         "1:                                                 \n\t"\
@@ -157,26 +189,87 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
         : "%"REG_a, "%"REG_d, "%"REG_S\
     );
 
+#if !COMPILE_TEMPLATE_MMX2
+static av_always_inline void
+dither_8to32(SwsContext *c, const uint8_t *srcDither, int rot)
+{
+int i;
+if(rot) for(i=0; i<8; i++) c->dither32[i] = srcDither[(i+3)&7]<<12;
+else    for(i=0; i<8; i++) c->dither32[i] = srcDither[i&7]<<12;
+return;
+
+    if (rot) {
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
+                         "movq       (%0), %%mm4\n\t"
+                         "movq      %%mm4, %%mm5\n\t"
+                         "psrlq       $24, %%mm4\n\t"
+                         "psllq       $40, %%mm5\n\t"
+                         "por       %%mm5, %%mm4\n\t"
+                         "movq      %%mm4, %%mm6\n\t"
+                         "punpcklbw %%mm0, %%mm4\n\t"
+                         "punpckhbw %%mm0, %%mm6\n\t"
+                         "movq      %%mm4, %%mm5\n\t"
+                         "movq      %%mm6, %%mm7\n\t"
+                         "punpcklwd %%mm0, %%mm4\n\t"
+                         "punpckhwd %%mm0, %%mm5\n\t"
+                         "punpcklwd %%mm0, %%mm6\n\t"
+                         "punpckhwd %%mm0, %%mm7\n\t"
+                         "psllw       $12, %%mm4\n\t"
+                         "psllw       $12, %%mm5\n\t"
+                         "psllw       $12, %%mm6\n\t"
+                         "psllw       $12, %%mm7\n\t"
+                         "movq      %%mm4, "DITHER32"+0(%1)\n\t"
+                         "movq      %%mm5, "DITHER32"+8(%1)\n\t"
+                         "movq      %%mm6, "DITHER32"+16(%1)\n\t"
+                         "movq      %%mm7, "DITHER32"+24(%1)\n\t"
+                         :: "r"(srcDither), "r"(&c->redDither)
+                         );
+    } else {
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
+                         "movq       (%0), %%mm4\n\t"
+                         "movq      %%mm4, %%mm6\n\t"
+                         "punpcklbw %%mm0, %%mm4\n\t"
+                         "punpckhbw %%mm0, %%mm6\n\t"
+                         "movq      %%mm4, %%mm5\n\t"
+                         "movq      %%mm6, %%mm7\n\t"
+                         "punpcklwd %%mm0, %%mm4\n\t"
+                         "punpckhwd %%mm0, %%mm5\n\t"
+                         "punpcklwd %%mm0, %%mm6\n\t"
+                         "punpckhwd %%mm0, %%mm7\n\t"
+                         "psllw       $12, %%mm4\n\t"
+                         "psllw       $12, %%mm5\n\t"
+                         "psllw       $12, %%mm6\n\t"
+                         "psllw       $12, %%mm7\n\t"
+                         "movq      %%mm4, "DITHER32"+0(%1)\n\t"
+                         "movq      %%mm5, "DITHER32"+8(%1)\n\t"
+                         "movq      %%mm6, "DITHER32"+16(%1)\n\t"
+                         "movq      %%mm7, "DITHER32"+24(%1)\n\t"
+                         :: "r"(srcDither), "r"(&c->redDither)
+                         );
+    }
+}
+#endif
+
 static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
-                                uint8_t *dest[4], int dstW, int chrDstW,
-                                const uint8_t *lumDither, const uint8_t *chrDither)
+                                uint8_t *dest[4], int dstW, int chrDstW)
 {
     int i;
     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 
     if (uDest) {
-        x86_reg uv_off = c->uv_off;
-        for(i=0; i<8; i++) c->dither32[i] = chrDither[i]<<12;
+        x86_reg uv_off = c->uv_offx2 >> 1;
+        dither_8to32(c, chrDither, 0);
         YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
-        for(i=0; i<8; i++) c->dither32[i] = chrDither[(i+3)&7]<<12;
+        dither_8to32(c, chrDither, 1);
         YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
     }
-    for(i=0; i<8; i++) c->dither32[i] = lumDither[i]<<12;
+    dither_8to32(c, lumDither, 0);
     if (CONFIG_SWSCALE_ALPHA && aDest) {
         YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
     }
@@ -187,8 +280,7 @@ static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
 static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
                              const int16_t *chrUSrc, const int16_t *chrVSrc,
                              const int16_t *alpSrc,
-                             uint8_t *dst[4], int dstW, int chrDstW,
-                             const uint8_t *lumDither, const uint8_t *chrDither)
+                             uint8_t *dst[4], int dstW, int chrDstW)
 {
     int p= 4;
     const int16_t *src[4]= {
@@ -222,8 +314,7 @@ static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
 static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
                                 const int16_t *chrUSrc, const int16_t *chrVSrc,
                                 const int16_t *alpSrc,
-                                uint8_t *dst[4], int dstW, int chrDstW,
-                                const uint8_t *lumDither, const uint8_t *chrDither)
+                                uint8_t *dst[4], int dstW, int chrDstW)
 {
     int p= 4;
     const int16_t *src[4]= {
@@ -231,15 +322,16 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
         chrVSrc + chrDstW, alpSrc + dstW
     };
     x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW };
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 
     while (p--) {
         if (dst[p]) {
             int i;
-            for(i=0; i<8; i++) c->dither16[i] = i<2 ? lumDither[i] : chrDither[i];
+            for(i=0; i<8; i++) c->dither16[i] = (p == 2 || p == 3) ? lumDither[i] : chrDither[i];
             __asm__ volatile(
                 "mov %2, %%"REG_a"                    \n\t"
-                "movq               0(%3), %%mm6      \n\t"
-                "movq               8(%3), %%mm7      \n\t"
+                "movq    "DITHER16"+0(%3), %%mm6      \n\t"
+                "movq    "DITHER16"+8(%3), %%mm7      \n\t"
                 ".p2align                4            \n\t" /* FIXME Unroll? */
                 "1:                                   \n\t"
                 "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"
@@ -253,7 +345,7 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
                 "add                   $8, %%"REG_a"  \n\t"
                 "jnc                   1b             \n\t"
                 :: "r" (src[p]), "r" (dst[p] + counter[p]),
-                   "g" (-counter[p]), "r"(c->dither16)
+                   "g" (-counter[p]), "r"(&c->redDither)
                 : "%"REG_a
             );
         }
@@ -485,7 +577,7 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off << 1;
+    x86_reg uv_off = c->uv_offx2;
 
     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
         YSCALEYUV2PACKEDX_ACCURATE
@@ -518,7 +610,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off << 1;
+    x86_reg uv_off = c->uv_offx2;
 
     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
         YSCALEYUV2PACKEDX
@@ -575,7 +667,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off << 1;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX_ACCURATE
     YSCALEYUV2RGBX
@@ -599,7 +691,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off << 1;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX
     YSCALEYUV2RGBX
@@ -652,7 +744,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off << 1;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX_ACCURATE
     YSCALEYUV2RGBX
@@ -676,7 +768,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off << 1;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX
     YSCALEYUV2RGBX
@@ -809,7 +901,7 @@ static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off << 1;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX_ACCURATE
     YSCALEYUV2RGBX
@@ -833,7 +925,7 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off << 1;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX
     YSCALEYUV2RGBX
@@ -874,7 +966,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off << 1;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX_ACCURATE
     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -895,7 +987,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off << 1;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX
     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -1637,32 +1729,6 @@ static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
     assert(src1 == src2);
 }
 
-static void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV,
-                           const uint8_t *src1, const uint8_t *src2,
-                           int width, uint32_t *unused)
-{
-    __asm__ volatile(
-        "mov                    %0, %%"REG_a"       \n\t"
-        "1:                                         \n\t"
-        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
-        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
-        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
-        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
-        "psrlw                  $8, %%mm0           \n\t"
-        "psrlw                  $8, %%mm1           \n\t"
-        "psrlw                  $8, %%mm2           \n\t"
-        "psrlw                  $8, %%mm3           \n\t"
-        "packuswb            %%mm1, %%mm0           \n\t"
-        "packuswb            %%mm3, %%mm2           \n\t"
-        "movq                %%mm0, (%3, %%"REG_a") \n\t"
-        "movq                %%mm2, (%4, %%"REG_a") \n\t"
-        "add                    $8, %%"REG_a"       \n\t"
-        " js                    1b                  \n\t"
-        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
-        : "%"REG_a
-    );
-}
-
 /* This is almost identical to the previous, end exists only because
  * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
 static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src,
@@ -1712,33 +1778,6 @@ static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
     assert(src1 == src2);
 }
 
-static void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV,
-                           const uint8_t *src1, const uint8_t *src2,
-                           int width, uint32_t *unused)
-{
-    __asm__ volatile(
-        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
-        "mov                    %0, %%"REG_a"       \n\t"
-        "1:                                         \n\t"
-        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
-        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
-        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
-        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
-        "pand                %%mm4, %%mm0           \n\t"
-        "pand                %%mm4, %%mm1           \n\t"
-        "pand                %%mm4, %%mm2           \n\t"
-        "pand                %%mm4, %%mm3           \n\t"
-        "packuswb            %%mm1, %%mm0           \n\t"
-        "packuswb            %%mm3, %%mm2           \n\t"
-        "movq                %%mm0, (%3, %%"REG_a") \n\t"
-        "movq                %%mm2, (%4, %%"REG_a") \n\t"
-        "add                    $8, %%"REG_a"       \n\t"
-        " js                    1b                  \n\t"
-        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
-        : "%"REG_a
-    );
-}
-
 static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                               const uint8_t *src, int width)
 {
@@ -1921,7 +1960,7 @@ static void RENAME(rgb24ToUV)(int16_t *dstU, int16_t *dstV,
 
 #if !COMPILE_TEMPLATE_MMX2
 // bilinear / bicubic scaling
-static void RENAME(hScale)(int16_t *dst, int dstW,
+static void RENAME(hScale)(SwsContext *c, int16_t *dst, int dstW,
                            const uint8_t *src, const int16_t *filter,
                            const int16_t *filterPos, int filterSize)
 {
@@ -2433,6 +2472,7 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
         }
     }
 
+    if (c->scalingBpp == 8) {
 #if !COMPILE_TEMPLATE_MMX2
     c->hScale       = RENAME(hScale      );
 #endif /* !COMPILE_TEMPLATE_MMX2 */
@@ -2450,6 +2490,7 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
 #if COMPILE_TEMPLATE_MMX2
     }
 #endif /* COMPILE_TEMPLATE_MMX2 */
+    }
 
 #if !COMPILE_TEMPLATE_MMX2
     switch(srcFormat) {
@@ -2457,13 +2498,10 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
         case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
         case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
         case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
-        case PIX_FMT_GRAY16LE :
         case PIX_FMT_YUV420P9LE:
         case PIX_FMT_YUV422P10LE:
-        case PIX_FMT_YUV420P10LE:
-        case PIX_FMT_YUV420P16LE:
-        case PIX_FMT_YUV422P16LE:
-        case PIX_FMT_YUV444P16LE: c->hScale16= RENAME(hScale16); break;
+        case PIX_FMT_YUV420P10LE: c->hScale16= RENAME(hScale16); break;
+        default: break;
     }
 #endif /* !COMPILE_TEMPLATE_MMX2 */
     if (!c->chrSrcHSubSample) {
@@ -2477,10 +2515,8 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
     switch (srcFormat) {
 #if !COMPILE_TEMPLATE_MMX2
     case PIX_FMT_YUYV422  :
-    case PIX_FMT_Y400A    :
-                            c->lumToYV12 = RENAME(yuy2ToY); break;
-    case PIX_FMT_UYVY422  :
-                            c->lumToYV12 = RENAME(uyvyToY); break;
+    case PIX_FMT_Y400A    : c->lumToYV12 = RENAME(yuy2ToY); break;
+    case PIX_FMT_UYVY422  : c->lumToYV12 = RENAME(uyvyToY); break;
 #endif /* !COMPILE_TEMPLATE_MMX2 */
     case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
     case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
@@ -2494,6 +2530,8 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
         }
     }
 #endif /* !COMPILE_TEMPLATE_MMX2 */
-    if(isAnyRGB(c->srcFormat))
+    if(isAnyRGB(c->srcFormat) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
         c->hScale16= RENAME(hScale16);
+    if(c->scalingBpp != 8)
+        c->hScale16 = NULL;
 }

+ 2 - 2
tests/ref/lavf/pixfmt

@@ -28,9 +28,9 @@ efa7c0337cc00c796c6df615223716f1 *./tests/data/pixfmt/rgb565.yuv
 304128 ./tests/data/pixfmt/rgb555.yuv
 6be306b0cce5f8e6c271ea17fef9745b *./tests/data/pixfmt/gray.yuv
 304128 ./tests/data/pixfmt/gray.yuv
-31398104d2349dd48328a6862bc6711f *./tests/data/pixfmt/monow.yuv
+6c719671e39f1bcf67b47eab98fa529b *./tests/data/pixfmt/monow.yuv
 304128 ./tests/data/pixfmt/monow.yuv
-31398104d2349dd48328a6862bc6711f *./tests/data/pixfmt/monob.yuv
+6c719671e39f1bcf67b47eab98fa529b *./tests/data/pixfmt/monob.yuv
 304128 ./tests/data/pixfmt/monob.yuv
 00b85790df5740bab95e2559d81603a7 *./tests/data/pixfmt/yuv440p.yuv
 304128 ./tests/data/pixfmt/yuv440p.yuv

+ 8 - 8
tests/ref/lavfi/pixdesc

@@ -1,8 +1,8 @@
 abgr                037bf9df6a765520ad6d490066bf4b89
 argb                c442a8261c2265a07212ef0f72e35f5a
 bgr24               0d0cb38ab3fa0b2ec0865c14f78b217b
-bgr48be             4ba0ff7fc9e011ea264610ad1585bb1f
-bgr48le             d022bfdd6a07d5dcc693799322a386b4
+bgr48be             74dedaaacae8fd1ef46e05f78cf29d62
+bgr48le             0eb7d30801eac6058814bddd330b3c76
 bgr4_byte           50d23cc82d9dcef2fd12adb81fb9b806
 bgr555be            49f01b1f1f0c84fd9e776dd34cc3c280
 bgr555le            378d6ac4223651a1adcbf94a3d0d807b
@@ -18,8 +18,8 @@ monow               9251497f3b0634f1165d12d5a289d943
 nv12                e0af357888584d36eec5aa0f673793ef
 nv21                9a3297f3b34baa038b1f37cb202b512f
 rgb24               b41eba9651e1b5fe386289b506188105
-rgb48be             460b6de89b156290a12d3941db8bd731
-rgb48le             cd93cb34d15996987367dabda3a10128
+rgb48be             e3bc84c9af376fb6d0f0293cc7b713a6
+rgb48le             f51c0e71638a822458329abb2f4052c7
 rgb4_byte           c93ba89b74c504e7f5ae9d9ab1546c73
 rgb555be            912a62c5e53bfcbac2a0340e10973cf2
 rgb555le            a937a0fc764fb57dc1b3af87cba0273c
@@ -40,12 +40,12 @@ yuv420p9le          16543fda8f87d94a6cf857d2e8d4461a
 yuv422p             c9bba4529821d796a6ab09f6a5fd355a
 yuv422p10be         bdc13b630fd668b34c6fe1aae28dfc71
 yuv422p10le         d0607c260a45c973e6639f4e449730ad
-yuv422p16be         5499502e1c29534a158a1fe60e889f60
-yuv422p16le         e3d61fde6978591596bc36b914386623
+yuv422p16be         dc9886f2fccf87cc54b27e071a2c251e
+yuv422p16le         f181c8d8436f1233ba566d9bc88005ec
 yuv440p             5a064afe2b453bb52cdb3f176b1aa1cf
 yuv444p             0a98447b78fd476aa39686da6a74fa2e
-yuv444p16be         ea602a24b8e6969679265078bd8607b6
-yuv444p16le         1262a0dc57ee147967fc896d04206313
+yuv444p16be         af555dbaa401b142a995566864f47545
+yuv444p16le         a803e8016997dad95c5b2a72f54c34d6
 yuva420p            a29884f3f3dfe1e00b961bc17bef3d47
 yuvj420p            32eec78ba51857b16ce9b813a49b7189
 yuvj422p            0dfa0ed434f73be51428758c69e082cb

+ 8 - 8
tests/ref/lavfi/pixfmts_copy

@@ -1,8 +1,8 @@
 abgr                037bf9df6a765520ad6d490066bf4b89
 argb                c442a8261c2265a07212ef0f72e35f5a
 bgr24               0d0cb38ab3fa0b2ec0865c14f78b217b
-bgr48be             4ba0ff7fc9e011ea264610ad1585bb1f
-bgr48le             d022bfdd6a07d5dcc693799322a386b4
+bgr48be             74dedaaacae8fd1ef46e05f78cf29d62
+bgr48le             0eb7d30801eac6058814bddd330b3c76
 bgr4_byte           50d23cc82d9dcef2fd12adb81fb9b806
 bgr555be            49f01b1f1f0c84fd9e776dd34cc3c280
 bgr555le            378d6ac4223651a1adcbf94a3d0d807b
@@ -18,8 +18,8 @@ monow               9251497f3b0634f1165d12d5a289d943
 nv12                e0af357888584d36eec5aa0f673793ef
 nv21                9a3297f3b34baa038b1f37cb202b512f
 rgb24               b41eba9651e1b5fe386289b506188105
-rgb48be             460b6de89b156290a12d3941db8bd731
-rgb48le             cd93cb34d15996987367dabda3a10128
+rgb48be             e3bc84c9af376fb6d0f0293cc7b713a6
+rgb48le             f51c0e71638a822458329abb2f4052c7
 rgb4_byte           c93ba89b74c504e7f5ae9d9ab1546c73
 rgb555be            912a62c5e53bfcbac2a0340e10973cf2
 rgb555le            a937a0fc764fb57dc1b3af87cba0273c
@@ -40,12 +40,12 @@ yuv420p9le          16543fda8f87d94a6cf857d2e8d4461a
 yuv422p             c9bba4529821d796a6ab09f6a5fd355a
 yuv422p10be         bdc13b630fd668b34c6fe1aae28dfc71
 yuv422p10le         d0607c260a45c973e6639f4e449730ad
-yuv422p16be         5499502e1c29534a158a1fe60e889f60
-yuv422p16le         e3d61fde6978591596bc36b914386623
+yuv422p16be         dc9886f2fccf87cc54b27e071a2c251e
+yuv422p16le         f181c8d8436f1233ba566d9bc88005ec
 yuv440p             5a064afe2b453bb52cdb3f176b1aa1cf
 yuv444p             0a98447b78fd476aa39686da6a74fa2e
-yuv444p16be         ea602a24b8e6969679265078bd8607b6
-yuv444p16le         1262a0dc57ee147967fc896d04206313
+yuv444p16be         af555dbaa401b142a995566864f47545
+yuv444p16le         a803e8016997dad95c5b2a72f54c34d6
 yuva420p            a29884f3f3dfe1e00b961bc17bef3d47
 yuvj420p            32eec78ba51857b16ce9b813a49b7189
 yuvj422p            0dfa0ed434f73be51428758c69e082cb

+ 8 - 8
tests/ref/lavfi/pixfmts_crop

@@ -1,8 +1,8 @@
 abgr                cd761690872843d1b7ab0c695393c751
 argb                2ec6ef18769bcd651c2e8904d5a3ee67
 bgr24               3450fd00cf1493d1ded75544d82ba3ec
-bgr48be             90cb5d373a1123432d63c6a10c101afa
-bgr48le             9371f54ceda9010f1199e86f4930ac3f
+bgr48be             a9a7d177cef0914d3f1d266f00dff676
+bgr48le             b475d1b529ed80c728ddbacd22d35281
 bgr4_byte           2f6ac3cdd4676ab4e2982bdf0664945b
 bgr555be            d3a7c273604723adeb7e5f5dd1c4272b
 bgr555le            d22442fc13b464f9ba455b08df4e981f
@@ -14,8 +14,8 @@ gray                8c4850e66562a587a292dc728a65ea4a
 gray16be            daa5a6b98fb4a280c57c57bff1a2ab5a
 gray16le            84f5ea7259073edcb893113b42213c8e
 rgb24               3b90ed64b687d3dc186c6ef521dc71a8
-rgb48be             a808128041a1962deaa8620c7448feba
-rgb48le             ce92d02cc322608d5be377cb1940677b
+rgb48be             b8f9fd6aaa24d75275ee2f8b8a7b9e55
+rgb48le             3e52e831a040f086c3ae983241172cce
 rgb4_byte           6958029f73c6cdfed4f71020d816f027
 rgb555be            41a7d1836837bc90f2cae19a9c9df3b3
 rgb555le            eeb78f8ce6186fba55c941469e60ba67
@@ -29,12 +29,12 @@ yuv420p             bfea0188ddd4889787c403caae119cc7
 yuv420p16be         8365eff38b8c329aeb95fc605fa229bb
 yuv420p16le         5e8dd38d973d5854abe1ad4efad20cc1
 yuv422p             f2f930a91fe00d4252c4720b5ecd8961
-yuv422p16be         167e4338811a7d272925a4c6417d60da
-yuv422p16le         3359395d5875d581fa1e975013d30114
+yuv422p16be         93f9b6f33f9529db6de6a9f0ddd70eb5
+yuv422p16le         2e66dcfec54ca6b57aa4bbd9ac234639
 yuv440p             2472417d980e395ad6843cbb8b633b29
 yuv444p             1f151980486848c96bc5585ced99003e
-yuv444p16be         d69280c2856865d2ea94bd5292aac1c6
-yuv444p16le         33f43e030bedf9723be4f63c3e9fc80e
+yuv444p16be         e7d1ecf0c11a41b5db192f761f55bd3c
+yuv444p16le         3298a0043d982e7cf1a33a1292fa11f0
 yuva420p            7536753dfbc7932560fb50c921369a0e
 yuvj420p            21f891093006d42d7683b0e1d773a657
 yuvj422p            9a43d474c407590ad8f213880586b45e

+ 8 - 8
tests/ref/lavfi/pixfmts_hflip

@@ -1,8 +1,8 @@
 abgr                49468c6c9ceee5d52b08b1270a909323
 argb                50ba9f16c6475530602f2983278b82d0
 bgr24               cc53d2011d097972db0d22756c3699e3
-bgr48be             11641cf0f4516a9aed98f7872720f801
-bgr48le             b5440734eed128554dd9f83b34ba582f
+bgr48be             90374bc92471f1bd4931d71ef8b73f50
+bgr48le             696f628d0dd32121e60a0d61ac47d6e6
 bgr4_byte           aac987e7d1a6a96477cfc0b48a4285de
 bgr555be            bc07265898440116772200390d70c092
 bgr555le            ccee08679bac84a1f960c6c9070c5538
@@ -14,8 +14,8 @@ gray                03efcb4ab52a24c0af0e03cfd26c9377
 gray16be            9bcbca979601ddc4869f846f08f3d1dd
 gray16le            c1b8965adcc7f847ee343149ff507073
 rgb24               754f1722fc738590cc407ac65749bfe8
-rgb48be             10743e1577dc3198dbbc7c0b3b8f429e
-rgb48le             dd945a44f39119221407bf7a04f1bc49
+rgb48be             2397b9d3c296ac15f8a2325a703f81c7
+rgb48le             527043c72546d8b4bb1ce2dea4b294c3
 rgb4_byte           c8a3f995fcf3e0919239ea2c413ddc29
 rgb555be            045ce8607d3910586f4d97481dda8632
 rgb555le            8778ee0cf58ce9ad1d99a1eca9f95e87
@@ -29,12 +29,12 @@ yuv420p             2d5c80f9ba2ddd85b2aeda3564cc7d64
 yuv420p16be         758b0c1e2113b15e7afde48da4e4d024
 yuv420p16le         480ccd951dcb806bc875d307e02e50a0
 yuv422p             6e728f4eb9eae287c224f396d84be6ea
-yuv422p16be         a05d43cd62b790087bd37083174557de
-yuv422p16le         6954abebcbc62d81068d58d0c62bdd5b
+yuv422p16be         8657d2c8d443940300fdb4028d555631
+yuv422p16le         4ab27609981e50de5b1150125718ae76
 yuv440p             a99e2b57ed601f39852715c9d675d0d3
 yuv444p             947e47f7bb5fdccc659d19b7df2b6fc3
-yuv444p16be         e5ef45bc3d2f5b0b2542d5151340c382
-yuv444p16le         70793e3d66d0c23a0cdedabe9c24c2a7
+yuv444p16be         a5154ce329db0d2caf0bd43f1347dba3
+yuv444p16le         1f703308b90feb048191b3bccc695671
 yuva420p            d83ec0c01498189f179ec574918185f1
 yuvj420p            df3aaaec3bb157c3bde5f0365af30f4f
 yuvj422p            d113871528d510a192797af59df9c05c

Some files were not shown because too many files changed in this diff