
avutil/mips: refactor msa load and store macros.

Replace STnxm_UB and LDnxm_SH with the new macros ST_{H/W/D}{1/2/4/8}.
The old macros were difficult to use because they did not follow a consistent
parameter-passing convention. The changes are as follows:
1. remove LD4x4_SH.
2. replace ST2x4_UB with ST_H4.
3. replace ST4x2_UB with ST_W2.
4. replace ST4x4_UB with ST_W4.
5. replace ST4x8_UB with ST_W8.
6. replace ST6x4_UB with ST_W2 and ST_H2 (see the sketch after this list).
7. replace ST8x1_UB with ST_D1.
8. replace ST8x2_UB with ST_D2.
9. replace ST8x4_UB with ST_D4.
10. replace ST8x8_UB with ST_D8.
11. replace ST12x4_UB with ST_D4 and ST_W4.
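
For example, item 6 replaces a single 6-byte-per-row store with a word store
plus a half-word store. A minimal sketch, assuming each row occupies an
8-byte lane of the vector (so word indices 0/2 and half-word indices 2/6
select rows 0 and 1); the exact indices may differ per call site:

    /* old: ST6x4_UB(in0, in1, pdst, stride); */
    ST_W2(in0, 0, 2, pdst, stride);                  /* bytes 0..3, rows 0-1 */
    ST_H2(in0, 2, 6, pdst + 4, stride);              /* bytes 4..5, rows 0-1 */
    ST_W2(in1, 0, 2, pdst + 2 * stride, stride);     /* bytes 0..3, rows 2-3 */
    ST_H2(in1, 2, 6, pdst + 4 + 2 * stride, stride); /* bytes 4..5, rows 2-3 */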

Example of a new macro: ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)
ST_H4 stores four half-word elements of vector 'in' to 'pdst', advancing by
'stride' bytes between consecutive stores.
About the macro name:
1) 'ST' means a store operation.
2) 'H/W/D' is the element type: half-word/word/double-word.
3) The number '1/2/4/8' is how many elements are stored.
About the macro parameters (a sketch of a possible expansion follows the list):
1) 'in0, in1...' are 128-bit vectors.
2) 'idx0, idx1...' are element indices within the vectors.
3) 'pdst' is the destination pointer.
4) 'stride' is the byte stride between consecutive stores.
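
A minimal sketch of one such macro, assuming FFmpeg's existing scalar store
helper SH() and the MSA element-extract intrinsic __msa_copy_u_h(); the real
definitions live in libavutil/mips/generic_macros_msa.h and may differ in
detail (local names like pblk_m are illustrative only):

    #define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)      \
    {                                                            \
        uint16_t out0_m, out1_m, out2_m, out3_m;                 \
        uint8_t *pblk_m = (uint8_t *) (pdst);                    \
                                                                 \
        /* extract four half-word elements by index */           \
        out0_m = __msa_copy_u_h((v8i16) (in), idx0);             \
        out1_m = __msa_copy_u_h((v8i16) (in), idx1);             \
        out2_m = __msa_copy_u_h((v8i16) (in), idx2);             \
        out3_m = __msa_copy_u_h((v8i16) (in), idx3);             \
                                                                 \
        /* store each element to its own row, 'stride' apart */  \
        SH(out0_m, pblk_m);                                      \
        SH(out1_m, pblk_m + stride);                             \
        SH(out2_m, pblk_m + 2 * stride);                         \
        SH(out3_m, pblk_m + 3 * stride);                         \
    }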

Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
Shiyou Yin authored 5 years ago · commit 153c607525

libavcodec/mips/h263dsp_msa.c (+1 -4)

@@ -86,10 +86,7 @@ static void h263_h_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)
     ILVR_B2_SH(in3, in0, in1, in2, temp0, temp1);
     in0 = (v16u8) __msa_ilvr_h(temp1, temp0);
     in3 = (v16u8) __msa_ilvl_h(temp1, temp0);
-    ST4x4_UB(in0, in0, 0, 1, 2, 3, src, stride);
-    src += 4 * stride;
-    ST4x4_UB(in3, in3, 0, 1, 2, 3, src, stride);
-    src += 4 * stride;
+    ST_W8(in0, in3, 0, 1, 2, 3, 0, 1, 2, 3, src, stride);
 }
 
 static void h263_v_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)

libavcodec/mips/h264chroma_msa.c (+44 -43)

@@ -85,7 +85,7 @@ static void avc_chroma_hz_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     res_r = __msa_sat_u_h(res_r, 7);
     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
-    ST2x4_UB(res, 0, dst, stride);
+    ST_H4(res, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -121,7 +121,7 @@ static void avc_chroma_hz_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     res_r = __msa_sat_u_h(res_r, 7);
     res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
-    ST4x2_UB(res, dst, stride);
+    ST_W2(res, 0, 1, dst, stride);
 }
 
 static void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -144,7 +144,7 @@ static void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     SRARI_H2_UH(res0_r, res1_r, 6);
     SAT_UH2_UH(res0_r, res1_r, 7);
     out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+    ST_W4(out, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -168,7 +168,7 @@ static void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     SRARI_H4_UH(res0, res1, res2, res3, 6);
     SAT_UH4_UH(res0, res1, res2, res3, 7);
     PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
-    ST4x8_UB(out0, out1, dst, stride);
+    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_hz_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -204,7 +204,7 @@ static void avc_chroma_hz_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     SRARI_H4_UH(res0, res1, res2, res3, 6);
     SAT_UH4_UH(res0, res1, res2, res3, 7);
     PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
-    ST8x4_UB(out0, out1, dst, stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -237,7 +237,7 @@ static void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     SAT_UH4_UH(res4, res5, res6, res7, 7);
     PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
     PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
-    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
@@ -266,7 +266,7 @@ static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
         SRARI_H4_UH(res0, res1, res2, res3, 6);
         SAT_UH4_UH(res0, res1, res2, res3, 7);
         PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
-        ST8x4_UB(out0, out1, dst, stride);
+        ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
         dst += (4 * stride);
     }
 
@@ -283,7 +283,7 @@ static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
             res0 = __msa_sat_u_h(res0, 7);
             res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
 
-            ST8x1_UB(res0, dst);
+            ST_D1(res0, 0, dst);
             dst += stride;
         }
     }
@@ -359,7 +359,7 @@ static void avc_chroma_vt_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
 
     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
-    ST2x4_UB(res, 0, dst, stride);
+    ST_H4(res, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_vt_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -394,7 +394,7 @@ static void avc_chroma_vt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     res_r = __msa_sat_u_h(res_r, 7);
     res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
-    ST4x2_UB(res, dst, stride);
+    ST_W2(res, 0, 1, dst, stride);
 }
 
 static void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -418,7 +418,7 @@ static void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     SRARI_H2_UH(res0_r, res1_r, 6);
     SAT_UH2_UH(res0_r, res1_r, 7);
     out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+    ST_W4(out, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -446,7 +446,7 @@ static void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     SRARI_H4_UH(res0, res1, res2, res3, 6);
     SAT_UH4_UH(res0, res1, res2, res3, 7);
     PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
-    ST4x8_UB(out0, out1, dst, stride);
+    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_vt_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -480,7 +480,7 @@ static void avc_chroma_vt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     SRARI_H4_UH(res0, res1, res2, res3, 6);
     SAT_UH4_UH(res0, res1, res2, res3, 7);
     PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
-    ST8x4_UB(out0, out1, dst, stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -512,7 +512,7 @@ static void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     SAT_UH4_UH(res0, res1, res2, res3, 7);
     PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
     PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
-    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -592,7 +592,7 @@ static void avc_chroma_hv_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
 
     res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
-    ST2x4_UB(res, 0, dst, stride);
+    ST_H4(res, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_hv_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -634,7 +634,7 @@ static void avc_chroma_hv_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     res_vt0 = __msa_sat_u_h(res_vt0, 7);
     res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
-    ST4x2_UB(res, dst, stride);
+    ST_W2(res, 0, 1, dst, stride);
 }
 
 static void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -666,7 +666,8 @@ static void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     SRARI_H2_UH(res_vt0, res_vt1, 6);
     SAT_UH2_UH(res_vt0, res_vt1, 7);
     PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
-    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, stride);
+    ST_W2(res0, 0, 1, dst, stride);
+    ST_W2(res1, 0, 1, dst + 2 * stride, stride);
 }
 
 static void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -706,7 +707,7 @@ static void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
     SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
     PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
-    ST4x8_UB(res0, res1, dst, stride);
+    ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_hv_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -766,7 +767,7 @@ static void avc_chroma_hv_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
     SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
     PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
-    ST8x4_UB(out0, out1, dst, stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -822,7 +823,7 @@ static void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
     PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
     PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
-    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_chroma_hv_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -918,7 +919,7 @@ static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
     dst0 = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
     dst0 = __msa_aver_u_b(dst0, dst_data);
 
-    ST2x4_UB(dst0, 0, dst, stride);
+    ST_H4(dst0, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
@@ -962,7 +963,7 @@ static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
     res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
     dst_data = __msa_aver_u_b((v16u8) res, dst_data);
 
-    ST4x2_UB(dst_data, dst, stride);
+    ST_W2(dst_data, 0, 1, dst, stride);
 }
 
 static void avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
@@ -991,7 +992,7 @@ static void avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
     SAT_UH2_UH(res0_r, res1_r, 7);
     out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
     out = __msa_aver_u_b(out, dst_data);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+    ST_W4(out, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
@@ -1023,7 +1024,7 @@ static void avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
     SAT_UH4_UH(res0, res1, res2, res3, 7);
     PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
     AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
-    ST4x8_UB(out0, out1, dst, stride);
+    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
@@ -1066,7 +1067,7 @@ static void avc_chroma_hz_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
     SAT_UH4_UH(res0, res1, res2, res3, 7);
     PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
     AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
-    ST8x4_UB(dst0, dst1, dst, stride);
+    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
@@ -1110,7 +1111,7 @@ static void avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
     PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
     AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
     AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
-    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
@@ -1200,7 +1201,7 @@ static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
     res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
 
-    ST2x4_UB(res, 0, dst, stride);
+    ST_H4(res, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
@@ -1243,7 +1244,7 @@ static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
     res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
     res = __msa_aver_u_b(res, dst_data);
 
-    ST4x2_UB(res, dst, stride);
+    ST_W2(res, 0, 1, dst, stride);
 }
 
 static void avc_chroma_vt_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
@@ -1273,7 +1274,7 @@ static void avc_chroma_vt_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
     SAT_UH2_UH(res0_r, res1_r, 7);
     out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
     out = __msa_aver_u_b(out, dst0);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+    ST_W4(out, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_vt_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
@@ -1309,7 +1310,7 @@ static void avc_chroma_vt_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
     SAT_UH4_UH(res0, res1, res2, res3, 7);
     PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
     AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
-    ST4x8_UB(out0, out1, dst, stride);
+    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
@@ -1351,7 +1352,7 @@ static void avc_chroma_vt_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
     SAT_UH4_UH(res0, res1, res2, res3, 7);
     PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
     AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
-    ST8x4_UB(out0, out1, dst, stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_chroma_vt_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
@@ -1394,7 +1395,7 @@ static void avc_chroma_vt_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
     PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
     AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
     AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
-    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
@@ -1492,7 +1493,7 @@ static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
     res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
     dst0 = __msa_aver_u_b((v16u8) res, dst0);
 
-    ST2x4_UB(dst0, 0, dst, stride);
+    ST_H4(dst0, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
@@ -1545,7 +1546,7 @@ static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
     dst0 = (v16u8) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
     dst0 = __msa_aver_u_b(dst0, dst_data);
 
-    ST4x2_UB(dst0, dst, stride);
+    ST_W2(dst0, 0, 1, dst, stride);
 }
 
 static void avc_chroma_hv_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
@@ -1584,7 +1585,7 @@ static void avc_chroma_hv_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
     SAT_UH2_UH(res_vt0, res_vt1, 7);
     out = (v16u8) __msa_pckev_b((v16i8) res_vt1, (v16i8) res_vt0);
     out = __msa_aver_u_b(out, dst_data);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+    ST_W4(out, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_hv_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
@@ -1633,7 +1634,7 @@ static void avc_chroma_hv_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
     SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
     PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
     AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
-    ST4x8_UB(res0, res1, dst, stride);
+    ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
@@ -1701,7 +1702,7 @@ static void avc_chroma_hv_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
     SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
     PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
     AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
-    ST8x4_UB(out0, out1, dst, stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_chroma_hv_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
@@ -1770,7 +1771,7 @@ static void avc_chroma_hv_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
     PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
     AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
     AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
-    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
@@ -1848,21 +1849,21 @@ static void avg_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
         LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
         INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
         AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
-        ST4x8_UB(dst0, dst1, dst, stride);
+        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
     } else if (4 == height) {
         LW4(src, stride, tp0, tp1, tp2, tp3);
         INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
         LW4(dst, stride, tp0, tp1, tp2, tp3);
         INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
         dst0 = __msa_aver_u_b(src0, dst0);
-        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+        ST_W4(dst0, 0, 1, 2, 3, dst, stride);
     } else if (2 == height) {
         LW2(src, stride, tp0, tp1);
         INSERT_W2_UB(tp0, tp1, src0);
         LW2(dst, stride, tp0, tp1);
         INSERT_W2_UB(tp0, tp1, dst0);
         dst0 = __msa_aver_u_b(src0, dst0);
-        ST4x2_UB(dst0, dst, stride);
+        ST_W2(dst0, 0, 1, dst, stride);
     }
 }
 
@@ -1889,7 +1890,7 @@ static void avg_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
         INSERT_D2_UB(tp6, tp7, dst3);
         AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                     dst2, dst3);
-        ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
     } else if (4 == height) {
         LD4(src, stride, tp0, tp1, tp2, tp3);
         INSERT_D2_UB(tp0, tp1, src0);
@@ -1898,7 +1899,7 @@ static void avg_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
         INSERT_D2_UB(tp0, tp1, dst0);
         INSERT_D2_UB(tp2, tp3, dst1);
         AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
-        ST8x4_UB(dst0, dst1, dst, stride);
+        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
     }
 }
 

libavcodec/mips/h264dsp_msa.c (+25 -25)

@@ -45,7 +45,7 @@ static void avc_wgt_4x2_msa(uint8_t *data, int32_t stride,
     tmp0 = __msa_srlr_h(tmp0, denom);
     tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
     src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
-    ST4x2_UB(src0, data, stride);
+    ST_W2(src0, 0, 1, data, stride);
 }
 
 static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
@@ -71,7 +71,7 @@ static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
     tmp1 = __msa_srlr_h(tmp1, denom);
     SAT_UH2_SH(tmp0, tmp1, 7);
     src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
-    ST4x4_UB(src0, src0, 0, 1, 2, 3, data, stride);
+    ST_W4(src0, 0, 1, 2, 3, data, stride);
 }
 
 static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
@@ -102,7 +102,7 @@ static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
     SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
     SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
-    ST4x8_UB(src0, src1, data, stride);
+    ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride);
 }
 
 static void avc_wgt_8x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
@@ -133,7 +133,7 @@ static void avc_wgt_8x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
     SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
     SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
-    ST8x4_UB(src0, src1, data, stride);
+    ST_D4(src0, src1, 0, 1, 0, 1, data, stride);
 }
 
 static void avc_wgt_8x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
@@ -175,7 +175,7 @@ static void avc_wgt_8x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
     SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
     PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
                 src2, src3);
-    ST8x8_UB(src0, src1, src2, src3, data, stride);
+    ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
 }
 
 static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
@@ -218,7 +218,7 @@ static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
         SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
         PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
                     src2, src3);
-        ST8x8_UB(src0, src1, src2, src3, data, stride);
+        ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
         data += 8 * stride;
     }
 }
@@ -253,7 +253,7 @@ static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     tmp0 = __msa_maxi_s_h(tmp0, 0);
     tmp0 = __msa_min_s_h(max255, tmp0);
     dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
-    ST4x2_UB(dst0, dst, stride);
+    ST_W2(dst0, 0, 1, dst, stride);
 }
 
 static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -287,7 +287,7 @@ static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     tmp1 >>= denom;
     CLIP_SH2_0_255(tmp0, tmp1);
     dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
-    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -327,7 +327,7 @@ static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
     CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
-    ST4x8_UB(dst0, dst1, dst, stride);
+    ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -365,7 +365,7 @@ static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
     CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
-    ST8x4_UB(dst0, dst1, dst, stride);
+    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -417,7 +417,7 @@ static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
     PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
-    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -479,7 +479,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
         CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
         PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
                     dst0, dst1, dst2, dst3);
-        ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
         dst += 8 * stride;
     }
 }
@@ -955,18 +955,18 @@ static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
         ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
 
         src = data - 3;
-        ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src, img_width);
-        ST2x4_UB(tmp2, 0, src + 4, img_width);
+        ST_W4(tmp3, 0, 1, 2, 3, src, img_width);
+        ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width);
         src += 4 * img_width;
-        ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src, img_width);
-        ST2x4_UB(tmp2, 4, src + 4, img_width);
+        ST_W4(tmp4, 0, 1, 2, 3, src, img_width);
+        ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width);
         src += 4 * img_width;
 
-        ST4x4_UB(tmp6, tmp6, 0, 1, 2, 3, src, img_width);
-        ST2x4_UB(tmp5, 0, src + 4, img_width);
+        ST_W4(tmp6, 0, 1, 2, 3, src, img_width);
+        ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width);
         src += 4 * img_width;
-        ST4x4_UB(tmp7, tmp7, 0, 1, 2, 3, src, img_width);
-        ST2x4_UB(tmp5, 4, src + 4, img_width);
+        ST_W4(tmp7, 0, 1, 2, 3, src, img_width);
+        ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width);
     }
     }
 }
@@ -1274,9 +1274,9 @@ static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
         tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
 
         data_cb_or_cr -= 1;
-        ST2x4_UB(tmp1, 0, data_cb_or_cr, img_width);
+        ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width);
         data_cb_or_cr += 4 * img_width;
-        ST2x4_UB(tmp1, 4, data_cb_or_cr, img_width);
+        ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width);
     }
 }
 
@@ -2110,9 +2110,9 @@ static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
             tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
             src = data - 1;
-            ST2x4_UB(tmp1, 0, src, img_width);
+            ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
             src += 4 * img_width;
-            ST2x4_UB(tmp1, 4, src, img_width);
+            ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
         }
     }
 }
@@ -2136,7 +2136,7 @@ static void avc_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride,
         }
 
         AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
-        ST2x4_UB(res, 0, (src - 1), stride);
+        ST_H4(res, 0, 1, 2, 3, (src - 1), stride);
         src += (4 * stride);
     }
 }

libavcodec/mips/h264idct_msa.c (+3 -7)

@@ -237,9 +237,7 @@ static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
     CLIP_SH4_0_255(res4, res5, res6, res7);
     PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
                 dst0, dst1, dst2, dst3);
-    ST8x4_UB(dst0, dst1, dst, dst_stride);
-    dst += (4 * dst_stride);
-    ST8x4_UB(dst2, dst3, dst, dst_stride);
+    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
 }
 
 static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
@@ -269,9 +267,7 @@ static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
     CLIP_SH4_0_255(dst4_r, dst5_r, dst6_r, dst7_r);
     PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r,
                 dst0, dst1, dst2, dst3);
-    ST8x4_UB(dst0, dst1, dst, dst_stride);
-    dst += (4 * dst_stride);
-    ST8x4_UB(dst2, dst3, dst, dst_stride);
+    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
 }
 
 void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
@@ -340,7 +336,7 @@ void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
     ADD2(pred_r, input_dc, pred_l, input_dc, pred_r, pred_l);
     CLIP_SH2_0_255(pred_r, pred_l);
     out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
 }
 
 void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,

libavcodec/mips/h264qpel_msa.c (+68 -70)

@@ -149,7 +149,7 @@ static void avc_luma_hv_qrt_4x4_msa(const uint8_t *src_x, const uint8_t *src_y,
 
     SAT_SH2_SH(out0, out1, 7);
     out = PCKEV_XORI128_UB(out0, out1);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+    ST_W4(out, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y,
@@ -220,7 +220,7 @@ static void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y,
     SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
     out0 = PCKEV_XORI128_UB(tmp0, tmp1);
     out1 = PCKEV_XORI128_UB(tmp2, tmp3);
-    ST8x4_UB(out0, out1, dst, stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
     dst += (4 * stride);
 
     LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
@@ -256,8 +256,7 @@ static void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y,
     SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
     out0 = PCKEV_XORI128_UB(tmp0, tmp1);
     out1 = PCKEV_XORI128_UB(tmp2, tmp3);
-    ST8x4_UB(out0, out1, dst, stride);
-    dst += (4 * stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_luma_hv_qrt_16x16_msa(const uint8_t *src_x,
@@ -337,7 +336,7 @@ static void avc_luma_hv_qrt_16x16_msa(const uint8_t *src_x,
             SAT_SH4_SH(out0, out1, out2, out3, 7);
             tmp0 = PCKEV_XORI128_UB(out0, out1);
             tmp1 = PCKEV_XORI128_UB(out2, out3);
-            ST8x4_UB(tmp0, tmp1, dst, stride);
+            ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, stride);
             dst += (4 * stride);
 
             src_vt0 = src_vt4;
@@ -419,7 +418,7 @@ static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x,
     res = PCKEV_XORI128_UB(res0, res1);
     dst0 = __msa_aver_u_b(res, dst0);
 
-    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
@@ -498,7 +497,7 @@ static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
     out0 = PCKEV_XORI128_UB(tmp0, tmp1);
     out1 = PCKEV_XORI128_UB(tmp2, tmp3);
     AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
-    ST8x4_UB(dst0, dst1, dst, stride);
+    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
     dst += (4 * stride);
 
     LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
@@ -539,8 +538,7 @@ static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
     out0 = PCKEV_XORI128_UB(tmp0, tmp1);
     out1 = PCKEV_XORI128_UB(tmp2, tmp3);
     AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
-    ST8x4_UB(dst0, dst1, dst, stride);
-    dst += (4 * stride);
+    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
@@ -627,7 +625,7 @@ static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
             tmp0 = PCKEV_XORI128_UB(out0, out1);
             tmp1 = PCKEV_XORI128_UB(out2, out3);
             AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
-            ST8x4_UB(dst0, dst1, dst, stride);
+            ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
             dst += (4 * stride);
 
             src_vt0 = src_vt4;
@@ -723,7 +721,7 @@ void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
     AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                 dst2, dst3);
 
-    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
@@ -739,7 +737,7 @@ void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
 
     dst0 = __msa_aver_u_b(src0, dst0);
 
-    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
@@ -930,7 +928,7 @@ void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
     tmp2 = __msa_aver_s_b(tmp2, src4);
     tmp3 = __msa_aver_s_b(tmp3, src5);
     XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
-    ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
+    ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
@@ -985,7 +983,7 @@ void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
     tmp2 = __msa_aver_s_b(tmp2, src4);
     tmp3 = __msa_aver_s_b(tmp3, src5);
     XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
-    ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
+    ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
@@ -1016,7 +1014,7 @@ void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
     src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
     res = __msa_aver_s_b(res, src0);
     res = (v16i8) __msa_xori_b((v16u8) res, 128);
-    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+    ST_W4(res, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
@@ -1047,7 +1045,7 @@ void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
     src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
     res = __msa_aver_s_b(res, src0);
     res = (v16i8) __msa_xori_b((v16u8) res, 128);
-    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+    ST_W4(res, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
@@ -1153,7 +1151,7 @@ void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
     out1 = PCKEV_XORI128_UB(res2, res3);
     out2 = PCKEV_XORI128_UB(res4, res5);
     out3 = PCKEV_XORI128_UB(res6, res7);
-    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
@@ -1178,7 +1176,7 @@ void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
     SRARI_H2_SH(res0, res1, 5);
     SAT_SH2_SH(res0, res1, 7);
     out = PCKEV_XORI128_UB(res0, res1);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+    ST_W4(out, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
@@ -1378,7 +1376,7 @@ void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
     out2 = __msa_aver_s_b(out2, tmp2);
     out3 = __msa_aver_s_b(out3, tmp3);
     XORI_B4_128_SB(out0, out1, out2, out3);
-    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
@@ -1431,7 +1429,7 @@ void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
     out2 = __msa_aver_s_b(out2, tmp2);
     out3 = __msa_aver_s_b(out3, tmp3);
     XORI_B4_128_SB(out0, out1, out2, out3);
-    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
@@ -1472,7 +1470,7 @@ void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
     src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
     src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
     out = __msa_aver_u_b(out, (v16u8) src32_r);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+    ST_W4(out, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
@@ -1513,7 +1511,7 @@ void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
     src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
     src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
     out = __msa_aver_u_b(out, (v16u8) src32_r);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+    ST_W4(out, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
@@ -1691,7 +1689,7 @@ void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
 
             out0 = PCKEV_XORI128_UB(dst0, dst1);
             out1 = PCKEV_XORI128_UB(dst2, dst3);
-            ST8x4_UB(out0, out1, dst, stride);
+            ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
             dst += (4 * stride);
 
             hz_out0 = hz_out4;
@@ -1804,7 +1802,7 @@ void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
 
             out0 = PCKEV_XORI128_UB(dst0, dst1);
             out1 = PCKEV_XORI128_UB(dst2, dst3);
-            ST8x4_UB(out0, out1, dst, stride);
+            ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
             dst += (4 * stride);
 
             hz_out0 = hz_out4;
@@ -1905,7 +1903,7 @@ void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
 
     out0 = PCKEV_XORI128_UB(dst0, dst1);
     out1 = PCKEV_XORI128_UB(dst2, dst3);
-    ST8x4_UB(out0, out1, dst, stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
     dst += (4 * stride);
 
     LD_SB4(src, stride, src9, src10, src11, src12);
@@ -1951,7 +1949,7 @@ void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
 
     out0 = PCKEV_XORI128_UB(dst0, dst1);
     out1 = PCKEV_XORI128_UB(dst2, dst3);
-    ST8x4_UB(out0, out1, dst, stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
@@ -2040,7 +2038,7 @@ void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
 
     out0 = PCKEV_XORI128_UB(dst0, dst1);
     out1 = PCKEV_XORI128_UB(dst2, dst3);
-    ST8x4_UB(out0, out1, dst, stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
     dst += (4 * stride);
 
     LD_SB4(src, stride, src9, src10, src11, src12);
@@ -2086,7 +2084,7 @@ void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
 
     out0 = PCKEV_XORI128_UB(dst0, dst1);
     out1 = PCKEV_XORI128_UB(dst2, dst3);
-    ST8x4_UB(out0, out1, dst, stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
@@ -2150,7 +2148,7 @@ void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
     dst1 = __msa_aver_s_h(dst1, hz_out4);
 
     res = PCKEV_XORI128_UB(dst0, dst1);
-    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+    ST_W4(res, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
@@ -2215,7 +2213,7 @@ void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
     dst1 = __msa_aver_s_h(dst1, hz_out1);
 
     res = PCKEV_XORI128_UB(dst0, dst1);
-    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+    ST_W4(res, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
@@ -2332,7 +2330,7 @@ void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
     out1 = PCKEV_XORI128_UB(out2_r, out3_r);
     out2 = PCKEV_XORI128_UB(out4_r, out5_r);
     out3 = PCKEV_XORI128_UB(out6_r, out7_r);
-    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
@@ -2369,7 +2367,7 @@ void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
     SRARI_H2_SH(out10, out32, 5);
     SAT_SH2_SH(out10, out32, 7);
     out = PCKEV_XORI128_UB(out10, out32);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+    ST_W4(out, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
@@ -2601,7 +2599,7 @@ void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
         dst0 = __msa_aver_s_h(dst2, dst0);
         dst1 = __msa_aver_s_h(dst3, dst1);
         out = PCKEV_XORI128_UB(dst0, dst1);
-        ST8x2_UB(out, dst, stride);
+        ST_D2(out, 0, 1, dst, stride);
         dst += (2 * stride);
 
         src0 = src2;
@@ -2677,7 +2675,7 @@ void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
         dst0 = __msa_aver_s_h(dst2, dst0);
         dst1 = __msa_aver_s_h(dst3, dst1);
         out = PCKEV_XORI128_UB(dst0, dst1);
-        ST8x2_UB(out, dst, stride);
+        ST_D2(out, 0, 1, dst, stride);
         dst += (2 * stride);
 
         src0 = src2;
@@ -2777,7 +2775,7 @@ void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
 
     PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
     out = PCKEV_XORI128_UB(dst0, dst2);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+    ST_W4(out, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
@@ -2873,7 +2871,7 @@ void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
 
     PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
     out = PCKEV_XORI128_UB(dst0, dst2);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+    ST_W4(out, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
@@ -2961,7 +2959,7 @@ void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
 
             out0 = PCKEV_XORI128_UB(dst0, dst1);
             out1 = PCKEV_XORI128_UB(dst2, dst3);
-            ST8x4_UB(out0, out1, dst, stride);
+            ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
             dst += (4 * stride);
 
             hz_out0 = hz_out4;
@@ -3049,7 +3047,7 @@ void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
     dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
     out0 = PCKEV_XORI128_UB(dst0, dst1);
     out1 = PCKEV_XORI128_UB(dst2, dst3);
-    ST8x4_UB(out0, out1, dst, stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
     dst += (4 * stride);
 
     LD_SB4(src, stride, src0, src1, src2, src3);
@@ -3086,7 +3084,7 @@ void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
     dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
     out0 = PCKEV_XORI128_UB(dst0, dst1);
     out1 = PCKEV_XORI128_UB(dst2, dst3);
-    ST8x4_UB(out0, out1, dst, stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
@@ -3141,7 +3139,7 @@ void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
                           filt2);
     dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
     res = PCKEV_XORI128_UB(dst0, dst1);
-    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+    ST_W4(res, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
@@ -3350,7 +3348,7 @@ void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
     INSERT_D2_UB(tp2, tp3, dst3);
     AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
     AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
-    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
@@ -3415,7 +3413,7 @@ void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
     INSERT_D2_UB(tp2, tp3, dst3);
     AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
     AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
-    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
@@ -3451,7 +3449,7 @@ void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
     LW4(dst, stride, tp0, tp1, tp2, tp3);
     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
     dst0 = __msa_aver_u_b((v16u8) res, dst0);
-    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
@@ -3487,7 +3485,7 @@ void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
     LW4(dst, stride, tp0, tp1, tp2, tp3);
     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
     dst0 = __msa_aver_u_b((v16u8) res, dst0);
-    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
@@ -3608,7 +3606,7 @@ void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
     INSERT_D2_UB(tp2, tp3, out7);
     AVER_UB2_UB(out0, out2, out1, out3, out0, out1);
     AVER_UB2_UB(out4, out6, out5, out7, out4, out5);
-    ST8x8_UB(out0, out1, out4, out5, dst, stride);
+    ST_D8(out0, out1, out4, out5, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
@@ -3637,7 +3635,7 @@ void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
     LW4(dst, stride, tp0, tp1, tp2, tp3);
     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
     res = __msa_aver_u_b(res, dst0);
-    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+    ST_W4(res, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
@@ -3856,7 +3854,7 @@ void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
     XORI_B4_128_SB(out0, out1, out2, out3);
     AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
                 dst2, dst3);
-    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
@@ -3922,7 +3920,7 @@ void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
     XORI_B4_128_SB(out0, out1, out2, out3);
     AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
                 dst2, dst3);
-    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
@@ -3967,7 +3965,7 @@ void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
     res = PCKEV_XORI128_UB(out10, out32);
     res = __msa_aver_u_b(res, (v16u8) src32_r);
     dst0 = __msa_aver_u_b(res, dst0);
-    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
@@ -4013,7 +4011,7 @@ void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
     src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
     res = __msa_aver_u_b(res, (v16u8) src32_r);
     dst0 = __msa_aver_u_b(res, dst0);
-    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
@@ -4196,7 +4194,7 @@ void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
 
             out0 = PCKEV_XORI128_UB(tmp0, tmp1);
             dst0 = __msa_aver_u_b(out0, dst0);
-            ST8x2_UB(dst0, dst, stride);
+            ST_D2(dst0, 0, 1, dst, stride);
             dst += (2 * stride);
 
             LD_SB2(src, stride, src7, src8);
@@ -4232,7 +4230,7 @@ void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
 
             out1 = PCKEV_XORI128_UB(tmp2, tmp3);
             dst1 = __msa_aver_u_b(out1, dst1);
-            ST8x2_UB(dst1, dst, stride);
+            ST_D2(dst1, 0, 1, dst, stride);
             dst += (2 * stride);
 
             hz_out0 = hz_out4;
@@ -4326,7 +4324,7 @@ void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
             INSERT_D2_UB(tp0, tp1, dst0);
             out0 = PCKEV_XORI128_UB(tmp0, tmp1);
             dst0 = __msa_aver_u_b(out0, dst0);
-            ST8x2_UB(dst0, dst, stride);
+            ST_D2(dst0, 0, 1, dst, stride);
             dst += (2 * stride);
 
             LD_SB2(src, stride, src7, src8);
@@ -4361,7 +4359,7 @@ void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
             INSERT_D2_UB(tp2, tp3, dst1);
             out1 = PCKEV_XORI128_UB(tmp2, tmp3);
             dst1 = __msa_aver_u_b(out1, dst1);
-            ST8x2_UB(dst1, dst, stride);
+            ST_D2(dst1, 0, 1, dst, stride);
             dst += (2 * stride);
 
             hz_out0 = hz_out4;
@@ -4468,7 +4466,7 @@ void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
     out0 = PCKEV_XORI128_UB(tmp0, tmp1);
     out1 = PCKEV_XORI128_UB(tmp2, tmp3);
     AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
-    ST8x4_UB(dst0, dst1, dst, stride);
+    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
     dst += (4 * stride);
 
     LD_SB4(src, stride, src9, src10, src11, src12);
@@ -4519,7 +4517,7 @@ void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
     out0 = PCKEV_XORI128_UB(tmp0, tmp1);
     out1 = PCKEV_XORI128_UB(tmp2, tmp3);
     AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
-    ST8x4_UB(dst0, dst1, dst, stride);
+    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
@@ -4614,7 +4612,7 @@ void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
     out0 = PCKEV_XORI128_UB(tmp0, tmp1);
     out1 = PCKEV_XORI128_UB(tmp2, tmp3);
     AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
-    ST8x4_UB(dst0, dst1, dst, stride);
+    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
     dst += (4 * stride);
 
     LD_SB4(src, stride, src9, src10, src11, src12);
@@ -4665,7 +4663,7 @@ void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
     out0 = PCKEV_XORI128_UB(tmp0, tmp1);
     out1 = PCKEV_XORI128_UB(tmp2, tmp3);
     AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
-    ST8x4_UB(dst0, dst1, dst, stride);
+    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
@@ -4732,7 +4730,7 @@ void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
     INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
     res = PCKEV_XORI128_UB(dst0, dst1);
     res = __msa_aver_u_b(res, out);
-    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+    ST_W4(res, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
@@ -4800,7 +4798,7 @@ void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
     INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
     res = PCKEV_XORI128_UB(dst0, dst1);
     res = __msa_aver_u_b(res, out);
-    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+    ST_W4(res, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
@@ -4936,7 +4934,7 @@ void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
     out3 = PCKEV_XORI128_UB(out6_r, out7_r);
     AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
                 dst2, dst3);
-    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
@@ -4977,7 +4975,7 @@ void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
     res = PCKEV_XORI128_UB(out10, out32);
     dst0 = __msa_aver_u_b(res, dst0);
-    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
@@ -5217,7 +5215,7 @@ void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
         tmp1 = __msa_aver_s_h(tmp3, tmp1);
         out = PCKEV_XORI128_UB(tmp0, tmp1);
         out = __msa_aver_u_b(out, dst0);
-        ST8x2_UB(out, dst, stride);
+        ST_D2(out, 0, 1, dst, stride);
         dst += (2 * stride);
 
         src0 = src2;
@@ -5297,7 +5295,7 @@ void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
         tmp1 = __msa_aver_s_h(tmp3, tmp1);
         out = PCKEV_XORI128_UB(tmp0, tmp1);
         out = __msa_aver_u_b(out, dst0);
-        ST8x2_UB(out, dst, stride);
+        ST_D2(out, 0, 1, dst, stride);
         dst += (2 * stride);
 
         src0 = src2;
@@ -5401,7 +5399,7 @@ void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
     PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
     out = PCKEV_XORI128_UB(dst0, dst2);
     out = __msa_aver_u_b(out, dstv);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+    ST_W4(out, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
@@ -5500,7 +5498,7 @@ void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
     PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
     out = PCKEV_XORI128_UB(dst0, dst2);
     out = __msa_aver_u_b(out, dstv);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+    ST_W4(out, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
@@ -5592,7 +5590,7 @@ void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
             out0 = PCKEV_XORI128_UB(res0, res1);
             out1 = PCKEV_XORI128_UB(res2, res3);
             AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
-            ST8x4_UB(out0, out1, dst, stride);
+            ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
             dst += (4 * stride);
 
             hz_out0 = hz_out4;
@@ -5685,7 +5683,7 @@ void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
     out0 = PCKEV_XORI128_UB(res0, res1);
     out1 = PCKEV_XORI128_UB(res2, res3);
     AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
-    ST8x4_UB(dst0, dst1, dst, stride);
+    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
     dst += (4 * stride);
 
     LD_SB4(src, stride, src0, src1, src2, src3);
@@ -5726,7 +5724,7 @@ void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
     out0 = PCKEV_XORI128_UB(res0, res1);
     out1 = PCKEV_XORI128_UB(res2, res3);
     AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
-    ST8x4_UB(dst0, dst1, dst, stride);
+    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
 }
 
 void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
@@ -5785,5 +5783,5 @@ void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
     res = PCKEV_XORI128_UB(res0, res1);
     res = __msa_aver_u_b(res, dst0);
-    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+    ST_W4(res, 0, 1, 2, 3, dst, stride);
 }

libavcodec/mips/hevc_idct_msa.c (+3 -4)

@@ -727,7 +727,7 @@ static void hevc_addblk_4x4_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
     ADD2(dst_r0, in0, dst_l0, in1, dst_r0, dst_l0);
     CLIP_SH2_0_255(dst_r0, dst_l0);
     dst_vec = (v4i32) __msa_pckev_b((v16i8) dst_l0, (v16i8) dst_r0);
-    ST4x4_UB(dst_vec, dst_vec, 0, 1, 2, 3, dst, stride);
+    ST_W4(dst_vec, 0, 1, 2, 3, dst, stride);
 }
 
 static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
@@ -752,8 +752,7 @@ static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
          dst_r0, dst_l0, dst_r1, dst_l1);
     CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
     PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
-    ST8x4_UB(dst_r0, dst_r1, dst, stride);
-    dst += (4 * stride);
+    ST_D4(dst_r0, dst_r1, 0, 1, 0, 1, dst, stride);
 
     LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
     INSERT_D2_SD(dst0, dst1, dst_vec0);
@@ -764,7 +763,7 @@ static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
          dst_r0, dst_l0, dst_r1, dst_l1);
     CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
     PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
-    ST8x4_UB(dst_r0, dst_r1, dst, stride);
+    ST_D4(dst_r0, dst_r1, 0, 1, 0, 1, dst + 4 * stride, stride);
 }
 
 static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)

libavcodec/mips/hevc_lpf_sao_msa.c (+13 -19)

@@ -199,11 +199,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
             dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
             dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
 
-            ST8x4_UB(dst0, dst1, p2, stride);
-            p2 += (4 * stride);
-            SD(dst_val0, p2);
-            p2 += stride;
-            SD(dst_val1, p2);
+            ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
+            SD(dst_val0, p2 + 4 * stride);
+            SD(dst_val1, p2 + 5 * stride);
             /* strong filter ends */
         } else if (flag0 == flag1) { /* weak only */
             /* weak filter */
@@ -288,7 +286,7 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
             dst1 = __msa_bmz_v(dst1, dst3, (v16u8) cmp3);
 
             p2 += stride;
-            ST8x4_UB(dst0, dst1, p2, stride);
+            ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
             /* weak filter ends */
         } else { /* strong + weak */
             /* strong filter */
@@ -442,11 +440,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
             dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
             dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
 
-            ST8x4_UB(dst0, dst1, p2, stride);
-            p2 += (4 * stride);
-            SD(dst_val0, p2);
-            p2 += stride;
-            SD(dst_val1, p2);
+            ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
+            SD(dst_val0, p2 + 4 * stride);
+            SD(dst_val1, p2 + 5 * stride);
         }
     }
 }
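
The luma loop-filter hunks also trade pointer mutation for addressed stores;
the two forms are equivalent, but the new one leaves p2 unchanged for the
surrounding control flow (equivalence sketch, not code from the commit):

    /* before: p2 += 4 * stride; SD(dst_val0, p2);
     *         p2 += stride;     SD(dst_val1, p2);
     * after:  SD(dst_val0, p2 + 4 * stride);
     *         SD(dst_val1, p2 + 5 * stride);       */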
@@ -976,7 +972,7 @@ static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride,
         temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);
 
         temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
-        ST8x2_UB(temp0, p0_ptr, stride);
+        ST_D2(temp0, 0, 1, p0_ptr, stride);
     }
 }
 
@@ -1037,9 +1033,7 @@ static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride,
         temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);
 
         src += 1;
-        ST2x4_UB(temp0, 0, src, stride);
-        src += (4 * stride);
-        ST2x4_UB(temp0, 4, src, stride);
+        ST_H8(temp0, 0, 1, 2, 3, 4, 5, 6, 7, src, stride);
     }
 }
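
Here the two ST2x4_UB calls wrote half-words 0-3 and then 4-7 of the same
vector, with a manual four-row advance in between; ST_H8 expresses the whole
eight-row column store at once. A scalar sketch, assuming 16-bit lanes
(illustrative only, not the macro body):

    #include <stdint.h>
    #include <string.h>

    static void st_h8_sketch(const uint16_t in[8], uint8_t *pdst, int stride)
    {
        for (int i = 0; i < 8; i++)
            memcpy(pdst + i * stride, &in[i], 2); /* half-word lane i -> row i */
    }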
 
@@ -1087,7 +1081,7 @@ static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
         LD_UB4(src, src_stride, src0, src1, src2, src3);
 
         /* store results */
-        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+        ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 
@@ -1102,7 +1096,7 @@ static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
     dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
 
     /* store results */
-    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
 }
 
 static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
@@ -1153,7 +1147,7 @@ static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
         XORI_B2_128_SB(dst0, dst1);
 
         /* store results */
-        ST8x4_UB(dst0, dst1, dst, dst_stride);
+        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
         dst += dst_stride << 2;
     }
 
@@ -1173,7 +1167,7 @@ static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
     XORI_B2_128_SB(dst0, dst1);
 
     /* store results */
-    ST8x4_UB(dst0, dst1, dst, dst_stride);
+    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
 }
 
 static void hevc_sao_band_filter_16multiple_msa(uint8_t *dst,

+ 74 - 61
libavcodec/mips/hevc_mc_bi_msa.c

@@ -86,7 +86,7 @@ static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
         dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
 
         dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
-        ST4x2_UB(dst0, dst, dst_stride);
+        ST_W2(dst0, 0, 1, dst, dst_stride);
     } else if (4 == height) {
         LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
         INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
@@ -97,7 +97,7 @@ static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
         SLLI_2V(dst0, dst1, 6);
         HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
         dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
-        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+        ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
     } else if (0 == height % 8) {
         for (loop_cnt = (height >> 3); loop_cnt--;) {
             LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
@@ -120,7 +120,7 @@ static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
             HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
                                        dst3, 7, dst0, dst1, dst2, dst3);
             PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
-            ST4x8_UB(dst0, dst1, dst, dst_stride);
+            ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
             dst += (8 * dst_stride);
         }
     }
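
When eight 4-byte rows come from two vectors, the old pair of ST4x4_UB calls
plus the intervening dst bump collapse into one ST_W8: word lanes 0-3 of the
first vector fill rows 0-3, and word lanes 0-3 of the second fill rows 4-7.
A hedged scalar sketch (st_w8_sketch is illustrative, not the MSA macro):

    #include <stdint.h>
    #include <string.h>

    static void st_w8_sketch(const uint32_t in0[4], const uint32_t in1[4],
                             uint8_t *pdst, int stride)
    {
        for (int i = 0; i < 4; i++) {
            memcpy(pdst + i * stride,       &in0[i], 4); /* rows 0-3 from in0 */
            memcpy(pdst + (i + 4) * stride, &in1[i], 4); /* rows 4-7 from in1 */
        }
    }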
@@ -165,9 +165,15 @@ static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr,
                                    7, dst4, dst5, dst6, dst7);
         PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
         PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
-        ST6x4_UB(out0, out1, dst, dst_stride);
+        ST_W2(out0, 0, 2, dst, dst_stride);
+        ST_H2(out0, 2, 6, dst + 4, dst_stride);
+        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
+        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
         dst += (4 * dst_stride);
-        ST6x4_UB(out2, out3, dst, dst_stride);
+        ST_W2(out2, 0, 2, dst, dst_stride);
+        ST_H2(out2, 2, 6, dst + 4, dst_stride);
+        ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
+        ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
         dst += (4 * dst_stride);
     }
 }
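
The 6-wide case no longer has a dedicated macro. Each packed vector holds two
8-byte rows (bytes 0-7 and 8-15), so word lane 0 plus half-word lane 2 cover
the six bytes of the even row, and word lane 2 plus half-word lane 6 cover the
odd row: hence the (0, 2) and (2, 6) index pairs above. A scalar sketch of one
such two-row store (illustrative only):

    #include <stdint.h>
    #include <string.h>

    /* Stands in for ST_W2(in, 0, 2, dst, stride) followed by
     * ST_H2(in, 2, 6, dst + 4, stride). */
    static void st_6x2_sketch(const uint8_t in[16], uint8_t *dst, int stride)
    {
        for (int row = 0; row < 2; row++) {
            memcpy(dst + row * stride,     in + 8 * row,     4); /* bytes 0-3 */
            memcpy(dst + row * stride + 4, in + 8 * row + 4, 2); /* bytes 4-5 */
        }
    }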
@@ -195,7 +201,7 @@ static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
         SLLI_2V(dst0, dst1, 6);
         HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
         out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
-        ST8x2_UB(out0, dst, dst_stride);
+        ST_D2(out0, 0, 1, dst, dst_stride);
     } else if (4 == height) {
         LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
         INSERT_D2_SB(tp0, tp1, src0);
@@ -207,7 +213,7 @@ static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
         HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                    7, dst0, dst1, dst2, dst3);
         PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
-        ST8x4_UB(out0, out1, dst, dst_stride);
+        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
     } else if (6 == height) {
         LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
         src0_ptr += 4 * src_stride;
@@ -225,9 +231,8 @@ static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
                                    7, dst0, dst1, dst2, dst3);
         HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
-        ST8x4_UB(out0, out1, dst, dst_stride);
-        dst += (4 * dst_stride);
-        ST8x2_UB(out2, dst, dst_stride);
+        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
     } else if (0 == height % 8) {
         uint32_t loop_cnt;
 
@@ -255,7 +260,7 @@ static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
                                        dst7, 7, dst4, dst5, dst6, dst7);
             PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
             PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
-            ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
+            ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
             dst += (8 * dst_stride);
         }
     }
@@ -294,7 +299,8 @@ static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr,
                                    7, dst0, dst1, dst2, dst3);
         HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
-        ST12x4_UB(out0, out1, out2, dst, dst_stride);
+        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
         dst += (4 * dst_stride);
     }
 }
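
The 12-wide store splits by element type in the same spirit: ST_D4 writes
bytes 0-7 of four rows from the two low vectors, and ST_W4 writes bytes 8-11
from the third vector at dst + 8. A scalar sketch (lo0/lo1/hi are illustrative
stand-ins for out0/out1/out2, not the macros' MSA definitions):

    #include <stdint.h>
    #include <string.h>

    static void st_12x4_sketch(const uint64_t lo0[2], const uint64_t lo1[2],
                               const uint32_t hi[4], uint8_t *dst, int stride)
    {
        const uint64_t *const lo[2] = { lo0, lo1 };
        for (int row = 0; row < 4; row++) {
            memcpy(dst + row * stride,     &lo[row / 2][row % 2], 8); /* bytes 0-7  */
            memcpy(dst + row * stride + 8, &hi[row],              4); /* bytes 8-11 */
        }
    }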
@@ -378,7 +384,7 @@ static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr,
         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
         PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
         ST_UB4(out0, out1, out3, out4, dst, dst_stride);
-        ST8x4_UB(out2, out5, dst + 16, dst_stride);
+        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -588,7 +594,7 @@ static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
 
         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
-        ST4x8_UB(dst0, dst1, dst, dst_stride);
+        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
     }
 }
@@ -656,7 +662,7 @@ static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr,
                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
 
         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
-        ST8x4_UB(dst0, dst1, dst, dst_stride);
+        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -1242,7 +1248,7 @@ static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr,
                           dst10, dst32, dst54, dst76);
 
         PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
-        ST4x8_UB(dst10, dst54, dst, dst_stride);
+        ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
 
         src2110 = src10998;
@@ -1316,7 +1322,7 @@ static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr,
                           dst0_r, dst1_r, dst2_r, dst3_r);
 
         PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
-        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
         dst += (4 * dst_stride);
 
         src10_r = src54_r;
@@ -1420,7 +1426,8 @@ static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr,
 
         PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
         dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
-        ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
+        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
+        ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
         dst += (4 * dst_stride);
 
         src10_r = src54_r;
@@ -1721,7 +1728,7 @@ static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr,
         SRARI_H2_SH(out0, out1, 7);
         CLIP_SH2_0_255_MAX_SATU(out0, out1);
         out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
-        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
         dst += (4 * dst_stride);
 
         dst10 = dst54;
@@ -1849,7 +1856,7 @@ static void hevc_hv_bi_8t_8multx1mult_msa(uint8_t *src0_ptr,
             tmp = __msa_srari_h(tmp, 7);
             tmp = CLIP_SH_0_255_MAX_SATU(tmp);
             out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
-            ST8x1_UB(out, dst_tmp);
+            ST_D1(out, 0, dst_tmp);
             dst_tmp += dst_stride;
 
             dst0 = dst1;
@@ -1995,7 +2002,7 @@ static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
         tmp = __msa_srari_h(tmp, 7);
         tmp = CLIP_SH_0_255_MAX_SATU(tmp);
         out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
-        ST8x1_UB(out, dst_tmp);
+        ST_D1(out, 0, dst_tmp);
         dst_tmp += dst_stride;
 
         dst0 = dst1;
@@ -2083,7 +2090,7 @@ static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
         SRARI_H2_SH(out0, out1, 7);
         CLIP_SH2_0_255_MAX_SATU(out0, out1);
         out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
-        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
         dst += (4 * dst_stride);
 
         dst10 = dst54;
@@ -2211,7 +2218,7 @@ static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr,
     tmp0 = CLIP_SH_0_255(tmp0);
     dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
 
-    ST4x2_UB(dst0, dst, dst_stride);
+    ST_W2(dst0, 0, 1, dst, dst_stride);
 }
 
 static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
@@ -2257,7 +2264,7 @@ static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
     HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
     dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
 
-    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
 }
 
 static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
@@ -2318,7 +2325,7 @@ static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
                           tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
 
         PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
-        ST4x8_UB(dst0, dst1, dst, dst_stride);
+        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
     }
 }
@@ -2398,7 +2405,10 @@ static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr,
                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
 
         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
-        ST6x4_UB(dst0, dst1, dst, dst_stride);
+        ST_W2(dst0, 0, 2, dst, dst_stride);
+        ST_H2(dst0, 2, 6, dst + 4, dst_stride);
+        ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
+        ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -2443,7 +2453,7 @@ static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr,
     HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
 
     dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
-    ST8x2_UB(dst0, dst, dst_stride);
+    ST_D2(dst0, 0, 1, dst, dst_stride);
 }
 
 static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr,
@@ -2506,9 +2516,8 @@ static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr,
 
     PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
     dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
-    ST8x4_UB(dst0, dst1, dst, dst_stride);
-    dst += (4 * dst_stride);
-    ST8x2_UB(dst2, dst, dst_stride);
+    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+    ST_D2(dst2, 0, 1, dst + 4 * dst_stride, dst_stride);
 }
 
 static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
@@ -2564,7 +2573,7 @@ static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
 
         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
-        ST8x4_UB(dst0, dst1, dst, dst_stride);
+        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -2659,7 +2668,8 @@ static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr,
 
         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
         dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
-        ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);
+        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+        ST_W4(dst2, 0, 1, 2, 3, dst + 8, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -2825,7 +2835,7 @@ static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr,
                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
 
         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
-        ST8x4_UB(dst0, dst1, dst_tmp, dst_stride);
+        ST_D4(dst0, dst1, 0, 1, 0, 1, dst_tmp, dst_stride);
         dst_tmp += (4 * dst_stride);
     }
 }
@@ -2936,7 +2946,7 @@ static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,
     dst10 = CLIP_SH_0_255(dst10);
 
     dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
-    ST4x2_UB(dst10, dst, dst_stride);
+    ST_W2(dst10, 0, 1, dst, dst_stride);
 }
 
 static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
@@ -2985,7 +2995,7 @@ static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
     HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);
 
     dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
-    ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
 }
 
 static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
@@ -3056,7 +3066,7 @@ static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
                           dst10, dst32, dst54, dst76);
 
         PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
-        ST4x8_UB(dst10, dst54, dst, dst_stride);
+        ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
     }
 }
@@ -3147,7 +3157,10 @@ static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,
                       dst0_r, dst1_r, dst2_r, dst3_r);
 
     PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
-    ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    ST_W2(dst0_r, 0, 2, dst, dst_stride);
+    ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
+    ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
+    ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
     dst += (4 * dst_stride);
 
     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
@@ -3171,7 +3184,10 @@ static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,
                       dst0_r, dst1_r, dst2_r, dst3_r);
 
     PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
-    ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    ST_W2(dst0_r, 0, 2, dst, dst_stride);
+    ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
+    ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
+    ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
     dst += (4 * dst_stride);
 }
 
@@ -3216,7 +3232,7 @@ static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr,
     HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
     dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
 
-    ST8x2_UB(dst0_r, dst, dst_stride);
+    ST_D2(dst0_r, 0, 1, dst, dst_stride);
 }
 
 static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,
@@ -3275,9 +3291,8 @@ static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,
 
     PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
     dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
-    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
-    dst += (4 * dst_stride);
-    ST8x2_UB(dst2_r, dst, dst_stride);
+    ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
+    ST_D2(dst2_r, 0, 1, dst + 4 * dst_stride, dst_stride);
 }
 
 static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
@@ -3337,7 +3352,7 @@ static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
                           dst0_r, dst1_r, dst2_r, dst3_r);
 
         PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
-        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -3436,7 +3451,8 @@ static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr,
 
         PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
         dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
-        ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
+        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
+        ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
         dst += (4 * dst_stride);
 
         src2 = src6;
@@ -3610,7 +3626,7 @@ static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr,
         PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
         dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
         ST_SH2(dst0_r, dst1_r, dst, dst_stride);
-        ST8x2_UB(dst2_r, dst + 16, dst_stride);
+        ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
         dst += (2 * dst_stride);
 
         /* 16width */
@@ -3650,7 +3666,7 @@ static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr,
         PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
         dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
         ST_SH2(dst0_r, dst1_r, dst, dst_stride);
-        ST8x2_UB(dst2_r, dst + 16, dst_stride);
+        ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
         dst += (2 * dst_stride);
     }
 }
@@ -3829,7 +3845,7 @@ static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
     tmp = __msa_srari_h(tmp, 7);
     tmp = CLIP_SH_0_255_MAX_SATU(tmp);
     out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
-    ST4x2_UB(out, dst, dst_stride);
+    ST_W2(out, 0, 1, dst, dst_stride);
 }
 
 static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
@@ -3905,7 +3921,7 @@ static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
     SRARI_H2_SH(tmp0, tmp1, 7);
     CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
 }
 
 static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
@@ -4018,7 +4034,7 @@ static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
         CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-        ST4x8_UB(out0, out1, dst, dst_stride);
+        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
 
         dst10_r = dst98_r;
@@ -4186,7 +4202,7 @@ static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
     SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
     CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-    ST4x8_UB(out0, out1, dst, dst_stride);
+    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
 
     LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
     src1_ptr += (4 * src2_stride);
@@ -4198,9 +4214,7 @@ static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
     SRARI_H2_SH(tmp4, tmp5, 7);
     CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
     out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
-    ST2x4_UB(out2, 0, dst + 4, dst_stride);
-    dst += 4 * dst_stride;
-    ST2x4_UB(out2, 4, dst + 4, dst_stride);
+    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
 }
 
 static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
@@ -4274,7 +4288,7 @@ static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
     SRARI_H2_SH(tmp0, tmp1, 7);
     CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
-    ST8x2_UB(out, dst, dst_stride);
+    ST_D2(out, 0, 1, dst, dst_stride);
 }
 
 static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr,
@@ -4368,7 +4382,7 @@ static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr,
         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
         CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-        ST8x4_UB(out0, out1, dst, dst_stride);
+        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
         dst += 8;
     }
 }
@@ -4485,9 +4499,8 @@ static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr,
     CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
     out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
-    ST8x4_UB(out0, out1, dst, dst_stride);
-    dst += (4 * dst_stride);
-    ST8x2_UB(out2, dst, dst_stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
 }
 
 static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
@@ -4599,7 +4612,7 @@ static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
             SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
             CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
             PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-            ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
             dst_tmp += (4 * dst_stride);
 
             dst10_r = dst54_r;
@@ -4749,7 +4762,7 @@ static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
         CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-        ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
         dst_tmp += (4 * dst_stride);
 
         dst10_r = dst54_r;
@@ -4835,7 +4848,7 @@ static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
         CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-        ST4x8_UB(out0, out1, dst, dst_stride);
+        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
 
         dst10_r = dst98_r;

+ 67 - 61
libavcodec/mips/hevc_mc_biw_msa.c

@@ -126,7 +126,7 @@ static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
         dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
         dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
         out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
-        ST4x2_UB(out0, dst, dst_stride);
+        ST_W2(out0, 0, 1, dst, dst_stride);
     } else if (4 == height) {
         LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
         INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
@@ -138,7 +138,7 @@ static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                                     offset_vec, dst0, dst1);
         out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
-        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
+        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
     } else if (0 == height % 8) {
         for (loop_cnt = (height >> 3); loop_cnt--;) {
             LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
@@ -162,7 +162,7 @@ static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
                                         in3, weight_vec, rnd_vec, offset_vec,
                                         dst0, dst1, dst2, dst3);
             PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
-            ST4x8_UB(out0, out1, dst, dst_stride);
+            ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
             dst += (8 * dst_stride);
         }
     }
@@ -214,7 +214,10 @@ static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr,
                                     weight_vec, rnd_vec, offset_vec,
                                     dst0, dst1, dst2, dst3);
         PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
-        ST6x4_UB(out0, out1, dst, dst_stride);
+        ST_W2(out0, 0, 2, dst, dst_stride);
+        ST_H2(out0, 2, 6, dst + 4, dst_stride);
+        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
+        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -261,7 +264,7 @@ static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
                            dst0, dst1);
 
         out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
-        ST8x2_UB(out0, dst, dst_stride);
+        ST_D2(out0, 0, 1, dst, dst_stride);
     } else if (6 == height) {
         LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
         src0_ptr += 4 * src_stride;
@@ -281,9 +284,8 @@ static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                     offset_vec, dst4, dst5);
         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
-        ST8x4_UB(out0, out1, dst, dst_stride);
-        dst += (4 * dst_stride);
-        ST8x2_UB(out2, dst, dst_stride);
+        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
     } else if (0 == height % 4) {
         uint32_t loop_cnt;
 
@@ -302,7 +304,7 @@ static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
                                         in3, weight_vec, rnd_vec, offset_vec,
                                         dst0, dst1, dst2, dst3);
             PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
-            ST8x4_UB(out0, out1, dst, dst_stride);
+            ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
             dst += (4 * dst_stride);
         }
     }
@@ -361,7 +363,8 @@ static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                     offset_vec, dst4, dst5);
         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
-        ST12x4_UB(out0, out1, out2, dst, dst_stride);
+        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -480,7 +483,7 @@ static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr,
         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
         PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
         ST_UB4(out0, out1, out3, out4, dst, dst_stride);
-        ST8x4_UB(out2, out5, dst + 16, dst_stride);
+        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -720,7 +723,7 @@ static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr,
                            out0, out1);
 
         out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
-        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
+        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -800,7 +803,7 @@ static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr,
                            out0, out1, out2, out3);
 
         PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
-        ST8x4_UB(out0, out1, dst, dst_stride);
+        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -876,7 +879,7 @@ static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
                            weight_vec, rnd_vec, offset_vec, out0, out1, out2,
                            out3);
         PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
-        ST8x4_UB(out0, out1, dst, dst_stride);
+        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
 
         LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3);
         src0_ptr += (4 * src_stride);
@@ -895,7 +898,7 @@ static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                            offset_vec, out0, out1);
         out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
-        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
+        ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -1483,7 +1486,7 @@ static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,
                            out0, out1, out2, out3);
 
         PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
-        ST4x8_UB(out0, out1, dst, dst_stride);
+        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
 
         src2110 = src10998;
@@ -1568,7 +1571,7 @@ static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,
                            out0, out1, out2, out3);
 
         PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
-        ST8x4_UB(out0, out1, dst, dst_stride);
+        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
         dst += (4 * dst_stride);
 
         src10_r = src54_r;
@@ -1674,8 +1677,8 @@ static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
         dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
         out2 = CLIP_SH_0_255(dst2_r);
         PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
-        ST8x2_UB(out0, dst, dst_stride);
-        ST4x2_UB(out2, dst + 8, dst_stride);
+        ST_D2(out0, 0, 1, dst, dst_stride);
+        ST_W2(out2, 0, 1, dst + 8, dst_stride);
         dst += (2 * dst_stride);
 
         src10_r = src32_r;
@@ -2048,7 +2051,7 @@ static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
         CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
         PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
         out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
-        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
         dst += (4 * dst_stride);
 
         dst10 = dst54;
@@ -2226,7 +2229,7 @@ static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr,
             CLIP_SW4_0_255_MAX_SATU(dst0_l, dst0_r, dst1_l, dst1_r);
             PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
             out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
-            ST8x2_UB(out, dst_tmp, dst_stride);
+            ST_D2(out, 0, 1, dst_tmp, dst_stride);
             dst_tmp += (2 * dst_stride);
 
             dst0 = dst2;
@@ -2412,7 +2415,7 @@ static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
         CLIP_SW4_0_255_MAX_SATU(dst1, dst0, dst3, dst2);
         PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
         out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
-        ST8x2_UB(out, dst_tmp, dst_stride);
+        ST_D2(out, 0, 1, dst_tmp, dst_stride);
         dst_tmp += (2 * dst_stride);
 
         dsth0 = dsth2;
@@ -2503,7 +2506,7 @@ static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
         CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
         PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
         out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
-        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
         dst += (4 * dst_stride);
 
         dst10 = dst54;
@@ -2683,7 +2686,7 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
     dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
     out0 = CLIP_SH_0_255(dst0_r);
     out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
-    ST4x2_UB(out0, dst, dst_stride);
+    ST_W2(out0, 0, 1, dst, dst_stride);
 }
 
 static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
@@ -2743,7 +2746,7 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
                        dst0, dst1);
 
     dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
-    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
 }
 
 static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
@@ -2816,7 +2819,7 @@ static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
                            dst0, dst1, dst2, dst3);
 
         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
-        ST4x8_UB(dst0, dst1, dst, dst_stride);
+        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
     }
 }
@@ -2918,7 +2921,10 @@ static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr,
                            dst0, dst1, dst2, dst3);
 
         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
-        ST6x4_UB(dst0, dst1, dst, dst_stride);
+        ST_W2(dst0, 0, 2, dst, dst_stride);
+        ST_H2(dst0, 2, 6, dst + 4, dst_stride);
+        ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
+        ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -2976,7 +2982,7 @@ static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
                        dst0, dst1);
 
     dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
-    ST8x2_UB(dst0, dst, dst_stride);
+    ST_D2(dst0, 0, 1, dst, dst_stride);
 }
 
 static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
@@ -3049,9 +3055,8 @@ static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
 
     PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
     dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
-    ST8x4_UB(dst0, dst1, dst, dst_stride);
-    dst += (4 * dst_stride);
-    ST8x2_UB(dst3, dst, dst_stride);
+    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+    ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride);
 }
 
 static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
@@ -3119,7 +3124,7 @@ static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
                            dst0, dst1, dst2, dst3);
 
         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
-        ST8x4_UB(dst0, dst1, dst, dst_stride);
+        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -3235,7 +3240,8 @@ static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr,
 
         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
         dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
-        ST12x4_UB(dst0, dst1, dst3, dst, dst_stride);
+        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+        ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -3411,7 +3417,7 @@ static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
                            dst0, dst1);
 
         dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
-        ST8x2_UB(dst0, (dst + 16), dst_stride);
+        ST_D2(dst0, 0, 1, (dst + 16), dst_stride);
         dst += (2 * dst_stride);
     }
 }
@@ -3551,7 +3557,7 @@ static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
     dst10_r = (v4i32) __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
     out = CLIP_SH_0_255(dst10_r);
     out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
-    ST4x2_UB(out, dst, dst_stride);
+    ST_W2(out, 0, 1, dst, dst_stride);
 }
 
 static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
@@ -3617,7 +3623,7 @@ static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
                        dst10, dst32);
 
     dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
-    ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
     dst += (4 * dst_stride);
 }
 
@@ -3702,7 +3708,7 @@ static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
                            dst10, dst32, dst54, dst76);
 
         PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
-        ST4x8_UB(dst10, dst32, dst, dst_stride);
+        ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
     }
 }
@@ -3807,7 +3813,10 @@ static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr,
                            tmp0, tmp1, tmp2, tmp3);
 
         PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-        ST6x4_UB(tmp0, tmp1, dst, dst_stride);
+        ST_W2(tmp0, 0, 2, dst, dst_stride);
+        ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
+        ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
+        ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -3866,7 +3875,7 @@ static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
                        tmp0, tmp1);
 
     tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
-    ST8x2_UB(tmp0, dst, dst_stride);
+    ST_D2(tmp0, 0, 1, dst, dst_stride);
 }
 
 static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
@@ -3936,9 +3945,8 @@ static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
 
     PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
     tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
-    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
-    dst += (4 * dst_stride);
-    ST8x2_UB(tmp3, dst, dst_stride);
+    ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
+    ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride);
 }
 
 static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
@@ -4010,7 +4018,7 @@ static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
                            tmp0, tmp1, tmp2, tmp3);
 
         PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -4132,7 +4140,8 @@ static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr,
 
         PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
         tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
-        ST12x4_UB(tmp0, tmp1, tmp2, dst, dst_stride);
+        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
+        ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -4323,7 +4332,7 @@ static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
         /* 8width */
         tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
         ST_SH2(tmp0, tmp1, dst, dst_stride);
-        ST8x2_UB(tmp2, dst + 16, dst_stride);
+        ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
         dst += (2 * dst_stride);
 
         /* 16width */
@@ -4363,7 +4372,7 @@ static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
         /* 8width */
         tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
         ST_SH2(tmp0, tmp1, dst, dst_stride);
-        ST8x2_UB(tmp2, dst + 16, dst_stride);
+        ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
         dst += (2 * dst_stride);
     }
 }
@@ -4568,7 +4577,7 @@ static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
     tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
     tmp = CLIP_SH_0_255_MAX_SATU(tmp);
     out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
-    ST4x2_UB(out, dst, dst_stride);
+    ST_W2(out, 0, 1, dst, dst_stride);
 }
 
 static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
@@ -4665,7 +4674,7 @@ static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
     PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
     CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
 }
 
 static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
@@ -4803,7 +4812,7 @@ static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
                     tmp2, tmp3);
         CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-        ST4x8_UB(out0, out1, dst, dst_stride);
+        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
 
         dst10_r = dst98_r;
@@ -5001,7 +5010,7 @@ static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
                 tmp2, tmp3);
     CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-    ST4x8_UB(out0, out1, dst, dst_stride);
+    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
 
     PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
 
@@ -5023,9 +5032,7 @@ static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
 
     CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
     out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
-    ST2x4_UB(out2, 0, dst + 4, dst_stride);
-    dst += 4 * dst_stride;
-    ST2x4_UB(out2, 4, dst + 4, dst_stride);
+    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
 }
 
 static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
@@ -5121,7 +5128,7 @@ static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
     PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
     CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
-    ST8x2_UB(out, dst, dst_stride);
+    ST_D2(out, 0, 1, dst, dst_stride);
 }
 
 static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
@@ -5243,7 +5250,7 @@ static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
                     tmp0, tmp1, tmp2, tmp3);
         CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-        ST8x4_UB(out0, out1, dst, dst_stride);
+        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
         dst += 8;
     }
 }
@@ -5394,9 +5401,8 @@ static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
     PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
     CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
     out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
-    ST8x4_UB(out0, out1, dst, dst_stride);
-    dst += (4 * dst_stride);
-    ST8x2_UB(out2, dst, dst_stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
 }
 
 static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
@@ -5533,7 +5539,7 @@ static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
                         tmp0, tmp1, tmp2, tmp3);
             CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
             PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-            ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
             dst_tmp += (4 * dst_stride);
 
             dst10_r = dst54_r;
@@ -5720,7 +5726,7 @@ static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
                     tmp0, tmp1, tmp2, tmp3);
         CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-        ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
         dst_tmp += (4 * dst_stride);
 
         dst10_r = dst54_r;
@@ -5816,7 +5822,7 @@ static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
                     tmp0, tmp1, tmp2, tmp3);
         CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-        ST4x8_UB(out0, out1, dst, dst_stride);
+        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
 
         dst10_r = dst98_r;

+ 71 - 75
libavcodec/mips/hevc_mc_uni_msa.c

@@ -309,7 +309,7 @@ static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
     SRARI_H2_SH(out0, out1, 6);
     SAT_SH2_SH(out0, out1, 7);
     out = PCKEV_XORI128_UB(out0, out1);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
 }
 
 static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
@@ -344,10 +344,9 @@ static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
     SRARI_H4_SH(out0, out1, out2, out3, 6);
     SAT_SH4_SH(out0, out1, out2, out3, 7);
     out = PCKEV_XORI128_UB(out0, out1);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-    dst += (4 * dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
     out = PCKEV_XORI128_UB(out2, out3);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
 }
 
 static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
@@ -382,11 +381,10 @@ static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
     SRARI_H4_SH(out0, out1, out2, out3, 6);
     SAT_SH4_SH(out0, out1, out2, out3, 7);
     out = PCKEV_XORI128_UB(out0, out1);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-    dst += (4 * dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
     out = PCKEV_XORI128_UB(out2, out3);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-    dst += (4 * dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+    dst += (8 * dst_stride);
 
     LD_SB4(src, src_stride, src0, src1, src2, src3);
     XORI_B4_128_SB(src0, src1, src2, src3);
@@ -402,10 +400,9 @@ static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
     SRARI_H4_SH(out0, out1, out2, out3, 6);
     SAT_SH4_SH(out0, out1, out2, out3, 7);
     out = PCKEV_XORI128_UB(out0, out1);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-    dst += (4 * dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
     out = PCKEV_XORI128_UB(out2, out3);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
 }
 
 static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
@@ -468,7 +465,7 @@ static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
         SAT_SH4_SH(out0, out1, out2, out3, 7);
         tmp0 = PCKEV_XORI128_UB(out0, out1);
         tmp1 = PCKEV_XORI128_UB(out2, out3);
-        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -546,8 +543,8 @@ static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
         tmp1 = PCKEV_XORI128_UB(out2, out3);
         tmp2 = PCKEV_XORI128_UB(out4, out5);
 
-        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
-        ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
+        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
+        ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -670,7 +667,7 @@ static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
         SAT_SH4_SH(out0, out8, out2, out9, 7);
         SAT_SH2_SH(out1, out3, 7);
         out = PCKEV_XORI128_UB(out8, out9);
-        ST8x2_UB(out, dst + 16, dst_stride);
+        ST_D2(out, 0, 1, dst + 16, dst_stride);
         out = PCKEV_XORI128_UB(out0, out1);
         ST_UB(out, dst);
         dst += dst_stride;
@@ -965,10 +962,8 @@ static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
         SAT_SH2_SH(out54, out76, 7);
         out0 = PCKEV_XORI128_UB(out10, out32);
         out1 = PCKEV_XORI128_UB(out54, out76);
-        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
-        dst += (4 * dst_stride);
-        ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
-        dst += (4 * dst_stride);
+        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
+        dst += (8 * dst_stride);
 
         src2110 = src10998;
         src4332 = src12111110;
@@ -1019,7 +1014,7 @@ static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
         tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
         tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
-        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
         dst += (4 * dst_stride);
 
         src10_r = src54_r;
@@ -1458,10 +1453,8 @@ static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
         PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
         out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
         out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
-        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
-        dst += (4 * dst_stride);
-        ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
-        dst += (4 * dst_stride);
+        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
+        dst += (8 * dst_stride);
 
         dst10_r = dst98_r;
         dst32_r = dst1110_r;
@@ -1595,7 +1588,7 @@ static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
 
             PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
             out = PCKEV_XORI128_UB(dst0, dst1);
-            ST8x2_UB(out, dst_tmp, dst_stride);
+            ST_D2(out, 0, 1, dst_tmp, dst_stride);
             dst_tmp += (2 * dst_stride);
 
             dst0 = dst2;
@@ -1741,7 +1734,7 @@ static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
 
         PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
         out0 = PCKEV_XORI128_UB(dst0, dst1);
-        ST8x2_UB(out0, dst_tmp, dst_stride);
+        ST_D2(out0, 0, 1, dst_tmp, dst_stride);
         dst_tmp += (2 * dst_stride);
 
         dst0 = dst2;
@@ -1845,10 +1838,8 @@ static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
         PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
         out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
         out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
-        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
-        dst += (4 * dst_stride);
-        ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
-        dst += (4 * dst_stride);
+        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
+        dst += (8 * dst_stride);
 
         dst10_r = dst98_r;
         dst32_r = dst1110_r;
@@ -1944,7 +1935,7 @@ static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
     res0 = __msa_srari_h(res0, 6);
     res0 = __msa_sat_s_h(res0, 7);
     out = PCKEV_XORI128_UB(res0, res0);
-    ST4x2_UB(out, dst, dst_stride);
+    ST_W2(out, 0, 1, dst, dst_stride);
 }
 
 static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
@@ -1971,7 +1962,7 @@ static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
     SRARI_H2_SH(out0, out1, 6);
     SAT_SH2_SH(out0, out1, 7);
     out = PCKEV_XORI128_UB(out0, out1);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
 }
 
 static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
@@ -2004,10 +1995,9 @@ static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
     SRARI_H4_SH(out0, out1, out2, out3, 6);
     SAT_SH4_SH(out0, out1, out2, out3, 7);
     out = PCKEV_XORI128_UB(out0, out1);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-    dst += (4 * dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
     out = PCKEV_XORI128_UB(out2, out3);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
 }
 
 static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
@@ -2038,11 +2028,10 @@ static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
     SRARI_H4_SH(out0, out1, out2, out3, 6);
     SAT_SH4_SH(out0, out1, out2, out3, 7);
     out = PCKEV_XORI128_UB(out0, out1);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-    dst += (4 * dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
     out = PCKEV_XORI128_UB(out2, out3);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-    dst += (4 * dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+    dst += (8 * dst_stride);
 
     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
     src += (8 * src_stride);
@@ -2054,10 +2043,9 @@ static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
     SRARI_H4_SH(out0, out1, out2, out3, 6);
     SAT_SH4_SH(out0, out1, out2, out3, 7);
     out = PCKEV_XORI128_UB(out0, out1);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-    dst += (4 * dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
     out = PCKEV_XORI128_UB(out2, out3);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
 }
 
 static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
@@ -2102,7 +2090,10 @@ static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
     SAT_SH4_SH(out0, out1, out2, out3, 7);
     out4 = PCKEV_XORI128_UB(out0, out1);
     out5 = PCKEV_XORI128_UB(out2, out3);
-    ST6x4_UB(out4, out5, dst, dst_stride);
+    ST_W2(out4, 0, 2, dst, dst_stride);
+    ST_H2(out4, 2, 6, dst + 4, dst_stride);
+    ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
+    ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
     dst += (4 * dst_stride);
 
     LD_SB4(src, src_stride, src0, src1, src2, src3);
@@ -2115,8 +2106,10 @@ static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
     SAT_SH4_SH(out0, out1, out2, out3, 7);
     out4 = PCKEV_XORI128_UB(out0, out1);
     out5 = PCKEV_XORI128_UB(out2, out3);
-    ST6x4_UB(out4, out5, dst, dst_stride);
-    dst += (4 * dst_stride);
+    ST_W2(out4, 0, 2, dst, dst_stride);
+    ST_H2(out4, 2, 6, dst + 4, dst_stride);
+    ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
+    ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
 }
 
 static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
@@ -2148,7 +2141,7 @@ static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
         SRARI_H2_SH(vec0, vec1, 6);
         SAT_SH2_SH(vec0, vec1, 7);
         out = PCKEV_XORI128_UB(vec0, vec1);
-        ST8x2_UB(out, dst, dst_stride);
+        ST_D2(out, 0, 1, dst, dst_stride);
         dst += (2 * dst_stride);
     }
 }
@@ -2182,7 +2175,7 @@ static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
         SAT_SH4_SH(out0, out1, out2, out3, 7);
         tmp0 = PCKEV_XORI128_UB(out0, out1);
         tmp1 = PCKEV_XORI128_UB(out2, out3);
-        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -2235,7 +2228,7 @@ static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
         SRARI_H2_SH(out0, out1, 6);
         SAT_SH2_SH(out0, out1, 7);
         tmp0 = PCKEV_XORI128_UB(out0, out1);
-        ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
+        ST_W4(tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
 
         VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
         VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
@@ -2249,7 +2242,7 @@ static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
         SAT_SH4_SH(out2, out3, out4, out5, 7);
         tmp0 = PCKEV_XORI128_UB(out2, out3);
         tmp1 = PCKEV_XORI128_UB(out4, out5);
-        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -2395,7 +2388,7 @@ static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
         SAT_SH4_SH(out0, out1, out2, out3, 7);
         tmp0 = PCKEV_XORI128_UB(out0, out1);
         tmp1 = PCKEV_XORI128_UB(out2, out3);
-        ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
+        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride);
         dst1 += (4 * dst_stride);
     }
 }
@@ -2496,7 +2489,7 @@ static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
     out10 = __msa_srari_h(out10, 6);
     out10 = __msa_sat_s_h(out10, 7);
     out = PCKEV_XORI128_UB(out10, out10);
-    ST4x2_UB(out, dst, dst_stride);
+    ST_W2(out, 0, 1, dst, dst_stride);
 }
 
 static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
@@ -2540,7 +2533,7 @@ static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
         SRARI_H2_SH(out10, out32, 6);
         SAT_SH2_SH(out10, out32, 7);
         out = PCKEV_XORI128_UB(out10, out32);
-        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -2596,7 +2589,10 @@ static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
     SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
     out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
     out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
-    ST6x4_UB(out0, out1, dst, dst_stride);
+    ST_W2(out0, 0, 2, dst, dst_stride);
+    ST_H2(out0, 2, 6, dst + 4, dst_stride);
+    ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
+    ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
     dst += (4 * dst_stride);
 
     LD_SB2(src, src_stride, src3, src4);
@@ -2619,7 +2615,10 @@ static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
     SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
     out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
     out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
-    ST6x4_UB(out0, out1, dst, dst_stride);
+    ST_W2(out0, 0, 2, dst, dst_stride);
+    ST_H2(out0, 2, 6, dst + 4, dst_stride);
+    ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
+    ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
 }
 
 static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
@@ -2645,7 +2644,7 @@ static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
     SRARI_H2_SH(tmp0, tmp1, 6);
     SAT_SH2_SH(tmp0, tmp1, 7);
     out = PCKEV_XORI128_UB(tmp0, tmp1);
-    ST8x2_UB(out, dst, dst_stride);
+    ST_D2(out, 0, 1, dst, dst_stride);
 }
 
 static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
@@ -2737,7 +2736,7 @@ static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
         tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
         tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
-        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
         dst += (4 * dst_stride);
 
         src10_r = src98_r;
@@ -2811,9 +2810,9 @@ static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
         SAT_SH2_SH(dst0_l, dst1_l, 7);
         out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
         out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
-        ST8x4_UB(out0, out1, dst, dst_stride);
+        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
         out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
-        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
+        ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
         dst += (4 * dst_stride);
 
         src2 = src6;
@@ -2982,12 +2981,12 @@ static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
         out = PCKEV_XORI128_UB(out0_r, out0_l);
         ST_UB(out, dst);
         out = PCKEV_XORI128_UB(out2_r, out2_r);
-        ST8x1_UB(out, dst + 16);
+        ST_D1(out, 0, dst + 16);
         dst += dst_stride;
         out = PCKEV_XORI128_UB(out1_r, out1_l);
         ST_UB(out, dst);
         out = PCKEV_XORI128_UB(out3_r, out3_r);
-        ST8x1_UB(out, dst + 16);
+        ST_D1(out, 0, dst + 16);
         dst += dst_stride;
     }
 }
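
Note: in the 24-wide path each row is one full 16-byte ST_UB plus a trailing doubleword, so ST_D1(out, 0, dst + 16) stores element 0 of 'out' as the last 8 bytes of the row — the same effect as ST8x1_UB, but with the element index made explicit. Scalar sketch, under the same assumptions as above:

    /* Bytes 16..23 of a 24-wide row: a single doubleword store. */
    SD(__msa_copy_u_d((v2i64) out, 0), dst + 16);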
@@ -3137,7 +3136,7 @@ static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
     tmp = __msa_srari_h(tmp, 6);
     tmp = __msa_sat_s_h(tmp, 7);
     out = PCKEV_XORI128_UB(tmp, tmp);
-    ST4x2_UB(out, dst, dst_stride);
+    ST_W2(out, 0, 1, dst, dst_stride);
 }
 
 static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
@@ -3196,7 +3195,7 @@ static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
     SRARI_H2_SH(tmp0, tmp1, 6);
     SAT_SH2_SH(tmp0, tmp1, 7);
     out = PCKEV_XORI128_UB(tmp0, tmp1);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
 }
 
 static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
@@ -3288,7 +3287,7 @@ static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
         SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
         out0 = PCKEV_XORI128_UB(tmp0, tmp1);
         out1 = PCKEV_XORI128_UB(tmp2, tmp3);
-        ST4x8_UB(out0, out1, dst, dst_stride);
+        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
 
         dst10_r = dst98_r;
@@ -3432,10 +3431,8 @@ static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
     out0 = PCKEV_XORI128_UB(tmp0, tmp1);
     out1 = PCKEV_XORI128_UB(tmp2, tmp3);
     out2 = PCKEV_XORI128_UB(tmp4, tmp5);
-    ST4x8_UB(out0, out1, dst, dst_stride);
-    ST2x4_UB(out2, 0, dst + 4, dst_stride);
-    dst += 4 * dst_stride;
-    ST2x4_UB(out2, 4, dst + 4, dst_stride);
+    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
+    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
 }
 
 static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
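
Note: ST_H8 is the eight-element half-word variant. Here it writes bytes 4..5 of eight consecutive rows from 'out2', so the old pair of ST2x4_UB calls and the dst += 4 * dst_stride between them collapse into one call whose indices 0..7 walk the vector's half-words in row order. An illustrative expansion (the MSA element-copy builtins take immediate indices, so the real macro is presumably fully unrolled):

    /* Columns 4..5 of eight rows: half-word i of out2 goes to row i. */
    SH(__msa_copy_u_h((v8i16) out2, 0), dst + 4);
    SH(__msa_copy_u_h((v8i16) out2, 1), dst + 4 + 1 * dst_stride);
    SH(__msa_copy_u_h((v8i16) out2, 2), dst + 4 + 2 * dst_stride);
    SH(__msa_copy_u_h((v8i16) out2, 3), dst + 4 + 3 * dst_stride);
    SH(__msa_copy_u_h((v8i16) out2, 4), dst + 4 + 4 * dst_stride);
    SH(__msa_copy_u_h((v8i16) out2, 5), dst + 4 + 5 * dst_stride);
    SH(__msa_copy_u_h((v8i16) out2, 6), dst + 4 + 6 * dst_stride);
    SH(__msa_copy_u_h((v8i16) out2, 7), dst + 4 + 7 * dst_stride);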
@@ -3497,7 +3494,7 @@ static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
     SRARI_H2_SH(out0_r, out1_r, 6);
     SAT_SH2_SH(out0_r, out1_r, 7);
     out = PCKEV_XORI128_UB(out0_r, out1_r);
-    ST8x2_UB(out, dst, dst_stride);
+    ST_D2(out, 0, 1, dst, dst_stride);
 }
 
 static void hevc_hv_uni_4t_8multx4_msa(uint8_t *src,
@@ -3580,7 +3577,7 @@ static void hevc_hv_uni_4t_8multx4_msa(uint8_t *src,
         SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
         out0 = PCKEV_XORI128_UB(tmp0, tmp1);
         out1 = PCKEV_XORI128_UB(tmp2, tmp3);
-        ST8x4_UB(out0, out1, dst, dst_stride);
+        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
         dst += 8;
     }
 }
@@ -3684,9 +3681,8 @@ static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
     out1 = PCKEV_XORI128_UB(out2_r, out3_r);
     out2 = PCKEV_XORI128_UB(out4_r, out5_r);
 
-    ST8x4_UB(out0, out1, dst, dst_stride);
-    dst += (4 * dst_stride);
-    ST8x2_UB(out2, dst, dst_stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
 }
 
 static void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src,
@@ -3788,7 +3784,7 @@ static void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src,
             SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
             out0 = PCKEV_XORI128_UB(out0_r, out1_r);
             out1 = PCKEV_XORI128_UB(out2_r, out3_r);
-            ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
             dst_tmp += (4 * dst_stride);
 
             dst10_r = dst54_r;
@@ -3919,7 +3915,7 @@ static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
         SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
         out0 = PCKEV_XORI128_UB(tmp0, tmp1);
         out1 = PCKEV_XORI128_UB(tmp2, tmp3);
-        ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
         dst_tmp += (4 * dst_stride);
 
         dst10_r = dst54_r;
@@ -3985,7 +3981,7 @@ static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
         SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
         out0 = PCKEV_XORI128_UB(tmp0, tmp1);
         out1 = PCKEV_XORI128_UB(tmp2, tmp3);
-        ST4x8_UB(out0, out1, dst, dst_stride);
+        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
         dst += (8 * dst_stride);
 
         dst10_r = dst98_r;
