mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-21 10:55:51 +02:00
lavc/mips: Fix bugs in me_cmp_msa.c file.
This patch fixes a bug where the fate-checkasm-motion fails when h is not a multiple of 8. Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
This commit is contained in:
parent
629aa5fbc0
commit
d5679d6899
@ -25,11 +25,13 @@ static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
|
||||
const uint8_t *ref, int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
int32_t ht_cnt = height >> 2;
|
||||
int res = (height & 0x03);
|
||||
v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
|
||||
v8u16 zero = { 0 };
|
||||
v8u16 sad = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 2); ht_cnt--;) {
|
||||
for (; ht_cnt--; ) {
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
|
||||
@ -39,6 +41,16 @@ static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
|
||||
src0, src1, ref0, ref1);
|
||||
sad += SAD_UB2_UH(src0, src1, ref0, ref1);
|
||||
}
|
||||
for (; res--; ) {
|
||||
v16u8 diff;
|
||||
src0 = LD_UB(src);
|
||||
ref0 = LD_UB(ref);
|
||||
src += src_stride;
|
||||
ref += ref_stride;
|
||||
diff = __msa_asub_u_b((v16u8) src0, (v16u8) ref0);
|
||||
diff = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)diff);
|
||||
sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
|
||||
}
|
||||
|
||||
return (HADD_UH_U32(sad));
|
||||
}
|
||||
@ -47,11 +59,12 @@ static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
|
||||
const uint8_t *ref, int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
int32_t ht_cnt = height >> 2;
|
||||
int res = (height & 0x03);
|
||||
v16u8 src0, src1, ref0, ref1;
|
||||
v8u16 sad = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 2); ht_cnt--;) {
|
||||
for (; ht_cnt--; ) {
|
||||
LD_UB2(src, src_stride, src0, src1);
|
||||
src += (2 * src_stride);
|
||||
LD_UB2(ref, ref_stride, ref0, ref1);
|
||||
@ -64,7 +77,15 @@ static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
|
||||
ref += (2 * ref_stride);
|
||||
sad += SAD_UB2_UH(src0, src1, ref0, ref1);
|
||||
}
|
||||
|
||||
for (; res > 0; res--) {
|
||||
v16u8 diff;
|
||||
src0 = LD_UB(src);
|
||||
ref0 = LD_UB(ref);
|
||||
src += src_stride;
|
||||
ref += ref_stride;
|
||||
diff = __msa_asub_u_b((v16u8) src0, (v16u8) ref0);
|
||||
sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
|
||||
}
|
||||
return (HADD_UH_U32(sad));
|
||||
}
|
||||
|
||||
@ -74,12 +95,14 @@ static uint32_t sad_horiz_bilinear_filter_8width_msa(const uint8_t *src,
|
||||
int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
int32_t ht_cnt = height >> 3;
|
||||
int32_t res = height & 0x07;
|
||||
v16u8 src0, src1, src2, src3, comp0, comp1;
|
||||
v16u8 ref0, ref1, ref2, ref3, ref4, ref5;
|
||||
v8u16 zero = { 0 };
|
||||
v8u16 sad = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 3); ht_cnt--;) {
|
||||
for (; ht_cnt--; ) {
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
|
||||
@ -107,6 +130,18 @@ static uint32_t sad_horiz_bilinear_filter_8width_msa(const uint8_t *src,
|
||||
sad += SAD_UB2_UH(src0, src1, comp0, comp1);
|
||||
}
|
||||
|
||||
for (; res--; ) {
|
||||
v16u8 diff;
|
||||
src0 = LD_UB(src);
|
||||
ref0 = LD_UB(ref);
|
||||
ref1 = LD_UB(ref + 1);
|
||||
src += src_stride;
|
||||
ref += ref_stride;
|
||||
comp0 = (v16u8)__msa_aver_u_b((v16u8) ref0, (v16u8) ref1);
|
||||
diff = __msa_asub_u_b((v16u8) src0, (v16u8) comp0);
|
||||
diff = (v16u8)__msa_ilvr_d((v2i64) zero, (v2i64) diff);
|
||||
sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
|
||||
}
|
||||
return (HADD_UH_U32(sad));
|
||||
}
|
||||
|
||||
@ -116,12 +151,13 @@ static uint32_t sad_horiz_bilinear_filter_16width_msa(const uint8_t *src,
|
||||
int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
int32_t ht_cnt = height >> 3;
|
||||
int32_t res = height & 0x07;
|
||||
v16u8 src0, src1, src2, src3, comp0, comp1;
|
||||
v16u8 ref00, ref10, ref20, ref30, ref01, ref11, ref21, ref31;
|
||||
v8u16 sad = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 3); ht_cnt--;) {
|
||||
for (; ht_cnt--; ) {
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
|
||||
@ -145,6 +181,17 @@ static uint32_t sad_horiz_bilinear_filter_16width_msa(const uint8_t *src,
|
||||
sad += SAD_UB2_UH(src2, src3, comp0, comp1);
|
||||
}
|
||||
|
||||
for (; res--; ) {
|
||||
v16u8 diff;
|
||||
src0 = LD_UB(src);
|
||||
ref00 = LD_UB(ref);
|
||||
ref01 = LD_UB(ref + 1);
|
||||
src += src_stride;
|
||||
ref += ref_stride;
|
||||
comp0 = (v16u8)__msa_aver_u_b((v16u8) ref00, (v16u8) ref01);
|
||||
diff = __msa_asub_u_b((v16u8) src0, (v16u8) comp0);
|
||||
sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
|
||||
}
|
||||
return (HADD_UH_U32(sad));
|
||||
}
|
||||
|
||||
@ -154,12 +201,14 @@ static uint32_t sad_vert_bilinear_filter_8width_msa(const uint8_t *src,
|
||||
int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
int32_t ht_cnt = height >> 3;
|
||||
int32_t res = height & 0x07;
|
||||
v16u8 src0, src1, src2, src3, comp0, comp1;
|
||||
v16u8 ref0, ref1, ref2, ref3, ref4;
|
||||
v8u16 zero = { 0 };
|
||||
v8u16 sad = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 3); ht_cnt--;) {
|
||||
for (; ht_cnt--; ) {
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
|
||||
@ -183,6 +232,17 @@ static uint32_t sad_vert_bilinear_filter_8width_msa(const uint8_t *src,
|
||||
sad += SAD_UB2_UH(src0, src1, comp0, comp1);
|
||||
}
|
||||
|
||||
for (; res--; ) {
|
||||
v16u8 diff;
|
||||
src0 = LD_UB(src);
|
||||
LD_UB2(ref, ref_stride, ref0, ref1);
|
||||
src += src_stride;
|
||||
ref += ref_stride;
|
||||
comp0 = (v16u8)__msa_aver_u_b((v16u8) ref0, (v16u8) ref1);
|
||||
diff = __msa_asub_u_b((v16u8) src0, (v16u8) comp0);
|
||||
diff = (v16u8)__msa_ilvr_d((v2i64) zero, (v2i64) diff);
|
||||
sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
|
||||
}
|
||||
return (HADD_UH_U32(sad));
|
||||
}
|
||||
|
||||
@ -192,12 +252,13 @@ static uint32_t sad_vert_bilinear_filter_16width_msa(const uint8_t *src,
|
||||
int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
int32_t ht_cnt = height >> 3;
|
||||
int32_t res = height & 0x07;
|
||||
v16u8 src0, src1, src2, src3, comp0, comp1;
|
||||
v16u8 ref0, ref1, ref2, ref3, ref4;
|
||||
v8u16 sad = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 3); ht_cnt--;) {
|
||||
for (; ht_cnt--; ) {
|
||||
LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
|
||||
ref += (5 * ref_stride);
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
@ -221,6 +282,16 @@ static uint32_t sad_vert_bilinear_filter_16width_msa(const uint8_t *src,
|
||||
sad += SAD_UB2_UH(src2, src3, comp0, comp1);
|
||||
}
|
||||
|
||||
for (; res--; ) {
|
||||
v16u8 diff;
|
||||
src0 = LD_UB(src);
|
||||
LD_UB2(ref, ref_stride, ref0, ref1);
|
||||
src += src_stride;
|
||||
ref += ref_stride;
|
||||
comp0 = (v16u8)__msa_aver_u_b((v16u8) ref0, (v16u8) ref1);
|
||||
diff = __msa_asub_u_b((v16u8) src0, (v16u8) comp0);
|
||||
sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
|
||||
}
|
||||
return (HADD_UH_U32(sad));
|
||||
}
|
||||
|
||||
@ -230,11 +301,13 @@ static uint32_t sad_hv_bilinear_filter_8width_msa(const uint8_t *src,
|
||||
int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
int32_t ht_cnt = height >> 2;
|
||||
int32_t res = height & 0x03;
|
||||
v16u8 src0, src1, src2, src3, temp0, temp1, diff;
|
||||
v16u8 ref0, ref1, ref2, ref3, ref4;
|
||||
v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
|
||||
v8u16 comp0, comp1, comp2, comp3;
|
||||
v8u16 zero = { 0 };
|
||||
v8u16 sad = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 2); ht_cnt--;) {
|
||||
@ -277,6 +350,22 @@ static uint32_t sad_hv_bilinear_filter_8width_msa(const uint8_t *src,
|
||||
sad += __msa_hadd_u_h(diff, diff);
|
||||
}
|
||||
|
||||
for (; res--; ) {
|
||||
src0 = LD_UB(src);
|
||||
LD_UB2(ref, ref_stride, ref0, ref1);
|
||||
temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref0, (v16i8) ref0);
|
||||
temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
|
||||
src += src_stride;
|
||||
ref += ref_stride;
|
||||
comp0 = __msa_hadd_u_h(temp0, temp0);
|
||||
comp2 = __msa_hadd_u_h(temp1, temp1);
|
||||
comp2 += comp0;
|
||||
comp2 = (v8u16)__msa_srari_h((v8i16) comp2, 2);
|
||||
comp0 = (v16u8) __msa_pckev_b((v16i8) zero, (v16i8) comp2);
|
||||
diff = __msa_asub_u_b(src0, comp0);
|
||||
diff = (v16u8)__msa_ilvr_d((v2i64) zero, (v2i64) diff);
|
||||
sad += __msa_hadd_u_h(diff, diff);
|
||||
}
|
||||
return (HADD_UH_U32(sad));
|
||||
}
|
||||
|
||||
@ -286,14 +375,15 @@ static uint32_t sad_hv_bilinear_filter_16width_msa(const uint8_t *src,
|
||||
int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
int32_t ht_cnt = height >> 3;
|
||||
int32_t res = height & 0x07;
|
||||
v16u8 src0, src1, src2, src3, comp, diff;
|
||||
v16u8 temp0, temp1, temp2, temp3;
|
||||
v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14;
|
||||
v8u16 comp0, comp1, comp2, comp3;
|
||||
v8u16 sad = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 3); ht_cnt--;) {
|
||||
for (; ht_cnt--; ) {
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03);
|
||||
@ -389,6 +479,25 @@ static uint32_t sad_hv_bilinear_filter_16width_msa(const uint8_t *src,
|
||||
diff = __msa_asub_u_b(src3, comp);
|
||||
sad += __msa_hadd_u_h(diff, diff);
|
||||
}
|
||||
for (; res--; ) {
|
||||
src0 = LD_UB(src);
|
||||
LD_UB2(ref, ref_stride, ref00, ref10);
|
||||
LD_UB2(ref + 1, ref_stride, ref01, ref11);
|
||||
src += src_stride;
|
||||
ref += ref_stride;
|
||||
ILVRL_B2_UB(ref10, ref00, temp0, temp1);
|
||||
ILVRL_B2_UB(ref11, ref01, temp2, temp3);
|
||||
comp0 = __msa_hadd_u_h(temp0, temp0);
|
||||
comp1 = __msa_hadd_u_h(temp1, temp1);
|
||||
comp2 = __msa_hadd_u_h(temp2, temp2);
|
||||
comp3 = __msa_hadd_u_h(temp3, temp3);
|
||||
comp2 += comp0;
|
||||
comp3 += comp1;
|
||||
SRARI_H2_UH(comp2, comp3, 2);
|
||||
comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
|
||||
diff = __msa_asub_u_b(src0, comp);
|
||||
sad += __msa_hadd_u_h(diff, diff);
|
||||
}
|
||||
|
||||
return (HADD_UH_U32(sad));
|
||||
}
|
||||
@ -407,15 +516,17 @@ static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
|
||||
const uint8_t *ref_ptr, int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
int32_t ht_cnt = height >> 2;
|
||||
int32_t res = height & 0x03;
|
||||
uint32_t sse;
|
||||
uint32_t src0, src1, src2, src3;
|
||||
uint32_t ref0, ref1, ref2, ref3;
|
||||
v16u8 src = { 0 };
|
||||
v16u8 ref = { 0 };
|
||||
v4i32 var = { 0 };
|
||||
v16u8 src = { 0 };
|
||||
v16u8 ref = { 0 };
|
||||
v16u8 zero = { 0 };
|
||||
v4i32 var = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 2); ht_cnt--;) {
|
||||
for (; ht_cnt--; ) {
|
||||
LW4(src_ptr, src_stride, src0, src1, src2, src3);
|
||||
src_ptr += (4 * src_stride);
|
||||
LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
|
||||
@ -426,6 +537,20 @@ static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
|
||||
CALC_MSE_B(src, ref, var);
|
||||
}
|
||||
|
||||
for (; res--; ) {
|
||||
v16u8 reg0;
|
||||
v8i16 tmp0;
|
||||
src0 = LW(src_ptr);
|
||||
ref0 = LW(ref_ptr);
|
||||
src_ptr += src_stride;
|
||||
ref_ptr += ref_stride;
|
||||
src = (v16u8)__msa_insert_w((v4i32) src, 0, src0);
|
||||
ref = (v16u8)__msa_insert_w((v4i32) ref, 0, ref0);
|
||||
reg0 = (v16u8)__msa_ilvr_b(src, ref);
|
||||
reg0 = (v16u8)__msa_ilvr_d((v2i64) zero, (v2i64) reg0);
|
||||
tmp0 = (v8i16)__msa_hsub_u_h((v16u8) reg0, (v16u8) reg0);
|
||||
var = (v4i32)__msa_dpadd_s_w((v4i32) var, (v8i16) tmp0, (v8i16) tmp0);
|
||||
}
|
||||
sse = HADD_SW_S32(var);
|
||||
|
||||
return sse;
|
||||
@ -435,13 +560,14 @@ static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
|
||||
const uint8_t *ref_ptr, int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
int32_t ht_cnt = height >> 2;
|
||||
int32_t res = height & 0x03;
|
||||
uint32_t sse;
|
||||
v16u8 src0, src1, src2, src3;
|
||||
v16u8 ref0, ref1, ref2, ref3;
|
||||
v4i32 var = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 2); ht_cnt--;) {
|
||||
for (; ht_cnt--; ) {
|
||||
LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
|
||||
src_ptr += (4 * src_stride);
|
||||
LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
|
||||
@ -453,6 +579,16 @@ static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
|
||||
CALC_MSE_B(src1, ref1, var);
|
||||
}
|
||||
|
||||
for (; res--; ) {
|
||||
v8i16 tmp0;
|
||||
src0 = LD_UB(src_ptr);
|
||||
ref0 = LD_UB(ref_ptr);
|
||||
src_ptr += src_stride;
|
||||
ref_ptr += ref_stride;
|
||||
ref1 = (v16u8)__msa_ilvr_b(src0, ref0);
|
||||
tmp0 = (v8i16)__msa_hsub_u_h((v16u8) ref1, (v16u8) ref1);
|
||||
var = (v4i32)__msa_dpadd_s_w((v4i32) var, (v8i16) tmp0, (v8i16) tmp0);
|
||||
}
|
||||
sse = HADD_SW_S32(var);
|
||||
|
||||
return sse;
|
||||
@ -462,12 +598,13 @@ static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
|
||||
const uint8_t *ref_ptr, int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
int32_t ht_cnt = height >> 2;
|
||||
int32_t res = height & 0x03;
|
||||
uint32_t sse;
|
||||
v16u8 src, ref;
|
||||
v4i32 var = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 2); ht_cnt--;) {
|
||||
for (; ht_cnt--; ) {
|
||||
src = LD_UB(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
ref = LD_UB(ref_ptr);
|
||||
@ -493,6 +630,14 @@ static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
|
||||
CALC_MSE_B(src, ref, var);
|
||||
}
|
||||
|
||||
for (; res--; ) {
|
||||
src = LD_UB(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
ref = LD_UB(ref_ptr);
|
||||
ref_ptr += ref_stride;
|
||||
CALC_MSE_B(src, ref, var);
|
||||
}
|
||||
|
||||
sse = HADD_SW_S32(var);
|
||||
|
||||
return sse;
|
||||
@ -544,7 +689,7 @@ static int32_t hadamard_diff_8x8_msa(const uint8_t *src, int32_t src_stride,
|
||||
}
|
||||
|
||||
static int32_t hadamard_intra_8x8_msa(const uint8_t *src, int32_t src_stride,
|
||||
const uint8_t *ref, int32_t ref_stride)
|
||||
const uint8_t *dumy, int32_t ref_stride)
|
||||
{
|
||||
int32_t sum_res = 0;
|
||||
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
@ -659,10 +804,10 @@ int ff_hadamard8_diff8x8_msa(MpegEncContext *s, const uint8_t *dst, const uint8_
|
||||
return hadamard_diff_8x8_msa(src, stride, dst, stride);
|
||||
}
|
||||
|
||||
int ff_hadamard8_intra8x8_msa(MpegEncContext *s, const uint8_t *dst, const uint8_t *src,
|
||||
int ff_hadamard8_intra8x8_msa(MpegEncContext *s, const uint8_t *src, const uint8_t *dummy,
|
||||
ptrdiff_t stride, int h)
|
||||
{
|
||||
return hadamard_intra_8x8_msa(src, stride, dst, stride);
|
||||
return hadamard_intra_8x8_msa(src, stride, dummy, stride);
|
||||
}
|
||||
|
||||
/* Hadamard Transform functions */
|
||||
|
Loading…
Reference in New Issue
Block a user