From 39d89b69666b1f8596c9edc8bfdc29a70610fb68 Mon Sep 17 00:00:00 2001 From: Michael Niedermayer Date: Fri, 28 May 2004 13:23:53 +0000 Subject: [PATCH] per line lowpass filter in mmx Originally committed as revision 3166 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/libpostproc/postprocess.c | 1 + libavcodec/libpostproc/postprocess_template.c | 219 +++++++++++++++--- 2 files changed, 183 insertions(+), 37 deletions(-) diff --git a/libavcodec/libpostproc/postprocess.c b/libavcodec/libpostproc/postprocess.c index 7337e54f28..2ab85071bc 100644 --- a/libavcodec/libpostproc/postprocess.c +++ b/libavcodec/libpostproc/postprocess.c @@ -117,6 +117,7 @@ try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks #ifdef ARCH_X86 static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL; +static uint64_t __attribute__((aligned(8))) attribute_used w04= 0x0004000400040004LL; static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL; static uint64_t __attribute__((aligned(8))) attribute_used b00= 0x0000000000000000LL; static uint64_t __attribute__((aligned(8))) attribute_used b01= 0x0101010101010101LL; diff --git a/libavcodec/libpostproc/postprocess_template.c b/libavcodec/libpostproc/postprocess_template.c index a52a10cdb4..317ac52566 100644 --- a/libavcodec/libpostproc/postprocess_template.c +++ b/libavcodec/libpostproc/postprocess_template.c @@ -2617,9 +2617,8 @@ Switch between * accurate deblock filter */ static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ - int y; - const int QP= c->QP; int64_t dc_mask, eq_mask; + int64_t sums[10*8*2]; src+= step*3; // src points to begin of the 8x8 Block //START_TIMER asm volatile( @@ -2725,9 +2724,188 @@ asm volatile( : "%eax" ); - src+= step; // src points to begin of the 8x8 Block + if(dc_mask & eq_mask){ + int offset= -8*step; + int64_t *temp_sums= sums; + + asm volatile( + "movq %2, %%mm0 \n\t" // QP,..., QP + "pxor %%mm4, %%mm4 \n\t" + + "movq (%0), %%mm6 \n\t" + "movq (%0, %1), %%mm5 \n\t" + "movq %%mm5, %%mm1 \n\t" + "movq %%mm6, %%mm2 \n\t" + "psubusb %%mm6, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "por %%mm5, %%mm2 \n\t" // ABS Diff of lines + "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 + "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF + + "pxor %%mm6, %%mm1 \n\t" + "pand %%mm0, %%mm1 \n\t" + "pxor %%mm1, %%mm6 \n\t" + // 0:QP 6:First + + "movq (%0, %1, 8), %%mm5 \n\t" + "addl %1, %0 \n\t" // %0 points to line 1 not 0 + "movq (%0, %1, 8), %%mm7 \n\t" + "movq %%mm5, %%mm1 \n\t" + "movq %%mm7, %%mm2 \n\t" + "psubusb %%mm7, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "por %%mm5, %%mm2 \n\t" // ABS Diff of lines + "movq %2, %%mm0 \n\t" // QP,..., QP + "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 + "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF + + "pxor %%mm7, %%mm1 \n\t" + "pand %%mm0, %%mm1 \n\t" + "pxor %%mm1, %%mm7 \n\t" + + "movq %%mm6, %%mm5 \n\t" + "punpckhbw %%mm4, %%mm6 \n\t" + "punpcklbw %%mm4, %%mm5 \n\t" + // 4:0 5/6:First 7:Last + + "movq %%mm5, %%mm0 \n\t" + "movq %%mm6, %%mm1 \n\t" + "psllw $2, %%mm0 \n\t" + "psllw $2, %%mm1 \n\t" + "paddw "MANGLE(w04)", %%mm0 \n\t" + "paddw "MANGLE(w04)", %%mm1 \n\t" + +#define NEXT\ + "movq (%0), %%mm2 \n\t"\ + "movq (%0), %%mm3 \n\t"\ + "addl %1, %0 \n\t"\ + "punpcklbw %%mm4, %%mm2 \n\t"\ + "punpckhbw %%mm4, %%mm3 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t" + +#define PREV\ + "movq (%0), %%mm2 \n\t"\ + "movq (%0), %%mm3 \n\t"\ + "addl %1, %0 \n\t"\ + "punpcklbw 
%%mm4, %%mm2 \n\t"\ + "punpckhbw %%mm4, %%mm3 \n\t"\ + "psubw %%mm2, %%mm0 \n\t"\ + "psubw %%mm3, %%mm1 \n\t" + + + NEXT //0 + NEXT //1 + NEXT //2 + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + + NEXT //3 + "psubw %%mm5, %%mm0 \n\t" + "psubw %%mm6, %%mm1 \n\t" + "movq %%mm0, 16(%3) \n\t" + "movq %%mm1, 24(%3) \n\t" + + NEXT //4 + "psubw %%mm5, %%mm0 \n\t" + "psubw %%mm6, %%mm1 \n\t" + "movq %%mm0, 32(%3) \n\t" + "movq %%mm1, 40(%3) \n\t" + + NEXT //5 + "psubw %%mm5, %%mm0 \n\t" + "psubw %%mm6, %%mm1 \n\t" + "movq %%mm0, 48(%3) \n\t" + "movq %%mm1, 56(%3) \n\t" + + NEXT //6 + "psubw %%mm5, %%mm0 \n\t" + "psubw %%mm6, %%mm1 \n\t" + "movq %%mm0, 64(%3) \n\t" + "movq %%mm1, 72(%3) \n\t" + + "movq %%mm7, %%mm6 \n\t" + "punpckhbw %%mm4, %%mm7 \n\t" + "punpcklbw %%mm4, %%mm6 \n\t" + + NEXT //7 + "movl %4, %0 \n\t" + "addl %1, %0 \n\t" + PREV //0 + "movq %%mm0, 80(%3) \n\t" + "movq %%mm1, 88(%3) \n\t" + + PREV //1 + "paddw %%mm6, %%mm0 \n\t" + "paddw %%mm7, %%mm1 \n\t" + "movq %%mm0, 96(%3) \n\t" + "movq %%mm1, 104(%3) \n\t" + + PREV //2 + "paddw %%mm6, %%mm0 \n\t" + "paddw %%mm7, %%mm1 \n\t" + "movq %%mm0, 112(%3) \n\t" + "movq %%mm1, 120(%3) \n\t" + + PREV //3 + "paddw %%mm6, %%mm0 \n\t" + "paddw %%mm7, %%mm1 \n\t" + "movq %%mm0, 128(%3) \n\t" + "movq %%mm1, 136(%3) \n\t" + + PREV //4 + "paddw %%mm6, %%mm0 \n\t" + "paddw %%mm7, %%mm1 \n\t" + "movq %%mm0, 144(%3) \n\t" + "movq %%mm1, 152(%3) \n\t" + + "movl %4, %0 \n\t" //FIXME + + : "+&r"(src) + : "r" (step), "m" (c->pQPb), "r"(sums), "g"(src) + ); + + src+= step; // src points to begin of the 8x8 Block + + asm volatile( + "movq %4, %%mm6 \n\t" + "pcmpeqb %%mm5, %%mm5 \n\t" + "pxor %%mm6, %%mm5 \n\t" + "pxor %%mm7, %%mm7 \n\t" + + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "paddw 32(%1), %%mm0 \n\t" + "paddw 40(%1), %%mm1 \n\t" + "movq (%0, %3), %%mm2 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm2, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddw %%mm2, %%mm0 \n\t" + "paddw %%mm3, %%mm1 \n\t" + "paddw %%mm2, %%mm0 \n\t" + "paddw %%mm3, %%mm1 \n\t" + "psrlw $4, %%mm0 \n\t" + "psrlw $4, %%mm1 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "pand %%mm6, %%mm0 \n\t" + "pand %%mm5, %%mm4 \n\t" + "por %%mm4, %%mm0 \n\t" + "movq %%mm0, (%0, %3) \n\t" + "addl $16, %1 \n\t" + "addl %2, %0 \n\t" + " js 1b \n\t" + + : "+r"(offset), "+r"(temp_sums) + : "r" (step), "r"(src - offset), "m"(dc_mask & eq_mask) + ); + }else + src+= step; // src points to begin of the 8x8 Block if(eq_mask != -1LL){ + uint8_t *temp_src= src; asm volatile( "pxor %%mm7, %%mm7 \n\t" "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars @@ -2955,43 +3133,10 @@ asm volatile( "psubb %%mm1, %%mm0 \n\t" "movq %%mm0, (%0, %1) \n\t" - : "+r" (src) + : "+r" (temp_src) : "r" (step), "m" (c->pQPb), "m"(eq_mask) : "%eax", "%ecx" ); - src-= 3*step; //reverse src change from asm - } - - for(y=0; y<8; y++){ - if((eq_mask>>(y*8))&1){ - if((dc_mask>>(y*8))&1){ - const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0]; - const int last= ABS(src[8*step] - src[7*step]) < QP ? 
src[8*step] : src[7*step]; - - int sums[10]; - sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4; - sums[1] = sums[0] - first + src[3*step]; - sums[2] = sums[1] - first + src[4*step]; - sums[3] = sums[2] - first + src[5*step]; - sums[4] = sums[3] - first + src[6*step]; - sums[5] = sums[4] - src[0*step] + src[7*step]; - sums[6] = sums[5] - src[1*step] + last; - sums[7] = sums[6] - src[2*step] + last; - sums[8] = sums[7] - src[3*step] + last; - sums[9] = sums[8] - src[4*step] + last; - - src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4; - src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4; - src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4; - src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4; - src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4; - src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4; - src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4; - src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4; - } - } - - src += stride; } /*if(step==16){ STOP_TIMER("step16")
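
For context, the scalar form of this filter is the C fallback removed at the end of the hunk: the new MMX path builds the same running sums for all eight columns of a line at once (the int64_t sums[10*8*2] buffer holds two quadwords of 16-bit partial sums per line), and the second asm loop then applies (sums[i] + sums[i+2] + 2*src)>>4, blending the result with the original pixels under dc_mask & eq_mask. A compact, self-contained sketch of that scalar filter follows, with the unrolled statements folded into a loop; the function name is illustrative and ABS is redefined locally so the snippet stands alone.

#include <stdint.h>

#define ABS(a) ((a) > 0 ? (a) : (-(a)))  /* local copy so the sketch is self-contained */

/* Scalar per-line lowpass over src[0*step]..src[7*step]: each output pixel is a
 * weighted average of a 9-pixel neighbourhood built from two sliding 7-pixel sums.
 * Taps outside the 8-pixel block are clamped to the block's own edge pixel when
 * the step across the edge is >= QP, so real edges are not smeared. */
static void lowpass_line_scalar(uint8_t *src, int step, int QP)
{
    const int first = ABS(src[-1*step] - src[0*step]) < QP ? src[-1*step] : src[0*step];
    const int last  = ABS(src[ 8*step] - src[7*step]) < QP ? src[ 8*step] : src[7*step];
    int sums[10];
    int i;

    sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
    sums[1] = sums[0] - first       + src[3*step];
    sums[2] = sums[1] - first       + src[4*step];
    sums[3] = sums[2] - first       + src[5*step];
    sums[4] = sums[3] - first       + src[6*step];
    sums[5] = sums[4] - src[0*step] + src[7*step];
    sums[6] = sums[5] - src[1*step] + last;
    sums[7] = sums[6] - src[2*step] + last;
    sums[8] = sums[7] - src[3*step] + last;
    sums[9] = sums[8] - src[4*step] + last;

    /* all sums are built from the original samples before any pixel is written */
    for (i = 0; i < 8; i++)
        src[i*step] = (sums[i] + sums[i+2] + 2*src[i*step]) >> 4;
}

The running-sum formulation is what makes the vectorisation cheap: once the initial window is set up, advancing to the next line costs only one packed add and one packed subtract per column, which is what the NEXT/PREV macros implement on 16-bit words.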