adler32: rewrite using integer SIMD.

About twice as fast as before. The non-CONFIG_SMALL case is also dropped, as it is not faster than the CONFIG_SMALL case. Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
2024-11-21 10:55:51 +02:00 · 2012-02-04 07:52:31 +01:00 · 2012-02-04 07:52:31 +01:00 · 26585d2a7f
commit 26585d2a7f
parent 5b0103590c
1 changed file with 32 additions and 11 deletions
--- a/libavutil/adler32.c
+++ b/libavutil/adler32.c
@@ -24,6 +24,7 @@
 #include "config.h"
 #include "adler32.h"
 #include "common.h"
+#include "intreadwrite.h"

 #define BASE 65521L /* largest prime smaller than 65536 */

@@ -38,22 +39,42 @@ unsigned long av_adler32_update(unsigned long adler, const uint8_t * buf,
    unsigned long s2 = adler >> 16;

    while (len > 0) {
-        unsigned len2 = FFMIN((len-1) & ~15, 2048);
+#if HAVE_FAST_64BIT && HAVE_FAST_UNALIGNED && !CONFIG_SMALL
+        unsigned len2 = FFMIN((len-1) & ~7, 23*8);
        if (len2) {
+            uint64_t a1= 0;
+            uint64_t a2= 0;
+            uint64_t b1= 0;
+            uint64_t b2= 0;
            len -= len2;
-
-#if CONFIG_SMALL
-        while (len2 >= 4) {
-            DO4(buf);
-            len2 -= 4;
-        }
+            s2 += s1*len2;
+            while (len2 >= 8) {
+                uint64_t v = AV_RN64(buf);
+                a2 += a1;
+                b2 += b1;
+                a1 +=  v    &0x00FF00FF00FF00FF;
+                b1 += (v>>8)&0x00FF00FF00FF00FF;
+                len2 -= 8;
+                buf+=8;
+            }
+            s1 += ((a1+b1)*0x1000100010001)>>48;
+            s2 += ((((a2&0xFFFF0000FFFF)+(b2&0xFFFF0000FFFF)+((a2>>16)&0xFFFF0000FFFF)+((b2>>16)&0xFFFF0000FFFF))*0x800000008)>>32)
+#if HAVE_BIGENDIAN
+                 + 2*((b1*0x1000200030004)>>48)
+                 +   ((a1*0x1000100010001)>>48)
+                 + 2*((a1*0x0000100020003)>>48);
 #else
-        while (len2 >= 16) {
-            DO16(buf);
-            len2 -= 16;
-        }
+                 + 2*((a1*0x4000300020001)>>48)
+                 +   ((b1*0x1000100010001)>>48)
+                 + 2*((b1*0x3000200010000)>>48);
 #endif
        }
+#else
+        while (len > 4  && s2 < (1U << 31)) {
+            DO4(buf);
+            len -= 4;
+        }
+#endif
        DO1(buf); len--;
        s1 %= BASE;
        s2 %= BASE;