From 118bd609f048f457cf42d358a07510b87626f316 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Reimar=20D=C3=B6ffinger?= Date: Sat, 11 Aug 2012 14:15:09 +0200 Subject: [PATCH] Optimized unscaled yuvp9/yuvp10 -> yuvp16 conversion. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit About 30% faster on 32 bit Atom, 120% faster on 64 bit Phenom2. This is interesting because supporting P16 is easier in e.g. OpenGL (can misuse support for any 2-component 8 bit format), whereas supporting p9/p10 without conversion needs a texture format with at least 14 bits actual precision. The shiftonly == 0 case is not optimized since the code is more complex and the speed gain less obvious. Signed-off-by: Reimar Döffinger --- libswscale/swscale_unscaled.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c index c391a07d51..9180f2eb5c 100644 --- a/libswscale/swscale_unscaled.c +++ b/libswscale/swscale_unscaled.c @@ -830,7 +830,34 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t *src[], srcPtr += srcStride[plane]; } } else if (src_depth <= dst_depth) { + int orig_length = length; for (i = 0; i < height; i++) { + if(isBE(c->srcFormat) == HAVE_BIGENDIAN && + isBE(c->dstFormat) == HAVE_BIGENDIAN && + shiftonly) { + unsigned shift = dst_depth - src_depth; + length = orig_length; +#if HAVE_FAST_64BIT +#define FAST_COPY_UP(shift) \ + for (j = 0; j < length - 3; j += 4) { \ + uint64_t v = AV_RN64A(srcPtr2 + j); \ + AV_WN64A(dstPtr2 + j, v << shift); \ + } \ + length &= 3; +#else +#define FAST_COPY_UP(shift) \ + for (j = 0; j < length - 1; j += 2) { \ + uint32_t v = AV_RN32A(srcPtr2 + j); \ + AV_WN32A(dstPtr2 + j, v << shift); \ + } \ + length &= 1; +#endif + switch (shift) + { + case 6: FAST_COPY_UP(6); break; + case 7: FAST_COPY_UP(7); break; + } + } #define COPY_UP(r,w) \ if(shiftonly){\ for (j = 0; j < length; j++){ \