From f56d0db5bd9b352bbd33aa4524d642a13905a28c Mon Sep 17 00:00:00 2001 From: yzct12345 <87620833+yzct12345@users.noreply.github.com> Date: Mon, 2 Aug 2021 15:18:58 +0000 Subject: [PATCH] decoders: Optimize swizzle copy performance (#6790) This makes UnswizzleTexture up to two times faster. It is the main bottleneck in NVDEC video decoding. --- src/video_core/textures/decoders.cpp | 52 +++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index f1f523ad1..c32ae956a 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp @@ -18,9 +18,9 @@ namespace Tegra::Texture { namespace { -template -void Swizzle(std::span output, std::span input, u32 bytes_per_pixel, u32 width, - u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) { +template +void SwizzleImpl(std::span output, std::span input, u32 width, u32 height, u32 depth, + u32 block_height, u32 block_depth, u32 stride_alignment) { // The origin of the transformation can be configured here, leave it as zero as the current API // doesn't expose it. static constexpr u32 origin_x = 0; @@ -28,9 +28,9 @@ void Swizzle(std::span output, std::span input, u32 bytes_per_pixe static constexpr u32 origin_z = 0; // We can configure here a custom pitch - // As it's not exposed 'width * bpp' will be the expected pitch. - const u32 pitch = width * bytes_per_pixel; - const u32 stride = Common::AlignUpLog2(width, stride_alignment) * bytes_per_pixel; + // As it's not exposed 'width * BYTES_PER_PIXEL' will be the expected pitch. + const u32 pitch = width * BYTES_PER_PIXEL; + const u32 stride = Common::AlignUpLog2(width, stride_alignment) * BYTES_PER_PIXEL; const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT); const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth); @@ -54,14 +54,14 @@ void Swizzle(std::span output, std::span input, u32 bytes_per_pixe ((block_y & block_height_mask) << GOB_SIZE_SHIFT); for (u32 column = 0; column < width; ++column) { - const u32 x = (column + origin_x) * bytes_per_pixel; + const u32 x = (column + origin_x) * BYTES_PER_PIXEL; const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift; const u32 base_swizzled_offset = offset_z + offset_y + offset_x; const u32 swizzled_offset = base_swizzled_offset + table[x % GOB_SIZE_X]; const u32 unswizzled_offset = - slice * pitch * height + line * pitch + column * bytes_per_pixel; + slice * pitch * height + line * pitch + column * BYTES_PER_PIXEL; if (const auto offset = (TO_LINEAR ? unswizzled_offset : swizzled_offset); offset >= input.size()) { @@ -73,11 +73,45 @@ void Swizzle(std::span output, std::span input, u32 bytes_per_pixe u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset]; const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset]; - std::memcpy(dst, src, bytes_per_pixel); + + std::memcpy(dst, src, BYTES_PER_PIXEL); } } } } + +template +void Swizzle(std::span output, std::span input, u32 bytes_per_pixel, u32 width, + u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) { + switch (bytes_per_pixel) { + case 1: + return SwizzleImpl(output, input, width, height, depth, block_height, + block_depth, stride_alignment); + case 2: + return SwizzleImpl(output, input, width, height, depth, block_height, + block_depth, stride_alignment); + case 3: + return SwizzleImpl(output, input, width, height, depth, block_height, + block_depth, stride_alignment); + case 4: + return SwizzleImpl(output, input, width, height, depth, block_height, + block_depth, stride_alignment); + case 6: + return SwizzleImpl(output, input, width, height, depth, block_height, + block_depth, stride_alignment); + case 8: + return SwizzleImpl(output, input, width, height, depth, block_height, + block_depth, stride_alignment); + case 12: + return SwizzleImpl(output, input, width, height, depth, block_height, + block_depth, stride_alignment); + case 16: + return SwizzleImpl(output, input, width, height, depth, block_height, + block_depth, stride_alignment); + default: + UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel); + } +} } // Anonymous namespace void UnswizzleTexture(std::span output, std::span input, u32 bytes_per_pixel,