From 22f4b290b6f0894d29302102f539dd8753961f04 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 18 Jul 2021 18:40:14 +0200 Subject: [PATCH 001/156] VideoCore: Initial Setup for the Resolution Scaler. --- src/common/settings.cpp | 1 + src/common/settings.h | 18 +++ src/video_core/dirty_flags.h | 1 + .../renderer_opengl/gl_texture_cache.cpp | 16 +++ .../renderer_opengl/gl_texture_cache.h | 8 ++ .../renderer_vulkan/vk_texture_cache.cpp | 81 ++++++++++--- .../renderer_vulkan/vk_texture_cache.h | 19 +++ src/video_core/texture_cache/image_base.h | 4 + src/video_core/texture_cache/image_info.h | 2 +- src/video_core/texture_cache/texture_cache.h | 113 +++++++++++++++++- .../texture_cache/texture_cache_base.h | 10 ++ 11 files changed, 255 insertions(+), 18 deletions(-) diff --git a/src/common/settings.cpp b/src/common/settings.cpp index 9dd5e3efb..8c6be2c84 100644 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp @@ -47,6 +47,7 @@ void LogSettings() { log_setting("System_TimeZoneIndex", values.time_zone_index.GetValue()); log_setting("Core_UseMultiCore", values.use_multi_core.GetValue()); log_setting("CPU_Accuracy", values.cpu_accuracy.GetValue()); + log_setting("Renderer_UseResolutionScaling", values.resolution_setup.GetValue()); log_setting("Renderer_UseResolutionFactor", values.resolution_factor.GetValue()); log_setting("Renderer_UseSpeedLimit", values.use_speed_limit.GetValue()); log_setting("Renderer_SpeedLimit", values.speed_limit.GetValue()); diff --git a/src/common/settings.h b/src/common/settings.h index 9ff4cf85d..08f3da055 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -52,6 +52,22 @@ enum class NvdecEmulation : u32 { GPU = 2, }; +enum class ResolutionSetup : u32 { + Res1_2X = 0, + Res3_4X = 1, + Res1X = 2, + Res3_2K = 3, + Res2X = 4, + Res3X = 5, +}; + +struct ResolutionScalingInfo { + u32 up_scale{2}; + u32 down_shift{0}; + f32 up_factor{2.0f}; + f32 down_factor{0.5f}; +}; + /** The BasicSetting class is a simple resource manager. It defines a label and default value * alongside the actual value of the setting for simpler and less-error prone use with frontend * configurations. Setting a default value and label is required, though subclasses may deviate from @@ -451,6 +467,8 @@ struct Values { "disable_shader_loop_safety_checks"}; Setting vulkan_device{0, "vulkan_device"}; + ResolutionScalingInfo resolution_info{}; + Setting resolution_setup{ResolutionSetup::Res1X, "resolution_setup"}; Setting resolution_factor{1, "resolution_factor"}; // *nix platforms may have issues with the borderless windowed fullscreen mode. // Default to exclusive fullscreen on these platforms for now. diff --git a/src/video_core/dirty_flags.h b/src/video_core/dirty_flags.h index f0d545f90..f11ff5d94 100644 --- a/src/video_core/dirty_flags.h +++ b/src/video_core/dirty_flags.h @@ -29,6 +29,7 @@ enum : u8 { ColorBuffer6, ColorBuffer7, ZetaBuffer, + Rescale, VertexBuffers, VertexBuffer0, diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 8c3ca3d82..1e594838f 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -849,6 +849,22 @@ void Image::CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t b } } +void Image::ScaleUp() { + if (True(flags & ImageFlagBits::Rescaled)) { + return; + } + flags |= ImageFlagBits::Rescaled; + UNIMPLEMENTED(); +} + +void Image::ScaleDown() { + if (False(flags & ImageFlagBits::Rescaled)) { + return; + } + flags &= ~ImageFlagBits::Rescaled; + UNIMPLEMENTED(); +} + ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info, ImageId image_id_, Image& image) : VideoCommon::ImageViewBase{info, image.info, image_id_}, views{runtime.null_image_views} { diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 1ca2c90be..58b36494b 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -72,6 +72,8 @@ public: StateTracker& state_tracker); ~TextureCacheRuntime(); + void Init() {} + void Finish(); ImageBufferMap UploadStagingBuffer(size_t size); @@ -110,6 +112,8 @@ public: bool HasNativeASTC() const noexcept; + void TickFrame() {} + private: struct StagingBuffers { explicit StagingBuffers(GLenum storage_flags_, GLenum map_flags_); @@ -185,6 +189,10 @@ public: return gl_type; } + bool ScaleUp(); + + bool ScaleDown(); + private: void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 06c5fb867..be5b1d84d 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -32,6 +32,7 @@ using Tegra::Engines::Fermi2D; using Tegra::Texture::SwizzleSource; using Tegra::Texture::TextureMipmapFilter; using VideoCommon::BufferImageCopy; +using VideoCommon::ImageFlagBits; using VideoCommon::ImageInfo; using VideoCommon::ImageType; using VideoCommon::SubresourceRange; @@ -123,7 +124,8 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { } } -[[nodiscard]] VkImageCreateInfo MakeImageCreateInfo(const Device& device, const ImageInfo& info) { +[[nodiscard]] VkImageCreateInfo MakeImageCreateInfo(const Device& device, const ImageInfo& info, + u32 up, u32 down) { const PixelFormat format = StorageFormat(info.format); const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, format); VkImageCreateFlags flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; @@ -142,9 +144,9 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { .imageType = ConvertImageType(info.type), .format = format_info.format, .extent{ - .width = info.size.width >> samples_x, - .height = info.size.height >> samples_y, - .depth = info.size.depth, + .width = ((info.size.width << up) >> down) >> samples_x, + .height = ((info.size.height << up) >> down) >> samples_y, + .depth = (info.size.depth << up) >> down, }, .mipLevels = static_cast(info.resources.levels), .arrayLayers = static_cast(info.resources.layers), @@ -158,11 +160,12 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { }; } -[[nodiscard]] vk::Image MakeImage(const Device& device, const ImageInfo& info) { +[[nodiscard]] vk::Image MakeImage(const Device& device, const ImageInfo& info, u32 up = 0, + u32 down = 0) { if (info.type == ImageType::Buffer) { return vk::Image{}; } - return device.GetLogical().CreateImage(MakeImageCreateInfo(device, info)); + return device.GetLogical().CreateImage(MakeImageCreateInfo(device, info, up, down)); } [[nodiscard]] VkImageAspectFlags ImageAspectMask(PixelFormat format) { @@ -590,6 +593,11 @@ struct RangedBarrierRange { } } // Anonymous namespace +void TextureCacheRuntime::Init() { + resolution = Settings::values.resolution_info; + is_rescaling_on = resolution.up_scale != 1 || resolution.down_shift != 0; +} + void TextureCacheRuntime::Finish() { scheduler.Finish(); } @@ -840,20 +848,26 @@ u64 TextureCacheRuntime::GetDeviceLocalMemory() const { return device.GetDeviceLocalMemory(); } -Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_addr_, +void TextureCacheRuntime::TickFrame() { + prescaled_images.Tick(); + prescaled_commits.Tick(); + prescaled_views.Tick(); +} + +Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) - : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime.scheduler}, - image(MakeImage(runtime.device, info)), - commit(runtime.memory_allocator.Commit(image, MemoryUsage::DeviceLocal)), - aspect_mask(ImageAspectMask(info.format)) { - if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) { + : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime_.scheduler}, + image(MakeImage(runtime_.device, info)), + commit(runtime_.memory_allocator.Commit(image, MemoryUsage::DeviceLocal)), + aspect_mask(ImageAspectMask(info.format)), runtime{&runtime_} { + if (IsPixelFormatASTC(info.format) && !runtime->device.IsOptimalAstcSupported()) { if (Settings::values.accelerate_astc.GetValue()) { flags |= VideoCommon::ImageFlagBits::AcceleratedUpload; } else { flags |= VideoCommon::ImageFlagBits::Converted; } } - if (runtime.device.HasDebuggingToolAttached()) { + if (runtime->device.HasDebuggingToolAttached()) { image.SetObjectNameEXT(VideoCommon::Name(*this).c_str()); } static constexpr VkImageViewUsageCreateInfo storage_image_view_usage_create_info{ @@ -861,8 +875,8 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_ .pNext = nullptr, .usage = VK_IMAGE_USAGE_STORAGE_BIT, }; - if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) { - const auto& device = runtime.device.GetLogical(); + if (IsPixelFormatASTC(info.format) && !runtime->device.IsOptimalAstcSupported()) { + const auto& device = runtime->device.GetLogical(); storage_image_views.reserve(info.resources.levels); for (s32 level = 0; level < info.resources.levels; ++level) { storage_image_views.push_back(device.CreateImageView(VkImageViewCreateInfo{ @@ -907,6 +921,10 @@ void Image::UploadMemory(const StagingBufferRef& map, std::span copies) { + const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); + if (is_rescaled) { + ScaleDown(); + } std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); scheduler->RequestOutsideRenderPassOperationContext(); scheduler->Record([buffer = map.buffer, image = *image, aspect_mask = aspect_mask, @@ -959,6 +977,39 @@ void Image::DownloadMemory(const StagingBufferRef& map, std::spanis_rescaling_on) { + flags |= ImageFlagBits::Rescaled; + return; + } + flags |= ImageFlagBits::Rescaled; + scaling_count++; + ASSERT(scaling_count < 10); + return; +} + +void Image::ScaleDown() { + if (False(flags & ImageFlagBits::Rescaled)) { + return; + } + ASSERT(info.type != ImageType::Linear); + if (!runtime->is_rescaling_on) { + flags &= ~ImageFlagBits::Rescaled; + return; + } + flags &= ~ImageFlagBits::Rescaled; + scaling_count++; + ASSERT(scaling_count < 10); + return; } ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info, diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index b09c468e4..f7e782c44 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -6,7 +6,9 @@ #include +#include "common/settings.h" #include "shader_recompiler/shader_info.h" +#include "video_core/delayed_destruction_ring.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/texture_cache/image_view_base.h" #include "video_core/texture_cache/texture_cache_base.h" @@ -15,6 +17,7 @@ namespace Vulkan { +using VideoCommon::DelayedDestructionRing; using VideoCommon::ImageId; using VideoCommon::NUM_RT; using VideoCommon::Region2D; @@ -39,6 +42,14 @@ struct TextureCacheRuntime { BlitImageHelper& blit_image_helper; ASTCDecoderPass& astc_decoder_pass; RenderPassCache& render_pass_cache; + static constexpr size_t TICKS_TO_DESTROY = 6; + DelayedDestructionRing prescaled_images; + DelayedDestructionRing prescaled_commits; + DelayedDestructionRing prescaled_views; + Settings::ResolutionScalingInfo resolution; + bool is_rescaling_on{}; + + void Init(); void Finish(); @@ -74,6 +85,8 @@ struct TextureCacheRuntime { return true; } + void TickFrame(); + u64 GetDeviceLocalMemory() const; }; @@ -113,6 +126,10 @@ public: return std::exchange(initialized, true); } + void ScaleUp(); + + void ScaleDown(); + private: VKScheduler* scheduler; vk::Image image; @@ -121,6 +138,8 @@ private: std::vector storage_image_views; VkImageAspectFlags aspect_mask = 0; bool initialized = false; + TextureCacheRuntime* runtime; + u32 scaling_count{}; }; class ImageView : public VideoCommon::ImageViewBase { diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index 0c17a791b..1cd30fd37 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -33,6 +33,10 @@ enum class ImageFlagBits : u32 { ///< garbage collection priority Alias = 1 << 11, ///< This image has aliases and has priority on garbage ///< collection + + // Rescaler + Rescaled = 1 << 12, + RescaleChecked = 1 << 13, }; DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits) diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h index 5049fc36e..16d4cee37 100644 --- a/src/video_core/texture_cache/image_info.h +++ b/src/video_core/texture_cache/image_info.h @@ -15,7 +15,7 @@ using Tegra::Texture::TICEntry; using VideoCore::Surface::PixelFormat; struct ImageInfo { - explicit ImageInfo() = default; + ImageInfo() = default; explicit ImageInfo(const TICEntry& config) noexcept; explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) noexcept; explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index f70c1f764..560da4f16 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -35,6 +35,7 @@ TextureCache

::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface& Tegra::MemoryManager& gpu_memory_) : runtime{runtime_}, rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_}, gpu_memory{gpu_memory_} { + runtime.Init(); // Configure null sampler TSCEntry sampler_descriptor{}; sampler_descriptor.min_filter.Assign(Tegra::Texture::TextureFilter::Linear); @@ -103,6 +104,7 @@ void TextureCache

::TickFrame() { sentenced_images.Tick(); sentenced_framebuffers.Tick(); sentenced_image_view.Tick(); + runtime.TickFrame(); ++frame_tick; } @@ -208,18 +210,63 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { const bool force = flags[Dirty::RenderTargetControl]; flags[Dirty::RenderTargetControl] = false; + bool can_rescale = true; + std::array tmp_color_images{}; + ImageId tmp_depth_image{}; + const auto check_rescale = [&](ImageViewId view_id, ImageId& id_save) { + if (view_id) { + const auto& view = slot_image_views[view_id]; + const auto image_id = view.image_id; + id_save = image_id; + auto& image = slot_images[image_id]; + can_rescale &= ImageCanRescale(image); + } else { + id_save = CORRUPT_ID; + } + }; for (size_t index = 0; index < NUM_RT; ++index) { ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index]; if (flags[Dirty::ColorBuffer0 + index] || force) { flags[Dirty::ColorBuffer0 + index] = false; BindRenderTarget(&color_buffer_id, FindColorBuffer(index, is_clear)); } - PrepareImageView(color_buffer_id, true, is_clear && IsFullClear(color_buffer_id)); + check_rescale(color_buffer_id, tmp_color_images[index]); } if (flags[Dirty::ZetaBuffer] || force) { flags[Dirty::ZetaBuffer] = false; BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear)); } + check_rescale(render_targets.depth_buffer_id, tmp_depth_image); + + if (can_rescale) { + const auto scale_up = [this](ImageId image_id) { + if (image_id != CORRUPT_ID) { + Image& image = slot_images[image_id]; + image.ScaleUp(); + } + }; + for (size_t index = 0; index < NUM_RT; ++index) { + scale_up(tmp_color_images[index]); + } + scale_up(tmp_depth_image); + } else { + const auto scale_down = [this](ImageId image_id) { + if (image_id != CORRUPT_ID) { + Image& image = slot_images[image_id]; + image.ScaleDown(); + } + }; + for (size_t index = 0; index < NUM_RT; ++index) { + scale_down(tmp_color_images[index]); + } + scale_down(tmp_depth_image); + } + // Rescale End + + for (size_t index = 0; index < NUM_RT; ++index) { + ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index]; + PrepareImageView(color_buffer_id, true, is_clear && IsFullClear(color_buffer_id)); + } const ImageViewId depth_buffer_id = render_targets.depth_buffer_id; PrepareImageView(depth_buffer_id, true, is_clear && IsFullClear(depth_buffer_id)); @@ -623,6 +670,31 @@ ImageId TextureCache

::FindImage(const ImageInfo& info, GPUVAddr gpu_addr, return image_id; } +template +bool TextureCache

::ImageCanRescale(Image& image) { + if (True(image.flags & ImageFlagBits::Rescaled) || + True(image.flags & ImageFlagBits::RescaleChecked)) { + return true; + } + const auto& info = image.info; + const bool can_this_rescale = + (info.type == ImageType::e1D || info.type == ImageType::e2D) && info.block.depth == 0; + if (!can_this_rescale) { + image.flags &= ~ImageFlagBits::RescaleChecked; + return false; + } + image.flags |= ImageFlagBits::RescaleChecked; + for (const auto& alias : image.aliased_images) { + Image& other_image = slot_images[alias.id]; + if (!ImageCanRescale(other_image)) { + image.flags &= ~ImageFlagBits::RescaleChecked; + return false; + } + } + image.flags &= ~ImageFlagBits::RescaleChecked; + return true; +} + template ImageId TextureCache

::InsertImage(const ImageInfo& info, GPUVAddr gpu_addr, RelaxedOptions options) { @@ -660,12 +732,18 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA std::vector right_aliased_ids; std::unordered_set ignore_textures; std::vector bad_overlap_ids; + std::vector all_siblings; + const bool this_is_linear = info.type == ImageType::Linear; const auto region_check = [&](ImageId overlap_id, ImageBase& overlap) { if (True(overlap.flags & ImageFlagBits::Remapped)) { ignore_textures.insert(overlap_id); return; } - if (info.type == ImageType::Linear) { + const bool overlap_is_linear = overlap.info.type == ImageType::Linear; + if (this_is_linear != overlap_is_linear) { + return; + } + if (this_is_linear && overlap_is_linear) { if (info.pitch == overlap.info.pitch && gpu_addr == overlap.gpu_addr) { // Alias linear images with the same pitch left_aliased_ids.push_back(overlap_id); @@ -681,6 +759,7 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA cpu_addr = solution->cpu_addr; new_info.resources = solution->resources; overlap_ids.push_back(overlap_id); + all_siblings.push_back(overlap_id); return; } static constexpr auto options = RelaxedOptions::Size | RelaxedOptions::Format; @@ -688,10 +767,12 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA if (IsSubresource(new_info, overlap, gpu_addr, options, broken_views, native_bgr)) { left_aliased_ids.push_back(overlap_id); overlap.flags |= ImageFlagBits::Alias; + all_siblings.push_back(overlap_id); } else if (IsSubresource(overlap.info, new_image_base, overlap.gpu_addr, options, broken_views, native_bgr)) { right_aliased_ids.push_back(overlap_id); overlap.flags |= ImageFlagBits::Alias; + all_siblings.push_back(overlap_id); } else { bad_overlap_ids.push_back(overlap_id); overlap.flags |= ImageFlagBits::BadOverlap; @@ -709,8 +790,36 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA } }; ForEachSparseImageInRegion(gpu_addr, size_bytes, region_check_gpu); + + bool can_rescale = + (info.type == ImageType::e1D || info.type == ImageType::e2D) && info.block.depth == 0; + for (const ImageId sibling_id : all_siblings) { + if (!can_rescale) { + break; + } + Image& sibling = slot_images[sibling_id]; + can_rescale &= ImageCanRescale(sibling); + } + + if (can_rescale) { + for (const ImageId sibling_id : all_siblings) { + Image& sibling = slot_images[sibling_id]; + sibling.ScaleUp(); + } + } else { + for (const ImageId sibling_id : all_siblings) { + Image& sibling = slot_images[sibling_id]; + sibling.ScaleDown(); + } + } + const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr); Image& new_image = slot_images[new_image_id]; + if (can_rescale) { + new_image.ScaleUp(); + } else { + new_image.ScaleDown(); + } if (!gpu_memory.IsContinousRange(new_image.gpu_addr, new_image.guest_size_bytes)) { new_image.flags |= ImageFlagBits::Sparse; diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 2d1893c1c..a4a2c0832 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -142,6 +142,14 @@ public: const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Config& copy); + /// Invalidate the contents of the color buffer index + /// These contents become unspecified, the cache can assume aggressive optimizations. + void InvalidateColorBuffer(size_t index); + + /// Invalidate the contents of the depth buffer + /// These contents become unspecified, the cache can assume aggressive optimizations. + void InvalidateDepthBuffer(); + /// Try to find a cached image view in the given CPU address [[nodiscard]] ImageView* TryFindFramebufferImageView(VAddr cpu_addr); @@ -318,6 +326,8 @@ private: /// Returns true if the current clear parameters clear the whole image of a given image view [[nodiscard]] bool IsFullClear(ImageViewId id); + bool ImageCanRescale(Image& image); + Runtime& runtime; VideoCore::RasterizerInterface& rasterizer; Tegra::Engines::Maxwell3D& maxwell3d; From 37ef9c913028e234509bcf70bad049b0210e4592 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 18 Jul 2021 20:33:20 +0200 Subject: [PATCH 002/156] Settings: Add resolution scaling to settings. --- src/common/settings.cpp | 51 +++++++++++++++ src/common/settings.h | 13 ++-- src/yuzu/configuration/config.cpp | 5 ++ src/yuzu/configuration/config.h | 1 + src/yuzu/configuration/configure_graphics.cpp | 26 +++++++- src/yuzu/configuration/configure_graphics.ui | 64 +++++++++++++++++++ 6 files changed, 155 insertions(+), 5 deletions(-) diff --git a/src/common/settings.cpp b/src/common/settings.cpp index 8c6be2c84..dd3a3d456 100644 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp @@ -106,6 +106,57 @@ float Volume() { return values.volume.GetValue() / 100.0f; } +void UpdateRescalingInfo() { + auto setup = values.resolution_setup.GetValue(); + auto& info = values.resolution_info; + switch (setup) { + case ResolutionSetup::Res1_2X: { + info.up_scale = 1; + info.down_shift = 1; + break; + } + case ResolutionSetup::Res3_4X: { + info.up_scale = 3; + info.down_shift = 2; + break; + } + case ResolutionSetup::Res1X: { + info.up_scale = 1; + info.down_shift = 0; + break; + } + case ResolutionSetup::Res3_2X: { + info.up_scale = 3; + info.down_shift = 1; + break; + } + case ResolutionSetup::Res2X: { + info.up_scale = 2; + info.down_shift = 0; + break; + } + case ResolutionSetup::Res3X: { + info.up_scale = 3; + info.down_shift = 0; + break; + } + case ResolutionSetup::Res4X: { + info.up_scale = 4; + info.down_shift = 0; + break; + } + default: { + UNREACHABLE(); + info.up_scale = 1; + info.down_shift = 0; + } + } + info.up_factor = static_cast(info.up_scale) / (1U << info.down_shift); + info.down_factor = static_cast(1U << info.down_shift) / info.up_scale; + info.size_up = info.up_scale * info.up_scale; + info.size_shift = info.down_shift * 2; +} + void RestoreGlobalState(bool is_powered_on) { // If a game is running, DO NOT restore the global settings state if (is_powered_on) { diff --git a/src/common/settings.h b/src/common/settings.h index 08f3da055..f4df2fc95 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -56,16 +56,19 @@ enum class ResolutionSetup : u32 { Res1_2X = 0, Res3_4X = 1, Res1X = 2, - Res3_2K = 3, + Res3_2X = 3, Res2X = 4, Res3X = 5, + Res4X = 6, }; struct ResolutionScalingInfo { - u32 up_scale{2}; + u32 up_scale{1}; u32 down_shift{0}; - f32 up_factor{2.0f}; - f32 down_factor{0.5f}; + f32 up_factor{1.0f}; + f32 down_factor{1.0f}; + u32 size_up{1}; + u32 size_shift{0}; }; /** The BasicSetting class is a simple resource manager. It defines a label and default value @@ -613,6 +616,8 @@ std::string GetTimeZoneString(); void LogSettings(); +void UpdateRescalingInfo(); + // Restore the global state of all applicable settings in the Values struct void RestoreGlobalState(bool is_powered_on); diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index faea5dda1..7ddc40b00 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -824,6 +824,7 @@ void Config::ReadRendererValues() { ReadGlobalSetting(Settings::values.vulkan_device); ReadGlobalSetting(Settings::values.fullscreen_mode); ReadGlobalSetting(Settings::values.aspect_ratio); + ReadGlobalSetting(Settings::values.resolution_setup); ReadGlobalSetting(Settings::values.max_anisotropy); ReadGlobalSetting(Settings::values.use_speed_limit); ReadGlobalSetting(Settings::values.speed_limit); @@ -1364,6 +1365,10 @@ void Config::SaveRendererValues() { static_cast(Settings::values.fullscreen_mode.GetDefault()), Settings::values.fullscreen_mode.UsingGlobal()); WriteGlobalSetting(Settings::values.aspect_ratio); + WriteSetting(QString::fromStdString(Settings::values.resolution_setup.GetLabel()), + static_cast(Settings::values.resolution_setup.GetValue(global)), + static_cast(Settings::values.resolution_setup.GetDefault()), + Settings::values.resolution_setup.UsingGlobal()); WriteGlobalSetting(Settings::values.max_anisotropy); WriteGlobalSetting(Settings::values.use_speed_limit); WriteGlobalSetting(Settings::values.speed_limit); diff --git a/src/yuzu/configuration/config.h b/src/yuzu/configuration/config.h index a7f4a6720..fbb91d312 100644 --- a/src/yuzu/configuration/config.h +++ b/src/yuzu/configuration/config.h @@ -189,5 +189,6 @@ Q_DECLARE_METATYPE(Settings::CPUAccuracy); Q_DECLARE_METATYPE(Settings::GPUAccuracy); Q_DECLARE_METATYPE(Settings::FullscreenMode); Q_DECLARE_METATYPE(Settings::NvdecEmulation); +Q_DECLARE_METATYPE(Settings::ResolutionSetup); Q_DECLARE_METATYPE(Settings::RendererBackend); Q_DECLARE_METATYPE(Settings::ShaderBackend); diff --git a/src/yuzu/configuration/configure_graphics.cpp b/src/yuzu/configuration/configure_graphics.cpp index 8e20cc6f3..4f08ae3e0 100644 --- a/src/yuzu/configuration/configure_graphics.cpp +++ b/src/yuzu/configuration/configure_graphics.cpp @@ -102,6 +102,8 @@ void ConfigureGraphics::SetConfiguration() { ui->nvdec_emulation->setCurrentIndex( static_cast(Settings::values.nvdec_emulation.GetValue())); ui->aspect_ratio_combobox->setCurrentIndex(Settings::values.aspect_ratio.GetValue()); + ui->resolution_combobox->setCurrentIndex( + static_cast(Settings::values.resolution_setup.GetValue())); } else { ConfigurationShared::SetPerGameSetting(ui->api, &Settings::values.renderer_backend); ConfigurationShared::SetHighlight(ui->api_widget, @@ -122,6 +124,11 @@ void ConfigureGraphics::SetConfiguration() { ConfigurationShared::SetHighlight(ui->ar_label, !Settings::values.aspect_ratio.UsingGlobal()); + ConfigurationShared::SetPerGameSetting(ui->resolution_combobox, + &Settings::values.resolution_setup); + ConfigurationShared::SetHighlight(ui->resolution_label, + !Settings::values.resolution_setup.UsingGlobal()); + ui->bg_combobox->setCurrentIndex(Settings::values.bg_red.UsingGlobal() ? 0 : 1); ui->bg_button->setEnabled(!Settings::values.bg_red.UsingGlobal()); ConfigurationShared::SetHighlight(ui->bg_layout, !Settings::values.bg_red.UsingGlobal()); @@ -133,11 +140,14 @@ void ConfigureGraphics::SetConfiguration() { } void ConfigureGraphics::ApplyConfiguration() { + const auto resolution_setup = static_cast( + ui->resolution_combobox->currentIndex() - + ((Settings::IsConfiguringGlobal()) ? 0 : ConfigurationShared::USE_GLOBAL_OFFSET)); + ConfigurationShared::ApplyPerGameSetting(&Settings::values.fullscreen_mode, ui->fullscreen_mode_combobox); ConfigurationShared::ApplyPerGameSetting(&Settings::values.aspect_ratio, ui->aspect_ratio_combobox); - ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_disk_shader_cache, ui->use_disk_shader_cache, use_disk_shader_cache); ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_asynchronous_gpu_emulation, @@ -165,7 +175,16 @@ void ConfigureGraphics::ApplyConfiguration() { Settings::values.bg_green.SetValue(static_cast(bg_color.green())); Settings::values.bg_blue.SetValue(static_cast(bg_color.blue())); } + if (Settings::values.resolution_setup.UsingGlobal()) { + Settings::values.resolution_setup.SetValue(resolution_setup); + } } else { + if (ui->resolution_combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) { + Settings::values.resolution_setup.SetGlobal(true); + } else { + Settings::values.resolution_setup.SetGlobal(false); + Settings::values.resolution_setup.SetValue(resolution_setup); + } if (ui->api->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) { Settings::values.renderer_backend.SetGlobal(true); Settings::values.shader_backend.SetGlobal(true); @@ -207,6 +226,7 @@ void ConfigureGraphics::ApplyConfiguration() { Settings::values.bg_blue.SetValue(static_cast(bg_color.blue())); } } + Settings::UpdateRescalingInfo(); } void ConfigureGraphics::changeEvent(QEvent* event) { @@ -312,6 +332,7 @@ void ConfigureGraphics::SetupPerGameUI() { ui->device->setEnabled(Settings::values.renderer_backend.UsingGlobal()); ui->fullscreen_mode_combobox->setEnabled(Settings::values.fullscreen_mode.UsingGlobal()); ui->aspect_ratio_combobox->setEnabled(Settings::values.aspect_ratio.UsingGlobal()); + ui->resolution_combobox->setEnabled(Settings::values.resolution_setup.UsingGlobal()); ui->use_asynchronous_gpu_emulation->setEnabled( Settings::values.use_asynchronous_gpu_emulation.UsingGlobal()); ui->nvdec_emulation->setEnabled(Settings::values.nvdec_emulation.UsingGlobal()); @@ -340,6 +361,9 @@ void ConfigureGraphics::SetupPerGameUI() { ConfigurationShared::SetColoredComboBox( ui->fullscreen_mode_combobox, ui->fullscreen_mode_label, static_cast(Settings::values.fullscreen_mode.GetValue(true))); + ConfigurationShared::SetColoredComboBox( + ui->resolution_combobox, ui->resolution_label, + static_cast(Settings::values.resolution_setup.GetValue(true))); ConfigurationShared::InsertGlobalItem( ui->api, static_cast(Settings::values.renderer_backend.GetValue(true))); ConfigurationShared::InsertGlobalItem( diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui index beae74344..1b6ac3cbb 100644 --- a/src/yuzu/configuration/configure_graphics.ui +++ b/src/yuzu/configuration/configure_graphics.ui @@ -309,6 +309,70 @@ + + + + + 0 + + + 0 + + + 0 + + + 0 + + + + + Resolution: + + + + + + + + 0.5X (360p/540p) + + + + + 0.75X (540p/810p) + + + + + 1X (720p/1080p) + + + + + 1.5X (1080p/1620p) + + + + + 2X (1440p/2160[4K]p) + + + + + 3X (2160p[4K]/3240p[6K]) + + + + + 4X (2880p/4320p[8K]) + + + + + + + From 360e897ccd53bf863cea1ad6184d35e9b6ffbf40 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 18 Jul 2021 23:06:12 +0200 Subject: [PATCH 003/156] ShaderDecompiler: Add initial support for rescaling. --- .../ir_opt/rescaling_pass.cpp | 72 +++++++++++++++++++ src/shader_recompiler/shader_info.h | 1 + 2 files changed, 73 insertions(+) create mode 100644 src/shader_recompiler/ir_opt/rescaling_pass.cpp diff --git a/src/shader_recompiler/ir_opt/rescaling_pass.cpp b/src/shader_recompiler/ir_opt/rescaling_pass.cpp new file mode 100644 index 000000000..d3ae3f159 --- /dev/null +++ b/src/shader_recompiler/ir_opt/rescaling_pass.cpp @@ -0,0 +1,72 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/alignment.h" +#include "shader_recompiler/environment.h" +#include "shader_recompiler/frontend/ir/modifiers.h" +#include "shader_recompiler/frontend/ir/program.h" +#include "shader_recompiler/frontend/ir/value.h" +#include "shader_recompiler/ir_opt/passes.h" +#include "shader_recompiler/shader_info.h" + +namespace Shader::Optimization { +namespace { + +void PatchFragCoord(IR::Inst& inst) { + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + const IR::F32 inv_resolution_factor = IR::F32{Settings::values.resolution_info.down_factor}; + const IR::F32 new_get_attribute = ir.GetAttribute(inst.Arg(0).Attribute()); + const IR::F32 mul = ir.FMul(new_get_attribute, inv_resolution_factor); + const IR::U1 should_rescale = IR::U1{true}; + const IR::F32 selection = ir.Select(should_rescale, mul, new_get_attribute); + inst.ReplaceUsesWith(selection); +} + +void Visit(Info& info, IR::Inst& inst) { + info.requires_rescaling_uniform = false; + switch (inst.GetOpcode()) { + case IR::Opcode::GetAttribute: { + conast auto attrib = inst.Arg(0).Attribute(); + const bool is_frag = + attrib == IR::Attribute::PositionX || attrib == IR::Attribute::PositionY; + const bool must_path = is_frag && program.stage == Stage::Fragment; + if (must_path) { + PatchFragCoord(inst); + info.requires_rescaling_uniform = true; + } + break; + } + case IR::Opcode::ImageQueryDimensions: { + info.requires_rescaling_uniform |= true; + break; + } + case IR::Opcode::ImageFetch: { + info.requires_rescaling_uniform |= true; + break; + } + case IR::Opcode::ImageRead: { + info.requires_rescaling_uniform |= true; + break; + } + case IR::Opcode::ImageWrite: { + info.requires_rescaling_uniform |= true; + break; + } + default: + break; + } +} + +} // namespace + +void RescalingPass(Environment& env, IR::Program& program) { + Info& info{program.info}; + for (IR::Block* const block : program.post_order_blocks) { + for (IR::Inst& inst : block->Instructions()) { + Visit(info, inst); + } + } +} + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/shader_info.h b/src/shader_recompiler/shader_info.h index 4ef4dbd40..e7981a08c 100644 --- a/src/shader_recompiler/shader_info.h +++ b/src/shader_recompiler/shader_info.h @@ -172,6 +172,7 @@ struct Info { bool uses_global_memory{}; bool uses_atomic_image_u32{}; bool uses_shadow_lod{}; + bool requires_rescaling_uniform{}; IR::Type used_constant_buffer_types{}; IR::Type used_storage_buffer_types{}; From ba18047e8d06584de0ce18cdbb303a6d9a8742aa Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Mon, 19 Jul 2021 04:32:03 +0200 Subject: [PATCH 004/156] Texture Cache: Implement Vulkan UpScaling & DownScaling --- .../renderer_opengl/gl_texture_cache.cpp | 10 +- .../renderer_vulkan/vk_texture_cache.cpp | 259 ++++++++++++++++-- .../renderer_vulkan/vk_texture_cache.h | 10 +- src/video_core/texture_cache/image_info.cpp | 3 + src/video_core/texture_cache/texture_cache.h | 84 +++++- .../texture_cache/texture_cache_base.h | 3 + 6 files changed, 327 insertions(+), 42 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 1e594838f..cdd352aef 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -849,20 +849,22 @@ void Image::CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t b } } -void Image::ScaleUp() { +bool Image::ScaleUp() { if (True(flags & ImageFlagBits::Rescaled)) { - return; + return false; } flags |= ImageFlagBits::Rescaled; UNIMPLEMENTED(); + return true; } -void Image::ScaleDown() { +bool Image::ScaleDown() { if (False(flags & ImageFlagBits::Rescaled)) { - return; + return false; } flags &= ~ImageFlagBits::Rescaled; UNIMPLEMENTED(); + return true; } ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info, diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index be5b1d84d..668554d1e 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -137,6 +137,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT; } const auto [samples_x, samples_y] = VideoCommon::SamplesLog2(info.num_samples); + const bool is_2d = info.type == ImageType::e2D; return VkImageCreateInfo{ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .pNext = nullptr, @@ -144,9 +145,9 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { .imageType = ConvertImageType(info.type), .format = format_info.format, .extent{ - .width = ((info.size.width << up) >> down) >> samples_x, - .height = ((info.size.height << up) >> down) >> samples_y, - .depth = (info.size.depth << up) >> down, + .width = ((info.size.width * up) >> down) >> samples_x, + .height = (is_2d ? ((info.size.height * up) >> down) : info.size.height) >> samples_y, + .depth = info.size.depth, }, .mipLevels = static_cast(info.resources.levels), .arrayLayers = static_cast(info.resources.layers), @@ -160,7 +161,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { }; } -[[nodiscard]] vk::Image MakeImage(const Device& device, const ImageInfo& info, u32 up = 0, +[[nodiscard]] vk::Image MakeImage(const Device& device, const ImageInfo& info, u32 up = 1, u32 down = 0) { if (info.type == ImageType::Buffer) { return vk::Image{}; @@ -851,7 +852,6 @@ u64 TextureCacheRuntime::GetDeviceLocalMemory() const { void TextureCacheRuntime::TickFrame() { prescaled_images.Tick(); prescaled_commits.Tick(); - prescaled_views.Tick(); } Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu_addr_, @@ -923,7 +923,7 @@ void Image::UploadMemory(const StagingBufferRef& map, std::span copies) { const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { - ScaleDown(); + ScaleDown(true); } std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); scheduler->RequestOutsideRenderPassOperationContext(); @@ -978,38 +978,253 @@ void Image::DownloadMemory(const StagingBufferRef& map, std::span& blit_regions, + VkImageAspectFlags aspect_mask) { + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([dst_image, src_image, aspect_mask, + regions = std::move(blit_regions)](vk::CommandBuffer cmdbuf) { + const std::array read_barriers{ + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = src_image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }, + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = dst_image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }, + }; + VkImageMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = dst_image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }; + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, nullptr, nullptr, read_barriers); + const VkFilter vk_filter = VK_FILTER_NEAREST; + cmdbuf.BlitImage(src_image, VK_IMAGE_LAYOUT_GENERAL, dst_image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, regions, vk_filter); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + 0, write_barrier); + }); +} + +bool Image::ScaleUp(bool save_as_backup) { if (True(flags & ImageFlagBits::Rescaled)) { - return; + return false; } ASSERT(info.type != ImageType::Linear); - if (!runtime->is_rescaling_on) { - flags |= ImageFlagBits::Rescaled; - return; - } - flags |= ImageFlagBits::Rescaled; scaling_count++; ASSERT(scaling_count < 10); - return; + flags |= ImageFlagBits::Rescaled; + /*if (!runtime->is_rescaling_on) { + return; + }*/ + const auto& resolution = runtime->resolution; + vk::Image rescaled_image = + MakeImage(runtime->device, info, resolution.up_scale, resolution.down_shift); + MemoryCommit new_commit( + runtime->memory_allocator.Commit(rescaled_image, MemoryUsage::DeviceLocal)); + + const auto scale_up = [&](u32 value) { + return (value * resolution.up_scale) >> resolution.down_shift; + }; + + const bool is_2d = info.type == ImageType::e2D; + boost::container::small_vector vkRegions(info.resources.levels); + for (s32 level = 0; level < info.resources.levels; level++) { + VkImageBlit blit{ + .srcSubresource{ + .aspectMask = aspect_mask, + .mipLevel = u32(level), + .baseArrayLayer = 0, + .layerCount = u32(info.resources.layers), + }, + .srcOffsets{ + { + .x = 0, + .y = 0, + .z = 0, + }, + { + .x = s32(info.size.width), + .y = s32(info.size.height), + .z = 1, + }, + }, + .dstSubresource{ + .aspectMask = aspect_mask, + .mipLevel = u32(level), + .baseArrayLayer = 0, + .layerCount = u32(info.resources.layers), + }, + .dstOffsets{ + { + .x = 0, + .y = 0, + .z = 0, + }, + { + .x = s32(scale_up(info.size.width)), + .y = is_2d ? s32(scale_up(info.size.height)) : s32(info.size.height), + .z = 1, + }, + }, + }; + vkRegions.push_back(blit); + } + BlitScale(*scheduler, *image, *rescaled_image, vkRegions, aspect_mask); + if (save_as_backup) { + backup_image = std::move(image); + backup_commit = std::move(commit); + has_backup = true; + } else { + runtime->prescaled_images.Push(std::move(image)); + runtime->prescaled_commits.Push(std::move(commit)); + } + image = std::move(rescaled_image); + commit = std::move(new_commit); + return true; } -void Image::ScaleDown() { +void Image::SwapBackup() { + ASSERT(has_backup); + runtime->prescaled_images.Push(std::move(image)); + runtime->prescaled_commits.Push(std::move(commit)); + image = std::move(backup_image); + commit = std::move(backup_commit); + has_backup = false; +} + +bool Image::ScaleDown(bool save_as_backup) { if (False(flags & ImageFlagBits::Rescaled)) { - return; + return false; } ASSERT(info.type != ImageType::Linear); - if (!runtime->is_rescaling_on) { - flags &= ~ImageFlagBits::Rescaled; - return; - } flags &= ~ImageFlagBits::Rescaled; scaling_count++; ASSERT(scaling_count < 10); - return; + /*if (!runtime->is_rescaling_on) { + return false; + }*/ + + const auto& resolution = runtime->resolution; + vk::Image downscaled_image = + MakeImage(runtime->device, info, resolution.up_scale, resolution.down_shift); + MemoryCommit new_commit( + runtime->memory_allocator.Commit(downscaled_image, MemoryUsage::DeviceLocal)); + + const auto scale_up = [&](u32 value) { + return (value * resolution.up_scale) >> resolution.down_shift; + }; + + const bool is_2d = info.type == ImageType::e2D; + boost::container::small_vector vkRegions(info.resources.levels); + for (s32 level = 0; level < info.resources.levels; level++) { + VkImageBlit blit{ + .srcSubresource{ + .aspectMask = aspect_mask, + .mipLevel = u32(level), + .baseArrayLayer = 0, + .layerCount = u32(info.resources.layers), + }, + .srcOffsets{ + { + .x = 0, + .y = 0, + .z = 0, + }, + { + .x = s32(scale_up(info.size.width)), + .y = is_2d ? s32(scale_up(info.size.height)) : s32(info.size.height), + .z = 1, + }, + }, + .dstSubresource{ + .aspectMask = aspect_mask, + .mipLevel = u32(level), + .baseArrayLayer = 0, + .layerCount = u32(info.resources.layers), + }, + .dstOffsets{ + { + .x = 0, + .y = 0, + .z = 0, + }, + { + .x = s32(info.size.width), + .y = s32(info.size.height), + .z = 1, + }, + }, + }; + vkRegions.push_back(blit); + } + BlitScale(*scheduler, *image, *downscaled_image, vkRegions, aspect_mask); + if (save_as_backup) { + backup_image = std::move(image); + backup_commit = std::move(commit); + has_backup = true; + } else { + runtime->prescaled_images.Push(std::move(image)); + runtime->prescaled_commits.Push(std::move(commit)); + } + image = std::move(downscaled_image); + commit = std::move(new_commit); + return true; } ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info, diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index f7e782c44..958a64651 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -45,7 +45,6 @@ struct TextureCacheRuntime { static constexpr size_t TICKS_TO_DESTROY = 6; DelayedDestructionRing prescaled_images; DelayedDestructionRing prescaled_commits; - DelayedDestructionRing prescaled_views; Settings::ResolutionScalingInfo resolution; bool is_rescaling_on{}; @@ -126,9 +125,11 @@ public: return std::exchange(initialized, true); } - void ScaleUp(); + bool ScaleUp(bool save_as_backup = false); - void ScaleDown(); + bool ScaleDown(bool save_as_backup = false); + + void SwapBackup(); private: VKScheduler* scheduler; @@ -140,6 +141,9 @@ private: bool initialized = false; TextureCacheRuntime* runtime; u32 scaling_count{}; + vk::Image backup_image{}; + MemoryCommit backup_commit{}; + bool has_backup{}; }; class ImageView : public VideoCommon::ImageViewBase { diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp index 64fd7010a..022ca9033 100644 --- a/src/video_core/texture_cache/image_info.cpp +++ b/src/video_core/texture_cache/image_info.cpp @@ -41,6 +41,7 @@ ImageInfo::ImageInfo(const TICEntry& config) noexcept { ASSERT(config.BaseLayer() == 0); type = ImageType::e1D; size.width = config.Width(); + resources.layers = 1; break; case TextureType::Texture1DArray: UNIMPLEMENTED_IF(config.BaseLayer() != 0); @@ -82,10 +83,12 @@ ImageInfo::ImageInfo(const TICEntry& config) noexcept { size.width = config.Width(); size.height = config.Height(); size.depth = config.Depth(); + resources.layers = 1; break; case TextureType::Texture1DBuffer: type = ImageType::Buffer; size.width = config.Width(); + resources.layers = 1; break; default: UNREACHABLE_MSG("Invalid texture_type={}", static_cast(config.texture_type.Value())); diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 560da4f16..95a9e8fe9 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -242,24 +242,36 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { const auto scale_up = [this](ImageId image_id) { if (image_id != CORRUPT_ID) { Image& image = slot_images[image_id]; - image.ScaleUp(); + return ScaleUp(image); } + return false; }; for (size_t index = 0; index < NUM_RT; ++index) { - scale_up(tmp_color_images[index]); + if (scale_up(tmp_color_images[index])) { + BindRenderTarget(&render_targets.color_buffer_ids[index], + FindColorBuffer(index, is_clear)); + } + } + if (scale_up(tmp_depth_image)) { + BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear)); } - scale_up(tmp_depth_image); } else { const auto scale_down = [this](ImageId image_id) { if (image_id != CORRUPT_ID) { Image& image = slot_images[image_id]; - image.ScaleDown(); + return ScaleDown(image); } + return false; }; for (size_t index = 0; index < NUM_RT; ++index) { - scale_down(tmp_color_images[index]); + if (scale_down(tmp_color_images[index])) { + BindRenderTarget(&render_targets.color_buffer_ids[index], + FindColorBuffer(index, is_clear)); + } + } + if (scale_down(tmp_depth_image)) { + BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear)); } - scale_down(tmp_depth_image); } // Rescale End @@ -695,6 +707,47 @@ bool TextureCache

::ImageCanRescale(Image& image) { return true; } +template +void TextureCache

::InvalidateScale(Image& image, bool invalidate_rt) { + const std::span image_view_ids = image.image_view_ids; + if (invalidate_rt) { + auto& dirty = maxwell3d.dirty.flags; + dirty[Dirty::RenderTargets] = true; + dirty[Dirty::ZetaBuffer] = true; + for (size_t rt = 0; rt < NUM_RT; ++rt) { + dirty[Dirty::ColorBuffer0 + rt] = true; + } + for (const ImageViewId image_view_id : image_view_ids) { + std::ranges::replace(render_targets.color_buffer_ids, image_view_id, ImageViewId{}); + if (render_targets.depth_buffer_id == image_view_id) { + render_targets.depth_buffer_id = ImageViewId{}; + } + } + } + RemoveImageViewReferences(image_view_ids); + RemoveFramebuffers(image_view_ids); +} + +template +bool TextureCache

::ScaleUp(Image& image, bool invalidate_rt) { + const bool rescaled = image.ScaleUp(); + if (!rescaled) { + return false; + } + InvalidateScale(image, invalidate_rt); + return true; +} + +template +bool TextureCache

::ScaleDown(Image& image, bool invalidate_rt) { + const bool rescaled = image.ScaleDown(); + if (!rescaled) { + return false; + } + InvalidateScale(image, invalidate_rt); + return true; +} + template ImageId TextureCache

::InsertImage(const ImageInfo& info, GPUVAddr gpu_addr, RelaxedOptions options) { @@ -793,33 +846,32 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA bool can_rescale = (info.type == ImageType::e1D || info.type == ImageType::e2D) && info.block.depth == 0; + bool any_rescaled = false; for (const ImageId sibling_id : all_siblings) { if (!can_rescale) { break; } Image& sibling = slot_images[sibling_id]; can_rescale &= ImageCanRescale(sibling); + any_rescaled |= True(sibling.flags & ImageFlagBits::Rescaled); } + can_rescale &= any_rescaled; + if (can_rescale) { for (const ImageId sibling_id : all_siblings) { Image& sibling = slot_images[sibling_id]; - sibling.ScaleUp(); + ScaleUp(sibling, true); } } else { for (const ImageId sibling_id : all_siblings) { Image& sibling = slot_images[sibling_id]; - sibling.ScaleDown(); + ScaleDown(sibling, true); } } const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr); Image& new_image = slot_images[new_image_id]; - if (can_rescale) { - new_image.ScaleUp(); - } else { - new_image.ScaleDown(); - } if (!gpu_memory.IsContinousRange(new_image.gpu_addr, new_image.guest_size_bytes)) { new_image.flags |= ImageFlagBits::Sparse; @@ -840,6 +892,12 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA // TODO: Only upload what we need RefreshContents(new_image, new_image_id); + if (can_rescale) { + new_image.ScaleUp(); + } else { + new_image.ScaleDown(); + } + for (const ImageId overlap_id : overlap_ids) { Image& overlap = slot_images[overlap_id]; if (overlap.info.num_samples != new_image.info.num_samples) { diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index a4a2c0832..042678786 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -327,6 +327,9 @@ private: [[nodiscard]] bool IsFullClear(ImageViewId id); bool ImageCanRescale(Image& image); + void InvalidateScale(Image& image, bool invalidate_rt = false); + bool ScaleUp(Image& image, bool invalidate_rt = false); + bool ScaleDown(Image& image, bool invalidate_rt = false); Runtime& runtime; VideoCore::RasterizerInterface& rasterizer; From 10e5065a5c72e9a704bac1500d92f41e5adb5751 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sun, 18 Jul 2021 23:59:31 -0400 Subject: [PATCH 005/156] gl_texture_cache: WIP texture rescale --- .../renderer_opengl/gl_texture_cache.cpp | 70 ++++++++++++++++++- .../renderer_opengl/gl_texture_cache.h | 2 + 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index cdd352aef..9b2a09007 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -698,7 +698,10 @@ void Image::UploadMemory(const ImageBufferMap& map, void Image::DownloadMemory(ImageBufferMap& map, std::span copies) { glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API - + const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); + if (is_rescaled) { + ScaleDown(); + } glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer); glPixelStorei(GL_PACK_ALIGNMENT, 1); @@ -716,6 +719,9 @@ void Image::DownloadMemory(ImageBufferMap& map, } CopyImageToBuffer(copy, map.offset); } + if (is_rescaled) { + ScaleUp(); + } } GLuint Image::StorageHandle() noexcept { @@ -849,12 +855,70 @@ void Image::CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t b } } +void Image::Scale() { + // TODO: Pass scaling factor? + if (gl_format == 0 || gl_type == 0) { + // compressed textures + return; + } + if (info.type == ImageType::Linear) { + UNIMPLEMENTED(); + return; + } + GLint prev_draw_fbo; + GLint prev_read_fbo; + glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &prev_draw_fbo); + glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &prev_read_fbo); + const GLenum attachment = [this] { + switch (GetFormatType(info.format)) { + case SurfaceType::ColorTexture: + return GL_COLOR_ATTACHMENT0; + case SurfaceType::Depth: + return GL_DEPTH_ATTACHMENT; + case SurfaceType::DepthStencil: + return GL_DEPTH_STENCIL_ATTACHMENT; + default: + UNREACHABLE(); + return GL_COLOR_ATTACHMENT0; + } + }(); + const GLenum mask = [this] { + switch (GetFormatType(info.format)) { + case SurfaceType::ColorTexture: + return GL_COLOR_BUFFER_BIT; + case SurfaceType::Depth: + return GL_DEPTH_BUFFER_BIT; + case SurfaceType::DepthStencil: + return GL_STENCIL_BUFFER_BIT | GL_DEPTH_BUFFER_BIT; + default: + UNREACHABLE(); + return GL_COLOR_BUFFER_BIT; + } + }(); + const GLenum filter = (mask & GL_COLOR_BUFFER_BIT) != 0 ? GL_LINEAR : GL_NEAREST; + GLuint fbo_handle; + glGenFramebuffers(1, &fbo_handle); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, fbo_handle); + glBindFramebuffer(GL_READ_FRAMEBUFFER, fbo_handle); + glNamedFramebufferTexture(fbo_handle, attachment, texture.handle, 0); + + const size_t scaled_width = info.size.width; + const size_t scaled_height = info.size.height * 2; + glBlitNamedFramebuffer(fbo_handle, fbo_handle, 0, 0, info.size.width, info.size.height, 0, 0, + scaled_width, scaled_height, mask, filter); + // TODO: resize texture? + glCopyTextureSubImage3D(texture.handle, 0, 0, 0, 0, 0, 0, scaled_width, scaled_height / 2); + // Restore previous framebuffers + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, prev_draw_fbo); + glBindFramebuffer(GL_READ_FRAMEBUFFER, prev_read_fbo); +} + bool Image::ScaleUp() { if (True(flags & ImageFlagBits::Rescaled)) { return false; } flags |= ImageFlagBits::Rescaled; - UNIMPLEMENTED(); + Scale(); return true; } @@ -863,7 +927,7 @@ bool Image::ScaleDown() { return false; } flags &= ~ImageFlagBits::Rescaled; - UNIMPLEMENTED(); + Scale(); return true; } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 58b36494b..324a0f1cb 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -198,6 +198,8 @@ private: void CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset); + void Scale(); + OGLTexture texture; OGLTextureView store_view; GLenum gl_internal_format = GL_NONE; From 84f2aea8962146be899131b032fcdf9b4e1f6ddf Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 20 Jul 2021 07:40:05 +0200 Subject: [PATCH 006/156] Texture Cache: More rescaling fixes. --- .../renderer_opengl/gl_texture_cache.cpp | 4 +- .../renderer_vulkan/vk_texture_cache.cpp | 8 + src/video_core/texture_cache/texture_cache.h | 174 +++++++++--------- .../texture_cache/texture_cache_base.h | 6 +- 4 files changed, 102 insertions(+), 90 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 9b2a09007..2d9f770cd 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -918,7 +918,7 @@ bool Image::ScaleUp() { return false; } flags |= ImageFlagBits::Rescaled; - Scale(); + //Scale(); return true; } @@ -927,7 +927,7 @@ bool Image::ScaleDown() { return false; } flags &= ~ImageFlagBits::Rescaled; - Scale(); + //Scale(); return true; } diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 668554d1e..5fd190825 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1078,6 +1078,10 @@ bool Image::ScaleUp(bool save_as_backup) { MemoryCommit new_commit( runtime->memory_allocator.Commit(rescaled_image, MemoryUsage::DeviceLocal)); + if (aspect_mask == 0) { + aspect_mask = ImageAspectMask(info.format); + } + const auto scale_up = [&](u32 value) { return (value * resolution.up_scale) >> resolution.down_shift; }; @@ -1170,6 +1174,10 @@ bool Image::ScaleDown(bool save_as_backup) { return (value * resolution.up_scale) >> resolution.down_shift; }; + if (aspect_mask == 0) { + aspect_mask = ImageAspectMask(info.format); + } + const bool is_2d = info.type == ImageType::e2D; boost::container::small_vector vkRegions(info.resources.levels); for (s32 level = 0; level < info.resources.levels; level++) { diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 95a9e8fe9..b7d1ae92d 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -204,75 +204,68 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { PrepareImageView(depth_buffer_id, true, is_clear && IsFullClear(depth_buffer_id)); return; } - flags[Dirty::RenderTargets] = false; - // Render target control is used on all render targets, so force look ups when this one is up - const bool force = flags[Dirty::RenderTargetControl]; - flags[Dirty::RenderTargetControl] = false; + do { + flags[Dirty::RenderTargets] = false; - bool can_rescale = true; - std::array tmp_color_images{}; - ImageId tmp_depth_image{}; - const auto check_rescale = [&](ImageViewId view_id, ImageId& id_save) { - if (view_id) { - const auto& view = slot_image_views[view_id]; - const auto image_id = view.image_id; - id_save = image_id; - auto& image = slot_images[image_id]; - can_rescale &= ImageCanRescale(image); + has_deleted_images = false; + // Render target control is used on all render targets, so force look ups when this one is + // up + const bool force = flags[Dirty::RenderTargetControl]; + flags[Dirty::RenderTargetControl] = false; + + bool can_rescale = true; + std::array tmp_color_images{}; + ImageId tmp_depth_image{}; + const auto check_rescale = [&](ImageViewId view_id, ImageId& id_save) { + if (view_id) { + const auto& view = slot_image_views[view_id]; + const auto image_id = view.image_id; + id_save = image_id; + auto& image = slot_images[image_id]; + can_rescale &= ImageCanRescale(image); + } else { + id_save = CORRUPT_ID; + } + }; + for (size_t index = 0; index < NUM_RT; ++index) { + ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index]; + if (flags[Dirty::ColorBuffer0 + index] || force) { + flags[Dirty::ColorBuffer0 + index] = false; + BindRenderTarget(&color_buffer_id, FindColorBuffer(index, is_clear)); + } + check_rescale(color_buffer_id, tmp_color_images[index]); + } + if (flags[Dirty::ZetaBuffer] || force) { + flags[Dirty::ZetaBuffer] = false; + BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear)); + } + check_rescale(render_targets.depth_buffer_id, tmp_depth_image); + + if (can_rescale) { + const auto scale_up = [this](ImageId image_id) { + if (image_id != CORRUPT_ID) { + Image& image = slot_images[image_id]; + ScaleUp(image); + } + }; + for (size_t index = 0; index < NUM_RT; ++index) { + scale_up(tmp_color_images[index]); + } + scale_up(tmp_depth_image); } else { - id_save = CORRUPT_ID; - } - }; - for (size_t index = 0; index < NUM_RT; ++index) { - ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index]; - if (flags[Dirty::ColorBuffer0 + index] || force) { - flags[Dirty::ColorBuffer0 + index] = false; - BindRenderTarget(&color_buffer_id, FindColorBuffer(index, is_clear)); - } - check_rescale(color_buffer_id, tmp_color_images[index]); - } - if (flags[Dirty::ZetaBuffer] || force) { - flags[Dirty::ZetaBuffer] = false; - BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear)); - } - check_rescale(render_targets.depth_buffer_id, tmp_depth_image); - - if (can_rescale) { - const auto scale_up = [this](ImageId image_id) { - if (image_id != CORRUPT_ID) { - Image& image = slot_images[image_id]; - return ScaleUp(image); - } - return false; - }; - for (size_t index = 0; index < NUM_RT; ++index) { - if (scale_up(tmp_color_images[index])) { - BindRenderTarget(&render_targets.color_buffer_ids[index], - FindColorBuffer(index, is_clear)); + const auto scale_down = [this](ImageId image_id) { + if (image_id != CORRUPT_ID) { + Image& image = slot_images[image_id]; + ScaleDown(image); + } + }; + for (size_t index = 0; index < NUM_RT; ++index) { + scale_down(tmp_color_images[index]); } + scale_down(tmp_depth_image); } - if (scale_up(tmp_depth_image)) { - BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear)); - } - } else { - const auto scale_down = [this](ImageId image_id) { - if (image_id != CORRUPT_ID) { - Image& image = slot_images[image_id]; - return ScaleDown(image); - } - return false; - }; - for (size_t index = 0; index < NUM_RT; ++index) { - if (scale_down(tmp_color_images[index])) { - BindRenderTarget(&render_targets.color_buffer_ids[index], - FindColorBuffer(index, is_clear)); - } - } - if (scale_down(tmp_depth_image)) { - BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear)); - } - } + } while (has_deleted_images); // Rescale End for (size_t index = 0; index < NUM_RT; ++index) { @@ -708,43 +701,54 @@ bool TextureCache

::ImageCanRescale(Image& image) { } template -void TextureCache

::InvalidateScale(Image& image, bool invalidate_rt) { +void TextureCache

::InvalidateScale(Image& image) { const std::span image_view_ids = image.image_view_ids; - if (invalidate_rt) { - auto& dirty = maxwell3d.dirty.flags; - dirty[Dirty::RenderTargets] = true; - dirty[Dirty::ZetaBuffer] = true; - for (size_t rt = 0; rt < NUM_RT; ++rt) { - dirty[Dirty::ColorBuffer0 + rt] = true; - } - for (const ImageViewId image_view_id : image_view_ids) { - std::ranges::replace(render_targets.color_buffer_ids, image_view_id, ImageViewId{}); - if (render_targets.depth_buffer_id == image_view_id) { - render_targets.depth_buffer_id = ImageViewId{}; - } + auto& dirty = maxwell3d.dirty.flags; + dirty[Dirty::RenderTargets] = true; + dirty[Dirty::ZetaBuffer] = true; + for (size_t rt = 0; rt < NUM_RT; ++rt) { + dirty[Dirty::ColorBuffer0 + rt] = true; + } + for (const ImageViewId image_view_id : image_view_ids) { + std::ranges::replace(render_targets.color_buffer_ids, image_view_id, ImageViewId{}); + if (render_targets.depth_buffer_id == image_view_id) { + render_targets.depth_buffer_id = ImageViewId{}; } } RemoveImageViewReferences(image_view_ids); RemoveFramebuffers(image_view_ids); + for (const ImageViewId image_view_id : image_view_ids) { + sentenced_image_view.Push(std::move(slot_image_views[image_view_id])); + slot_image_views.erase(image_view_id); + } + image.image_view_ids.clear(); + image.image_view_infos.clear(); + if constexpr (ENABLE_VALIDATION) { + std::ranges::fill(graphics_image_view_ids, CORRUPT_ID); + std::ranges::fill(compute_image_view_ids, CORRUPT_ID); + } + graphics_image_table.Invalidate(); + compute_image_table.Invalidate(); + has_deleted_images = true; } template -bool TextureCache

::ScaleUp(Image& image, bool invalidate_rt) { +bool TextureCache

::ScaleUp(Image& image) { const bool rescaled = image.ScaleUp(); if (!rescaled) { return false; } - InvalidateScale(image, invalidate_rt); + InvalidateScale(image); return true; } template -bool TextureCache

::ScaleDown(Image& image, bool invalidate_rt) { +bool TextureCache

::ScaleDown(Image& image) { const bool rescaled = image.ScaleDown(); if (!rescaled) { return false; } - InvalidateScale(image, invalidate_rt); + InvalidateScale(image); return true; } @@ -861,12 +865,12 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA if (can_rescale) { for (const ImageId sibling_id : all_siblings) { Image& sibling = slot_images[sibling_id]; - ScaleUp(sibling, true); + ScaleUp(sibling); } } else { for (const ImageId sibling_id : all_siblings) { Image& sibling = slot_images[sibling_id]; - ScaleDown(sibling, true); + ScaleDown(sibling); } } @@ -893,9 +897,9 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA RefreshContents(new_image, new_image_id); if (can_rescale) { - new_image.ScaleUp(); + ScaleUp(new_image); } else { - new_image.ScaleDown(); + ScaleDown(new_image); } for (const ImageId overlap_id : overlap_ids) { diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 042678786..cdd99242b 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -327,9 +327,9 @@ private: [[nodiscard]] bool IsFullClear(ImageViewId id); bool ImageCanRescale(Image& image); - void InvalidateScale(Image& image, bool invalidate_rt = false); - bool ScaleUp(Image& image, bool invalidate_rt = false); - bool ScaleDown(Image& image, bool invalidate_rt = false); + void InvalidateScale(Image& image); + bool ScaleUp(Image& image); + bool ScaleDown(Image& image); Runtime& runtime; VideoCore::RasterizerInterface& rasterizer; From 71ca84d8299f7eb6779e95e808b3ec7f8505354b Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 20 Jul 2021 18:29:52 +0200 Subject: [PATCH 007/156] Settings: eliminate rescaling_factor. --- src/common/settings.cpp | 2 +- src/common/settings.h | 2 +- src/core/hle/service/am/am.cpp | 12 ++++-------- src/core/hle/service/vi/vi.cpp | 27 ++++++++------------------- src/core/telemetry_session.cpp | 2 -- src/video_core/video_core.cpp | 9 ++++----- src/video_core/video_core.h | 2 +- 7 files changed, 19 insertions(+), 37 deletions(-) diff --git a/src/common/settings.cpp b/src/common/settings.cpp index dd3a3d456..6f3acee79 100644 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp @@ -48,7 +48,6 @@ void LogSettings() { log_setting("Core_UseMultiCore", values.use_multi_core.GetValue()); log_setting("CPU_Accuracy", values.cpu_accuracy.GetValue()); log_setting("Renderer_UseResolutionScaling", values.resolution_setup.GetValue()); - log_setting("Renderer_UseResolutionFactor", values.resolution_factor.GetValue()); log_setting("Renderer_UseSpeedLimit", values.use_speed_limit.GetValue()); log_setting("Renderer_SpeedLimit", values.speed_limit.GetValue()); log_setting("Renderer_UseDiskShaderCache", values.use_disk_shader_cache.GetValue()); @@ -155,6 +154,7 @@ void UpdateRescalingInfo() { info.down_factor = static_cast(1U << info.down_shift) / info.up_scale; info.size_up = info.up_scale * info.up_scale; info.size_shift = info.down_shift * 2; + info.active = info.up_scale != 1 || info.down_shift != 0; } void RestoreGlobalState(bool is_powered_on) { diff --git a/src/common/settings.h b/src/common/settings.h index f4df2fc95..2b11984b4 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -69,6 +69,7 @@ struct ResolutionScalingInfo { f32 down_factor{1.0f}; u32 size_up{1}; u32 size_shift{0}; + bool active{}; }; /** The BasicSetting class is a simple resource manager. It defines a label and default value @@ -472,7 +473,6 @@ struct Values { ResolutionScalingInfo resolution_info{}; Setting resolution_setup{ResolutionSetup::Res1X, "resolution_setup"}; - Setting resolution_factor{1, "resolution_factor"}; // *nix platforms may have issues with the borderless windowed fullscreen mode. // Default to exclusive fullscreen on these platforms for now. RangedSetting fullscreen_mode{ diff --git a/src/core/hle/service/am/am.cpp b/src/core/hle/service/am/am.cpp index 50c2ace93..aee8d4f93 100644 --- a/src/core/hle/service/am/am.cpp +++ b/src/core/hle/service/am/am.cpp @@ -797,15 +797,11 @@ void ICommonStateGetter::GetDefaultDisplayResolution(Kernel::HLERequestContext& rb.Push(ResultSuccess); if (Settings::values.use_docked_mode.GetValue()) { - rb.Push(static_cast(Service::VI::DisplayResolution::DockedWidth) * - static_cast(Settings::values.resolution_factor.GetValue())); - rb.Push(static_cast(Service::VI::DisplayResolution::DockedHeight) * - static_cast(Settings::values.resolution_factor.GetValue())); + rb.Push(static_cast(Service::VI::DisplayResolution::DockedWidth)); + rb.Push(static_cast(Service::VI::DisplayResolution::DockedHeight)); } else { - rb.Push(static_cast(Service::VI::DisplayResolution::UndockedWidth) * - static_cast(Settings::values.resolution_factor.GetValue())); - rb.Push(static_cast(Service::VI::DisplayResolution::UndockedHeight) * - static_cast(Settings::values.resolution_factor.GetValue())); + rb.Push(static_cast(Service::VI::DisplayResolution::UndockedWidth)); + rb.Push(static_cast(Service::VI::DisplayResolution::UndockedHeight)); } } diff --git a/src/core/hle/service/vi/vi.cpp b/src/core/hle/service/vi/vi.cpp index 63d5242c4..75ee3e5e4 100644 --- a/src/core/hle/service/vi/vi.cpp +++ b/src/core/hle/service/vi/vi.cpp @@ -541,11 +541,8 @@ private: switch (transaction) { case TransactionId::Connect: { IGBPConnectRequestParcel request{ctx.ReadBuffer()}; - IGBPConnectResponseParcel response{ - static_cast(static_cast(DisplayResolution::UndockedWidth) * - Settings::values.resolution_factor.GetValue()), - static_cast(static_cast(DisplayResolution::UndockedHeight) * - Settings::values.resolution_factor.GetValue())}; + IGBPConnectResponseParcel response{static_cast(DisplayResolution::UndockedWidth), + static_cast(DisplayResolution::UndockedHeight)}; buffer_queue.Connect(); @@ -775,15 +772,11 @@ private: rb.Push(ResultSuccess); if (Settings::values.use_docked_mode.GetValue()) { - rb.Push(static_cast(Service::VI::DisplayResolution::DockedWidth) * - static_cast(Settings::values.resolution_factor.GetValue())); - rb.Push(static_cast(Service::VI::DisplayResolution::DockedHeight) * - static_cast(Settings::values.resolution_factor.GetValue())); + rb.Push(static_cast(Service::VI::DisplayResolution::DockedWidth)); + rb.Push(static_cast(Service::VI::DisplayResolution::DockedHeight)); } else { - rb.Push(static_cast(Service::VI::DisplayResolution::UndockedWidth) * - static_cast(Settings::values.resolution_factor.GetValue())); - rb.Push(static_cast(Service::VI::DisplayResolution::UndockedHeight) * - static_cast(Settings::values.resolution_factor.GetValue())); + rb.Push(static_cast(Service::VI::DisplayResolution::UndockedWidth)); + rb.Push(static_cast(Service::VI::DisplayResolution::UndockedHeight)); } rb.PushRaw(60.0f); // This wouldn't seem to be correct for 30 fps games. @@ -1063,10 +1056,8 @@ private: // This only returns the fixed values of 1280x720 and makes no distinguishing // between docked and undocked dimensions. We take the liberty of applying // the resolution scaling factor here. - rb.Push(static_cast(DisplayResolution::UndockedWidth) * - static_cast(Settings::values.resolution_factor.GetValue())); - rb.Push(static_cast(DisplayResolution::UndockedHeight) * - static_cast(Settings::values.resolution_factor.GetValue())); + rb.Push(static_cast(DisplayResolution::UndockedWidth)); + rb.Push(static_cast(DisplayResolution::UndockedHeight)); } void SetLayerScalingMode(Kernel::HLERequestContext& ctx) { @@ -1099,8 +1090,6 @@ private: LOG_WARNING(Service_VI, "(STUBBED) called"); DisplayInfo display_info; - display_info.width *= static_cast(Settings::values.resolution_factor.GetValue()); - display_info.height *= static_cast(Settings::values.resolution_factor.GetValue()); ctx.WriteBuffer(&display_info, sizeof(DisplayInfo)); IPC::ResponseBuilder rb{ctx, 4}; rb.Push(ResultSuccess); diff --git a/src/core/telemetry_session.cpp b/src/core/telemetry_session.cpp index 191475f71..654db0b52 100644 --- a/src/core/telemetry_session.cpp +++ b/src/core/telemetry_session.cpp @@ -229,8 +229,6 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader, AddField(field_type, "Core_UseMultiCore", Settings::values.use_multi_core.GetValue()); AddField(field_type, "Renderer_Backend", TranslateRenderer(Settings::values.renderer_backend.GetValue())); - AddField(field_type, "Renderer_ResolutionFactor", - Settings::values.resolution_factor.GetValue()); AddField(field_type, "Renderer_UseSpeedLimit", Settings::values.use_speed_limit.GetValue()); AddField(field_type, "Renderer_SpeedLimit", Settings::values.speed_limit.GetValue()); AddField(field_type, "Renderer_UseDiskShaderCache", diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index cae543a51..508173db3 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -53,11 +53,10 @@ std::unique_ptr CreateGPU(Core::Frontend::EmuWindow& emu_window, Cor } } -u16 GetResolutionScaleFactor(const RendererBase& renderer) { - return static_cast( - Settings::values.resolution_factor.GetValue() != 0 - ? Settings::values.resolution_factor.GetValue() - : renderer.GetRenderWindow().GetFramebufferLayout().GetScalingRatio()); +float GetResolutionScaleFactor(const RendererBase& renderer) { + return Settings::values.resolution_info.active + ? Settings::values.resolution_info.up_factor + : renderer.GetRenderWindow().GetFramebufferLayout().GetScalingRatio(); } } // namespace VideoCore diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h index f5c27125d..f86877e86 100644 --- a/src/video_core/video_core.h +++ b/src/video_core/video_core.h @@ -25,6 +25,6 @@ class RendererBase; /// Creates an emulated GPU instance using the given system context. std::unique_ptr CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system); -u16 GetResolutionScaleFactor(const RendererBase& renderer); +float GetResolutionScaleFactor(const RendererBase& renderer); } // namespace VideoCore From 778700ff9d6eca96945deebcd4415e70d58330d9 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 20 Jul 2021 19:36:38 +0200 Subject: [PATCH 008/156] TextureCache: Modify Viewports/Scissors according to Rescale. --- src/video_core/dirty_flags.h | 3 +- .../renderer_vulkan/vk_rasterizer.cpp | 87 ++++++++++++------- .../renderer_vulkan/vk_state_tracker.h | 6 +- src/video_core/texture_cache/texture_cache.h | 25 +++++- .../texture_cache/texture_cache_base.h | 3 + src/yuzu/configuration/config.cpp | 4 + 6 files changed, 93 insertions(+), 35 deletions(-) diff --git a/src/video_core/dirty_flags.h b/src/video_core/dirty_flags.h index f11ff5d94..d63ad5a35 100644 --- a/src/video_core/dirty_flags.h +++ b/src/video_core/dirty_flags.h @@ -29,7 +29,8 @@ enum : u8 { ColorBuffer6, ColorBuffer7, ZetaBuffer, - Rescale, + RescaleViewports, + RescaleScissors, VertexBuffers, VertexBuffer0, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 30b47a7a0..7a7374b78 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -58,19 +58,14 @@ struct DrawParams { bool is_indexed; }; -VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t index) { +VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t index, float scale) { const auto& src = regs.viewport_transform[index]; - const float width = src.scale_x * 2.0f; - float y = src.translate_y - src.scale_y; - float height = src.scale_y * 2.0f; - if (regs.screen_y_control.y_negate) { - y += height; - height = -height; - } + const float width = src.scale_x * 2.0f * scale; + const float height = src.scale_y * 2.0f * scale; const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1.0f : 0.0f; VkViewport viewport{ - .x = src.translate_x - src.scale_x, - .y = y, + .x = (src.translate_x - src.scale_x) * scale, + .y = (src.translate_y - src.scale_y) * scale, .width = width != 0.0f ? width : 1.0f, .height = height != 0.0f ? height : 1.0f, .minDepth = src.translate_z - src.scale_z * reduce_z, @@ -83,14 +78,21 @@ VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t in return viewport; } -VkRect2D GetScissorState(const Maxwell& regs, size_t index) { +VkRect2D GetScissorState(const Maxwell& regs, size_t index, u32 up_scale = 1, u32 down_shift = 0) { const auto& src = regs.scissor_test[index]; VkRect2D scissor; + const auto scale_up = [&](u32 value) -> u32 { + if (value == 0) { + return 0U; + } + const u32 converted_value = (value * up_scale) >> down_shift; + return std::max(converted_value, 1U); + }; if (src.enable) { - scissor.offset.x = static_cast(src.min_x); - scissor.offset.y = static_cast(src.min_y); - scissor.extent.width = src.max_x - src.min_x; - scissor.extent.height = src.max_y - src.min_y; + scissor.offset.x = static_cast(scale_up(src.min_x)); + scissor.offset.y = static_cast(scale_up(src.min_y)); + scissor.extent.width = scale_up(src.max_x - src.min_x); + scissor.extent.height = scale_up(src.max_y - src.min_y); } else { scissor.offset.x = 0; scissor.offset.y = 0; @@ -214,8 +216,15 @@ void RasterizerVulkan::Clear() { const VkExtent2D render_area = framebuffer->RenderArea(); scheduler.RequestRenderpass(framebuffer); + u32 up_scale = 1; + u32 down_shift = 0; + if (texture_cache.IsRescaling()) { + up_scale = Settings::values.resolution_info.up_scale; + down_shift = Settings::values.resolution_info.down_shift; + } + VkClearRect clear_rect{ - .rect = GetScissorState(regs, 0), + .rect = GetScissorState(regs, 0, up_scale, down_shift), .baseArrayLayer = regs.clear_buffers.layer, .layerCount = 1, }; @@ -595,15 +604,17 @@ void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& reg if (!state_tracker.TouchViewports()) { return; } + const float scale = + texture_cache.IsRescaling() ? Settings::values.resolution_info.up_factor : 1.0f; const std::array viewports{ - GetViewportState(device, regs, 0), GetViewportState(device, regs, 1), - GetViewportState(device, regs, 2), GetViewportState(device, regs, 3), - GetViewportState(device, regs, 4), GetViewportState(device, regs, 5), - GetViewportState(device, regs, 6), GetViewportState(device, regs, 7), - GetViewportState(device, regs, 8), GetViewportState(device, regs, 9), - GetViewportState(device, regs, 10), GetViewportState(device, regs, 11), - GetViewportState(device, regs, 12), GetViewportState(device, regs, 13), - GetViewportState(device, regs, 14), GetViewportState(device, regs, 15), + GetViewportState(device, regs, 0, scale), GetViewportState(device, regs, 1, scale), + GetViewportState(device, regs, 2, scale), GetViewportState(device, regs, 3, scale), + GetViewportState(device, regs, 4, scale), GetViewportState(device, regs, 5, scale), + GetViewportState(device, regs, 6, scale), GetViewportState(device, regs, 7, scale), + GetViewportState(device, regs, 8, scale), GetViewportState(device, regs, 9, scale), + GetViewportState(device, regs, 10, scale), GetViewportState(device, regs, 11, scale), + GetViewportState(device, regs, 12, scale), GetViewportState(device, regs, 13, scale), + GetViewportState(device, regs, 14, scale), GetViewportState(device, regs, 15, scale), }; scheduler.Record([viewports](vk::CommandBuffer cmdbuf) { cmdbuf.SetViewport(0, viewports); }); } @@ -612,13 +623,29 @@ void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs if (!state_tracker.TouchScissors()) { return; } + u32 up_scale = 1; + u32 down_shift = 0; + if (texture_cache.IsRescaling()) { + up_scale = Settings::values.resolution_info.up_scale; + down_shift = Settings::values.resolution_info.down_shift; + } const std::array scissors{ - GetScissorState(regs, 0), GetScissorState(regs, 1), GetScissorState(regs, 2), - GetScissorState(regs, 3), GetScissorState(regs, 4), GetScissorState(regs, 5), - GetScissorState(regs, 6), GetScissorState(regs, 7), GetScissorState(regs, 8), - GetScissorState(regs, 9), GetScissorState(regs, 10), GetScissorState(regs, 11), - GetScissorState(regs, 12), GetScissorState(regs, 13), GetScissorState(regs, 14), - GetScissorState(regs, 15), + GetScissorState(regs, 0, up_scale, down_shift), + GetScissorState(regs, 1, up_scale, down_shift), + GetScissorState(regs, 2, up_scale, down_shift), + GetScissorState(regs, 3, up_scale, down_shift), + GetScissorState(regs, 4, up_scale, down_shift), + GetScissorState(regs, 5, up_scale, down_shift), + GetScissorState(regs, 6, up_scale, down_shift), + GetScissorState(regs, 7, up_scale, down_shift), + GetScissorState(regs, 8, up_scale, down_shift), + GetScissorState(regs, 9, up_scale, down_shift), + GetScissorState(regs, 10, up_scale, down_shift), + GetScissorState(regs, 11, up_scale, down_shift), + GetScissorState(regs, 12, up_scale, down_shift), + GetScissorState(regs, 13, up_scale, down_shift), + GetScissorState(regs, 14, up_scale, down_shift), + GetScissorState(regs, 15, up_scale, down_shift), }; scheduler.Record([scissors](vk::CommandBuffer cmdbuf) { cmdbuf.SetScissor(0, scissors); }); } diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.h b/src/video_core/renderer_vulkan/vk_state_tracker.h index 2f2d6b31f..ac2bbebe0 100644 --- a/src/video_core/renderer_vulkan/vk_state_tracker.h +++ b/src/video_core/renderer_vulkan/vk_state_tracker.h @@ -71,11 +71,13 @@ public: } bool TouchViewports() { - return Exchange(Dirty::Viewports, false); + return Exchange(Dirty::Viewports, false) || + Exchange(VideoCommon::Dirty::RescaleViewports, false); } bool TouchScissors() { - return Exchange(Dirty::Scissors, false); + return Exchange(Dirty::Scissors, false) || + Exchange(VideoCommon::Dirty::RescaleScissors, false); } bool TouchDepthBias() { diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index b7d1ae92d..4e5031acc 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -7,6 +7,7 @@ #include #include "common/alignment.h" +#include "common/settings.h" #include "video_core/dirty_flags.h" #include "video_core/engines/kepler_compute.h" #include "video_core/texture_cache/image_view_base.h" @@ -205,6 +206,7 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { return; } + bool rescaled; do { flags[Dirty::RenderTargets] = false; @@ -243,6 +245,7 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { check_rescale(render_targets.depth_buffer_id, tmp_depth_image); if (can_rescale) { + rescaled = true; const auto scale_up = [this](ImageId image_id) { if (image_id != CORRUPT_ID) { Image& image = slot_images[image_id]; @@ -254,6 +257,7 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { } scale_up(tmp_depth_image); } else { + rescaled = false; const auto scale_down = [this](ImageId image_id) { if (image_id != CORRUPT_ID) { Image& image = slot_images[image_id]; @@ -268,6 +272,12 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { } while (has_deleted_images); // Rescale End + if (is_rescaling != rescaled) { + flags[Dirty::RescaleViewports] = true; + flags[Dirty::RescaleScissors] = true; + is_rescaling = rescaled; + } + for (size_t index = 0; index < NUM_RT; ++index) { ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index]; PrepareImageView(color_buffer_id, true, is_clear && IsFullClear(color_buffer_id)); @@ -279,9 +289,15 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { for (size_t index = 0; index < NUM_RT; ++index) { render_targets.draw_buffers[index] = static_cast(maxwell3d.regs.rt_control.Map(index)); } + u32 up_scale = 1; + u32 down_shift = 0; + if (is_rescaling) { + up_scale = Settings::values.resolution_info.up_scale; + down_shift = Settings::values.resolution_info.down_shift; + } render_targets.size = Extent2D{ - maxwell3d.regs.render_area.width, - maxwell3d.regs.render_area.height, + (maxwell3d.regs.render_area.width * up_scale) >> down_shift, + (maxwell3d.regs.render_area.height * up_scale) >> down_shift, }; flags[Dirty::DepthBiasGlobal] = true; @@ -538,6 +554,11 @@ void TextureCache

::PopAsyncFlushes() { committed_downloads.pop(); } +template +bool TextureCache

::IsRescaling() { + return is_rescaling; +} + template bool TextureCache

::IsRegionGpuModified(VAddr addr, size_t size) { bool is_modified = false; diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index cdd99242b..1f51fcee8 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -168,6 +168,8 @@ public: /// Return true when a CPU region is modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + [[nodiscard]] bool IsRescaling(); + std::mutex mutex; private: @@ -362,6 +364,7 @@ private: VAddr virtual_invalid_space{}; bool has_deleted_images = false; + bool is_rescaling = false; u64 total_used_memory = 0; u64 minimum_memory; u64 expected_memory; diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index 7ddc40b00..7ed833203 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -849,6 +849,8 @@ void Config::ReadRendererValues() { ReadBasicSetting(Settings::values.disable_shader_loop_safety_checks); } + Settings::UpdateRescalingInfo(); + qt_config->endGroup(); } @@ -1402,6 +1404,8 @@ void Config::SaveRendererValues() { WriteBasicSetting(Settings::values.disable_shader_loop_safety_checks); } + Settings::UpdateRescalingInfo(); + qt_config->endGroup(); } From 8704c939136e88876d65fc670bce98d8250a6588 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 20 Jul 2021 22:51:25 +0200 Subject: [PATCH 009/156] TextureCache: Fix rescaling of ImageCopies --- .../renderer_vulkan/vk_texture_cache.cpp | 40 +++++++++++------ src/video_core/texture_cache/texture_cache.h | 43 +++++++++++++++++-- .../texture_cache/texture_cache_base.h | 2 +- 3 files changed, 67 insertions(+), 18 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 5fd190825..54236e87f 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -136,6 +136,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { if (info.type == ImageType::e3D) { flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT; } + const auto scale_up = [&](u32 value) { return std::max((value * up) >> down, 1U); }; const auto [samples_x, samples_y] = VideoCommon::SamplesLog2(info.num_samples); const bool is_2d = info.type == ImageType::e2D; return VkImageCreateInfo{ @@ -145,8 +146,8 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { .imageType = ConvertImageType(info.type), .format = format_info.format, .extent{ - .width = ((info.size.width * up) >> down) >> samples_x, - .height = (is_2d ? ((info.size.height * up) >> down) : info.size.height) >> samples_y, + .width = scale_up(info.size.width) >> samples_x, + .height = (is_2d ? scale_up(info.size.height) : info.size.height) >> samples_y, .depth = info.size.depth, }, .mipLevels = static_cast(info.resources.levels), @@ -1078,12 +1079,35 @@ bool Image::ScaleUp(bool save_as_backup) { MemoryCommit new_commit( runtime->memory_allocator.Commit(rescaled_image, MemoryUsage::DeviceLocal)); + SCOPE_EXIT({ + if (save_as_backup) { + backup_image = std::move(image); + backup_commit = std::move(commit); + has_backup = true; + } else { + runtime->prescaled_images.Push(std::move(image)); + runtime->prescaled_commits.Push(std::move(commit)); + } + image = std::move(rescaled_image); + commit = std::move(new_commit); + }); + + const PixelFormat format = StorageFormat(info.format); + const auto format_info = + MaxwellToVK::SurfaceFormat(runtime->device, FormatType::Optimal, false, format); + const auto similar = runtime->device.GetSupportedFormat( + format_info.format, (VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT), + FormatType::Optimal); + + if (similar != format_info.format) { + return true; + } if (aspect_mask == 0) { aspect_mask = ImageAspectMask(info.format); } const auto scale_up = [&](u32 value) { - return (value * resolution.up_scale) >> resolution.down_shift; + return std::max((value * resolution.up_scale) >> resolution.down_shift, 1U); }; const bool is_2d = info.type == ImageType::e2D; @@ -1130,16 +1154,6 @@ bool Image::ScaleUp(bool save_as_backup) { vkRegions.push_back(blit); } BlitScale(*scheduler, *image, *rescaled_image, vkRegions, aspect_mask); - if (save_as_backup) { - backup_image = std::move(image); - backup_commit = std::move(commit); - has_backup = true; - } else { - runtime->prescaled_images.Push(std::move(image)); - runtime->prescaled_commits.Push(std::move(commit)); - } - image = std::move(rescaled_image); - commit = std::move(new_commit); return true; } diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 4e5031acc..df697cdeb 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -929,8 +929,8 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA LOG_WARNING(HW_GPU, "Copying between images with different samples is not implemented"); } else { const SubresourceBase base = new_image.TryFindBase(overlap.gpu_addr).value(); - const auto copies = MakeShrinkImageCopies(new_info, overlap.info, base); - runtime.CopyImage(new_image, overlap, copies); + auto copies = MakeShrinkImageCopies(new_info, overlap.info, base); + runtime.CopyImage(new_image, overlap, std::move(copies)); } if (True(overlap.flags & ImageFlagBits::Tracked)) { UntrackImage(overlap, overlap_id); @@ -1569,9 +1569,33 @@ void TextureCache

::PrepareImageView(ImageViewId image_view_id, bool is_modifi } template -void TextureCache

::CopyImage(ImageId dst_id, ImageId src_id, std::span copies) { +void TextureCache

::CopyImage(ImageId dst_id, ImageId src_id, std::vector copies) { Image& dst = slot_images[dst_id]; Image& src = slot_images[src_id]; + const bool is_rescaled = True(src.flags & ImageFlagBits::Rescaled); + if (is_rescaled) { + ASSERT(True(dst.flags & ImageFlagBits::Rescaled)); + const bool both_2d{src.info.type == ImageType::e2D && dst.info.type == ImageType::e2D}; + const auto& resolution = Settings::values.resolution_info; + const auto scale_up = [&](u32 value) -> u32 { + if (value == 0) { + return 0U; + } + return std::max((value * resolution.up_scale) >> resolution.down_shift, 1U); + }; + for (auto& copy : copies) { + copy.src_offset.x = scale_up(copy.src_offset.x); + + copy.dst_offset.x = scale_up(copy.dst_offset.x); + + copy.extent.width = scale_up(copy.extent.width); + if (both_2d) { + copy.src_offset.y = scale_up(copy.src_offset.y); + copy.dst_offset.y = scale_up(copy.dst_offset.y); + copy.extent.height = scale_up(copy.extent.height); + } + } + } const auto dst_format_type = GetFormatType(dst.info.format); const auto src_format_type = GetFormatType(src.info.format); if (src_format_type == dst_format_type) { @@ -1639,10 +1663,21 @@ std::pair TextureCache

::RenderTargetFromImage( ImageId image_id, const ImageViewInfo& view_info) { const ImageViewId view_id = FindOrEmplaceImageView(image_id, view_info); const ImageBase& image = slot_images[image_id]; + const bool is_rescaled = True(image.flags & ImageFlagBits::Rescaled); const bool is_color = GetFormatType(image.info.format) == SurfaceType::ColorTexture; const ImageViewId color_view_id = is_color ? view_id : ImageViewId{}; const ImageViewId depth_view_id = is_color ? ImageViewId{} : view_id; - const Extent3D extent = MipSize(image.info.size, view_info.range.base.level); + Extent3D extent = MipSize(image.info.size, view_info.range.base.level); + if (is_rescaled) { + const auto& resolution = Settings::values.resolution_info; + const auto scale_up = [&](u32 value) { + return std::max((value * resolution.up_scale) >> resolution.down_shift, 1U); + }; + extent.width = scale_up(extent.width); + if (image.info.type == ImageType::e2D) { + extent.height = scale_up(extent.height); + } + } const u32 num_samples = image.info.num_samples; const auto [samples_x, samples_y] = SamplesLog2(num_samples); const FramebufferId framebuffer_id = GetFramebufferId(RenderTargets{ diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 1f51fcee8..deddf0d30 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -316,7 +316,7 @@ private: void PrepareImageView(ImageViewId image_view_id, bool is_modification, bool invalidate); /// Execute copies from one image to the other, even if they are incompatible - void CopyImage(ImageId dst_id, ImageId src_id, std::span copies); + void CopyImage(ImageId dst_id, ImageId src_id, std::vector copies); /// Bind an image view as render target, downloading resources preemtively if needed void BindRenderTarget(ImageViewId* old_id, ImageViewId new_id); From de66a69ed4b556ad96f38570ea0a31cb2a1870f1 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Wed, 21 Jul 2021 00:02:35 -0400 Subject: [PATCH 010/156] renderer_gl: Resolution scaling fixes --- .../renderer_opengl/gl_rasterizer.cpp | 23 ++- .../renderer_opengl/gl_texture_cache.cpp | 136 ++++++++++-------- .../renderer_opengl/gl_texture_cache.h | 9 +- 3 files changed, 107 insertions(+), 61 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index a6d9f7c43..b91e7edf8 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -577,6 +577,14 @@ void RasterizerOpenGL::SyncViewport() { const bool force = flags[Dirty::ViewportTransform]; flags[Dirty::ViewportTransform] = false; + const auto& resolution = Settings::values.resolution_info; + const auto scale_up = [&](u32 value) -> u32 { + if (value == 0) { + return 0U; + } + const u32 converted_value = (value * resolution.up_scale) >> resolution.down_shift; + return std::max(converted_value, 1U); + }; for (std::size_t i = 0; i < Maxwell::NumViewports; ++i) { if (!force && !flags[Dirty::Viewport0 + i]) { continue; @@ -585,8 +593,8 @@ void RasterizerOpenGL::SyncViewport() { const auto& src = regs.viewport_transform[i]; const Common::Rectangle rect{src.GetRect()}; - glViewportIndexedf(static_cast(i), rect.left, rect.bottom, rect.GetWidth(), - rect.GetHeight()); + glViewportIndexedf(static_cast(i), rect.left, rect.bottom, + scale_up(rect.GetWidth()), scale_up(rect.GetHeight())); const GLdouble reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne; const GLdouble near_depth = src.translate_z - src.scale_z * reduce_z; @@ -909,6 +917,15 @@ void RasterizerOpenGL::SyncScissorTest() { flags[Dirty::Scissors] = false; const auto& regs = maxwell3d.regs; + + const auto& resolution = Settings::values.resolution_info; + const auto scale_up = [&](u32 value) -> u32 { + if (value == 0) { + return 0U; + } + const u32 converted_value = (value * resolution.up_scale) >> resolution.down_shift; + return std::max(converted_value, 1U); + }; for (std::size_t index = 0; index < Maxwell::NumViewports; ++index) { if (!flags[Dirty::Scissor0 + index]) { continue; @@ -919,7 +936,7 @@ void RasterizerOpenGL::SyncScissorTest() { if (src.enable) { glEnablei(GL_SCISSOR_TEST, static_cast(index)); glScissorIndexed(static_cast(index), src.min_x, src.min_y, - src.max_x - src.min_x, src.max_y - src.min_y); + scale_up(src.max_x - src.min_x), scale_up(src.max_y - src.min_y)); } else { glDisablei(GL_SCISSOR_TEST, static_cast(index)); } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 2d9f770cd..ecde5b600 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -316,6 +316,52 @@ void AttachTexture(GLuint fbo, GLenum attachment, const ImageView* image_view) { } } +OGLTexture MakeImage(const VideoCommon::ImageInfo& info, GLenum gl_internal_format) { + const GLenum target = ImageTarget(info); + const GLsizei width = info.size.width; + const GLsizei height = info.size.height; + const GLsizei depth = info.size.depth; + const int max_host_mip_levels = std::bit_width(info.size.width); + const GLsizei num_levels = std::min(info.resources.levels, max_host_mip_levels); + const GLsizei num_layers = info.resources.layers; + const GLsizei num_samples = info.num_samples; + + GLuint handle = 0; + OGLTexture texture; + if (target != GL_TEXTURE_BUFFER) { + texture.Create(target); + handle = texture.handle; + } + switch (target) { + case GL_TEXTURE_1D_ARRAY: + glTextureStorage2D(handle, num_levels, gl_internal_format, width, num_layers); + break; + case GL_TEXTURE_2D_ARRAY: + glTextureStorage3D(handle, num_levels, gl_internal_format, width, height, num_layers); + break; + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: { + // TODO: Where should 'fixedsamplelocations' come from? + const auto [samples_x, samples_y] = SamplesLog2(info.num_samples); + glTextureStorage3DMultisample(handle, num_samples, gl_internal_format, width >> samples_x, + height >> samples_y, num_layers, GL_FALSE); + break; + } + case GL_TEXTURE_RECTANGLE: + glTextureStorage2D(handle, num_levels, gl_internal_format, width, height); + break; + case GL_TEXTURE_3D: + glTextureStorage3D(handle, num_levels, gl_internal_format, width, height, depth); + break; + case GL_TEXTURE_BUFFER: + UNREACHABLE(); + break; + default: + UNREACHABLE_MSG("Invalid target=0x{:x}", target); + break; + } + return texture; +} + [[nodiscard]] bool IsPixelFormatBGR(PixelFormat format) { switch (format) { case PixelFormat::B5G6R5_UNORM: @@ -430,6 +476,11 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& TextureCacheRuntime::~TextureCacheRuntime() = default; +void TextureCacheRuntime::Init() { + resolution = Settings::values.resolution_info; + is_rescaling_on = resolution.up_scale != 1 || resolution.down_shift != 0; +} + void TextureCacheRuntime::Finish() { glFinish(); } @@ -605,13 +656,13 @@ std::optional TextureCacheRuntime::StagingBuffers::FindBuffer(size_t req return found; } -Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_, GPUVAddr gpu_addr_, +Image::Image(TextureCacheRuntime& runtime_, const VideoCommon::ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) - : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_) { - if (CanBeAccelerated(runtime, info)) { + : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), runtime{&runtime_} { + if (CanBeAccelerated(*runtime, info)) { flags |= ImageFlagBits::AcceleratedUpload; } - if (IsConverted(runtime.device, info.format, info.type)) { + if (IsConverted(runtime->device, info.format, info.type)) { flags |= ImageFlagBits::Converted; gl_internal_format = IsPixelFormatSRGB(info.format) ? GL_SRGB8_ALPHA8 : GL_RGBA8; gl_format = GL_RGBA; @@ -622,51 +673,11 @@ Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_, gl_format = tuple.format; gl_type = tuple.type; } - const GLenum target = ImageTarget(info); - const GLsizei width = info.size.width; - const GLsizei height = info.size.height; - const GLsizei depth = info.size.depth; - const int max_host_mip_levels = std::bit_width(info.size.width); - const GLsizei num_levels = std::min(info.resources.levels, max_host_mip_levels); - const GLsizei num_layers = info.resources.layers; - const GLsizei num_samples = info.num_samples; - - GLuint handle = 0; - if (target != GL_TEXTURE_BUFFER) { - texture.Create(target); - handle = texture.handle; - } - switch (target) { - case GL_TEXTURE_1D_ARRAY: - glTextureStorage2D(handle, num_levels, gl_internal_format, width, num_layers); - break; - case GL_TEXTURE_2D_ARRAY: - glTextureStorage3D(handle, num_levels, gl_internal_format, width, height, num_layers); - break; - case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: { - // TODO: Where should 'fixedsamplelocations' come from? - const auto [samples_x, samples_y] = SamplesLog2(info.num_samples); - glTextureStorage3DMultisample(handle, num_samples, gl_internal_format, width >> samples_x, - height >> samples_y, num_layers, GL_FALSE); - break; - } - case GL_TEXTURE_RECTANGLE: - glTextureStorage2D(handle, num_levels, gl_internal_format, width, height); - break; - case GL_TEXTURE_3D: - glTextureStorage3D(handle, num_levels, gl_internal_format, width, height, depth); - break; - case GL_TEXTURE_BUFFER: - UNREACHABLE(); - break; - default: - UNREACHABLE_MSG("Invalid target=0x{:x}", target); - break; - } - if (runtime.device.HasDebuggingToolAttached()) { + texture = MakeImage(info, gl_internal_format); + if (runtime->device.HasDebuggingToolAttached()) { const std::string name = VideoCommon::Name(*this); - glObjectLabel(target == GL_TEXTURE_BUFFER ? GL_BUFFER : GL_TEXTURE, handle, - static_cast(name.size()), name.data()); + glObjectLabel(ImageTarget(info) == GL_TEXTURE_BUFFER ? GL_BUFFER : GL_TEXTURE, + texture.handle, static_cast(name.size()), name.data()); } } @@ -855,7 +866,7 @@ void Image::CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t b } } -void Image::Scale() { +void Image::Scale(u32 up, u32 down) { // TODO: Pass scaling factor? if (gl_format == 0 || gl_type == 0) { // compressed textures @@ -902,12 +913,22 @@ void Image::Scale() { glBindFramebuffer(GL_READ_FRAMEBUFFER, fbo_handle); glNamedFramebufferTexture(fbo_handle, attachment, texture.handle, 0); - const size_t scaled_width = info.size.width; - const size_t scaled_height = info.size.height * 2; - glBlitNamedFramebuffer(fbo_handle, fbo_handle, 0, 0, info.size.width, info.size.height, 0, 0, + const auto scale_up = [&](u32 value) { return std::max((value * up) >> down, 1U); }; + const u32 scaled_width = scale_up(info.size.width); + const u32 scaled_height = scale_up(info.size.height); + const u32 original_width = info.size.width; + const u32 original_height = info.size.height; + + auto scaled_info = info; + scaled_info.size.width = scaled_width; + scaled_info.size.height = scaled_height; + auto scaled_texture = MakeImage(scaled_info, gl_internal_format); + + glBlitNamedFramebuffer(fbo_handle, fbo_handle, 0, 0, original_width, original_height, 0, 0, scaled_width, scaled_height, mask, filter); - // TODO: resize texture? - glCopyTextureSubImage3D(texture.handle, 0, 0, 0, 0, 0, 0, scaled_width, scaled_height / 2); + glCopyTextureSubImage3D(scaled_texture.handle, 0, 0, 0, 0, 0, 0, scaled_width, scaled_height); + texture = std::move(scaled_texture); + // Restore previous framebuffers glBindFramebuffer(GL_DRAW_FRAMEBUFFER, prev_draw_fbo); glBindFramebuffer(GL_READ_FRAMEBUFFER, prev_read_fbo); @@ -918,7 +939,8 @@ bool Image::ScaleUp() { return false; } flags |= ImageFlagBits::Rescaled; - //Scale(); + const auto& resolution = runtime->resolution; + Scale(resolution.up_scale, resolution.down_shift); return true; } @@ -927,7 +949,9 @@ bool Image::ScaleDown() { return false; } flags &= ~ImageFlagBits::Rescaled; - //Scale(); + UNIMPLEMENTED(); + // const auto& resolution = runtime->resolution; + // Scale(); return true; } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 324a0f1cb..77ca14132 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -9,6 +9,7 @@ #include +#include "common/settings.h" #include "shader_recompiler/shader_info.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/util_shaders.h" @@ -72,7 +73,7 @@ public: StateTracker& state_tracker); ~TextureCacheRuntime(); - void Init() {} + void Init(); void Finish(); @@ -153,6 +154,9 @@ private: OGLTextureView null_image_view_cube; std::array null_image_views{}; + + Settings::ResolutionScalingInfo resolution; + bool is_rescaling_on{}; }; class Image : public VideoCommon::ImageBase { @@ -198,13 +202,14 @@ private: void CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset); - void Scale(); + void Scale(u32 up, u32 down); OGLTexture texture; OGLTextureView store_view; GLenum gl_internal_format = GL_NONE; GLenum gl_format = GL_NONE; GLenum gl_type = GL_NONE; + TextureCacheRuntime* runtime; }; class ImageView : public VideoCommon::ImageViewBase { From 973f8f1d08995e4b3e3e849b6c5fa77e5ce824a7 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Wed, 21 Jul 2021 02:55:55 -0300 Subject: [PATCH 011/156] Fix blits --- .../renderer_vulkan/vk_texture_cache.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 54236e87f..b061ea08b 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1111,9 +1111,10 @@ bool Image::ScaleUp(bool save_as_backup) { }; const bool is_2d = info.type == ImageType::e2D; - boost::container::small_vector vkRegions(info.resources.levels); + boost::container::small_vector regions; + regions.reserve(info.resources.levels); for (s32 level = 0; level < info.resources.levels; level++) { - VkImageBlit blit{ + regions.push_back({ .srcSubresource{ .aspectMask = aspect_mask, .mipLevel = u32(level), @@ -1150,10 +1151,9 @@ bool Image::ScaleUp(bool save_as_backup) { .z = 1, }, }, - }; - vkRegions.push_back(blit); + }); } - BlitScale(*scheduler, *image, *rescaled_image, vkRegions, aspect_mask); + BlitScale(*scheduler, *image, *rescaled_image, regions, aspect_mask); return true; } @@ -1193,9 +1193,10 @@ bool Image::ScaleDown(bool save_as_backup) { } const bool is_2d = info.type == ImageType::e2D; - boost::container::small_vector vkRegions(info.resources.levels); + boost::container::small_vector regions; + regions.reserve(info.resources.levels); for (s32 level = 0; level < info.resources.levels; level++) { - VkImageBlit blit{ + regions.push_back({ .srcSubresource{ .aspectMask = aspect_mask, .mipLevel = u32(level), @@ -1232,10 +1233,9 @@ bool Image::ScaleDown(bool save_as_backup) { .z = 1, }, }, - }; - vkRegions.push_back(blit); + }); } - BlitScale(*scheduler, *image, *downscaled_image, vkRegions, aspect_mask); + BlitScale(*scheduler, *image, *downscaled_image, regions, aspect_mask); if (save_as_backup) { backup_image = std::move(image); backup_commit = std::move(commit); From d464b122d5385105a6f4b0bc34bc1695c706417c Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Wed, 21 Jul 2021 03:07:18 -0300 Subject: [PATCH 012/156] Fix blits with mips --- .../renderer_vulkan/vk_texture_cache.cpp | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index b061ea08b..4e05058c9 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1128,8 +1128,8 @@ bool Image::ScaleUp(bool save_as_backup) { .z = 0, }, { - .x = s32(info.size.width), - .y = s32(info.size.height), + .x = static_cast(info.size.width), + .y = static_cast(info.size.height), .z = 1, }, }, @@ -1146,8 +1146,10 @@ bool Image::ScaleUp(bool save_as_backup) { .z = 0, }, { - .x = s32(scale_up(info.size.width)), - .y = is_2d ? s32(scale_up(info.size.height)) : s32(info.size.height), + .x = std::max(1, static_cast(scale_up(info.size.width)) >> level), + .y = std::max(1, static_cast(is_2d ? scale_up(info.size.height) + : info.size.height) >> + level), .z = 1, }, }, @@ -1199,9 +1201,9 @@ bool Image::ScaleDown(bool save_as_backup) { regions.push_back({ .srcSubresource{ .aspectMask = aspect_mask, - .mipLevel = u32(level), + .mipLevel = static_cast(level), .baseArrayLayer = 0, - .layerCount = u32(info.resources.layers), + .layerCount = static_cast(info.resources.layers), }, .srcOffsets{ { @@ -1210,16 +1212,18 @@ bool Image::ScaleDown(bool save_as_backup) { .z = 0, }, { - .x = s32(scale_up(info.size.width)), - .y = is_2d ? s32(scale_up(info.size.height)) : s32(info.size.height), + .x = std::max(1, static_cast(scale_up(info.size.width)) >> level), + .y = std::max(1, static_cast(is_2d ? scale_up(info.size.height) + : info.size.height) >> + level), .z = 1, }, }, .dstSubresource{ .aspectMask = aspect_mask, - .mipLevel = u32(level), + .mipLevel = static_cast(level), .baseArrayLayer = 0, - .layerCount = u32(info.resources.layers), + .layerCount = static_cast(info.resources.layers), }, .dstOffsets{ { @@ -1228,8 +1232,8 @@ bool Image::ScaleDown(bool save_as_backup) { .z = 0, }, { - .x = s32(info.size.width), - .y = s32(info.size.height), + .x = std::max(1, static_cast(info.size.width) >> level), + .y = std::max(1, static_cast(info.size.height) >> level), .z = 1, }, }, From dfc65cd0a3b259588b47db1d32c827e7fc071aeb Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Wed, 21 Jul 2021 10:15:09 +0200 Subject: [PATCH 013/156] Texture Cache: Implement Rescaling on Aliases and Blits. --- src/video_core/texture_cache/texture_cache.h | 58 ++++++++++++++++++-- 1 file changed, 53 insertions(+), 5 deletions(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index df697cdeb..25fea8240 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -437,8 +437,32 @@ void TextureCache

::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, PrepareImage(src_id, false, false); PrepareImage(dst_id, true, false); - ImageBase& dst_image = slot_images[dst_id]; - const ImageBase& src_image = slot_images[src_id]; + Image& dst_image = slot_images[dst_id]; + const Image& src_image = slot_images[src_id]; + + const bool is_src_rescaled = True(src_image.flags & ImageFlagBits::Rescaled); + bool is_dst_rescaled = True(dst_image.flags & ImageFlagBits::Rescaled); + + if (is_src_rescaled && !is_dst_rescaled) { + if (ImageCanRescale(dst_image)) { + is_dst_rescaled = dst_image.ScaleUp(); + } + } + + const auto& resolution = Settings::values.resolution_info; + const auto scale_up = [&](u32 value) -> u32 { + if (value == 0) { + return 0U; + } + return std::max((value * resolution.up_scale) >> resolution.down_shift, 1U); + }; + + const auto scale_region = [&](Region2D& region) { + region.start.x = scale_up(region.start.x); + region.start.y = scale_up(region.start.y); + region.end.x = scale_up(region.end.x); + region.end.y = scale_up(region.end.y); + }; // TODO: Deduplicate const std::optional src_base = src_image.TryFindBase(src.Address()); @@ -446,20 +470,26 @@ void TextureCache

::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, const ImageViewInfo src_view_info(ImageViewType::e2D, images.src_format, src_range); const auto [src_framebuffer_id, src_view_id] = RenderTargetFromImage(src_id, src_view_info); const auto [src_samples_x, src_samples_y] = SamplesLog2(src_image.info.num_samples); - const Region2D src_region{ + Region2D src_region{ Offset2D{.x = copy.src_x0 >> src_samples_x, .y = copy.src_y0 >> src_samples_y}, Offset2D{.x = copy.src_x1 >> src_samples_x, .y = copy.src_y1 >> src_samples_y}, }; + if (is_src_rescaled) { + scale_region(src_region); + } const std::optional dst_base = dst_image.TryFindBase(dst.Address()); const SubresourceRange dst_range{.base = dst_base.value(), .extent = {1, 1}}; const ImageViewInfo dst_view_info(ImageViewType::e2D, images.dst_format, dst_range); const auto [dst_framebuffer_id, dst_view_id] = RenderTargetFromImage(dst_id, dst_view_info); const auto [dst_samples_x, dst_samples_y] = SamplesLog2(dst_image.info.num_samples); - const Region2D dst_region{ + Region2D dst_region{ Offset2D{.x = copy.dst_x0 >> dst_samples_x, .y = copy.dst_y0 >> dst_samples_y}, Offset2D{.x = copy.dst_x1 >> dst_samples_x, .y = copy.dst_y1 >> dst_samples_y}, }; + if (is_dst_rescaled) { + scale_region(dst_region); + } // Always call this after src_framebuffer_id was queried, as the address might be invalidated. Framebuffer* const dst_framebuffer = &slot_framebuffers[dst_framebuffer_id]; @@ -1514,18 +1544,28 @@ void TextureCache

::MarkModification(ImageBase& image) noexcept { template void TextureCache

::SynchronizeAliases(ImageId image_id) { boost::container::small_vector aliased_images; - ImageBase& image = slot_images[image_id]; + Image& image = slot_images[image_id]; + bool any_rescaled = True(image.flags & ImageFlagBits::Rescaled); u64 most_recent_tick = image.modification_tick; for (const AliasedImage& aliased : image.aliased_images) { ImageBase& aliased_image = slot_images[aliased.id]; if (image.modification_tick < aliased_image.modification_tick) { most_recent_tick = std::max(most_recent_tick, aliased_image.modification_tick); aliased_images.push_back(&aliased); + any_rescaled |= True(image.flags & ImageFlagBits::Rescaled); } } if (aliased_images.empty()) { return; } + const bool can_rescale = ImageCanRescale(image); + if (any_rescaled) { + if (can_rescale) { + ScaleUp(image); + } else { + ScaleDown(image); + } + } image.modification_tick = most_recent_tick; std::ranges::sort(aliased_images, [this](const AliasedImage* lhs, const AliasedImage* rhs) { const ImageBase& lhs_image = slot_images[lhs->id]; @@ -1533,6 +1573,14 @@ void TextureCache

::SynchronizeAliases(ImageId image_id) { return lhs_image.modification_tick < rhs_image.modification_tick; }); for (const AliasedImage* const aliased : aliased_images) { + if (any_rescaled) { + Image& aliased_image = slot_images[aliased->id]; + if (can_rescale) { + ScaleUp(aliased_image); + } else { + ScaleDown(aliased_image); + } + } CopyImage(image_id, aliased->id, aliased->copies); } } From 0a6c895af76503e7658f8660cc1af097e5d9c11f Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Wed, 21 Jul 2021 20:50:15 -0400 Subject: [PATCH 014/156] gl_texture_cache: Rescale fixes for multi-layered textures --- .../renderer_opengl/gl_texture_cache.cpp | 47 ++++++++++++------- .../renderer_opengl/gl_texture_cache.h | 1 + 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index ecde5b600..53b5c0947 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -479,6 +479,9 @@ TextureCacheRuntime::~TextureCacheRuntime() = default; void TextureCacheRuntime::Init() { resolution = Settings::values.resolution_info; is_rescaling_on = resolution.up_scale != 1 || resolution.down_shift != 0; + if (is_rescaling_on) { + rescale_fbo.Create(); + } } void TextureCacheRuntime::Finish() { @@ -867,8 +870,10 @@ void Image::CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t b } void Image::Scale(u32 up, u32 down) { - // TODO: Pass scaling factor? - if (gl_format == 0 || gl_type == 0) { + if (!runtime->is_rescaling_on) { + return; + } + if (gl_format == 0 && gl_type == 0) { // compressed textures return; } @@ -876,9 +881,7 @@ void Image::Scale(u32 up, u32 down) { UNIMPLEMENTED(); return; } - GLint prev_draw_fbo; GLint prev_read_fbo; - glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &prev_draw_fbo); glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &prev_read_fbo); const GLenum attachment = [this] { switch (GetFormatType(info.format)) { @@ -907,15 +910,10 @@ void Image::Scale(u32 up, u32 down) { } }(); const GLenum filter = (mask & GL_COLOR_BUFFER_BIT) != 0 ? GL_LINEAR : GL_NEAREST; - GLuint fbo_handle; - glGenFramebuffers(1, &fbo_handle); - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, fbo_handle); - glBindFramebuffer(GL_READ_FRAMEBUFFER, fbo_handle); - glNamedFramebufferTexture(fbo_handle, attachment, texture.handle, 0); - const auto scale_up = [&](u32 value) { return std::max((value * up) >> down, 1U); }; + const bool is_2d = info.type == ImageType::e2D; const u32 scaled_width = scale_up(info.size.width); - const u32 scaled_height = scale_up(info.size.height); + const u32 scaled_height = is_2d ? scale_up(info.size.height) : info.size.height; const u32 original_width = info.size.width; const u32 original_height = info.size.height; @@ -923,14 +921,31 @@ void Image::Scale(u32 up, u32 down) { scaled_info.size.width = scaled_width; scaled_info.size.height = scaled_height; auto scaled_texture = MakeImage(scaled_info, gl_internal_format); - - glBlitNamedFramebuffer(fbo_handle, fbo_handle, 0, 0, original_width, original_height, 0, 0, - scaled_width, scaled_height, mask, filter); - glCopyTextureSubImage3D(scaled_texture.handle, 0, 0, 0, 0, 0, 0, scaled_width, scaled_height); + const auto& blit_fbo = runtime->rescale_fbo; + for (s32 level = 0; level < info.resources.levels; ++level) { + const u32 level_width = scaled_width >> level; + const u32 level_height = scaled_height >> level; + glBindFramebuffer(GL_READ_FRAMEBUFFER, blit_fbo.handle); + glNamedFramebufferTexture(blit_fbo.handle, attachment, texture.handle, level); + glBlitNamedFramebuffer(blit_fbo.handle, blit_fbo.handle, 0, 0, original_width, + original_height, 0, 0, level_width, level_height, mask, filter); + switch (info.type) { + case ImageType::e1D: + glCopyTextureSubImage2D(scaled_texture.handle, level, 0, 0, 0, 0, level_width, + level_height); + break; + case ImageType::e2D: + glCopyTextureSubImage3D(scaled_texture.handle, level, 0, 0, 0, 0, 0, level_width, + level_height); + break; + case ImageType::e3D: + default: + UNREACHABLE(); + } + } texture = std::move(scaled_texture); // Restore previous framebuffers - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, prev_draw_fbo); glBindFramebuffer(GL_READ_FRAMEBUFFER, prev_read_fbo); } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 77ca14132..03de50ad5 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -155,6 +155,7 @@ private: std::array null_image_views{}; + OGLFramebuffer rescale_fbo; Settings::ResolutionScalingInfo resolution; bool is_rescaling_on{}; }; From fddf372c689d466372ac94b6920beb883c7740e4 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Wed, 21 Jul 2021 21:23:00 -0400 Subject: [PATCH 015/156] gl_texture_cache: Implement ScaleDown --- .../renderer_opengl/gl_texture_cache.cpp | 60 +++++++++++-------- .../renderer_opengl/gl_texture_cache.h | 2 +- 2 files changed, 36 insertions(+), 26 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 53b5c0947..34f74e37d 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -869,17 +869,17 @@ void Image::CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t b } } -void Image::Scale(u32 up, u32 down) { +bool Image::Scale(bool scale_src, bool scale_dst) { if (!runtime->is_rescaling_on) { - return; + return false; } if (gl_format == 0 && gl_type == 0) { // compressed textures - return; + return false; } if (info.type == ImageType::Linear) { UNIMPLEMENTED(); - return; + return false; } GLint prev_read_fbo; glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &prev_read_fbo); @@ -910,43 +910,58 @@ void Image::Scale(u32 up, u32 down) { } }(); const GLenum filter = (mask & GL_COLOR_BUFFER_BIT) != 0 ? GL_LINEAR : GL_NEAREST; - const auto scale_up = [&](u32 value) { return std::max((value * up) >> down, 1U); }; const bool is_2d = info.type == ImageType::e2D; + const auto& resolution = runtime->resolution; + const u32 up = resolution.up_scale; + const u32 down = resolution.down_shift; + + const auto scale_up = [&](u32 value) { return std::max((value * up) >> down, 1U); }; const u32 scaled_width = scale_up(info.size.width); const u32 scaled_height = is_2d ? scale_up(info.size.height) : info.size.height; const u32 original_width = info.size.width; const u32 original_height = info.size.height; - auto scaled_info = info; - scaled_info.size.width = scaled_width; - scaled_info.size.height = scaled_height; - auto scaled_texture = MakeImage(scaled_info, gl_internal_format); + const u32 src_width = scale_src ? scaled_width : original_width; + const u32 src_height = scale_src ? scaled_height : original_height; + const u32 dst_width = scale_dst ? scaled_width : original_width; + const u32 dst_height = scale_dst ? scaled_height : original_height; + + auto dst_info = info; + dst_info.size.width = dst_width; + dst_info.size.height = dst_height; + auto dst_texture = MakeImage(dst_info, gl_internal_format); + const auto& blit_fbo = runtime->rescale_fbo; for (s32 level = 0; level < info.resources.levels; ++level) { - const u32 level_width = scaled_width >> level; - const u32 level_height = scaled_height >> level; + const u32 src_level_width = std::max(1u, src_width >> level); + const u32 src_level_height = std::max(1u, src_height >> level); + const u32 dst_level_width = std::max(1u, dst_width >> level); + const u32 dst_level_height = std::max(1u, dst_height >> level); + glBindFramebuffer(GL_READ_FRAMEBUFFER, blit_fbo.handle); glNamedFramebufferTexture(blit_fbo.handle, attachment, texture.handle, level); - glBlitNamedFramebuffer(blit_fbo.handle, blit_fbo.handle, 0, 0, original_width, - original_height, 0, 0, level_width, level_height, mask, filter); + glBlitNamedFramebuffer(blit_fbo.handle, blit_fbo.handle, 0, 0, src_level_width, + src_level_height, 0, 0, dst_level_width, dst_level_height, mask, + filter); switch (info.type) { case ImageType::e1D: - glCopyTextureSubImage2D(scaled_texture.handle, level, 0, 0, 0, 0, level_width, - level_height); + glCopyTextureSubImage2D(dst_texture.handle, level, 0, 0, 0, 0, dst_level_width, + dst_level_height); break; case ImageType::e2D: - glCopyTextureSubImage3D(scaled_texture.handle, level, 0, 0, 0, 0, 0, level_width, - level_height); + glCopyTextureSubImage3D(dst_texture.handle, level, 0, 0, 0, 0, 0, dst_level_width, + dst_level_height); break; case ImageType::e3D: default: UNREACHABLE(); } } - texture = std::move(scaled_texture); + texture = std::move(dst_texture); // Restore previous framebuffers glBindFramebuffer(GL_READ_FRAMEBUFFER, prev_read_fbo); + return true; } bool Image::ScaleUp() { @@ -954,9 +969,7 @@ bool Image::ScaleUp() { return false; } flags |= ImageFlagBits::Rescaled; - const auto& resolution = runtime->resolution; - Scale(resolution.up_scale, resolution.down_shift); - return true; + return Scale(false, true); } bool Image::ScaleDown() { @@ -964,10 +977,7 @@ bool Image::ScaleDown() { return false; } flags &= ~ImageFlagBits::Rescaled; - UNIMPLEMENTED(); - // const auto& resolution = runtime->resolution; - // Scale(); - return true; + return Scale(true, false); } ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info, diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 03de50ad5..f2e48b4c7 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -203,7 +203,7 @@ private: void CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset); - void Scale(u32 up, u32 down); + bool Scale(bool scale_src, bool scale_dst); OGLTexture texture; OGLTextureView store_view; From fb924ea85c22353c1a6f108d38372ab14355695b Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Wed, 21 Jul 2021 22:24:33 -0300 Subject: [PATCH 016/156] shader: Add resolution down factor opcode --- .../backend/glasm/emit_glasm_instructions.h | 1 + .../backend/glasm/emit_glasm_not_implemented.cpp | 5 +++++ .../backend/glsl/emit_glsl_context_get_set.cpp | 5 +++++ src/shader_recompiler/backend/glsl/emit_glsl_instructions.h | 1 + .../backend/spirv/emit_spirv_context_get_set.cpp | 5 +++++ .../backend/spirv/emit_spirv_instructions.h | 1 + src/shader_recompiler/frontend/ir/ir_emitter.cpp | 4 ++++ src/shader_recompiler/frontend/ir/ir_emitter.h | 2 ++ src/shader_recompiler/frontend/ir/opcodes.inc | 1 + 9 files changed, 25 insertions(+) diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h b/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h index 12afda43b..cb7232704 100644 --- a/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h +++ b/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h @@ -72,6 +72,7 @@ void EmitInvocationId(EmitContext& ctx, IR::Inst& inst); void EmitSampleId(EmitContext& ctx, IR::Inst& inst); void EmitIsHelperInvocation(EmitContext& ctx, IR::Inst& inst); void EmitYDirection(EmitContext& ctx, IR::Inst& inst); +void EmitResolutionDownFactor(EmitContext& ctx, IR::Inst& inst); void EmitLoadLocal(EmitContext& ctx, IR::Inst& inst, ScalarU32 word_offset); void EmitWriteLocal(EmitContext& ctx, ScalarU32 word_offset, ScalarU32 value); void EmitUndefU1(EmitContext& ctx, IR::Inst& inst); diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp b/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp index e537f6073..807494063 100644 --- a/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp +++ b/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp @@ -210,6 +210,11 @@ void EmitYDirection(EmitContext& ctx, IR::Inst& inst) { ctx.Add("MOV.F {}.x,y_direction[0].w;", inst); } +void EmitResolutionDownFactor(EmitContext& ctx, IR::Inst& inst) { + UNIMPLEMENTED(); + ctx.Add("MOV.F {}.x,1;", inst); +} + void EmitUndefU1(EmitContext& ctx, IR::Inst& inst) { ctx.Add("MOV.S {}.x,0;", inst); } diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp index 170db269a..f4ed090e3 100644 --- a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp +++ b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp @@ -445,6 +445,11 @@ void EmitYDirection(EmitContext& ctx, IR::Inst& inst) { ctx.AddF32("{}=gl_FrontMaterial.ambient.a;", inst); } +void EmitResolutionDownFactor(EmitContext& ctx, IR::Inst& inst) { + UNIMPLEMENTED(); + ctx.AddF32("{}=1.0f;", inst); +} + void EmitLoadLocal(EmitContext& ctx, IR::Inst& inst, std::string_view word_offset) { ctx.AddU32("{}=lmem[{}];", inst, word_offset); } diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h b/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h index 5936d086f..6cae0b84a 100644 --- a/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h +++ b/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h @@ -85,6 +85,7 @@ void EmitInvocationId(EmitContext& ctx, IR::Inst& inst); void EmitSampleId(EmitContext& ctx, IR::Inst& inst); void EmitIsHelperInvocation(EmitContext& ctx, IR::Inst& inst); void EmitYDirection(EmitContext& ctx, IR::Inst& inst); +void EmitResolutionDownFactor(EmitContext& ctx, IR::Inst& inst); void EmitLoadLocal(EmitContext& ctx, IR::Inst& inst, std::string_view word_offset); void EmitWriteLocal(EmitContext& ctx, std::string_view word_offset, std::string_view value); void EmitUndefU1(EmitContext& ctx, IR::Inst& inst); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index d3a93d5f4..43f440dfb 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -526,6 +526,11 @@ Id EmitYDirection(EmitContext& ctx) { return ctx.Const(ctx.runtime_info.y_negate ? -1.0f : 1.0f); } +Id EmitResolutionDownFactor(EmitContext& ctx) { + UNIMPLEMENTED(); + return ctx.Const(1.0f); +} + Id EmitLoadLocal(EmitContext& ctx, Id word_offset) { const Id pointer{ctx.OpAccessChain(ctx.private_u32, ctx.local_memory, word_offset)}; return ctx.OpLoad(ctx.U32[1], pointer); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index c9db1c164..3d90b2286 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -75,6 +75,7 @@ Id EmitInvocationId(EmitContext& ctx); Id EmitSampleId(EmitContext& ctx); Id EmitIsHelperInvocation(EmitContext& ctx); Id EmitYDirection(EmitContext& ctx); +Id EmitResolutionDownFactor(EmitContext& ctx); Id EmitLoadLocal(EmitContext& ctx, Id word_offset); void EmitWriteLocal(EmitContext& ctx, Id word_offset, Id value); Id EmitUndefU1(EmitContext& ctx); diff --git a/src/shader_recompiler/frontend/ir/ir_emitter.cpp b/src/shader_recompiler/frontend/ir/ir_emitter.cpp index 13159a68d..9ae5da2a1 100644 --- a/src/shader_recompiler/frontend/ir/ir_emitter.cpp +++ b/src/shader_recompiler/frontend/ir/ir_emitter.cpp @@ -375,6 +375,10 @@ F32 IREmitter::YDirection() { return Inst(Opcode::YDirection); } +F32 IREmitter::ResolutionDownFactor() { + return Inst(Opcode::ResolutionDownFactor); +} + U32 IREmitter::LaneId() { return Inst(Opcode::LaneId); } diff --git a/src/shader_recompiler/frontend/ir/ir_emitter.h b/src/shader_recompiler/frontend/ir/ir_emitter.h index 1b89ca5a0..0c664d2fe 100644 --- a/src/shader_recompiler/frontend/ir/ir_emitter.h +++ b/src/shader_recompiler/frontend/ir/ir_emitter.h @@ -102,6 +102,8 @@ public: [[nodiscard]] U1 IsHelperInvocation(); [[nodiscard]] F32 YDirection(); + [[nodiscard]] F32 ResolutionDownFactor(); + [[nodiscard]] U32 LaneId(); [[nodiscard]] U32 LoadGlobalU8(const U64& address); diff --git a/src/shader_recompiler/frontend/ir/opcodes.inc b/src/shader_recompiler/frontend/ir/opcodes.inc index d91098c80..72751c5a0 100644 --- a/src/shader_recompiler/frontend/ir/opcodes.inc +++ b/src/shader_recompiler/frontend/ir/opcodes.inc @@ -62,6 +62,7 @@ OPCODE(InvocationId, U32, OPCODE(SampleId, U32, ) OPCODE(IsHelperInvocation, U1, ) OPCODE(YDirection, F32, ) +OPCODE(ResolutionDownFactor, F32, ) // Undefined OPCODE(UndefU1, U1, ) From 1672e9ba092f6bc268ece7619c4bae793c00c580 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Wed, 21 Jul 2021 22:25:34 -0300 Subject: [PATCH 017/156] shader: Fix resolution scaling pass --- src/shader_recompiler/CMakeLists.txt | 1 + .../ir_opt/collect_shader_info_pass.cpp | 3 + src/shader_recompiler/ir_opt/passes.h | 1 + .../ir_opt/rescaling_pass.cpp | 60 ++++++++----------- src/shader_recompiler/shader_info.h | 2 +- 5 files changed, 32 insertions(+), 35 deletions(-) diff --git a/src/shader_recompiler/CMakeLists.txt b/src/shader_recompiler/CMakeLists.txt index b5b7e5e83..bc3df80c8 100644 --- a/src/shader_recompiler/CMakeLists.txt +++ b/src/shader_recompiler/CMakeLists.txt @@ -221,6 +221,7 @@ add_library(shader_recompiler STATIC ir_opt/lower_fp16_to_fp32.cpp ir_opt/lower_int64_to_int32.cpp ir_opt/passes.h + ir_opt/rescaling_pass.cpp ir_opt/ssa_rewrite_pass.cpp ir_opt/texture_pass.cpp ir_opt/verification_pass.cpp diff --git a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp index f69e1c9cc..ef918f4d4 100644 --- a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp +++ b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp @@ -430,6 +430,9 @@ void VisitUsages(Info& info, IR::Inst& inst) { case IR::Opcode::IsHelperInvocation: info.uses_is_helper_invocation = true; break; + case IR::Opcode::ResolutionDownFactor: + info.uses_rescaling_uniform = true; + break; case IR::Opcode::LaneId: info.uses_subgroup_invocation_id = true; break; diff --git a/src/shader_recompiler/ir_opt/passes.h b/src/shader_recompiler/ir_opt/passes.h index 2f89b1ea0..f877c7ba0 100644 --- a/src/shader_recompiler/ir_opt/passes.h +++ b/src/shader_recompiler/ir_opt/passes.h @@ -19,6 +19,7 @@ void GlobalMemoryToStorageBufferPass(IR::Program& program); void IdentityRemovalPass(IR::Program& program); void LowerFp16ToFp32(IR::Program& program); void LowerInt64ToInt32(IR::Program& program); +void RescalingPass(IR::Program& program); void SsaRewritePass(IR::Program& program); void TexturePass(Environment& env, IR::Program& program); void VerificationPass(const IR::Program& program); diff --git a/src/shader_recompiler/ir_opt/rescaling_pass.cpp b/src/shader_recompiler/ir_opt/rescaling_pass.cpp index d3ae3f159..293593c78 100644 --- a/src/shader_recompiler/ir_opt/rescaling_pass.cpp +++ b/src/shader_recompiler/ir_opt/rescaling_pass.cpp @@ -3,7 +3,9 @@ // Refer to the license.txt file included. #include "common/alignment.h" +#include "common/settings.h" #include "shader_recompiler/environment.h" +#include "shader_recompiler/frontend/ir/ir_emitter.h" #include "shader_recompiler/frontend/ir/modifiers.h" #include "shader_recompiler/frontend/ir/program.h" #include "shader_recompiler/frontend/ir/value.h" @@ -12,59 +14,49 @@ namespace Shader::Optimization { namespace { - -void PatchFragCoord(IR::Inst& inst) { +void PatchFragCoord(IR::Block& block, IR::Inst& inst) { IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; - const IR::F32 inv_resolution_factor = IR::F32{Settings::values.resolution_info.down_factor}; - const IR::F32 new_get_attribute = ir.GetAttribute(inst.Arg(0).Attribute()); - const IR::F32 mul = ir.FMul(new_get_attribute, inv_resolution_factor); - const IR::U1 should_rescale = IR::U1{true}; - const IR::F32 selection = ir.Select(should_rescale, mul, new_get_attribute); - inst.ReplaceUsesWith(selection); + const IR::F32 down_factor{ir.ResolutionDownFactor()}; + const IR::F32 frag_coord{&inst}; + const IR::F32 downscaled_frag_coord{ir.FPMul(frag_coord, down_factor)}; + inst.ReplaceUsesWith(downscaled_frag_coord); } -void Visit(Info& info, IR::Inst& inst) { - info.requires_rescaling_uniform = false; +void Visit(const IR::Program& program, IR::Block& block, IR::Inst& inst) { + const bool is_fragment_shader{program.stage == Stage::Fragment}; switch (inst.GetOpcode()) { case IR::Opcode::GetAttribute: { - conast auto attrib = inst.Arg(0).Attribute(); - const bool is_frag = - attrib == IR::Attribute::PositionX || attrib == IR::Attribute::PositionY; - const bool must_path = is_frag && program.stage == Stage::Fragment; - if (must_path) { - PatchFragCoord(inst); - info.requires_rescaling_uniform = true; + const IR::Attribute attr{inst.Arg(0).Attribute()}; + switch (attr) { + case IR::Attribute::PositionX: + case IR::Attribute::PositionY: + if (is_fragment_shader) { + PatchFragCoord(block, inst); + } + break; + default: + break; } break; } - case IR::Opcode::ImageQueryDimensions: { - info.requires_rescaling_uniform |= true; + case IR::Opcode::ImageQueryDimensions: break; - } - case IR::Opcode::ImageFetch: { - info.requires_rescaling_uniform |= true; + case IR::Opcode::ImageFetch: break; - } - case IR::Opcode::ImageRead: { - info.requires_rescaling_uniform |= true; + case IR::Opcode::ImageRead: break; - } - case IR::Opcode::ImageWrite: { - info.requires_rescaling_uniform |= true; + case IR::Opcode::ImageWrite: break; - } default: break; } } +} // Anonymous namespace -} // namespace - -void RescalingPass(Environment& env, IR::Program& program) { - Info& info{program.info}; +void RescalingPass(IR::Program& program) { for (IR::Block* const block : program.post_order_blocks) { for (IR::Inst& inst : block->Instructions()) { - Visit(info, inst); + Visit(program, *block, inst); } } } diff --git a/src/shader_recompiler/shader_info.h b/src/shader_recompiler/shader_info.h index e7981a08c..7bac9e2cd 100644 --- a/src/shader_recompiler/shader_info.h +++ b/src/shader_recompiler/shader_info.h @@ -172,7 +172,7 @@ struct Info { bool uses_global_memory{}; bool uses_atomic_image_u32{}; bool uses_shadow_lod{}; - bool requires_rescaling_uniform{}; + bool uses_rescaling_uniform{}; IR::Type used_constant_buffer_types{}; IR::Type used_storage_buffer_types{}; From 520c4a44f6dfa78a462517c807d2e99d4775b84e Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Wed, 21 Jul 2021 22:39:05 -0300 Subject: [PATCH 018/156] vk_texture_cache: Fix scaling blit validation errors --- .../renderer_vulkan/vk_texture_cache.cpp | 159 +++++++++--------- 1 file changed, 78 insertions(+), 81 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 4e05058c9..d95eeafb9 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -593,6 +593,82 @@ struct RangedBarrierRange { UNREACHABLE_MSG("Invalid image format={}", format); return VK_FORMAT_R32_UINT; } + +void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, + boost::container::small_vector&& blit_regions, + VkImageAspectFlags aspect_mask) { + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([dst_image, src_image, aspect_mask, + regions = std::move(blit_regions)](vk::CommandBuffer cmdbuf) { + const VkImageSubresourceRange subresource_range{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }; + const std::array read_barriers{ + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = src_image, + .subresourceRange = subresource_range, + }, + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED, // Discard contents + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = dst_image, + .subresourceRange = subresource_range, + }, + }; + const std::array write_barriers{ + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = 0, + .dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT | VK_ACCESS_MEMORY_READ_BIT, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = src_image, + .subresourceRange = subresource_range, + }, + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT | VK_ACCESS_MEMORY_READ_BIT, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = dst_image, + .subresourceRange = subresource_range, + }, + }; + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, nullptr, nullptr, read_barriers); + cmdbuf.BlitImage(src_image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst_image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, regions, VK_FILTER_NEAREST); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + 0, nullptr, nullptr, write_barriers); + }); +} } // Anonymous namespace void TextureCacheRuntime::Init() { @@ -983,85 +1059,6 @@ void Image::DownloadMemory(const StagingBufferRef& map, std::span& blit_regions, - VkImageAspectFlags aspect_mask) { - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([dst_image, src_image, aspect_mask, - regions = std::move(blit_regions)](vk::CommandBuffer cmdbuf) { - const std::array read_barriers{ - VkImageMemoryBarrier{ - .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | - VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, - .oldLayout = VK_IMAGE_LAYOUT_GENERAL, - .newLayout = VK_IMAGE_LAYOUT_GENERAL, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .image = src_image, - .subresourceRange{ - .aspectMask = aspect_mask, - .baseMipLevel = 0, - .levelCount = VK_REMAINING_MIP_LEVELS, - .baseArrayLayer = 0, - .layerCount = VK_REMAINING_ARRAY_LAYERS, - }, - }, - VkImageMemoryBarrier{ - .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | - VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .oldLayout = VK_IMAGE_LAYOUT_GENERAL, - .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .image = dst_image, - .subresourceRange{ - .aspectMask = aspect_mask, - .baseMipLevel = 0, - .levelCount = VK_REMAINING_MIP_LEVELS, - .baseArrayLayer = 0, - .layerCount = VK_REMAINING_ARRAY_LAYERS, - }, - }, - }; - VkImageMemoryBarrier write_barrier{ - .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | - VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, - .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, - .newLayout = VK_IMAGE_LAYOUT_GENERAL, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .image = dst_image, - .subresourceRange{ - .aspectMask = aspect_mask, - .baseMipLevel = 0, - .levelCount = VK_REMAINING_MIP_LEVELS, - .baseArrayLayer = 0, - .layerCount = VK_REMAINING_ARRAY_LAYERS, - }, - }; - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, - 0, nullptr, nullptr, read_barriers); - const VkFilter vk_filter = VK_FILTER_NEAREST; - cmdbuf.BlitImage(src_image, VK_IMAGE_LAYOUT_GENERAL, dst_image, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, regions, vk_filter); - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - 0, write_barrier); - }); -} - bool Image::ScaleUp(bool save_as_backup) { if (True(flags & ImageFlagBits::Rescaled)) { return false; @@ -1155,7 +1152,7 @@ bool Image::ScaleUp(bool save_as_backup) { }, }); } - BlitScale(*scheduler, *image, *rescaled_image, regions, aspect_mask); + BlitScale(*scheduler, *image, *rescaled_image, std::move(regions), aspect_mask); return true; } @@ -1239,7 +1236,7 @@ bool Image::ScaleDown(bool save_as_backup) { }, }); } - BlitScale(*scheduler, *image, *downscaled_image, regions, aspect_mask); + BlitScale(*scheduler, *image, *downscaled_image, std::move(regions), aspect_mask); if (save_as_backup) { backup_image = std::move(image); backup_commit = std::move(commit); From 0fb4b84383c457328ff3786f6359071543a0728d Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Wed, 21 Jul 2021 22:57:53 -0300 Subject: [PATCH 019/156] vk_texture_cache: Simplify and optimize scaling blits --- .../renderer_vulkan/vk_texture_cache.cpp | 168 +++++++----------- 1 file changed, 62 insertions(+), 106 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index d95eeafb9..720247b4e 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -594,12 +594,67 @@ struct RangedBarrierRange { return VK_FORMAT_R32_UINT; } -void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, - boost::container::small_vector&& blit_regions, - VkImageAspectFlags aspect_mask) { +void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, const ImageInfo& info, + VkImageAspectFlags aspect_mask, const Settings::ResolutionScalingInfo& resolution) { + const auto type = info.type; + const auto resources = info.resources; + const VkExtent2D extent{ + .width = info.size.width, + .height = info.size.height, + }; scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([dst_image, src_image, aspect_mask, - regions = std::move(blit_regions)](vk::CommandBuffer cmdbuf) { + scheduler.Record([dst_image, src_image, extent, resources, aspect_mask, resolution, + type](vk::CommandBuffer cmdbuf) { + const auto scale_up = [&](u32 value) { + return std::max((value * resolution.up_scale) >> resolution.down_shift, 1U); + }; + const bool is_2d = type == ImageType::e2D; + const VkOffset2D mip0_size{ + .x = static_cast(scale_up(extent.width)), + .y = static_cast(is_2d ? scale_up(extent.height) : extent.height), + }; + boost::container::small_vector regions; + regions.reserve(resources.levels); + for (s32 level = 0; level < resources.levels; level++) { + regions.push_back({ + .srcSubresource{ + .aspectMask = aspect_mask, + .mipLevel = static_cast(level), + .baseArrayLayer = 0, + .layerCount = static_cast(resources.layers), + }, + .srcOffsets{ + { + .x = 0, + .y = 0, + .z = 0, + }, + { + .x = static_cast(extent.width), + .y = static_cast(extent.height), + .z = 1, + }, + }, + .dstSubresource{ + .aspectMask = aspect_mask, + .mipLevel = static_cast(level), + .baseArrayLayer = 0, + .layerCount = static_cast(resources.layers), + }, + .dstOffsets{ + { + .x = 0, + .y = 0, + .z = 0, + }, + { + .x = std::max(1, mip0_size.x >> level), + .y = std::max(1, mip0_size.y >> level), + .z = 1, + }, + }, + }); + } const VkImageSubresourceRange subresource_range{ .aspectMask = aspect_mask, .baseMipLevel = 0, @@ -1102,57 +1157,7 @@ bool Image::ScaleUp(bool save_as_backup) { if (aspect_mask == 0) { aspect_mask = ImageAspectMask(info.format); } - - const auto scale_up = [&](u32 value) { - return std::max((value * resolution.up_scale) >> resolution.down_shift, 1U); - }; - - const bool is_2d = info.type == ImageType::e2D; - boost::container::small_vector regions; - regions.reserve(info.resources.levels); - for (s32 level = 0; level < info.resources.levels; level++) { - regions.push_back({ - .srcSubresource{ - .aspectMask = aspect_mask, - .mipLevel = u32(level), - .baseArrayLayer = 0, - .layerCount = u32(info.resources.layers), - }, - .srcOffsets{ - { - .x = 0, - .y = 0, - .z = 0, - }, - { - .x = static_cast(info.size.width), - .y = static_cast(info.size.height), - .z = 1, - }, - }, - .dstSubresource{ - .aspectMask = aspect_mask, - .mipLevel = u32(level), - .baseArrayLayer = 0, - .layerCount = u32(info.resources.layers), - }, - .dstOffsets{ - { - .x = 0, - .y = 0, - .z = 0, - }, - { - .x = std::max(1, static_cast(scale_up(info.size.width)) >> level), - .y = std::max(1, static_cast(is_2d ? scale_up(info.size.height) - : info.size.height) >> - level), - .z = 1, - }, - }, - }); - } - BlitScale(*scheduler, *image, *rescaled_image, std::move(regions), aspect_mask); + BlitScale(*scheduler, *image, *rescaled_image, info, aspect_mask, resolution); return true; } @@ -1183,60 +1188,11 @@ bool Image::ScaleDown(bool save_as_backup) { MemoryCommit new_commit( runtime->memory_allocator.Commit(downscaled_image, MemoryUsage::DeviceLocal)); - const auto scale_up = [&](u32 value) { - return (value * resolution.up_scale) >> resolution.down_shift; - }; - if (aspect_mask == 0) { aspect_mask = ImageAspectMask(info.format); } + BlitScale(*scheduler, *image, *downscaled_image, info, aspect_mask, resolution); - const bool is_2d = info.type == ImageType::e2D; - boost::container::small_vector regions; - regions.reserve(info.resources.levels); - for (s32 level = 0; level < info.resources.levels; level++) { - regions.push_back({ - .srcSubresource{ - .aspectMask = aspect_mask, - .mipLevel = static_cast(level), - .baseArrayLayer = 0, - .layerCount = static_cast(info.resources.layers), - }, - .srcOffsets{ - { - .x = 0, - .y = 0, - .z = 0, - }, - { - .x = std::max(1, static_cast(scale_up(info.size.width)) >> level), - .y = std::max(1, static_cast(is_2d ? scale_up(info.size.height) - : info.size.height) >> - level), - .z = 1, - }, - }, - .dstSubresource{ - .aspectMask = aspect_mask, - .mipLevel = static_cast(level), - .baseArrayLayer = 0, - .layerCount = static_cast(info.resources.layers), - }, - .dstOffsets{ - { - .x = 0, - .y = 0, - .z = 0, - }, - { - .x = std::max(1, static_cast(info.size.width) >> level), - .y = std::max(1, static_cast(info.size.height) >> level), - .z = 1, - }, - }, - }); - } - BlitScale(*scheduler, *image, *downscaled_image, std::move(regions), aspect_mask); if (save_as_backup) { backup_image = std::move(image); backup_commit = std::move(commit); From d5143c83a9eacf23cc66616bcd1a1b0ccfda5082 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Thu, 22 Jul 2021 00:16:19 -0400 Subject: [PATCH 020/156] texture_cache: Fix typo in aliased image rescaling --- src/video_core/texture_cache/texture_cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 25fea8240..179f37526 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -1552,7 +1552,7 @@ void TextureCache

::SynchronizeAliases(ImageId image_id) { if (image.modification_tick < aliased_image.modification_tick) { most_recent_tick = std::max(most_recent_tick, aliased_image.modification_tick); aliased_images.push_back(&aliased); - any_rescaled |= True(image.flags & ImageFlagBits::Rescaled); + any_rescaled |= True(aliased_image.flags & ImageFlagBits::Rescaled); } } if (aliased_images.empty()) { From fad2c92a39cb0cfba2bc3241e779e3a983646d82 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Thu, 22 Jul 2021 01:34:46 -0400 Subject: [PATCH 021/156] gl_texture_cache: Simplify rescaling --- .../renderer_opengl/gl_texture_cache.cpp | 31 ++++++++----------- .../renderer_opengl/gl_texture_cache.h | 3 +- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 34f74e37d..5e2695576 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -480,7 +480,8 @@ void TextureCacheRuntime::Init() { resolution = Settings::values.resolution_info; is_rescaling_on = resolution.up_scale != 1 || resolution.down_shift != 0; if (is_rescaling_on) { - rescale_fbo.Create(); + rescale_draw_fbo.Create(); + rescale_read_fbo.Create(); } } @@ -881,8 +882,11 @@ bool Image::Scale(bool scale_src, bool scale_dst) { UNIMPLEMENTED(); return false; } + GLint prev_draw_fbo; GLint prev_read_fbo; + glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &prev_draw_fbo); glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &prev_read_fbo); + const GLenum attachment = [this] { switch (GetFormatType(info.format)) { case SurfaceType::ColorTexture: @@ -931,35 +935,26 @@ bool Image::Scale(bool scale_src, bool scale_dst) { dst_info.size.height = dst_height; auto dst_texture = MakeImage(dst_info, gl_internal_format); - const auto& blit_fbo = runtime->rescale_fbo; + const auto& read_fbo = runtime->rescale_read_fbo; + const auto& draw_fbo = runtime->rescale_draw_fbo; + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, draw_fbo.handle); + glBindFramebuffer(GL_READ_FRAMEBUFFER, read_fbo.handle); for (s32 level = 0; level < info.resources.levels; ++level) { const u32 src_level_width = std::max(1u, src_width >> level); const u32 src_level_height = std::max(1u, src_height >> level); const u32 dst_level_width = std::max(1u, dst_width >> level); const u32 dst_level_height = std::max(1u, dst_height >> level); - glBindFramebuffer(GL_READ_FRAMEBUFFER, blit_fbo.handle); - glNamedFramebufferTexture(blit_fbo.handle, attachment, texture.handle, level); - glBlitNamedFramebuffer(blit_fbo.handle, blit_fbo.handle, 0, 0, src_level_width, + glNamedFramebufferTexture(read_fbo.handle, attachment, texture.handle, level); + glNamedFramebufferTexture(draw_fbo.handle, attachment, dst_texture.handle, level); + glBlitNamedFramebuffer(read_fbo.handle, draw_fbo.handle, 0, 0, src_level_width, src_level_height, 0, 0, dst_level_width, dst_level_height, mask, filter); - switch (info.type) { - case ImageType::e1D: - glCopyTextureSubImage2D(dst_texture.handle, level, 0, 0, 0, 0, dst_level_width, - dst_level_height); - break; - case ImageType::e2D: - glCopyTextureSubImage3D(dst_texture.handle, level, 0, 0, 0, 0, 0, dst_level_width, - dst_level_height); - break; - case ImageType::e3D: - default: - UNREACHABLE(); - } } texture = std::move(dst_texture); // Restore previous framebuffers + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, prev_draw_fbo); glBindFramebuffer(GL_READ_FRAMEBUFFER, prev_read_fbo); return true; } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index f2e48b4c7..787b63e87 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -155,7 +155,8 @@ private: std::array null_image_views{}; - OGLFramebuffer rescale_fbo; + OGLFramebuffer rescale_draw_fbo; + OGLFramebuffer rescale_read_fbo; Settings::ResolutionScalingInfo resolution; bool is_rescaling_on{}; }; From e580299467a8aa7f56e8b66a63112dc06c870b15 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 22 Jul 2021 04:29:00 -0300 Subject: [PATCH 022/156] shader: Fix rescaling pass --- src/shader_recompiler/ir_opt/rescaling_pass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/shader_recompiler/ir_opt/rescaling_pass.cpp b/src/shader_recompiler/ir_opt/rescaling_pass.cpp index 293593c78..f8d04b6e3 100644 --- a/src/shader_recompiler/ir_opt/rescaling_pass.cpp +++ b/src/shader_recompiler/ir_opt/rescaling_pass.cpp @@ -17,7 +17,7 @@ namespace { void PatchFragCoord(IR::Block& block, IR::Inst& inst) { IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; const IR::F32 down_factor{ir.ResolutionDownFactor()}; - const IR::F32 frag_coord{&inst}; + const IR::F32 frag_coord{ir.GetAttribute(inst.Arg(0).Attribute())}; const IR::F32 downscaled_frag_coord{ir.FPMul(frag_coord, down_factor)}; inst.ReplaceUsesWith(downscaled_frag_coord); } From 43aa695a0415821e42fabf78a8a624edaadebab7 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sun, 25 Jul 2021 21:20:12 -0300 Subject: [PATCH 023/156] common/settings: Remove unused scaling options --- src/common/settings.cpp | 23 +++++++---------------- src/common/settings.h | 2 -- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/src/common/settings.cpp b/src/common/settings.cpp index 6f3acee79..4b7fa4b82 100644 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp @@ -109,51 +109,42 @@ void UpdateRescalingInfo() { auto setup = values.resolution_setup.GetValue(); auto& info = values.resolution_info; switch (setup) { - case ResolutionSetup::Res1_2X: { + case ResolutionSetup::Res1_2X: info.up_scale = 1; info.down_shift = 1; break; - } - case ResolutionSetup::Res3_4X: { + case ResolutionSetup::Res3_4X: info.up_scale = 3; info.down_shift = 2; break; - } - case ResolutionSetup::Res1X: { + case ResolutionSetup::Res1X: info.up_scale = 1; info.down_shift = 0; break; - } case ResolutionSetup::Res3_2X: { info.up_scale = 3; info.down_shift = 1; break; } - case ResolutionSetup::Res2X: { + case ResolutionSetup::Res2X: info.up_scale = 2; info.down_shift = 0; break; - } - case ResolutionSetup::Res3X: { + case ResolutionSetup::Res3X: info.up_scale = 3; info.down_shift = 0; break; - } - case ResolutionSetup::Res4X: { + case ResolutionSetup::Res4X: info.up_scale = 4; info.down_shift = 0; break; - } - default: { + default: UNREACHABLE(); info.up_scale = 1; info.down_shift = 0; } - } info.up_factor = static_cast(info.up_scale) / (1U << info.down_shift); info.down_factor = static_cast(1U << info.down_shift) / info.up_scale; - info.size_up = info.up_scale * info.up_scale; - info.size_shift = info.down_shift * 2; info.active = info.up_scale != 1 || info.down_shift != 0; } diff --git a/src/common/settings.h b/src/common/settings.h index 2b11984b4..ca88c086b 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -67,8 +67,6 @@ struct ResolutionScalingInfo { u32 down_shift{0}; f32 up_factor{1.0f}; f32 down_factor{1.0f}; - u32 size_up{1}; - u32 size_shift{0}; bool active{}; }; From 95761cc6a70987b2625d68c4d9da4e2622f57808 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sun, 25 Jul 2021 21:27:21 -0300 Subject: [PATCH 024/156] shader: Add integer division opcodes --- .../backend/glasm/emit_glasm_instructions.h | 2 ++ .../backend/glasm/emit_glasm_integer.cpp | 8 ++++++++ .../backend/glsl/emit_glsl_instructions.h | 2 ++ src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp | 8 ++++++++ .../backend/spirv/emit_spirv_instructions.h | 2 ++ .../backend/spirv/emit_spirv_integer.cpp | 8 ++++++++ src/shader_recompiler/frontend/ir/ir_emitter.cpp | 4 ++++ src/shader_recompiler/frontend/ir/ir_emitter.h | 1 + src/shader_recompiler/frontend/ir/opcodes.inc | 2 ++ 9 files changed, 37 insertions(+) diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h b/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h index cb7232704..4f8dd8e42 100644 --- a/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h +++ b/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h @@ -304,6 +304,8 @@ void EmitIAdd64(EmitContext& ctx, IR::Inst& inst, Register a, Register b); void EmitISub32(EmitContext& ctx, IR::Inst& inst, ScalarS32 a, ScalarS32 b); void EmitISub64(EmitContext& ctx, IR::Inst& inst, Register a, Register b); void EmitIMul32(EmitContext& ctx, IR::Inst& inst, ScalarS32 a, ScalarS32 b); +void EmitSDiv32(EmitContext& ctx, IR::Inst& inst, ScalarS32 a, ScalarS32 b); +void EmitUDiv32(EmitContext& ctx, IR::Inst& inst, ScalarU32 a, ScalarU32 b); void EmitINeg32(EmitContext& ctx, IR::Inst& inst, ScalarS32 value); void EmitINeg64(EmitContext& ctx, IR::Inst& inst, Register value); void EmitIAbs32(EmitContext& ctx, IR::Inst& inst, ScalarS32 value); diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_integer.cpp b/src/shader_recompiler/backend/glasm/emit_glasm_integer.cpp index f55c26b76..8aa494a4d 100644 --- a/src/shader_recompiler/backend/glasm/emit_glasm_integer.cpp +++ b/src/shader_recompiler/backend/glasm/emit_glasm_integer.cpp @@ -90,6 +90,14 @@ void EmitIMul32(EmitContext& ctx, IR::Inst& inst, ScalarS32 a, ScalarS32 b) { ctx.Add("MUL.S {}.x,{},{};", inst, a, b); } +void EmitSDiv32(EmitContext& ctx, IR::Inst& inst, ScalarS32 a, ScalarS32 b) { + ctx.Add("DIV.S {}.x,{},{};", inst, a, b); +} + +void EmitUDiv32(EmitContext& ctx, IR::Inst& inst, ScalarU32 a, ScalarU32 b) { + ctx.Add("DIV.U {}.x,{},{};", inst, a, b); +} + void EmitINeg32(EmitContext& ctx, IR::Inst& inst, ScalarS32 value) { if (value.type != Type::Register && static_cast(value.imm_u32) < 0) { ctx.Add("MOV.S {},{};", inst, -static_cast(value.imm_u32)); diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h b/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h index 6cae0b84a..159e4b770 100644 --- a/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h +++ b/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h @@ -363,6 +363,8 @@ void EmitIAdd64(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::strin void EmitISub32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b); void EmitISub64(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b); void EmitIMul32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b); +void EmitSDiv32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b); +void EmitUDiv32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b); void EmitINeg32(EmitContext& ctx, IR::Inst& inst, std::string_view value); void EmitINeg64(EmitContext& ctx, IR::Inst& inst, std::string_view value); void EmitIAbs32(EmitContext& ctx, IR::Inst& inst, std::string_view value); diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp index 38419f88f..88c1d4c5e 100644 --- a/src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp +++ b/src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp @@ -78,6 +78,14 @@ void EmitIMul32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::strin ctx.AddU32("{}=uint({}*{});", inst, a, b); } +void EmitSDiv32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b) { + ctx.AddU32("{}=uint(int({})/int({}));", inst, a, b); +} + +void EmitUDiv32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b) { + ctx.AddU32("{}={}/{};", inst, a, b); +} + void EmitINeg32(EmitContext& ctx, IR::Inst& inst, std::string_view value) { ctx.AddU32("{}=uint(-({}));", inst, value); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 3d90b2286..44eda16ca 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -284,6 +284,8 @@ Id EmitIAdd64(EmitContext& ctx, Id a, Id b); Id EmitISub32(EmitContext& ctx, Id a, Id b); Id EmitISub64(EmitContext& ctx, Id a, Id b); Id EmitIMul32(EmitContext& ctx, Id a, Id b); +Id EmitSDiv32(EmitContext& ctx, Id a, Id b); +Id EmitUDiv32(EmitContext& ctx, Id a, Id b); Id EmitINeg32(EmitContext& ctx, Id value); Id EmitINeg64(EmitContext& ctx, Id value); Id EmitIAbs32(EmitContext& ctx, Id value); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp index 3501d7495..50277eec3 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp @@ -72,6 +72,14 @@ Id EmitIMul32(EmitContext& ctx, Id a, Id b) { return ctx.OpIMul(ctx.U32[1], a, b); } +Id EmitSDiv32(EmitContext& ctx, Id a, Id b) { + return ctx.OpSDiv(ctx.U32[1], a, b); +} + +Id EmitUDiv32(EmitContext& ctx, Id a, Id b) { + return ctx.OpUDiv(ctx.U32[1], a, b); +} + Id EmitINeg32(EmitContext& ctx, Id value) { return ctx.OpSNegate(ctx.U32[1], value); } diff --git a/src/shader_recompiler/frontend/ir/ir_emitter.cpp b/src/shader_recompiler/frontend/ir/ir_emitter.cpp index 9ae5da2a1..3dfba8e71 100644 --- a/src/shader_recompiler/frontend/ir/ir_emitter.cpp +++ b/src/shader_recompiler/frontend/ir/ir_emitter.cpp @@ -1145,6 +1145,10 @@ U32 IREmitter::IMul(const U32& a, const U32& b) { return Inst(Opcode::IMul32, a, b); } +U32 IREmitter::IDiv(const U32& a, const U32& b, bool is_signed) { + return Inst(is_signed ? Opcode::SDiv32 : Opcode::UDiv32, a, b); +} + U32U64 IREmitter::INeg(const U32U64& value) { switch (value.Type()) { case Type::U32: diff --git a/src/shader_recompiler/frontend/ir/ir_emitter.h b/src/shader_recompiler/frontend/ir/ir_emitter.h index 0c664d2fe..1959be42e 100644 --- a/src/shader_recompiler/frontend/ir/ir_emitter.h +++ b/src/shader_recompiler/frontend/ir/ir_emitter.h @@ -209,6 +209,7 @@ public: [[nodiscard]] U32U64 IAdd(const U32U64& a, const U32U64& b); [[nodiscard]] U32U64 ISub(const U32U64& a, const U32U64& b); [[nodiscard]] U32 IMul(const U32& a, const U32& b); + [[nodiscard]] U32 IDiv(const U32& a, const U32& b, bool is_signed = false); [[nodiscard]] U32U64 INeg(const U32U64& value); [[nodiscard]] U32 IAbs(const U32& value); [[nodiscard]] U32U64 ShiftLeftLogical(const U32U64& base, const U32& shift); diff --git a/src/shader_recompiler/frontend/ir/opcodes.inc b/src/shader_recompiler/frontend/ir/opcodes.inc index 72751c5a0..c05e6d312 100644 --- a/src/shader_recompiler/frontend/ir/opcodes.inc +++ b/src/shader_recompiler/frontend/ir/opcodes.inc @@ -287,6 +287,8 @@ OPCODE(IAdd64, U64, U64, OPCODE(ISub32, U32, U32, U32, ) OPCODE(ISub64, U64, U64, U64, ) OPCODE(IMul32, U32, U32, U32, ) +OPCODE(SDiv32, U32, U32, U32, ) +OPCODE(UDiv32, U32, U32, U32, ) OPCODE(INeg32, U32, U32, ) OPCODE(INeg64, U64, U64, ) OPCODE(IAbs32, U32, U32, ) From c892359d1bf228d3c119c953c20fff44f280a7c4 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sun, 25 Jul 2021 21:34:17 -0300 Subject: [PATCH 025/156] shader: Add copy constructor to instructions --- src/shader_recompiler/frontend/ir/basic_block.cpp | 5 +++++ src/shader_recompiler/frontend/ir/basic_block.h | 3 +++ .../frontend/ir/microinstruction.cpp | 11 +++++++++++ src/shader_recompiler/frontend/ir/value.h | 2 +- 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/shader_recompiler/frontend/ir/basic_block.cpp b/src/shader_recompiler/frontend/ir/basic_block.cpp index 7c08b25ce..974efa4a0 100644 --- a/src/shader_recompiler/frontend/ir/basic_block.cpp +++ b/src/shader_recompiler/frontend/ir/basic_block.cpp @@ -22,6 +22,11 @@ void Block::AppendNewInst(Opcode op, std::initializer_list args) { PrependNewInst(end(), op, args); } +Block::iterator Block::PrependNewInst(iterator insertion_point, const Inst& base_inst) { + Inst* const inst{inst_pool->Create(base_inst)}; + return instructions.insert(insertion_point, *inst); +} + Block::iterator Block::PrependNewInst(iterator insertion_point, Opcode op, std::initializer_list args, u32 flags) { Inst* const inst{inst_pool->Create(op, flags)}; diff --git a/src/shader_recompiler/frontend/ir/basic_block.h b/src/shader_recompiler/frontend/ir/basic_block.h index 9ce1ed07e..fbfe98266 100644 --- a/src/shader_recompiler/frontend/ir/basic_block.h +++ b/src/shader_recompiler/frontend/ir/basic_block.h @@ -40,6 +40,9 @@ public: /// Appends a new instruction to the end of this basic block. void AppendNewInst(Opcode op, std::initializer_list args); + /// Prepends a copy of an instruction to this basic block before the insertion point. + iterator PrependNewInst(iterator insertion_point, const Inst& base_inst); + /// Prepends a new instruction to this basic block before the insertion point. iterator PrependNewInst(iterator insertion_point, Opcode op, std::initializer_list args = {}, u32 flags = 0); diff --git a/src/shader_recompiler/frontend/ir/microinstruction.cpp b/src/shader_recompiler/frontend/ir/microinstruction.cpp index 30b470bdd..97e2bf6af 100644 --- a/src/shader_recompiler/frontend/ir/microinstruction.cpp +++ b/src/shader_recompiler/frontend/ir/microinstruction.cpp @@ -47,6 +47,17 @@ Inst::Inst(IR::Opcode op_, u32 flags_) noexcept : op{op_}, flags{flags_} { } } +Inst::Inst(const Inst& base) : op{base.op}, flags{base.flags} { + if (base.op == Opcode::Phi) { + throw NotImplementedException("Copying phi node"); + } + std::construct_at(&args); + const size_t num_args{base.NumArgs()}; + for (size_t index = 0; index < num_args; ++index) { + SetArg(index, base.Arg(index)); + } +} + Inst::~Inst() { if (op == Opcode::Phi) { std::destroy_at(&phi_args); diff --git a/src/shader_recompiler/frontend/ir/value.h b/src/shader_recompiler/frontend/ir/value.h index 6c9ef6bdd..947579852 100644 --- a/src/shader_recompiler/frontend/ir/value.h +++ b/src/shader_recompiler/frontend/ir/value.h @@ -116,10 +116,10 @@ public: class Inst : public boost::intrusive::list_base_hook<> { public: explicit Inst(IR::Opcode op_, u32 flags_) noexcept; + explicit Inst(const Inst& base); ~Inst(); Inst& operator=(const Inst&) = delete; - Inst(const Inst&) = delete; Inst& operator=(Inst&&) = delete; Inst(Inst&&) = delete; From 74efa57c1b78b4a07ad0003e847bd5f0aa7c7bb5 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sun, 25 Jul 2021 21:44:13 -0300 Subject: [PATCH 026/156] texture_cache: Add image getters --- src/video_core/texture_cache/texture_cache.h | 10 ++++++++++ src/video_core/texture_cache/texture_cache_base.h | 6 ++++++ 2 files changed, 16 insertions(+) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 179f37526..ae74a6ecf 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -119,6 +119,16 @@ typename P::ImageView& TextureCache

::GetImageView(ImageViewId id) noexcept { return slot_image_views[id]; } +template +const typename P::Image& TextureCache

::GetImage(ImageId id) const noexcept { + return slot_images[id]; +} + +template +typename P::Image& TextureCache

::GetImage(ImageId id) noexcept { + return slot_images[id]; +} + template void TextureCache

::MarkModification(ImageId id) noexcept { MarkModification(slot_images[id]); diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index deddf0d30..0d2d9ec2e 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -95,6 +95,12 @@ public: /// Return a reference to the given image view id [[nodiscard]] ImageView& GetImageView(ImageViewId id) noexcept; + /// Return a constant reference to the given image id + [[nodiscard]] const Image& GetImage(ImageId id) const noexcept; + + /// Return a reference to the given image id + [[nodiscard]] Image& GetImage(ImageId id) noexcept; + /// Mark an image as modified from the GPU void MarkModification(ImageId id) noexcept; From c15332c44fa50dc44e2ebd1a682048f1e30dc136 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sun, 25 Jul 2021 22:04:53 -0300 Subject: [PATCH 027/156] shader: Add IsTextureScaled opcode --- src/shader_recompiler/backend/glasm/emit_glasm_image.cpp | 8 ++++++++ .../backend/glasm/emit_glasm_instructions.h | 1 + src/shader_recompiler/backend/glsl/emit_glsl_image.cpp | 8 ++++++++ .../backend/glsl/emit_glsl_instructions.h | 2 ++ src/shader_recompiler/backend/spirv/emit_spirv_image.cpp | 4 ++++ .../backend/spirv/emit_spirv_instructions.h | 1 + src/shader_recompiler/frontend/ir/ir_emitter.cpp | 4 ++++ src/shader_recompiler/frontend/ir/ir_emitter.h | 3 +++ src/shader_recompiler/frontend/ir/opcodes.inc | 2 ++ src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp | 1 + 10 files changed, 34 insertions(+) diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_image.cpp b/src/shader_recompiler/backend/glasm/emit_glasm_image.cpp index 09e3a9b82..583ed3cf2 100644 --- a/src/shader_recompiler/backend/glasm/emit_glasm_image.cpp +++ b/src/shader_recompiler/backend/glasm/emit_glasm_image.cpp @@ -608,6 +608,14 @@ void EmitImageWrite(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, Re ctx.Add("STOREIM.{} {},{},{},{};", format, image, color, coord, type); } +void EmitIsTextureScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index) { + if (!index.IsImmediate()) { + throw NotImplementedException("Non-constant texture rescaling"); + } + UNIMPLEMENTED(); + ctx.Add("MOV.S {}.x,-1;", inst); +} + void EmitImageAtomicIAdd32(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, Register coord, ScalarU32 value) { ImageAtomic(ctx, inst, index, coord, value, "ADD.U32"); diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h b/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h index 4f8dd8e42..e2b7d601d 100644 --- a/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h +++ b/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h @@ -556,6 +556,7 @@ void EmitImageGradient(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, void EmitImageRead(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, Register coord); void EmitImageWrite(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, Register coord, Register color); +void EmitIsTextureScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index); void EmitBindlessImageAtomicIAdd32(EmitContext&); void EmitBindlessImageAtomicSMin32(EmitContext&); void EmitBindlessImageAtomicUMin32(EmitContext&); diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp index 447eb8e0a..099e0160b 100644 --- a/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp +++ b/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp @@ -612,6 +612,14 @@ void EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst& inst, const IR::Value value); } +void EmitIsTextureScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index) { + if (!index.IsImmediate()) { + throw NotImplementedException("Non-constant texture rescaling"); + } + UNIMPLEMENTED(); + ctx.AddU1("{}=true;", inst); +} + void EmitBindlessImageSampleImplicitLod(EmitContext&) { NotImplemented(); } diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h b/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h index 159e4b770..f86502e4c 100644 --- a/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h +++ b/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h @@ -630,6 +630,8 @@ void EmitImageRead(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, std::string_view coords); void EmitImageWrite(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, std::string_view coords, std::string_view color); +void EmitIsTextureScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index); +void EmitIsImageScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index); void EmitBindlessImageAtomicIAdd32(EmitContext&); void EmitBindlessImageAtomicSMin32(EmitContext&); void EmitBindlessImageAtomicUMin32(EmitContext&); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp index 1d5364309..2f925cc3e 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp @@ -470,4 +470,8 @@ void EmitImageWrite(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id ctx.OpImageWrite(Image(ctx, index, info), coords, color); } +Id EmitIsTextureScaled([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] const IR::Value& index) { + return ctx.false_value; +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 44eda16ca..69fc18f5f 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -513,6 +513,7 @@ Id EmitImageGradient(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, I Id derivates, Id offset, Id lod_clamp); Id EmitImageRead(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords); void EmitImageWrite(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords, Id color); +Id EmitIsTextureScaled(EmitContext& ctx, const IR::Value& index); Id EmitBindlessImageAtomicIAdd32(EmitContext&); Id EmitBindlessImageAtomicSMin32(EmitContext&); Id EmitBindlessImageAtomicUMin32(EmitContext&); diff --git a/src/shader_recompiler/frontend/ir/ir_emitter.cpp b/src/shader_recompiler/frontend/ir/ir_emitter.cpp index 3dfba8e71..3ccd91c10 100644 --- a/src/shader_recompiler/frontend/ir/ir_emitter.cpp +++ b/src/shader_recompiler/frontend/ir/ir_emitter.cpp @@ -1946,6 +1946,10 @@ Value IREmitter::ImageAtomicExchange(const Value& handle, const Value& coords, c return Inst(op, Flags{info}, handle, coords, value); } +U1 IREmitter::IsTextureScaled(const U32& index) { + return Inst(Opcode::IsTextureScaled, index); +} + U1 IREmitter::VoteAll(const U1& value) { return Inst(Opcode::VoteAll, value); } diff --git a/src/shader_recompiler/frontend/ir/ir_emitter.h b/src/shader_recompiler/frontend/ir/ir_emitter.h index 1959be42e..a78628413 100644 --- a/src/shader_recompiler/frontend/ir/ir_emitter.h +++ b/src/shader_recompiler/frontend/ir/ir_emitter.h @@ -359,6 +359,9 @@ public: TextureInstInfo info); [[nodiscard]] Value ImageAtomicExchange(const Value& handle, const Value& coords, const Value& value, TextureInstInfo info); + + [[nodiscard]] U1 IsTextureScaled(const U32& index); + [[nodiscard]] U1 VoteAll(const U1& value); [[nodiscard]] U1 VoteAny(const U1& value); [[nodiscard]] U1 VoteEqual(const U1& value); diff --git a/src/shader_recompiler/frontend/ir/opcodes.inc b/src/shader_recompiler/frontend/ir/opcodes.inc index c05e6d312..ec629428a 100644 --- a/src/shader_recompiler/frontend/ir/opcodes.inc +++ b/src/shader_recompiler/frontend/ir/opcodes.inc @@ -493,6 +493,8 @@ OPCODE(ImageGradient, F32x4, Opaq OPCODE(ImageRead, U32x4, Opaque, Opaque, ) OPCODE(ImageWrite, Void, Opaque, Opaque, U32x4, ) +OPCODE(IsTextureScaled, U1, U32, ) + // Atomic Image operations OPCODE(BindlessImageAtomicIAdd32, U32, U32, Opaque, U32, ) diff --git a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp index ef918f4d4..ed82fa2ac 100644 --- a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp +++ b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp @@ -431,6 +431,7 @@ void VisitUsages(Info& info, IR::Inst& inst) { info.uses_is_helper_invocation = true; break; case IR::Opcode::ResolutionDownFactor: + case IR::Opcode::IsTextureScaled: info.uses_rescaling_uniform = true; break; case IR::Opcode::LaneId: From 01379c5e3cd1181c7c7d37a672364fbcad627fb0 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sun, 25 Jul 2021 22:20:56 -0300 Subject: [PATCH 028/156] shader/rescaling_pass: Patch more instructions --- .../ir_opt/rescaling_pass.cpp | 105 +++++++++++++++++- 1 file changed, 101 insertions(+), 4 deletions(-) diff --git a/src/shader_recompiler/ir_opt/rescaling_pass.cpp b/src/shader_recompiler/ir_opt/rescaling_pass.cpp index f8d04b6e3..d5b98ae6e 100644 --- a/src/shader_recompiler/ir_opt/rescaling_pass.cpp +++ b/src/shader_recompiler/ir_opt/rescaling_pass.cpp @@ -22,6 +22,105 @@ void PatchFragCoord(IR::Block& block, IR::Inst& inst) { inst.ReplaceUsesWith(downscaled_frag_coord); } +[[nodiscard]] IR::U32 Scale(IR::IREmitter& ir, const IR::U1& is_scaled, const IR::U32& value) { + IR::U32 scaled_value{value}; + bool changed{}; + if (const u32 up_scale = Settings::values.resolution_info.up_scale; up_scale != 1) { + scaled_value = ir.IMul(value, ir.Imm32(up_scale)); + changed = true; + } + if (const u32 down_shift = Settings::values.resolution_info.down_shift; down_shift != 0) { + scaled_value = ir.ShiftRightArithmetic(value, ir.Imm32(down_shift)); + changed = true; + } + if (changed) { + return IR::U32{ir.Select(is_scaled, scaled_value, value)}; + } else { + return value; + } +} + +[[nodiscard]] IR::U32 DownScale(IR::IREmitter& ir, IR::U32 value) { + if (const u32 down_shift = Settings::values.resolution_info.down_shift; down_shift != 0) { + value = ir.ShiftLeftLogical(value, ir.Imm32(down_shift)); + } + if (const u32 up_scale = Settings::values.resolution_info.up_scale; up_scale != 1) { + value = ir.IDiv(value, ir.Imm32(up_scale)); + } + return value; +} + +void PatchImageQueryDimensions(IR::Block& block, IR::Inst& inst) { + const auto it{IR::Block::InstructionList::s_iterator_to(inst)}; + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + const auto info{inst.Flags()}; + switch (info.type) { + case TextureType::Color1D: + case TextureType::ColorArray1D: { + const IR::Value new_inst{&*block.PrependNewInst(it, inst)}; + const IR::U32 width{DownScale(ir, IR::U32{ir.CompositeExtract(new_inst, 0)})}; + const IR::Value replacement{ir.CompositeConstruct(width, ir.CompositeExtract(new_inst, 1), + ir.CompositeExtract(new_inst, 2), + ir.CompositeExtract(new_inst, 3))}; + inst.ReplaceUsesWith(replacement); + break; + } + case TextureType::Color2D: + case TextureType::ColorArray2D: { + const IR::Value new_inst{&*block.PrependNewInst(it, inst)}; + const IR::U32 width{DownScale(ir, IR::U32{ir.CompositeExtract(new_inst, 0)})}; + const IR::U32 height{DownScale(ir, IR::U32{ir.CompositeExtract(new_inst, 1)})}; + const IR::Value replacement{ir.CompositeConstruct( + width, height, ir.CompositeExtract(new_inst, 2), ir.CompositeExtract(new_inst, 3))}; + inst.ReplaceUsesWith(replacement); + break; + } + case TextureType::Color3D: + case TextureType::ColorCube: + case TextureType::ColorArrayCube: + case TextureType::Buffer: + // Nothing to patch here + break; + } +} + +void PatchImageFetch(IR::Block& block, IR::Inst& inst) { + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + const auto info{inst.Flags()}; + const IR::U1 is_scaled{ir.IsTextureScaled(ir.Imm32(info.descriptor_index))}; + const IR::Value coord{inst.Arg(1)}; + switch (info.type) { + case TextureType::Color1D: + inst.SetArg(1, Scale(ir, is_scaled, IR::U32{coord})); + break; + case TextureType::ColorArray1D: { + const IR::U32 x{Scale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 0)})}; + const IR::U32 y{ir.CompositeExtract(coord, 1)}; + inst.SetArg(1, ir.CompositeConstruct(x, y)); + break; + } + case TextureType::Color2D: { + const IR::U32 x{Scale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 0)})}; + const IR::U32 y{Scale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 1)})}; + inst.SetArg(1, ir.CompositeConstruct(x, y)); + break; + } + case TextureType::ColorArray2D: { + const IR::U32 x{Scale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 0)})}; + const IR::U32 y{Scale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 1)})}; + const IR::U32 z{ir.CompositeExtract(coord, 2)}; + inst.SetArg(1, ir.CompositeConstruct(x, y, z)); + break; + } + case TextureType::Color3D: + case TextureType::ColorCube: + case TextureType::ColorArrayCube: + case TextureType::Buffer: + // Nothing to patch here + break; + } +} + void Visit(const IR::Program& program, IR::Block& block, IR::Inst& inst) { const bool is_fragment_shader{program.stage == Stage::Fragment}; switch (inst.GetOpcode()) { @@ -40,12 +139,10 @@ void Visit(const IR::Program& program, IR::Block& block, IR::Inst& inst) { break; } case IR::Opcode::ImageQueryDimensions: + PatchImageQueryDimensions(block, inst); break; case IR::Opcode::ImageFetch: - break; - case IR::Opcode::ImageRead: - break; - case IR::Opcode::ImageWrite: + PatchImageFetch(block, inst); break; default: break; From 656adee630cb1de738ed4dc6cf1cbc35af40f64f Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sun, 25 Jul 2021 22:26:23 -0300 Subject: [PATCH 029/156] spirv: Implement rescaling patching --- .../backend/spirv/emit_context.cpp | 35 +++++++++++++++++++ .../backend/spirv/emit_context.h | 5 +++ .../backend/spirv/emit_spirv.h | 5 ++- .../spirv/emit_spirv_context_get_set.cpp | 6 ++-- .../backend/spirv/emit_spirv_image.cpp | 26 ++++++++++++-- .../frontend/maxwell/translate_program.cpp | 3 ++ src/shader_recompiler/runtime_info.h | 2 ++ src/shader_recompiler/shader_info.h | 9 +++++ 8 files changed, 86 insertions(+), 5 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_context.cpp b/src/shader_recompiler/backend/spirv/emit_context.cpp index 3c84e6466..222baa177 100644 --- a/src/shader_recompiler/backend/spirv/emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/emit_context.cpp @@ -7,6 +7,8 @@ #include #include +#include + #include #include "common/common_types.h" @@ -496,6 +498,7 @@ EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_inf DefineImages(program.info, image_binding); DefineAttributeMemAccess(program.info); DefineGlobalMemoryFunctions(program.info); + DefineRescalingInput(program.info); } EmitContext::~EmitContext() = default; @@ -996,6 +999,38 @@ void EmitContext::DefineGlobalMemoryFunctions(const Info& info) { define(&StorageDefinitions::U32x4, storage_types.U32x4, U32[4], sizeof(u32[4])); } +void EmitContext::DefineRescalingInput(const Info& info) { + if (!info.uses_rescaling_uniform) { + return; + } + boost::container::static_vector members{F32[1]}; + u32 member_index{0}; + const u32 num_texture_words{Common::DivCeil(runtime_info.num_textures, 32u)}; + if (runtime_info.num_textures > 0) { + rescaling_textures_type = TypeArray(U32[1], Const(num_texture_words)); + Decorate(rescaling_textures_type, spv::Decoration::ArrayStride, 4u); + members.push_back(rescaling_textures_type); + rescaling_textures_member_index = ++member_index; + } + const Id push_constant_struct{TypeStruct(std::span(members.data(), members.size()))}; + Decorate(push_constant_struct, spv::Decoration::Block); + Name(push_constant_struct, "ResolutionInfo"); + MemberDecorate(push_constant_struct, 0u, spv::Decoration::Offset, 0u); + MemberName(push_constant_struct, 0u, "down_factor"); + if (runtime_info.num_textures > 0) { + MemberDecorate(push_constant_struct, rescaling_textures_member_index, + spv::Decoration::Offset, 4u); + MemberName(push_constant_struct, rescaling_textures_member_index, "rescaling_textures"); + } + const Id pointer_type{TypePointer(spv::StorageClass::PushConstant, push_constant_struct)}; + rescaling_push_constants = AddGlobalVariable(pointer_type, spv::StorageClass::PushConstant); + Name(rescaling_push_constants, "rescaling_push_constants"); + + if (profile.supported_spirv >= 0x00010400) { + interfaces.push_back(rescaling_push_constants); + } +} + void EmitContext::DefineConstantBuffers(const Info& info, u32& binding) { if (info.constant_buffer_descriptors.empty()) { return; diff --git a/src/shader_recompiler/backend/spirv/emit_context.h b/src/shader_recompiler/backend/spirv/emit_context.h index 112c52382..a7917ac51 100644 --- a/src/shader_recompiler/backend/spirv/emit_context.h +++ b/src/shader_recompiler/backend/spirv/emit_context.h @@ -238,6 +238,10 @@ public: Id indexed_load_func{}; Id indexed_store_func{}; + Id rescaling_push_constants{}; + Id rescaling_textures_type{}; + u32 rescaling_textures_member_index{}; + Id local_memory{}; Id shared_memory_u8{}; @@ -314,6 +318,7 @@ private: void DefineImages(const Info& info, u32& binding); void DefineAttributeMemAccess(const Info& info); void DefineGlobalMemoryFunctions(const Info& info); + void DefineRescalingInput(const Info& info); void DefineInputs(const IR::Program& program); void DefineOutputs(const IR::Program& program); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.h b/src/shader_recompiler/backend/spirv/emit_spirv.h index db0c935fe..7b0d8d980 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv.h @@ -20,8 +20,11 @@ namespace Shader::Backend::SPIRV { IR::Program& program, Bindings& bindings); [[nodiscard]] inline std::vector EmitSPIRV(const Profile& profile, IR::Program& program) { + RuntimeInfo runtime_info{}; + runtime_info.num_textures = Shader::NumDescriptors(program.info.texture_descriptors); + Bindings binding; - return EmitSPIRV(profile, {}, program, binding); + return EmitSPIRV(profile, runtime_info, program, binding); } } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 43f440dfb..6bb791b03 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -527,8 +527,10 @@ Id EmitYDirection(EmitContext& ctx) { } Id EmitResolutionDownFactor(EmitContext& ctx) { - UNIMPLEMENTED(); - return ctx.Const(1.0f); + const Id pointer_type{ctx.TypePointer(spv::StorageClass::PushConstant, ctx.F32[1])}; + const Id pointer{ + ctx.OpAccessChain(pointer_type, ctx.rescaling_push_constants, ctx.u32_zero_value)}; + return ctx.OpLoad(ctx.F32[1], pointer); } Id EmitLoadLocal(EmitContext& ctx, Id word_offset) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp index 2f925cc3e..7d7c0627e 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp @@ -470,8 +470,30 @@ void EmitImageWrite(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id ctx.OpImageWrite(Image(ctx, index, info), coords, color); } -Id EmitIsTextureScaled([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] const IR::Value& index) { - return ctx.false_value; +Id EmitIsTextureScaled(EmitContext& ctx, const IR::Value& index) { + const Id push_constant_u32{ctx.TypePointer(spv::StorageClass::PushConstant, ctx.U32[1])}; + const Id member_index{ctx.Const(ctx.rescaling_textures_member_index)}; + Id bit{}; + if (index.IsImmediate()) { + // Use BitwiseAnd instead of BitfieldExtract for better codegen on Nvidia OpenGL. + // LOP32I.NZ is used to set the predicate rather than BFE+ISETP. + const u32 index_value{index.U32()}; + const Id word_index{ctx.Const(index_value / 32)}; + const Id bit_index_mask{ctx.Const(1u << (index_value % 32))}; + const Id pointer{ctx.OpAccessChain(push_constant_u32, ctx.rescaling_push_constants, + member_index, word_index)}; + const Id word{ctx.OpLoad(ctx.U32[1], pointer)}; + bit = ctx.OpBitwiseAnd(ctx.U32[1], word, bit_index_mask); + } else { + const Id index_value{ctx.Def(index)}; + const Id word_index{ctx.OpShiftRightArithmetic(ctx.U32[1], index_value, ctx.Const(5u))}; + const Id pointer{ctx.OpAccessChain(push_constant_u32, ctx.rescaling_push_constants, + member_index, word_index)}; + const Id word{ctx.OpLoad(ctx.U32[1], pointer)}; + const Id bit_index{ctx.OpBitwiseAnd(ctx.U32[1], index_value, ctx.Const(31u))}; + bit = ctx.OpBitFieldUExtract(ctx.U32[1], index_value, bit_index, ctx.Const(1u)); + } + return ctx.OpINotEqual(ctx.U1, bit, ctx.u32_zero_value); } } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.cpp b/src/shader_recompiler/frontend/maxwell/translate_program.cpp index 2fc542f0e..795f5cf08 100644 --- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp +++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp @@ -178,6 +178,9 @@ IR::Program TranslateProgram(ObjectPool& inst_pool, ObjectPool generic_input_types{}; VaryingState previous_stage_stores; + u32 num_textures{}; + bool convert_depth_mode{}; bool force_early_z{}; diff --git a/src/shader_recompiler/shader_info.h b/src/shader_recompiler/shader_info.h index 7bac9e2cd..9f375c30e 100644 --- a/src/shader_recompiler/shader_info.h +++ b/src/shader_recompiler/shader_info.h @@ -191,4 +191,13 @@ struct Info { ImageDescriptors image_descriptors; }; +template +u32 NumDescriptors(const Descriptors& descriptors) { + u32 num{}; + for (const auto& desc : descriptors) { + num += desc.count; + } + return num; +} + } // namespace Shader From 6f3a41abe2caa617ae540fb7e4a3c4a092478963 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sun, 25 Jul 2021 22:27:16 -0300 Subject: [PATCH 030/156] opengl: Use Shader::NumDescriptors when possible --- .../renderer_opengl/gl_compute_pipeline.cpp | 19 +++-------- .../renderer_opengl/gl_graphics_pipeline.cpp | 32 ++++++------------- .../renderer_opengl/gl_shader_cache.cpp | 15 +++------ 3 files changed, 20 insertions(+), 46 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp index aa1cc592f..19c8ca7b2 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp @@ -19,15 +19,6 @@ using VideoCommon::ImageId; constexpr u32 MAX_TEXTURES = 64; constexpr u32 MAX_IMAGES = 16; -template -u32 AccumulateCount(const Range& range) { - u32 num{}; - for (const auto& desc : range) { - num += desc.count; - } - return num; -} - size_t ComputePipelineKey::Hash() const noexcept { return static_cast( Common::CityHash64(reinterpret_cast(this), sizeof *this)); @@ -58,17 +49,17 @@ ComputePipeline::ComputePipeline(const Device& device, TextureCache& texture_cac std::copy_n(info.constant_buffer_used_sizes.begin(), uniform_buffer_sizes.size(), uniform_buffer_sizes.begin()); - num_texture_buffers = AccumulateCount(info.texture_buffer_descriptors); - num_image_buffers = AccumulateCount(info.image_buffer_descriptors); + num_texture_buffers = Shader::NumDescriptors(info.texture_buffer_descriptors); + num_image_buffers = Shader::NumDescriptors(info.image_buffer_descriptors); - const u32 num_textures{num_texture_buffers + AccumulateCount(info.texture_descriptors)}; + const u32 num_textures{num_texture_buffers + Shader::NumDescriptors(info.texture_descriptors)}; ASSERT(num_textures <= MAX_TEXTURES); - const u32 num_images{num_image_buffers + AccumulateCount(info.image_descriptors)}; + const u32 num_images{num_image_buffers + Shader::NumDescriptors(info.image_descriptors)}; ASSERT(num_images <= MAX_IMAGES); const bool is_glasm{assembly_program.handle != 0}; - const u32 num_storage_buffers{AccumulateCount(info.storage_buffers_descriptors)}; + const u32 num_storage_buffers{Shader::NumDescriptors(info.storage_buffers_descriptors)}; use_storage_buffers = !is_glasm || num_storage_buffers < device.GetMaxGLASMStorageBufferBlocks(); writes_global_memory = !use_storage_buffers && diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index bccb37a58..43ab5c03b 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp @@ -27,6 +27,7 @@ namespace OpenGL { namespace { using Shader::ImageBufferDescriptor; using Shader::ImageDescriptor; +using Shader::NumDescriptors; using Shader::TextureBufferDescriptor; using Shader::TextureDescriptor; using Tegra::Texture::TexturePair; @@ -35,15 +36,6 @@ using VideoCommon::ImageId; constexpr u32 MAX_TEXTURES = 64; constexpr u32 MAX_IMAGES = 8; -template -u32 AccumulateCount(const Range& range) { - u32 num{}; - for (const auto& desc : range) { - num += desc.count; - } - return num; -} - GLenum Stage(size_t stage_index) { switch (stage_index) { case 0: @@ -204,23 +196,23 @@ GraphicsPipeline::GraphicsPipeline( base_uniform_bindings[stage + 1] = base_uniform_bindings[stage]; base_storage_bindings[stage + 1] = base_storage_bindings[stage]; - base_uniform_bindings[stage + 1] += AccumulateCount(info.constant_buffer_descriptors); - base_storage_bindings[stage + 1] += AccumulateCount(info.storage_buffers_descriptors); + base_uniform_bindings[stage + 1] += NumDescriptors(info.constant_buffer_descriptors); + base_storage_bindings[stage + 1] += NumDescriptors(info.storage_buffers_descriptors); } enabled_uniform_buffer_masks[stage] = info.constant_buffer_mask; std::ranges::copy(info.constant_buffer_used_sizes, uniform_buffer_sizes[stage].begin()); - const u32 num_tex_buffer_bindings{AccumulateCount(info.texture_buffer_descriptors)}; + const u32 num_tex_buffer_bindings{NumDescriptors(info.texture_buffer_descriptors)}; num_texture_buffers[stage] += num_tex_buffer_bindings; num_textures += num_tex_buffer_bindings; - const u32 num_img_buffers_bindings{AccumulateCount(info.image_buffer_descriptors)}; + const u32 num_img_buffers_bindings{NumDescriptors(info.image_buffer_descriptors)}; num_image_buffers[stage] += num_img_buffers_bindings; num_images += num_img_buffers_bindings; - num_textures += AccumulateCount(info.texture_descriptors); - num_images += AccumulateCount(info.image_descriptors); - num_storage_buffers += AccumulateCount(info.storage_buffers_descriptors); + num_textures += NumDescriptors(info.texture_descriptors); + num_images += NumDescriptors(info.image_descriptors); + num_storage_buffers += NumDescriptors(info.storage_buffers_descriptors); writes_global_memory |= std::ranges::any_of( info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; }); @@ -423,13 +415,9 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { add_buffer(desc); } } - for (const auto& desc : info.texture_descriptors) { - texture_buffer_index += desc.count; - } + texture_buffer_index += Shader::NumDescriptors(info.texture_descriptors); if constexpr (Spec::has_images) { - for (const auto& desc : info.image_descriptors) { - texture_buffer_index += desc.count; - } + texture_buffer_index += Shader::NumDescriptors(info.image_descriptors); } }}; if constexpr (Spec::enabled_stages[0]) { diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 02682bd76..42ef67628 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -426,16 +426,14 @@ std::unique_ptr ShaderCache::CreateGraphicsPipeline( // Normal path programs[index] = TranslateProgram(pools.inst, pools.block, env, cfg, host_info); - for (const auto& desc : programs[index].info.storage_buffers_descriptors) { - total_storage_buffers += desc.count; - } + total_storage_buffers += + Shader::NumDescriptors(programs[index].info.storage_buffers_descriptors); } else { // VertexB path when VertexA is present. auto& program_va{programs[0]}; auto program_vb{TranslateProgram(pools.inst, pools.block, env, cfg, host_info)}; - for (const auto& desc : program_vb.info.storage_buffers_descriptors) { - total_storage_buffers += desc.count; - } + total_storage_buffers += + Shader::NumDescriptors(program_vb.info.storage_buffers_descriptors); programs[index] = MergeDualVertexPrograms(program_va, program_vb, env); } } @@ -510,10 +508,7 @@ std::unique_ptr ShaderCache::CreateComputePipeline( Shader::Maxwell::Flow::CFG cfg{env, pools.flow_block, env.StartAddress()}; auto program{TranslateProgram(pools.inst, pools.block, env, cfg, host_info)}; - u32 num_storage_buffers{}; - for (const auto& desc : program.info.storage_buffers_descriptors) { - num_storage_buffers += desc.count; - } + const u32 num_storage_buffers{Shader::NumDescriptors(program.info.storage_buffers_descriptors)}; Shader::RuntimeInfo info; info.glasm_use_storage_buffers = num_storage_buffers <= device.GetMaxGLASMStorageBufferBlocks(); From baf0993d5c019a1ef617a94fe17035f872ae6954 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sun, 25 Jul 2021 22:28:26 -0300 Subject: [PATCH 031/156] vk_graphics_pipeline: Use Shader::NumDescriptors when possible --- .../renderer_vulkan/vk_graphics_pipeline.cpp | 24 +++++-------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 8634c3316..7e48d4458 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -402,13 +402,9 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { add_buffer(desc); } } - for (const auto& desc : info.texture_descriptors) { - texture_buffer_index += desc.count; - } + texture_buffer_index += Shader::NumDescriptors(info.texture_descriptors); if constexpr (Spec::has_images) { - for (const auto& desc : info.image_descriptors) { - texture_buffer_index += desc.count; - } + texture_buffer_index += Shader::NumDescriptors(info.image_descriptors); } }}; if constexpr (Spec::enabled_stages[0]) { @@ -826,18 +822,10 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) { void GraphicsPipeline::Validate() { size_t num_images{}; for (const auto& info : stage_infos) { - for (const auto& desc : info.texture_buffer_descriptors) { - num_images += desc.count; - } - for (const auto& desc : info.image_buffer_descriptors) { - num_images += desc.count; - } - for (const auto& desc : info.texture_descriptors) { - num_images += desc.count; - } - for (const auto& desc : info.image_descriptors) { - num_images += desc.count; - } + num_images += Shader::NumDescriptors(info.texture_buffer_descriptors); + num_images += Shader::NumDescriptors(info.image_buffer_descriptors); + num_images += Shader::NumDescriptors(info.texture_descriptors); + num_images += Shader::NumDescriptors(info.image_descriptors); } ASSERT(num_images <= MAX_IMAGE_ELEMENTS); } From dc72d4d4f5240b12bc771427404e7a74d56c87d9 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sun, 25 Jul 2021 22:28:57 -0300 Subject: [PATCH 032/156] vk_texture_cache: Properly scale blit source images --- src/video_core/renderer_vulkan/vk_texture_cache.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 720247b4e..575f12566 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -630,8 +630,8 @@ void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, con .z = 0, }, { - .x = static_cast(extent.width), - .y = static_cast(extent.height), + .x = std::max(1, static_cast(extent.width) >> level), + .y = std::max(1, static_cast(extent.height) >> level), .z = 1, }, }, From d2388dd0d0d36a230b58efbdd17f8366c79555b5 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sun, 25 Jul 2021 22:34:12 -0300 Subject: [PATCH 033/156] vulkan: Implement rescaling shader patching --- .../renderer_vulkan/pipeline_helper.h | 56 +++++++++++++++---- .../renderer_vulkan/vk_compute_pipeline.cpp | 30 ++++++---- .../renderer_vulkan/vk_compute_pipeline.h | 1 + .../renderer_vulkan/vk_graphics_pipeline.cpp | 21 +++++-- .../renderer_vulkan/vk_graphics_pipeline.h | 4 +- .../renderer_vulkan/vk_pipeline_cache.cpp | 3 + .../renderer_vulkan/vk_scheduler.cpp | 10 ++++ src/video_core/renderer_vulkan/vk_scheduler.h | 5 ++ 8 files changed, 103 insertions(+), 27 deletions(-) diff --git a/src/video_core/renderer_vulkan/pipeline_helper.h b/src/video_core/renderer_vulkan/pipeline_helper.h index 4847db6b6..7ba6078df 100644 --- a/src/video_core/renderer_vulkan/pipeline_helper.h +++ b/src/video_core/renderer_vulkan/pipeline_helper.h @@ -20,6 +20,8 @@ namespace Vulkan { +constexpr size_t MAX_RESCALING_WORDS = 4; + class DescriptorLayoutBuilder { public: DescriptorLayoutBuilder(const Device& device_) : device{&device_} {} @@ -68,18 +70,26 @@ public: } vk::PipelineLayout CreatePipelineLayout(VkDescriptorSetLayout descriptor_set_layout) const { + const VkPushConstantRange range{ + .stageFlags = static_cast( + is_compute ? VK_SHADER_STAGE_COMPUTE_BIT : VK_SHADER_STAGE_ALL_GRAPHICS), + .offset = 0, + .size = (is_compute ? 0 : sizeof(f32)) + sizeof(std::array), + }; return device->GetLogical().CreatePipelineLayout({ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, .pNext = nullptr, .flags = 0, .setLayoutCount = descriptor_set_layout ? 1U : 0U, .pSetLayouts = bindings.empty() ? nullptr : &descriptor_set_layout, - .pushConstantRangeCount = 0, - .pPushConstantRanges = nullptr, + .pushConstantRangeCount = 1, + .pPushConstantRanges = &range, }); } void Add(const Shader::Info& info, VkShaderStageFlags stage) { + is_compute |= (stage & VK_SHADER_STAGE_COMPUTE_BIT) != 0; + Add(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, stage, info.constant_buffer_descriptors); Add(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, stage, info.storage_buffers_descriptors); Add(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, stage, info.texture_buffer_descriptors); @@ -115,6 +125,7 @@ private: } const Device* device{}; + bool is_compute{}; boost::container::small_vector bindings; boost::container::small_vector entries; u32 binding{}; @@ -122,21 +133,46 @@ private: size_t offset{}; }; +class RescalingPushConstant { +public: + explicit RescalingPushConstant(u32 num_textures) noexcept {} + + void PushTexture(bool is_rescaled) noexcept { + *texture_ptr |= is_rescaled ? texture_bit : 0; + texture_bit <<= 1; + if (texture_bit == 0) { + texture_bit = 1u; + ++texture_ptr; + } + } + + const std::array& Data() const noexcept { + return words; + } + +private: + std::array words{}; + u32* texture_ptr{words.data()}; + u32 texture_bit{1u}; +}; + inline void PushImageDescriptors(const Shader::Info& info, const VkSampler*& samplers, const ImageId*& image_view_ids, TextureCache& texture_cache, - VKUpdateDescriptorQueue& update_descriptor_queue) { - for (const auto& desc : info.texture_buffer_descriptors) { - image_view_ids += desc.count; - } - for (const auto& desc : info.image_buffer_descriptors) { - image_view_ids += desc.count; - } + VKUpdateDescriptorQueue& update_descriptor_queue, + RescalingPushConstant& rescaling) { + static constexpr VideoCommon::ImageViewId NULL_IMAGE_VIEW_ID{0}; + image_view_ids += Shader::NumDescriptors(info.texture_buffer_descriptors); + image_view_ids += Shader::NumDescriptors(info.image_buffer_descriptors); for (const auto& desc : info.texture_descriptors) { for (u32 index = 0; index < desc.count; ++index) { + const VideoCommon::ImageViewId image_view_id{*(image_view_ids++)}; const VkSampler sampler{*(samplers++)}; - ImageView& image_view{texture_cache.GetImageView(*(image_view_ids++))}; + ImageView& image_view{texture_cache.GetImageView(image_view_id)}; + const Image& image{texture_cache.GetImage(image_view.image_id)}; const VkImageView vk_image_view{image_view.Handle(desc.type)}; update_descriptor_queue.AddSampledImage(vk_image_view, sampler); + rescaling.PushTexture(image_view_id != NULL_IMAGE_VIEW_ID && + True(image.flags & VideoCommon::ImageFlagBits::Rescaled)); } } for (const auto& desc : info.image_descriptors) { diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 44faf626a..bda75788c 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -180,9 +180,11 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute, buffer_cache.UpdateComputeBuffers(); buffer_cache.BindHostComputeBuffers(); + RescalingPushConstant rescaling(num_textures); const VkSampler* samplers_it{samplers.data()}; const ImageId* views_it{image_view_ids.data()}; - PushImageDescriptors(info, samplers_it, views_it, texture_cache, update_descriptor_queue); + PushImageDescriptors(info, samplers_it, views_it, texture_cache, update_descriptor_queue, + rescaling); if (!is_built.load(std::memory_order::relaxed)) { // Wait for the pipeline to be built @@ -192,17 +194,21 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute, }); } const void* const descriptor_data{update_descriptor_queue.UpdateData()}; - scheduler.Record([this, descriptor_data](vk::CommandBuffer cmdbuf) { - cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); - if (!descriptor_set_layout) { - return; - } - const VkDescriptorSet descriptor_set{descriptor_allocator.Commit()}; - const vk::Device& dev{device.GetLogical()}; - dev.UpdateDescriptorSet(descriptor_set, *descriptor_update_template, descriptor_data); - cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline_layout, 0, - descriptor_set, nullptr); - }); + scheduler.Record( + [this, descriptor_data, rescaling_data = rescaling.Data()](vk::CommandBuffer cmdbuf) { + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); + if (!descriptor_set_layout) { + return; + } + if (num_textures > 0) { + cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, rescaling_data); + } + const VkDescriptorSet descriptor_set{descriptor_allocator.Commit()}; + const vk::Device& dev{device.GetLogical()}; + dev.UpdateDescriptorSet(descriptor_set, *descriptor_update_template, descriptor_data); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline_layout, 0, + descriptor_set, nullptr); + }); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.h b/src/video_core/renderer_vulkan/vk_compute_pipeline.h index 8c4b0a301..e79ce4d7c 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.h @@ -59,6 +59,7 @@ private: vk::PipelineLayout pipeline_layout; vk::DescriptorUpdateTemplateKHR descriptor_update_template; vk::Pipeline pipeline; + u32 num_textures{}; std::condition_variable build_condvar; std::mutex build_mutex; diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 7e48d4458..967762c37 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -235,6 +235,7 @@ GraphicsPipeline::GraphicsPipeline( stage_infos[stage] = *info; enabled_uniform_buffer_masks[stage] = info->constant_buffer_mask; std::ranges::copy(info->constant_buffer_used_sizes, uniform_buffer_sizes[stage].begin()); + num_textures += Shader::NumDescriptors(info->texture_descriptors); } auto func{[this, shader_notify, &render_pass_cache, &descriptor_pool, pipeline_statistics] { DescriptorLayoutBuilder builder{MakeBuilder(device, stage_infos)}; @@ -428,12 +429,13 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { update_descriptor_queue.Acquire(); + RescalingPushConstant rescaling(num_textures); const VkSampler* samplers_it{samplers.data()}; const ImageId* views_it{image_view_ids.data()}; const auto prepare_stage{[&](size_t stage) LAMBDA_FORCEINLINE { buffer_cache.BindHostStageBuffers(stage); PushImageDescriptors(stage_infos[stage], samplers_it, views_it, texture_cache, - update_descriptor_queue); + update_descriptor_queue, rescaling); }}; if constexpr (Spec::enabled_stages[0]) { prepare_stage(0); @@ -450,10 +452,10 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { if constexpr (Spec::enabled_stages[4]) { prepare_stage(4); } - ConfigureDraw(); + ConfigureDraw(rescaling); } -void GraphicsPipeline::ConfigureDraw() { +void GraphicsPipeline::ConfigureDraw(const RescalingPushConstant& rescaling) { texture_cache.UpdateRenderTargets(false); scheduler.RequestRenderpass(texture_cache.GetFramebuffer()); @@ -464,12 +466,23 @@ void GraphicsPipeline::ConfigureDraw() { build_condvar.wait(lock, [this] { return is_built.load(std::memory_order::relaxed); }); }); } + const bool is_rescaling{texture_cache.IsRescaling()}; + const bool update_rescaling{scheduler.UpdateRescaling(is_rescaling)}; const bool bind_pipeline{scheduler.UpdateGraphicsPipeline(this)}; const void* const descriptor_data{update_descriptor_queue.UpdateData()}; - scheduler.Record([this, descriptor_data, bind_pipeline](vk::CommandBuffer cmdbuf) { + scheduler.Record([this, descriptor_data, bind_pipeline, rescaling_data = rescaling.Data(), + is_rescaling, update_rescaling](vk::CommandBuffer cmdbuf) { if (bind_pipeline) { cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *pipeline); } + if (update_rescaling) { + const f32 config_down_factor{Settings::values.resolution_info.down_factor}; + const float scale_down_factor{is_rescaling ? config_down_factor : 1.0f}; + cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_ALL_GRAPHICS, 0, + sizeof(scale_down_factor), &scale_down_factor); + } + cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_ALL_GRAPHICS, sizeof(f32), + sizeof(rescaling_data), rescaling_data.data()); if (!descriptor_set_layout) { return; } diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index 1c780e944..a0c1d8f07 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -62,6 +62,7 @@ namespace Vulkan { class Device; class PipelineStatistics; class RenderPassCache; +class RescalingPushConstant; class VKScheduler; class VKUpdateDescriptorQueue; @@ -113,7 +114,7 @@ private: template void ConfigureImpl(bool is_indexed); - void ConfigureDraw(); + void ConfigureDraw(const RescalingPushConstant& rescaling); void MakePipeline(VkRenderPass render_pass); @@ -138,6 +139,7 @@ private: std::array stage_infos; std::array enabled_uniform_buffer_masks{}; VideoCommon::UniformBufferSizes uniform_buffer_sizes{}; + u32 num_textures{}; vk::DescriptorSetLayout descriptor_set_layout; DescriptorAllocator descriptor_allocator; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index eb8b4e08b..691ef0841 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -139,6 +139,9 @@ Shader::RuntimeInfo MakeRuntimeInfo(std::span program } else { info.previous_stage_stores.mask.set(); } + for (const auto& stage : programs) { + info.num_textures += Shader::NumDescriptors(stage.info.texture_descriptors); + } const Shader::Stage stage{program.stage}; const bool has_geometry{key.unique_hashes[4] != 0 && !programs[4].is_geometry_passthrough}; const bool gl_ndc{key.state.ndc_minus_one_to_one != 0}; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 0c11c814f..3bfdf41ba 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -128,6 +128,15 @@ bool VKScheduler::UpdateGraphicsPipeline(GraphicsPipeline* pipeline) { return true; } +bool VKScheduler::UpdateRescaling(bool is_rescaling) { + if (state.rescaling_defined && is_rescaling == state.is_rescaling) { + return false; + } + state.rescaling_defined = true; + state.is_rescaling = is_rescaling; + return true; +} + void VKScheduler::WorkerThread(std::stop_token stop_token) { Common::SetCurrentThreadName("yuzu:VulkanWorker"); do { @@ -227,6 +236,7 @@ void VKScheduler::AllocateNewContext() { void VKScheduler::InvalidateState() { state.graphics_pipeline = nullptr; + state.rescaling_defined = false; state_tracker.InvalidateCommandBufferState(); } diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 85fc1712f..1b06c9296 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -56,6 +56,9 @@ public: /// Update the pipeline to the current execution context. bool UpdateGraphicsPipeline(GraphicsPipeline* pipeline); + /// Update the rescaling state. Returns true if the state has to be updated. + bool UpdateRescaling(bool is_rescaling); + /// Invalidates current command buffer state except for render passes void InvalidateState(); @@ -185,6 +188,8 @@ private: VkFramebuffer framebuffer = nullptr; VkExtent2D render_area = {0, 0}; GraphicsPipeline* graphics_pipeline = nullptr; + bool is_rescaling = false; + bool rescaling_defined = false; }; void WorkerThread(std::stop_token stop_token); From 138d9d7effa17d9bae9af60493c4cdc99a2d0487 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Mon, 26 Jul 2021 01:58:02 -0400 Subject: [PATCH 034/156] main: Add resolution scale label in the status bar Shows the resolution scale as "Scale: {}x" in the status bar, where {} is a floating point value representing the current resolution scaling factor. --- src/yuzu/main.cpp | 13 +++++++++++-- src/yuzu/main.h | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index 4e5552d2a..a246f6bb3 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -747,6 +747,8 @@ void GMainWindow::InitializeWidgets() { shader_building_label = new QLabel(); shader_building_label->setToolTip(tr("The amount of shaders currently being built")); + res_scale_label = new QLabel(); + res_scale_label->setToolTip(tr("The current selected resolution scaling multiplier.")); emu_speed_label = new QLabel(); emu_speed_label->setToolTip( tr("Current emulation speed. Values higher or lower than 100% " @@ -759,8 +761,8 @@ void GMainWindow::InitializeWidgets() { tr("Time taken to emulate a Switch frame, not counting framelimiting or v-sync. For " "full-speed emulation this should be at most 16.67 ms.")); - for (auto& label : - {shader_building_label, emu_speed_label, game_fps_label, emu_frametime_label}) { + for (auto& label : {shader_building_label, res_scale_label, emu_speed_label, game_fps_label, + emu_frametime_label}) { label->setVisible(false); label->setFrameStyle(QFrame::NoFrame); label->setContentsMargins(4, 0, 4, 0); @@ -1535,6 +1537,7 @@ void GMainWindow::ShutdownGame() { // Disable status bar updates status_bar_update_timer.stop(); shader_building_label->setVisible(false); + res_scale_label->setVisible(false); emu_speed_label->setVisible(false); game_fps_label->setVisible(false); emu_frametime_label->setVisible(false); @@ -2981,6 +2984,11 @@ void GMainWindow::UpdateStatusBar() { shader_building_label->setVisible(false); } + const auto res_info = Settings::values.resolution_info; + const auto res_scale = res_info.up_factor; + res_scale_label->setText( + tr("Scale: %1x", "%1 is the resolution scaling factor").arg(res_scale)); + if (Settings::values.use_speed_limit.GetValue()) { emu_speed_label->setText(tr("Speed: %1% / %2%") .arg(results.emulation_speed * 100.0, 0, 'f', 0) @@ -2996,6 +3004,7 @@ void GMainWindow::UpdateStatusBar() { } emu_frametime_label->setText(tr("Frame: %1 ms").arg(results.frametime * 1000.0, 0, 'f', 2)); + res_scale_label->setVisible(true); emu_speed_label->setVisible(!Settings::values.use_multi_core.GetValue()); game_fps_label->setVisible(true); emu_frametime_label->setVisible(true); diff --git a/src/yuzu/main.h b/src/yuzu/main.h index 981102daa..beb4f2984 100644 --- a/src/yuzu/main.h +++ b/src/yuzu/main.h @@ -328,6 +328,7 @@ private: // Status bar elements QLabel* message_label = nullptr; QLabel* shader_building_label = nullptr; + QLabel* res_scale_label = nullptr; QLabel* emu_speed_label = nullptr; QLabel* game_fps_label = nullptr; QLabel* emu_frametime_label = nullptr; From 0e8cf38f392f2ea6f7f5195070ad721b78590c04 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Mon, 26 Jul 2021 09:33:00 +0200 Subject: [PATCH 035/156] Texture Cache: Implement Blacklisting. --- .../renderer_vulkan/vk_compute_pipeline.cpp | 29 ++++++++++++++-- .../renderer_vulkan/vk_graphics_pipeline.cpp | 28 ++++++++++++++- src/video_core/texture_cache/image_base.h | 1 + src/video_core/texture_cache/texture_cache.h | 34 ++++++++++++++++++- .../texture_cache/texture_cache_base.h | 2 ++ 5 files changed, 90 insertions(+), 4 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index bda75788c..5c591e345 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -111,6 +111,7 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute, std::array image_view_ids; boost::container::static_vector image_view_indices; boost::container::static_vector samplers; + boost::container::static_vector image_view_blacklist; const auto& qmd{kepler_compute.launch_description}; const auto& cbufs{qmd.const_buffer_config}; @@ -151,10 +152,34 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute, samplers.push_back(sampler->Handle()); } } - std::ranges::for_each(info.image_descriptors, add_image); + const u32 black_list_base = image_view_indices.size(); + bool atleast_one_blacklisted = false; + for (const auto& desc : info.image_descriptors) { + const bool is_black_listed = + desc.is_written && (desc.type == Shader::TextureType::Color2D || + desc.type == Shader::TextureType::ColorArray2D); + for (u32 index = 0; index < desc.count; ++index) { + image_view_blacklist.push_back(is_black_listed); + } + atleast_one_blacklisted |= is_black_listed; + add_image(desc); + } const std::span indices_span(image_view_indices.data(), image_view_indices.size()); - texture_cache.FillComputeImageViews(indices_span, image_view_ids); + bool has_listed_stuffs; + do { + has_listed_stuffs = false; + texture_cache.FillComputeImageViews(indices_span, image_view_ids); + if (atleast_one_blacklisted) { + for (u32 index = 0; index < image_view_blacklist.size(); index++) { + if (image_view_blacklist[index]) { + ImageView& image_view{ + texture_cache.GetImageView(image_view_ids[index + black_list_base])}; + has_listed_stuffs |= texture_cache.BlackListImage(image_view.image_id); + } + } + } + } while (has_listed_stuffs); buffer_cache.UnbindComputeTextureBuffers(); ImageId* texture_buffer_ids{image_view_ids.data()}; diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 967762c37..4d966ee4b 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -280,6 +280,7 @@ template void GraphicsPipeline::ConfigureImpl(bool is_indexed) { std::array image_view_ids; std::array image_view_indices; + std::array image_view_blacklist; std::array samplers; size_t sampler_index{}; size_t image_index{}; @@ -290,6 +291,8 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { const auto& regs{maxwell3d.regs}; const bool via_header_index{regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex}; + u32 start_black_list = std::numeric_limits::max(); + u32 end_black_list = 0; const auto config_stage{[&](size_t stage) LAMBDA_FORCEINLINE { const Shader::Info& info{stage_infos[stage]}; buffer_cache.UnbindGraphicsStorageBuffers(stage); @@ -350,6 +353,15 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { } if constexpr (Spec::has_images) { for (const auto& desc : info.image_descriptors) { + if (desc.is_written && (desc.type == Shader::TextureType::Color2D || + desc.type == Shader::TextureType::ColorArray2D)) { + auto index_copy = image_index; + for (u32 index = 0; index < desc.count; ++index) { + start_black_list = std::min(start_black_list, index_copy); + image_view_blacklist[index_copy++] = true; + end_black_list = std::max(end_black_list, index_copy); + } + } add_image(desc); } } @@ -370,7 +382,21 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { config_stage(4); } const std::span indices_span(image_view_indices.data(), image_index); - texture_cache.FillGraphicsImageViews(indices_span, image_view_ids); + bool has_listed_stuffs; + do { + has_listed_stuffs = false; + texture_cache.FillGraphicsImageViews(indices_span, image_view_ids); + if constexpr (Spec::has_images) { + if (start_black_list < end_black_list) { + for (u32 index = start_black_list; index < end_black_list; index++) { + if (image_view_blacklist[index]) { + ImageView& image_view{texture_cache.GetImageView(image_view_ids[index])}; + has_listed_stuffs |= texture_cache.BlackListImage(image_view.image_id); + } + } + } + } + } while (has_listed_stuffs); ImageId* texture_buffer_index{image_view_ids.data()}; const auto bind_stage_info{[&](size_t stage) LAMBDA_FORCEINLINE { diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index 1cd30fd37..10dd52e28 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -37,6 +37,7 @@ enum class ImageFlagBits : u32 { // Rescaler Rescaled = 1 << 12, RescaleChecked = 1 << 13, + Blacklisted = 1 << 14, }; DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index ae74a6ecf..ce5994d5f 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -227,6 +227,7 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { flags[Dirty::RenderTargetControl] = false; bool can_rescale = true; + bool any_blacklisted = false; std::array tmp_color_images{}; ImageId tmp_depth_image{}; const auto check_rescale = [&](ImageViewId view_id, ImageId& id_save) { @@ -236,6 +237,7 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { id_save = image_id; auto& image = slot_images[image_id]; can_rescale &= ImageCanRescale(image); + any_blacklisted |= True(image.flags & ImageFlagBits::Blacklisted); } else { id_save = CORRUPT_ID; } @@ -268,10 +270,13 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { scale_up(tmp_depth_image); } else { rescaled = false; - const auto scale_down = [this](ImageId image_id) { + const auto scale_down = [this, any_blacklisted](ImageId image_id) { if (image_id != CORRUPT_ID) { Image& image = slot_images[image_id]; ScaleDown(image); + if (any_blacklisted) { + image.flags |= ImageFlagBits::Blacklisted; + } } }; for (size_t index = 0; index < NUM_RT; ++index) { @@ -736,8 +741,22 @@ ImageId TextureCache

::FindImage(const ImageInfo& info, GPUVAddr gpu_addr, return image_id; } +template +bool TextureCache

::BlackListImage(ImageId image_id) { + auto& image = slot_images[image_id]; + if (True(image.flags & ImageFlagBits::Blacklisted)) { + return false; + } + image.flags |= ImageFlagBits::Blacklisted; + ScaleDown(image); + return true; +} + template bool TextureCache

::ImageCanRescale(Image& image) { + if (True(image.flags & ImageFlagBits::Blacklisted)) { + return false; + } if (True(image.flags & ImageFlagBits::Rescaled) || True(image.flags & ImageFlagBits::RescaleChecked)) { return true; @@ -912,6 +931,7 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA bool can_rescale = (info.type == ImageType::e1D || info.type == ImageType::e2D) && info.block.depth == 0; bool any_rescaled = false; + bool any_blacklisted = false; for (const ImageId sibling_id : all_siblings) { if (!can_rescale) { break; @@ -919,6 +939,7 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA Image& sibling = slot_images[sibling_id]; can_rescale &= ImageCanRescale(sibling); any_rescaled |= True(sibling.flags & ImageFlagBits::Rescaled); + any_blacklisted |= True(sibling.flags & ImageFlagBits::Blacklisted); } can_rescale &= any_rescaled; @@ -932,6 +953,9 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA for (const ImageId sibling_id : all_siblings) { Image& sibling = slot_images[sibling_id]; ScaleDown(sibling); + if (any_blacklisted) { + sibling.flags |= ImageFlagBits::Blacklisted; + } } } @@ -1556,6 +1580,7 @@ void TextureCache

::SynchronizeAliases(ImageId image_id) { boost::container::small_vector aliased_images; Image& image = slot_images[image_id]; bool any_rescaled = True(image.flags & ImageFlagBits::Rescaled); + bool any_blacklisted = True(image.flags & ImageFlagBits::Blacklisted); u64 most_recent_tick = image.modification_tick; for (const AliasedImage& aliased : image.aliased_images) { ImageBase& aliased_image = slot_images[aliased.id]; @@ -1563,6 +1588,7 @@ void TextureCache

::SynchronizeAliases(ImageId image_id) { most_recent_tick = std::max(most_recent_tick, aliased_image.modification_tick); aliased_images.push_back(&aliased); any_rescaled |= True(aliased_image.flags & ImageFlagBits::Rescaled); + any_blacklisted |= True(aliased_image.flags & ImageFlagBits::Blacklisted); } } if (aliased_images.empty()) { @@ -1574,6 +1600,9 @@ void TextureCache

::SynchronizeAliases(ImageId image_id) { ScaleUp(image); } else { ScaleDown(image); + if (any_blacklisted) { + image.flags |= ImageFlagBits::Blacklisted; + } } } image.modification_tick = most_recent_tick; @@ -1589,6 +1618,9 @@ void TextureCache

::SynchronizeAliases(ImageId image_id) { ScaleUp(aliased_image); } else { ScaleDown(aliased_image); + if (any_blacklisted) { + aliased_image.flags |= ImageFlagBits::Blacklisted; + } } } CopyImage(image_id, aliased->id, aliased->copies); diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 0d2d9ec2e..35a29cd9b 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -176,6 +176,8 @@ public: [[nodiscard]] bool IsRescaling(); + [[nodiscard]] bool BlackListImage(ImageId image_id); + std::mutex mutex; private: From 117f8ee7a4fa17e76a4f7eb29560fcc534ce8099 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 27 Jul 2021 00:15:27 +0200 Subject: [PATCH 036/156] Vulkan: Fix AA when rescaling. --- src/video_core/renderer_vulkan/vk_blit_screen.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 888bc7392..7051e6559 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -614,7 +614,7 @@ void VKBlitScreen::CreateSampler() { .pNext = nullptr, .flags = 0, .magFilter = VK_FILTER_LINEAR, - .minFilter = VK_FILTER_NEAREST, + .minFilter = VK_FILTER_LINEAR, .mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR, .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, From cee7eba64e639d53613fee45f569377e05a4c6f9 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 27 Jul 2021 00:48:22 +0200 Subject: [PATCH 037/156] OpenGL: set linear mag filter when blitting a downscaled image. --- src/video_core/renderer_opengl/renderer_opengl.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 7d7cba69c..0f7b69c6d 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -257,6 +257,7 @@ void RendererOpenGL::InitOpenGLObjects() { // Generate presentation sampler present_sampler.Create(); glSamplerParameteri(present_sampler.handle, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glSamplerParameteri(present_sampler.handle, GL_TEXTURE_MAG_FILTER, GL_LINEAR); // Generate VBO handle for drawing vertex_buffer.Create(); From 07c564f38b238af9be7a9d8aee1149a353c2880b Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 27 Jul 2021 01:29:55 +0200 Subject: [PATCH 038/156] Texture Cache: Implement Rating System. --- src/video_core/texture_cache/image_base.cpp | 6 +-- src/video_core/texture_cache/image_base.h | 2 + src/video_core/texture_cache/image_info.cpp | 11 +++++ src/video_core/texture_cache/image_info.h | 1 + src/video_core/texture_cache/texture_cache.h | 42 ++++++++++++++------ 5 files changed, 47 insertions(+), 15 deletions(-) diff --git a/src/video_core/texture_cache/image_base.cpp b/src/video_core/texture_cache/image_base.cpp index 6052d148a..e9e725edf 100644 --- a/src/video_core/texture_cache/image_base.cpp +++ b/src/video_core/texture_cache/image_base.cpp @@ -60,9 +60,9 @@ namespace { ImageBase::ImageBase(const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) : info{info_}, guest_size_bytes{CalculateGuestSizeInBytes(info)}, unswizzled_size_bytes{CalculateUnswizzledSizeBytes(info)}, - converted_size_bytes{CalculateConvertedSizeBytes(info)}, gpu_addr{gpu_addr_}, - cpu_addr{cpu_addr_}, cpu_addr_end{cpu_addr + guest_size_bytes}, - mip_level_offsets{CalculateMipLevelOffsets(info)} { + converted_size_bytes{CalculateConvertedSizeBytes(info)}, scale_rating{}, + scale_tick{}, gpu_addr{gpu_addr_}, cpu_addr{cpu_addr_}, + cpu_addr_end{cpu_addr + guest_size_bytes}, mip_level_offsets{CalculateMipLevelOffsets(info)} { if (info.type == ImageType::e3D) { slice_offsets = CalculateSliceOffsets(info); slice_subresources = CalculateSliceSubresources(info); diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index 10dd52e28..97f107b4d 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -78,6 +78,8 @@ struct ImageBase { u32 guest_size_bytes = 0; u32 unswizzled_size_bytes = 0; u32 converted_size_bytes = 0; + u32 scale_rating = 0; + u64 scale_tick = 0; ImageFlagBits flags = ImageFlagBits::CpuModified; GPUVAddr gpu_addr = 0; diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp index 022ca9033..7fa8fd4fe 100644 --- a/src/video_core/texture_cache/image_info.cpp +++ b/src/video_core/texture_cache/image_info.cpp @@ -31,6 +31,7 @@ ImageInfo::ImageInfo(const TICEntry& config) noexcept { .depth = config.block_depth, }; } + rescaleable = false; tile_width_spacing = config.tile_width_spacing; if (config.texture_type != TextureType::Texture2D && config.texture_type != TextureType::Texture2DNoMipmap) { @@ -53,12 +54,14 @@ ImageInfo::ImageInfo(const TICEntry& config) noexcept { case TextureType::Texture2DNoMipmap: ASSERT(config.Depth() == 1); type = config.IsPitchLinear() ? ImageType::Linear : ImageType::e2D; + rescaleable = !config.IsPitchLinear(); size.width = config.Width(); size.height = config.Height(); resources.layers = config.BaseLayer() + 1; break; case TextureType::Texture2DArray: type = ImageType::e2D; + rescaleable = true; size.width = config.Width(); size.height = config.Height(); resources.layers = config.BaseLayer() + config.Depth(); @@ -98,12 +101,14 @@ ImageInfo::ImageInfo(const TICEntry& config) noexcept { // FIXME: Call this without passing *this layer_stride = CalculateLayerStride(*this); maybe_unaligned_layer_stride = CalculateLayerSize(*this); + rescaleable &= (block.depth == 0) && resources.levels == 1; } } ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) noexcept { const auto& rt = regs.rt[index]; format = VideoCore::Surface::PixelFormatFromRenderTargetFormat(rt.format); + rescaleable = false; if (rt.tile_mode.is_pitch_linear) { ASSERT(rt.tile_mode.is_3d == 0); type = ImageType::Linear; @@ -129,6 +134,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) type = ImageType::e3D; size.depth = rt.depth; } else { + rescaleable = block.depth == 0 && size.height > 256; type = ImageType::e2D; resources.layers = rt.depth; } @@ -138,6 +144,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept { format = VideoCore::Surface::PixelFormatFromDepthFormat(regs.zeta.format); size.width = regs.zeta_width; size.height = regs.zeta_height; + rescaleable = false; resources.levels = 1; layer_stride = regs.zeta.layer_stride * 4; maybe_unaligned_layer_stride = layer_stride; @@ -156,6 +163,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept { type = ImageType::e3D; size.depth = regs.zeta_depth; } else { + rescaleable = block.depth == 0 && size.height > 256; type = ImageType::e2D; resources.layers = regs.zeta_depth; } @@ -164,6 +172,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept { ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept { UNIMPLEMENTED_IF_MSG(config.layer != 0, "Surface layer is not zero"); format = VideoCore::Surface::PixelFormatFromRenderTargetFormat(config.format); + rescaleable = false; if (config.linear == Tegra::Engines::Fermi2D::MemoryLayout::Pitch) { type = ImageType::Linear; size = Extent3D{ @@ -174,6 +183,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept { pitch = config.pitch; } else { type = config.block_depth > 0 ? ImageType::e3D : ImageType::e2D; + block = Extent3D{ .width = config.block_width, .height = config.block_height, @@ -186,6 +196,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept { .height = config.height, .depth = 1, }; + rescaleable = block.depth == 0 && size.height > 256; } } diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h index 16d4cee37..e874d2870 100644 --- a/src/video_core/texture_cache/image_info.h +++ b/src/video_core/texture_cache/image_info.h @@ -33,6 +33,7 @@ struct ImageInfo { u32 maybe_unaligned_layer_stride = 0; u32 num_samples = 1; u32 tile_width_spacing = 0; + bool rescaleable = false; }; } // namespace VideoCommon diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index ce5994d5f..be40f6b88 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -216,7 +216,10 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { return; } + u32 scale_rating; bool rescaled; + std::array tmp_color_images{}; + ImageId tmp_depth_image{}; do { flags[Dirty::RenderTargets] = false; @@ -226,10 +229,10 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { const bool force = flags[Dirty::RenderTargetControl]; flags[Dirty::RenderTargetControl] = false; + scale_rating = 0; + bool any_rescaled = false; bool can_rescale = true; bool any_blacklisted = false; - std::array tmp_color_images{}; - ImageId tmp_depth_image{}; const auto check_rescale = [&](ImageViewId view_id, ImageId& id_save) { if (view_id) { const auto& view = slot_image_views[view_id]; @@ -238,6 +241,10 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { auto& image = slot_images[image_id]; can_rescale &= ImageCanRescale(image); any_blacklisted |= True(image.flags & ImageFlagBits::Blacklisted); + any_rescaled |= True(image.flags & ImageFlagBits::Rescaled); + scale_rating = std::max(scale_rating, image.scale_tick <= frame_tick + ? image.scale_rating + 1U + : image.scale_rating); } else { id_save = CORRUPT_ID; } @@ -257,17 +264,19 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { check_rescale(render_targets.depth_buffer_id, tmp_depth_image); if (can_rescale) { - rescaled = true; + rescaled = any_rescaled || scale_rating >= 2; const auto scale_up = [this](ImageId image_id) { if (image_id != CORRUPT_ID) { Image& image = slot_images[image_id]; ScaleUp(image); } }; - for (size_t index = 0; index < NUM_RT; ++index) { - scale_up(tmp_color_images[index]); + if (rescaled) { + for (size_t index = 0; index < NUM_RT; ++index) { + scale_up(tmp_color_images[index]); + } + scale_up(tmp_depth_image); } - scale_up(tmp_depth_image); } else { rescaled = false; const auto scale_down = [this, any_blacklisted](ImageId image_id) { @@ -283,10 +292,23 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { scale_down(tmp_color_images[index]); } scale_down(tmp_depth_image); + scale_rating = 0; } } while (has_deleted_images); // Rescale End + const auto set_rating = [this, scale_rating](ImageId image_id) { + if (image_id != CORRUPT_ID) { + Image& image = slot_images[image_id]; + image.scale_rating = scale_rating; + image.scale_tick = frame_tick + 1; + } + }; + for (size_t index = 0; index < NUM_RT; ++index) { + set_rating(tmp_color_images[index]); + } + set_rating(tmp_depth_image); + if (is_rescaling != rescaled) { flags[Dirty::RescaleViewports] = true; flags[Dirty::RescaleScissors] = true; @@ -761,10 +783,7 @@ bool TextureCache

::ImageCanRescale(Image& image) { True(image.flags & ImageFlagBits::RescaleChecked)) { return true; } - const auto& info = image.info; - const bool can_this_rescale = - (info.type == ImageType::e1D || info.type == ImageType::e2D) && info.block.depth == 0; - if (!can_this_rescale) { + if (!image.info.rescaleable) { image.flags &= ~ImageFlagBits::RescaleChecked; return false; } @@ -928,8 +947,7 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA }; ForEachSparseImageInRegion(gpu_addr, size_bytes, region_check_gpu); - bool can_rescale = - (info.type == ImageType::e1D || info.type == ImageType::e2D) && info.block.depth == 0; + bool can_rescale = info.rescaleable; bool any_rescaled = false; bool any_blacklisted = false; for (const ImageId sibling_id : all_siblings) { From 48d81506a32deb019dc65cd8099be290a392fd8d Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 27 Jul 2021 01:46:15 +0200 Subject: [PATCH 039/156] Vulkan: Fix downscaling Blit. --- .../renderer_vulkan/vk_texture_cache.cpp | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 575f12566..b21992fce 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -595,7 +595,8 @@ struct RangedBarrierRange { } void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, const ImageInfo& info, - VkImageAspectFlags aspect_mask, const Settings::ResolutionScalingInfo& resolution) { + VkImageAspectFlags aspect_mask, const Settings::ResolutionScalingInfo& resolution, + bool scaling) { const auto type = info.type; const auto resources = info.resources; const VkExtent2D extent{ @@ -603,15 +604,18 @@ void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, con .height = info.size.height, }; scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([dst_image, src_image, extent, resources, aspect_mask, resolution, - type](vk::CommandBuffer cmdbuf) { + scheduler.Record([dst_image, src_image, extent, resources, aspect_mask, resolution, type, + scaling](vk::CommandBuffer cmdbuf) { const auto scale_up = [&](u32 value) { return std::max((value * resolution.up_scale) >> resolution.down_shift, 1U); }; - const bool is_2d = type == ImageType::e2D; - const VkOffset2D mip0_size{ - .x = static_cast(scale_up(extent.width)), - .y = static_cast(is_2d ? scale_up(extent.height) : extent.height), + const VkOffset2D src_size{ + .x = static_cast(scaling ? extent.width : scale_up(extent.width)), + .y = static_cast(scaling ? extent.height : scale_up(extent.height)), + }; + const VkOffset2D dst_size{ + .x = static_cast(scaling ? scale_up(extent.width) : extent.width), + .y = static_cast(scaling ? scale_up(extent.height) : extent.height), }; boost::container::small_vector regions; regions.reserve(resources.levels); @@ -630,8 +634,8 @@ void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, con .z = 0, }, { - .x = std::max(1, static_cast(extent.width) >> level), - .y = std::max(1, static_cast(extent.height) >> level), + .x = std::max(1, src_size.x >> level), + .y = std::max(1, src_size.y >> level), .z = 1, }, }, @@ -648,8 +652,8 @@ void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, con .z = 0, }, { - .x = std::max(1, mip0_size.x >> level), - .y = std::max(1, mip0_size.y >> level), + .x = std::max(1, dst_size.x >> level), + .y = std::max(1, dst_size.y >> level), .z = 1, }, }, @@ -1157,7 +1161,7 @@ bool Image::ScaleUp(bool save_as_backup) { if (aspect_mask == 0) { aspect_mask = ImageAspectMask(info.format); } - BlitScale(*scheduler, *image, *rescaled_image, info, aspect_mask, resolution); + BlitScale(*scheduler, *image, *rescaled_image, info, aspect_mask, resolution, true); return true; } @@ -1184,14 +1188,14 @@ bool Image::ScaleDown(bool save_as_backup) { const auto& resolution = runtime->resolution; vk::Image downscaled_image = - MakeImage(runtime->device, info, resolution.up_scale, resolution.down_shift); + MakeImage(runtime->device, info); MemoryCommit new_commit( runtime->memory_allocator.Commit(downscaled_image, MemoryUsage::DeviceLocal)); if (aspect_mask == 0) { aspect_mask = ImageAspectMask(info.format); } - BlitScale(*scheduler, *image, *downscaled_image, info, aspect_mask, resolution); + BlitScale(*scheduler, *image, *downscaled_image, info, aspect_mask, resolution, false); if (save_as_backup) { backup_image = std::move(image); From 56ccda1d9952368d0c1e29d7c4b486c547de9549 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Wed, 28 Jul 2021 02:47:06 -0300 Subject: [PATCH 040/156] texture_cache: Simplify image view queries and blacklisting --- .../renderer_opengl/gl_compute_pipeline.cpp | 36 ++++++----- .../renderer_opengl/gl_graphics_pipeline.cpp | 42 ++++++------- .../renderer_opengl/gl_texture_cache.cpp | 10 ++-- .../renderer_opengl/gl_texture_cache.h | 5 +- .../renderer_vulkan/pipeline_helper.h | 19 +++--- .../renderer_vulkan/vk_compute_pipeline.cpp | 60 +++++++------------ .../renderer_vulkan/vk_graphics_pipeline.cpp | 54 ++++------------- .../renderer_vulkan/vk_texture_cache.cpp | 22 ++++--- .../renderer_vulkan/vk_texture_cache.h | 41 ++++++++----- src/video_core/texture_cache/image_base.cpp | 2 + src/video_core/texture_cache/image_base.h | 3 + .../texture_cache/image_view_base.cpp | 2 +- .../texture_cache/image_view_base.h | 4 +- src/video_core/texture_cache/texture_cache.h | 47 ++++++++------- .../texture_cache/texture_cache_base.h | 30 ++++++---- src/video_core/texture_cache/types.h | 7 +++ 16 files changed, 192 insertions(+), 192 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp index 19c8ca7b2..ab2baefbb 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp @@ -79,8 +79,7 @@ void ComputePipeline::Configure() { } texture_cache.SynchronizeComputeDescriptors(); - std::array image_view_ids; - boost::container::static_vector image_view_indices; + boost::container::static_vector views; std::array samplers; std::array textures; std::array images; @@ -110,33 +109,39 @@ void ComputePipeline::Configure() { } return TexturePair(gpu_memory.Read(addr), via_header_index); }}; - const auto add_image{[&](const auto& desc) { + const auto add_image{[&](const auto& desc, bool blacklist) { for (u32 index = 0; index < desc.count; ++index) { const auto handle{read_handle(desc, index)}; - image_view_indices.push_back(handle.first); + views.push_back({ + .index = handle.first, + .blacklist = blacklist, + .id = {}, + }); } }}; for (const auto& desc : info.texture_buffer_descriptors) { for (u32 index = 0; index < desc.count; ++index) { const auto handle{read_handle(desc, index)}; - image_view_indices.push_back(handle.first); + views.push_back({handle.first}); samplers[sampler_binding++] = 0; } } - std::ranges::for_each(info.image_buffer_descriptors, add_image); + for (const auto& desc : info.image_buffer_descriptors) { + add_image(desc, false); + } for (const auto& desc : info.texture_descriptors) { for (u32 index = 0; index < desc.count; ++index) { const auto handle{read_handle(desc, index)}; - image_view_indices.push_back(handle.first); + views.push_back({handle.first}); Sampler* const sampler = texture_cache.GetComputeSampler(handle.second); samplers[sampler_binding++] = sampler->Handle(); } } - std::ranges::for_each(info.image_descriptors, add_image); - - const std::span indices_span(image_view_indices.data(), image_view_indices.size()); - texture_cache.FillComputeImageViews(indices_span, image_view_ids); + for (const auto& desc : info.image_descriptors) { + add_image(desc, true); + } + texture_cache.FillComputeImageViews(std::span(views.data(), views.size())); if (assembly_program.handle != 0) { program_manager.BindComputeAssemblyProgram(assembly_program.handle); @@ -152,7 +157,7 @@ void ComputePipeline::Configure() { if constexpr (is_image) { is_written = desc.is_written; } - ImageView& image_view{texture_cache.GetImageView(image_view_ids[texbuf_index])}; + ImageView& image_view{texture_cache.GetImageView(views[texbuf_index].id)}; buffer_cache.BindComputeTextureBuffer(texbuf_index, image_view.GpuAddr(), image_view.BufferSize(), image_view.format, is_written, is_image); @@ -168,19 +173,20 @@ void ComputePipeline::Configure() { buffer_cache.runtime.SetImagePointers(textures.data(), images.data()); buffer_cache.BindHostComputeBuffers(); - const ImageId* views_it{image_view_ids.data() + num_texture_buffers + num_image_buffers}; + const VideoCommon::ImageViewInOut* views_it{views.data() + num_texture_buffers + + num_image_buffers}; texture_binding += num_texture_buffers; image_binding += num_image_buffers; for (const auto& desc : info.texture_descriptors) { for (u32 index = 0; index < desc.count; ++index) { - ImageView& image_view{texture_cache.GetImageView(*(views_it++))}; + ImageView& image_view{texture_cache.GetImageView((views_it++)->id)}; textures[texture_binding++] = image_view.Handle(desc.type); } } for (const auto& desc : info.image_descriptors) { for (u32 index = 0; index < desc.count; ++index) { - ImageView& image_view{texture_cache.GetImageView(*(views_it++))}; + ImageView& image_view{texture_cache.GetImageView((views_it++)->id)}; if (desc.is_written) { texture_cache.MarkModification(image_view.image_id); } diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index 43ab5c03b..0bbda7951 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp @@ -15,7 +15,7 @@ #include "video_core/renderer_opengl/gl_shader_util.h" #include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/shader_notify.h" -#include "video_core/texture_cache/texture_cache_base.h" +#include "video_core/texture_cache/texture_cache.h" #if defined(_MSC_VER) && defined(NDEBUG) #define LAMBDA_FORCEINLINE [[msvc::forceinline]] @@ -280,10 +280,9 @@ GraphicsPipeline::GraphicsPipeline( template void GraphicsPipeline::ConfigureImpl(bool is_indexed) { - std::array image_view_ids; - std::array image_view_indices; + std::array views; std::array samplers; - size_t image_view_index{}; + size_t views_index{}; GLsizei sampler_binding{}; texture_cache.SynchronizeGraphicsDescriptors(); @@ -328,30 +327,34 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { } return TexturePair(gpu_memory.Read(addr), via_header_index); }}; - const auto add_image{[&](const auto& desc) { + const auto add_image{[&](const auto& desc, bool blacklist) LAMBDA_FORCEINLINE { for (u32 index = 0; index < desc.count; ++index) { const auto handle{read_handle(desc, index)}; - image_view_indices[image_view_index++] = handle.first; + views[views_index++] = { + .index = handle.first, + .blacklist = blacklist, + .id = {}, + }; } }}; if constexpr (Spec::has_texture_buffers) { for (const auto& desc : info.texture_buffer_descriptors) { for (u32 index = 0; index < desc.count; ++index) { const auto handle{read_handle(desc, index)}; - image_view_indices[image_view_index++] = handle.first; + views[views_index++] = {handle.first}; samplers[sampler_binding++] = 0; } } } if constexpr (Spec::has_image_buffers) { for (const auto& desc : info.image_buffer_descriptors) { - add_image(desc); + add_image(desc, false); } } for (const auto& desc : info.texture_descriptors) { for (u32 index = 0; index < desc.count; ++index) { const auto handle{read_handle(desc, index)}; - image_view_indices[image_view_index++] = handle.first; + views[views_index++] = {handle.first}; Sampler* const sampler{texture_cache.GetGraphicsSampler(handle.second)}; samplers[sampler_binding++] = sampler->Handle(); @@ -359,7 +362,7 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { } if constexpr (Spec::has_images) { for (const auto& desc : info.image_descriptors) { - add_image(desc); + add_image(desc, true); } } }}; @@ -378,13 +381,12 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { if constexpr (Spec::enabled_stages[4]) { config_stage(4); } - const std::span indices_span(image_view_indices.data(), image_view_index); - texture_cache.FillGraphicsImageViews(indices_span, image_view_ids); + texture_cache.FillGraphicsImageViews(std::span(views.data(), views_index)); texture_cache.UpdateRenderTargets(false); state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); - ImageId* texture_buffer_index{image_view_ids.data()}; + VideoCommon::ImageViewInOut* texture_buffer_it{views.data()}; const auto bind_stage_info{[&](size_t stage) LAMBDA_FORCEINLINE { size_t index{}; const auto add_buffer{[&](const auto& desc) { @@ -394,12 +396,12 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { if constexpr (is_image) { is_written = desc.is_written; } - ImageView& image_view{texture_cache.GetImageView(*texture_buffer_index)}; + ImageView& image_view{texture_cache.GetImageView(texture_buffer_it->id)}; buffer_cache.BindGraphicsTextureBuffer(stage, index, image_view.GpuAddr(), image_view.BufferSize(), image_view.format, is_written, is_image); ++index; - ++texture_buffer_index; + ++texture_buffer_it; } }}; const Shader::Info& info{stage_infos[stage]}; @@ -415,9 +417,9 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { add_buffer(desc); } } - texture_buffer_index += Shader::NumDescriptors(info.texture_descriptors); + texture_buffer_it += Shader::NumDescriptors(info.texture_descriptors); if constexpr (Spec::has_images) { - texture_buffer_index += Shader::NumDescriptors(info.image_descriptors); + texture_buffer_it += Shader::NumDescriptors(info.image_descriptors); } }}; if constexpr (Spec::enabled_stages[0]) { @@ -446,7 +448,7 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { } else { program_manager.BindSourcePrograms(source_programs); } - const ImageId* views_it{image_view_ids.data()}; + const VideoCommon::ImageViewInOut* views_it{views.data()}; GLsizei texture_binding = 0; GLsizei image_binding = 0; std::array textures; @@ -464,13 +466,13 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { const auto& info{stage_infos[stage]}; for (const auto& desc : info.texture_descriptors) { for (u32 index = 0; index < desc.count; ++index) { - ImageView& image_view{texture_cache.GetImageView(*(views_it++))}; + ImageView& image_view{texture_cache.GetImageView((views_it++)->id)}; textures[texture_binding++] = image_view.Handle(desc.type); } } for (const auto& desc : info.image_descriptors) { for (u32 index = 0; index < desc.count; ++index) { - ImageView& image_view{texture_cache.GetImageView(*(views_it++))}; + ImageView& image_view{texture_cache.GetImageView((views_it++)->id)}; if (desc.is_written) { texture_cache.MarkModification(image_view.image_id); } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 5e2695576..5d14bfc97 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -472,11 +472,7 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& set_view(Shader::TextureType::ColorArray1D, null_image_1d_array.handle); set_view(Shader::TextureType::ColorArray2D, null_image_view_2d_array.handle); set_view(Shader::TextureType::ColorArrayCube, null_image_cube_array.handle); -} -TextureCacheRuntime::~TextureCacheRuntime() = default; - -void TextureCacheRuntime::Init() { resolution = Settings::values.resolution_info; is_rescaling_on = resolution.up_scale != 1 || resolution.down_shift != 0; if (is_rescaling_on) { @@ -485,6 +481,8 @@ void TextureCacheRuntime::Init() { } } +TextureCacheRuntime::~TextureCacheRuntime() = default; + void TextureCacheRuntime::Finish() { glFinish(); } @@ -685,6 +683,8 @@ Image::Image(TextureCacheRuntime& runtime_, const VideoCommon::ImageInfo& info_, } } +Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBase{params} {} + Image::~Image() = default; void Image::UploadMemory(const ImageBufferMap& map, @@ -1076,7 +1076,7 @@ ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, const VideoCommon::ImageViewInfo& view_info) : VideoCommon::ImageViewBase{info, view_info} {} -ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::NullImageParams& params) +ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::NullImageViewParams& params) : VideoCommon::ImageViewBase{params}, views{runtime.null_image_views} {} GLuint ImageView::StorageView(Shader::TextureType texture_type, Shader::ImageFormat image_format) { diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 787b63e87..e76ec522a 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -73,8 +73,6 @@ public: StateTracker& state_tracker); ~TextureCacheRuntime(); - void Init(); - void Finish(); ImageBufferMap UploadStagingBuffer(size_t size); @@ -167,6 +165,7 @@ class Image : public VideoCommon::ImageBase { public: explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); + explicit Image(const VideoCommon::NullImageParams&); ~Image(); @@ -223,7 +222,7 @@ public: const VideoCommon::ImageViewInfo&, GPUVAddr); explicit ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, const VideoCommon::ImageViewInfo& view_info); - explicit ImageView(TextureCacheRuntime&, const VideoCommon::NullImageParams&); + explicit ImageView(TextureCacheRuntime&, const VideoCommon::NullImageViewParams&); [[nodiscard]] GLuint StorageView(Shader::TextureType texture_type, Shader::ImageFormat image_format); diff --git a/src/video_core/renderer_vulkan/pipeline_helper.h b/src/video_core/renderer_vulkan/pipeline_helper.h index 7ba6078df..bf18b34d1 100644 --- a/src/video_core/renderer_vulkan/pipeline_helper.h +++ b/src/video_core/renderer_vulkan/pipeline_helper.h @@ -156,28 +156,27 @@ private: u32 texture_bit{1u}; }; -inline void PushImageDescriptors(const Shader::Info& info, const VkSampler*& samplers, - const ImageId*& image_view_ids, TextureCache& texture_cache, +inline void PushImageDescriptors(TextureCache& texture_cache, VKUpdateDescriptorQueue& update_descriptor_queue, - RescalingPushConstant& rescaling) { - static constexpr VideoCommon::ImageViewId NULL_IMAGE_VIEW_ID{0}; - image_view_ids += Shader::NumDescriptors(info.texture_buffer_descriptors); - image_view_ids += Shader::NumDescriptors(info.image_buffer_descriptors); + const Shader::Info& info, RescalingPushConstant& rescaling, + const VkSampler*& samplers, + const VideoCommon::ImageViewInOut*& views) { + views += Shader::NumDescriptors(info.texture_buffer_descriptors); + views += Shader::NumDescriptors(info.image_buffer_descriptors); for (const auto& desc : info.texture_descriptors) { for (u32 index = 0; index < desc.count; ++index) { - const VideoCommon::ImageViewId image_view_id{*(image_view_ids++)}; + const VideoCommon::ImageViewId image_view_id{(views++)->id}; const VkSampler sampler{*(samplers++)}; ImageView& image_view{texture_cache.GetImageView(image_view_id)}; const Image& image{texture_cache.GetImage(image_view.image_id)}; const VkImageView vk_image_view{image_view.Handle(desc.type)}; update_descriptor_queue.AddSampledImage(vk_image_view, sampler); - rescaling.PushTexture(image_view_id != NULL_IMAGE_VIEW_ID && - True(image.flags & VideoCommon::ImageFlagBits::Rescaled)); + rescaling.PushTexture(True(image.flags & VideoCommon::ImageFlagBits::Rescaled)); } } for (const auto& desc : info.image_descriptors) { for (u32 index = 0; index < desc.count; ++index) { - ImageView& image_view{texture_cache.GetImageView(*(image_view_ids++))}; + ImageView& image_view{texture_cache.GetImageView((views++)->id)}; if (desc.is_written) { texture_cache.MarkModification(image_view.image_id); } diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 5c591e345..f89b84c6e 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -108,10 +108,8 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute, texture_cache.SynchronizeComputeDescriptors(); static constexpr size_t max_elements = 64; - std::array image_view_ids; - boost::container::static_vector image_view_indices; + boost::container::static_vector views; boost::container::static_vector samplers; - boost::container::static_vector image_view_blacklist; const auto& qmd{kepler_compute.launch_description}; const auto& cbufs{qmd.const_buffer_config}; @@ -135,54 +133,37 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute, } return TexturePair(gpu_memory.Read(addr), via_header_index); }}; - const auto add_image{[&](const auto& desc) { + const auto add_image{[&](const auto& desc, bool blacklist) { for (u32 index = 0; index < desc.count; ++index) { const auto handle{read_handle(desc, index)}; - image_view_indices.push_back(handle.first); + views.push_back({ + .index = handle.first, + .blacklist = blacklist, + .id = {}, + }); } }}; - std::ranges::for_each(info.texture_buffer_descriptors, add_image); - std::ranges::for_each(info.image_buffer_descriptors, add_image); + for (const auto& desc : info.texture_buffer_descriptors) { + add_image(desc, false); + } + for (const auto& desc : info.image_buffer_descriptors) { + add_image(desc, false); + } for (const auto& desc : info.texture_descriptors) { for (u32 index = 0; index < desc.count; ++index) { const auto handle{read_handle(desc, index)}; - image_view_indices.push_back(handle.first); + views.push_back({handle.first}); Sampler* const sampler = texture_cache.GetComputeSampler(handle.second); samplers.push_back(sampler->Handle()); } } - const u32 black_list_base = image_view_indices.size(); - bool atleast_one_blacklisted = false; for (const auto& desc : info.image_descriptors) { - const bool is_black_listed = - desc.is_written && (desc.type == Shader::TextureType::Color2D || - desc.type == Shader::TextureType::ColorArray2D); - for (u32 index = 0; index < desc.count; ++index) { - image_view_blacklist.push_back(is_black_listed); - } - atleast_one_blacklisted |= is_black_listed; - add_image(desc); + add_image(desc, true); } - - const std::span indices_span(image_view_indices.data(), image_view_indices.size()); - bool has_listed_stuffs; - do { - has_listed_stuffs = false; - texture_cache.FillComputeImageViews(indices_span, image_view_ids); - if (atleast_one_blacklisted) { - for (u32 index = 0; index < image_view_blacklist.size(); index++) { - if (image_view_blacklist[index]) { - ImageView& image_view{ - texture_cache.GetImageView(image_view_ids[index + black_list_base])}; - has_listed_stuffs |= texture_cache.BlackListImage(image_view.image_id); - } - } - } - } while (has_listed_stuffs); + texture_cache.FillComputeImageViews(std::span(views.data(), views.size())); buffer_cache.UnbindComputeTextureBuffers(); - ImageId* texture_buffer_ids{image_view_ids.data()}; size_t index{}; const auto add_buffer{[&](const auto& desc) { constexpr bool is_image = std::is_same_v; @@ -191,11 +172,10 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute, if constexpr (is_image) { is_written = desc.is_written; } - ImageView& image_view = texture_cache.GetImageView(*texture_buffer_ids); + ImageView& image_view = texture_cache.GetImageView(views[index].id); buffer_cache.BindComputeTextureBuffer(index, image_view.GpuAddr(), image_view.BufferSize(), image_view.format, is_written, is_image); - ++texture_buffer_ids; ++index; } }}; @@ -207,9 +187,9 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute, RescalingPushConstant rescaling(num_textures); const VkSampler* samplers_it{samplers.data()}; - const ImageId* views_it{image_view_ids.data()}; - PushImageDescriptors(info, samplers_it, views_it, texture_cache, update_descriptor_queue, - rescaling); + const VideoCommon::ImageViewInOut* views_it{views.data()}; + PushImageDescriptors(texture_cache, update_descriptor_queue, info, rescaling, samplers_it, + views_it); if (!is_built.load(std::memory_order::relaxed)) { // Wait for the pipeline to be built diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 4d966ee4b..4efb5d735 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -278,12 +278,10 @@ void GraphicsPipeline::AddTransition(GraphicsPipeline* transition) { template void GraphicsPipeline::ConfigureImpl(bool is_indexed) { - std::array image_view_ids; - std::array image_view_indices; - std::array image_view_blacklist; + std::array views; std::array samplers; size_t sampler_index{}; - size_t image_index{}; + size_t view_index{}; texture_cache.SynchronizeGraphicsDescriptors(); @@ -291,8 +289,6 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { const auto& regs{maxwell3d.regs}; const bool via_header_index{regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex}; - u32 start_black_list = std::numeric_limits::max(); - u32 end_black_list = 0; const auto config_stage{[&](size_t stage) LAMBDA_FORCEINLINE { const Shader::Info& info{stage_infos[stage]}; buffer_cache.UnbindGraphicsStorageBuffers(stage); @@ -329,7 +325,7 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { const auto add_image{[&](const auto& desc) { for (u32 index = 0; index < desc.count; ++index) { const auto handle{read_handle(desc, index)}; - image_view_indices[image_index++] = handle.first; + views[view_index++] = {handle.first}; } }}; if constexpr (Spec::has_texture_buffers) { @@ -345,7 +341,7 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { for (const auto& desc : info.texture_descriptors) { for (u32 index = 0; index < desc.count; ++index) { const auto handle{read_handle(desc, index)}; - image_view_indices[image_index++] = handle.first; + views[view_index++] = {handle.first}; Sampler* const sampler{texture_cache.GetGraphicsSampler(handle.second)}; samplers[sampler_index++] = sampler->Handle(); @@ -353,15 +349,6 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { } if constexpr (Spec::has_images) { for (const auto& desc : info.image_descriptors) { - if (desc.is_written && (desc.type == Shader::TextureType::Color2D || - desc.type == Shader::TextureType::ColorArray2D)) { - auto index_copy = image_index; - for (u32 index = 0; index < desc.count; ++index) { - start_black_list = std::min(start_black_list, index_copy); - image_view_blacklist[index_copy++] = true; - end_black_list = std::max(end_black_list, index_copy); - } - } add_image(desc); } } @@ -381,24 +368,9 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { if constexpr (Spec::enabled_stages[4]) { config_stage(4); } - const std::span indices_span(image_view_indices.data(), image_index); - bool has_listed_stuffs; - do { - has_listed_stuffs = false; - texture_cache.FillGraphicsImageViews(indices_span, image_view_ids); - if constexpr (Spec::has_images) { - if (start_black_list < end_black_list) { - for (u32 index = start_black_list; index < end_black_list; index++) { - if (image_view_blacklist[index]) { - ImageView& image_view{texture_cache.GetImageView(image_view_ids[index])}; - has_listed_stuffs |= texture_cache.BlackListImage(image_view.image_id); - } - } - } - } - } while (has_listed_stuffs); + texture_cache.FillGraphicsImageViews(std::span(views.data(), view_index)); - ImageId* texture_buffer_index{image_view_ids.data()}; + VideoCommon::ImageViewInOut* texture_buffer_it{views.data()}; const auto bind_stage_info{[&](size_t stage) LAMBDA_FORCEINLINE { size_t index{}; const auto add_buffer{[&](const auto& desc) { @@ -408,12 +380,12 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { if constexpr (is_image) { is_written = desc.is_written; } - ImageView& image_view{texture_cache.GetImageView(*texture_buffer_index)}; + ImageView& image_view{texture_cache.GetImageView(texture_buffer_it->id)}; buffer_cache.BindGraphicsTextureBuffer(stage, index, image_view.GpuAddr(), image_view.BufferSize(), image_view.format, is_written, is_image); ++index; - ++texture_buffer_index; + ++texture_buffer_it; } }}; buffer_cache.UnbindGraphicsTextureBuffers(stage); @@ -429,9 +401,9 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { add_buffer(desc); } } - texture_buffer_index += Shader::NumDescriptors(info.texture_descriptors); + texture_buffer_it += Shader::NumDescriptors(info.texture_descriptors); if constexpr (Spec::has_images) { - texture_buffer_index += Shader::NumDescriptors(info.image_descriptors); + texture_buffer_it += Shader::NumDescriptors(info.image_descriptors); } }}; if constexpr (Spec::enabled_stages[0]) { @@ -457,11 +429,11 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { RescalingPushConstant rescaling(num_textures); const VkSampler* samplers_it{samplers.data()}; - const ImageId* views_it{image_view_ids.data()}; + const VideoCommon::ImageViewInOut* views_it{views.data()}; const auto prepare_stage{[&](size_t stage) LAMBDA_FORCEINLINE { buffer_cache.BindHostStageBuffers(stage); - PushImageDescriptors(stage_infos[stage], samplers_it, views_it, texture_cache, - update_descriptor_queue, rescaling); + PushImageDescriptors(texture_cache, update_descriptor_queue, stage_infos[stage], rescaling, + samplers_it, views_it); }}; if constexpr (Spec::enabled_stages[0]) { prepare_stage(0); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index b21992fce..3400066a6 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -730,10 +730,17 @@ void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, con } } // Anonymous namespace -void TextureCacheRuntime::Init() { - resolution = Settings::values.resolution_info; - is_rescaling_on = resolution.up_scale != 1 || resolution.down_shift != 0; -} +TextureCacheRuntime::TextureCacheRuntime(const Device& device_, VKScheduler& scheduler_, + MemoryAllocator& memory_allocator_, + StagingBufferPool& staging_buffer_pool_, + BlitImageHelper& blit_image_helper_, + ASTCDecoderPass& astc_decoder_pass_, + RenderPassCache& render_pass_cache_) + : device{device_}, scheduler{scheduler_}, memory_allocator{memory_allocator_}, + staging_buffer_pool{staging_buffer_pool_}, blit_image_helper{blit_image_helper_}, + astc_decoder_pass{astc_decoder_pass_}, render_pass_cache{render_pass_cache_}, + resolution{Settings::values.resolution_info}, + is_rescaling_on(resolution.up_scale != 1 || resolution.down_shift != 0) {} void TextureCacheRuntime::Finish() { scheduler.Finish(); @@ -1040,6 +1047,8 @@ Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu } } +Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBase{params} {} + Image::~Image() = default; void Image::UploadMemory(const StagingBufferRef& map, std::span copies) { @@ -1187,8 +1196,7 @@ bool Image::ScaleDown(bool save_as_backup) { }*/ const auto& resolution = runtime->resolution; - vk::Image downscaled_image = - MakeImage(runtime->device, info); + vk::Image downscaled_image = MakeImage(runtime->device, info); MemoryCommit new_commit( runtime->memory_allocator.Commit(downscaled_image, MemoryUsage::DeviceLocal)); @@ -1301,7 +1309,7 @@ ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, : VideoCommon::ImageViewBase{info, view_info}, gpu_addr{gpu_addr_}, buffer_size{VideoCommon::CalculateGuestSizeInBytes(info)} {} -ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::NullImageParams& params) +ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::NullImageViewParams& params) : VideoCommon::ImageViewBase{params} {} VkImageView ImageView::DepthView() { diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 958a64651..9c39a6d99 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -34,21 +34,16 @@ class RenderPassCache; class StagingBufferPool; class VKScheduler; -struct TextureCacheRuntime { - const Device& device; - VKScheduler& scheduler; - MemoryAllocator& memory_allocator; - StagingBufferPool& staging_buffer_pool; - BlitImageHelper& blit_image_helper; - ASTCDecoderPass& astc_decoder_pass; - RenderPassCache& render_pass_cache; +class TextureCacheRuntime { +public: static constexpr size_t TICKS_TO_DESTROY = 6; - DelayedDestructionRing prescaled_images; - DelayedDestructionRing prescaled_commits; - Settings::ResolutionScalingInfo resolution; - bool is_rescaling_on{}; - void Init(); + explicit TextureCacheRuntime(const Device& device_, VKScheduler& scheduler_, + MemoryAllocator& memory_allocator_, + StagingBufferPool& staging_buffer_pool_, + BlitImageHelper& blit_image_helper_, + ASTCDecoderPass& astc_decoder_pass_, + RenderPassCache& render_pass_cache_); void Finish(); @@ -56,6 +51,10 @@ struct TextureCacheRuntime { StagingBufferRef DownloadStagingBuffer(size_t size); + void TickFrame(); + + u64 GetDeviceLocalMemory() const; + void BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src, const Region2D& dst_region, const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter, @@ -84,15 +83,25 @@ struct TextureCacheRuntime { return true; } - void TickFrame(); + const Device& device; + VKScheduler& scheduler; + MemoryAllocator& memory_allocator; + StagingBufferPool& staging_buffer_pool; + BlitImageHelper& blit_image_helper; + ASTCDecoderPass& astc_decoder_pass; + RenderPassCache& render_pass_cache; - u64 GetDeviceLocalMemory() const; + DelayedDestructionRing prescaled_images; + DelayedDestructionRing prescaled_commits; + Settings::ResolutionScalingInfo resolution; + bool is_rescaling_on{}; }; class Image : public VideoCommon::ImageBase { public: explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); + explicit Image(const VideoCommon::NullImageParams&); ~Image(); @@ -151,7 +160,7 @@ public: explicit ImageView(TextureCacheRuntime&, const VideoCommon::ImageViewInfo&, ImageId, Image&); explicit ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo&, const VideoCommon::ImageViewInfo&, GPUVAddr); - explicit ImageView(TextureCacheRuntime&, const VideoCommon::NullImageParams&); + explicit ImageView(TextureCacheRuntime&, const VideoCommon::NullImageViewParams&); [[nodiscard]] VkImageView DepthView(); diff --git a/src/video_core/texture_cache/image_base.cpp b/src/video_core/texture_cache/image_base.cpp index e9e725edf..25a211df8 100644 --- a/src/video_core/texture_cache/image_base.cpp +++ b/src/video_core/texture_cache/image_base.cpp @@ -69,6 +69,8 @@ ImageBase::ImageBase(const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_ } } +ImageBase::ImageBase(const NullImageParams&) {} + ImageMapView::ImageMapView(GPUVAddr gpu_addr_, VAddr cpu_addr_, size_t size_, ImageId image_id_) : gpu_addr{gpu_addr_}, cpu_addr{cpu_addr_}, size{size_}, image_id{image_id_} {} diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index 97f107b4d..9c34687e0 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -48,8 +48,11 @@ struct AliasedImage { ImageId id; }; +struct NullImageParams {}; + struct ImageBase { explicit ImageBase(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); + explicit ImageBase(const NullImageParams&); [[nodiscard]] std::optional TryFindBase(GPUVAddr other_addr) const noexcept; diff --git a/src/video_core/texture_cache/image_view_base.cpp b/src/video_core/texture_cache/image_view_base.cpp index 450becbeb..e66dc9320 100644 --- a/src/video_core/texture_cache/image_view_base.cpp +++ b/src/video_core/texture_cache/image_view_base.cpp @@ -45,6 +45,6 @@ ImageViewBase::ImageViewBase(const ImageInfo& info, const ImageViewInfo& view_in ASSERT_MSG(view_info.type == ImageViewType::Buffer, "Expected texture buffer"); } -ImageViewBase::ImageViewBase(const NullImageParams&) {} +ImageViewBase::ImageViewBase(const NullImageViewParams&) : image_id{NULL_IMAGE_ID} {} } // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_view_base.h b/src/video_core/texture_cache/image_view_base.h index 903f715c5..9c24c5359 100644 --- a/src/video_core/texture_cache/image_view_base.h +++ b/src/video_core/texture_cache/image_view_base.h @@ -15,7 +15,7 @@ using VideoCore::Surface::PixelFormat; struct ImageViewInfo; struct ImageInfo; -struct NullImageParams {}; +struct NullImageViewParams {}; enum class ImageViewFlagBits : u16 { PreemtiveDownload = 1 << 0, @@ -28,7 +28,7 @@ struct ImageViewBase { explicit ImageViewBase(const ImageViewInfo& info, const ImageInfo& image_info, ImageId image_id); explicit ImageViewBase(const ImageInfo& info, const ImageViewInfo& view_info); - explicit ImageViewBase(const NullImageParams&); + explicit ImageViewBase(const NullImageViewParams&); [[nodiscard]] bool IsBuffer() const noexcept { return type == ImageViewType::Buffer; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index be40f6b88..4e97a9e6a 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -36,7 +36,6 @@ TextureCache

::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface& Tegra::MemoryManager& gpu_memory_) : runtime{runtime_}, rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_}, gpu_memory{gpu_memory_} { - runtime.Init(); // Configure null sampler TSCEntry sampler_descriptor{}; sampler_descriptor.min_filter.Assign(Tegra::Texture::TextureFilter::Linear); @@ -46,7 +45,8 @@ TextureCache

::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface& // Make sure the first index is reserved for the null resources // This way the null resource becomes a compile time constant - void(slot_image_views.insert(runtime, NullImageParams{})); + void(slot_images.insert(NullImageParams{})); + void(slot_image_views.insert(runtime, NullImageViewParams{})); void(slot_samplers.insert(runtime, sampler_descriptor)); if constexpr (HAS_DEVICE_MEMORY_INFO) { @@ -57,7 +57,7 @@ TextureCache

::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface& critical_memory = std::max(possible_critical_memory, DEFAULT_CRITICAL_MEMORY); minimum_memory = 0; } else { - // on OGL we can be more conservatives as the driver takes care. + // On OpenGL we can be more conservatives as the driver takes care. expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB; critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB; minimum_memory = expected_memory; @@ -135,15 +135,14 @@ void TextureCache

::MarkModification(ImageId id) noexcept { } template -void TextureCache

::FillGraphicsImageViews(std::span indices, - std::span image_view_ids) { - FillImageViews(graphics_image_table, graphics_image_view_ids, indices, image_view_ids); +template +void TextureCache

::FillGraphicsImageViews(std::span views) { + FillImageViews(graphics_image_table, graphics_image_view_ids, views); } template -void TextureCache

::FillComputeImageViews(std::span indices, - std::span image_view_ids) { - FillImageViews(compute_image_table, compute_image_view_ids, indices, image_view_ids); +void TextureCache

::FillComputeImageViews(std::span views) { + FillImageViews(compute_image_table, compute_image_view_ids, views); } template @@ -346,17 +345,26 @@ typename P::Framebuffer* TextureCache

::GetFramebuffer() { } template +template void TextureCache

::FillImageViews(DescriptorTable& table, std::span cached_image_view_ids, - std::span indices, - std::span image_view_ids) { - ASSERT(indices.size() <= image_view_ids.size()); + std::span views) { + bool has_blacklisted; do { has_deleted_images = false; - std::ranges::transform(indices, image_view_ids.begin(), [&](u32 index) { - return VisitImageView(table, cached_image_view_ids, index); - }); - } while (has_deleted_images); + if constexpr (has_blacklists) { + has_blacklisted = false; + } + for (ImageViewInOut& view : views) { + view.id = VisitImageView(table, cached_image_view_ids, view.index); + if constexpr (has_blacklists) { + if (view.blacklist && view.id != NULL_IMAGE_VIEW_ID) { + const ImageViewBase& image_view{slot_image_views[view.id]}; + has_blacklisted |= BlackListImage(image_view.image_id); + } + } + } + } while (has_deleted_images || (has_blacklists && has_blacklisted)); } template @@ -622,7 +630,7 @@ void TextureCache

::PopAsyncFlushes() { } template -bool TextureCache

::IsRescaling() { +bool TextureCache

::IsRescaling() const noexcept { return is_rescaling; } @@ -775,12 +783,11 @@ bool TextureCache

::BlackListImage(ImageId image_id) { } template -bool TextureCache

::ImageCanRescale(Image& image) { +bool TextureCache

::ImageCanRescale(ImageBase& image) { if (True(image.flags & ImageFlagBits::Blacklisted)) { return false; } - if (True(image.flags & ImageFlagBits::Rescaled) || - True(image.flags & ImageFlagBits::RescaleChecked)) { + if (True(image.flags & (ImageFlagBits::Rescaled | ImageFlagBits::RescaleChecked))) { return true; } if (!image.info.rescaleable) { diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 35a29cd9b..b6cc09682 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -39,6 +39,16 @@ using VideoCore::Surface::PixelFormatFromDepthFormat; using VideoCore::Surface::PixelFormatFromRenderTargetFormat; using namespace Common::Literals; +struct ImageViewInOut { + u32 index; + bool blacklist; + union { + struct Empty { + } empty{}; + ImageViewId id; + }; +}; + template class TextureCache { /// Address shift for caching images into a hash table @@ -53,11 +63,6 @@ class TextureCache { /// True when the API can provide info about the memory of the device. static constexpr bool HAS_DEVICE_MEMORY_INFO = P::HAS_DEVICE_MEMORY_INFO; - /// Image view ID for null descriptors - static constexpr ImageViewId NULL_IMAGE_VIEW_ID{0}; - /// Sampler ID for bugged sampler ids - static constexpr SamplerId NULL_SAMPLER_ID{0}; - static constexpr u64 DEFAULT_EXPECTED_MEMORY = 1_GiB; static constexpr u64 DEFAULT_CRITICAL_MEMORY = 2_GiB; @@ -105,11 +110,11 @@ public: void MarkModification(ImageId id) noexcept; /// Fill image_view_ids with the graphics images in indices - void FillGraphicsImageViews(std::span indices, - std::span image_view_ids); + template + void FillGraphicsImageViews(std::span views); /// Fill image_view_ids with the compute images in indices - void FillComputeImageViews(std::span indices, std::span image_view_ids); + void FillComputeImageViews(std::span views); /// Get the sampler from the graphics descriptor table in the specified index Sampler* GetGraphicsSampler(u32 index); @@ -174,7 +179,7 @@ public: /// Return true when a CPU region is modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); - [[nodiscard]] bool IsRescaling(); + [[nodiscard]] bool IsRescaling() const noexcept; [[nodiscard]] bool BlackListImage(ImageId image_id); @@ -216,9 +221,10 @@ private: void RunGarbageCollector(); /// Fills image_view_ids in the image views in indices + template void FillImageViews(DescriptorTable& table, - std::span cached_image_view_ids, std::span indices, - std::span image_view_ids); + std::span cached_image_view_ids, + std::span views); /// Find or create an image view in the guest descriptor table ImageViewId VisitImageView(DescriptorTable& table, @@ -336,7 +342,7 @@ private: /// Returns true if the current clear parameters clear the whole image of a given image view [[nodiscard]] bool IsFullClear(ImageViewId id); - bool ImageCanRescale(Image& image); + bool ImageCanRescale(ImageBase& image); void InvalidateScale(Image& image); bool ScaleUp(Image& image); bool ScaleDown(Image& image); diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h index 47a11cb2f..5c274abdf 100644 --- a/src/video_core/texture_cache/types.h +++ b/src/video_core/texture_cache/types.h @@ -22,6 +22,13 @@ using ImageAllocId = SlotId; using SamplerId = SlotId; using FramebufferId = SlotId; +/// Fake image ID for null image views +constexpr ImageId NULL_IMAGE_ID{0}; +/// Image view ID for null descriptors +constexpr ImageViewId NULL_IMAGE_VIEW_ID{0}; +/// Sampler ID for bugged sampler ids +constexpr SamplerId NULL_SAMPLER_ID{0}; + enum class ImageType : u32 { e1D, e2D, From 2182d2575010a5a85c99c09c6a1c57962242444d Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Wed, 28 Jul 2021 02:53:24 -0300 Subject: [PATCH 041/156] texture_cache: Fix blacklists on compute --- src/video_core/texture_cache/texture_cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 4e97a9e6a..4dbded635 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -142,7 +142,7 @@ void TextureCache

::FillGraphicsImageViews(std::span views) { template void TextureCache

::FillComputeImageViews(std::span views) { - FillImageViews(compute_image_table, compute_image_view_ids, views); + FillImageViews(compute_image_table, compute_image_view_ids, views); } template From f086c82e1f80cae088bb22de9092598dc51979da Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Thu, 29 Jul 2021 13:27:01 -0400 Subject: [PATCH 042/156] gl_graphics_pipeline: Add downscale factor to shader uniforms --- .../backend/glasm/emit_glasm_not_implemented.cpp | 3 +-- .../backend/glsl/emit_context.cpp | 3 +++ .../backend/glsl/emit_glsl_context_get_set.cpp | 3 +-- .../renderer_opengl/gl_graphics_pipeline.cpp | 15 ++++++++++++++- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp b/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp index 807494063..77ee6dc0e 100644 --- a/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp +++ b/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp @@ -211,8 +211,7 @@ void EmitYDirection(EmitContext& ctx, IR::Inst& inst) { } void EmitResolutionDownFactor(EmitContext& ctx, IR::Inst& inst) { - UNIMPLEMENTED(); - ctx.Add("MOV.F {}.x,1;", inst); + ctx.Add("MOV.F {}.x,program.env[0].x;", inst); } void EmitUndefU1(EmitContext& ctx, IR::Inst& inst) { diff --git a/src/shader_recompiler/backend/glsl/emit_context.cpp b/src/shader_recompiler/backend/glsl/emit_context.cpp index 4e6f2c0fe..7c9ed9159 100644 --- a/src/shader_recompiler/backend/glsl/emit_context.cpp +++ b/src/shader_recompiler/backend/glsl/emit_context.cpp @@ -393,6 +393,9 @@ EmitContext::EmitContext(IR::Program& program, Bindings& bindings, const Profile DefineGenericOutput(index, program.invocations); } } + if (info.uses_rescaling_uniform) { + header += "layout(location=0) uniform float down_factor;"; + } DefineConstantBuffers(bindings); DefineStorageBuffers(bindings); SetupImages(bindings); diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp index f4ed090e3..3db3083f9 100644 --- a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp +++ b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp @@ -446,8 +446,7 @@ void EmitYDirection(EmitContext& ctx, IR::Inst& inst) { } void EmitResolutionDownFactor(EmitContext& ctx, IR::Inst& inst) { - UNIMPLEMENTED(); - ctx.AddF32("{}=1.0f;", inst); + ctx.AddF32("{}=down_factor;", inst); } void EmitLoadLocal(EmitContext& ctx, IR::Inst& inst, std::string_view word_offset) { diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index 0bbda7951..92fda9af0 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp @@ -443,11 +443,24 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { if (!is_built.load(std::memory_order::relaxed)) { WaitForBuild(); } - if (assembly_programs[0].handle != 0) { + const bool use_assembly{assembly_programs[0].handle != 0}; + const bool is_rescaling{texture_cache.IsRescaling()}; + const f32 config_down_factor{Settings::values.resolution_info.down_factor}; + const f32 down_factor{is_rescaling ? config_down_factor : 1.0f}; + if (use_assembly) { program_manager.BindAssemblyPrograms(assembly_programs, enabled_stages_mask); } else { program_manager.BindSourcePrograms(source_programs); } + for (size_t stage = 0; stage < source_programs.size(); ++stage) { + if (stage_infos[stage].uses_rescaling_uniform) { + if (use_assembly) { + glProgramEnvParameter4fARB(AssemblyStage(stage), 0, down_factor, 0.0f, 0.0f, 1.0f); + } else { + glProgramUniform1f(source_programs[stage].handle, 0, down_factor); + } + } + } const VideoCommon::ImageViewInOut* views_it{views.data()}; GLsizei texture_binding = 0; GLsizei image_binding = 0; From 9bc7b04ca587a349a9fc865d05e30966d6a84d65 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Thu, 29 Jul 2021 14:19:51 -0400 Subject: [PATCH 043/156] gl_rasterizer: Fix rescale dirty state checking --- src/video_core/renderer_opengl/gl_rasterizer.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index b91e7edf8..615704711 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -533,7 +533,8 @@ void RasterizerOpenGL::SyncViewport() { auto& flags = maxwell3d.dirty.flags; const auto& regs = maxwell3d.regs; - const bool dirty_viewport = flags[Dirty::Viewports]; + const bool rescale_viewports = flags[VideoCommon::Dirty::RescaleViewports]; + const bool dirty_viewport = flags[Dirty::Viewports] || rescale_viewports; const bool dirty_clip_control = flags[Dirty::ClipControl]; if (dirty_clip_control || flags[Dirty::FrontFace]) { @@ -574,8 +575,9 @@ void RasterizerOpenGL::SyncViewport() { if (dirty_viewport) { flags[Dirty::Viewports] = false; - const bool force = flags[Dirty::ViewportTransform]; + const bool force = flags[Dirty::ViewportTransform] || rescale_viewports; flags[Dirty::ViewportTransform] = false; + flags[VideoCommon::Dirty::RescaleViewports] = false; const auto& resolution = Settings::values.resolution_info; const auto scale_up = [&](u32 value) -> u32 { @@ -911,11 +913,14 @@ void RasterizerOpenGL::SyncLogicOpState() { void RasterizerOpenGL::SyncScissorTest() { auto& flags = maxwell3d.dirty.flags; - if (!flags[Dirty::Scissors]) { + if (!flags[Dirty::Scissors] && !flags[VideoCommon::Dirty::RescaleScissors]) { return; } flags[Dirty::Scissors] = false; + const bool force = flags[VideoCommon::Dirty::RescaleScissors]; + flags[VideoCommon::Dirty::RescaleScissors] = false; + const auto& regs = maxwell3d.regs; const auto& resolution = Settings::values.resolution_info; @@ -927,7 +932,7 @@ void RasterizerOpenGL::SyncScissorTest() { return std::max(converted_value, 1U); }; for (std::size_t index = 0; index < Maxwell::NumViewports; ++index) { - if (!flags[Dirty::Scissor0 + index]) { + if (!force && !flags[Dirty::Scissor0 + index]) { continue; } flags[Dirty::Scissor0 + index] = false; From b6060873ce1eea02f99a350f955362e57391ecd1 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Thu, 29 Jul 2021 15:45:53 -0400 Subject: [PATCH 044/156] gl_compute_pipeline: Add downscale factor to shader uniforms --- src/video_core/renderer_opengl/gl_compute_pipeline.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp index ab2baefbb..a11bd5a02 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp @@ -143,10 +143,19 @@ void ComputePipeline::Configure() { } texture_cache.FillComputeImageViews(std::span(views.data(), views.size())); + const bool is_rescaling{texture_cache.IsRescaling()}; + const f32 config_down_factor{Settings::values.resolution_info.down_factor}; + const f32 down_factor{is_rescaling ? config_down_factor : 1.0f}; if (assembly_program.handle != 0) { program_manager.BindComputeAssemblyProgram(assembly_program.handle); + if (info.uses_rescaling_uniform) { + glProgramEnvParameter4fARB(GL_COMPUTE_PROGRAM_NV, 0, down_factor, 0.0f, 0.0f, 1.0f); + } } else { program_manager.BindComputeProgram(source_program.handle); + if (info.uses_rescaling_uniform) { + glProgramUniform1f(source_program.handle, 0, down_factor); + } } buffer_cache.UnbindComputeTextureBuffers(); size_t texbuf_index{}; From 05d98d9bbffde2f43ff9558a8b1676dfca0bd0f3 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Thu, 29 Jul 2021 18:08:06 -0400 Subject: [PATCH 045/156] gl_texture_cache: Fix multi layered texture Scale --- .../renderer_opengl/gl_texture_cache.cpp | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 5d14bfc97..8b86136e0 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -939,17 +939,21 @@ bool Image::Scale(bool scale_src, bool scale_dst) { const auto& draw_fbo = runtime->rescale_draw_fbo; glBindFramebuffer(GL_DRAW_FRAMEBUFFER, draw_fbo.handle); glBindFramebuffer(GL_READ_FRAMEBUFFER, read_fbo.handle); - for (s32 level = 0; level < info.resources.levels; ++level) { - const u32 src_level_width = std::max(1u, src_width >> level); - const u32 src_level_height = std::max(1u, src_height >> level); - const u32 dst_level_width = std::max(1u, dst_width >> level); - const u32 dst_level_height = std::max(1u, dst_height >> level); + for (s32 layer = 0; layer < info.resources.layers; ++layer) { + for (s32 level = 0; level < info.resources.levels; ++level) { + const u32 src_level_width = std::max(1u, src_width >> level); + const u32 src_level_height = std::max(1u, src_height >> level); + const u32 dst_level_width = std::max(1u, dst_width >> level); + const u32 dst_level_height = std::max(1u, dst_height >> level); - glNamedFramebufferTexture(read_fbo.handle, attachment, texture.handle, level); - glNamedFramebufferTexture(draw_fbo.handle, attachment, dst_texture.handle, level); - glBlitNamedFramebuffer(read_fbo.handle, draw_fbo.handle, 0, 0, src_level_width, - src_level_height, 0, 0, dst_level_width, dst_level_height, mask, - filter); + glNamedFramebufferTextureLayer(read_fbo.handle, attachment, texture.handle, level, + layer); + glNamedFramebufferTextureLayer(draw_fbo.handle, attachment, dst_texture.handle, level, + layer); + glBlitNamedFramebuffer(read_fbo.handle, draw_fbo.handle, 0, 0, src_level_width, + src_level_height, 0, 0, dst_level_width, dst_level_height, mask, + filter); + } } texture = std::move(dst_texture); From 4a512d6827609b13cf991d8e8efb0c789936167f Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sat, 31 Jul 2021 02:37:06 -0300 Subject: [PATCH 046/156] gl_rasterizer: Properly scale viewports and scissors --- .../renderer_opengl/gl_rasterizer.cpp | 45 ++++++++++--------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 615704711..d94f1e89f 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -214,8 +214,6 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { query_cache.UpdateCounters(); - SyncState(); - GraphicsPipeline* const pipeline{shader_cache.CurrentGraphicsPipeline()}; if (!pipeline) { return; @@ -223,6 +221,8 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; pipeline->Configure(is_indexed); + SyncState(); + const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(maxwell3d.regs.draw.topology); BeginTransformFeedback(pipeline, primitive_mode); @@ -554,7 +554,6 @@ void RasterizerOpenGL::SyncViewport() { } glFrontFace(mode); } - if (dirty_viewport || flags[Dirty::ClipControl]) { flags[Dirty::ClipControl] = false; @@ -571,6 +570,8 @@ void RasterizerOpenGL::SyncViewport() { state_tracker.ClipControl(origin, depth); state_tracker.SetYNegate(regs.screen_y_control.y_negate != 0); } + const bool is_rescaling{texture_cache.IsRescaling()}; + const float scale = is_rescaling ? Settings::values.resolution_info.up_factor : 1.0f; if (dirty_viewport) { flags[Dirty::Viewports] = false; @@ -579,38 +580,38 @@ void RasterizerOpenGL::SyncViewport() { flags[Dirty::ViewportTransform] = false; flags[VideoCommon::Dirty::RescaleViewports] = false; - const auto& resolution = Settings::values.resolution_info; - const auto scale_up = [&](u32 value) -> u32 { - if (value == 0) { - return 0U; - } - const u32 converted_value = (value * resolution.up_scale) >> resolution.down_shift; - return std::max(converted_value, 1U); - }; - for (std::size_t i = 0; i < Maxwell::NumViewports; ++i) { - if (!force && !flags[Dirty::Viewport0 + i]) { + for (size_t index = 0; index < Maxwell::NumViewports; ++index) { + if (!force && !flags[Dirty::Viewport0 + index]) { continue; } - flags[Dirty::Viewport0 + i] = false; + flags[Dirty::Viewport0 + index] = false; - const auto& src = regs.viewport_transform[i]; - const Common::Rectangle rect{src.GetRect()}; - glViewportIndexedf(static_cast(i), rect.left, rect.bottom, - scale_up(rect.GetWidth()), scale_up(rect.GetHeight())); + const auto& src = regs.viewport_transform[index]; + GLfloat x = (src.translate_x - src.scale_x) * scale; + GLfloat y = (src.translate_y - src.scale_y) * scale; + GLfloat width = src.scale_x * 2.0f * scale; + GLfloat height = src.scale_y * 2.0f * scale; + if (height < 0) { + y += height; + height = -height; + } + glViewportIndexedf(static_cast(index), x, y, width != 0.0f ? width : 1.0f, + height != 0.0f ? height : 1.0f); const GLdouble reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne; const GLdouble near_depth = src.translate_z - src.scale_z * reduce_z; const GLdouble far_depth = src.translate_z + src.scale_z; if (device.HasDepthBufferFloat()) { - glDepthRangeIndexeddNV(static_cast(i), near_depth, far_depth); + glDepthRangeIndexeddNV(static_cast(index), near_depth, far_depth); } else { - glDepthRangeIndexed(static_cast(i), near_depth, far_depth); + glDepthRangeIndexed(static_cast(index), near_depth, far_depth); } if (!GLAD_GL_NV_viewport_swizzle) { continue; } - glViewportSwizzleNV(static_cast(i), MaxwellToGL::ViewportSwizzle(src.swizzle.x), + glViewportSwizzleNV(static_cast(index), + MaxwellToGL::ViewportSwizzle(src.swizzle.x), MaxwellToGL::ViewportSwizzle(src.swizzle.y), MaxwellToGL::ViewportSwizzle(src.swizzle.z), MaxwellToGL::ViewportSwizzle(src.swizzle.w)); @@ -940,7 +941,7 @@ void RasterizerOpenGL::SyncScissorTest() { const auto& src = regs.scissor_test[index]; if (src.enable) { glEnablei(GL_SCISSOR_TEST, static_cast(index)); - glScissorIndexed(static_cast(index), src.min_x, src.min_y, + glScissorIndexed(static_cast(index), scale_up(src.min_x), scale_up(src.min_y), scale_up(src.max_x - src.min_x), scale_up(src.max_y - src.min_y)); } else { glDisablei(GL_SCISSOR_TEST, static_cast(index)); From cfeb161c7ebf93bf6ac39e430fc998dc13abfc66 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sat, 31 Jul 2021 03:04:08 -0300 Subject: [PATCH 047/156] glsl/glasm: Pass and use scaling parameters in shaders --- .../backend/glasm/emit_glasm.cpp | 3 ++ .../backend/glasm/emit_glasm_image.cpp | 5 +-- .../glasm/emit_glasm_not_implemented.cpp | 2 +- .../backend/glsl/emit_context.cpp | 2 +- .../glsl/emit_glsl_context_get_set.cpp | 2 +- .../backend/glsl/emit_glsl_image.cpp | 4 +-- .../renderer_opengl/gl_compute_pipeline.cpp | 23 ++++++++---- .../renderer_opengl/gl_graphics_pipeline.cpp | 36 ++++++++++++------- .../renderer_opengl/gl_texture_cache.h | 2 +- 9 files changed, 51 insertions(+), 28 deletions(-) diff --git a/src/shader_recompiler/backend/glasm/emit_glasm.cpp b/src/shader_recompiler/backend/glasm/emit_glasm.cpp index 4ce1c4f54..004658546 100644 --- a/src/shader_recompiler/backend/glasm/emit_glasm.cpp +++ b/src/shader_recompiler/backend/glasm/emit_glasm.cpp @@ -448,6 +448,9 @@ std::string EmitGLASM(const Profile& profile, const RuntimeInfo& runtime_info, I header += fmt::format("SHARED_MEMORY {};", program.shared_memory_size); header += fmt::format("SHARED shared_mem[]={{program.sharedmem}};"); } + if (program.info.uses_rescaling_uniform) { + header += "PARAM scaling[1]={program.local[0..0]};"; + } header += "TEMP "; for (size_t index = 0; index < ctx.reg_alloc.NumUsedRegisters(); ++index) { header += fmt::format("R{},", index); diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_image.cpp b/src/shader_recompiler/backend/glasm/emit_glasm_image.cpp index 583ed3cf2..05e88cd97 100644 --- a/src/shader_recompiler/backend/glasm/emit_glasm_image.cpp +++ b/src/shader_recompiler/backend/glasm/emit_glasm_image.cpp @@ -612,8 +612,9 @@ void EmitIsTextureScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& inde if (!index.IsImmediate()) { throw NotImplementedException("Non-constant texture rescaling"); } - UNIMPLEMENTED(); - ctx.Add("MOV.S {}.x,-1;", inst); + ctx.Add("AND.U RC.x,scaling[0].x,{};" + "SNE.S {},RC.x,0;", + 1u << index.U32(), ctx.reg_alloc.Define(inst)); } void EmitImageAtomicIAdd32(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, Register coord, diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp b/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp index 77ee6dc0e..c0f8ddcad 100644 --- a/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp +++ b/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp @@ -211,7 +211,7 @@ void EmitYDirection(EmitContext& ctx, IR::Inst& inst) { } void EmitResolutionDownFactor(EmitContext& ctx, IR::Inst& inst) { - ctx.Add("MOV.F {}.x,program.env[0].x;", inst); + ctx.Add("MOV.F {}.x,scaling[0].y;", inst); } void EmitUndefU1(EmitContext& ctx, IR::Inst& inst) { diff --git a/src/shader_recompiler/backend/glsl/emit_context.cpp b/src/shader_recompiler/backend/glsl/emit_context.cpp index 7c9ed9159..97bd59302 100644 --- a/src/shader_recompiler/backend/glsl/emit_context.cpp +++ b/src/shader_recompiler/backend/glsl/emit_context.cpp @@ -394,7 +394,7 @@ EmitContext::EmitContext(IR::Program& program, Bindings& bindings, const Profile } } if (info.uses_rescaling_uniform) { - header += "layout(location=0) uniform float down_factor;"; + header += "layout(location=0) uniform vec4 scaling;"; } DefineConstantBuffers(bindings); DefineStorageBuffers(bindings); diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp index 3db3083f9..542a79230 100644 --- a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp +++ b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp @@ -446,7 +446,7 @@ void EmitYDirection(EmitContext& ctx, IR::Inst& inst) { } void EmitResolutionDownFactor(EmitContext& ctx, IR::Inst& inst) { - ctx.AddF32("{}=down_factor;", inst); + ctx.AddF32("{}=scaling.y;", inst); } void EmitLoadLocal(EmitContext& ctx, IR::Inst& inst, std::string_view word_offset) { diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp index 099e0160b..82b6f0d77 100644 --- a/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp +++ b/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp @@ -616,8 +616,8 @@ void EmitIsTextureScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& inde if (!index.IsImmediate()) { throw NotImplementedException("Non-constant texture rescaling"); } - UNIMPLEMENTED(); - ctx.AddU1("{}=true;", inst); + const u32 image_index{index.U32()}; + ctx.AddU1("{}=(ftou(scaling.x)&{})!=0;", inst, 1u << image_index); } void EmitBindlessImageSampleImplicitLod(EmitContext&) { diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp index a11bd5a02..12093c3c4 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp @@ -148,14 +148,8 @@ void ComputePipeline::Configure() { const f32 down_factor{is_rescaling ? config_down_factor : 1.0f}; if (assembly_program.handle != 0) { program_manager.BindComputeAssemblyProgram(assembly_program.handle); - if (info.uses_rescaling_uniform) { - glProgramEnvParameter4fARB(GL_COMPUTE_PROGRAM_NV, 0, down_factor, 0.0f, 0.0f, 1.0f); - } } else { program_manager.BindComputeProgram(source_program.handle); - if (info.uses_rescaling_uniform) { - glProgramUniform1f(source_program.handle, 0, down_factor); - } } buffer_cache.UnbindComputeTextureBuffers(); size_t texbuf_index{}; @@ -187,10 +181,16 @@ void ComputePipeline::Configure() { texture_binding += num_texture_buffers; image_binding += num_image_buffers; + u32 scaling_mask{}; for (const auto& desc : info.texture_descriptors) { for (u32 index = 0; index < desc.count; ++index) { ImageView& image_view{texture_cache.GetImageView((views_it++)->id)}; - textures[texture_binding++] = image_view.Handle(desc.type); + textures[texture_binding] = image_view.Handle(desc.type); + if (True(texture_cache.GetImage(image_view.image_id).flags & + VideoCommon::ImageFlagBits::Rescaled)) { + scaling_mask |= 1u << texture_binding; + } + ++texture_binding; } } for (const auto& desc : info.image_descriptors) { @@ -202,6 +202,15 @@ void ComputePipeline::Configure() { images[image_binding++] = image_view.StorageView(desc.type, desc.format); } } + if (info.uses_rescaling_uniform) { + const f32 float_scaling_mask{Common::BitCast(scaling_mask)}; + if (assembly_program.handle != 0) { + glProgramLocalParameter4fARB(GL_COMPUTE_PROGRAM_NV, 0, float_scaling_mask, 0.0f, 0.0f, + 0.0f); + } else { + glProgramUniform4f(source_program.handle, 0, float_scaling_mask, 0.0f, 0.0f, 0.0f); + } + } if (texture_binding != 0) { ASSERT(texture_binding == sampler_binding); glBindTextures(0, texture_binding, textures.data()); diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index 92fda9af0..01aa2897a 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp @@ -444,23 +444,11 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { WaitForBuild(); } const bool use_assembly{assembly_programs[0].handle != 0}; - const bool is_rescaling{texture_cache.IsRescaling()}; - const f32 config_down_factor{Settings::values.resolution_info.down_factor}; - const f32 down_factor{is_rescaling ? config_down_factor : 1.0f}; if (use_assembly) { program_manager.BindAssemblyPrograms(assembly_programs, enabled_stages_mask); } else { program_manager.BindSourcePrograms(source_programs); } - for (size_t stage = 0; stage < source_programs.size(); ++stage) { - if (stage_infos[stage].uses_rescaling_uniform) { - if (use_assembly) { - glProgramEnvParameter4fARB(AssemblyStage(stage), 0, down_factor, 0.0f, 0.0f, 1.0f); - } else { - glProgramUniform1f(source_programs[stage].handle, 0, down_factor); - } - } - } const VideoCommon::ImageViewInOut* views_it{views.data()}; GLsizei texture_binding = 0; GLsizei image_binding = 0; @@ -476,11 +464,20 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { views_it += num_texture_buffers[stage]; views_it += num_image_buffers[stage]; + u32 scaling_mask{}; + u32 stage_texture_binding{}; + const auto& info{stage_infos[stage]}; for (const auto& desc : info.texture_descriptors) { for (u32 index = 0; index < desc.count; ++index) { ImageView& image_view{texture_cache.GetImageView((views_it++)->id)}; - textures[texture_binding++] = image_view.Handle(desc.type); + textures[texture_binding] = image_view.Handle(desc.type); + if (True(texture_cache.GetImage(image_view.image_id).flags & + VideoCommon::ImageFlagBits::Rescaled)) { + scaling_mask |= 1u << stage_texture_binding; + } + ++texture_binding; + ++stage_texture_binding; } } for (const auto& desc : info.image_descriptors) { @@ -492,6 +489,19 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { images[image_binding++] = image_view.StorageView(desc.type, desc.format); } } + if (info.uses_rescaling_uniform) { + const f32 float_scaling_mask{Common::BitCast(scaling_mask)}; + const bool is_rescaling{texture_cache.IsRescaling()}; + const f32 config_down_factor{Settings::values.resolution_info.down_factor}; + const f32 down_factor{is_rescaling ? config_down_factor : 1.0f}; + if (use_assembly) { + glProgramLocalParameter4fARB(AssemblyStage(stage), 0, float_scaling_mask, + down_factor, 0.0f, 0.0f); + } else { + glProgramUniform4f(source_programs[stage].handle, 0, float_scaling_mask, + down_factor, 0.0f, 0.0f); + } + } }}; if constexpr (Spec::enabled_stages[0]) { prepare_stage(0); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index e76ec522a..28c91b368 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -210,7 +210,7 @@ private: GLenum gl_internal_format = GL_NONE; GLenum gl_format = GL_NONE; GLenum gl_type = GL_NONE; - TextureCacheRuntime* runtime; + TextureCacheRuntime* runtime{}; }; class ImageView : public VideoCommon::ImageViewBase { From c9238555f7bb809e322adf7c70676d008e1413ff Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sat, 31 Jul 2021 03:04:33 -0300 Subject: [PATCH 048/156] gl_texture_cache: Fix scaling blits --- .../renderer_opengl/gl_texture_cache.cpp | 32 +++++++------------ 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 8b86136e0..6aea375f1 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -478,6 +478,10 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& if (is_rescaling_on) { rescale_draw_fbo.Create(); rescale_read_fbo.Create(); + + // Make sure the framebuffer is created without DSA + glBindFramebuffer(GL_READ_FRAMEBUFFER, rescale_draw_fbo.handle); + glBindFramebuffer(GL_READ_FRAMEBUFFER, rescale_read_fbo.handle); } } @@ -882,11 +886,6 @@ bool Image::Scale(bool scale_src, bool scale_dst) { UNIMPLEMENTED(); return false; } - GLint prev_draw_fbo; - GLint prev_read_fbo; - glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &prev_draw_fbo); - glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &prev_read_fbo); - const GLenum attachment = [this] { switch (GetFormatType(info.format)) { case SurfaceType::ColorTexture: @@ -935,10 +934,8 @@ bool Image::Scale(bool scale_src, bool scale_dst) { dst_info.size.height = dst_height; auto dst_texture = MakeImage(dst_info, gl_internal_format); - const auto& read_fbo = runtime->rescale_read_fbo; - const auto& draw_fbo = runtime->rescale_draw_fbo; - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, draw_fbo.handle); - glBindFramebuffer(GL_READ_FRAMEBUFFER, read_fbo.handle); + const GLuint read_fbo = runtime->rescale_read_fbo.handle; + const GLuint draw_fbo = runtime->rescale_draw_fbo.handle; for (s32 layer = 0; layer < info.resources.layers; ++layer) { for (s32 level = 0; level < info.resources.levels; ++level) { const u32 src_level_width = std::max(1u, src_width >> level); @@ -946,20 +943,15 @@ bool Image::Scale(bool scale_src, bool scale_dst) { const u32 dst_level_width = std::max(1u, dst_width >> level); const u32 dst_level_height = std::max(1u, dst_height >> level); - glNamedFramebufferTextureLayer(read_fbo.handle, attachment, texture.handle, level, - layer); - glNamedFramebufferTextureLayer(draw_fbo.handle, attachment, dst_texture.handle, level, - layer); - glBlitNamedFramebuffer(read_fbo.handle, draw_fbo.handle, 0, 0, src_level_width, - src_level_height, 0, 0, dst_level_width, dst_level_height, mask, - filter); + glNamedFramebufferTextureLayer(read_fbo, attachment, texture.handle, level, layer); + glNamedFramebufferTextureLayer(draw_fbo, attachment, dst_texture.handle, level, layer); + glBlitNamedFramebuffer(read_fbo, draw_fbo, 0, 0, src_level_width, src_level_height, 0, + 0, dst_level_width, dst_level_height, mask, filter); + glNamedFramebufferTextureLayer(read_fbo, attachment, 0, level, layer); + glNamedFramebufferTextureLayer(draw_fbo, attachment, 0, level, layer); } } texture = std::move(dst_texture); - - // Restore previous framebuffers - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, prev_draw_fbo); - glBindFramebuffer(GL_READ_FRAMEBUFFER, prev_read_fbo); return true; } From 526e47f1486c361e10fc930eff1df4f13d178816 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sat, 31 Jul 2021 03:04:44 -0300 Subject: [PATCH 049/156] vk_rasterizer: Minor style change --- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 7a7374b78..20bb05e7d 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -604,8 +604,8 @@ void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& reg if (!state_tracker.TouchViewports()) { return; } - const float scale = - texture_cache.IsRescaling() ? Settings::values.resolution_info.up_factor : 1.0f; + const bool is_rescaling{texture_cache.IsRescaling()}; + const float scale = is_rescaling ? Settings::values.resolution_info.up_factor : 1.0f; const std::array viewports{ GetViewportState(device, regs, 0, scale), GetViewportState(device, regs, 1, scale), GetViewportState(device, regs, 2, scale), GetViewportState(device, regs, 3, scale), From c7a1cbad44487b2c5f9da31ce6d3c76b7dec4f05 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sat, 31 Jul 2021 17:42:37 -0300 Subject: [PATCH 050/156] texture_cache: Add getter to query if image view is rescaled --- .../renderer_opengl/gl_compute_pipeline.cpp | 3 +-- .../renderer_opengl/gl_graphics_pipeline.cpp | 3 +-- src/video_core/renderer_vulkan/pipeline_helper.h | 3 +-- src/video_core/texture_cache/texture_cache.h | 16 ++++++---------- .../texture_cache/texture_cache_base.h | 9 +++------ 5 files changed, 12 insertions(+), 22 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp index 12093c3c4..02853b078 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp @@ -186,8 +186,7 @@ void ComputePipeline::Configure() { for (u32 index = 0; index < desc.count; ++index) { ImageView& image_view{texture_cache.GetImageView((views_it++)->id)}; textures[texture_binding] = image_view.Handle(desc.type); - if (True(texture_cache.GetImage(image_view.image_id).flags & - VideoCommon::ImageFlagBits::Rescaled)) { + if (texture_cache.IsRescaling(image_view)) { scaling_mask |= 1u << texture_binding; } ++texture_binding; diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index 01aa2897a..c3d549a6e 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp @@ -472,8 +472,7 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { for (u32 index = 0; index < desc.count; ++index) { ImageView& image_view{texture_cache.GetImageView((views_it++)->id)}; textures[texture_binding] = image_view.Handle(desc.type); - if (True(texture_cache.GetImage(image_view.image_id).flags & - VideoCommon::ImageFlagBits::Rescaled)) { + if (texture_cache.IsRescaling(image_view)) { scaling_mask |= 1u << stage_texture_binding; } ++texture_binding; diff --git a/src/video_core/renderer_vulkan/pipeline_helper.h b/src/video_core/renderer_vulkan/pipeline_helper.h index bf18b34d1..bce4220c6 100644 --- a/src/video_core/renderer_vulkan/pipeline_helper.h +++ b/src/video_core/renderer_vulkan/pipeline_helper.h @@ -168,10 +168,9 @@ inline void PushImageDescriptors(TextureCache& texture_cache, const VideoCommon::ImageViewId image_view_id{(views++)->id}; const VkSampler sampler{*(samplers++)}; ImageView& image_view{texture_cache.GetImageView(image_view_id)}; - const Image& image{texture_cache.GetImage(image_view.image_id)}; const VkImageView vk_image_view{image_view.Handle(desc.type)}; update_descriptor_queue.AddSampledImage(vk_image_view, sampler); - rescaling.PushTexture(True(image.flags & VideoCommon::ImageFlagBits::Rescaled)); + rescaling.PushTexture(texture_cache.IsRescaling(image_view)); } } for (const auto& desc : info.image_descriptors) { diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 4dbded635..0e70c4db2 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -119,16 +119,6 @@ typename P::ImageView& TextureCache

::GetImageView(ImageViewId id) noexcept { return slot_image_views[id]; } -template -const typename P::Image& TextureCache

::GetImage(ImageId id) const noexcept { - return slot_images[id]; -} - -template -typename P::Image& TextureCache

::GetImage(ImageId id) noexcept { - return slot_images[id]; -} - template void TextureCache

::MarkModification(ImageId id) noexcept { MarkModification(slot_images[id]); @@ -634,6 +624,12 @@ bool TextureCache

::IsRescaling() const noexcept { return is_rescaling; } +template +bool TextureCache

::IsRescaling(const ImageViewBase& image_view) const noexcept { + const ImageBase& image = slot_images[image_view.image_id]; + return True(image.flags & ImageFlagBits::Rescaled); +} + template bool TextureCache

::IsRegionGpuModified(VAddr addr, size_t size) { bool is_modified = false; diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index b6cc09682..8b417b611 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -21,6 +21,7 @@ #include "video_core/texture_cache/descriptor_table.h" #include "video_core/texture_cache/image_base.h" #include "video_core/texture_cache/image_info.h" +#include "video_core/texture_cache/image_view_base.h" #include "video_core/texture_cache/image_view_info.h" #include "video_core/texture_cache/render_targets.h" #include "video_core/texture_cache/slot_vector.h" @@ -100,12 +101,6 @@ public: /// Return a reference to the given image view id [[nodiscard]] ImageView& GetImageView(ImageViewId id) noexcept; - /// Return a constant reference to the given image id - [[nodiscard]] const Image& GetImage(ImageId id) const noexcept; - - /// Return a reference to the given image id - [[nodiscard]] Image& GetImage(ImageId id) noexcept; - /// Mark an image as modified from the GPU void MarkModification(ImageId id) noexcept; @@ -181,6 +176,8 @@ public: [[nodiscard]] bool IsRescaling() const noexcept; + [[nodiscard]] bool IsRescaling(const ImageViewBase& image_view) const noexcept; + [[nodiscard]] bool BlackListImage(ImageId image_id); std::mutex mutex; From fc9bb3c3fed4721b06bda46deea3770e5285b104 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sun, 1 Aug 2021 02:26:02 -0300 Subject: [PATCH 051/156] shader: Properly blacklist and scale image loads --- .../ir_opt/rescaling_pass.cpp | 22 ++++++++++++++++--- .../renderer_opengl/gl_compute_pipeline.cpp | 2 +- .../renderer_opengl/gl_graphics_pipeline.cpp | 2 +- .../renderer_vulkan/vk_compute_pipeline.cpp | 2 +- .../renderer_vulkan/vk_graphics_pipeline.cpp | 14 +++++++----- 5 files changed, 31 insertions(+), 11 deletions(-) diff --git a/src/shader_recompiler/ir_opt/rescaling_pass.cpp b/src/shader_recompiler/ir_opt/rescaling_pass.cpp index d5b98ae6e..86c8f0c69 100644 --- a/src/shader_recompiler/ir_opt/rescaling_pass.cpp +++ b/src/shader_recompiler/ir_opt/rescaling_pass.cpp @@ -84,10 +84,8 @@ void PatchImageQueryDimensions(IR::Block& block, IR::Inst& inst) { } } -void PatchImageFetch(IR::Block& block, IR::Inst& inst) { - IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; +void ScaleIntegerCoord(IR::IREmitter& ir, IR::Inst& inst, const IR::U1& is_scaled) { const auto info{inst.Flags()}; - const IR::U1 is_scaled{ir.IsTextureScaled(ir.Imm32(info.descriptor_index))}; const IR::Value coord{inst.Arg(1)}; switch (info.type) { case TextureType::Color1D: @@ -121,6 +119,21 @@ void PatchImageFetch(IR::Block& block, IR::Inst& inst) { } } +void PatchImageFetch(IR::Block& block, IR::Inst& inst) { + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + const auto info{inst.Flags()}; + const IR::U1 is_scaled{ir.IsTextureScaled(ir.Imm32(info.descriptor_index))}; + ScaleIntegerCoord(ir, inst, is_scaled); +} + +void PatchImageRead(IR::Block& block, IR::Inst& inst) { + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + const auto info{inst.Flags()}; + // TODO: Scale conditionally + const IR::U1 is_scaled{IR::Value{true}}; + ScaleIntegerCoord(ir, inst, is_scaled); +} + void Visit(const IR::Program& program, IR::Block& block, IR::Inst& inst) { const bool is_fragment_shader{program.stage == Stage::Fragment}; switch (inst.GetOpcode()) { @@ -144,6 +157,9 @@ void Visit(const IR::Program& program, IR::Block& block, IR::Inst& inst) { case IR::Opcode::ImageFetch: PatchImageFetch(block, inst); break; + case IR::Opcode::ImageRead: + PatchImageRead(block, inst); + break; default: break; } diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp index 02853b078..60c65047b 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp @@ -139,7 +139,7 @@ void ComputePipeline::Configure() { } } for (const auto& desc : info.image_descriptors) { - add_image(desc, true); + add_image(desc, desc.is_written); } texture_cache.FillComputeImageViews(std::span(views.data(), views.size())); diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index c3d549a6e..11559d6ce 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp @@ -362,7 +362,7 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { } if constexpr (Spec::has_images) { for (const auto& desc : info.image_descriptors) { - add_image(desc, true); + add_image(desc, desc.is_written); } } }}; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index f89b84c6e..6dc52e399 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -159,7 +159,7 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute, } } for (const auto& desc : info.image_descriptors) { - add_image(desc, true); + add_image(desc, desc.is_written); } texture_cache.FillComputeImageViews(std::span(views.data(), views.size())); diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 4efb5d735..c29bab678 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -322,20 +322,24 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { } return TexturePair(gpu_memory.Read(addr), via_header_index); }}; - const auto add_image{[&](const auto& desc) { + const auto add_image{[&](const auto& desc, bool blacklist) LAMBDA_FORCEINLINE { for (u32 index = 0; index < desc.count; ++index) { const auto handle{read_handle(desc, index)}; - views[view_index++] = {handle.first}; + views[view_index++] = { + .index = handle.first, + .blacklist = blacklist, + .id = {}, + }; } }}; if constexpr (Spec::has_texture_buffers) { for (const auto& desc : info.texture_buffer_descriptors) { - add_image(desc); + add_image(desc, false); } } if constexpr (Spec::has_image_buffers) { for (const auto& desc : info.image_buffer_descriptors) { - add_image(desc); + add_image(desc, false); } } for (const auto& desc : info.texture_descriptors) { @@ -349,7 +353,7 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { } if constexpr (Spec::has_images) { for (const auto& desc : info.image_descriptors) { - add_image(desc); + add_image(desc, desc.is_written); } } }}; From e66d5b88a6f1c2d85c5cd8e351c6ed52c96a0ecf Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sun, 1 Aug 2021 18:57:45 -0300 Subject: [PATCH 052/156] shader: Properly scale image reads and add GL SPIR-V support Thanks for everything! --- src/shader_recompiler/backend/bindings.h | 2 + .../backend/glasm/emit_context.cpp | 4 +- .../backend/glasm/emit_glasm.h | 2 + .../backend/glasm/emit_glasm_image.cpp | 9 +++ .../backend/glasm/emit_glasm_instructions.h | 1 + .../glasm/emit_glasm_not_implemented.cpp | 2 +- .../glsl/emit_glsl_context_get_set.cpp | 2 +- .../backend/glsl/emit_glsl_image.cpp | 8 ++ .../backend/spirv/emit_context.cpp | 65 ++++++++++++---- .../backend/spirv/emit_context.h | 11 ++- .../backend/spirv/emit_spirv.h | 16 +++- .../spirv/emit_spirv_context_get_set.cpp | 13 +++- .../backend/spirv/emit_spirv_image.cpp | 74 +++++++++++++------ .../backend/spirv/emit_spirv_instructions.h | 1 + .../frontend/ir/ir_emitter.cpp | 4 + .../frontend/ir/ir_emitter.h | 1 + src/shader_recompiler/frontend/ir/opcodes.inc | 1 + .../ir_opt/collect_shader_info_pass.cpp | 1 + .../ir_opt/rescaling_pass.cpp | 3 +- src/shader_recompiler/runtime_info.h | 2 - .../renderer_opengl/gl_buffer_cache.cpp | 13 +++- .../renderer_opengl/gl_compute_pipeline.cpp | 21 ++++-- .../renderer_opengl/gl_graphics_pipeline.cpp | 24 ++++-- .../renderer_vulkan/pipeline_helper.h | 22 +++++- .../renderer_vulkan/vk_pipeline_cache.cpp | 3 - 25 files changed, 228 insertions(+), 77 deletions(-) diff --git a/src/shader_recompiler/backend/bindings.h b/src/shader_recompiler/backend/bindings.h index 35503000c..669702553 100644 --- a/src/shader_recompiler/backend/bindings.h +++ b/src/shader_recompiler/backend/bindings.h @@ -14,6 +14,8 @@ struct Bindings { u32 storage_buffer{}; u32 texture{}; u32 image{}; + u32 texture_scaling_index{}; + u32 image_scaling_index{}; }; } // namespace Shader::Backend diff --git a/src/shader_recompiler/backend/glasm/emit_context.cpp b/src/shader_recompiler/backend/glasm/emit_context.cpp index 069c019ad..8fd459dfe 100644 --- a/src/shader_recompiler/backend/glasm/emit_context.cpp +++ b/src/shader_recompiler/backend/glasm/emit_context.cpp @@ -6,6 +6,7 @@ #include "shader_recompiler/backend/bindings.h" #include "shader_recompiler/backend/glasm/emit_context.h" +#include "shader_recompiler/backend/glasm/emit_glasm.h" #include "shader_recompiler/frontend/ir/program.h" #include "shader_recompiler/profile.h" #include "shader_recompiler/runtime_info.h" @@ -55,7 +56,8 @@ EmitContext::EmitContext(IR::Program& program, Bindings& bindings, const Profile } if (!runtime_info.glasm_use_storage_buffers) { if (const size_t num = info.storage_buffers_descriptors.size(); num > 0) { - Add("PARAM c[{}]={{program.local[0..{}]}};", num, num - 1); + const size_t index{num + PROGRAM_LOCAL_PARAMETER_STORAGE_BUFFER_BASE}; + Add("PARAM c[{}]={{program.local[0..{}]}};", index, index - 1); } } stage = program.stage; diff --git a/src/shader_recompiler/backend/glasm/emit_glasm.h b/src/shader_recompiler/backend/glasm/emit_glasm.h index bcb55f062..292655acb 100644 --- a/src/shader_recompiler/backend/glasm/emit_glasm.h +++ b/src/shader_recompiler/backend/glasm/emit_glasm.h @@ -13,6 +13,8 @@ namespace Shader::Backend::GLASM { +constexpr u32 PROGRAM_LOCAL_PARAMETER_STORAGE_BUFFER_BASE = 1; + [[nodiscard]] std::string EmitGLASM(const Profile& profile, const RuntimeInfo& runtime_info, IR::Program& program, Bindings& bindings); diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_image.cpp b/src/shader_recompiler/backend/glasm/emit_glasm_image.cpp index 05e88cd97..d325d31c7 100644 --- a/src/shader_recompiler/backend/glasm/emit_glasm_image.cpp +++ b/src/shader_recompiler/backend/glasm/emit_glasm_image.cpp @@ -617,6 +617,15 @@ void EmitIsTextureScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& inde 1u << index.U32(), ctx.reg_alloc.Define(inst)); } +void EmitIsImageScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index) { + if (!index.IsImmediate()) { + throw NotImplementedException("Non-constant texture rescaling"); + } + ctx.Add("AND.U RC.x,scaling[0].y,{};" + "SNE.S {},RC.x,0;", + 1u << index.U32(), ctx.reg_alloc.Define(inst)); +} + void EmitImageAtomicIAdd32(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, Register coord, ScalarU32 value) { ImageAtomic(ctx, inst, index, coord, value, "ADD.U32"); diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h b/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h index e2b7d601d..1f343bff5 100644 --- a/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h +++ b/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h @@ -557,6 +557,7 @@ void EmitImageRead(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, Reg void EmitImageWrite(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, Register coord, Register color); void EmitIsTextureScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index); +void EmitIsImageScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index); void EmitBindlessImageAtomicIAdd32(EmitContext&); void EmitBindlessImageAtomicSMin32(EmitContext&); void EmitBindlessImageAtomicUMin32(EmitContext&); diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp b/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp index c0f8ddcad..681aeda8d 100644 --- a/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp +++ b/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp @@ -211,7 +211,7 @@ void EmitYDirection(EmitContext& ctx, IR::Inst& inst) { } void EmitResolutionDownFactor(EmitContext& ctx, IR::Inst& inst) { - ctx.Add("MOV.F {}.x,scaling[0].y;", inst); + ctx.Add("MOV.F {}.x,scaling[0].z;", inst); } void EmitUndefU1(EmitContext& ctx, IR::Inst& inst) { diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp index 542a79230..4c26f3829 100644 --- a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp +++ b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp @@ -446,7 +446,7 @@ void EmitYDirection(EmitContext& ctx, IR::Inst& inst) { } void EmitResolutionDownFactor(EmitContext& ctx, IR::Inst& inst) { - ctx.AddF32("{}=scaling.y;", inst); + ctx.AddF32("{}=scaling.z;", inst); } void EmitLoadLocal(EmitContext& ctx, IR::Inst& inst, std::string_view word_offset) { diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp index 82b6f0d77..2f78d0267 100644 --- a/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp +++ b/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp @@ -620,6 +620,14 @@ void EmitIsTextureScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& inde ctx.AddU1("{}=(ftou(scaling.x)&{})!=0;", inst, 1u << image_index); } +void EmitIsImageScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index) { + if (!index.IsImmediate()) { + throw NotImplementedException("Non-constant texture rescaling"); + } + const u32 image_index{index.U32()}; + ctx.AddU1("{}=(ftou(scaling.y)&{})!=0;", inst, 1u << image_index); +} + void EmitBindlessImageSampleImplicitLod(EmitContext&) { NotImplemented(); } diff --git a/src/shader_recompiler/backend/spirv/emit_context.cpp b/src/shader_recompiler/backend/spirv/emit_context.cpp index 222baa177..8646fe989 100644 --- a/src/shader_recompiler/backend/spirv/emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/emit_context.cpp @@ -14,6 +14,7 @@ #include "common/common_types.h" #include "common/div_ceil.h" #include "shader_recompiler/backend/spirv/emit_context.h" +#include "shader_recompiler/backend/spirv/emit_spirv.h" namespace Shader::Backend::SPIRV { namespace { @@ -476,8 +477,9 @@ void VectorTypes::Define(Sirit::Module& sirit_ctx, Id base_type, std::string_vie EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_info_, IR::Program& program, Bindings& bindings) - : Sirit::Module(profile_.supported_spirv), profile{profile_}, - runtime_info{runtime_info_}, stage{program.stage} { + : Sirit::Module(profile_.supported_spirv), profile{profile_}, runtime_info{runtime_info_}, + stage{program.stage}, texture_rescaling_index{bindings.texture_scaling_index}, + image_rescaling_index{bindings.image_scaling_index} { const bool is_unified{profile.unified_descriptor_binding}; u32& uniform_binding{is_unified ? bindings.unified : bindings.uniform_buffer}; u32& storage_binding{is_unified ? bindings.unified : bindings.storage_buffer}; @@ -494,8 +496,8 @@ EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_inf DefineStorageBuffers(program.info, storage_binding); DefineTextureBuffers(program.info, texture_binding); DefineImageBuffers(program.info, image_binding); - DefineTextures(program.info, texture_binding); - DefineImages(program.info, image_binding); + DefineTextures(program.info, texture_binding, bindings.texture_scaling_index); + DefineImages(program.info, image_binding, bindings.image_scaling_index); DefineAttributeMemAccess(program.info); DefineGlobalMemoryFunctions(program.info); DefineRescalingInput(program.info); @@ -1003,25 +1005,49 @@ void EmitContext::DefineRescalingInput(const Info& info) { if (!info.uses_rescaling_uniform) { return; } - boost::container::static_vector members{F32[1]}; + if (profile.unified_descriptor_binding) { + DefineRescalingInputPushConstant(info); + } else { + DefineRescalingInputUniformConstant(); + } +} + +void EmitContext::DefineRescalingInputPushConstant(const Info& info) { + boost::container::static_vector members{F32[1]}; u32 member_index{0}; - const u32 num_texture_words{Common::DivCeil(runtime_info.num_textures, 32u)}; - if (runtime_info.num_textures > 0) { - rescaling_textures_type = TypeArray(U32[1], Const(num_texture_words)); + if (!info.texture_descriptors.empty()) { + rescaling_textures_type = TypeArray(U32[1], Const(4u)); Decorate(rescaling_textures_type, spv::Decoration::ArrayStride, 4u); members.push_back(rescaling_textures_type); rescaling_textures_member_index = ++member_index; } + if (!info.image_descriptors.empty()) { + rescaling_images_type = TypeArray(U32[1], Const(NUM_IMAGE_SCALING_WORDS)); + if (rescaling_textures_type.value != rescaling_images_type.value) { + Decorate(rescaling_images_type, spv::Decoration::ArrayStride, 4u); + } + members.push_back(rescaling_images_type); + rescaling_images_member_index = ++member_index; + } const Id push_constant_struct{TypeStruct(std::span(members.data(), members.size()))}; Decorate(push_constant_struct, spv::Decoration::Block); Name(push_constant_struct, "ResolutionInfo"); + MemberDecorate(push_constant_struct, 0u, spv::Decoration::Offset, 0u); MemberName(push_constant_struct, 0u, "down_factor"); - if (runtime_info.num_textures > 0) { - MemberDecorate(push_constant_struct, rescaling_textures_member_index, - spv::Decoration::Offset, 4u); + + const u32 offset_bias = stage == Stage::Compute ? sizeof(u32) : 0; + if (!info.texture_descriptors.empty()) { + MemberDecorate( + push_constant_struct, rescaling_textures_member_index, spv::Decoration::Offset, + static_cast(offsetof(RescalingLayout, rescaling_textures) - offset_bias)); MemberName(push_constant_struct, rescaling_textures_member_index, "rescaling_textures"); } + if (!info.image_descriptors.empty()) { + MemberDecorate(push_constant_struct, rescaling_images_member_index, spv::Decoration::Offset, + static_cast(offsetof(RescalingLayout, rescaling_images) - offset_bias)); + MemberName(push_constant_struct, rescaling_images_member_index, "rescaling_images"); + } const Id pointer_type{TypePointer(spv::StorageClass::PushConstant, push_constant_struct)}; rescaling_push_constants = AddGlobalVariable(pointer_type, spv::StorageClass::PushConstant); Name(rescaling_push_constants, "rescaling_push_constants"); @@ -1031,6 +1057,17 @@ void EmitContext::DefineRescalingInput(const Info& info) { } } +void EmitContext::DefineRescalingInputUniformConstant() { + const Id pointer_type{TypePointer(spv::StorageClass::UniformConstant, F32[4])}; + rescaling_uniform_constant = + AddGlobalVariable(pointer_type, spv::StorageClass::UniformConstant); + Decorate(rescaling_uniform_constant, spv::Decoration::Location, 0u); + + if (profile.supported_spirv >= 0x00010400) { + interfaces.push_back(rescaling_uniform_constant); + } +} + void EmitContext::DefineConstantBuffers(const Info& info, u32& binding) { if (info.constant_buffer_descriptors.empty()) { return; @@ -1219,7 +1256,7 @@ void EmitContext::DefineImageBuffers(const Info& info, u32& binding) { } } -void EmitContext::DefineTextures(const Info& info, u32& binding) { +void EmitContext::DefineTextures(const Info& info, u32& binding, u32& scaling_index) { textures.reserve(info.texture_descriptors.size()); for (const TextureDescriptor& desc : info.texture_descriptors) { const Id image_type{ImageType(*this, desc)}; @@ -1241,13 +1278,14 @@ void EmitContext::DefineTextures(const Info& info, u32& binding) { interfaces.push_back(id); } ++binding; + ++scaling_index; } if (info.uses_atomic_image_u32) { image_u32 = TypePointer(spv::StorageClass::Image, U32[1]); } } -void EmitContext::DefineImages(const Info& info, u32& binding) { +void EmitContext::DefineImages(const Info& info, u32& binding, u32& scaling_index) { images.reserve(info.image_descriptors.size()); for (const ImageDescriptor& desc : info.image_descriptors) { if (desc.count != 1) { @@ -1268,6 +1306,7 @@ void EmitContext::DefineImages(const Info& info, u32& binding) { interfaces.push_back(id); } ++binding; + ++scaling_index; } } diff --git a/src/shader_recompiler/backend/spirv/emit_context.h b/src/shader_recompiler/backend/spirv/emit_context.h index a7917ac51..b67704baa 100644 --- a/src/shader_recompiler/backend/spirv/emit_context.h +++ b/src/shader_recompiler/backend/spirv/emit_context.h @@ -238,9 +238,14 @@ public: Id indexed_load_func{}; Id indexed_store_func{}; + Id rescaling_uniform_constant{}; Id rescaling_push_constants{}; Id rescaling_textures_type{}; + Id rescaling_images_type{}; u32 rescaling_textures_member_index{}; + u32 rescaling_images_member_index{}; + u32 texture_rescaling_index{}; + u32 image_rescaling_index{}; Id local_memory{}; @@ -314,11 +319,13 @@ private: void DefineStorageBuffers(const Info& info, u32& binding); void DefineTextureBuffers(const Info& info, u32& binding); void DefineImageBuffers(const Info& info, u32& binding); - void DefineTextures(const Info& info, u32& binding); - void DefineImages(const Info& info, u32& binding); + void DefineTextures(const Info& info, u32& binding, u32& scaling_index); + void DefineImages(const Info& info, u32& binding, u32& scaling_index); void DefineAttributeMemAccess(const Info& info); void DefineGlobalMemoryFunctions(const Info& info); void DefineRescalingInput(const Info& info); + void DefineRescalingInputPushConstant(const Info& info); + void DefineRescalingInputUniformConstant(); void DefineInputs(const IR::Program& program); void DefineOutputs(const IR::Program& program); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.h b/src/shader_recompiler/backend/spirv/emit_spirv.h index 7b0d8d980..db0998ad6 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv.h @@ -16,15 +16,23 @@ namespace Shader::Backend::SPIRV { +constexpr u32 NUM_TEXTURE_SCALING_WORDS = 4; +constexpr u32 NUM_IMAGE_SCALING_WORDS = 2; +constexpr u32 NUM_TEXTURE_AND_IMAGE_SCALING_WORDS = + NUM_TEXTURE_SCALING_WORDS + NUM_IMAGE_SCALING_WORDS; + +struct RescalingLayout { + u32 down_factor; + std::array rescaling_textures; + std::array rescaling_images; +}; + [[nodiscard]] std::vector EmitSPIRV(const Profile& profile, const RuntimeInfo& runtime_info, IR::Program& program, Bindings& bindings); [[nodiscard]] inline std::vector EmitSPIRV(const Profile& profile, IR::Program& program) { - RuntimeInfo runtime_info{}; - runtime_info.num_textures = Shader::NumDescriptors(program.info.texture_descriptors); - Bindings binding; - return EmitSPIRV(profile, runtime_info, program, binding); + return EmitSPIRV(profile, {}, program, binding); } } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 6bb791b03..c0db7452f 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -527,10 +527,15 @@ Id EmitYDirection(EmitContext& ctx) { } Id EmitResolutionDownFactor(EmitContext& ctx) { - const Id pointer_type{ctx.TypePointer(spv::StorageClass::PushConstant, ctx.F32[1])}; - const Id pointer{ - ctx.OpAccessChain(pointer_type, ctx.rescaling_push_constants, ctx.u32_zero_value)}; - return ctx.OpLoad(ctx.F32[1], pointer); + if (ctx.profile.unified_descriptor_binding) { + const Id pointer_type{ctx.TypePointer(spv::StorageClass::PushConstant, ctx.F32[1])}; + const Id pointer{ + ctx.OpAccessChain(pointer_type, ctx.rescaling_push_constants, ctx.u32_zero_value)}; + return ctx.OpLoad(ctx.F32[1], pointer); + } else { + const Id composite{ctx.OpLoad(ctx.F32[4], ctx.rescaling_uniform_constant)}; + return ctx.OpCompositeExtract(ctx.F32[1], composite, 2u); + } } Id EmitLoadLocal(EmitContext& ctx, Id word_offset) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp index 7d7c0627e..519ce8b9b 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp @@ -224,6 +224,40 @@ Id Emit(MethodPtrType sparse_ptr, MethodPtrType non_sparse_ptr, EmitContext& ctx Decorate(ctx, inst, sample); return ctx.OpCompositeExtract(result_type, sample, 1U); } + +Id IsScaled(EmitContext& ctx, const IR::Value& index, Id member_index, u32 base_index) { + const Id push_constant_u32{ctx.TypePointer(spv::StorageClass::PushConstant, ctx.U32[1])}; + Id bit{}; + if (index.IsImmediate()) { + // Use BitwiseAnd instead of BitfieldExtract for better codegen on Nvidia OpenGL. + // LOP32I.NZ is used to set the predicate rather than BFE+ISETP. + const u32 index_value{index.U32() + base_index}; + const Id word_index{ctx.Const(index_value / 32)}; + const Id bit_index_mask{ctx.Const(1u << (index_value % 32))}; + const Id pointer{ctx.OpAccessChain(push_constant_u32, ctx.rescaling_push_constants, + member_index, word_index)}; + const Id word{ctx.OpLoad(ctx.U32[1], pointer)}; + bit = ctx.OpBitwiseAnd(ctx.U32[1], word, bit_index_mask); + } else { + Id index_value{ctx.Def(index)}; + if (base_index != 0) { + index_value = ctx.OpIAdd(ctx.U32[1], index_value, ctx.Const(base_index)); + } + const Id word_index{ctx.OpShiftRightArithmetic(ctx.U32[1], index_value, ctx.Const(5u))}; + const Id pointer{ctx.OpAccessChain(push_constant_u32, ctx.rescaling_push_constants, + member_index, word_index)}; + const Id word{ctx.OpLoad(ctx.U32[1], pointer)}; + const Id bit_index{ctx.OpBitwiseAnd(ctx.U32[1], index_value, ctx.Const(31u))}; + bit = ctx.OpBitFieldUExtract(ctx.U32[1], index_value, bit_index, ctx.Const(1u)); + } + return ctx.OpINotEqual(ctx.U1, bit, ctx.u32_zero_value); +} + +Id BitTest(EmitContext& ctx, Id mask, Id bit) { + const Id shifted{ctx.OpShiftRightLogical(ctx.U32[1], mask, bit)}; + const Id bit_value{ctx.OpBitwiseAnd(ctx.U32[1], shifted, ctx.Const(1u))}; + return ctx.OpINotEqual(ctx.U1, bit_value, ctx.u32_zero_value); +} } // Anonymous namespace Id EmitBindlessImageSampleImplicitLod(EmitContext&) { @@ -471,29 +505,27 @@ void EmitImageWrite(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id } Id EmitIsTextureScaled(EmitContext& ctx, const IR::Value& index) { - const Id push_constant_u32{ctx.TypePointer(spv::StorageClass::PushConstant, ctx.U32[1])}; - const Id member_index{ctx.Const(ctx.rescaling_textures_member_index)}; - Id bit{}; - if (index.IsImmediate()) { - // Use BitwiseAnd instead of BitfieldExtract for better codegen on Nvidia OpenGL. - // LOP32I.NZ is used to set the predicate rather than BFE+ISETP. - const u32 index_value{index.U32()}; - const Id word_index{ctx.Const(index_value / 32)}; - const Id bit_index_mask{ctx.Const(1u << (index_value % 32))}; - const Id pointer{ctx.OpAccessChain(push_constant_u32, ctx.rescaling_push_constants, - member_index, word_index)}; - const Id word{ctx.OpLoad(ctx.U32[1], pointer)}; - bit = ctx.OpBitwiseAnd(ctx.U32[1], word, bit_index_mask); + if (ctx.profile.unified_descriptor_binding) { + const Id member_index{ctx.Const(ctx.rescaling_textures_member_index)}; + return IsScaled(ctx, index, member_index, ctx.texture_rescaling_index); } else { - const Id index_value{ctx.Def(index)}; - const Id word_index{ctx.OpShiftRightArithmetic(ctx.U32[1], index_value, ctx.Const(5u))}; - const Id pointer{ctx.OpAccessChain(push_constant_u32, ctx.rescaling_push_constants, - member_index, word_index)}; - const Id word{ctx.OpLoad(ctx.U32[1], pointer)}; - const Id bit_index{ctx.OpBitwiseAnd(ctx.U32[1], index_value, ctx.Const(31u))}; - bit = ctx.OpBitFieldUExtract(ctx.U32[1], index_value, bit_index, ctx.Const(1u)); + const Id composite{ctx.OpLoad(ctx.F32[4], ctx.rescaling_uniform_constant)}; + const Id mask_f32{ctx.OpCompositeExtract(ctx.F32[1], composite, 0u)}; + const Id mask{ctx.OpBitcast(ctx.U32[1], mask_f32)}; + return BitTest(ctx, mask, ctx.Def(index)); + } +} + +Id EmitIsImageScaled(EmitContext& ctx, const IR::Value& index) { + if (ctx.profile.unified_descriptor_binding) { + const Id member_index{ctx.Const(ctx.rescaling_images_member_index)}; + return IsScaled(ctx, index, member_index, ctx.image_rescaling_index); + } else { + const Id composite{ctx.OpLoad(ctx.F32[4], ctx.rescaling_uniform_constant)}; + const Id mask_f32{ctx.OpCompositeExtract(ctx.F32[1], composite, 1u)}; + const Id mask{ctx.OpBitcast(ctx.U32[1], mask_f32)}; + return BitTest(ctx, mask, ctx.Def(index)); } - return ctx.OpINotEqual(ctx.U1, bit, ctx.u32_zero_value); } } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 69fc18f5f..6cd22dd3e 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -514,6 +514,7 @@ Id EmitImageGradient(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, I Id EmitImageRead(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords); void EmitImageWrite(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords, Id color); Id EmitIsTextureScaled(EmitContext& ctx, const IR::Value& index); +Id EmitIsImageScaled(EmitContext& ctx, const IR::Value& index); Id EmitBindlessImageAtomicIAdd32(EmitContext&); Id EmitBindlessImageAtomicSMin32(EmitContext&); Id EmitBindlessImageAtomicUMin32(EmitContext&); diff --git a/src/shader_recompiler/frontend/ir/ir_emitter.cpp b/src/shader_recompiler/frontend/ir/ir_emitter.cpp index 3ccd91c10..356f889ac 100644 --- a/src/shader_recompiler/frontend/ir/ir_emitter.cpp +++ b/src/shader_recompiler/frontend/ir/ir_emitter.cpp @@ -1950,6 +1950,10 @@ U1 IREmitter::IsTextureScaled(const U32& index) { return Inst(Opcode::IsTextureScaled, index); } +U1 IREmitter::IsImageScaled(const U32& index) { + return Inst(Opcode::IsImageScaled, index); +} + U1 IREmitter::VoteAll(const U1& value) { return Inst(Opcode::VoteAll, value); } diff --git a/src/shader_recompiler/frontend/ir/ir_emitter.h b/src/shader_recompiler/frontend/ir/ir_emitter.h index a78628413..13eefa88b 100644 --- a/src/shader_recompiler/frontend/ir/ir_emitter.h +++ b/src/shader_recompiler/frontend/ir/ir_emitter.h @@ -361,6 +361,7 @@ public: const Value& value, TextureInstInfo info); [[nodiscard]] U1 IsTextureScaled(const U32& index); + [[nodiscard]] U1 IsImageScaled(const U32& index); [[nodiscard]] U1 VoteAll(const U1& value); [[nodiscard]] U1 VoteAny(const U1& value); diff --git a/src/shader_recompiler/frontend/ir/opcodes.inc b/src/shader_recompiler/frontend/ir/opcodes.inc index ec629428a..6929919df 100644 --- a/src/shader_recompiler/frontend/ir/opcodes.inc +++ b/src/shader_recompiler/frontend/ir/opcodes.inc @@ -494,6 +494,7 @@ OPCODE(ImageRead, U32x4, Opaq OPCODE(ImageWrite, Void, Opaque, Opaque, U32x4, ) OPCODE(IsTextureScaled, U1, U32, ) +OPCODE(IsImageScaled, U1, U32, ) // Atomic Image operations diff --git a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp index ed82fa2ac..1e476d83d 100644 --- a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp +++ b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp @@ -432,6 +432,7 @@ void VisitUsages(Info& info, IR::Inst& inst) { break; case IR::Opcode::ResolutionDownFactor: case IR::Opcode::IsTextureScaled: + case IR::Opcode::IsImageScaled: info.uses_rescaling_uniform = true; break; case IR::Opcode::LaneId: diff --git a/src/shader_recompiler/ir_opt/rescaling_pass.cpp b/src/shader_recompiler/ir_opt/rescaling_pass.cpp index 86c8f0c69..2af12fc07 100644 --- a/src/shader_recompiler/ir_opt/rescaling_pass.cpp +++ b/src/shader_recompiler/ir_opt/rescaling_pass.cpp @@ -129,8 +129,7 @@ void PatchImageFetch(IR::Block& block, IR::Inst& inst) { void PatchImageRead(IR::Block& block, IR::Inst& inst) { IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; const auto info{inst.Flags()}; - // TODO: Scale conditionally - const IR::U1 is_scaled{IR::Value{true}}; + const IR::U1 is_scaled{ir.IsImageScaled(ir.Imm32(info.descriptor_index))}; ScaleIntegerCoord(ir, inst, is_scaled); } diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index dc89cb923..f3f83a258 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -63,8 +63,6 @@ struct RuntimeInfo { std::array generic_input_types{}; VaryingState previous_stage_stores; - u32 num_textures{}; - bool convert_depth_mode{}; bool force_early_z{}; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 187a28e4d..d4dd10bb6 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -5,6 +5,7 @@ #include #include +#include "shader_recompiler/backend/glasm/emit_glasm.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" @@ -229,8 +230,10 @@ void BufferCacheRuntime::BindStorageBuffer(size_t stage, u32 binding_index, Buff .padding = 0, }; buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY); - glProgramLocalParametersI4uivNV(PROGRAM_LUT[stage], binding_index, 1, - reinterpret_cast(&ssbo)); + glProgramLocalParametersI4uivNV( + PROGRAM_LUT[stage], + Shader::Backend::GLASM::PROGRAM_LOCAL_PARAMETER_STORAGE_BUFFER_BASE + binding_index, 1, + reinterpret_cast(&ssbo)); } } @@ -250,8 +253,10 @@ void BufferCacheRuntime::BindComputeStorageBuffer(u32 binding_index, Buffer& buf .padding = 0, }; buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY); - glProgramLocalParametersI4uivNV(GL_COMPUTE_PROGRAM_NV, binding_index, 1, - reinterpret_cast(&ssbo)); + glProgramLocalParametersI4uivNV( + GL_COMPUTE_PROGRAM_NV, + Shader::Backend::GLASM::PROGRAM_LOCAL_PARAMETER_STORAGE_BUFFER_BASE + binding_index, 1, + reinterpret_cast(&ssbo)); } } diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp index 60c65047b..9af61c340 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp @@ -181,33 +181,40 @@ void ComputePipeline::Configure() { texture_binding += num_texture_buffers; image_binding += num_image_buffers; - u32 scaling_mask{}; + u32 texture_scaling_mask{}; for (const auto& desc : info.texture_descriptors) { for (u32 index = 0; index < desc.count; ++index) { ImageView& image_view{texture_cache.GetImageView((views_it++)->id)}; textures[texture_binding] = image_view.Handle(desc.type); if (texture_cache.IsRescaling(image_view)) { - scaling_mask |= 1u << texture_binding; + texture_scaling_mask |= 1u << texture_binding; } ++texture_binding; } } + u32 image_scaling_mask{}; for (const auto& desc : info.image_descriptors) { for (u32 index = 0; index < desc.count; ++index) { ImageView& image_view{texture_cache.GetImageView((views_it++)->id)}; if (desc.is_written) { texture_cache.MarkModification(image_view.image_id); } - images[image_binding++] = image_view.StorageView(desc.type, desc.format); + images[image_binding] = image_view.StorageView(desc.type, desc.format); + if (texture_cache.IsRescaling(image_view)) { + image_scaling_mask |= 1u << image_binding; + } + ++image_binding; } } if (info.uses_rescaling_uniform) { - const f32 float_scaling_mask{Common::BitCast(scaling_mask)}; + const f32 float_texture_scaling_mask{Common::BitCast(texture_scaling_mask)}; + const f32 float_image_scaling_mask{Common::BitCast(image_scaling_mask)}; if (assembly_program.handle != 0) { - glProgramLocalParameter4fARB(GL_COMPUTE_PROGRAM_NV, 0, float_scaling_mask, 0.0f, 0.0f, - 0.0f); + glProgramLocalParameter4fARB(GL_COMPUTE_PROGRAM_NV, 0, float_texture_scaling_mask, + float_image_scaling_mask, 0.0f, 0.0f); } else { - glProgramUniform4f(source_program.handle, 0, float_scaling_mask, 0.0f, 0.0f, 0.0f); + glProgramUniform4f(source_program.handle, 0, float_texture_scaling_mask, + float_image_scaling_mask, 0.0f, 0.0f); } } if (texture_binding != 0) { diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index 11559d6ce..f8495896c 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp @@ -464,8 +464,10 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { views_it += num_texture_buffers[stage]; views_it += num_image_buffers[stage]; - u32 scaling_mask{}; + u32 texture_scaling_mask{}; + u32 image_scaling_mask{}; u32 stage_texture_binding{}; + u32 stage_image_binding{}; const auto& info{stage_infos[stage]}; for (const auto& desc : info.texture_descriptors) { @@ -473,7 +475,7 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { ImageView& image_view{texture_cache.GetImageView((views_it++)->id)}; textures[texture_binding] = image_view.Handle(desc.type); if (texture_cache.IsRescaling(image_view)) { - scaling_mask |= 1u << stage_texture_binding; + texture_scaling_mask |= 1u << stage_texture_binding; } ++texture_binding; ++stage_texture_binding; @@ -485,20 +487,26 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { if (desc.is_written) { texture_cache.MarkModification(image_view.image_id); } - images[image_binding++] = image_view.StorageView(desc.type, desc.format); + images[image_binding] = image_view.StorageView(desc.type, desc.format); + if (texture_cache.IsRescaling(image_view)) { + image_scaling_mask |= 1u << stage_image_binding; + } + ++image_binding; + ++stage_image_binding; } } if (info.uses_rescaling_uniform) { - const f32 float_scaling_mask{Common::BitCast(scaling_mask)}; + const f32 float_texture_scaling_mask{Common::BitCast(texture_scaling_mask)}; + const f32 float_image_scaling_mask{Common::BitCast(image_scaling_mask)}; const bool is_rescaling{texture_cache.IsRescaling()}; const f32 config_down_factor{Settings::values.resolution_info.down_factor}; const f32 down_factor{is_rescaling ? config_down_factor : 1.0f}; if (use_assembly) { - glProgramLocalParameter4fARB(AssemblyStage(stage), 0, float_scaling_mask, - down_factor, 0.0f, 0.0f); + glProgramLocalParameter4fARB(AssemblyStage(stage), 0, float_texture_scaling_mask, + float_image_scaling_mask, down_factor, 0.0f); } else { - glProgramUniform4f(source_programs[stage].handle, 0, float_scaling_mask, - down_factor, 0.0f, 0.0f); + glProgramUniform4f(source_programs[stage].handle, 0, float_texture_scaling_mask, + float_image_scaling_mask, down_factor, 0.0f); } } }}; diff --git a/src/video_core/renderer_vulkan/pipeline_helper.h b/src/video_core/renderer_vulkan/pipeline_helper.h index bce4220c6..85ae726d1 100644 --- a/src/video_core/renderer_vulkan/pipeline_helper.h +++ b/src/video_core/renderer_vulkan/pipeline_helper.h @@ -10,6 +10,7 @@ #include "common/assert.h" #include "common/common_types.h" +#include "shader_recompiler/backend/spirv/emit_spirv.h" #include "shader_recompiler/shader_info.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" @@ -20,7 +21,7 @@ namespace Vulkan { -constexpr size_t MAX_RESCALING_WORDS = 4; +using Shader::Backend::SPIRV::NUM_TEXTURE_AND_IMAGE_SCALING_WORDS; class DescriptorLayoutBuilder { public: @@ -74,7 +75,8 @@ public: .stageFlags = static_cast( is_compute ? VK_SHADER_STAGE_COMPUTE_BIT : VK_SHADER_STAGE_ALL_GRAPHICS), .offset = 0, - .size = (is_compute ? 0 : sizeof(f32)) + sizeof(std::array), + .size = (is_compute ? 0 : sizeof(f32)) + + sizeof(std::array), }; return device->GetLogical().CreatePipelineLayout({ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, @@ -146,14 +148,25 @@ public: } } - const std::array& Data() const noexcept { + void PushImage(bool is_rescaled) noexcept { + *image_ptr |= is_rescaled ? image_bit : 0; + image_bit <<= 1; + if (image_bit == 0) { + image_bit = 1u; + ++image_ptr; + } + } + + const std::array& Data() const noexcept { return words; } private: - std::array words{}; + std::array words{}; u32* texture_ptr{words.data()}; + u32* image_ptr{words.data() + Shader::Backend::SPIRV::NUM_TEXTURE_SCALING_WORDS}; u32 texture_bit{1u}; + u32 image_bit{1u}; }; inline void PushImageDescriptors(TextureCache& texture_cache, @@ -181,6 +194,7 @@ inline void PushImageDescriptors(TextureCache& texture_cache, } const VkImageView vk_image_view{image_view.StorageView(desc.type, desc.format)}; update_descriptor_queue.AddImage(vk_image_view); + rescaling.PushImage(texture_cache.IsRescaling(image_view)); } } } diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 691ef0841..eb8b4e08b 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -139,9 +139,6 @@ Shader::RuntimeInfo MakeRuntimeInfo(std::span program } else { info.previous_stage_stores.mask.set(); } - for (const auto& stage : programs) { - info.num_textures += Shader::NumDescriptors(stage.info.texture_descriptors); - } const Shader::Stage stage{program.stage}; const bool has_geometry{key.unique_hashes[4] != 0 && !programs[4].is_geometry_passthrough}; const bool gl_ndc{key.state.ndc_minus_one_to_one != 0}; From dc28284437c7f99baa98a242f4713a1ab94418c8 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Wed, 4 Aug 2021 00:30:16 -0400 Subject: [PATCH 053/156] emit_spirv: Fix RescalingLayout alignment --- src/shader_recompiler/backend/spirv/emit_spirv.h | 4 ++-- src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.h b/src/shader_recompiler/backend/spirv/emit_spirv.h index db0998ad6..dd6dff0c8 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv.h @@ -23,8 +23,8 @@ constexpr u32 NUM_TEXTURE_AND_IMAGE_SCALING_WORDS = struct RescalingLayout { u32 down_factor; - std::array rescaling_textures; - std::array rescaling_images; + alignas(16) std::array rescaling_textures; + alignas(16) std::array rescaling_images; }; [[nodiscard]] std::vector EmitSPIRV(const Profile& profile, const RuntimeInfo& runtime_info, diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index c29bab678..5ad1180bb 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -479,7 +479,7 @@ void GraphicsPipeline::ConfigureDraw(const RescalingPushConstant& rescaling) { } if (update_rescaling) { const f32 config_down_factor{Settings::values.resolution_info.down_factor}; - const float scale_down_factor{is_rescaling ? config_down_factor : 1.0f}; + const f32 scale_down_factor{is_rescaling ? config_down_factor : 1.0f}; cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_ALL_GRAPHICS, 0, sizeof(scale_down_factor), &scale_down_factor); } From ed675cfd8cc89d64c763becfd991d1dd40deac5a Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Wed, 4 Aug 2021 19:02:30 -0400 Subject: [PATCH 054/156] texture_cache: Disable dst_image scaling in BlitImage Fixes scaling in Super Mario Party --- src/video_core/texture_cache/texture_cache.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 0e70c4db2..d86f80b5d 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -478,11 +478,13 @@ void TextureCache

::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, const bool is_src_rescaled = True(src_image.flags & ImageFlagBits::Rescaled); bool is_dst_rescaled = True(dst_image.flags & ImageFlagBits::Rescaled); - if (is_src_rescaled && !is_dst_rescaled) { - if (ImageCanRescale(dst_image)) { - is_dst_rescaled = dst_image.ScaleUp(); - } - } + // TODO: This requires the rendertarget image views to be updated with the upscaled sizes, + // otherwise the blit will use a larger framebuffer size than the image view attachment. + // if (is_src_rescaled && !is_dst_rescaled) { + // if (ImageCanRescale(dst_image)) { + // is_dst_rescaled = dst_image.ScaleUp(); + // } + // } const auto& resolution = Settings::values.resolution_info; const auto scale_up = [&](u32 value) -> u32 { From 8f78444de35bdbdc83a709b8a822d66018bb9852 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Mon, 2 Aug 2021 01:03:15 +0200 Subject: [PATCH 055/156] shader: Fix TextureSize check on rescaling. --- .../ir_opt/rescaling_pass.cpp | 48 ++++++++----------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/src/shader_recompiler/ir_opt/rescaling_pass.cpp b/src/shader_recompiler/ir_opt/rescaling_pass.cpp index 2af12fc07..b94273aa5 100644 --- a/src/shader_recompiler/ir_opt/rescaling_pass.cpp +++ b/src/shader_recompiler/ir_opt/rescaling_pass.cpp @@ -26,11 +26,11 @@ void PatchFragCoord(IR::Block& block, IR::Inst& inst) { IR::U32 scaled_value{value}; bool changed{}; if (const u32 up_scale = Settings::values.resolution_info.up_scale; up_scale != 1) { - scaled_value = ir.IMul(value, ir.Imm32(up_scale)); + scaled_value = ir.IMul(scaled_value, ir.Imm32(up_scale)); changed = true; } if (const u32 down_shift = Settings::values.resolution_info.down_shift; down_shift != 0) { - scaled_value = ir.ShiftRightArithmetic(value, ir.Imm32(down_shift)); + scaled_value = ir.ShiftRightArithmetic(scaled_value, ir.Imm32(down_shift)); changed = true; } if (changed) { @@ -40,41 +40,42 @@ void PatchFragCoord(IR::Block& block, IR::Inst& inst) { } } -[[nodiscard]] IR::U32 DownScale(IR::IREmitter& ir, IR::U32 value) { +[[nodiscard]] IR::U32 DownScale(IR::IREmitter& ir, const IR::U1& is_scaled, IR::U32 value) { + IR::U32 scaled_value{value}; + bool changed{}; if (const u32 down_shift = Settings::values.resolution_info.down_shift; down_shift != 0) { - value = ir.ShiftLeftLogical(value, ir.Imm32(down_shift)); + scaled_value = ir.ShiftLeftLogical(scaled_value, ir.Imm32(down_shift)); + changed = true; } if (const u32 up_scale = Settings::values.resolution_info.up_scale; up_scale != 1) { - value = ir.IDiv(value, ir.Imm32(up_scale)); + scaled_value = ir.IDiv(scaled_value, ir.Imm32(up_scale)); + changed = true; + } + if (changed) { + return IR::U32{ir.Select(is_scaled, scaled_value, value)}; + } else { + return value; } - return value; } void PatchImageQueryDimensions(IR::Block& block, IR::Inst& inst) { const auto it{IR::Block::InstructionList::s_iterator_to(inst)}; IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; const auto info{inst.Flags()}; + const IR::U1 is_scaled{ir.IsTextureScaled(ir.Imm32(info.descriptor_index))}; switch (info.type) { - case TextureType::Color1D: - case TextureType::ColorArray1D: { - const IR::Value new_inst{&*block.PrependNewInst(it, inst)}; - const IR::U32 width{DownScale(ir, IR::U32{ir.CompositeExtract(new_inst, 0)})}; - const IR::Value replacement{ir.CompositeConstruct(width, ir.CompositeExtract(new_inst, 1), - ir.CompositeExtract(new_inst, 2), - ir.CompositeExtract(new_inst, 3))}; - inst.ReplaceUsesWith(replacement); - break; - } case TextureType::Color2D: case TextureType::ColorArray2D: { const IR::Value new_inst{&*block.PrependNewInst(it, inst)}; - const IR::U32 width{DownScale(ir, IR::U32{ir.CompositeExtract(new_inst, 0)})}; - const IR::U32 height{DownScale(ir, IR::U32{ir.CompositeExtract(new_inst, 1)})}; + const IR::U32 width{DownScale(ir, is_scaled, IR::U32{ir.CompositeExtract(new_inst, 0)})}; + const IR::U32 height{DownScale(ir, is_scaled, IR::U32{ir.CompositeExtract(new_inst, 1)})}; const IR::Value replacement{ir.CompositeConstruct( width, height, ir.CompositeExtract(new_inst, 2), ir.CompositeExtract(new_inst, 3))}; inst.ReplaceUsesWith(replacement); break; } + case TextureType::Color1D: + case TextureType::ColorArray1D: case TextureType::Color3D: case TextureType::ColorCube: case TextureType::ColorArrayCube: @@ -88,15 +89,6 @@ void ScaleIntegerCoord(IR::IREmitter& ir, IR::Inst& inst, const IR::U1& is_scale const auto info{inst.Flags()}; const IR::Value coord{inst.Arg(1)}; switch (info.type) { - case TextureType::Color1D: - inst.SetArg(1, Scale(ir, is_scaled, IR::U32{coord})); - break; - case TextureType::ColorArray1D: { - const IR::U32 x{Scale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 0)})}; - const IR::U32 y{ir.CompositeExtract(coord, 1)}; - inst.SetArg(1, ir.CompositeConstruct(x, y)); - break; - } case TextureType::Color2D: { const IR::U32 x{Scale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 0)})}; const IR::U32 y{Scale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 1)})}; @@ -110,6 +102,8 @@ void ScaleIntegerCoord(IR::IREmitter& ir, IR::Inst& inst, const IR::U1& is_scale inst.SetArg(1, ir.CompositeConstruct(x, y, z)); break; } + case TextureType::Color1D: + case TextureType::ColorArray1D: case TextureType::Color3D: case TextureType::ColorCube: case TextureType::ColorArrayCube: From 4b1393a691d1d8d79c57e7b73734cb8287b91760 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 7 Aug 2021 02:15:24 +0200 Subject: [PATCH 056/156] Texture Cache: Correctly fix Blits Rescaling. --- src/video_core/texture_cache/texture_cache.h | 21 +++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index d86f80b5d..2de439889 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -473,18 +473,21 @@ void TextureCache

::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, PrepareImage(dst_id, true, false); Image& dst_image = slot_images[dst_id]; - const Image& src_image = slot_images[src_id]; + Image& src_image = slot_images[src_id]; - const bool is_src_rescaled = True(src_image.flags & ImageFlagBits::Rescaled); + bool is_src_rescaled = True(src_image.flags & ImageFlagBits::Rescaled); bool is_dst_rescaled = True(dst_image.flags & ImageFlagBits::Rescaled); - // TODO: This requires the rendertarget image views to be updated with the upscaled sizes, - // otherwise the blit will use a larger framebuffer size than the image view attachment. - // if (is_src_rescaled && !is_dst_rescaled) { - // if (ImageCanRescale(dst_image)) { - // is_dst_rescaled = dst_image.ScaleUp(); - // } - // } + if (is_src_rescaled != is_dst_rescaled) { + if (ImageCanRescale(dst_image)) { + ScaleUp(dst_image); + is_dst_rescaled = True(dst_image.flags & ImageFlagBits::Rescaled); + } + if (ImageCanRescale(src_image)) { + ScaleUp(src_image); + is_src_rescaled = True(src_image.flags & ImageFlagBits::Rescaled); + } + } const auto& resolution = Settings::values.resolution_info; const auto scale_up = [&](u32 value) -> u32 { From dfa82915262ca26d0884528b7bdae791554332ca Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 7 Aug 2021 02:59:05 +0200 Subject: [PATCH 057/156] RescalingPass: Agregate pixels on texelFetch while on Fragment Shader --- .../ir_opt/rescaling_pass.cpp | 100 +++++++++++++++++- 1 file changed, 97 insertions(+), 3 deletions(-) diff --git a/src/shader_recompiler/ir_opt/rescaling_pass.cpp b/src/shader_recompiler/ir_opt/rescaling_pass.cpp index b94273aa5..71c9d9e6f 100644 --- a/src/shader_recompiler/ir_opt/rescaling_pass.cpp +++ b/src/shader_recompiler/ir_opt/rescaling_pass.cpp @@ -40,6 +40,22 @@ void PatchFragCoord(IR::Block& block, IR::Inst& inst) { } } +[[nodiscard]] IR::U32 SubScale(IR::IREmitter& ir, const IR::U1& is_scaled, const IR::U32& value, + const IR::Attribute attrib) { + if (Settings::values.resolution_info.active) { + const IR::F32 opt1{ir.Imm32(Settings::values.resolution_info.up_factor)}; + const IR::F32 base{ir.FPMul(ir.ConvertUToF(32, 32, value), opt1)}; + const IR::F32 frag_coord{ir.GetAttribute(attrib)}; + const IR::F32 opt2{ir.Imm32(Settings::values.resolution_info.down_factor)}; + const IR::F32 floor{ir.FPMul(opt1, ir.FPFloor(ir.FPMul(frag_coord, opt2)))}; + const IR::U32 deviation{ + ir.ConvertFToU(32, ir.FPAdd(base, ir.FPAdd(frag_coord, ir.FPNeg(floor))))}; + return IR::U32{ir.Select(is_scaled, deviation, value)}; + } else { + return value; + } +} + [[nodiscard]] IR::U32 DownScale(IR::IREmitter& ir, const IR::U1& is_scaled, IR::U32 value) { IR::U32 scaled_value{value}; bool changed{}; @@ -113,6 +129,74 @@ void ScaleIntegerCoord(IR::IREmitter& ir, IR::Inst& inst, const IR::U1& is_scale } } +void SubScaleImageFetch(IR::Block& block, IR::Inst& inst) { + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + const auto info{inst.Flags()}; + const IR::U1 is_scaled{ir.IsTextureScaled(ir.Imm32(info.descriptor_index))}; + const IR::Value coord{inst.Arg(1)}; + switch (info.type) { + case TextureType::Color2D: { + const IR::U32 x{SubScale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 0)}, + IR::Attribute::PositionX)}; + const IR::U32 y{SubScale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 1)}, + IR::Attribute::PositionY)}; + inst.SetArg(1, ir.CompositeConstruct(x, y)); + break; + } + case TextureType::ColorArray2D: { + const IR::U32 x{SubScale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 0)}, + IR::Attribute::PositionX)}; + const IR::U32 y{SubScale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 1)}, + IR::Attribute::PositionY)}; + const IR::U32 z{ir.CompositeExtract(coord, 2)}; + inst.SetArg(1, ir.CompositeConstruct(x, y, z)); + break; + } + case TextureType::Color1D: + case TextureType::ColorArray1D: + case TextureType::Color3D: + case TextureType::ColorCube: + case TextureType::ColorArrayCube: + case TextureType::Buffer: + // Nothing to patch here + break; + } +} + +void SubScaleImageRead(IR::Block& block, IR::Inst& inst) { + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + const auto info{inst.Flags()}; + const IR::U1 is_scaled{ir.IsImageScaled(ir.Imm32(info.descriptor_index))}; + const IR::Value coord{inst.Arg(1)}; + switch (info.type) { + case TextureType::Color2D: { + const IR::U32 x{SubScale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 0)}, + IR::Attribute::PositionX)}; + const IR::U32 y{SubScale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 1)}, + IR::Attribute::PositionY)}; + inst.SetArg(1, ir.CompositeConstruct(x, y)); + break; + } + case TextureType::ColorArray2D: { + const IR::U32 x{SubScale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 0)}, + IR::Attribute::PositionX)}; + const IR::U32 y{SubScale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 1)}, + IR::Attribute::PositionY)}; + const IR::U32 z{ir.CompositeExtract(coord, 2)}; + inst.SetArg(1, ir.CompositeConstruct(x, y, z)); + break; + } + case TextureType::Color1D: + case TextureType::ColorArray1D: + case TextureType::Color3D: + case TextureType::ColorCube: + case TextureType::ColorArrayCube: + case TextureType::Buffer: + // Nothing to patch here + break; + } +} + void PatchImageFetch(IR::Block& block, IR::Inst& inst) { IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; const auto info{inst.Flags()}; @@ -145,13 +229,23 @@ void Visit(const IR::Program& program, IR::Block& block, IR::Inst& inst) { break; } case IR::Opcode::ImageQueryDimensions: - PatchImageQueryDimensions(block, inst); + if (program.stage == Stage::Compute) { + PatchImageQueryDimensions(block, inst); + } break; case IR::Opcode::ImageFetch: - PatchImageFetch(block, inst); + if (is_fragment_shader) { + SubScaleImageFetch(block, inst); + } else if (program.stage == Stage::Compute) { + PatchImageFetch(block, inst); + } break; case IR::Opcode::ImageRead: - PatchImageRead(block, inst); + if (is_fragment_shader) { + SubScaleImageRead(block, inst); + } else if (program.stage == Stage::Compute) { + PatchImageRead(block, inst); + } break; default: break; From d7c97921696486a95aaaf5c805b9fcc12230de77 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 7 Aug 2021 04:32:17 +0200 Subject: [PATCH 058/156] TextureCache: Fix Buffer Views Scaling. --- src/video_core/texture_cache/image_view_base.cpp | 11 ++++++----- src/video_core/texture_cache/texture_cache.h | 3 +++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/video_core/texture_cache/image_view_base.cpp b/src/video_core/texture_cache/image_view_base.cpp index e66dc9320..c7b4fc231 100644 --- a/src/video_core/texture_cache/image_view_base.cpp +++ b/src/video_core/texture_cache/image_view_base.cpp @@ -37,11 +37,12 @@ ImageViewBase::ImageViewBase(const ImageViewInfo& info, const ImageInfo& image_i } ImageViewBase::ImageViewBase(const ImageInfo& info, const ImageViewInfo& view_info) - : format{info.format}, type{ImageViewType::Buffer}, size{ - .width = info.size.width, - .height = 1, - .depth = 1, - } { + : image_id{NULL_IMAGE_ID}, format{info.format}, type{ImageViewType::Buffer}, + size{ + .width = info.size.width, + .height = 1, + .depth = 1, + } { ASSERT_MSG(view_info.type == ImageViewType::Buffer, "Expected texture buffer"); } diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 2de439889..764984546 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -631,6 +631,9 @@ bool TextureCache

::IsRescaling() const noexcept { template bool TextureCache

::IsRescaling(const ImageViewBase& image_view) const noexcept { + if (image_view.type == ImageViewType::Buffer) { + return false; + } const ImageBase& image = slot_images[image_view.image_id]; return True(image.flags & ImageFlagBits::Rescaled); } From 65781f88f80a322b08241dc7dbceceed83434e30 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Wed, 4 Aug 2021 00:30:16 -0400 Subject: [PATCH 059/156] emit_spirv: Fix RescalingLayout alignment --- src/shader_recompiler/backend/spirv/emit_spirv.h | 1 + src/video_core/renderer_vulkan/pipeline_helper.h | 6 ++++-- src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp | 5 +++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.h b/src/shader_recompiler/backend/spirv/emit_spirv.h index dd6dff0c8..cf59f2572 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv.h @@ -26,6 +26,7 @@ struct RescalingLayout { alignas(16) std::array rescaling_textures; alignas(16) std::array rescaling_images; }; +constexpr u32 RESCALING_PUSH_CONSTANT_WORDS_OFFSET = offsetof(RescalingLayout, rescaling_textures); [[nodiscard]] std::vector EmitSPIRV(const Profile& profile, const RuntimeInfo& runtime_info, IR::Program& program, Bindings& bindings); diff --git a/src/video_core/renderer_vulkan/pipeline_helper.h b/src/video_core/renderer_vulkan/pipeline_helper.h index 85ae726d1..3612e8a18 100644 --- a/src/video_core/renderer_vulkan/pipeline_helper.h +++ b/src/video_core/renderer_vulkan/pipeline_helper.h @@ -22,6 +22,7 @@ namespace Vulkan { using Shader::Backend::SPIRV::NUM_TEXTURE_AND_IMAGE_SCALING_WORDS; +using Shader::Backend::SPIRV::RESCALING_PUSH_CONSTANT_WORDS_OFFSET; class DescriptorLayoutBuilder { public: @@ -71,12 +72,13 @@ public: } vk::PipelineLayout CreatePipelineLayout(VkDescriptorSetLayout descriptor_set_layout) const { + using Shader::Backend::SPIRV::RescalingLayout; + const u32 push_offset = is_compute ? RESCALING_PUSH_CONSTANT_WORDS_OFFSET : 0; const VkPushConstantRange range{ .stageFlags = static_cast( is_compute ? VK_SHADER_STAGE_COMPUTE_BIT : VK_SHADER_STAGE_ALL_GRAPHICS), .offset = 0, - .size = (is_compute ? 0 : sizeof(f32)) + - sizeof(std::array), + .size = sizeof(RescalingLayout) - push_offset, }; return device->GetLogical().CreatePipelineLayout({ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 5ad1180bb..f08e9e840 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -483,8 +483,9 @@ void GraphicsPipeline::ConfigureDraw(const RescalingPushConstant& rescaling) { cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_ALL_GRAPHICS, 0, sizeof(scale_down_factor), &scale_down_factor); } - cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_ALL_GRAPHICS, sizeof(f32), - sizeof(rescaling_data), rescaling_data.data()); + cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_ALL_GRAPHICS, + RESCALING_PUSH_CONSTANT_WORDS_OFFSET, sizeof(rescaling_data), + rescaling_data.data()); if (!descriptor_set_layout) { return; } From 68e038404cc0069d9f59068a60b56e67b4321e7a Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 14 Aug 2021 00:01:47 -0400 Subject: [PATCH 060/156] shader, video_core: Fix GCC build errors --- .../backend/spirv/emit_spirv_image.cpp | 4 ---- src/video_core/renderer_opengl/gl_compute_pipeline.cpp | 3 --- src/video_core/texture_cache/texture_cache_base.h | 10 +++------- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp index 519ce8b9b..4d168a96d 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp @@ -243,10 +243,6 @@ Id IsScaled(EmitContext& ctx, const IR::Value& index, Id member_index, u32 base_ if (base_index != 0) { index_value = ctx.OpIAdd(ctx.U32[1], index_value, ctx.Const(base_index)); } - const Id word_index{ctx.OpShiftRightArithmetic(ctx.U32[1], index_value, ctx.Const(5u))}; - const Id pointer{ctx.OpAccessChain(push_constant_u32, ctx.rescaling_push_constants, - member_index, word_index)}; - const Id word{ctx.OpLoad(ctx.U32[1], pointer)}; const Id bit_index{ctx.OpBitwiseAnd(ctx.U32[1], index_value, ctx.Const(31u))}; bit = ctx.OpBitFieldUExtract(ctx.U32[1], index_value, bit_index, ctx.Const(1u)); } diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp index 9af61c340..5c1f21c65 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp @@ -143,9 +143,6 @@ void ComputePipeline::Configure() { } texture_cache.FillComputeImageViews(std::span(views.data(), views.size())); - const bool is_rescaling{texture_cache.IsRescaling()}; - const f32 config_down_factor{Settings::values.resolution_info.down_factor}; - const f32 down_factor{is_rescaling ? config_down_factor : 1.0f}; if (assembly_program.handle != 0) { program_manager.BindComputeAssemblyProgram(assembly_program.handle); } else { diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 8b417b611..517a4c224 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -41,13 +41,9 @@ using VideoCore::Surface::PixelFormatFromRenderTargetFormat; using namespace Common::Literals; struct ImageViewInOut { - u32 index; - bool blacklist; - union { - struct Empty { - } empty{}; - ImageViewId id; - }; + u32 index{}; + bool blacklist{}; + ImageViewId id{}; }; template From c5bbbf3902d97fc89775d6db5e5bebd3929ca24b Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Fri, 27 Aug 2021 23:24:05 +0200 Subject: [PATCH 061/156] Texture Cache: fix scaling on upload and stop scaling on base resolution. --- .../renderer_vulkan/vk_texture_cache.cpp | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 3400066a6..9afe49387 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1053,6 +1053,10 @@ Image::~Image() = default; void Image::UploadMemory(const StagingBufferRef& map, std::span copies) { // TODO: Move this to another API + const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); + if (is_rescaled) { + ScaleDown(true); + } scheduler->RequestOutsideRenderPassOperationContext(); std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); const VkBuffer src_buffer = map.buffer; @@ -1063,6 +1067,9 @@ void Image::UploadMemory(const StagingBufferRef& map, std::span copies) { @@ -1133,17 +1140,23 @@ bool Image::ScaleUp(bool save_as_backup) { } ASSERT(info.type != ImageType::Linear); scaling_count++; - ASSERT(scaling_count < 10); flags |= ImageFlagBits::Rescaled; - /*if (!runtime->is_rescaling_on) { - return; - }*/ + if (!runtime->is_rescaling_on) { + return true; + } + const auto& resolution = runtime->resolution; vk::Image rescaled_image = - MakeImage(runtime->device, info, resolution.up_scale, resolution.down_shift); - MemoryCommit new_commit( - runtime->memory_allocator.Commit(rescaled_image, MemoryUsage::DeviceLocal)); + has_backup ? std::move(backup_image) + : MakeImage(runtime->device, info, resolution.up_scale, resolution.down_shift); + MemoryCommit new_commit = has_backup ? std::move(backup_commit) + : MemoryCommit(runtime->memory_allocator.Commit( + rescaled_image, MemoryUsage::DeviceLocal)); + has_backup = false; + if (aspect_mask == 0) { + aspect_mask = ImageAspectMask(info.format); + } SCOPE_EXIT({ if (save_as_backup) { backup_image = std::move(image); @@ -1175,6 +1188,9 @@ bool Image::ScaleUp(bool save_as_backup) { } void Image::SwapBackup() { + if (!runtime->is_rescaling_on) { + return; + } ASSERT(has_backup); runtime->prescaled_images.Push(std::move(image)); runtime->prescaled_commits.Push(std::move(commit)); @@ -1190,16 +1206,18 @@ bool Image::ScaleDown(bool save_as_backup) { ASSERT(info.type != ImageType::Linear); flags &= ~ImageFlagBits::Rescaled; scaling_count++; - ASSERT(scaling_count < 10); - /*if (!runtime->is_rescaling_on) { - return false; - }*/ + if (!runtime->is_rescaling_on) { + return true; + } const auto& resolution = runtime->resolution; - vk::Image downscaled_image = MakeImage(runtime->device, info); - MemoryCommit new_commit( - runtime->memory_allocator.Commit(downscaled_image, MemoryUsage::DeviceLocal)); + vk::Image downscaled_image = + has_backup ? std::move(backup_image) : MakeImage(runtime->device, info); + MemoryCommit new_commit = has_backup ? std::move(backup_commit) + : MemoryCommit(runtime->memory_allocator.Commit( + downscaled_image, MemoryUsage::DeviceLocal)); + has_backup = false; if (aspect_mask == 0) { aspect_mask = ImageAspectMask(info.format); } From a6b88e85bfb14c45345f6443b54d15a61e3975d5 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 17 Aug 2021 00:12:52 +0200 Subject: [PATCH 062/156] Renderer: Implement Bicubic and ScaleForce filters. --- src/common/settings.cpp | 10 +- src/common/settings.h | 17 ++- src/video_core/host_shaders/CMakeLists.txt | 4 + .../host_shaders/opengl_present_bicubic.frag | 56 +++++++ .../opengl_present_scaleforce.frag | 135 +++++++++++++++++ .../host_shaders/vulkan_present_bicubic.frag | 56 +++++++ .../vulkan_present_scaleforce.frag | 137 ++++++++++++++++++ .../renderer_opengl/renderer_opengl.cpp | 27 +++- .../renderer_opengl/renderer_opengl.h | 4 +- .../renderer_vulkan/vk_blit_screen.cpp | 123 +++++++++++++++- .../renderer_vulkan/vk_blit_screen.h | 8 +- src/yuzu/configuration/config.cpp | 5 + src/yuzu/configuration/config.h | 1 + src/yuzu/configuration/configure_graphics.cpp | 25 ++++ src/yuzu/configuration/configure_graphics.ui | 54 +++++-- 15 files changed, 624 insertions(+), 38 deletions(-) create mode 100644 src/video_core/host_shaders/opengl_present_bicubic.frag create mode 100644 src/video_core/host_shaders/opengl_present_scaleforce.frag create mode 100644 src/video_core/host_shaders/vulkan_present_bicubic.frag create mode 100644 src/video_core/host_shaders/vulkan_present_scaleforce.frag diff --git a/src/common/settings.cpp b/src/common/settings.cpp index 4b7fa4b82..f0686a7c5 100644 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp @@ -48,6 +48,7 @@ void LogSettings() { log_setting("Core_UseMultiCore", values.use_multi_core.GetValue()); log_setting("CPU_Accuracy", values.cpu_accuracy.GetValue()); log_setting("Renderer_UseResolutionScaling", values.resolution_setup.GetValue()); + log_setting("Renderer_ScalingFilter", values.scaling_filter.GetValue()); log_setting("Renderer_UseSpeedLimit", values.use_speed_limit.GetValue()); log_setting("Renderer_SpeedLimit", values.speed_limit.GetValue()); log_setting("Renderer_UseDiskShaderCache", values.use_disk_shader_cache.GetValue()); @@ -113,19 +114,10 @@ void UpdateRescalingInfo() { info.up_scale = 1; info.down_shift = 1; break; - case ResolutionSetup::Res3_4X: - info.up_scale = 3; - info.down_shift = 2; - break; case ResolutionSetup::Res1X: info.up_scale = 1; info.down_shift = 0; break; - case ResolutionSetup::Res3_2X: { - info.up_scale = 3; - info.down_shift = 1; - break; - } case ResolutionSetup::Res2X: info.up_scale = 2; info.down_shift = 0; diff --git a/src/common/settings.h b/src/common/settings.h index ca88c086b..f629c7c56 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -54,12 +54,16 @@ enum class NvdecEmulation : u32 { enum class ResolutionSetup : u32 { Res1_2X = 0, - Res3_4X = 1, - Res1X = 2, - Res3_2X = 3, - Res2X = 4, - Res3X = 5, - Res4X = 6, + Res1X = 1, + Res2X = 2, + Res3X = 3, + Res4X = 4, +}; + +enum class ScalingFilter : u32 { + Bilinear = 0, + Bicubic = 1, + ScaleForce = 2, }; struct ResolutionScalingInfo { @@ -471,6 +475,7 @@ struct Values { ResolutionScalingInfo resolution_info{}; Setting resolution_setup{ResolutionSetup::Res1X, "resolution_setup"}; + Setting scaling_filter{ScalingFilter::Bilinear, "scaling_filter"}; // *nix platforms may have issues with the borderless windowed fullscreen mode. // Default to exclusive fullscreen on these platforms for now. RangedSetting fullscreen_mode{ diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 20d748c12..835b37944 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -6,11 +6,15 @@ set(SHADER_FILES convert_float_to_depth.frag full_screen_triangle.vert opengl_copy_bc4.comp + opengl_present_scaleforce.frag + opengl_present_bicubic.frag opengl_present.frag opengl_present.vert pitch_unswizzle.comp vulkan_blit_color_float.frag vulkan_blit_depth_stencil.frag + vulkan_present_bicubic.frag + vulkan_present_scaleforce.frag vulkan_present.frag vulkan_present.vert vulkan_quad_indexed.comp diff --git a/src/video_core/host_shaders/opengl_present_bicubic.frag b/src/video_core/host_shaders/opengl_present_bicubic.frag new file mode 100644 index 000000000..17772095a --- /dev/null +++ b/src/video_core/host_shaders/opengl_present_bicubic.frag @@ -0,0 +1,56 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 460 core + +layout (location = 0) in vec2 frag_tex_coord; + +layout (location = 0) out vec4 color; + +layout (binding = 1) uniform sampler2D color_texture; + +vec4 cubic(float v) { + vec4 n = vec4(1.0, 2.0, 3.0, 4.0) - v; + vec4 s = n * n * n; + float x = s.x; + float y = s.y - 4.0 * s.x; + float z = s.z - 4.0 * s.y + 6.0 * s.x; + float w = 6.0 - x - y - z; + return vec4(x, y, z, w) * (1.0 / 6.0); +} + +vec4 textureBicubic( sampler2D textureSampler, vec2 texCoords ) { + + vec2 texSize = textureSize(textureSampler, 0); + vec2 invTexSize = 1.0 / texSize; + + texCoords = texCoords * texSize - 0.5; + + vec2 fxy = fract(texCoords); + texCoords -= fxy; + + vec4 xcubic = cubic(fxy.x); + vec4 ycubic = cubic(fxy.y); + + vec4 c = texCoords.xxyy + vec2(-0.5, +1.5).xyxy; + + vec4 s = vec4(xcubic.xz + xcubic.yw, ycubic.xz + ycubic.yw); + vec4 offset = c + vec4(xcubic.yw, ycubic.yw) / s; + + offset *= invTexSize.xxyy; + + vec4 sample0 = texture(textureSampler, offset.xz); + vec4 sample1 = texture(textureSampler, offset.yz); + vec4 sample2 = texture(textureSampler, offset.xw); + vec4 sample3 = texture(textureSampler, offset.yw); + + float sx = s.x / (s.x + s.y); + float sy = s.z / (s.z + s.w); + + return mix(mix(sample3, sample2, sx), mix(sample1, sample0, sx), sy); +} + +void main() { + color = vec4(textureBicubic(color_texture, frag_tex_coord).rgb, 1.0f); +} diff --git a/src/video_core/host_shaders/opengl_present_scaleforce.frag b/src/video_core/host_shaders/opengl_present_scaleforce.frag new file mode 100644 index 000000000..0153f62c0 --- /dev/null +++ b/src/video_core/host_shaders/opengl_present_scaleforce.frag @@ -0,0 +1,135 @@ +// from https://github.com/BreadFish64/ScaleFish/tree/master/scale_force + +// MIT License +// +// Copyright (c) 2020 BreadFish64 +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +precision mediump float; + +layout (location = 0) in vec2 tex_coord; + +layout (location = 0) out vec4 frag_color; + +layout (binding = 1) uniform sampler2D input_texture; + +vec2 tex_size; +vec2 inv_tex_size; + +vec4 cubic(float v) { + vec3 n = vec3(1.0, 2.0, 3.0) - v; + vec3 s = n * n * n; + float x = s.x; + float y = s.y - 4.0 * s.x; + float z = s.z - 4.0 * s.y + 6.0 * s.x; + float w = 6.0 - x - y - z; + return vec4(x, y, z, w) / 6.0; +} + +// Bicubic interpolation +vec4 textureBicubic(vec2 tex_coords) { + tex_coords = tex_coords * tex_size - 0.5; + + vec2 fxy = modf(tex_coords, tex_coords); + + vec4 xcubic = cubic(fxy.x); + vec4 ycubic = cubic(fxy.y); + + vec4 c = tex_coords.xxyy + vec2(-0.5, +1.5).xyxy; + + vec4 s = vec4(xcubic.xz + xcubic.yw, ycubic.xz + ycubic.yw); + vec4 offset = c + vec4(xcubic.yw, ycubic.yw) / s; + + offset *= inv_tex_size.xxyy; + + vec4 sample0 = textureLod(input_texture, offset.xz, 0.0); + vec4 sample1 = textureLod(input_texture, offset.yz, 0.0); + vec4 sample2 = textureLod(input_texture, offset.xw, 0.0); + vec4 sample3 = textureLod(input_texture, offset.yw, 0.0); + + float sx = s.x / (s.x + s.y); + float sy = s.z / (s.z + s.w); + + return mix(mix(sample3, sample2, sx), mix(sample1, sample0, sx), sy); +} + +mat4x3 center_matrix; +vec4 center_alpha; + +// Finds the distance between four colors and cc in YCbCr space +vec4 ColorDist(vec4 A, vec4 B, vec4 C, vec4 D) { + // https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.2020_conversion + const vec3 K = vec3(0.2627, 0.6780, 0.0593); + const float LUMINANCE_WEIGHT = .6; + const mat3 YCBCR_MATRIX = + mat3(K * LUMINANCE_WEIGHT, -.5 * K.r / (1.0 - K.b), -.5 * K.g / (1.0 - K.b), .5, .5, + -.5 * K.g / (1.0 - K.r), -.5 * K.b / (1.0 - K.r)); + + mat4x3 colors = mat4x3(A.rgb, B.rgb, C.rgb, D.rgb) - center_matrix; + mat4x3 YCbCr = YCBCR_MATRIX * colors; + vec4 color_dist = vec3(1.0) * YCbCr; + color_dist *= color_dist; + vec4 alpha = vec4(A.a, B.a, C.a, D.a); + + return sqrt((color_dist + abs(center_alpha - alpha)) * alpha * center_alpha); +} + +void main() { + vec4 bl = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(-1, -1)); + vec4 bc = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(0, -1)); + vec4 br = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(1, -1)); + vec4 cl = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(-1, 0)); + vec4 cc = textureLod(input_texture, tex_coord, 0.0); + vec4 cr = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(1, 0)); + vec4 tl = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(-1, 1)); + vec4 tc = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(0, 1)); + vec4 tr = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(1, 1)); + + + tex_size = vec2(textureSize(input_texture, 0)); + inv_tex_size = 1.0 / tex_size; + center_matrix = mat4x3(cc.rgb, cc.rgb, cc.rgb, cc.rgb); + center_alpha = cc.aaaa; + + vec4 offset_tl = ColorDist(tl, tc, tr, cr); + vec4 offset_br = ColorDist(br, bc, bl, cl); + + // Calculate how different cc is from the texels around it + float total_dist = dot(offset_tl + offset_br, vec4(1.0)); + + // Add together all the distances with direction taken into account + vec4 tmp = offset_tl - offset_br; + vec2 total_offset = tmp.wy + tmp.zz + vec2(-tmp.x, tmp.x); + + if (total_dist == 0.0) { + // Doing bicubic filtering just past the edges where the offset is 0 causes black floaters + // and it doesn't really matter which filter is used when the colors aren't changing. + frag_color = vec4(cc.rgb, 1.0f); + } else { + // When the image has thin points, they tend to split apart. + // This is because the texels all around are different + // and total_offset reaches into clear areas. + // This works pretty well to keep the offset in bounds for these cases. + float clamp_val = length(total_offset) / total_dist; + vec2 final_offset = clamp(total_offset, -clamp_val, clamp_val) * inv_tex_size; + + frag_color = vec4(textureBicubic(tex_coord - final_offset).rgb, 1.0f); + } +} diff --git a/src/video_core/host_shaders/vulkan_present_bicubic.frag b/src/video_core/host_shaders/vulkan_present_bicubic.frag new file mode 100644 index 000000000..17772095a --- /dev/null +++ b/src/video_core/host_shaders/vulkan_present_bicubic.frag @@ -0,0 +1,56 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 460 core + +layout (location = 0) in vec2 frag_tex_coord; + +layout (location = 0) out vec4 color; + +layout (binding = 1) uniform sampler2D color_texture; + +vec4 cubic(float v) { + vec4 n = vec4(1.0, 2.0, 3.0, 4.0) - v; + vec4 s = n * n * n; + float x = s.x; + float y = s.y - 4.0 * s.x; + float z = s.z - 4.0 * s.y + 6.0 * s.x; + float w = 6.0 - x - y - z; + return vec4(x, y, z, w) * (1.0 / 6.0); +} + +vec4 textureBicubic( sampler2D textureSampler, vec2 texCoords ) { + + vec2 texSize = textureSize(textureSampler, 0); + vec2 invTexSize = 1.0 / texSize; + + texCoords = texCoords * texSize - 0.5; + + vec2 fxy = fract(texCoords); + texCoords -= fxy; + + vec4 xcubic = cubic(fxy.x); + vec4 ycubic = cubic(fxy.y); + + vec4 c = texCoords.xxyy + vec2(-0.5, +1.5).xyxy; + + vec4 s = vec4(xcubic.xz + xcubic.yw, ycubic.xz + ycubic.yw); + vec4 offset = c + vec4(xcubic.yw, ycubic.yw) / s; + + offset *= invTexSize.xxyy; + + vec4 sample0 = texture(textureSampler, offset.xz); + vec4 sample1 = texture(textureSampler, offset.yz); + vec4 sample2 = texture(textureSampler, offset.xw); + vec4 sample3 = texture(textureSampler, offset.yw); + + float sx = s.x / (s.x + s.y); + float sy = s.z / (s.z + s.w); + + return mix(mix(sample3, sample2, sx), mix(sample1, sample0, sx), sy); +} + +void main() { + color = vec4(textureBicubic(color_texture, frag_tex_coord).rgb, 1.0f); +} diff --git a/src/video_core/host_shaders/vulkan_present_scaleforce.frag b/src/video_core/host_shaders/vulkan_present_scaleforce.frag new file mode 100644 index 000000000..801c8eae9 --- /dev/null +++ b/src/video_core/host_shaders/vulkan_present_scaleforce.frag @@ -0,0 +1,137 @@ +#version 320 es + +// from https://github.com/BreadFish64/ScaleFish/tree/master/scale_force + +// MIT License +// +// Copyright (c) 2020 BreadFish64 +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +precision mediump float; + +layout (location = 0) in vec2 tex_coord; + +layout (location = 0) out vec4 frag_color; + +layout (binding = 1) uniform sampler2D input_texture; + +vec2 tex_size; +vec2 inv_tex_size; + +vec4 cubic(float v) { + vec3 n = vec3(1.0, 2.0, 3.0) - v; + vec3 s = n * n * n; + float x = s.x; + float y = s.y - 4.0 * s.x; + float z = s.z - 4.0 * s.y + 6.0 * s.x; + float w = 6.0 - x - y - z; + return vec4(x, y, z, w) / 6.0; +} + +// Bicubic interpolation +vec4 textureBicubic(vec2 tex_coords) { + tex_coords = tex_coords * tex_size - 0.5; + + vec2 fxy = modf(tex_coords, tex_coords); + + vec4 xcubic = cubic(fxy.x); + vec4 ycubic = cubic(fxy.y); + + vec4 c = tex_coords.xxyy + vec2(-0.5, +1.5).xyxy; + + vec4 s = vec4(xcubic.xz + xcubic.yw, ycubic.xz + ycubic.yw); + vec4 offset = c + vec4(xcubic.yw, ycubic.yw) / s; + + offset *= inv_tex_size.xxyy; + + vec4 sample0 = textureLod(input_texture, offset.xz, 0.0); + vec4 sample1 = textureLod(input_texture, offset.yz, 0.0); + vec4 sample2 = textureLod(input_texture, offset.xw, 0.0); + vec4 sample3 = textureLod(input_texture, offset.yw, 0.0); + + float sx = s.x / (s.x + s.y); + float sy = s.z / (s.z + s.w); + + return mix(mix(sample3, sample2, sx), mix(sample1, sample0, sx), sy); +} + +mat4x3 center_matrix; +vec4 center_alpha; + +// Finds the distance between four colors and cc in YCbCr space +vec4 ColorDist(vec4 A, vec4 B, vec4 C, vec4 D) { + // https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.2020_conversion + const vec3 K = vec3(0.2627, 0.6780, 0.0593); + const float LUMINANCE_WEIGHT = .6; + const mat3 YCBCR_MATRIX = + mat3(K * LUMINANCE_WEIGHT, -.5 * K.r / (1.0 - K.b), -.5 * K.g / (1.0 - K.b), .5, .5, + -.5 * K.g / (1.0 - K.r), -.5 * K.b / (1.0 - K.r)); + + mat4x3 colors = mat4x3(A.rgb, B.rgb, C.rgb, D.rgb) - center_matrix; + mat4x3 YCbCr = YCBCR_MATRIX * colors; + vec4 color_dist = vec3(1.0) * YCbCr; + color_dist *= color_dist; + vec4 alpha = vec4(A.a, B.a, C.a, D.a); + + return sqrt((color_dist + abs(center_alpha - alpha)) * alpha * center_alpha); +} + +void main() { + vec4 bl = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(-1, -1)); + vec4 bc = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(0, -1)); + vec4 br = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(1, -1)); + vec4 cl = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(-1, 0)); + vec4 cc = textureLod(input_texture, tex_coord, 0.0); + vec4 cr = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(1, 0)); + vec4 tl = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(-1, 1)); + vec4 tc = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(0, 1)); + vec4 tr = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(1, 1)); + + + tex_size = vec2(textureSize(input_texture, 0)); + inv_tex_size = 1.0 / tex_size; + center_matrix = mat4x3(cc.rgb, cc.rgb, cc.rgb, cc.rgb); + center_alpha = cc.aaaa; + + vec4 offset_tl = ColorDist(tl, tc, tr, cr); + vec4 offset_br = ColorDist(br, bc, bl, cl); + + // Calculate how different cc is from the texels around it + float total_dist = dot(offset_tl + offset_br, vec4(1.0)); + + // Add together all the distances with direction taken into account + vec4 tmp = offset_tl - offset_br; + vec2 total_offset = tmp.wy + tmp.zz + vec2(-tmp.x, tmp.x); + + if (total_dist == 0.0) { + // Doing bicubic filtering just past the edges where the offset is 0 causes black floaters + // and it doesn't really matter which filter is used when the colors aren't changing. + frag_color = vec4(cc.rgb, 1.0f); + } else { + // When the image has thin points, they tend to split apart. + // This is because the texels all around are different + // and total_offset reaches into clear areas. + // This works pretty well to keep the offset in bounds for these cases. + float clamp_val = length(total_offset) / total_dist; + vec2 final_offset = clamp(total_offset, -clamp_val, clamp_val) * inv_tex_size; + + frag_color = vec4(textureBicubic(tex_coord - final_offset).rgb, 1.0f); + } +} diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 0f7b69c6d..71a5e3adf 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -21,7 +21,9 @@ #include "core/memory.h" #include "core/perf_stats.h" #include "core/telemetry_session.h" +#include "video_core/host_shaders/opengl_present_bicubic_frag.h" #include "video_core/host_shaders/opengl_present_frag.h" +#include "video_core/host_shaders/opengl_present_scaleforce_frag.h" #include "video_core/host_shaders/opengl_present_vert.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_manager.h" @@ -252,7 +254,11 @@ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color void RendererOpenGL::InitOpenGLObjects() { // Create shader programs present_vertex = CreateProgram(HostShaders::OPENGL_PRESENT_VERT, GL_VERTEX_SHADER); - present_fragment = CreateProgram(HostShaders::OPENGL_PRESENT_FRAG, GL_FRAGMENT_SHADER); + present_bilinear_fragment = CreateProgram(HostShaders::OPENGL_PRESENT_FRAG, GL_FRAGMENT_SHADER); + present_bicubic_fragment = + CreateProgram(HostShaders::OPENGL_PRESENT_BICUBIC_FRAG, GL_FRAGMENT_SHADER); + present_scaleforce_fragment = + CreateProgram(HostShaders::OPENGL_PRESENT_SCALEFORCE_FRAG, GL_FRAGMENT_SHADER); // Generate presentation sampler present_sampler.Create(); @@ -337,7 +343,24 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { // Set projection matrix const std::array ortho_matrix = MakeOrthographicMatrix(static_cast(layout.width), static_cast(layout.height)); - program_manager.BindPresentPrograms(present_vertex.handle, present_fragment.handle); + + GLuint fragment_handle; + const auto filter = Settings::values.scaling_filter.GetValue(); + switch (filter) { + case Settings::ScalingFilter::Bilinear: + fragment_handle = present_bilinear_fragment.handle; + break; + case Settings::ScalingFilter::Bicubic: + fragment_handle = present_bicubic_fragment.handle; + break; + case Settings::ScalingFilter::ScaleForce: + fragment_handle = present_scaleforce_fragment.handle; + break; + default: + fragment_handle = present_bilinear_fragment.handle; + break; + } + program_manager.BindPresentPrograms(present_vertex.handle, fragment_handle); glProgramUniformMatrix3x2fv(present_vertex.handle, ModelViewMatrixLocation, 1, GL_FALSE, ortho_matrix.data()); diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index d455f572f..bf3d3502c 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -111,7 +111,9 @@ private: OGLSampler present_sampler; OGLBuffer vertex_buffer; OGLProgram present_vertex; - OGLProgram present_fragment; + OGLProgram present_bilinear_fragment; + OGLProgram present_bicubic_fragment; + OGLProgram present_scaleforce_fragment; OGLFramebuffer screenshot_framebuffer; // GPU address of the vertex buffer diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 7051e6559..19d91ecfc 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -12,11 +12,14 @@ #include "common/assert.h" #include "common/common_types.h" #include "common/math_util.h" +#include "common/settings.h" #include "core/core.h" #include "core/frontend/emu_window.h" #include "core/memory.h" #include "video_core/gpu.h" +#include "video_core/host_shaders/vulkan_present_bicubic_frag_spv.h" #include "video_core/host_shaders/vulkan_present_frag_spv.h" +#include "video_core/host_shaders/vulkan_present_scaleforce_frag_spv.h" #include "video_core/host_shaders/vulkan_present_vert_spv.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" @@ -258,8 +261,22 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, .offset = {0, 0}, .extent = size, }; + const auto filter = Settings::values.scaling_filter.GetValue(); cmdbuf.BeginRenderPass(renderpass_bi, VK_SUBPASS_CONTENTS_INLINE); - cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *pipeline); + switch (filter) { + case Settings::ScalingFilter::Bilinear: + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *bilinear_pipeline); + break; + case Settings::ScalingFilter::Bicubic: + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *bicubic_pipeline); + break; + case Settings::ScalingFilter::ScaleForce: + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *scaleforce_pipeline); + break; + default: + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *bilinear_pipeline); + break; + } cmdbuf.SetViewport(0, viewport); cmdbuf.SetScissor(0, scissor); @@ -324,7 +341,9 @@ void VKBlitScreen::RefreshResources(const Tegra::FramebufferConfig& framebuffer) void VKBlitScreen::CreateShaders() { vertex_shader = BuildShader(device, VULKAN_PRESENT_VERT_SPV); - fragment_shader = BuildShader(device, VULKAN_PRESENT_FRAG_SPV); + bilinear_fragment_shader = BuildShader(device, VULKAN_PRESENT_FRAG_SPV); + bicubic_fragment_shader = BuildShader(device, VULKAN_PRESENT_BICUBIC_FRAG_SPV); + scaleforce_fragment_shader = BuildShader(device, VULKAN_PRESENT_SCALEFORCE_FRAG_SPV); } void VKBlitScreen::CreateSemaphores() { @@ -468,7 +487,7 @@ void VKBlitScreen::CreatePipelineLayout() { } void VKBlitScreen::CreateGraphicsPipeline() { - const std::array shader_stages{{ + const std::array bilinear_shader_stages{{ { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, .pNext = nullptr, @@ -483,7 +502,49 @@ void VKBlitScreen::CreateGraphicsPipeline() { .pNext = nullptr, .flags = 0, .stage = VK_SHADER_STAGE_FRAGMENT_BIT, - .module = *fragment_shader, + .module = *bilinear_fragment_shader, + .pName = "main", + .pSpecializationInfo = nullptr, + }, + }}; + + const std::array bicubic_shader_stages{{ + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = *vertex_shader, + .pName = "main", + .pSpecializationInfo = nullptr, + }, + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = *bicubic_fragment_shader, + .pName = "main", + .pSpecializationInfo = nullptr, + }, + }}; + + const std::array scaleforce_shader_stages{{ + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = *vertex_shader, + .pName = "main", + .pSpecializationInfo = nullptr, + }, + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = *scaleforce_fragment_shader, .pName = "main", .pSpecializationInfo = nullptr, }, @@ -583,12 +644,12 @@ void VKBlitScreen::CreateGraphicsPipeline() { .pDynamicStates = dynamic_states.data(), }; - const VkGraphicsPipelineCreateInfo pipeline_ci{ + const VkGraphicsPipelineCreateInfo bilinear_pipeline_ci{ .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, .pNext = nullptr, .flags = 0, - .stageCount = static_cast(shader_stages.size()), - .pStages = shader_stages.data(), + .stageCount = static_cast(bilinear_shader_stages.size()), + .pStages = bilinear_shader_stages.data(), .pVertexInputState = &vertex_input_ci, .pInputAssemblyState = &input_assembly_ci, .pTessellationState = nullptr, @@ -605,7 +666,53 @@ void VKBlitScreen::CreateGraphicsPipeline() { .basePipelineIndex = 0, }; - pipeline = device.GetLogical().CreateGraphicsPipeline(pipeline_ci); + const VkGraphicsPipelineCreateInfo bicubic_pipeline_ci{ + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stageCount = static_cast(bicubic_shader_stages.size()), + .pStages = bicubic_shader_stages.data(), + .pVertexInputState = &vertex_input_ci, + .pInputAssemblyState = &input_assembly_ci, + .pTessellationState = nullptr, + .pViewportState = &viewport_state_ci, + .pRasterizationState = &rasterization_ci, + .pMultisampleState = &multisampling_ci, + .pDepthStencilState = nullptr, + .pColorBlendState = &color_blend_ci, + .pDynamicState = &dynamic_state_ci, + .layout = *pipeline_layout, + .renderPass = *renderpass, + .subpass = 0, + .basePipelineHandle = 0, + .basePipelineIndex = 0, + }; + + const VkGraphicsPipelineCreateInfo scaleforce_pipeline_ci{ + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stageCount = static_cast(scaleforce_shader_stages.size()), + .pStages = scaleforce_shader_stages.data(), + .pVertexInputState = &vertex_input_ci, + .pInputAssemblyState = &input_assembly_ci, + .pTessellationState = nullptr, + .pViewportState = &viewport_state_ci, + .pRasterizationState = &rasterization_ci, + .pMultisampleState = &multisampling_ci, + .pDepthStencilState = nullptr, + .pColorBlendState = &color_blend_ci, + .pDynamicState = &dynamic_state_ci, + .layout = *pipeline_layout, + .renderPass = *renderpass, + .subpass = 0, + .basePipelineHandle = 0, + .basePipelineIndex = 0, + }; + + bilinear_pipeline = device.GetLogical().CreateGraphicsPipeline(bilinear_pipeline_ci); + bicubic_pipeline = device.GetLogical().CreateGraphicsPipeline(bicubic_pipeline_ci); + scaleforce_pipeline = device.GetLogical().CreateGraphicsPipeline(scaleforce_pipeline_ci); } void VKBlitScreen::CreateSampler() { diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index 430bcfbca..d3a16f0ba 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -107,11 +107,15 @@ private: const VKScreenInfo& screen_info; vk::ShaderModule vertex_shader; - vk::ShaderModule fragment_shader; + vk::ShaderModule bilinear_fragment_shader; + vk::ShaderModule bicubic_fragment_shader; + vk::ShaderModule scaleforce_fragment_shader; vk::DescriptorPool descriptor_pool; vk::DescriptorSetLayout descriptor_set_layout; vk::PipelineLayout pipeline_layout; - vk::Pipeline pipeline; + vk::Pipeline bilinear_pipeline; + vk::Pipeline bicubic_pipeline; + vk::Pipeline scaleforce_pipeline; vk::RenderPass renderpass; std::vector framebuffers; vk::DescriptorSets descriptor_sets; diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index 7ed833203..3803bf501 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -825,6 +825,7 @@ void Config::ReadRendererValues() { ReadGlobalSetting(Settings::values.fullscreen_mode); ReadGlobalSetting(Settings::values.aspect_ratio); ReadGlobalSetting(Settings::values.resolution_setup); + ReadGlobalSetting(Settings::values.scaling_filter); ReadGlobalSetting(Settings::values.max_anisotropy); ReadGlobalSetting(Settings::values.use_speed_limit); ReadGlobalSetting(Settings::values.speed_limit); @@ -1371,6 +1372,10 @@ void Config::SaveRendererValues() { static_cast(Settings::values.resolution_setup.GetValue(global)), static_cast(Settings::values.resolution_setup.GetDefault()), Settings::values.resolution_setup.UsingGlobal()); + WriteSetting(QString::fromStdString(Settings::values.scaling_filter.GetLabel()), + static_cast(Settings::values.scaling_filter.GetValue(global)), + static_cast(Settings::values.scaling_filter.GetDefault()), + Settings::values.scaling_filter.UsingGlobal()); WriteGlobalSetting(Settings::values.max_anisotropy); WriteGlobalSetting(Settings::values.use_speed_limit); WriteGlobalSetting(Settings::values.speed_limit); diff --git a/src/yuzu/configuration/config.h b/src/yuzu/configuration/config.h index fbb91d312..97dc1bb47 100644 --- a/src/yuzu/configuration/config.h +++ b/src/yuzu/configuration/config.h @@ -190,5 +190,6 @@ Q_DECLARE_METATYPE(Settings::GPUAccuracy); Q_DECLARE_METATYPE(Settings::FullscreenMode); Q_DECLARE_METATYPE(Settings::NvdecEmulation); Q_DECLARE_METATYPE(Settings::ResolutionSetup); +Q_DECLARE_METATYPE(Settings::ScalingFilter); Q_DECLARE_METATYPE(Settings::RendererBackend); Q_DECLARE_METATYPE(Settings::ShaderBackend); diff --git a/src/yuzu/configuration/configure_graphics.cpp b/src/yuzu/configuration/configure_graphics.cpp index 4f08ae3e0..e01efaeda 100644 --- a/src/yuzu/configuration/configure_graphics.cpp +++ b/src/yuzu/configuration/configure_graphics.cpp @@ -89,6 +89,7 @@ void ConfigureGraphics::SetConfiguration() { ui->use_asynchronous_gpu_emulation->setEnabled(runtime_lock); ui->use_disk_shader_cache->setEnabled(runtime_lock); ui->nvdec_emulation_widget->setEnabled(runtime_lock); + ui->resolution_combobox->setEnabled(runtime_lock); ui->accelerate_astc->setEnabled(runtime_lock); ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache.GetValue()); ui->use_asynchronous_gpu_emulation->setChecked( @@ -104,6 +105,8 @@ void ConfigureGraphics::SetConfiguration() { ui->aspect_ratio_combobox->setCurrentIndex(Settings::values.aspect_ratio.GetValue()); ui->resolution_combobox->setCurrentIndex( static_cast(Settings::values.resolution_setup.GetValue())); + ui->scaling_filter_combobox->setCurrentIndex( + static_cast(Settings::values.scaling_filter.GetValue())); } else { ConfigurationShared::SetPerGameSetting(ui->api, &Settings::values.renderer_backend); ConfigurationShared::SetHighlight(ui->api_widget, @@ -129,6 +132,11 @@ void ConfigureGraphics::SetConfiguration() { ConfigurationShared::SetHighlight(ui->resolution_label, !Settings::values.resolution_setup.UsingGlobal()); + ConfigurationShared::SetPerGameSetting(ui->scaling_filter_combobox, + &Settings::values.scaling_filter); + ConfigurationShared::SetHighlight(ui->scaling_filter_label, + !Settings::values.scaling_filter.UsingGlobal()); + ui->bg_combobox->setCurrentIndex(Settings::values.bg_red.UsingGlobal() ? 0 : 1); ui->bg_button->setEnabled(!Settings::values.bg_red.UsingGlobal()); ConfigurationShared::SetHighlight(ui->bg_layout, !Settings::values.bg_red.UsingGlobal()); @@ -144,6 +152,10 @@ void ConfigureGraphics::ApplyConfiguration() { ui->resolution_combobox->currentIndex() - ((Settings::IsConfiguringGlobal()) ? 0 : ConfigurationShared::USE_GLOBAL_OFFSET)); + const auto scaling_filter = static_cast( + ui->scaling_filter_combobox->currentIndex() - + ((Settings::IsConfiguringGlobal()) ? 0 : ConfigurationShared::USE_GLOBAL_OFFSET)); + ConfigurationShared::ApplyPerGameSetting(&Settings::values.fullscreen_mode, ui->fullscreen_mode_combobox); ConfigurationShared::ApplyPerGameSetting(&Settings::values.aspect_ratio, @@ -178,6 +190,9 @@ void ConfigureGraphics::ApplyConfiguration() { if (Settings::values.resolution_setup.UsingGlobal()) { Settings::values.resolution_setup.SetValue(resolution_setup); } + if (Settings::values.scaling_filter.UsingGlobal()) { + Settings::values.scaling_filter.SetValue(scaling_filter); + } } else { if (ui->resolution_combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) { Settings::values.resolution_setup.SetGlobal(true); @@ -185,6 +200,12 @@ void ConfigureGraphics::ApplyConfiguration() { Settings::values.resolution_setup.SetGlobal(false); Settings::values.resolution_setup.SetValue(resolution_setup); } + if (ui->scaling_filter_combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) { + Settings::values.scaling_filter.SetGlobal(true); + } else { + Settings::values.scaling_filter.SetGlobal(false); + Settings::values.scaling_filter.SetValue(scaling_filter); + } if (ui->api->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) { Settings::values.renderer_backend.SetGlobal(true); Settings::values.shader_backend.SetGlobal(true); @@ -333,6 +354,7 @@ void ConfigureGraphics::SetupPerGameUI() { ui->fullscreen_mode_combobox->setEnabled(Settings::values.fullscreen_mode.UsingGlobal()); ui->aspect_ratio_combobox->setEnabled(Settings::values.aspect_ratio.UsingGlobal()); ui->resolution_combobox->setEnabled(Settings::values.resolution_setup.UsingGlobal()); + ui->scaling_filter_combobox->setEnabled(Settings::values.scaling_filter.UsingGlobal()); ui->use_asynchronous_gpu_emulation->setEnabled( Settings::values.use_asynchronous_gpu_emulation.UsingGlobal()); ui->nvdec_emulation->setEnabled(Settings::values.nvdec_emulation.UsingGlobal()); @@ -364,6 +386,9 @@ void ConfigureGraphics::SetupPerGameUI() { ConfigurationShared::SetColoredComboBox( ui->resolution_combobox, ui->resolution_label, static_cast(Settings::values.resolution_setup.GetValue(true))); + ConfigurationShared::SetColoredComboBox( + ui->scaling_filter_combobox, ui->scaling_filter_label, + static_cast(Settings::values.scaling_filter.GetValue(true))); ConfigurationShared::InsertGlobalItem( ui->api, static_cast(Settings::values.renderer_backend.GetValue(true))); ConfigurationShared::InsertGlobalItem( diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui index 1b6ac3cbb..d5e0d4e89 100644 --- a/src/yuzu/configuration/configure_graphics.ui +++ b/src/yuzu/configuration/configure_graphics.ui @@ -338,21 +338,11 @@ 0.5X (360p/540p) - - - 0.75X (540p/810p) - - 1X (720p/1080p) - - - 1.5X (1080p/1620p) - - 2X (1440p/2160[4K]p) @@ -373,6 +363,50 @@ + + + + + 0 + + + 0 + + + 0 + + + 0 + + + + + Window Adapting Filter: + + + + + + + + Bilinear + + + + + Bicubic + + + + + ScaleForce + + + + + + + From 29710f3250413bac75eddb613b7f7d2c079021c2 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Thu, 9 Sep 2021 19:31:20 -0400 Subject: [PATCH 063/156] gl_texture_cache: fix scaling on upload --- src/video_core/renderer_opengl/gl_texture_cache.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 6aea375f1..edb8503cb 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -693,6 +693,10 @@ Image::~Image() = default; void Image::UploadMemory(const ImageBufferMap& map, std::span copies) { + const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); + if (is_rescaled) { + ScaleDown(); + } glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes); @@ -712,6 +716,9 @@ void Image::UploadMemory(const ImageBufferMap& map, } CopyBufferToImage(copy, map.offset); } + if (is_rescaled) { + ScaleUp(); + } } void Image::DownloadMemory(ImageBufferMap& map, From ae8d19d17eb5448207ece99ae07507c542c6ddae Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Thu, 9 Sep 2021 23:01:39 -0400 Subject: [PATCH 064/156] Renderers: Unify post processing filter shaders --- src/video_core/host_shaders/CMakeLists.txt | 6 +- .../opengl_present_scaleforce.frag | 135 ------------------ ...sent_bicubic.frag => present_bicubic.frag} | 13 +- ...caleforce.frag => present_scaleforce.frag} | 20 ++- .../host_shaders/vulkan_present_bicubic.frag | 56 -------- .../renderer_opengl/renderer_opengl.cpp | 9 +- .../renderer_vulkan/vk_blit_screen.cpp | 8 +- 7 files changed, 36 insertions(+), 211 deletions(-) delete mode 100644 src/video_core/host_shaders/opengl_present_scaleforce.frag rename src/video_core/host_shaders/{opengl_present_bicubic.frag => present_bicubic.frag} (87%) rename src/video_core/host_shaders/{vulkan_present_scaleforce.frag => present_scaleforce.frag} (94%) delete mode 100644 src/video_core/host_shaders/vulkan_present_bicubic.frag diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 835b37944..664d6ce5d 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -6,15 +6,13 @@ set(SHADER_FILES convert_float_to_depth.frag full_screen_triangle.vert opengl_copy_bc4.comp - opengl_present_scaleforce.frag - opengl_present_bicubic.frag opengl_present.frag opengl_present.vert pitch_unswizzle.comp + present_scaleforce.frag + present_bicubic.frag vulkan_blit_color_float.frag vulkan_blit_depth_stencil.frag - vulkan_present_bicubic.frag - vulkan_present_scaleforce.frag vulkan_present.frag vulkan_present.vert vulkan_quad_indexed.comp diff --git a/src/video_core/host_shaders/opengl_present_scaleforce.frag b/src/video_core/host_shaders/opengl_present_scaleforce.frag deleted file mode 100644 index 0153f62c0..000000000 --- a/src/video_core/host_shaders/opengl_present_scaleforce.frag +++ /dev/null @@ -1,135 +0,0 @@ -// from https://github.com/BreadFish64/ScaleFish/tree/master/scale_force - -// MIT License -// -// Copyright (c) 2020 BreadFish64 -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -precision mediump float; - -layout (location = 0) in vec2 tex_coord; - -layout (location = 0) out vec4 frag_color; - -layout (binding = 1) uniform sampler2D input_texture; - -vec2 tex_size; -vec2 inv_tex_size; - -vec4 cubic(float v) { - vec3 n = vec3(1.0, 2.0, 3.0) - v; - vec3 s = n * n * n; - float x = s.x; - float y = s.y - 4.0 * s.x; - float z = s.z - 4.0 * s.y + 6.0 * s.x; - float w = 6.0 - x - y - z; - return vec4(x, y, z, w) / 6.0; -} - -// Bicubic interpolation -vec4 textureBicubic(vec2 tex_coords) { - tex_coords = tex_coords * tex_size - 0.5; - - vec2 fxy = modf(tex_coords, tex_coords); - - vec4 xcubic = cubic(fxy.x); - vec4 ycubic = cubic(fxy.y); - - vec4 c = tex_coords.xxyy + vec2(-0.5, +1.5).xyxy; - - vec4 s = vec4(xcubic.xz + xcubic.yw, ycubic.xz + ycubic.yw); - vec4 offset = c + vec4(xcubic.yw, ycubic.yw) / s; - - offset *= inv_tex_size.xxyy; - - vec4 sample0 = textureLod(input_texture, offset.xz, 0.0); - vec4 sample1 = textureLod(input_texture, offset.yz, 0.0); - vec4 sample2 = textureLod(input_texture, offset.xw, 0.0); - vec4 sample3 = textureLod(input_texture, offset.yw, 0.0); - - float sx = s.x / (s.x + s.y); - float sy = s.z / (s.z + s.w); - - return mix(mix(sample3, sample2, sx), mix(sample1, sample0, sx), sy); -} - -mat4x3 center_matrix; -vec4 center_alpha; - -// Finds the distance between four colors and cc in YCbCr space -vec4 ColorDist(vec4 A, vec4 B, vec4 C, vec4 D) { - // https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.2020_conversion - const vec3 K = vec3(0.2627, 0.6780, 0.0593); - const float LUMINANCE_WEIGHT = .6; - const mat3 YCBCR_MATRIX = - mat3(K * LUMINANCE_WEIGHT, -.5 * K.r / (1.0 - K.b), -.5 * K.g / (1.0 - K.b), .5, .5, - -.5 * K.g / (1.0 - K.r), -.5 * K.b / (1.0 - K.r)); - - mat4x3 colors = mat4x3(A.rgb, B.rgb, C.rgb, D.rgb) - center_matrix; - mat4x3 YCbCr = YCBCR_MATRIX * colors; - vec4 color_dist = vec3(1.0) * YCbCr; - color_dist *= color_dist; - vec4 alpha = vec4(A.a, B.a, C.a, D.a); - - return sqrt((color_dist + abs(center_alpha - alpha)) * alpha * center_alpha); -} - -void main() { - vec4 bl = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(-1, -1)); - vec4 bc = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(0, -1)); - vec4 br = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(1, -1)); - vec4 cl = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(-1, 0)); - vec4 cc = textureLod(input_texture, tex_coord, 0.0); - vec4 cr = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(1, 0)); - vec4 tl = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(-1, 1)); - vec4 tc = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(0, 1)); - vec4 tr = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(1, 1)); - - - tex_size = vec2(textureSize(input_texture, 0)); - inv_tex_size = 1.0 / tex_size; - center_matrix = mat4x3(cc.rgb, cc.rgb, cc.rgb, cc.rgb); - center_alpha = cc.aaaa; - - vec4 offset_tl = ColorDist(tl, tc, tr, cr); - vec4 offset_br = ColorDist(br, bc, bl, cl); - - // Calculate how different cc is from the texels around it - float total_dist = dot(offset_tl + offset_br, vec4(1.0)); - - // Add together all the distances with direction taken into account - vec4 tmp = offset_tl - offset_br; - vec2 total_offset = tmp.wy + tmp.zz + vec2(-tmp.x, tmp.x); - - if (total_dist == 0.0) { - // Doing bicubic filtering just past the edges where the offset is 0 causes black floaters - // and it doesn't really matter which filter is used when the colors aren't changing. - frag_color = vec4(cc.rgb, 1.0f); - } else { - // When the image has thin points, they tend to split apart. - // This is because the texels all around are different - // and total_offset reaches into clear areas. - // This works pretty well to keep the offset in bounds for these cases. - float clamp_val = length(total_offset) / total_dist; - vec2 final_offset = clamp(total_offset, -clamp_val, clamp_val) * inv_tex_size; - - frag_color = vec4(textureBicubic(tex_coord - final_offset).rgb, 1.0f); - } -} diff --git a/src/video_core/host_shaders/opengl_present_bicubic.frag b/src/video_core/host_shaders/present_bicubic.frag similarity index 87% rename from src/video_core/host_shaders/opengl_present_bicubic.frag rename to src/video_core/host_shaders/present_bicubic.frag index 17772095a..f3e5410e7 100644 --- a/src/video_core/host_shaders/opengl_present_bicubic.frag +++ b/src/video_core/host_shaders/present_bicubic.frag @@ -4,11 +4,22 @@ #version 460 core +#ifdef VULKAN + +#define BINDING_COLOR_TEXTURE 1 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#define BINDING_COLOR_TEXTURE 0 + +#endif + + layout (location = 0) in vec2 frag_tex_coord; layout (location = 0) out vec4 color; -layout (binding = 1) uniform sampler2D color_texture; +layout (binding = BINDING_COLOR_TEXTURE) uniform sampler2D color_texture; vec4 cubic(float v) { vec4 n = vec4(1.0, 2.0, 3.0, 4.0) - v; diff --git a/src/video_core/host_shaders/vulkan_present_scaleforce.frag b/src/video_core/host_shaders/present_scaleforce.frag similarity index 94% rename from src/video_core/host_shaders/vulkan_present_scaleforce.frag rename to src/video_core/host_shaders/present_scaleforce.frag index 801c8eae9..1829a9be8 100644 --- a/src/video_core/host_shaders/vulkan_present_scaleforce.frag +++ b/src/video_core/host_shaders/present_scaleforce.frag @@ -1,7 +1,3 @@ -#version 320 es - -// from https://github.com/BreadFish64/ScaleFish/tree/master/scale_force - // MIT License // // Copyright (c) 2020 BreadFish64 @@ -24,13 +20,25 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -precision mediump float; +// Adapted from https://github.com/BreadFish64/ScaleFish/tree/master/scaleforce + +#version 460 + +#ifdef VULKAN + +#define BINDING_COLOR_TEXTURE 1 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#define BINDING_COLOR_TEXTURE 0 + +#endif layout (location = 0) in vec2 tex_coord; layout (location = 0) out vec4 frag_color; -layout (binding = 1) uniform sampler2D input_texture; +layout (binding = BINDING_COLOR_TEXTURE) uniform sampler2D input_texture; vec2 tex_size; vec2 inv_tex_size; diff --git a/src/video_core/host_shaders/vulkan_present_bicubic.frag b/src/video_core/host_shaders/vulkan_present_bicubic.frag deleted file mode 100644 index 17772095a..000000000 --- a/src/video_core/host_shaders/vulkan_present_bicubic.frag +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#version 460 core - -layout (location = 0) in vec2 frag_tex_coord; - -layout (location = 0) out vec4 color; - -layout (binding = 1) uniform sampler2D color_texture; - -vec4 cubic(float v) { - vec4 n = vec4(1.0, 2.0, 3.0, 4.0) - v; - vec4 s = n * n * n; - float x = s.x; - float y = s.y - 4.0 * s.x; - float z = s.z - 4.0 * s.y + 6.0 * s.x; - float w = 6.0 - x - y - z; - return vec4(x, y, z, w) * (1.0 / 6.0); -} - -vec4 textureBicubic( sampler2D textureSampler, vec2 texCoords ) { - - vec2 texSize = textureSize(textureSampler, 0); - vec2 invTexSize = 1.0 / texSize; - - texCoords = texCoords * texSize - 0.5; - - vec2 fxy = fract(texCoords); - texCoords -= fxy; - - vec4 xcubic = cubic(fxy.x); - vec4 ycubic = cubic(fxy.y); - - vec4 c = texCoords.xxyy + vec2(-0.5, +1.5).xyxy; - - vec4 s = vec4(xcubic.xz + xcubic.yw, ycubic.xz + ycubic.yw); - vec4 offset = c + vec4(xcubic.yw, ycubic.yw) / s; - - offset *= invTexSize.xxyy; - - vec4 sample0 = texture(textureSampler, offset.xz); - vec4 sample1 = texture(textureSampler, offset.yz); - vec4 sample2 = texture(textureSampler, offset.xw); - vec4 sample3 = texture(textureSampler, offset.yw); - - float sx = s.x / (s.x + s.y); - float sy = s.z / (s.z + s.w); - - return mix(mix(sample3, sample2, sx), mix(sample1, sample0, sx), sy); -} - -void main() { - color = vec4(textureBicubic(color_texture, frag_tex_coord).rgb, 1.0f); -} diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 71a5e3adf..955dbc744 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -21,10 +21,10 @@ #include "core/memory.h" #include "core/perf_stats.h" #include "core/telemetry_session.h" -#include "video_core/host_shaders/opengl_present_bicubic_frag.h" #include "video_core/host_shaders/opengl_present_frag.h" -#include "video_core/host_shaders/opengl_present_scaleforce_frag.h" #include "video_core/host_shaders/opengl_present_vert.h" +#include "video_core/host_shaders/present_bicubic_frag.h" +#include "video_core/host_shaders/present_scaleforce_frag.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_shader_util.h" @@ -255,10 +255,9 @@ void RendererOpenGL::InitOpenGLObjects() { // Create shader programs present_vertex = CreateProgram(HostShaders::OPENGL_PRESENT_VERT, GL_VERTEX_SHADER); present_bilinear_fragment = CreateProgram(HostShaders::OPENGL_PRESENT_FRAG, GL_FRAGMENT_SHADER); - present_bicubic_fragment = - CreateProgram(HostShaders::OPENGL_PRESENT_BICUBIC_FRAG, GL_FRAGMENT_SHADER); + present_bicubic_fragment = CreateProgram(HostShaders::PRESENT_BICUBIC_FRAG, GL_FRAGMENT_SHADER); present_scaleforce_fragment = - CreateProgram(HostShaders::OPENGL_PRESENT_SCALEFORCE_FRAG, GL_FRAGMENT_SHADER); + CreateProgram(HostShaders::PRESENT_SCALEFORCE_FRAG, GL_FRAGMENT_SHADER); // Generate presentation sampler present_sampler.Create(); diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 19d91ecfc..c91b24e3a 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -17,9 +17,9 @@ #include "core/frontend/emu_window.h" #include "core/memory.h" #include "video_core/gpu.h" -#include "video_core/host_shaders/vulkan_present_bicubic_frag_spv.h" +#include "video_core/host_shaders/present_bicubic_frag_spv.h" +#include "video_core/host_shaders/present_scaleforce_frag_spv.h" #include "video_core/host_shaders/vulkan_present_frag_spv.h" -#include "video_core/host_shaders/vulkan_present_scaleforce_frag_spv.h" #include "video_core/host_shaders/vulkan_present_vert_spv.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" @@ -342,8 +342,8 @@ void VKBlitScreen::RefreshResources(const Tegra::FramebufferConfig& framebuffer) void VKBlitScreen::CreateShaders() { vertex_shader = BuildShader(device, VULKAN_PRESENT_VERT_SPV); bilinear_fragment_shader = BuildShader(device, VULKAN_PRESENT_FRAG_SPV); - bicubic_fragment_shader = BuildShader(device, VULKAN_PRESENT_BICUBIC_FRAG_SPV); - scaleforce_fragment_shader = BuildShader(device, VULKAN_PRESENT_SCALEFORCE_FRAG_SPV); + bicubic_fragment_shader = BuildShader(device, PRESENT_BICUBIC_FRAG_SPV); + scaleforce_fragment_shader = BuildShader(device, PRESENT_SCALEFORCE_FRAG_SPV); } void VKBlitScreen::CreateSemaphores() { From fcf2b2c78a3c45ddcda6594b2fd3df733ceb951c Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Fri, 10 Sep 2021 01:10:59 -0400 Subject: [PATCH 065/156] gl_texture_cache: Simplify scaling We don't need to reconstruct new textures every time we ScaleUp/ScaleDown. We can scale up once, and revert to the original texture whenever scaling down. Fixes memory leaks due to glDeleteTextures being deferred for later handling on some drivers --- .../renderer_opengl/gl_texture_cache.cpp | 67 ++++++++++--------- .../renderer_opengl/gl_texture_cache.h | 3 +- 2 files changed, 39 insertions(+), 31 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index edb8503cb..22fffb19b 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -696,6 +696,7 @@ void Image::UploadMemory(const ImageBufferMap& map, const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { ScaleDown(); + scale_backup.Release(); } glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes); @@ -727,6 +728,7 @@ void Image::DownloadMemory(ImageBufferMap& map, const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { ScaleDown(); + scale_backup.Release(); } glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer); glPixelStorei(GL_PACK_ALIGNMENT, 1); @@ -881,17 +883,11 @@ void Image::CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t b } } -bool Image::Scale(bool scale_src, bool scale_dst) { - if (!runtime->is_rescaling_on) { - return false; - } - if (gl_format == 0 && gl_type == 0) { - // compressed textures - return false; - } - if (info.type == ImageType::Linear) { - UNIMPLEMENTED(); - return false; +bool Image::Scale() { + if (scale_backup.handle) { + // This was a texture which was scaled previously, no need to repeat scaling + std::swap(texture, scale_backup); + return true; } const GLenum attachment = [this] { switch (GetFormatType(info.format)) { @@ -924,41 +920,36 @@ bool Image::Scale(bool scale_src, bool scale_dst) { const auto& resolution = runtime->resolution; const u32 up = resolution.up_scale; const u32 down = resolution.down_shift; + const auto scale = [&](u32 value) { return std::max((value * up) >> down, 1U); }; - const auto scale_up = [&](u32 value) { return std::max((value * up) >> down, 1U); }; - const u32 scaled_width = scale_up(info.size.width); - const u32 scaled_height = is_2d ? scale_up(info.size.height) : info.size.height; + const u32 scaled_width = scale(info.size.width); + const u32 scaled_height = is_2d ? scale(info.size.height) : info.size.height; const u32 original_width = info.size.width; const u32 original_height = info.size.height; - const u32 src_width = scale_src ? scaled_width : original_width; - const u32 src_height = scale_src ? scaled_height : original_height; - const u32 dst_width = scale_dst ? scaled_width : original_width; - const u32 dst_height = scale_dst ? scaled_height : original_height; - auto dst_info = info; - dst_info.size.width = dst_width; - dst_info.size.height = dst_height; - auto dst_texture = MakeImage(dst_info, gl_internal_format); + dst_info.size.width = scaled_width; + dst_info.size.height = scaled_height; + scale_backup = MakeImage(dst_info, gl_internal_format); const GLuint read_fbo = runtime->rescale_read_fbo.handle; const GLuint draw_fbo = runtime->rescale_draw_fbo.handle; for (s32 layer = 0; layer < info.resources.layers; ++layer) { for (s32 level = 0; level < info.resources.levels; ++level) { - const u32 src_level_width = std::max(1u, src_width >> level); - const u32 src_level_height = std::max(1u, src_height >> level); - const u32 dst_level_width = std::max(1u, dst_width >> level); - const u32 dst_level_height = std::max(1u, dst_height >> level); + const u32 src_level_width = std::max(1u, original_width >> level); + const u32 src_level_height = std::max(1u, original_height >> level); + const u32 dst_level_width = std::max(1u, scaled_width >> level); + const u32 dst_level_height = std::max(1u, scaled_height >> level); glNamedFramebufferTextureLayer(read_fbo, attachment, texture.handle, level, layer); - glNamedFramebufferTextureLayer(draw_fbo, attachment, dst_texture.handle, level, layer); + glNamedFramebufferTextureLayer(draw_fbo, attachment, scale_backup.handle, level, layer); glBlitNamedFramebuffer(read_fbo, draw_fbo, 0, 0, src_level_width, src_level_height, 0, 0, dst_level_width, dst_level_height, mask, filter); glNamedFramebufferTextureLayer(read_fbo, attachment, 0, level, layer); glNamedFramebufferTextureLayer(draw_fbo, attachment, 0, level, layer); } } - texture = std::move(dst_texture); + std::swap(texture, scale_backup); return true; } @@ -966,16 +957,32 @@ bool Image::ScaleUp() { if (True(flags & ImageFlagBits::Rescaled)) { return false; } + if (!runtime->is_rescaling_on) { + return false; + } + if (gl_format == 0 && gl_type == 0) { + // compressed textures + return false; + } + if (info.type == ImageType::Linear) { + UNIMPLEMENTED(); + return false; + } flags |= ImageFlagBits::Rescaled; - return Scale(false, true); + return Scale(); } bool Image::ScaleDown() { if (False(flags & ImageFlagBits::Rescaled)) { return false; } + if (!scale_backup.handle) { + LOG_ERROR(Render_OpenGL, "Downscaling an upscaled texture that didn't backup original"); + return false; + } flags &= ~ImageFlagBits::Rescaled; - return Scale(true, false); + std::swap(texture, scale_backup); + return true; } ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info, diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 28c91b368..f4dcc6f9b 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -203,9 +203,10 @@ private: void CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset); - bool Scale(bool scale_src, bool scale_dst); + bool Scale(); OGLTexture texture; + OGLTexture scale_backup; OGLTextureView store_view; GLenum gl_internal_format = GL_NONE; GLenum gl_format = GL_NONE; From 80f8d4989eca127c7ca8c7bd63134127d6fd5edc Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Fri, 10 Sep 2021 01:28:02 -0400 Subject: [PATCH 066/156] bootmanager: Fix screenshot resolution factor usage Fixes screenshots at non integer scaling --- src/core/frontend/framebuffer_layout.cpp | 15 ++++++--------- src/core/frontend/framebuffer_layout.h | 2 +- src/yuzu/bootmanager.cpp | 8 +++----- src/yuzu/bootmanager.h | 2 +- src/yuzu/debugger/profiler.cpp | 2 +- src/yuzu/main.cpp | 3 +-- src/yuzu/uisettings.h | 1 - 7 files changed, 13 insertions(+), 20 deletions(-) diff --git a/src/core/frontend/framebuffer_layout.cpp b/src/core/frontend/framebuffer_layout.cpp index 0832463d6..4b58b672a 100644 --- a/src/core/frontend/framebuffer_layout.cpp +++ b/src/core/frontend/framebuffer_layout.cpp @@ -44,16 +44,13 @@ FramebufferLayout DefaultFrameLayout(u32 width, u32 height) { return res; } -FramebufferLayout FrameLayoutFromResolutionScale(u32 res_scale) { - u32 width, height; +FramebufferLayout FrameLayoutFromResolutionScale(f32 res_scale) { + const bool is_docked = Settings::values.use_docked_mode.GetValue(); + const u32 screen_width = is_docked ? ScreenDocked::Width : ScreenUndocked::Width; + const u32 screen_height = is_docked ? ScreenDocked::Height : ScreenUndocked::Height; - if (Settings::values.use_docked_mode.GetValue()) { - width = ScreenDocked::Width * res_scale; - height = ScreenDocked::Height * res_scale; - } else { - width = ScreenUndocked::Width * res_scale; - height = ScreenUndocked::Height * res_scale; - } + const u32 width = static_cast(static_cast(screen_width) * res_scale); + const u32 height = static_cast(static_cast(screen_height) * res_scale); return DefaultFrameLayout(width, height); } diff --git a/src/core/frontend/framebuffer_layout.h b/src/core/frontend/framebuffer_layout.h index e2e3bbbb3..2e36c0163 100644 --- a/src/core/frontend/framebuffer_layout.h +++ b/src/core/frontend/framebuffer_layout.h @@ -60,7 +60,7 @@ FramebufferLayout DefaultFrameLayout(u32 width, u32 height); * Convenience method to get frame layout by resolution scale * @param res_scale resolution scale factor */ -FramebufferLayout FrameLayoutFromResolutionScale(u32 res_scale); +FramebufferLayout FrameLayoutFromResolutionScale(f32 res_scale); /** * Convenience method to determine emulation aspect ratio diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp index 46ab0603d..976acd176 100644 --- a/src/yuzu/bootmanager.cpp +++ b/src/yuzu/bootmanager.cpp @@ -628,11 +628,9 @@ void GRenderWindow::ReleaseRenderTarget() { main_context.reset(); } -void GRenderWindow::CaptureScreenshot(u32 res_scale, const QString& screenshot_path) { - VideoCore::RendererBase& renderer = system.Renderer(); - if (res_scale == 0) { - res_scale = VideoCore::GetResolutionScaleFactor(renderer); - } +void GRenderWindow::CaptureScreenshot(const QString& screenshot_path) { + auto& renderer = system.Renderer(); + const f32 res_scale = VideoCore::GetResolutionScaleFactor(renderer); const Layout::FramebufferLayout layout{Layout::FrameLayoutFromResolutionScale(res_scale)}; screenshot_image = QImage(QSize(layout.width, layout.height), QImage::Format_RGB32); diff --git a/src/yuzu/bootmanager.h b/src/yuzu/bootmanager.h index e6a0666e9..40fd4a9d6 100644 --- a/src/yuzu/bootmanager.h +++ b/src/yuzu/bootmanager.h @@ -178,7 +178,7 @@ public: bool IsLoadingComplete() const; - void CaptureScreenshot(u32 res_scale, const QString& screenshot_path); + void CaptureScreenshot(const QString& screenshot_path); std::pair ScaleTouch(const QPointF& pos) const; diff --git a/src/yuzu/debugger/profiler.cpp b/src/yuzu/debugger/profiler.cpp index 33110685a..a8b254199 100644 --- a/src/yuzu/debugger/profiler.cpp +++ b/src/yuzu/debugger/profiler.cpp @@ -163,7 +163,7 @@ void MicroProfileWidget::mouseReleaseEvent(QMouseEvent* ev) { } void MicroProfileWidget::wheelEvent(QWheelEvent* ev) { - const auto wheel_position = ev->position().toPoint(); + const auto wheel_position = ev->pos(); MicroProfileMousePosition(wheel_position.x() / x_scale, wheel_position.y() / y_scale, ev->angleDelta().y() / 120); ev->accept(); diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index a246f6bb3..3cb146982 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -2892,8 +2892,7 @@ void GMainWindow::OnCaptureScreenshot() { } } #endif - render_window->CaptureScreenshot(UISettings::values.screenshot_resolution_factor.GetValue(), - filename); + render_window->CaptureScreenshot(filename); } // TODO: Written 2020-10-01: Remove per-game config migration code when it is irrelevant diff --git a/src/yuzu/uisettings.h b/src/yuzu/uisettings.h index cac19452f..936914ef3 100644 --- a/src/yuzu/uisettings.h +++ b/src/yuzu/uisettings.h @@ -68,7 +68,6 @@ struct Values { Settings::BasicSetting enable_discord_presence{true, "enable_discord_presence"}; Settings::BasicSetting enable_screenshot_save_as{true, "enable_screenshot_save_as"}; - Settings::BasicSetting screenshot_resolution_factor{0, "screenshot_resolution_factor"}; QString roms_path; QString symbols_path; From 6000fe69a4b2805a48ce045d9a383fda27d5e57b Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Tue, 14 Sep 2021 00:45:50 -0400 Subject: [PATCH 067/156] image_info: Mark MSAA textures as non-rescalable Blitting or resolving multisampled images requires the dimensions of the src and dst to be equal for valid usage, making them difficult for resolution scaling using the current implementation. --- src/video_core/texture_cache/image_info.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp index 7fa8fd4fe..bdf306bf9 100644 --- a/src/video_core/texture_cache/image_info.cpp +++ b/src/video_core/texture_cache/image_info.cpp @@ -101,7 +101,7 @@ ImageInfo::ImageInfo(const TICEntry& config) noexcept { // FIXME: Call this without passing *this layer_stride = CalculateLayerStride(*this); maybe_unaligned_layer_stride = CalculateLayerSize(*this); - rescaleable &= (block.depth == 0) && resources.levels == 1; + rescaleable &= (block.depth == 0) && resources.levels == 1 && num_samples == 1; } } @@ -134,7 +134,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) type = ImageType::e3D; size.depth = rt.depth; } else { - rescaleable = block.depth == 0 && size.height > 256; + rescaleable = block.depth == 0 && size.height > 256 && num_samples == 1; type = ImageType::e2D; resources.layers = rt.depth; } From b3a9c8f108d90234c7e5e88b41f8e4bc9c163d96 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 18 Sep 2021 02:26:33 +0200 Subject: [PATCH 068/156] Shader: Don't rescale FragCoord if used by Shuffle --- .../frontend/maxwell/translate_program.cpp | 4 +- .../ir_opt/rescaling_pass.cpp | 53 ++++++++++++++++++- 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.cpp b/src/shader_recompiler/frontend/maxwell/translate_program.cpp index 795f5cf08..743fb2420 100644 --- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp +++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp @@ -178,10 +178,12 @@ IR::Program TranslateProgram(ObjectPool& inst_pool, ObjectPoolArg(0).Attribute()}; + switch (attr) { + case IR::Attribute::PositionX: + case IR::Attribute::PositionY: + if (is_fragment_shader) { + op->SetFlags(0xDEADBEEF); + } + break; + default: + break; + } + }; + const IR::Value param_1{inst.Arg(0)}; + if (param_1.IsImmediate()) { + break; + } + IR::Inst* op_a{param_1.InstRecursive()}; + if (op_a->GetOpcode() == IR::Opcode::GetAttribute) { + try_mark(op_a); + break; + } + if (op_a->GetOpcode() != IR::Opcode::BitCastF32U32) { + break; + } + const IR::Value param_2{op_a->Arg(0)}; + if (param_2.IsImmediate()) { + break; + } + IR::Inst* op_b{param_2.InstRecursive()}; + if (op_b->GetOpcode() == IR::Opcode::GetAttribute) { + try_mark(op_b); + } + break; + } + default: + break; + } +} void PatchFragCoord(IR::Block& block, IR::Inst& inst) { IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; const IR::F32 down_factor{ir.ResolutionDownFactor()}; @@ -219,7 +265,7 @@ void Visit(const IR::Program& program, IR::Block& block, IR::Inst& inst) { switch (attr) { case IR::Attribute::PositionX: case IR::Attribute::PositionY: - if (is_fragment_shader) { + if (is_fragment_shader && inst.Flags() != 0xDEADBEEF) { PatchFragCoord(block, inst); } break; @@ -254,6 +300,11 @@ void Visit(const IR::Program& program, IR::Block& block, IR::Inst& inst) { } // Anonymous namespace void RescalingPass(IR::Program& program) { + for (IR::Block* const block : program.post_order_blocks) { + for (IR::Inst& inst : block->Instructions()) { + VisitMark(program, inst); + } + } for (IR::Block* const block : program.post_order_blocks) { for (IR::Inst& inst : block->Instructions()) { Visit(program, *block, inst); From edb5844240c339846d505735d2c2e1ad731f8be7 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Fri, 17 Sep 2021 21:31:29 -0400 Subject: [PATCH 069/156] rescaling_pass: Fix and simplify shuffle/fragcoord pass --- .../ir_opt/rescaling_pass.cpp | 46 ++++++++----------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/src/shader_recompiler/ir_opt/rescaling_pass.cpp b/src/shader_recompiler/ir_opt/rescaling_pass.cpp index 4d23b60c8..8bbaa55e4 100644 --- a/src/shader_recompiler/ir_opt/rescaling_pass.cpp +++ b/src/shader_recompiler/ir_opt/rescaling_pass.cpp @@ -14,45 +14,39 @@ namespace Shader::Optimization { namespace { -void VisitMark(const IR::Program& program, IR::Inst& inst) { +void VisitMark(const IR::Program& program, const IR::Inst& inst) { const bool is_fragment_shader{program.stage == Stage::Fragment}; + if (!is_fragment_shader) { + return; + } switch (inst.GetOpcode()) { case IR::Opcode::ShuffleIndex: case IR::Opcode::ShuffleUp: case IR::Opcode::ShuffleDown: case IR::Opcode::ShuffleButterfly: { - const auto try_mark = [is_fragment_shader](IR::Inst* op) { - const IR::Attribute attr{op->Arg(0).Attribute()}; + const IR::Value shfl_arg{inst.Arg(0)}; + if (shfl_arg.IsImmediate()) { + break; + } + const IR::Inst* const arg_inst{shfl_arg.InstRecursive()}; + if (arg_inst->GetOpcode() != IR::Opcode::BitCastU32F32) { + break; + } + const IR::Value bitcast_arg{arg_inst->Arg(0)}; + if (bitcast_arg.IsImmediate()) { + break; + } + IR::Inst* const bitcast_inst{bitcast_arg.InstRecursive()}; + if (bitcast_inst->GetOpcode() == IR::Opcode::GetAttribute) { + const IR::Attribute attr{bitcast_inst->Arg(0).Attribute()}; switch (attr) { case IR::Attribute::PositionX: case IR::Attribute::PositionY: - if (is_fragment_shader) { - op->SetFlags(0xDEADBEEF); - } + bitcast_inst->SetFlags(0xDEADBEEF); break; default: break; } - }; - const IR::Value param_1{inst.Arg(0)}; - if (param_1.IsImmediate()) { - break; - } - IR::Inst* op_a{param_1.InstRecursive()}; - if (op_a->GetOpcode() == IR::Opcode::GetAttribute) { - try_mark(op_a); - break; - } - if (op_a->GetOpcode() != IR::Opcode::BitCastF32U32) { - break; - } - const IR::Value param_2{op_a->Arg(0)}; - if (param_2.IsImmediate()) { - break; - } - IR::Inst* op_b{param_2.InstRecursive()}; - if (op_b->GetOpcode() == IR::Opcode::GetAttribute) { - try_mark(op_b); } break; } From c8a971be919158a265ec4c0f934ba368b8a3f315 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Fri, 17 Sep 2021 21:31:50 -0400 Subject: [PATCH 070/156] vk_texture_cache: Minor cleanup --- .../renderer_vulkan/vk_texture_cache.cpp | 18 ++++++++---------- .../renderer_vulkan/vk_texture_cache.h | 1 - 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 9afe49387..855f0a5d7 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -739,8 +739,7 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, VKScheduler& sch : device{device_}, scheduler{scheduler_}, memory_allocator{memory_allocator_}, staging_buffer_pool{staging_buffer_pool_}, blit_image_helper{blit_image_helper_}, astc_decoder_pass{astc_decoder_pass_}, render_pass_cache{render_pass_cache_}, - resolution{Settings::values.resolution_info}, - is_rescaling_on(resolution.up_scale != 1 || resolution.down_shift != 0) {} + resolution{Settings::values.resolution_info} {} void TextureCacheRuntime::Finish() { scheduler.Finish(); @@ -1141,11 +1140,11 @@ bool Image::ScaleUp(bool save_as_backup) { ASSERT(info.type != ImageType::Linear); scaling_count++; flags |= ImageFlagBits::Rescaled; - if (!runtime->is_rescaling_on) { - return true; - } const auto& resolution = runtime->resolution; + if (!resolution.active) { + return true; + } vk::Image rescaled_image = has_backup ? std::move(backup_image) : MakeImage(runtime->device, info, resolution.up_scale, resolution.down_shift); @@ -1188,7 +1187,7 @@ bool Image::ScaleUp(bool save_as_backup) { } void Image::SwapBackup() { - if (!runtime->is_rescaling_on) { + if (!runtime->resolution.active) { return; } ASSERT(has_backup); @@ -1206,17 +1205,16 @@ bool Image::ScaleDown(bool save_as_backup) { ASSERT(info.type != ImageType::Linear); flags &= ~ImageFlagBits::Rescaled; scaling_count++; - if (!runtime->is_rescaling_on) { - return true; - } const auto& resolution = runtime->resolution; + if (!resolution.active) { + return true; + } vk::Image downscaled_image = has_backup ? std::move(backup_image) : MakeImage(runtime->device, info); MemoryCommit new_commit = has_backup ? std::move(backup_commit) : MemoryCommit(runtime->memory_allocator.Commit( downscaled_image, MemoryUsage::DeviceLocal)); - has_backup = false; if (aspect_mask == 0) { aspect_mask = ImageAspectMask(info.format); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 9c39a6d99..84194b833 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -94,7 +94,6 @@ public: DelayedDestructionRing prescaled_images; DelayedDestructionRing prescaled_commits; Settings::ResolutionScalingInfo resolution; - bool is_rescaling_on{}; }; class Image : public VideoCommon::ImageBase { From b027fac7945184d644aa00940e528a20edcf0d06 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 18 Sep 2021 00:43:41 -0400 Subject: [PATCH 071/156] gl_texture_cache/rescaling_pass: minor cleanup --- .../ir_opt/rescaling_pass.cpp | 20 ++++++++----------- .../renderer_opengl/gl_texture_cache.cpp | 5 ++--- .../renderer_opengl/gl_texture_cache.h | 1 - 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/src/shader_recompiler/ir_opt/rescaling_pass.cpp b/src/shader_recompiler/ir_opt/rescaling_pass.cpp index 8bbaa55e4..357e41f2b 100644 --- a/src/shader_recompiler/ir_opt/rescaling_pass.cpp +++ b/src/shader_recompiler/ir_opt/rescaling_pass.cpp @@ -82,18 +82,14 @@ void PatchFragCoord(IR::Block& block, IR::Inst& inst) { [[nodiscard]] IR::U32 SubScale(IR::IREmitter& ir, const IR::U1& is_scaled, const IR::U32& value, const IR::Attribute attrib) { - if (Settings::values.resolution_info.active) { - const IR::F32 opt1{ir.Imm32(Settings::values.resolution_info.up_factor)}; - const IR::F32 base{ir.FPMul(ir.ConvertUToF(32, 32, value), opt1)}; - const IR::F32 frag_coord{ir.GetAttribute(attrib)}; - const IR::F32 opt2{ir.Imm32(Settings::values.resolution_info.down_factor)}; - const IR::F32 floor{ir.FPMul(opt1, ir.FPFloor(ir.FPMul(frag_coord, opt2)))}; - const IR::U32 deviation{ - ir.ConvertFToU(32, ir.FPAdd(base, ir.FPAdd(frag_coord, ir.FPNeg(floor))))}; - return IR::U32{ir.Select(is_scaled, deviation, value)}; - } else { - return value; - } + const IR::F32 opt1{ir.Imm32(Settings::values.resolution_info.up_factor)}; + const IR::F32 base{ir.FPMul(ir.ConvertUToF(32, 32, value), opt1)}; + const IR::F32 frag_coord{ir.GetAttribute(attrib)}; + const IR::F32 opt2{ir.Imm32(Settings::values.resolution_info.down_factor)}; + const IR::F32 floor{ir.FPMul(opt1, ir.FPFloor(ir.FPMul(frag_coord, opt2)))}; + const IR::U32 deviation{ + ir.ConvertFToU(32, ir.FPAdd(base, ir.FPAdd(frag_coord, ir.FPNeg(floor))))}; + return IR::U32{ir.Select(is_scaled, deviation, value)}; } [[nodiscard]] IR::U32 DownScale(IR::IREmitter& ir, const IR::U1& is_scaled, IR::U32 value) { diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 22fffb19b..64bd88c3b 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -474,8 +474,7 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& set_view(Shader::TextureType::ColorArrayCube, null_image_cube_array.handle); resolution = Settings::values.resolution_info; - is_rescaling_on = resolution.up_scale != 1 || resolution.down_shift != 0; - if (is_rescaling_on) { + if (resolution.active) { rescale_draw_fbo.Create(); rescale_read_fbo.Create(); @@ -957,7 +956,7 @@ bool Image::ScaleUp() { if (True(flags & ImageFlagBits::Rescaled)) { return false; } - if (!runtime->is_rescaling_on) { + if (!runtime->resolution.active) { return false; } if (gl_format == 0 && gl_type == 0) { diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index f4dcc6f9b..6c8033003 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -156,7 +156,6 @@ private: OGLFramebuffer rescale_draw_fbo; OGLFramebuffer rescale_read_fbo; Settings::ResolutionScalingInfo resolution; - bool is_rescaling_on{}; }; class Image : public VideoCommon::ImageBase { From 27af298e78ffa976bf126d7f276fa95c4f4f1363 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 18 Sep 2021 19:15:10 -0400 Subject: [PATCH 072/156] gl_texture_cache: Fix depth and integer format scaling blits --- .../renderer_opengl/gl_texture_cache.cpp | 73 +++++++++++++++---- .../renderer_opengl/gl_texture_cache.h | 4 +- 2 files changed, 61 insertions(+), 16 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 64bd88c3b..dc850e7b0 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -395,6 +395,33 @@ OGLTexture MakeImage(const VideoCommon::ImageInfo& info, GLenum gl_internal_form UNREACHABLE_MSG("Invalid image format={}", format); return GL_R32UI; } + +[[nodiscard]] bool IsPixelFormatInteger(PixelFormat format) { + switch (format) { + case PixelFormat::A8B8G8R8_SINT: + case PixelFormat::A8B8G8R8_UINT: + case PixelFormat::A2B10G10R10_UINT: + case PixelFormat::R8_SINT: + case PixelFormat::R8_UINT: + case PixelFormat::R16G16B16A16_SINT: + case PixelFormat::R16G16B16A16_UINT: + case PixelFormat::R32G32B32A32_UINT: + case PixelFormat::R32G32B32A32_SINT: + case PixelFormat::R32G32_SINT: + case PixelFormat::R16_UINT: + case PixelFormat::R16_SINT: + case PixelFormat::R16G16_UINT: + case PixelFormat::R16G16_SINT: + case PixelFormat::R8G8_SINT: + case PixelFormat::R8G8_UINT: + case PixelFormat::R32G32_UINT: + case PixelFormat::R32_UINT: + case PixelFormat::R32_SINT: + return true; + default: + return false; + } +} } // Anonymous namespace ImageBufferMap::~ImageBufferMap() { @@ -475,12 +502,14 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& resolution = Settings::values.resolution_info; if (resolution.active) { - rescale_draw_fbo.Create(); - rescale_read_fbo.Create(); + for (size_t i = 0; i < rescale_draw_fbos.size(); ++i) { + rescale_draw_fbos[i].Create(); + rescale_read_fbos[i].Create(); - // Make sure the framebuffer is created without DSA - glBindFramebuffer(GL_READ_FRAMEBUFFER, rescale_draw_fbo.handle); - glBindFramebuffer(GL_READ_FRAMEBUFFER, rescale_read_fbo.handle); + // Make sure the framebuffer is created without DSA + glBindFramebuffer(GL_READ_FRAMEBUFFER, rescale_draw_fbos[i].handle); + glBindFramebuffer(GL_READ_FRAMEBUFFER, rescale_read_fbos[i].handle); + } } } @@ -888,8 +917,9 @@ bool Image::Scale() { std::swap(texture, scale_backup); return true; } - const GLenum attachment = [this] { - switch (GetFormatType(info.format)) { + const auto format_type = GetFormatType(info.format); + const GLenum attachment = [format_type] { + switch (format_type) { case SurfaceType::ColorTexture: return GL_COLOR_ATTACHMENT0; case SurfaceType::Depth: @@ -901,8 +931,8 @@ bool Image::Scale() { return GL_COLOR_ATTACHMENT0; } }(); - const GLenum mask = [this] { - switch (GetFormatType(info.format)) { + const GLenum mask = [format_type] { + switch (format_type) { case SurfaceType::ColorTexture: return GL_COLOR_BUFFER_BIT; case SurfaceType::Depth: @@ -914,8 +944,25 @@ bool Image::Scale() { return GL_COLOR_BUFFER_BIT; } }(); - const GLenum filter = (mask & GL_COLOR_BUFFER_BIT) != 0 ? GL_LINEAR : GL_NEAREST; + const size_t fbo_index = [format_type] { + switch (format_type) { + case SurfaceType::ColorTexture: + return 0; + case SurfaceType::Depth: + return 1; + case SurfaceType::DepthStencil: + return 2; + default: + UNREACHABLE(); + return 0; + } + }(); const bool is_2d = info.type == ImageType::e2D; + const bool is_color{(mask & GL_COLOR_BUFFER_BIT) != 0}; + // Integer formats must use NEAREST filter + const bool linear_color_format{is_color && !IsPixelFormatInteger(info.format)}; + const GLenum filter = linear_color_format ? GL_LINEAR : GL_NEAREST; + const auto& resolution = runtime->resolution; const u32 up = resolution.up_scale; const u32 down = resolution.down_shift; @@ -931,8 +978,8 @@ bool Image::Scale() { dst_info.size.height = scaled_height; scale_backup = MakeImage(dst_info, gl_internal_format); - const GLuint read_fbo = runtime->rescale_read_fbo.handle; - const GLuint draw_fbo = runtime->rescale_draw_fbo.handle; + const GLuint read_fbo = runtime->rescale_read_fbos[fbo_index].handle; + const GLuint draw_fbo = runtime->rescale_draw_fbos[fbo_index].handle; for (s32 layer = 0; layer < info.resources.layers; ++layer) { for (s32 level = 0; level < info.resources.levels; ++level) { const u32 src_level_width = std::max(1u, original_width >> level); @@ -944,8 +991,6 @@ bool Image::Scale() { glNamedFramebufferTextureLayer(draw_fbo, attachment, scale_backup.handle, level, layer); glBlitNamedFramebuffer(read_fbo, draw_fbo, 0, 0, src_level_width, src_level_height, 0, 0, dst_level_width, dst_level_height, mask, filter); - glNamedFramebufferTextureLayer(read_fbo, attachment, 0, level, layer); - glNamedFramebufferTextureLayer(draw_fbo, attachment, 0, level, layer); } } std::swap(texture, scale_backup); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 6c8033003..79448f670 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -153,8 +153,8 @@ private: std::array null_image_views{}; - OGLFramebuffer rescale_draw_fbo; - OGLFramebuffer rescale_read_fbo; + std::array rescale_draw_fbos; + std::array rescale_read_fbos; Settings::ResolutionScalingInfo resolution; }; From 16017ac4503603bcf8189583120ad8888242b0e1 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 18 Sep 2021 20:50:00 -0400 Subject: [PATCH 073/156] vk_texture_cache: Use nearest neighbor scaling when available --- .../renderer_opengl/gl_texture_cache.cpp | 27 ------------------- .../renderer_vulkan/vk_texture_cache.cpp | 9 +++++-- src/video_core/surface.cpp | 27 +++++++++++++++++++ src/video_core/surface.h | 2 ++ 4 files changed, 36 insertions(+), 29 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index dc850e7b0..c75386e37 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -395,33 +395,6 @@ OGLTexture MakeImage(const VideoCommon::ImageInfo& info, GLenum gl_internal_form UNREACHABLE_MSG("Invalid image format={}", format); return GL_R32UI; } - -[[nodiscard]] bool IsPixelFormatInteger(PixelFormat format) { - switch (format) { - case PixelFormat::A8B8G8R8_SINT: - case PixelFormat::A8B8G8R8_UINT: - case PixelFormat::A2B10G10R10_UINT: - case PixelFormat::R8_SINT: - case PixelFormat::R8_UINT: - case PixelFormat::R16G16B16A16_SINT: - case PixelFormat::R16G16B16A16_UINT: - case PixelFormat::R32G32B32A32_UINT: - case PixelFormat::R32G32B32A32_SINT: - case PixelFormat::R32G32_SINT: - case PixelFormat::R16_UINT: - case PixelFormat::R16_SINT: - case PixelFormat::R16G16_UINT: - case PixelFormat::R16G16_SINT: - case PixelFormat::R8G8_SINT: - case PixelFormat::R8G8_UINT: - case PixelFormat::R32G32_UINT: - case PixelFormat::R32_UINT: - case PixelFormat::R32_SINT: - return true; - default: - return false; - } -} } // Anonymous namespace ImageBufferMap::~ImageBufferMap() { diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 855f0a5d7..a34cd31f0 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -37,6 +37,7 @@ using VideoCommon::ImageInfo; using VideoCommon::ImageType; using VideoCommon::SubresourceRange; using VideoCore::Surface::IsPixelFormatASTC; +using VideoCore::Surface::IsPixelFormatInteger; namespace { constexpr VkBorderColor ConvertBorderColor(const std::array& color) { @@ -603,9 +604,13 @@ void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, con .width = info.size.width, .height = info.size.height, }; + const bool is_zeta = (aspect_mask & VK_IMAGE_ASPECT_DEPTH_BIT) != 0; + const bool is_int_format = IsPixelFormatInteger(info.format); + const VkFilter vk_filter = (is_zeta || is_int_format) ? VK_FILTER_NEAREST : VK_FILTER_LINEAR; + scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([dst_image, src_image, extent, resources, aspect_mask, resolution, type, - scaling](vk::CommandBuffer cmdbuf) { + scaling, vk_filter](vk::CommandBuffer cmdbuf) { const auto scale_up = [&](u32 value) { return std::max((value * resolution.up_scale) >> resolution.down_shift, 1U); }; @@ -723,7 +728,7 @@ void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, con cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, nullptr, nullptr, read_barriers); cmdbuf.BlitImage(src_image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst_image, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, regions, VK_FILTER_NEAREST); + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, regions, vk_filter); cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, nullptr, nullptr, write_barriers); }); diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index eb1746265..64941a486 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -279,6 +279,33 @@ bool IsPixelFormatSRGB(PixelFormat format) { } } +bool IsPixelFormatInteger(PixelFormat format) { + switch (format) { + case PixelFormat::A8B8G8R8_SINT: + case PixelFormat::A8B8G8R8_UINT: + case PixelFormat::A2B10G10R10_UINT: + case PixelFormat::R8_SINT: + case PixelFormat::R8_UINT: + case PixelFormat::R16G16B16A16_SINT: + case PixelFormat::R16G16B16A16_UINT: + case PixelFormat::R32G32B32A32_UINT: + case PixelFormat::R32G32B32A32_SINT: + case PixelFormat::R32G32_SINT: + case PixelFormat::R16_UINT: + case PixelFormat::R16_SINT: + case PixelFormat::R16G16_UINT: + case PixelFormat::R16G16_SINT: + case PixelFormat::R8G8_SINT: + case PixelFormat::R8G8_UINT: + case PixelFormat::R32G32_UINT: + case PixelFormat::R32_UINT: + case PixelFormat::R32_SINT: + return true; + default: + return false; + } +} + std::pair GetASTCBlockSize(PixelFormat format) { return {DefaultBlockWidth(format), DefaultBlockHeight(format)}; } diff --git a/src/video_core/surface.h b/src/video_core/surface.h index 1503db81f..3bb24abb7 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -460,6 +460,8 @@ bool IsPixelFormatASTC(PixelFormat format); bool IsPixelFormatSRGB(PixelFormat format); +bool IsPixelFormatInteger(PixelFormat format); + std::pair GetASTCBlockSize(PixelFormat format); u64 EstimatedDecompressedSize(u64 base_size, PixelFormat format); From 122ddeb7ff948a607b0bee9bae968dc4d9c72188 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sun, 19 Sep 2021 00:03:14 -0400 Subject: [PATCH 074/156] vk_rasterizer: Fix scaling on Y_NEGATE --- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 20bb05e7d..87f265e09 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -60,12 +60,18 @@ struct DrawParams { VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t index, float scale) { const auto& src = regs.viewport_transform[index]; + const float x = (src.translate_x - src.scale_x) * scale; const float width = src.scale_x * 2.0f * scale; - const float height = src.scale_y * 2.0f * scale; + float y = (src.translate_y - src.scale_y) * scale; + float height = src.scale_y * 2.0f * scale; + if (regs.screen_y_control.y_negate) { + y += height; + height = -height; + } const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1.0f : 0.0f; VkViewport viewport{ - .x = (src.translate_x - src.scale_x) * scale, - .y = (src.translate_y - src.scale_y) * scale, + .x = x, + .y = y, .width = width != 0.0f ? width : 1.0f, .height = height != 0.0f ? height : 1.0f, .minDepth = src.translate_z - src.scale_z * reduce_z, From 8183142cd4c70355e6275eaba3d2939211b4b9c9 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Tue, 21 Sep 2021 20:28:22 -0400 Subject: [PATCH 075/156] gl_texture_cache: Fix scaling backup logic --- .../renderer_opengl/gl_texture_cache.cpp | 33 ++++++++----------- .../renderer_opengl/gl_texture_cache.h | 3 +- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index c75386e37..7ded7415d 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -681,6 +681,7 @@ Image::Image(TextureCacheRuntime& runtime_, const VideoCommon::ImageInfo& info_, gl_type = tuple.type; } texture = MakeImage(info, gl_internal_format); + original_backup = texture.handle; if (runtime->device.HasDebuggingToolAttached()) { const std::string name = VideoCommon::Name(*this); glObjectLabel(ImageTarget(info) == GL_TEXTURE_BUFFER ? GL_BUFFER : GL_TEXTURE, @@ -697,7 +698,6 @@ void Image::UploadMemory(const ImageBufferMap& map, const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { ScaleDown(); - scale_backup.Release(); } glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes); @@ -729,7 +729,6 @@ void Image::DownloadMemory(ImageBufferMap& map, const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { ScaleDown(); - scale_backup.Release(); } glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer); glPixelStorei(GL_PACK_ALIGNMENT, 1); @@ -749,7 +748,7 @@ void Image::DownloadMemory(ImageBufferMap& map, CopyImageToBuffer(copy, map.offset); } if (is_rescaled) { - ScaleUp(); + texture.handle = upscaled_backup.handle; } } @@ -885,11 +884,6 @@ void Image::CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t b } bool Image::Scale() { - if (scale_backup.handle) { - // This was a texture which was scaled previously, no need to repeat scaling - std::swap(texture, scale_backup); - return true; - } const auto format_type = GetFormatType(info.format); const GLenum attachment = [format_type] { switch (format_type) { @@ -949,8 +943,9 @@ bool Image::Scale() { auto dst_info = info; dst_info.size.width = scaled_width; dst_info.size.height = scaled_height; - scale_backup = MakeImage(dst_info, gl_internal_format); - + if (!upscaled_backup.handle) { + upscaled_backup = MakeImage(dst_info, gl_internal_format); + } const GLuint read_fbo = runtime->rescale_read_fbos[fbo_index].handle; const GLuint draw_fbo = runtime->rescale_draw_fbos[fbo_index].handle; for (s32 layer = 0; layer < info.resources.layers; ++layer) { @@ -960,13 +955,14 @@ bool Image::Scale() { const u32 dst_level_width = std::max(1u, scaled_width >> level); const u32 dst_level_height = std::max(1u, scaled_height >> level); - glNamedFramebufferTextureLayer(read_fbo, attachment, texture.handle, level, layer); - glNamedFramebufferTextureLayer(draw_fbo, attachment, scale_backup.handle, level, layer); + glNamedFramebufferTextureLayer(read_fbo, attachment, original_backup, level, layer); + glNamedFramebufferTextureLayer(draw_fbo, attachment, upscaled_backup.handle, level, + layer); glBlitNamedFramebuffer(read_fbo, draw_fbo, 0, 0, src_level_width, src_level_height, 0, 0, dst_level_width, dst_level_height, mask, filter); } } - std::swap(texture, scale_backup); + texture.handle = upscaled_backup.handle; return true; } @@ -985,20 +981,19 @@ bool Image::ScaleUp() { UNIMPLEMENTED(); return false; } + if (!Scale()) { + return false; + } flags |= ImageFlagBits::Rescaled; - return Scale(); + return true; } bool Image::ScaleDown() { if (False(flags & ImageFlagBits::Rescaled)) { return false; } - if (!scale_backup.handle) { - LOG_ERROR(Render_OpenGL, "Downscaling an upscaled texture that didn't backup original"); - return false; - } flags &= ~ImageFlagBits::Rescaled; - std::swap(texture, scale_backup); + texture.handle = original_backup; return true; } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 79448f670..61f9b0259 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -205,12 +205,13 @@ private: bool Scale(); OGLTexture texture; - OGLTexture scale_backup; + OGLTexture upscaled_backup; OGLTextureView store_view; GLenum gl_internal_format = GL_NONE; GLenum gl_format = GL_NONE; GLenum gl_type = GL_NONE; TextureCacheRuntime* runtime{}; + GLuint original_backup{}; }; class ImageView : public VideoCommon::ImageViewBase { From 36f261edefd2e16d34f2726f0a0295e089ed1c17 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Tue, 21 Sep 2021 22:22:24 -0400 Subject: [PATCH 076/156] vk_texture_cache: Simplify scaled image management --- .../renderer_vulkan/vk_texture_cache.cpp | 111 ++++-------------- .../renderer_vulkan/vk_texture_cache.h | 30 ++--- 2 files changed, 34 insertions(+), 107 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index a34cd31f0..5b4f51a31 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -996,17 +996,14 @@ u64 TextureCacheRuntime::GetDeviceLocalMemory() const { return device.GetDeviceLocalMemory(); } -void TextureCacheRuntime::TickFrame() { - prescaled_images.Tick(); - prescaled_commits.Tick(); -} +void TextureCacheRuntime::TickFrame() {} Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime_.scheduler}, - image(MakeImage(runtime_.device, info)), - commit(runtime_.memory_allocator.Commit(image, MemoryUsage::DeviceLocal)), - aspect_mask(ImageAspectMask(info.format)), runtime{&runtime_} { + runtime{&runtime_}, original_image(MakeImage(runtime_.device, info)), + commit(runtime_.memory_allocator.Commit(original_image, MemoryUsage::DeviceLocal)), + aspect_mask(ImageAspectMask(info.format)) { if (IsPixelFormatASTC(info.format) && !runtime->device.IsOptimalAstcSupported()) { if (Settings::values.accelerate_astc.GetValue()) { flags |= VideoCommon::ImageFlagBits::AcceleratedUpload; @@ -1015,13 +1012,14 @@ Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu } } if (runtime->device.HasDebuggingToolAttached()) { - image.SetObjectNameEXT(VideoCommon::Name(*this).c_str()); + original_image.SetObjectNameEXT(VideoCommon::Name(*this).c_str()); } static constexpr VkImageViewUsageCreateInfo storage_image_view_usage_create_info{ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, .pNext = nullptr, .usage = VK_IMAGE_USAGE_STORAGE_BIT, }; + current_image = *original_image; if (IsPixelFormatASTC(info.format) && !runtime->device.IsOptimalAstcSupported()) { const auto& device = runtime->device.GetLogical(); storage_image_views.reserve(info.resources.levels); @@ -1030,7 +1028,7 @@ Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, .pNext = &storage_image_view_usage_create_info, .flags = 0, - .image = *image, + .image = *original_image, .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, .format = VK_FORMAT_A8B8G8R8_UNORM_PACK32, .components{ @@ -1059,12 +1057,12 @@ void Image::UploadMemory(const StagingBufferRef& map, std::spanRequestOutsideRenderPassOperationContext(); std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); const VkBuffer src_buffer = map.buffer; - const VkImage vk_image = *image; + const VkImage vk_image = *original_image; const VkImageAspectFlags vk_aspect_mask = aspect_mask; const bool is_initialized = std::exchange(initialized, true); scheduler->Record([src_buffer, vk_image, vk_aspect_mask, is_initialized, @@ -1072,18 +1070,14 @@ void Image::UploadMemory(const StagingBufferRef& map, std::span copies) { - const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); - if (is_rescaled) { - ScaleDown(true); - } std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); scheduler->RequestOutsideRenderPassOperationContext(); - scheduler->Record([buffer = map.buffer, image = *image, aspect_mask = aspect_mask, + scheduler->Record([buffer = map.buffer, image = *original_image, aspect_mask = aspect_mask, vk_copies](vk::CommandBuffer cmdbuf) { const VkImageMemoryBarrier read_barrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, @@ -1133,51 +1127,31 @@ void Image::DownloadMemory(const StagingBufferRef& map, std::spanresolution; if (!resolution.active) { return true; } - vk::Image rescaled_image = - has_backup ? std::move(backup_image) - : MakeImage(runtime->device, info, resolution.up_scale, resolution.down_shift); - MemoryCommit new_commit = has_backup ? std::move(backup_commit) - : MemoryCommit(runtime->memory_allocator.Commit( - rescaled_image, MemoryUsage::DeviceLocal)); - has_backup = false; - + const auto& device = runtime->device; + if (!scaled_image) { + scaled_image = MakeImage(device, info, resolution.up_scale, resolution.down_shift); + auto& allocator = runtime->memory_allocator; + scaled_commit = MemoryCommit(allocator.Commit(scaled_image, MemoryUsage::DeviceLocal)); + } if (aspect_mask == 0) { aspect_mask = ImageAspectMask(info.format); } - SCOPE_EXIT({ - if (save_as_backup) { - backup_image = std::move(image); - backup_commit = std::move(commit); - has_backup = true; - } else { - runtime->prescaled_images.Push(std::move(image)); - runtime->prescaled_commits.Push(std::move(commit)); - } - image = std::move(rescaled_image); - commit = std::move(new_commit); - }); - const PixelFormat format = StorageFormat(info.format); - const auto format_info = - MaxwellToVK::SurfaceFormat(runtime->device, FormatType::Optimal, false, format); - const auto similar = runtime->device.GetSupportedFormat( + const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, format); + const auto similar = device.GetSupportedFormat( format_info.format, (VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT), FormatType::Optimal); @@ -1187,55 +1161,18 @@ bool Image::ScaleUp(bool save_as_backup) { if (aspect_mask == 0) { aspect_mask = ImageAspectMask(info.format); } - BlitScale(*scheduler, *image, *rescaled_image, info, aspect_mask, resolution, true); + BlitScale(*scheduler, *original_image, *scaled_image, info, aspect_mask, resolution, true); + current_image = *scaled_image; return true; } -void Image::SwapBackup() { - if (!runtime->resolution.active) { - return; - } - ASSERT(has_backup); - runtime->prescaled_images.Push(std::move(image)); - runtime->prescaled_commits.Push(std::move(commit)); - image = std::move(backup_image); - commit = std::move(backup_commit); - has_backup = false; -} - -bool Image::ScaleDown(bool save_as_backup) { +bool Image::ScaleDown() { if (False(flags & ImageFlagBits::Rescaled)) { return false; } ASSERT(info.type != ImageType::Linear); flags &= ~ImageFlagBits::Rescaled; - scaling_count++; - - const auto& resolution = runtime->resolution; - if (!resolution.active) { - return true; - } - vk::Image downscaled_image = - has_backup ? std::move(backup_image) : MakeImage(runtime->device, info); - MemoryCommit new_commit = has_backup ? std::move(backup_commit) - : MemoryCommit(runtime->memory_allocator.Commit( - downscaled_image, MemoryUsage::DeviceLocal)); - has_backup = false; - if (aspect_mask == 0) { - aspect_mask = ImageAspectMask(info.format); - } - BlitScale(*scheduler, *image, *downscaled_image, info, aspect_mask, resolution, false); - - if (save_as_backup) { - backup_image = std::move(image); - backup_commit = std::move(commit); - has_backup = true; - } else { - runtime->prescaled_images.Push(std::move(image)); - runtime->prescaled_commits.Push(std::move(commit)); - } - image = std::move(downscaled_image); - commit = std::move(new_commit); + current_image = *original_image; return true; } diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 84194b833..e5060e3f1 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -8,7 +8,6 @@ #include "common/settings.h" #include "shader_recompiler/shader_info.h" -#include "video_core/delayed_destruction_ring.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/texture_cache/image_view_base.h" #include "video_core/texture_cache/texture_cache_base.h" @@ -17,7 +16,6 @@ namespace Vulkan { -using VideoCommon::DelayedDestructionRing; using VideoCommon::ImageId; using VideoCommon::NUM_RT; using VideoCommon::Region2D; @@ -36,8 +34,6 @@ class VKScheduler; class TextureCacheRuntime { public: - static constexpr size_t TICKS_TO_DESTROY = 6; - explicit TextureCacheRuntime(const Device& device_, VKScheduler& scheduler_, MemoryAllocator& memory_allocator_, StagingBufferPool& staging_buffer_pool_, @@ -90,9 +86,6 @@ public: BlitImageHelper& blit_image_helper; ASTCDecoderPass& astc_decoder_pass; RenderPassCache& render_pass_cache; - - DelayedDestructionRing prescaled_images; - DelayedDestructionRing prescaled_commits; Settings::ResolutionScalingInfo resolution; }; @@ -117,7 +110,7 @@ public: std::span copies); [[nodiscard]] VkImage Handle() const noexcept { - return *image; + return current_image; } [[nodiscard]] VkImageAspectFlags AspectMask() const noexcept { @@ -133,25 +126,22 @@ public: return std::exchange(initialized, true); } - bool ScaleUp(bool save_as_backup = false); + bool ScaleUp(); - bool ScaleDown(bool save_as_backup = false); - - void SwapBackup(); + bool ScaleDown(); private: - VKScheduler* scheduler; - vk::Image image; + VKScheduler* scheduler{}; + TextureCacheRuntime* runtime{}; + + vk::Image original_image; MemoryCommit commit; - vk::ImageView image_view; std::vector storage_image_views; VkImageAspectFlags aspect_mask = 0; bool initialized = false; - TextureCacheRuntime* runtime; - u32 scaling_count{}; - vk::Image backup_image{}; - MemoryCommit backup_commit{}; - bool has_backup{}; + vk::Image scaled_image{}; + MemoryCommit scaled_commit{}; + VkImage current_image{}; }; class ImageView : public VideoCommon::ImageViewBase { From dd663844513c82a24456dbc68b9ad6665506bea9 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sun, 26 Sep 2021 22:43:06 -0400 Subject: [PATCH 077/156] rescaling_pass: Enable PatchImageQueryDimensions on fragment stages --- src/shader_recompiler/ir_opt/rescaling_pass.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/shader_recompiler/ir_opt/rescaling_pass.cpp b/src/shader_recompiler/ir_opt/rescaling_pass.cpp index 357e41f2b..51125f45a 100644 --- a/src/shader_recompiler/ir_opt/rescaling_pass.cpp +++ b/src/shader_recompiler/ir_opt/rescaling_pass.cpp @@ -249,6 +249,7 @@ void PatchImageRead(IR::Block& block, IR::Inst& inst) { void Visit(const IR::Program& program, IR::Block& block, IR::Inst& inst) { const bool is_fragment_shader{program.stage == Stage::Fragment}; + const bool is_compute_shader{program.stage == Stage::Compute}; switch (inst.GetOpcode()) { case IR::Opcode::GetAttribute: { const IR::Attribute attr{inst.Arg(0).Attribute()}; @@ -265,21 +266,19 @@ void Visit(const IR::Program& program, IR::Block& block, IR::Inst& inst) { break; } case IR::Opcode::ImageQueryDimensions: - if (program.stage == Stage::Compute) { - PatchImageQueryDimensions(block, inst); - } + PatchImageQueryDimensions(block, inst); break; case IR::Opcode::ImageFetch: if (is_fragment_shader) { SubScaleImageFetch(block, inst); - } else if (program.stage == Stage::Compute) { + } else if (is_compute_shader) { PatchImageFetch(block, inst); } break; case IR::Opcode::ImageRead: if (is_fragment_shader) { SubScaleImageRead(block, inst); - } else if (program.stage == Stage::Compute) { + } else if (is_compute_shader) { PatchImageRead(block, inst); } break; From 276565973f373b2dbf7f19a68a6d0304803dc2c3 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Tue, 28 Sep 2021 21:29:17 -0400 Subject: [PATCH 078/156] rescaling_pass: Scale ImageFetch offset if it exists Plus some code deduplication --- .../ir_opt/rescaling_pass.cpp | 118 +++++++----------- 1 file changed, 48 insertions(+), 70 deletions(-) diff --git a/src/shader_recompiler/ir_opt/rescaling_pass.cpp b/src/shader_recompiler/ir_opt/rescaling_pass.cpp index 51125f45a..2aa9c31dc 100644 --- a/src/shader_recompiler/ir_opt/rescaling_pass.cpp +++ b/src/shader_recompiler/ir_opt/rescaling_pass.cpp @@ -137,21 +137,50 @@ void PatchImageQueryDimensions(IR::Block& block, IR::Inst& inst) { } } -void ScaleIntegerCoord(IR::IREmitter& ir, IR::Inst& inst, const IR::U1& is_scaled) { +void ScaleIntegerComposite(IR::IREmitter& ir, IR::Inst& inst, const IR::U1& is_scaled, + size_t index) { + const IR::Value composite{inst.Arg(index)}; + if (composite.IsEmpty()) { + return; + } const auto info{inst.Flags()}; - const IR::Value coord{inst.Arg(1)}; + const IR::U32 x{Scale(ir, is_scaled, IR::U32{ir.CompositeExtract(composite, 0)})}; + const IR::U32 y{Scale(ir, is_scaled, IR::U32{ir.CompositeExtract(composite, 1)})}; switch (info.type) { - case TextureType::Color2D: { - const IR::U32 x{Scale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 0)})}; - const IR::U32 y{Scale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 1)})}; - inst.SetArg(1, ir.CompositeConstruct(x, y)); + case TextureType::Color2D: + inst.SetArg(index, ir.CompositeConstruct(x, y)); + break; + case TextureType::ColorArray2D: { + const IR::U32 z{ir.CompositeExtract(composite, 2)}; + inst.SetArg(index, ir.CompositeConstruct(x, y, z)); break; } + case TextureType::Color1D: + case TextureType::ColorArray1D: + case TextureType::Color3D: + case TextureType::ColorCube: + case TextureType::ColorArrayCube: + case TextureType::Buffer: + // Nothing to patch here + break; + } +} + +void SubScaleCoord(IR::IREmitter& ir, IR::Inst& inst, const IR::U1& is_scaled) { + const auto info{inst.Flags()}; + const IR::Value coord{inst.Arg(1)}; + const IR::U32 coord_x{ir.CompositeExtract(coord, 0)}; + const IR::U32 coord_y{ir.CompositeExtract(coord, 1)}; + + const IR::U32 scaled_x{SubScale(ir, is_scaled, coord_x, IR::Attribute::PositionX)}; + const IR::U32 scaled_y{SubScale(ir, is_scaled, coord_y, IR::Attribute::PositionY)}; + switch (info.type) { + case TextureType::Color2D: + inst.SetArg(1, ir.CompositeConstruct(scaled_x, scaled_y)); + break; case TextureType::ColorArray2D: { - const IR::U32 x{Scale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 0)})}; - const IR::U32 y{Scale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 1)})}; const IR::U32 z{ir.CompositeExtract(coord, 2)}; - inst.SetArg(1, ir.CompositeConstruct(x, y, z)); + inst.SetArg(1, ir.CompositeConstruct(scaled_x, scaled_y, z)); break; } case TextureType::Color1D: @@ -169,87 +198,36 @@ void SubScaleImageFetch(IR::Block& block, IR::Inst& inst) { IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; const auto info{inst.Flags()}; const IR::U1 is_scaled{ir.IsTextureScaled(ir.Imm32(info.descriptor_index))}; - const IR::Value coord{inst.Arg(1)}; - switch (info.type) { - case TextureType::Color2D: { - const IR::U32 x{SubScale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 0)}, - IR::Attribute::PositionX)}; - const IR::U32 y{SubScale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 1)}, - IR::Attribute::PositionY)}; - inst.SetArg(1, ir.CompositeConstruct(x, y)); - break; - } - case TextureType::ColorArray2D: { - const IR::U32 x{SubScale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 0)}, - IR::Attribute::PositionX)}; - const IR::U32 y{SubScale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 1)}, - IR::Attribute::PositionY)}; - const IR::U32 z{ir.CompositeExtract(coord, 2)}; - inst.SetArg(1, ir.CompositeConstruct(x, y, z)); - break; - } - case TextureType::Color1D: - case TextureType::ColorArray1D: - case TextureType::Color3D: - case TextureType::ColorCube: - case TextureType::ColorArrayCube: - case TextureType::Buffer: - // Nothing to patch here - break; - } + SubScaleCoord(ir, inst, is_scaled); + // Scale ImageFetch offset + ScaleIntegerComposite(ir, inst, is_scaled, 2); } void SubScaleImageRead(IR::Block& block, IR::Inst& inst) { IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; const auto info{inst.Flags()}; const IR::U1 is_scaled{ir.IsImageScaled(ir.Imm32(info.descriptor_index))}; - const IR::Value coord{inst.Arg(1)}; - switch (info.type) { - case TextureType::Color2D: { - const IR::U32 x{SubScale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 0)}, - IR::Attribute::PositionX)}; - const IR::U32 y{SubScale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 1)}, - IR::Attribute::PositionY)}; - inst.SetArg(1, ir.CompositeConstruct(x, y)); - break; - } - case TextureType::ColorArray2D: { - const IR::U32 x{SubScale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 0)}, - IR::Attribute::PositionX)}; - const IR::U32 y{SubScale(ir, is_scaled, IR::U32{ir.CompositeExtract(coord, 1)}, - IR::Attribute::PositionY)}; - const IR::U32 z{ir.CompositeExtract(coord, 2)}; - inst.SetArg(1, ir.CompositeConstruct(x, y, z)); - break; - } - case TextureType::Color1D: - case TextureType::ColorArray1D: - case TextureType::Color3D: - case TextureType::ColorCube: - case TextureType::ColorArrayCube: - case TextureType::Buffer: - // Nothing to patch here - break; - } + SubScaleCoord(ir, inst, is_scaled); } void PatchImageFetch(IR::Block& block, IR::Inst& inst) { IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; const auto info{inst.Flags()}; const IR::U1 is_scaled{ir.IsTextureScaled(ir.Imm32(info.descriptor_index))}; - ScaleIntegerCoord(ir, inst, is_scaled); + ScaleIntegerComposite(ir, inst, is_scaled, 1); + // Scale ImageFetch offset + ScaleIntegerComposite(ir, inst, is_scaled, 2); } void PatchImageRead(IR::Block& block, IR::Inst& inst) { IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; const auto info{inst.Flags()}; const IR::U1 is_scaled{ir.IsImageScaled(ir.Imm32(info.descriptor_index))}; - ScaleIntegerCoord(ir, inst, is_scaled); + ScaleIntegerComposite(ir, inst, is_scaled, 1); } void Visit(const IR::Program& program, IR::Block& block, IR::Inst& inst) { const bool is_fragment_shader{program.stage == Stage::Fragment}; - const bool is_compute_shader{program.stage == Stage::Compute}; switch (inst.GetOpcode()) { case IR::Opcode::GetAttribute: { const IR::Attribute attr{inst.Arg(0).Attribute()}; @@ -271,14 +249,14 @@ void Visit(const IR::Program& program, IR::Block& block, IR::Inst& inst) { case IR::Opcode::ImageFetch: if (is_fragment_shader) { SubScaleImageFetch(block, inst); - } else if (is_compute_shader) { + } else { PatchImageFetch(block, inst); } break; case IR::Opcode::ImageRead: if (is_fragment_shader) { SubScaleImageRead(block, inst); - } else if (is_compute_shader) { + } else { PatchImageRead(block, inst); } break; From 99eec162da567ce08a7ab6ce4d1f4b5fa8b5af5e Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Tue, 28 Sep 2021 21:37:54 -0400 Subject: [PATCH 079/156] rescaling_pass: Logic simplification and minor style cleanup --- .../frontend/maxwell/translate_program.cpp | 1 - .../ir_opt/rescaling_pass.cpp | 49 +++++++------------ 2 files changed, 17 insertions(+), 33 deletions(-) diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.cpp b/src/shader_recompiler/frontend/maxwell/translate_program.cpp index 743fb2420..267ebe4af 100644 --- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp +++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp @@ -183,7 +183,6 @@ IR::Program TranslateProgram(ObjectPool& inst_pool, ObjectPoolInstructions()) { - VisitMark(program, inst); + const bool is_fragment_shader{program.stage == Stage::Fragment}; + if (is_fragment_shader) { + for (IR::Block* const block : program.post_order_blocks) { + for (IR::Inst& inst : block->Instructions()) { + VisitMark(inst); + } } } for (IR::Block* const block : program.post_order_blocks) { From 19ca0c9ab5cbaa86e30743ea760e0aab5c40c1d6 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Mon, 20 Sep 2021 19:11:03 +0200 Subject: [PATCH 080/156] TextureCache: Base fixes on rescaling. --- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 3 ++- src/video_core/texture_cache/texture_cache.h | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 87f265e09..1ceffa718 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -207,7 +207,7 @@ void RasterizerVulkan::Clear() { query_cache.UpdateCounters(); - const auto& regs = maxwell3d.regs; + auto& regs = maxwell3d.regs; const bool use_color = regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A; const bool use_depth = regs.clear_buffers.Z; @@ -228,6 +228,7 @@ void RasterizerVulkan::Clear() { up_scale = Settings::values.resolution_info.up_scale; down_shift = Settings::values.resolution_info.down_shift; } + UpdateViewportsState(regs); VkClearRect clear_rect{ .rect = GetScissorState(regs, 0, up_scale, down_shift), diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 764984546..a543776fd 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -205,8 +205,8 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { return; } - u32 scale_rating; - bool rescaled; + u32 scale_rating = 0; + bool rescaled = false; std::array tmp_color_images{}; ImageId tmp_depth_image{}; do { @@ -223,7 +223,7 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { bool can_rescale = true; bool any_blacklisted = false; const auto check_rescale = [&](ImageViewId view_id, ImageId& id_save) { - if (view_id) { + if (view_id != NULL_IMAGE_VIEW_ID && view_id != ImageViewId{}) { const auto& view = slot_image_views[view_id]; const auto image_id = view.image_id; id_save = image_id; @@ -265,6 +265,7 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { scale_up(tmp_color_images[index]); } scale_up(tmp_depth_image); + scale_rating = 2; } } else { rescaled = false; From ea82bd4b7e4c4f23a40f8a35858d8b74950fc347 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Mon, 20 Sep 2021 22:18:15 +0200 Subject: [PATCH 081/156] Texture Cache: Fix Rescaling on Multisample --- .../renderer_vulkan/vk_texture_cache.cpp | 8 ++++++-- src/video_core/texture_cache/image_info.cpp | 4 ++-- src/video_core/texture_cache/texture_cache.h | 17 +++++++++++++---- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 5b4f51a31..4f0bab274 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -860,9 +860,10 @@ void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, nullptr, nullptr, read_barriers); if (is_resolve) { + VkImageResolve resolve_info = + MakeImageResolve(dst_region, src_region, dst_layers, src_layers); cmdbuf.ResolveImage(src_image, VK_IMAGE_LAYOUT_GENERAL, dst_image, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, - MakeImageResolve(dst_region, src_region, dst_layers, src_layers)); + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, resolve_info); } else { const bool is_linear = filter == Fermi2D::Filter::Bilinear; const VkFilter vk_filter = is_linear ? VK_FILTER_LINEAR : VK_FILTER_NEAREST; @@ -1149,6 +1150,9 @@ bool Image::ScaleUp() { if (aspect_mask == 0) { aspect_mask = ImageAspectMask(info.format); } + if (info.num_samples > 1) { + return true; + } const PixelFormat format = StorageFormat(info.format); const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, format); const auto similar = device.GetSupportedFormat( diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp index bdf306bf9..7fa8fd4fe 100644 --- a/src/video_core/texture_cache/image_info.cpp +++ b/src/video_core/texture_cache/image_info.cpp @@ -101,7 +101,7 @@ ImageInfo::ImageInfo(const TICEntry& config) noexcept { // FIXME: Call this without passing *this layer_stride = CalculateLayerStride(*this); maybe_unaligned_layer_stride = CalculateLayerSize(*this); - rescaleable &= (block.depth == 0) && resources.levels == 1 && num_samples == 1; + rescaleable &= (block.depth == 0) && resources.levels == 1; } } @@ -134,7 +134,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) type = ImageType::e3D; size.depth = rt.depth; } else { - rescaleable = block.depth == 0 && size.height > 256 && num_samples == 1; + rescaleable = block.depth == 0 && size.height > 256; type = ImageType::e2D; resources.layers = rt.depth; } diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index a543776fd..b60f840c1 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -476,17 +476,26 @@ void TextureCache

::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, Image& dst_image = slot_images[dst_id]; Image& src_image = slot_images[src_id]; + bool is_resolve = src_image.info.num_samples != 1 && dst_image.info.num_samples == 1; + bool is_src_rescaled = True(src_image.flags & ImageFlagBits::Rescaled); bool is_dst_rescaled = True(dst_image.flags & ImageFlagBits::Rescaled); if (is_src_rescaled != is_dst_rescaled) { - if (ImageCanRescale(dst_image)) { - ScaleUp(dst_image); - is_dst_rescaled = True(dst_image.flags & ImageFlagBits::Rescaled); - } if (ImageCanRescale(src_image)) { ScaleUp(src_image); is_src_rescaled = True(src_image.flags & ImageFlagBits::Rescaled); + if (is_resolve) { + dst_image.info.rescaleable = true; + for (const auto& alias : dst_image.aliased_images) { + Image& other_image = slot_images[alias.id]; + other_image.info.rescaleable = true; + } + } + } + if (ImageCanRescale(dst_image)) { + ScaleUp(dst_image); + is_dst_rescaled = True(dst_image.flags & ImageFlagBits::Rescaled); } } From 581ea900627b398c2fa06b70facd5dcd8bbb7d68 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Wed, 29 Sep 2021 20:53:30 -0400 Subject: [PATCH 082/156] rescaling_pass: Fix IR errors when unscalable texture types are encountered --- .../ir_opt/rescaling_pass.cpp | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/shader_recompiler/ir_opt/rescaling_pass.cpp b/src/shader_recompiler/ir_opt/rescaling_pass.cpp index 0d642dd0d..a5fa4ee83 100644 --- a/src/shader_recompiler/ir_opt/rescaling_pass.cpp +++ b/src/shader_recompiler/ir_opt/rescaling_pass.cpp @@ -14,6 +14,22 @@ namespace Shader::Optimization { namespace { +[[nodiscard]] bool IsTextureTypeRescalable(TextureType type) { + switch (type) { + case TextureType::Color2D: + case TextureType::ColorArray2D: + return true; + case TextureType::Color1D: + case TextureType::ColorArray1D: + case TextureType::Color3D: + case TextureType::ColorCube: + case TextureType::ColorArrayCube: + case TextureType::Buffer: + break; + } + return false; +} + void VisitMark(const IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::ShuffleIndex: @@ -179,6 +195,9 @@ void SubScaleCoord(IR::IREmitter& ir, IR::Inst& inst, const IR::U1& is_scaled) { void SubScaleImageFetch(IR::Block& block, IR::Inst& inst) { IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; const auto info{inst.Flags()}; + if (!IsTextureTypeRescalable(info.type)) { + return; + } const IR::U1 is_scaled{ir.IsTextureScaled(ir.Imm32(info.descriptor_index))}; SubScaleCoord(ir, inst, is_scaled); // Scale ImageFetch offset @@ -188,6 +207,9 @@ void SubScaleImageFetch(IR::Block& block, IR::Inst& inst) { void SubScaleImageRead(IR::Block& block, IR::Inst& inst) { IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; const auto info{inst.Flags()}; + if (!IsTextureTypeRescalable(info.type)) { + return; + } const IR::U1 is_scaled{ir.IsImageScaled(ir.Imm32(info.descriptor_index))}; SubScaleCoord(ir, inst, is_scaled); } @@ -195,6 +217,9 @@ void SubScaleImageRead(IR::Block& block, IR::Inst& inst) { void PatchImageFetch(IR::Block& block, IR::Inst& inst) { IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; const auto info{inst.Flags()}; + if (!IsTextureTypeRescalable(info.type)) { + return; + } const IR::U1 is_scaled{ir.IsTextureScaled(ir.Imm32(info.descriptor_index))}; ScaleIntegerComposite(ir, inst, is_scaled, 1); // Scale ImageFetch offset @@ -204,6 +229,9 @@ void PatchImageFetch(IR::Block& block, IR::Inst& inst) { void PatchImageRead(IR::Block& block, IR::Inst& inst) { IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; const auto info{inst.Flags()}; + if (!IsTextureTypeRescalable(info.type)) { + return; + } const IR::U1 is_scaled{ir.IsImageScaled(ir.Imm32(info.descriptor_index))}; ScaleIntegerComposite(ir, inst, is_scaled, 1); } From e0a383085598afd43d2960e40ff60288d25c7d0e Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Wed, 29 Sep 2021 21:34:56 -0400 Subject: [PATCH 083/156] gl_texture_cache: Fix BGR pbo size for scaled textures --- .../renderer_opengl/gl_texture_cache.cpp | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 7ded7415d..fafee62ee 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -1264,25 +1264,24 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span copies) { static constexpr VideoCommon::Offset3D zero_offset{0, 0, 0}; - const u32 requested_pbo_size = - std::max(src_image.unswizzled_size_bytes, dst_image.unswizzled_size_bytes); - - if (bgr_pbo_size < requested_pbo_size) { - bgr_pbo.Create(); - bgr_pbo_size = requested_pbo_size; - glNamedBufferData(bgr_pbo.handle, bgr_pbo_size, nullptr, GL_STREAM_COPY); - } + const u32 img_bpp = BytesPerBlock(src_image.info.format); for (const ImageCopy& copy : copies) { ASSERT(copy.src_offset == zero_offset); ASSERT(copy.dst_offset == zero_offset); - + const u32 num_src_layers = static_cast(copy.src_subresource.num_layers); + const u32 copy_size = copy.extent.width * copy.extent.height * num_src_layers * img_bpp; + if (bgr_pbo_size < copy_size) { + bgr_pbo.Create(); + bgr_pbo_size = copy_size; + glNamedBufferData(bgr_pbo.handle, bgr_pbo_size, nullptr, GL_STREAM_COPY); + } // Copy from source to PBO glPixelStorei(GL_PACK_ALIGNMENT, 1); glPixelStorei(GL_PACK_ROW_LENGTH, copy.extent.width); glBindBuffer(GL_PIXEL_PACK_BUFFER, bgr_pbo.handle); glGetTextureSubImage(src_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height, - copy.src_subresource.num_layers, src_image.GlFormat(), - src_image.GlType(), static_cast(bgr_pbo_size), nullptr); + num_src_layers, src_image.GlFormat(), src_image.GlType(), + static_cast(bgr_pbo_size), nullptr); // Copy from PBO to destination in desired GL format glPixelStorei(GL_UNPACK_ALIGNMENT, 1); From 237a43004fb27a273495a0b44515cf7389dea553 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 3 Oct 2021 22:42:29 +0200 Subject: [PATCH 084/156] Texture Cache: Fix calculations when scaling. --- src/video_core/texture_cache/texture_cache.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index b60f840c1..691198853 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -858,6 +858,12 @@ bool TextureCache

::ScaleUp(Image& image) { if (!rescaled) { return false; } + const auto& add_to_size = Settings::values.resolution_info.up_factor - 1.0f; + const auto sign = std::signbit(add_to_size); + const u64 tentative_size = static_cast( + std::max(image.guest_size_bytes, image.unswizzled_size_bytes) * std::abs(add_to_size)); + const u64 fitted_size = Common::AlignUp(tentative_size, 1024); + total_used_memory += sign ? -fitted_size : fitted_size; InvalidateScale(image); return true; } @@ -868,6 +874,12 @@ bool TextureCache

::ScaleDown(Image& image) { if (!rescaled) { return false; } + const auto& add_to_size = Settings::values.resolution_info.up_factor - 1.0f; + const auto sign = std::signbit(add_to_size); + const u64 tentative_size = static_cast( + std::max(image.guest_size_bytes, image.unswizzled_size_bytes) * std::abs(add_to_size)); + const u64 fitted_size = Common::AlignUp(tentative_size, 1024); + total_used_memory += sign ? fitted_size : -fitted_size; InvalidateScale(image); return true; } From 88ef04dbaf26ab83ec85bfa3c68434c283c66e50 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Tue, 5 Oct 2021 00:07:51 -0400 Subject: [PATCH 085/156] texture_cache: Refactor scaled image size calculation --- src/video_core/texture_cache/texture_cache.h | 24 +++++++++---------- .../texture_cache/texture_cache_base.h | 1 + 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 691198853..b708e41b5 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -852,18 +852,23 @@ void TextureCache

::InvalidateScale(Image& image) { has_deleted_images = true; } +template +u64 TextureCache

::GetScaledImageSizeBytes(Image& image) { + const f32 add_to_size = Settings::values.resolution_info.up_factor - 1.0f; + const bool sign = std::signbit(add_to_size); + const u32 image_size_bytes = std::max(image.guest_size_bytes, image.unswizzled_size_bytes); + const u64 tentative_size = static_cast(image_size_bytes * std::abs(add_to_size)); + const u64 fitted_size = Common::AlignUp(tentative_size, 1024); + return sign ? -fitted_size : fitted_size; +} + template bool TextureCache

::ScaleUp(Image& image) { const bool rescaled = image.ScaleUp(); if (!rescaled) { return false; } - const auto& add_to_size = Settings::values.resolution_info.up_factor - 1.0f; - const auto sign = std::signbit(add_to_size); - const u64 tentative_size = static_cast( - std::max(image.guest_size_bytes, image.unswizzled_size_bytes) * std::abs(add_to_size)); - const u64 fitted_size = Common::AlignUp(tentative_size, 1024); - total_used_memory += sign ? -fitted_size : fitted_size; + total_used_memory += GetScaledImageSizeBytes(image); InvalidateScale(image); return true; } @@ -874,12 +879,7 @@ bool TextureCache

::ScaleDown(Image& image) { if (!rescaled) { return false; } - const auto& add_to_size = Settings::values.resolution_info.up_factor - 1.0f; - const auto sign = std::signbit(add_to_size); - const u64 tentative_size = static_cast( - std::max(image.guest_size_bytes, image.unswizzled_size_bytes) * std::abs(add_to_size)); - const u64 fitted_size = Common::AlignUp(tentative_size, 1024); - total_used_memory += sign ? fitted_size : -fitted_size; + total_used_memory += GetScaledImageSizeBytes(image); InvalidateScale(image); return true; } diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 517a4c224..40e003b60 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -339,6 +339,7 @@ private: void InvalidateScale(Image& image); bool ScaleUp(Image& image); bool ScaleDown(Image& image); + u64 GetScaledImageSizeBytes(Image& image); Runtime& runtime; VideoCore::RasterizerInterface& rasterizer; From 31478c6c1b841b9a820742830b136775fafe270f Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Wed, 6 Oct 2021 01:18:00 -0400 Subject: [PATCH 086/156] video_core: Misc resolution scaling related refactoring --- src/common/settings.cpp | 2 +- .../renderer_opengl/gl_rasterizer.cpp | 4 +-- .../renderer_opengl/gl_texture_cache.cpp | 31 +++++++------------ .../renderer_opengl/gl_texture_cache.h | 11 ++++--- .../renderer_vulkan/vk_state_tracker.h | 10 +++--- .../renderer_vulkan/vk_texture_cache.cpp | 31 +++++++++++-------- .../renderer_vulkan/vk_texture_cache.h | 7 +++-- src/video_core/texture_cache/texture_cache.h | 2 -- 8 files changed, 51 insertions(+), 47 deletions(-) diff --git a/src/common/settings.cpp b/src/common/settings.cpp index f0686a7c5..12fdb0f9b 100644 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp @@ -107,7 +107,7 @@ float Volume() { } void UpdateRescalingInfo() { - auto setup = values.resolution_setup.GetValue(); + const auto setup = values.resolution_setup.GetValue(); auto& info = values.resolution_info; switch (setup) { case ResolutionSetup::Res1_2X: diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index d94f1e89f..bb24a0656 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -554,7 +554,7 @@ void RasterizerOpenGL::SyncViewport() { } glFrontFace(mode); } - if (dirty_viewport || flags[Dirty::ClipControl]) { + if (dirty_viewport || dirty_clip_control) { flags[Dirty::ClipControl] = false; bool flip_y = false; @@ -925,7 +925,7 @@ void RasterizerOpenGL::SyncScissorTest() { const auto& regs = maxwell3d.regs; const auto& resolution = Settings::values.resolution_info; - const auto scale_up = [&](u32 value) -> u32 { + const auto scale_up = [resolution](u32 value) -> u32 { if (value == 0) { return 0U; } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index fafee62ee..c68a51ebb 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -405,7 +405,8 @@ ImageBufferMap::~ImageBufferMap() { TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& program_manager, StateTracker& state_tracker_) - : device{device_}, state_tracker{state_tracker_}, util_shaders(program_manager) { + : device{device_}, state_tracker{state_tracker_}, + util_shaders(program_manager), resolution{Settings::values.resolution_info} { static constexpr std::array TARGETS{GL_TEXTURE_1D_ARRAY, GL_TEXTURE_2D_ARRAY, GL_TEXTURE_3D}; for (size_t i = 0; i < TARGETS.size(); ++i) { const GLenum target = TARGETS[i]; @@ -473,7 +474,6 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& set_view(Shader::TextureType::ColorArray2D, null_image_view_2d_array.handle); set_view(Shader::TextureType::ColorArrayCube, null_image_cube_array.handle); - resolution = Settings::values.resolution_info; if (resolution.active) { for (size_t i = 0; i < rescale_draw_fbos.size(); ++i) { rescale_draw_fbos[i].Create(); @@ -681,7 +681,7 @@ Image::Image(TextureCacheRuntime& runtime_, const VideoCommon::ImageInfo& info_, gl_type = tuple.type; } texture = MakeImage(info, gl_internal_format); - original_backup = texture.handle; + current_texture = texture.handle; if (runtime->device.HasDebuggingToolAttached()) { const std::string name = VideoCommon::Name(*this); glObjectLabel(ImageTarget(info) == GL_TEXTURE_BUFFER ? GL_BUFFER : GL_TEXTURE, @@ -726,10 +726,6 @@ void Image::UploadMemory(const ImageBufferMap& map, void Image::DownloadMemory(ImageBufferMap& map, std::span copies) { glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API - const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); - if (is_rescaled) { - ScaleDown(); - } glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer); glPixelStorei(GL_PACK_ALIGNMENT, 1); @@ -747,9 +743,6 @@ void Image::DownloadMemory(ImageBufferMap& map, } CopyImageToBuffer(copy, map.offset); } - if (is_rescaled) { - texture.handle = upscaled_backup.handle; - } } GLuint Image::StorageHandle() noexcept { @@ -775,11 +768,11 @@ GLuint Image::StorageHandle() noexcept { return store_view.handle; } store_view.Create(); - glTextureView(store_view.handle, ImageTarget(info), texture.handle, GL_RGBA8, 0, + glTextureView(store_view.handle, ImageTarget(info), current_texture, GL_RGBA8, 0, info.resources.levels, 0, info.resources.layers); return store_view.handle; default: - return texture.handle; + return current_texture; } } @@ -940,10 +933,10 @@ bool Image::Scale() { const u32 original_width = info.size.width; const u32 original_height = info.size.height; - auto dst_info = info; - dst_info.size.width = scaled_width; - dst_info.size.height = scaled_height; if (!upscaled_backup.handle) { + auto dst_info = info; + dst_info.size.width = scaled_width; + dst_info.size.height = scaled_height; upscaled_backup = MakeImage(dst_info, gl_internal_format); } const GLuint read_fbo = runtime->rescale_read_fbos[fbo_index].handle; @@ -955,14 +948,14 @@ bool Image::Scale() { const u32 dst_level_width = std::max(1u, scaled_width >> level); const u32 dst_level_height = std::max(1u, scaled_height >> level); - glNamedFramebufferTextureLayer(read_fbo, attachment, original_backup, level, layer); + glNamedFramebufferTextureLayer(read_fbo, attachment, texture.handle, level, layer); glNamedFramebufferTextureLayer(draw_fbo, attachment, upscaled_backup.handle, level, layer); glBlitNamedFramebuffer(read_fbo, draw_fbo, 0, 0, src_level_width, src_level_height, 0, 0, dst_level_width, dst_level_height, mask, filter); } } - texture.handle = upscaled_backup.handle; + current_texture = upscaled_backup.handle; return true; } @@ -993,7 +986,7 @@ bool Image::ScaleDown() { return false; } flags &= ~ImageFlagBits::Rescaled; - texture.handle = original_backup; + current_texture = texture.handle; return true; } @@ -1010,7 +1003,7 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI flat_range = info.range; set_object_label = device.HasDebuggingToolAttached(); is_render_target = info.IsRenderTarget(); - original_texture = image.texture.handle; + original_texture = image.Handle(); num_samples = image.info.num_samples; if (!is_render_target) { swizzle[0] = info.x_source; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 61f9b0259..cf7f37a16 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -9,13 +9,16 @@ #include -#include "common/settings.h" #include "shader_recompiler/shader_info.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/util_shaders.h" #include "video_core/texture_cache/image_view_base.h" #include "video_core/texture_cache/texture_cache_base.h" +namespace Settings { +struct ResolutionScalingInfo; +} + namespace OpenGL { class Device; @@ -155,7 +158,7 @@ private: std::array rescale_draw_fbos; std::array rescale_read_fbos; - Settings::ResolutionScalingInfo resolution; + const Settings::ResolutionScalingInfo& resolution; }; class Image : public VideoCommon::ImageBase { @@ -182,7 +185,7 @@ public: GLuint StorageHandle() noexcept; GLuint Handle() const noexcept { - return texture.handle; + return current_texture; } GLuint GlFormat() const noexcept { @@ -211,7 +214,7 @@ private: GLenum gl_format = GL_NONE; GLenum gl_type = GL_NONE; TextureCacheRuntime* runtime{}; - GLuint original_backup{}; + GLuint current_texture{}; }; class ImageView : public VideoCommon::ImageViewBase { diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.h b/src/video_core/renderer_vulkan/vk_state_tracker.h index ac2bbebe0..40a149832 100644 --- a/src/video_core/renderer_vulkan/vk_state_tracker.h +++ b/src/video_core/renderer_vulkan/vk_state_tracker.h @@ -71,13 +71,15 @@ public: } bool TouchViewports() { - return Exchange(Dirty::Viewports, false) || - Exchange(VideoCommon::Dirty::RescaleViewports, false); + const bool dirty_viewports = Exchange(Dirty::Viewports, false); + const bool rescale_viewports = Exchange(VideoCommon::Dirty::RescaleViewports, false); + return dirty_viewports || rescale_viewports; } bool TouchScissors() { - return Exchange(Dirty::Scissors, false) || - Exchange(VideoCommon::Dirty::RescaleScissors, false); + const bool dirty_scissors = Exchange(Dirty::Scissors, false); + const bool rescale_scissors = Exchange(VideoCommon::Dirty::RescaleScissors, false); + return dirty_scissors || rescale_scissors; } bool TouchDepthBias() { diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 4f0bab274..930c7d569 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -125,8 +125,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { } } -[[nodiscard]] VkImageCreateInfo MakeImageCreateInfo(const Device& device, const ImageInfo& info, - u32 up, u32 down) { +[[nodiscard]] VkImageCreateInfo MakeImageCreateInfo(const Device& device, const ImageInfo& info) { const PixelFormat format = StorageFormat(info.format); const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, format); VkImageCreateFlags flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; @@ -137,9 +136,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { if (info.type == ImageType::e3D) { flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT; } - const auto scale_up = [&](u32 value) { return std::max((value * up) >> down, 1U); }; const auto [samples_x, samples_y] = VideoCommon::SamplesLog2(info.num_samples); - const bool is_2d = info.type == ImageType::e2D; return VkImageCreateInfo{ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .pNext = nullptr, @@ -147,8 +144,8 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { .imageType = ConvertImageType(info.type), .format = format_info.format, .extent{ - .width = scale_up(info.size.width) >> samples_x, - .height = (is_2d ? scale_up(info.size.height) : info.size.height) >> samples_y, + .width = info.size.width >> samples_x, + .height = info.size.height >> samples_y, .depth = info.size.depth, }, .mipLevels = static_cast(info.resources.levels), @@ -163,12 +160,11 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { }; } -[[nodiscard]] vk::Image MakeImage(const Device& device, const ImageInfo& info, u32 up = 1, - u32 down = 0) { +[[nodiscard]] vk::Image MakeImage(const Device& device, const ImageInfo& info) { if (info.type == ImageType::Buffer) { return vk::Image{}; } - return device.GetLogical().CreateImage(MakeImageCreateInfo(device, info, up, down)); + return device.GetLogical().CreateImage(MakeImageCreateInfo(device, info)); } [[nodiscard]] VkImageAspectFlags ImageAspectMask(PixelFormat format) { @@ -860,10 +856,9 @@ void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, nullptr, nullptr, read_barriers); if (is_resolve) { - VkImageResolve resolve_info = - MakeImageResolve(dst_region, src_region, dst_layers, src_layers); cmdbuf.ResolveImage(src_image, VK_IMAGE_LAYOUT_GENERAL, dst_image, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, resolve_info); + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + MakeImageResolve(dst_region, src_region, dst_layers, src_layers)); } else { const bool is_linear = filter == Fermi2D::Filter::Bilinear; const VkFilter vk_filter = is_linear ? VK_FILTER_LINEAR : VK_FILTER_NEAREST; @@ -1143,7 +1138,17 @@ bool Image::ScaleUp() { } const auto& device = runtime->device; if (!scaled_image) { - scaled_image = MakeImage(device, info, resolution.up_scale, resolution.down_shift); + const u32 up = resolution.up_scale; + const u32 down = resolution.down_shift; + const auto scale = [&](u32 value) { return std::max((value * up) >> down, 1U); }; + + const bool is_2d = info.type == ImageType::e2D; + const u32 scaled_width = scale(info.size.width); + const u32 scaled_height = is_2d ? scale(info.size.height) : info.size.height; + auto scaled_info = info; + scaled_info.size.width = scaled_width; + scaled_info.size.height = scaled_height; + scaled_image = MakeImage(device, scaled_info); auto& allocator = runtime->memory_allocator; scaled_commit = MemoryCommit(allocator.Commit(scaled_image, MemoryUsage::DeviceLocal)); } diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index e5060e3f1..5381343e9 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -6,7 +6,6 @@ #include -#include "common/settings.h" #include "shader_recompiler/shader_info.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/texture_cache/image_view_base.h" @@ -14,6 +13,10 @@ #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" +namespace Settings { +struct ResolutionScalingInfo; +} + namespace Vulkan { using VideoCommon::ImageId; @@ -86,7 +89,7 @@ public: BlitImageHelper& blit_image_helper; ASTCDecoderPass& astc_decoder_pass; RenderPassCache& render_pass_cache; - Settings::ResolutionScalingInfo resolution; + const Settings::ResolutionScalingInfo& resolution; }; class Image : public VideoCommon::ImageBase { diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index b708e41b5..630c73005 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -1726,9 +1726,7 @@ void TextureCache

::CopyImage(ImageId dst_id, ImageId src_id, std::vector Date: Wed, 6 Oct 2021 02:02:05 -0400 Subject: [PATCH 087/156] vk_texture_cache: Fix early returns on unsupported scales --- .../renderer_vulkan/vk_texture_cache.cpp | 28 +++++++------------ src/video_core/texture_cache/texture_cache.h | 2 +- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 930c7d569..1ab2b1fe9 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1130,13 +1130,19 @@ bool Image::ScaleUp() { return false; } ASSERT(info.type != ImageType::Linear); - flags |= ImageFlagBits::Rescaled; - const auto& resolution = runtime->resolution; if (!resolution.active) { - return true; + return false; } const auto& device = runtime->device; + const PixelFormat format = StorageFormat(info.format); + const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, format); + const auto similar = device.GetSupportedFormat( + format_info.format, (VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT), + FormatType::Optimal); + if (similar != format_info.format) { + return true; + } if (!scaled_image) { const u32 up = resolution.up_scale; const u32 down = resolution.down_shift; @@ -1155,23 +1161,9 @@ bool Image::ScaleUp() { if (aspect_mask == 0) { aspect_mask = ImageAspectMask(info.format); } - if (info.num_samples > 1) { - return true; - } - const PixelFormat format = StorageFormat(info.format); - const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, format); - const auto similar = device.GetSupportedFormat( - format_info.format, (VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT), - FormatType::Optimal); - - if (similar != format_info.format) { - return true; - } - if (aspect_mask == 0) { - aspect_mask = ImageAspectMask(info.format); - } BlitScale(*scheduler, *original_image, *scaled_image, info, aspect_mask, resolution, true); current_image = *scaled_image; + flags |= ImageFlagBits::Rescaled; return true; } diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 630c73005..de522cc43 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -857,7 +857,7 @@ u64 TextureCache

::GetScaledImageSizeBytes(Image& image) { const f32 add_to_size = Settings::values.resolution_info.up_factor - 1.0f; const bool sign = std::signbit(add_to_size); const u32 image_size_bytes = std::max(image.guest_size_bytes, image.unswizzled_size_bytes); - const u64 tentative_size = static_cast(image_size_bytes * std::abs(add_to_size)); + const u64 tentative_size = image_size_bytes * static_cast(std::abs(add_to_size)); const u64 fitted_size = Common::AlignUp(tentative_size, 1024); return sign ? -fitted_size : fitted_size; } From 89a7e566c7a101d688e96641cc2a485f2da54d4b Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Thu, 7 Oct 2021 02:15:16 -0400 Subject: [PATCH 088/156] vk_texture_cache: Fix unsupported blit format error checking --- src/video_core/renderer_vulkan/vk_texture_cache.cpp | 10 +++++----- src/video_core/vulkan_common/vulkan_device.h | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 1ab2b1fe9..65506f75e 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1137,11 +1137,11 @@ bool Image::ScaleUp() { const auto& device = runtime->device; const PixelFormat format = StorageFormat(info.format); const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, format); - const auto similar = device.GetSupportedFormat( - format_info.format, (VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT), - FormatType::Optimal); - if (similar != format_info.format) { - return true; + const auto blit_usage = VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; + if (!device.IsFormatSupported(format_info.format, blit_usage, FormatType::Optimal)) { + LOG_ERROR(Render_Vulkan, "Device does not support scaling format {}", format); + // TODO: Use helper blits where applicable + return false; } if (!scaled_image) { const u32 up = resolution.up_scale; diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 2d5daf6cd..10653ac6b 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -40,6 +40,10 @@ public: VkFormat GetSupportedFormat(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, FormatType format_type) const; + /// Returns true if a format is supported. + bool IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, + FormatType format_type) const; + /// Reports a device loss. void ReportLoss() const; @@ -370,10 +374,6 @@ private: /// Returns true if the device natively supports blitting depth stencil images. bool TestDepthStencilBlits() const; - /// Returns true if a format is supported. - bool IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, - FormatType format_type) const; - VkInstance instance; ///< Vulkan instance. vk::DeviceDispatch dld; ///< Device function pointers. vk::PhysicalDevice physical; ///< Physical device. From 3233fa5dc8780975497dc8ce70d10d0186e50b62 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Thu, 7 Oct 2021 23:55:40 -0400 Subject: [PATCH 089/156] gl_texture_cache: Disable scissor test when scaling textures Fixes a bug on BOTW where some objects were no longer being rendered after blitting --- src/video_core/renderer_opengl/gl_texture_cache.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index c68a51ebb..3dfd13d6a 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -939,6 +939,11 @@ bool Image::Scale() { dst_info.size.height = scaled_height; upscaled_backup = MakeImage(dst_info, gl_internal_format); } + // TODO (ameerj): Investigate other GL states that affect blitting. + GLboolean scissor_test; + glGetBooleani_v(GL_SCISSOR_TEST, 0, &scissor_test); + glDisablei(GL_SCISSOR_TEST, 0); + const GLuint read_fbo = runtime->rescale_read_fbos[fbo_index].handle; const GLuint draw_fbo = runtime->rescale_draw_fbos[fbo_index].handle; for (s32 layer = 0; layer < info.resources.layers; ++layer) { @@ -955,6 +960,9 @@ bool Image::Scale() { 0, dst_level_width, dst_level_height, mask, filter); } } + if (scissor_test != GL_FALSE) { + glEnablei(GL_SCISSOR_TEST, 0); + } current_texture = upscaled_backup.handle; return true; } From 1c93476a803f8cb93de341a43a013d5cc302b05b Mon Sep 17 00:00:00 2001 From: lat9nq <22451773+lat9nq@users.noreply.github.com> Date: Fri, 8 Oct 2021 20:47:14 -0400 Subject: [PATCH 090/156] video_core,yuzu: Move UpdateRescalingInfo call to video_core This only needs to happen once per game boot, so we can just call it during CreateGPU and be done with it, avoiding the need to call it in the frontends. --- src/video_core/video_core.cpp | 2 ++ src/yuzu/configuration/config.cpp | 4 ---- src/yuzu/configuration/configure_graphics.cpp | 1 - 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index 508173db3..e852c817e 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -37,6 +37,8 @@ std::unique_ptr CreateRenderer( namespace VideoCore { std::unique_ptr CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) { + Settings::UpdateRescalingInfo(); + const auto nvdec_value = Settings::values.nvdec_emulation.GetValue(); const bool use_nvdec = nvdec_value != Settings::NvdecEmulation::Off; const bool use_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index 3803bf501..4c296a94d 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -850,8 +850,6 @@ void Config::ReadRendererValues() { ReadBasicSetting(Settings::values.disable_shader_loop_safety_checks); } - Settings::UpdateRescalingInfo(); - qt_config->endGroup(); } @@ -1409,8 +1407,6 @@ void Config::SaveRendererValues() { WriteBasicSetting(Settings::values.disable_shader_loop_safety_checks); } - Settings::UpdateRescalingInfo(); - qt_config->endGroup(); } diff --git a/src/yuzu/configuration/configure_graphics.cpp b/src/yuzu/configuration/configure_graphics.cpp index e01efaeda..02498fad7 100644 --- a/src/yuzu/configuration/configure_graphics.cpp +++ b/src/yuzu/configuration/configure_graphics.cpp @@ -247,7 +247,6 @@ void ConfigureGraphics::ApplyConfiguration() { Settings::values.bg_blue.SetValue(static_cast(bg_color.blue())); } } - Settings::UpdateRescalingInfo(); } void ConfigureGraphics::changeEvent(QEvent* event) { From 49c0c7efd2485e01cd014928f7987533ce62509b Mon Sep 17 00:00:00 2001 From: lat9nq <22451773+lat9nq@users.noreply.github.com> Date: Fri, 8 Oct 2021 20:48:10 -0400 Subject: [PATCH 091/156] yuzu_cmd: Read resolution_setup and scaling_filter from config Also adds descriptions and the settings to the default config. --- src/yuzu_cmd/config.cpp | 2 ++ src/yuzu_cmd/default_ini.h | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp index 0b8fde691..3c888d84e 100644 --- a/src/yuzu_cmd/config.cpp +++ b/src/yuzu_cmd/config.cpp @@ -451,6 +451,8 @@ void Config::ReadValues() { ReadSetting("Renderer", Settings::values.disable_shader_loop_safety_checks); ReadSetting("Renderer", Settings::values.vulkan_device); + ReadSetting("Renderer", Settings::values.resolution_setup); + ReadSetting("Renderer", Settings::values.scaling_filter); ReadSetting("Renderer", Settings::values.fullscreen_mode); ReadSetting("Renderer", Settings::values.aspect_ratio); ReadSetting("Renderer", Settings::values.max_anisotropy); diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h index 339dca766..ecdc271a8 100644 --- a/src/yuzu_cmd/default_ini.h +++ b/src/yuzu_cmd/default_ini.h @@ -236,6 +236,29 @@ disable_shader_loop_safety_checks = # Which Vulkan physical device to use (defaults to 0) vulkan_device = +# 0: 0.5x (360p/540p) [EXPERIMENTAL] +# 1: 0.75x (540p/810p) [EXPERIMENTAL] +# 2 (default): 1x (720p/1080p) +# 3: 2x (1440p/2160p) +# 4: 3x (2160p/3240p) +# 5: 4x (2880p/4320p) +# 6: 5x (3600p/5400p) +# 7: 6x (4320p/6480p) +resolution_setup = + +# Pixel filter to use when up- or down-sampling rendered frames. +# 0: Nearest Neighbor +# 1 (default): Bilinear +# 2: Bicubic +# 3: Gaussian +# 4: ScaleForce +# 5: AMD FidelityFX™️ Super Resolution [Vulkan Only] +scaling_filter = + +# Anti-Aliasing (AA) +# 0 (default): None, 1: FXAA +anti_aliasing = + # Whether to use fullscreen or borderless window mode # 0 (Windows default): Borderless window, 1 (All other default): Exclusive fullscreen fullscreen_mode = From b14f2c7c826b8bbea02c1f2674ab024a5ae0695e Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Mon, 11 Oct 2021 23:55:53 -0400 Subject: [PATCH 092/156] texture_cache: Fix image resolves when src/dst are not both scaled --- src/video_core/texture_cache/texture_cache.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index de522cc43..38895c2e9 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -475,12 +475,10 @@ void TextureCache

::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, Image& dst_image = slot_images[dst_id]; Image& src_image = slot_images[src_id]; - - bool is_resolve = src_image.info.num_samples != 1 && dst_image.info.num_samples == 1; - bool is_src_rescaled = True(src_image.flags & ImageFlagBits::Rescaled); bool is_dst_rescaled = True(dst_image.flags & ImageFlagBits::Rescaled); + const bool is_resolve = src_image.info.num_samples != 1 && dst_image.info.num_samples == 1; if (is_src_rescaled != is_dst_rescaled) { if (ImageCanRescale(src_image)) { ScaleUp(src_image); @@ -498,7 +496,13 @@ void TextureCache

::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, is_dst_rescaled = True(dst_image.flags & ImageFlagBits::Rescaled); } } - + if (is_resolve && (is_src_rescaled != is_dst_rescaled)) { + // A resolve requires both images to be the same dimensions. Resize down if needed. + ScaleDown(src_image); + ScaleDown(dst_image); + is_src_rescaled = True(src_image.flags & ImageFlagBits::Rescaled); + is_dst_rescaled = True(dst_image.flags & ImageFlagBits::Rescaled); + } const auto& resolution = Settings::values.resolution_info; const auto scale_up = [&](u32 value) -> u32 { if (value == 0) { @@ -506,7 +510,6 @@ void TextureCache

::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, } return std::max((value * resolution.up_scale) >> resolution.down_shift, 1U); }; - const auto scale_region = [&](Region2D& region) { region.start.x = scale_up(region.start.x); region.start.y = scale_up(region.start.y); From abd07e41582b6d8f7efdedb936cdd7a7fddf9912 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Tue, 12 Oct 2021 00:35:01 -0400 Subject: [PATCH 093/156] video_core: Refactor resolution scale function --- src/common/settings.h | 14 +++++++ .../renderer_opengl/gl_texture_cache.cpp | 8 +--- .../renderer_vulkan/vk_texture_cache.cpp | 19 +++------ src/video_core/texture_cache/texture_cache.h | 39 ++++++------------- 4 files changed, 34 insertions(+), 46 deletions(-) diff --git a/src/common/settings.h b/src/common/settings.h index f629c7c56..09f7cdd84 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -72,6 +72,20 @@ struct ResolutionScalingInfo { f32 up_factor{1.0f}; f32 down_factor{1.0f}; bool active{}; + + s32 ScaleUp(s32 value) const { + if (value == 0) { + return 0; + } + return std::max((value * static_cast(up_scale)) >> static_cast(down_shift), 1); + } + + u32 ScaleUp(u32 value) const { + if (value == 0U) { + return 0U; + } + return std::max((value * up_scale) >> down_shift, 1U); + } }; /** The BasicSetting class is a simple resource manager. It defines a label and default value diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 3dfd13d6a..ec1afd31a 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -924,12 +924,8 @@ bool Image::Scale() { const GLenum filter = linear_color_format ? GL_LINEAR : GL_NEAREST; const auto& resolution = runtime->resolution; - const u32 up = resolution.up_scale; - const u32 down = resolution.down_shift; - const auto scale = [&](u32 value) { return std::max((value * up) >> down, 1U); }; - - const u32 scaled_width = scale(info.size.width); - const u32 scaled_height = is_2d ? scale(info.size.height) : info.size.height; + const u32 scaled_width = resolution.ScaleUp(info.size.width); + const u32 scaled_height = is_2d ? resolution.ScaleUp(info.size.height) : info.size.height; const u32 original_width = info.size.width; const u32 original_height = info.size.height; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 65506f75e..caefce5fc 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -607,16 +607,13 @@ void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, con scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([dst_image, src_image, extent, resources, aspect_mask, resolution, type, scaling, vk_filter](vk::CommandBuffer cmdbuf) { - const auto scale_up = [&](u32 value) { - return std::max((value * resolution.up_scale) >> resolution.down_shift, 1U); - }; const VkOffset2D src_size{ - .x = static_cast(scaling ? extent.width : scale_up(extent.width)), - .y = static_cast(scaling ? extent.height : scale_up(extent.height)), + .x = static_cast(scaling ? extent.width : resolution.ScaleUp(extent.width)), + .y = static_cast(scaling ? extent.height : resolution.ScaleUp(extent.height)), }; const VkOffset2D dst_size{ - .x = static_cast(scaling ? scale_up(extent.width) : extent.width), - .y = static_cast(scaling ? scale_up(extent.height) : extent.height), + .x = static_cast(scaling ? resolution.ScaleUp(extent.width) : extent.width), + .y = static_cast(scaling ? resolution.ScaleUp(extent.height) : extent.height), }; boost::container::small_vector regions; regions.reserve(resources.levels); @@ -1144,13 +1141,9 @@ bool Image::ScaleUp() { return false; } if (!scaled_image) { - const u32 up = resolution.up_scale; - const u32 down = resolution.down_shift; - const auto scale = [&](u32 value) { return std::max((value * up) >> down, 1U); }; - const bool is_2d = info.type == ImageType::e2D; - const u32 scaled_width = scale(info.size.width); - const u32 scaled_height = is_2d ? scale(info.size.height) : info.size.height; + const u32 scaled_width = resolution.ScaleUp(info.size.width); + const u32 scaled_height = is_2d ? resolution.ScaleUp(info.size.height) : info.size.height; auto scaled_info = info; scaled_info.size.width = scaled_width; scaled_info.size.height = scaled_height; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 38895c2e9..c77332b46 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -504,17 +504,11 @@ void TextureCache

::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, is_dst_rescaled = True(dst_image.flags & ImageFlagBits::Rescaled); } const auto& resolution = Settings::values.resolution_info; - const auto scale_up = [&](u32 value) -> u32 { - if (value == 0) { - return 0U; - } - return std::max((value * resolution.up_scale) >> resolution.down_shift, 1U); - }; const auto scale_region = [&](Region2D& region) { - region.start.x = scale_up(region.start.x); - region.start.y = scale_up(region.start.y); - region.end.x = scale_up(region.end.x); - region.end.y = scale_up(region.end.y); + region.start.x = resolution.ScaleUp(region.start.x); + region.start.y = resolution.ScaleUp(region.start.y); + region.end.x = resolution.ScaleUp(region.end.x); + region.end.y = resolution.ScaleUp(region.end.y); }; // TODO: Deduplicate @@ -1721,20 +1715,14 @@ void TextureCache

::CopyImage(ImageId dst_id, ImageId src_id, std::vector u32 { - if (value == 0) { - return 0U; - } - return std::max((value * resolution.up_scale) >> resolution.down_shift, 1U); - }; for (auto& copy : copies) { - copy.src_offset.x = scale_up(copy.src_offset.x); - copy.dst_offset.x = scale_up(copy.dst_offset.x); - copy.extent.width = scale_up(copy.extent.width); + copy.src_offset.x = resolution.ScaleUp(copy.src_offset.x); + copy.dst_offset.x = resolution.ScaleUp(copy.dst_offset.x); + copy.extent.width = resolution.ScaleUp(copy.extent.width); if (both_2d) { - copy.src_offset.y = scale_up(copy.src_offset.y); - copy.dst_offset.y = scale_up(copy.dst_offset.y); - copy.extent.height = scale_up(copy.extent.height); + copy.src_offset.y = resolution.ScaleUp(copy.src_offset.y); + copy.dst_offset.y = resolution.ScaleUp(copy.dst_offset.y); + copy.extent.height = resolution.ScaleUp(copy.extent.height); } } } @@ -1812,12 +1800,9 @@ std::pair TextureCache

::RenderTargetFromImage( Extent3D extent = MipSize(image.info.size, view_info.range.base.level); if (is_rescaled) { const auto& resolution = Settings::values.resolution_info; - const auto scale_up = [&](u32 value) { - return std::max((value * resolution.up_scale) >> resolution.down_shift, 1U); - }; - extent.width = scale_up(extent.width); + extent.width = resolution.ScaleUp(extent.width); if (image.info.type == ImageType::e2D) { - extent.height = scale_up(extent.height); + extent.height = resolution.ScaleUp(extent.height); } } const u32 num_samples = image.info.num_samples; From b1ae935f114e1011b19d4ada352c401e6655279a Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Tue, 12 Oct 2021 00:54:28 -0400 Subject: [PATCH 094/156] vk_texture_cache: Fix BlitScale of non-2D images --- .../renderer_vulkan/vk_texture_cache.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index caefce5fc..ccfdf64ea 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -592,9 +592,8 @@ struct RangedBarrierRange { } void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, const ImageInfo& info, - VkImageAspectFlags aspect_mask, const Settings::ResolutionScalingInfo& resolution, - bool scaling) { - const auto type = info.type; + VkImageAspectFlags aspect_mask, const Settings::ResolutionScalingInfo& resolution) { + const bool is_2d = info.type == ImageType::e2D; const auto resources = info.resources; const VkExtent2D extent{ .width = info.size.width, @@ -605,15 +604,15 @@ void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, con const VkFilter vk_filter = (is_zeta || is_int_format) ? VK_FILTER_NEAREST : VK_FILTER_LINEAR; scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([dst_image, src_image, extent, resources, aspect_mask, resolution, type, - scaling, vk_filter](vk::CommandBuffer cmdbuf) { + scheduler.Record([dst_image, src_image, extent, resources, aspect_mask, resolution, is_2d, + vk_filter](vk::CommandBuffer cmdbuf) { const VkOffset2D src_size{ - .x = static_cast(scaling ? extent.width : resolution.ScaleUp(extent.width)), - .y = static_cast(scaling ? extent.height : resolution.ScaleUp(extent.height)), + .x = static_cast(extent.width), + .y = static_cast(extent.height), }; const VkOffset2D dst_size{ - .x = static_cast(scaling ? resolution.ScaleUp(extent.width) : extent.width), - .y = static_cast(scaling ? resolution.ScaleUp(extent.height) : extent.height), + .x = static_cast(resolution.ScaleUp(extent.width)), + .y = static_cast(is_2d ? resolution.ScaleUp(extent.height) : extent.height), }; boost::container::small_vector regions; regions.reserve(resources.levels); @@ -1154,7 +1153,7 @@ bool Image::ScaleUp() { if (aspect_mask == 0) { aspect_mask = ImageAspectMask(info.format); } - BlitScale(*scheduler, *original_image, *scaled_image, info, aspect_mask, resolution, true); + BlitScale(*scheduler, *original_image, *scaled_image, info, aspect_mask, resolution); current_image = *scaled_image; flags |= ImageFlagBits::Rescaled; return true; From 4de584005fe8ae00608f8c3267a78e7cf0eb52aa Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Tue, 12 Oct 2021 01:45:54 -0400 Subject: [PATCH 095/156] texture_cache: Fix infinitely recursive ImageCanRescale check --- src/video_core/texture_cache/image_base.cpp | 2 ++ src/video_core/texture_cache/image_base.h | 5 +++-- src/video_core/texture_cache/texture_cache.h | 16 ++++++++-------- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/video_core/texture_cache/image_base.cpp b/src/video_core/texture_cache/image_base.cpp index 25a211df8..1909c9ecb 100644 --- a/src/video_core/texture_cache/image_base.cpp +++ b/src/video_core/texture_cache/image_base.cpp @@ -256,6 +256,8 @@ void AddImageAlias(ImageBase& lhs, ImageBase& rhs, ImageId lhs_id, ImageId rhs_i } lhs.aliased_images.push_back(std::move(lhs_alias)); rhs.aliased_images.push_back(std::move(rhs_alias)); + lhs.flags &= ~ImageFlagBits::IsRescalable; + rhs.flags &= ~ImageFlagBits::IsRescalable; } } // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index 9c34687e0..bab290ac7 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -36,8 +36,9 @@ enum class ImageFlagBits : u32 { // Rescaler Rescaled = 1 << 12, - RescaleChecked = 1 << 13, - Blacklisted = 1 << 14, + CheckingRescalable = 1 << 13, + IsRescalable = 1 << 14, + Blacklisted = 1 << 15, }; DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index c77332b46..c1fb12679 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -795,25 +795,25 @@ bool TextureCache

::BlackListImage(ImageId image_id) { template bool TextureCache

::ImageCanRescale(ImageBase& image) { - if (True(image.flags & ImageFlagBits::Blacklisted)) { + if (!image.info.rescaleable || True(image.flags & ImageFlagBits::Blacklisted)) { return false; } - if (True(image.flags & (ImageFlagBits::Rescaled | ImageFlagBits::RescaleChecked))) { + if (True(image.flags & (ImageFlagBits::Rescaled | ImageFlagBits::CheckingRescalable))) { return true; } - if (!image.info.rescaleable) { - image.flags &= ~ImageFlagBits::RescaleChecked; - return false; + if (True(image.flags & ImageFlagBits::IsRescalable)) { + return true; } - image.flags |= ImageFlagBits::RescaleChecked; + image.flags |= ImageFlagBits::CheckingRescalable; for (const auto& alias : image.aliased_images) { Image& other_image = slot_images[alias.id]; if (!ImageCanRescale(other_image)) { - image.flags &= ~ImageFlagBits::RescaleChecked; + image.flags &= ~ImageFlagBits::CheckingRescalable; return false; } } - image.flags &= ~ImageFlagBits::RescaleChecked; + image.flags &= ~ImageFlagBits::CheckingRescalable; + image.flags |= ImageFlagBits::IsRescalable; return true; } From ebf36f23dd781c06fd100de10cc2ec25d4bec215 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Wed, 13 Oct 2021 23:53:59 -0400 Subject: [PATCH 096/156] vk_texture_cache: Use 3D to scale images when blit is unsupported --- src/video_core/renderer_vulkan/blit_image.cpp | 3 +- src/video_core/renderer_vulkan/blit_image.h | 2 +- .../renderer_vulkan/vk_texture_cache.cpp | 100 +++++++++++++----- .../renderer_vulkan/vk_texture_cache.h | 11 +- 4 files changed, 87 insertions(+), 29 deletions(-) diff --git a/src/video_core/renderer_vulkan/blit_image.cpp b/src/video_core/renderer_vulkan/blit_image.cpp index 6c1b2f063..b97aac550 100644 --- a/src/video_core/renderer_vulkan/blit_image.cpp +++ b/src/video_core/renderer_vulkan/blit_image.cpp @@ -363,7 +363,7 @@ BlitImageHelper::BlitImageHelper(const Device& device_, VKScheduler& scheduler_, BlitImageHelper::~BlitImageHelper() = default; -void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, const ImageView& src_image_view, +void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, VkImageView src_view, const Region2D& dst_region, const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Operation operation) { @@ -373,7 +373,6 @@ void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, const ImageV .operation = operation, }; const VkPipelineLayout layout = *one_texture_pipeline_layout; - const VkImageView src_view = src_image_view.Handle(Shader::TextureType::Color2D); const VkSampler sampler = is_linear ? *linear_sampler : *nearest_sampler; const VkPipeline pipeline = FindOrEmplacePipeline(key); scheduler.RequestRenderpass(dst_framebuffer); diff --git a/src/video_core/renderer_vulkan/blit_image.h b/src/video_core/renderer_vulkan/blit_image.h index 33ee095c1..e11f8c214 100644 --- a/src/video_core/renderer_vulkan/blit_image.h +++ b/src/video_core/renderer_vulkan/blit_image.h @@ -34,7 +34,7 @@ public: StateTracker& state_tracker, DescriptorPool& descriptor_pool); ~BlitImageHelper(); - void BlitColor(const Framebuffer* dst_framebuffer, const ImageView& src_image_view, + void BlitColor(const Framebuffer* dst_framebuffer, VkImageView src_image_view, const Region2D& dst_region, const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Operation operation); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index ccfdf64ea..9b90c7d9b 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -762,8 +762,8 @@ void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst return; } if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT && !is_src_msaa && !is_dst_msaa) { - blit_image_helper.BlitColor(dst_framebuffer, src, dst_region, src_region, filter, - operation); + blit_image_helper.BlitColor(dst_framebuffer, src.Handle(Shader::TextureType::Color2D), + dst_region, src_region, filter, operation); return; } if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { @@ -1131,18 +1131,10 @@ bool Image::ScaleUp() { return false; } const auto& device = runtime->device; - const PixelFormat format = StorageFormat(info.format); - const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, format); - const auto blit_usage = VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; - if (!device.IsFormatSupported(format_info.format, blit_usage, FormatType::Optimal)) { - LOG_ERROR(Render_Vulkan, "Device does not support scaling format {}", format); - // TODO: Use helper blits where applicable - return false; - } + const bool is_2d = info.type == ImageType::e2D; + const u32 scaled_width = resolution.ScaleUp(info.size.width); + const u32 scaled_height = is_2d ? resolution.ScaleUp(info.size.height) : info.size.height; if (!scaled_image) { - const bool is_2d = info.type == ImageType::e2D; - const u32 scaled_width = resolution.ScaleUp(info.size.width); - const u32 scaled_height = is_2d ? resolution.ScaleUp(info.size.height) : info.size.height; auto scaled_info = info; scaled_info.size.width = scaled_width; scaled_info.size.height = scaled_height; @@ -1150,11 +1142,56 @@ bool Image::ScaleUp() { auto& allocator = runtime->memory_allocator; scaled_commit = MemoryCommit(allocator.Commit(scaled_image, MemoryUsage::DeviceLocal)); } + current_image = *scaled_image; + if (aspect_mask == 0) { aspect_mask = ImageAspectMask(info.format); } - BlitScale(*scheduler, *original_image, *scaled_image, info, aspect_mask, resolution); - current_image = *scaled_image; + static constexpr auto OPTIMAL_FORMAT = FormatType::Optimal; + const PixelFormat format = StorageFormat(info.format); + const auto vk_format = MaxwellToVK::SurfaceFormat(device, OPTIMAL_FORMAT, false, format).format; + const auto blit_usage = VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; + if (device.IsFormatSupported(vk_format, blit_usage, OPTIMAL_FORMAT)) { + BlitScale(*scheduler, *original_image, *scaled_image, info, aspect_mask, resolution); + } else { + using namespace VideoCommon; + static constexpr auto BLIT_OPERATION = Tegra::Engines::Fermi2D::Operation::SrcCopy; + + const auto view_info = ImageViewInfo(ImageViewType::e2D, info.format); + scale_view = std::make_unique(*runtime, view_info, NULL_IMAGE_ID, *this); + auto* view_ptr = scale_view.get(); + + const Region2D src_region{ + .start = {0, 0}, + .end = {static_cast(info.size.width), static_cast(info.size.height)}, + }; + const Region2D dst_region{ + .start = {0, 0}, + .end = {static_cast(scaled_width), static_cast(scaled_height)}, + }; + const VkExtent2D extent{ + .width = scaled_width, + .height = scaled_height, + }; + if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT) { + scale_framebuffer = std::make_unique(*runtime, view_ptr, nullptr, extent); + const auto color_view = scale_view->Handle(Shader::TextureType::Color2D); + + runtime->blit_image_helper.BlitColor( + scale_framebuffer.get(), color_view, dst_region, src_region, + Tegra::Engines::Fermi2D::Filter::Bilinear, BLIT_OPERATION); + } else if (!runtime->device.IsBlitDepthStencilSupported() && + aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { + scale_framebuffer = std::make_unique(*runtime, nullptr, view_ptr, extent); + runtime->blit_image_helper.BlitDepthStencil( + scale_framebuffer.get(), scale_view->DepthView(), scale_view->StencilView(), + dst_region, src_region, Tegra::Engines::Fermi2D::Filter::Point, BLIT_OPERATION); + } else { + // TODO: Use helper blits where applicable + LOG_ERROR(Render_Vulkan, "Device does not support scaling format {}", format); + return false; + } + } flags |= ImageFlagBits::Rescaled; return true; } @@ -1370,7 +1407,27 @@ Sampler::Sampler(TextureCacheRuntime& runtime, const Tegra::Texture::TSCEntry& t } Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span color_buffers, - ImageView* depth_buffer, const VideoCommon::RenderTargets& key) { + ImageView* depth_buffer, const VideoCommon::RenderTargets& key) + : render_area{VkExtent2D{ + .width = key.size.width, + .height = key.size.height, + }} { + CreateFramebuffer(runtime, color_buffers, depth_buffer); + if (runtime.device.HasDebuggingToolAttached()) { + framebuffer.SetObjectNameEXT(VideoCommon::Name(key).c_str()); + } +} + +Framebuffer::Framebuffer(TextureCacheRuntime& runtime, ImageView* color_buffer, + ImageView* depth_buffer, VkExtent2D extent) + : render_area{extent} { + std::array color_buffers{color_buffer}; + CreateFramebuffer(runtime, color_buffers, depth_buffer); +} + +void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime, + std::span color_buffers, + ImageView* depth_buffer) { std::vector attachments; RenderPassKey renderpass_key{}; s32 num_layers = 1; @@ -1408,10 +1465,6 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span(num_colors); framebuffer = runtime.device.GetLogical().CreateFramebuffer({ .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, @@ -1420,13 +1473,10 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span(attachments.size()), .pAttachments = attachments.data(), - .width = key.size.width, - .height = key.size.height, + .width = render_area.width, + .height = render_area.height, .layers = static_cast(std::max(num_layers, 1)), }); - if (runtime.device.HasDebuggingToolAttached()) { - framebuffer.SetObjectNameEXT(VideoCommon::Name(key).c_str()); - } } void TextureCacheRuntime::AccelerateImageUpload( diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 5381343e9..dc9175ee1 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -145,6 +145,9 @@ private: vk::Image scaled_image{}; MemoryCommit scaled_commit{}; VkImage current_image{}; + + std::unique_ptr scale_framebuffer; + std::unique_ptr scale_view; }; class ImageView : public VideoCommon::ImageViewBase { @@ -221,9 +224,15 @@ private: class Framebuffer { public: - explicit Framebuffer(TextureCacheRuntime&, std::span color_buffers, + explicit Framebuffer(TextureCacheRuntime& runtime, std::span color_buffers, ImageView* depth_buffer, const VideoCommon::RenderTargets& key); + explicit Framebuffer(TextureCacheRuntime& runtime, ImageView* color_buffer, + ImageView* depth_buffer, VkExtent2D extent); + + void CreateFramebuffer(TextureCacheRuntime& runtime, + std::span color_buffers, ImageView* depth_buffer); + [[nodiscard]] VkFramebuffer Handle() const noexcept { return *framebuffer; } From ca1db6311631df4945a223c556dba1b9db5b5484 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Thu, 14 Oct 2021 00:08:57 -0400 Subject: [PATCH 097/156] yuzu: Fix build errors --- src/yuzu/game_list.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/yuzu/game_list.cpp b/src/yuzu/game_list.cpp index 6bd0f9ee9..2af95dbe5 100644 --- a/src/yuzu/game_list.cpp +++ b/src/yuzu/game_list.cpp @@ -159,7 +159,7 @@ GameListSearchField::GameListSearchField(GameList* parent) : QWidget{parent} { * @return true if the haystack contains all words of userinput */ static bool ContainsAllWords(const QString& haystack, const QString& userinput) { - const QStringList userinput_split = userinput.split(QLatin1Char{' '}, Qt::SkipEmptyParts); + const QStringList userinput_split = userinput.split(QLatin1Char{' '}, QString::SkipEmptyParts); return std::all_of(userinput_split.begin(), userinput_split.end(), [&haystack](const QString& s) { return haystack.contains(s); }); From 0f14c9379eae9c3caf8f4b932eace0a84d728f94 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Thu, 14 Oct 2021 14:12:25 -0400 Subject: [PATCH 098/156] texture_cache_base: Remove unused function declarations --- src/video_core/texture_cache/texture_cache_base.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 40e003b60..4dbe050af 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -144,14 +144,6 @@ public: const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Config& copy); - /// Invalidate the contents of the color buffer index - /// These contents become unspecified, the cache can assume aggressive optimizations. - void InvalidateColorBuffer(size_t index); - - /// Invalidate the contents of the depth buffer - /// These contents become unspecified, the cache can assume aggressive optimizations. - void InvalidateDepthBuffer(); - /// Try to find a cached image view in the given CPU address [[nodiscard]] ImageView* TryFindFramebufferImageView(VAddr cpu_addr); From b7ccc58f235d9e442677eb10259b7196a387c6bc Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Fri, 15 Oct 2021 22:59:16 +0200 Subject: [PATCH 099/156] Texture Cahe: Fix downscaling on SMO. --- src/common/settings.cpp | 2 ++ src/common/settings.h | 1 + src/video_core/texture_cache/image_info.cpp | 4 ++++ src/video_core/texture_cache/image_info.h | 1 + src/video_core/texture_cache/texture_cache.h | 3 +++ 5 files changed, 11 insertions(+) diff --git a/src/common/settings.cpp b/src/common/settings.cpp index 12fdb0f9b..bc2c8c7d7 100644 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp @@ -109,10 +109,12 @@ float Volume() { void UpdateRescalingInfo() { const auto setup = values.resolution_setup.GetValue(); auto& info = values.resolution_info; + info.downscale = false; switch (setup) { case ResolutionSetup::Res1_2X: info.up_scale = 1; info.down_shift = 1; + info.downscale = true; break; case ResolutionSetup::Res1X: info.up_scale = 1; diff --git a/src/common/settings.h b/src/common/settings.h index 09f7cdd84..a09db0822 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -72,6 +72,7 @@ struct ResolutionScalingInfo { f32 up_factor{1.0f}; f32 down_factor{1.0f}; bool active{}; + bool downscale{}; s32 ScaleUp(s32 value) const { if (value == 0) { diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp index 7fa8fd4fe..d8e414247 100644 --- a/src/video_core/texture_cache/image_info.cpp +++ b/src/video_core/texture_cache/image_info.cpp @@ -102,6 +102,7 @@ ImageInfo::ImageInfo(const TICEntry& config) noexcept { layer_stride = CalculateLayerStride(*this); maybe_unaligned_layer_stride = CalculateLayerSize(*this); rescaleable &= (block.depth == 0) && resources.levels == 1; + downscaleable = size.height > 512; } } @@ -135,6 +136,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) size.depth = rt.depth; } else { rescaleable = block.depth == 0 && size.height > 256; + downscaleable = size.height > 512; type = ImageType::e2D; resources.layers = rt.depth; } @@ -164,6 +166,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept { size.depth = regs.zeta_depth; } else { rescaleable = block.depth == 0 && size.height > 256; + downscaleable = size.height > 512; type = ImageType::e2D; resources.layers = regs.zeta_depth; } @@ -197,6 +200,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept { .depth = 1, }; rescaleable = block.depth == 0 && size.height > 256; + downscaleable = size.height > 512; } } diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h index e874d2870..5932dcaba 100644 --- a/src/video_core/texture_cache/image_info.h +++ b/src/video_core/texture_cache/image_info.h @@ -34,6 +34,7 @@ struct ImageInfo { u32 num_samples = 1; u32 tile_width_spacing = 0; bool rescaleable = false; + bool downscaleable = false; }; } // namespace VideoCommon diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index c1fb12679..261cb6c48 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -798,6 +798,9 @@ bool TextureCache

::ImageCanRescale(ImageBase& image) { if (!image.info.rescaleable || True(image.flags & ImageFlagBits::Blacklisted)) { return false; } + if (Settings::values.resolution_info.downscale && !image.info.downscaleable) { + return false; + } if (True(image.flags & (ImageFlagBits::Rescaled | ImageFlagBits::CheckingRescalable))) { return true; } From 618de4e7871898f165c028293becd235ce3ccb09 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 16 Oct 2021 00:30:43 -0400 Subject: [PATCH 100/156] vulkan: Fix rescaling push constant usage --- .../backend/spirv/emit_context.cpp | 58 +++++++++---------- .../backend/spirv/emit_context.h | 3 +- .../backend/spirv/emit_spirv.h | 5 +- .../spirv/emit_spirv_context_get_set.cpp | 4 +- .../renderer_vulkan/pipeline_helper.h | 25 ++++---- .../renderer_vulkan/vk_compute_pipeline.cpp | 36 +++++++----- .../renderer_vulkan/vk_compute_pipeline.h | 1 - .../renderer_vulkan/vk_graphics_pipeline.cpp | 15 +++-- 8 files changed, 78 insertions(+), 69 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_context.cpp b/src/shader_recompiler/backend/spirv/emit_context.cpp index 8646fe989..723455462 100644 --- a/src/shader_recompiler/backend/spirv/emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/emit_context.cpp @@ -1006,47 +1006,47 @@ void EmitContext::DefineRescalingInput(const Info& info) { return; } if (profile.unified_descriptor_binding) { - DefineRescalingInputPushConstant(info); + DefineRescalingInputPushConstant(); } else { DefineRescalingInputUniformConstant(); } } -void EmitContext::DefineRescalingInputPushConstant(const Info& info) { - boost::container::static_vector members{F32[1]}; +void EmitContext::DefineRescalingInputPushConstant() { + boost::container::static_vector members{}; u32 member_index{0}; - if (!info.texture_descriptors.empty()) { - rescaling_textures_type = TypeArray(U32[1], Const(4u)); - Decorate(rescaling_textures_type, spv::Decoration::ArrayStride, 4u); - members.push_back(rescaling_textures_type); - rescaling_textures_member_index = ++member_index; - } - if (!info.image_descriptors.empty()) { - rescaling_images_type = TypeArray(U32[1], Const(NUM_IMAGE_SCALING_WORDS)); - if (rescaling_textures_type.value != rescaling_images_type.value) { - Decorate(rescaling_images_type, spv::Decoration::ArrayStride, 4u); - } - members.push_back(rescaling_images_type); - rescaling_images_member_index = ++member_index; + + rescaling_textures_type = TypeArray(U32[1], Const(4u)); + Decorate(rescaling_textures_type, spv::Decoration::ArrayStride, 4u); + members.push_back(rescaling_textures_type); + rescaling_textures_member_index = member_index++; + + rescaling_images_type = TypeArray(U32[1], Const(NUM_IMAGE_SCALING_WORDS)); + Decorate(rescaling_images_type, spv::Decoration::ArrayStride, 4u); + members.push_back(rescaling_images_type); + rescaling_images_member_index = member_index++; + + if (stage != Stage::Compute) { + members.push_back(F32[1]); + rescaling_downfactor_member_index = member_index++; } const Id push_constant_struct{TypeStruct(std::span(members.data(), members.size()))}; Decorate(push_constant_struct, spv::Decoration::Block); Name(push_constant_struct, "ResolutionInfo"); - MemberDecorate(push_constant_struct, 0u, spv::Decoration::Offset, 0u); - MemberName(push_constant_struct, 0u, "down_factor"); + MemberDecorate(push_constant_struct, rescaling_textures_member_index, spv::Decoration::Offset, + static_cast(offsetof(RescalingLayout, rescaling_textures))); + MemberName(push_constant_struct, rescaling_textures_member_index, "rescaling_textures"); - const u32 offset_bias = stage == Stage::Compute ? sizeof(u32) : 0; - if (!info.texture_descriptors.empty()) { - MemberDecorate( - push_constant_struct, rescaling_textures_member_index, spv::Decoration::Offset, - static_cast(offsetof(RescalingLayout, rescaling_textures) - offset_bias)); - MemberName(push_constant_struct, rescaling_textures_member_index, "rescaling_textures"); - } - if (!info.image_descriptors.empty()) { - MemberDecorate(push_constant_struct, rescaling_images_member_index, spv::Decoration::Offset, - static_cast(offsetof(RescalingLayout, rescaling_images) - offset_bias)); - MemberName(push_constant_struct, rescaling_images_member_index, "rescaling_images"); + MemberDecorate(push_constant_struct, rescaling_images_member_index, spv::Decoration::Offset, + static_cast(offsetof(RescalingLayout, rescaling_images))); + MemberName(push_constant_struct, rescaling_images_member_index, "rescaling_images"); + + if (stage != Stage::Compute) { + MemberDecorate(push_constant_struct, rescaling_downfactor_member_index, + spv::Decoration::Offset, + static_cast(offsetof(RescalingLayout, down_factor))); + MemberName(push_constant_struct, rescaling_downfactor_member_index, "down_factor"); } const Id pointer_type{TypePointer(spv::StorageClass::PushConstant, push_constant_struct)}; rescaling_push_constants = AddGlobalVariable(pointer_type, spv::StorageClass::PushConstant); diff --git a/src/shader_recompiler/backend/spirv/emit_context.h b/src/shader_recompiler/backend/spirv/emit_context.h index b67704baa..63f8185d9 100644 --- a/src/shader_recompiler/backend/spirv/emit_context.h +++ b/src/shader_recompiler/backend/spirv/emit_context.h @@ -244,6 +244,7 @@ public: Id rescaling_images_type{}; u32 rescaling_textures_member_index{}; u32 rescaling_images_member_index{}; + u32 rescaling_downfactor_member_index{}; u32 texture_rescaling_index{}; u32 image_rescaling_index{}; @@ -324,7 +325,7 @@ private: void DefineAttributeMemAccess(const Info& info); void DefineGlobalMemoryFunctions(const Info& info); void DefineRescalingInput(const Info& info); - void DefineRescalingInputPushConstant(const Info& info); + void DefineRescalingInputPushConstant(); void DefineRescalingInputUniformConstant(); void DefineInputs(const IR::Program& program); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.h b/src/shader_recompiler/backend/spirv/emit_spirv.h index cf59f2572..4b25534ce 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv.h @@ -22,11 +22,12 @@ constexpr u32 NUM_TEXTURE_AND_IMAGE_SCALING_WORDS = NUM_TEXTURE_SCALING_WORDS + NUM_IMAGE_SCALING_WORDS; struct RescalingLayout { - u32 down_factor; alignas(16) std::array rescaling_textures; alignas(16) std::array rescaling_images; + alignas(16) u32 down_factor; }; -constexpr u32 RESCALING_PUSH_CONSTANT_WORDS_OFFSET = offsetof(RescalingLayout, rescaling_textures); +constexpr u32 RESCALING_LAYOUT_WORDS_OFFSET = offsetof(RescalingLayout, rescaling_textures); +constexpr u32 RESCALING_LAYOUT_DOWN_FACTOR_OFFSET = offsetof(RescalingLayout, down_factor); [[nodiscard]] std::vector EmitSPIRV(const Profile& profile, const RuntimeInfo& runtime_info, IR::Program& program, Bindings& bindings); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index c0db7452f..bac683ae1 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -529,8 +529,8 @@ Id EmitYDirection(EmitContext& ctx) { Id EmitResolutionDownFactor(EmitContext& ctx) { if (ctx.profile.unified_descriptor_binding) { const Id pointer_type{ctx.TypePointer(spv::StorageClass::PushConstant, ctx.F32[1])}; - const Id pointer{ - ctx.OpAccessChain(pointer_type, ctx.rescaling_push_constants, ctx.u32_zero_value)}; + const Id index{ctx.Const(ctx.rescaling_downfactor_member_index)}; + const Id pointer{ctx.OpAccessChain(pointer_type, ctx.rescaling_push_constants, index)}; return ctx.OpLoad(ctx.F32[1], pointer); } else { const Id composite{ctx.OpLoad(ctx.F32[4], ctx.rescaling_uniform_constant)}; diff --git a/src/video_core/renderer_vulkan/pipeline_helper.h b/src/video_core/renderer_vulkan/pipeline_helper.h index 3612e8a18..ae5e66ef4 100644 --- a/src/video_core/renderer_vulkan/pipeline_helper.h +++ b/src/video_core/renderer_vulkan/pipeline_helper.h @@ -22,7 +22,6 @@ namespace Vulkan { using Shader::Backend::SPIRV::NUM_TEXTURE_AND_IMAGE_SCALING_WORDS; -using Shader::Backend::SPIRV::RESCALING_PUSH_CONSTANT_WORDS_OFFSET; class DescriptorLayoutBuilder { public: @@ -73,12 +72,12 @@ public: vk::PipelineLayout CreatePipelineLayout(VkDescriptorSetLayout descriptor_set_layout) const { using Shader::Backend::SPIRV::RescalingLayout; - const u32 push_offset = is_compute ? RESCALING_PUSH_CONSTANT_WORDS_OFFSET : 0; + const u32 size_offset = is_compute ? sizeof(RescalingLayout::down_factor) : 0u; const VkPushConstantRange range{ .stageFlags = static_cast( is_compute ? VK_SHADER_STAGE_COMPUTE_BIT : VK_SHADER_STAGE_ALL_GRAPHICS), .offset = 0, - .size = sizeof(RescalingLayout) - push_offset, + .size = sizeof(RescalingLayout) - size_offset, }; return device->GetLogical().CreatePipelineLayout({ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, @@ -139,21 +138,21 @@ private: class RescalingPushConstant { public: - explicit RescalingPushConstant(u32 num_textures) noexcept {} + explicit RescalingPushConstant() noexcept {} void PushTexture(bool is_rescaled) noexcept { - *texture_ptr |= is_rescaled ? texture_bit : 0; - texture_bit <<= 1; - if (texture_bit == 0) { + *texture_ptr |= is_rescaled ? texture_bit : 0u; + texture_bit <<= 1u; + if (texture_bit == 0u) { texture_bit = 1u; ++texture_ptr; } } void PushImage(bool is_rescaled) noexcept { - *image_ptr |= is_rescaled ? image_bit : 0; - image_bit <<= 1; - if (image_bit == 0) { + *image_ptr |= is_rescaled ? image_bit : 0u; + image_bit <<= 1u; + if (image_bit == 0u) { image_bit = 1u; ++image_ptr; } @@ -176,8 +175,10 @@ inline void PushImageDescriptors(TextureCache& texture_cache, const Shader::Info& info, RescalingPushConstant& rescaling, const VkSampler*& samplers, const VideoCommon::ImageViewInOut*& views) { - views += Shader::NumDescriptors(info.texture_buffer_descriptors); - views += Shader::NumDescriptors(info.image_buffer_descriptors); + const u32 num_texture_buffers = Shader::NumDescriptors(info.texture_buffer_descriptors); + const u32 num_image_buffers = Shader::NumDescriptors(info.image_buffer_descriptors); + views += num_texture_buffers; + views += num_image_buffers; for (const auto& desc : info.texture_descriptors) { for (u32 index = 0; index < desc.count; ++index) { const VideoCommon::ImageViewId image_view_id{(views++)->id}; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 6dc52e399..de36bcdb7 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -22,6 +22,7 @@ namespace Vulkan { using Shader::ImageBufferDescriptor; +using Shader::Backend::SPIRV::RESCALING_LAYOUT_WORDS_OFFSET; using Tegra::Texture::TexturePair; ComputePipeline::ComputePipeline(const Device& device_, DescriptorPool& descriptor_pool, @@ -185,7 +186,7 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute, buffer_cache.UpdateComputeBuffers(); buffer_cache.BindHostComputeBuffers(); - RescalingPushConstant rescaling(num_textures); + RescalingPushConstant rescaling; const VkSampler* samplers_it{samplers.data()}; const VideoCommon::ImageViewInOut* views_it{views.data()}; PushImageDescriptors(texture_cache, update_descriptor_queue, info, rescaling, samplers_it, @@ -199,21 +200,24 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute, }); } const void* const descriptor_data{update_descriptor_queue.UpdateData()}; - scheduler.Record( - [this, descriptor_data, rescaling_data = rescaling.Data()](vk::CommandBuffer cmdbuf) { - cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); - if (!descriptor_set_layout) { - return; - } - if (num_textures > 0) { - cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, rescaling_data); - } - const VkDescriptorSet descriptor_set{descriptor_allocator.Commit()}; - const vk::Device& dev{device.GetLogical()}; - dev.UpdateDescriptorSet(descriptor_set, *descriptor_update_template, descriptor_data); - cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline_layout, 0, - descriptor_set, nullptr); - }); + const bool is_rescaling = !info.texture_descriptors.empty() || !info.image_descriptors.empty(); + scheduler.Record([this, descriptor_data, is_rescaling, + rescaling_data = rescaling.Data()](vk::CommandBuffer cmdbuf) { + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); + if (!descriptor_set_layout) { + return; + } + if (is_rescaling) { + cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, + RESCALING_LAYOUT_WORDS_OFFSET, sizeof(rescaling_data), + rescaling_data.data()); + } + const VkDescriptorSet descriptor_set{descriptor_allocator.Commit()}; + const vk::Device& dev{device.GetLogical()}; + dev.UpdateDescriptorSet(descriptor_set, *descriptor_update_template, descriptor_data); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline_layout, 0, + descriptor_set, nullptr); + }); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.h b/src/video_core/renderer_vulkan/vk_compute_pipeline.h index e79ce4d7c..8c4b0a301 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.h @@ -59,7 +59,6 @@ private: vk::PipelineLayout pipeline_layout; vk::DescriptorUpdateTemplateKHR descriptor_update_template; vk::Pipeline pipeline; - u32 num_textures{}; std::condition_variable build_condvar; std::mutex build_mutex; diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index f08e9e840..616a7b457 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -32,6 +32,8 @@ namespace { using boost::container::small_vector; using boost::container::static_vector; using Shader::ImageBufferDescriptor; +using Shader::Backend::SPIRV::RESCALING_LAYOUT_DOWN_FACTOR_OFFSET; +using Shader::Backend::SPIRV::RESCALING_LAYOUT_WORDS_OFFSET; using Tegra::Texture::TexturePair; using VideoCore::Surface::PixelFormat; using VideoCore::Surface::PixelFormatFromDepthFormat; @@ -431,7 +433,7 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { update_descriptor_queue.Acquire(); - RescalingPushConstant rescaling(num_textures); + RescalingPushConstant rescaling; const VkSampler* samplers_it{samplers.data()}; const VideoCommon::ImageViewInOut* views_it{views.data()}; const auto prepare_stage{[&](size_t stage) LAMBDA_FORCEINLINE { @@ -477,15 +479,16 @@ void GraphicsPipeline::ConfigureDraw(const RescalingPushConstant& rescaling) { if (bind_pipeline) { cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *pipeline); } + cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_ALL_GRAPHICS, + RESCALING_LAYOUT_WORDS_OFFSET, sizeof(rescaling_data), + rescaling_data.data()); if (update_rescaling) { const f32 config_down_factor{Settings::values.resolution_info.down_factor}; const f32 scale_down_factor{is_rescaling ? config_down_factor : 1.0f}; - cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_ALL_GRAPHICS, 0, - sizeof(scale_down_factor), &scale_down_factor); + cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_ALL_GRAPHICS, + RESCALING_LAYOUT_DOWN_FACTOR_OFFSET, sizeof(scale_down_factor), + &scale_down_factor); } - cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_ALL_GRAPHICS, - RESCALING_PUSH_CONSTANT_WORDS_OFFSET, sizeof(rescaling_data), - rescaling_data.data()); if (!descriptor_set_layout) { return; } From ef1dc4263586f5b81b53a5158db2c1cd2086ed4c Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 17 Oct 2021 01:22:13 +0200 Subject: [PATCH 101/156] Texture cache: Fix memory consumption and ignore rating when a depth texture is rendered. --- src/video_core/renderer_opengl/gl_texture_cache.cpp | 10 ++++++++-- src/video_core/renderer_vulkan/vk_texture_cache.cpp | 9 +++++++-- src/video_core/texture_cache/texture_cache.h | 7 ++++--- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index ec1afd31a..944a3aa65 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -967,21 +967,24 @@ bool Image::ScaleUp() { if (True(flags & ImageFlagBits::Rescaled)) { return false; } + flags |= ImageFlagBits::Rescaled; if (!runtime->resolution.active) { return false; } if (gl_format == 0 && gl_type == 0) { // compressed textures + flags &= ~ImageFlagBits::Rescaled; return false; } if (info.type == ImageType::Linear) { - UNIMPLEMENTED(); + UNREACHABLE(); + flags &= ~ImageFlagBits::Rescaled; return false; } if (!Scale()) { + flags &= ~ImageFlagBits::Rescaled; return false; } - flags |= ImageFlagBits::Rescaled; return true; } @@ -990,6 +993,9 @@ bool Image::ScaleDown() { return false; } flags &= ~ImageFlagBits::Rescaled; + if (!runtime->resolution.active) { + return false; + } current_texture = texture.handle; return true; } diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 9b90c7d9b..a4fbbc735 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1126,6 +1126,7 @@ bool Image::ScaleUp() { return false; } ASSERT(info.type != ImageType::Linear); + flags |= ImageFlagBits::Rescaled; const auto& resolution = runtime->resolution; if (!resolution.active) { return false; @@ -1188,11 +1189,11 @@ bool Image::ScaleUp() { dst_region, src_region, Tegra::Engines::Fermi2D::Filter::Point, BLIT_OPERATION); } else { // TODO: Use helper blits where applicable + flags &= ~ImageFlagBits::Rescaled; LOG_ERROR(Render_Vulkan, "Device does not support scaling format {}", format); return false; } } - flags |= ImageFlagBits::Rescaled; return true; } @@ -1200,8 +1201,12 @@ bool Image::ScaleDown() { if (False(flags & ImageFlagBits::Rescaled)) { return false; } - ASSERT(info.type != ImageType::Linear); flags &= ~ImageFlagBits::Rescaled; + const auto& resolution = runtime->resolution; + if (!resolution.active) { + return false; + } + ASSERT(info.type != ImageType::Linear); current_image = *original_image; return true; } diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 261cb6c48..c06cddae9 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -230,7 +230,8 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { auto& image = slot_images[image_id]; can_rescale &= ImageCanRescale(image); any_blacklisted |= True(image.flags & ImageFlagBits::Blacklisted); - any_rescaled |= True(image.flags & ImageFlagBits::Rescaled); + any_rescaled |= True(image.flags & ImageFlagBits::Rescaled) || + GetFormatType(image.info.format) != SurfaceType::ColorTexture; scale_rating = std::max(scale_rating, image.scale_tick <= frame_tick ? image.scale_rating + 1U : image.scale_rating); @@ -857,7 +858,7 @@ u64 TextureCache

::GetScaledImageSizeBytes(Image& image) { const f32 add_to_size = Settings::values.resolution_info.up_factor - 1.0f; const bool sign = std::signbit(add_to_size); const u32 image_size_bytes = std::max(image.guest_size_bytes, image.unswizzled_size_bytes); - const u64 tentative_size = image_size_bytes * static_cast(std::abs(add_to_size)); + const u64 tentative_size = image_size_bytes * static_cast(std::abs(add_to_size)); const u64 fitted_size = Common::AlignUp(tentative_size, 1024); return sign ? -fitted_size : fitted_size; } @@ -879,7 +880,7 @@ bool TextureCache

::ScaleDown(Image& image) { if (!rescaled) { return false; } - total_used_memory += GetScaledImageSizeBytes(image); + total_used_memory -= GetScaledImageSizeBytes(image); InvalidateScale(image); return true; } From d4f5193bd308988a80f52941d9eefc4c857bfa99 Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Sun, 17 Oct 2021 02:21:26 +0200 Subject: [PATCH 102/156] Texture Cache: Rescale conversions between depth and color --- .../renderer_opengl/gl_texture_cache.h | 2 +- src/video_core/renderer_vulkan/blit_image.cpp | 29 +++++++++++-------- src/video_core/renderer_vulkan/blit_image.h | 14 +++++---- .../renderer_vulkan/vk_texture_cache.cpp | 13 +++++---- .../renderer_vulkan/vk_texture_cache.h | 2 +- src/video_core/texture_cache/texture_cache.h | 2 +- 6 files changed, 37 insertions(+), 25 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index cf7f37a16..f90dbfe9e 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -84,7 +84,7 @@ public: void CopyImage(Image& dst, Image& src, std::span copies); - void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view) { + void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view, bool rescaled) { UNIMPLEMENTED(); } diff --git a/src/video_core/renderer_vulkan/blit_image.cpp b/src/video_core/renderer_vulkan/blit_image.cpp index b97aac550..bc3e4b93d 100644 --- a/src/video_core/renderer_vulkan/blit_image.cpp +++ b/src/video_core/renderer_vulkan/blit_image.cpp @@ -418,40 +418,45 @@ void BlitImageHelper::BlitDepthStencil(const Framebuffer* dst_framebuffer, } void BlitImageHelper::ConvertD32ToR32(const Framebuffer* dst_framebuffer, - const ImageView& src_image_view) { + const ImageView& src_image_view, u32 up_scale, + u32 down_shift) { ConvertDepthToColorPipeline(convert_d32_to_r32_pipeline, dst_framebuffer->RenderPass()); - Convert(*convert_d32_to_r32_pipeline, dst_framebuffer, src_image_view); + Convert(*convert_d32_to_r32_pipeline, dst_framebuffer, src_image_view, up_scale, down_shift); } void BlitImageHelper::ConvertR32ToD32(const Framebuffer* dst_framebuffer, - const ImageView& src_image_view) { + const ImageView& src_image_view, u32 up_scale, + u32 down_shift) { ConvertColorToDepthPipeline(convert_r32_to_d32_pipeline, dst_framebuffer->RenderPass()); - Convert(*convert_r32_to_d32_pipeline, dst_framebuffer, src_image_view); + Convert(*convert_r32_to_d32_pipeline, dst_framebuffer, src_image_view, up_scale, down_shift); } void BlitImageHelper::ConvertD16ToR16(const Framebuffer* dst_framebuffer, - const ImageView& src_image_view) { + const ImageView& src_image_view, u32 up_scale, + u32 down_shift) { ConvertDepthToColorPipeline(convert_d16_to_r16_pipeline, dst_framebuffer->RenderPass()); - Convert(*convert_d16_to_r16_pipeline, dst_framebuffer, src_image_view); + Convert(*convert_d16_to_r16_pipeline, dst_framebuffer, src_image_view, up_scale, down_shift); } void BlitImageHelper::ConvertR16ToD16(const Framebuffer* dst_framebuffer, - const ImageView& src_image_view) { + const ImageView& src_image_view, u32 up_scale, + u32 down_shift) { ConvertColorToDepthPipeline(convert_r16_to_d16_pipeline, dst_framebuffer->RenderPass()); - Convert(*convert_r16_to_d16_pipeline, dst_framebuffer, src_image_view); + Convert(*convert_r16_to_d16_pipeline, dst_framebuffer, src_image_view, up_scale, down_shift); } void BlitImageHelper::Convert(VkPipeline pipeline, const Framebuffer* dst_framebuffer, - const ImageView& src_image_view) { + const ImageView& src_image_view, u32 up_scale, u32 down_shift) { const VkPipelineLayout layout = *one_texture_pipeline_layout; const VkImageView src_view = src_image_view.Handle(Shader::TextureType::Color2D); const VkSampler sampler = *nearest_sampler; const VkExtent2D extent{ - .width = src_image_view.size.width, - .height = src_image_view.size.height, + .width = std::max((src_image_view.size.width * up_scale) >> down_shift, 1U), + .height = std::max((src_image_view.size.height * up_scale) >> down_shift, 1U), }; scheduler.RequestRenderpass(dst_framebuffer); - scheduler.Record([pipeline, layout, sampler, src_view, extent, this](vk::CommandBuffer cmdbuf) { + scheduler.Record([pipeline, layout, sampler, src_view, extent, up_scale, down_shift, + this](vk::CommandBuffer cmdbuf) { const VkOffset2D offset{ .x = 0, .y = 0, diff --git a/src/video_core/renderer_vulkan/blit_image.h b/src/video_core/renderer_vulkan/blit_image.h index e11f8c214..c0f4a16a4 100644 --- a/src/video_core/renderer_vulkan/blit_image.h +++ b/src/video_core/renderer_vulkan/blit_image.h @@ -44,17 +44,21 @@ public: const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Operation operation); - void ConvertD32ToR32(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); + void ConvertD32ToR32(const Framebuffer* dst_framebuffer, const ImageView& src_image_view, + u32 up_scale, u32 down_shift); - void ConvertR32ToD32(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); + void ConvertR32ToD32(const Framebuffer* dst_framebuffer, const ImageView& src_image_view, + u32 up_scale, u32 down_shift); - void ConvertD16ToR16(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); + void ConvertD16ToR16(const Framebuffer* dst_framebuffer, const ImageView& src_image_view, + u32 up_scale, u32 down_shift); - void ConvertR16ToD16(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); + void ConvertR16ToD16(const Framebuffer* dst_framebuffer, const ImageView& src_image_view, + u32 up_scale, u32 down_shift); private: void Convert(VkPipeline pipeline, const Framebuffer* dst_framebuffer, - const ImageView& src_image_view); + const ImageView& src_image_view, u32 up_scale, u32 down_shift); [[nodiscard]] VkPipeline FindOrEmplacePipeline(const BlitImagePipelineKey& key); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index a4fbbc735..17c62e27d 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -867,26 +867,29 @@ void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst }); } -void TextureCacheRuntime::ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view) { +void TextureCacheRuntime::ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view, + bool rescaled) { + const u32 up_scale = rescaled ? resolution.up_scale : 1; + const u32 down_shift = rescaled ? resolution.down_shift : 0; switch (dst_view.format) { case PixelFormat::R16_UNORM: if (src_view.format == PixelFormat::D16_UNORM) { - return blit_image_helper.ConvertD16ToR16(dst, src_view); + return blit_image_helper.ConvertD16ToR16(dst, src_view, up_scale, down_shift); } break; case PixelFormat::R32_FLOAT: if (src_view.format == PixelFormat::D32_FLOAT) { - return blit_image_helper.ConvertD32ToR32(dst, src_view); + return blit_image_helper.ConvertD32ToR32(dst, src_view, up_scale, down_shift); } break; case PixelFormat::D16_UNORM: if (src_view.format == PixelFormat::R16_UNORM) { - return blit_image_helper.ConvertR16ToD16(dst, src_view); + return blit_image_helper.ConvertR16ToD16(dst, src_view, up_scale, down_shift); } break; case PixelFormat::D32_FLOAT: if (src_view.format == PixelFormat::R32_FLOAT) { - return blit_image_helper.ConvertR32ToD32(dst, src_view); + return blit_image_helper.ConvertR32ToD32(dst, src_view, up_scale, down_shift); } break; default: diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index dc9175ee1..6dc190632 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -61,7 +61,7 @@ public: void CopyImage(Image& dst, Image& src, std::span copies); - void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view); + void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view, bool rescaled); bool CanAccelerateImageUpload(Image&) const noexcept { return false; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index c06cddae9..a035d2b18 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -1774,7 +1774,7 @@ void TextureCache

::CopyImage(ImageId dst_id, ImageId src_id, std::vector Date: Sat, 16 Oct 2021 20:33:58 -0500 Subject: [PATCH 103/156] vulkan: Implement FidelityFX Super Resolution --- src/common/settings.h | 1 + src/video_core/CMakeLists.txt | 2 + src/video_core/host_shaders/CMakeLists.txt | 17 +- .../host_shaders/fidelityfx_fsr.comp | 114 ++++++ .../vulkan_fidelityfx_fsr_easu.comp | 13 + .../vulkan_fidelityfx_fsr_rcas.comp | 13 + .../renderer_vulkan/vk_blit_screen.cpp | 62 ++- .../renderer_vulkan/vk_blit_screen.h | 4 + src/video_core/renderer_vulkan/vk_fsr.cpp | 375 ++++++++++++++++++ src/video_core/renderer_vulkan/vk_fsr.h | 54 +++ src/yuzu/configuration/configure_graphics.ui | 5 + 11 files changed, 643 insertions(+), 17 deletions(-) create mode 100644 src/video_core/host_shaders/fidelityfx_fsr.comp create mode 100644 src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu.comp create mode 100644 src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas.comp create mode 100644 src/video_core/renderer_vulkan/vk_fsr.cpp create mode 100644 src/video_core/renderer_vulkan/vk_fsr.h diff --git a/src/common/settings.h b/src/common/settings.h index a09db0822..9da447ce0 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -64,6 +64,7 @@ enum class ScalingFilter : u32 { Bilinear = 0, Bicubic = 1, ScaleForce = 2, + Fsr = 3, }; struct ResolutionScalingInfo { diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 6aac7f305..91a30fef7 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -132,6 +132,8 @@ add_library(video_core STATIC renderer_vulkan/vk_descriptor_pool.h renderer_vulkan/vk_fence_manager.cpp renderer_vulkan/vk_fence_manager.h + renderer_vulkan/vk_fsr.cpp + renderer_vulkan/vk_fsr.h renderer_vulkan/vk_graphics_pipeline.cpp renderer_vulkan/vk_graphics_pipeline.h renderer_vulkan/vk_master_semaphore.cpp diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 664d6ce5d..32e2ab500 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -1,3 +1,11 @@ +set(FIDELITYFX_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/externals/FidelityFX-FSR/ffx-fsr) + +set(GLSL_INCLUDES + fidelityfx_fsr.comp + ${FIDELITYFX_INCLUDE_DIR}/ffx_a.h + ${FIDELITYFX_INCLUDE_DIR}/ffx_fsr1.h +) + set(SHADER_FILES astc_decoder.comp block_linear_unswizzle_2d.comp @@ -13,6 +21,8 @@ set(SHADER_FILES present_bicubic.frag vulkan_blit_color_float.frag vulkan_blit_depth_stencil.frag + vulkan_fidelityfx_fsr_easu.comp + vulkan_fidelityfx_fsr_rcas.comp vulkan_present.frag vulkan_present.vert vulkan_quad_indexed.comp @@ -78,7 +88,7 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES}) OUTPUT ${SPIRV_HEADER_FILE} COMMAND - ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} + ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} MAIN_DEPENDENCY ${SOURCE_FILE} ) @@ -86,9 +96,12 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES}) endif() endforeach() +set(SHADER_SOURCES ${SHADER_FILES}) +list(APPEND SHADER_SOURCES ${GLSL_INCLUDES}) + add_custom_target(host_shaders DEPENDS ${SHADER_HEADERS} SOURCES - ${SHADER_FILES} + ${SHADER_SOURCES} ) diff --git a/src/video_core/host_shaders/fidelityfx_fsr.comp b/src/video_core/host_shaders/fidelityfx_fsr.comp new file mode 100644 index 000000000..cbb601580 --- /dev/null +++ b/src/video_core/host_shaders/fidelityfx_fsr.comp @@ -0,0 +1,114 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +//!#version 460 core +#extension GL_ARB_separate_shader_objects : enable +#extension GL_ARB_shading_language_420pack : enable +#extension GL_GOOGLE_include_directive : enable +#extension GL_EXT_shader_explicit_arithmetic_types : require + +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +layout( push_constant ) uniform constants { + u32vec2 input_size; +}; + +uvec4 Const0; +uvec4 Const1; +uvec4 Const2; +uvec4 Const3; + +#define A_GPU 1 +#define A_GLSL 1 +#define A_HALF + +#include "ffx_a.h" + +f16vec4 LinearToSRGB(f16vec4 linear) { + bvec4 selector = greaterThan(linear, f16vec4(0.00313066844250063)); + f16vec4 low = linear * float16_t(12.92); + f16vec4 high = float16_t(1.055) * pow(linear, f16vec4(1 / 2.4)) - float16_t(0.055); + return mix(low, high, selector); +} + +f16vec4 SRGBToLinear(f16vec4 srgb) { + bvec4 selector = greaterThan(srgb, f16vec4(0.0404482362771082)); + f16vec4 low = srgb * float16_t(1.0 / 12.92); + f16vec4 high = pow((srgb + float16_t(0.055)) * float16_t(1.0 / 1.055), f16vec4(2.4)); + return mix(low, high, selector); +} + +#if USE_EASU + #define FSR_EASU_H 1 + f16vec4 FsrEasuRH(vec2 p) { f16vec4 res = f16vec4(textureGather(InputTexture, p, 0)); return res; } + f16vec4 FsrEasuGH(vec2 p) { f16vec4 res = f16vec4(textureGather(InputTexture, p, 1)); return res; } + f16vec4 FsrEasuBH(vec2 p) { f16vec4 res = f16vec4(textureGather(InputTexture, p, 2)); return res; } +#endif +#if USE_RCAS + #define FSR_RCAS_H 1 + f16vec4 FsrRcasLoadH(ASW2 p) { return f16vec4(texelFetch(InputTexture, ASU2(p), 0)); } + void FsrRcasInputH(inout float16_t r, inout float16_t g, inout float16_t b) {} +#endif + +#include "ffx_fsr1.h" + +void CurrFilter(u32vec2 pos) { + // For debugging +#if USE_BILINEAR + vec2 pp = (vec2(pos) * vec2_AU2(Const0.xy) + vec2_AU2(Const0.zw)) * vec2_AU2(Const1.xy) + vec2(0.5, -0.5) * vec2_AU2(Const1.zw); + imageStore(OutputTexture, ivec2(pos), textureLod(InputTexture, pp, 0.0)); +#endif +#if USE_EASU + f16vec3 c; + FsrEasuH(c, pos, Const0, Const1, Const2, Const3); + imageStore(OutputTexture, ivec2(pos), f16vec4(c, 1)); +#endif +#if USE_RCAS + f16vec3 c; + FsrRcasH(c.r, c.g, c.b, pos, Const0); + imageStore(OutputTexture, ivec2(pos), f16vec4(c, 1)); +#endif + +} + +layout(local_size_x=64) in; +void main() { + +#if USE_EASU || USE_BILINEAR + vec2 ires = vec2(input_size); + vec2 tres = textureSize(InputTexture, 0); + vec2 ores = imageSize(OutputTexture); + FsrEasuCon(Const0, Const1, Const2, Const3, ires.x, ires.y, tres.x, tres.y, ores.x, ores.y); +#endif +#if USE_RCAS + FsrRcasCon(Const0, 0.25f); +#endif + + // Do remapping of local xy in workgroup for a more PS-like swizzle pattern. + AU2 gxy = ARmp8x8(gl_LocalInvocationID.x) + AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u); + CurrFilter(gxy); + gxy.x += 8u; + CurrFilter(gxy); + gxy.y += 8u; + CurrFilter(gxy); + gxy.x -= 8u; + CurrFilter(gxy); +} diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu.comp b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu.comp new file mode 100644 index 000000000..6525eeeb5 --- /dev/null +++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu.comp @@ -0,0 +1,13 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 460 core +#extension GL_GOOGLE_include_directive : enable + +layout(set=0,binding=0) uniform sampler2D InputTexture; +layout(set=0,binding=1,rgba16f) uniform image2D OutputTexture; + +#define USE_EASU 1 + +#include "fidelityfx_fsr.comp" diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas.comp b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas.comp new file mode 100644 index 000000000..9463ed842 --- /dev/null +++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas.comp @@ -0,0 +1,13 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 460 core +#extension GL_GOOGLE_include_directive : enable + +layout(set=0,binding=0) uniform sampler2D InputTexture; +layout(set=0,binding=1,rgba16f) uniform image2D OutputTexture; + +#define USE_RCAS 1 + +#include "fidelityfx_fsr.comp" diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index c91b24e3a..8ce60e874 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -23,6 +23,7 @@ #include "video_core/host_shaders/vulkan_present_vert_spv.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" +#include "video_core/renderer_vulkan/vk_fsr.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_util.h" @@ -147,8 +148,12 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, scheduler.Wait(resource_ticks[image_index]); resource_ticks[image_index] = scheduler.CurrentTick(); - UpdateDescriptorSet(image_index, - use_accelerated ? screen_info.image_view : *raw_image_views[image_index]); + const VkImageView source_image_view = + use_accelerated ? screen_info.image_view : *raw_image_views[image_index]; + + if (!fsr) { + UpdateDescriptorSet(image_index, source_image_view); + } BufferData data; SetUniformData(data, layout); @@ -225,9 +230,26 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, read_barrier); cmdbuf.CopyBufferToImage(*buffer, image, VK_IMAGE_LAYOUT_GENERAL, copy); cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, write_barrier); + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + 0, write_barrier); }); } + + if (fsr) { + auto crop_rect = framebuffer.crop_rect; + if (crop_rect.GetWidth() == 0) { + crop_rect.right = framebuffer.width; + } + if (crop_rect.GetHeight() == 0) { + crop_rect.bottom = framebuffer.height; + } + crop_rect = crop_rect.Scale(Settings::values.resolution_info.up_factor); + VkImageView fsr_image_view = + fsr->Draw(scheduler, image_index, source_image_view, crop_rect); + UpdateDescriptorSet(image_index, fsr_image_view); + } + scheduler.Record( [this, host_framebuffer, image_index, size = render_area](vk::CommandBuffer cmdbuf) { const f32 bg_red = Settings::values.bg_red.GetValue() / 255.0f; @@ -325,6 +347,13 @@ void VKBlitScreen::CreateDynamicResources() { CreateRenderPass(); CreateFramebuffers(); CreateGraphicsPipeline(); + fsr.reset(); + if (Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::Fsr) { + const auto& layout = render_window.GetFramebufferLayout(); + fsr = std::make_unique( + device, memory_allocator, image_count, + VkExtent2D{.width = layout.screen.GetWidth(), .height = layout.screen.GetHeight()}); + } } void VKBlitScreen::RefreshResources(const Tegra::FramebufferConfig& framebuffer) { @@ -716,13 +745,14 @@ void VKBlitScreen::CreateGraphicsPipeline() { } void VKBlitScreen::CreateSampler() { + bool linear = Settings::values.scaling_filter.GetValue() != Settings::ScalingFilter::Fsr; const VkSamplerCreateInfo ci{ .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, .pNext = nullptr, .flags = 0, - .magFilter = VK_FILTER_LINEAR, - .minFilter = VK_FILTER_LINEAR, - .mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR, + .magFilter = linear ? VK_FILTER_LINEAR : VK_FILTER_NEAREST, + .minFilter = linear ? VK_FILTER_LINEAR : VK_FILTER_NEAREST, + .mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST, .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, @@ -905,17 +935,19 @@ void VKBlitScreen::SetVertexData(BufferData& data, const Tegra::FramebufferConfi UNIMPLEMENTED_IF(framebuffer_crop_rect.top != 0); UNIMPLEMENTED_IF(framebuffer_crop_rect.left != 0); - // Scale the output by the crop width/height. This is commonly used with 1280x720 rendering - // (e.g. handheld mode) on a 1920x1080 framebuffer. f32 scale_u = 1.0f; f32 scale_v = 1.0f; - if (framebuffer_crop_rect.GetWidth() > 0) { - scale_u = static_cast(framebuffer_crop_rect.GetWidth()) / - static_cast(screen_info.width); - } - if (framebuffer_crop_rect.GetHeight() > 0) { - scale_v = static_cast(framebuffer_crop_rect.GetHeight()) / - static_cast(screen_info.height); + // Scale the output by the crop width/height. This is commonly used with 1280x720 rendering + // (e.g. handheld mode) on a 1920x1080 framebuffer. + if (!fsr) { + if (framebuffer_crop_rect.GetWidth() > 0) { + scale_u = static_cast(framebuffer_crop_rect.GetWidth()) / + static_cast(screen_info.width); + } + if (framebuffer_crop_rect.GetHeight() > 0) { + scale_v = static_cast(framebuffer_crop_rect.GetHeight()) / + static_cast(screen_info.height); + } } const auto& screen = layout.screen; diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index d3a16f0ba..337931468 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -38,6 +38,8 @@ class RasterizerVulkan; class VKScheduler; class VKSwapchain; +class FSR; + struct VKScreenInfo { VkImageView image_view{}; u32 width{}; @@ -132,6 +134,8 @@ private: std::vector raw_buffer_commits; u32 raw_width = 0; u32 raw_height = 0; + + std::unique_ptr fsr; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_fsr.cpp b/src/video_core/renderer_vulkan/vk_fsr.cpp new file mode 100644 index 000000000..fd0a4aa42 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_fsr.cpp @@ -0,0 +1,375 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/div_ceil.h" +#include "video_core/host_shaders/vulkan_fidelityfx_fsr_easu_comp_spv.h" +#include "video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_comp_spv.h" +#include "video_core/renderer_vulkan/vk_fsr.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_shader_util.h" +#include "video_core/vulkan_common/vulkan_device.h" + +namespace Vulkan { + +FSR::FSR(const Device& device, MemoryAllocator& memory_allocator, size_t image_count, + VkExtent2D output_size) + : device{device}, memory_allocator{memory_allocator}, image_count{image_count}, + output_size{output_size} { + + CreateImages(); + CreateSampler(); + CreateShaders(); + CreateDescriptorPool(); + CreateDescriptorSetLayout(); + CreateDescriptorSets(); + CreatePipelineLayout(); + CreatePipeline(); +} + +VkImageView FSR::Draw(VKScheduler& scheduler, size_t image_index, VkImageView image_view, + const Common::Rectangle& crop_rect) { + + UpdateDescriptorSet(image_index, image_view); + + scheduler.Record([this, image_index, crop_rect](vk::CommandBuffer cmdbuf) { + const VkImageMemoryBarrier base_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = 0, + .dstAccessMask = 0, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = {}, + .subresourceRange = + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1, + }, + }; + + // TODO: Support clear color + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *easu_pipeline); + cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, + VkExtent2D{ + .width = static_cast(crop_rect.GetWidth()), + .height = static_cast(crop_rect.GetHeight()), + }); + + { + VkImageMemoryBarrier fsr_write_barrier = base_barrier; + fsr_write_barrier.image = *images[image_index], + fsr_write_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, fsr_write_barrier); + } + + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline_layout, 0, + descriptor_sets[image_index * 2], {}); + cmdbuf.Dispatch(Common::DivCeil(output_size.width, 16u), + Common::DivCeil(output_size.height, 16u), 1); + + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *rcas_pipeline); + cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, output_size); + + { + std::array barriers; + auto& fsr_read_barrier = barriers[0]; + auto& blit_write_barrier = barriers[1]; + + fsr_read_barrier = base_barrier; + fsr_read_barrier.image = *images[image_index]; + fsr_read_barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + fsr_read_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + + blit_write_barrier = base_barrier; + blit_write_barrier.image = *images[image_count + image_index]; + blit_write_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + blit_write_barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; + + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, {}, {}, barriers); + } + + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline_layout, 0, + descriptor_sets[image_index * 2 + 1], {}); + cmdbuf.Dispatch(Common::DivCeil(output_size.width, 16u), + Common::DivCeil(output_size.height, 16u), 1); + + { + std::array barriers; + auto& blit_read_barrier = barriers[0]; + + blit_read_barrier = base_barrier; + blit_read_barrier.image = *images[image_count + image_index]; + blit_read_barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + blit_read_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, {}, {}, barriers); + } + }); + + return *image_views[image_count + image_index]; +} + +void FSR::CreateDescriptorPool() { + const std::array pool_sizes{{ + { + .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .descriptorCount = static_cast(image_count * 2), + }, + { + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = static_cast(image_count * 2), + }, + }}; + + const VkDescriptorPoolCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, + .maxSets = static_cast(image_count * 2), + .poolSizeCount = static_cast(pool_sizes.size()), + .pPoolSizes = pool_sizes.data(), + }; + descriptor_pool = device.GetLogical().CreateDescriptorPool(ci); +} + +void FSR::CreateDescriptorSetLayout() { + const std::array layout_bindings{{ + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = sampler.address(), + }, + { + .binding = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = sampler.address(), + }, + }}; + + const VkDescriptorSetLayoutCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .bindingCount = static_cast(layout_bindings.size()), + .pBindings = layout_bindings.data(), + }; + + descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout(ci); +} + +void FSR::CreateDescriptorSets() { + const u32 sets = static_cast(image_count * 2); + const std::vector layouts(sets, *descriptor_set_layout); + + const VkDescriptorSetAllocateInfo ai{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .pNext = nullptr, + .descriptorPool = *descriptor_pool, + .descriptorSetCount = sets, + .pSetLayouts = layouts.data(), + }; + + descriptor_sets = descriptor_pool.Allocate(ai); +} + +void FSR::CreateImages() { + images.resize(image_count * 2); + image_views.resize(image_count * 2); + buffer_commits.resize(image_count * 2); + + for (size_t i = 0; i < image_count * 2; ++i) { + images[i] = device.GetLogical().CreateImage(VkImageCreateInfo{ + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .imageType = VK_IMAGE_TYPE_2D, + .format = VK_FORMAT_R16G16B16A16_SFLOAT, + .extent = + { + .width = output_size.width, + .height = output_size.height, + .depth = 1, + }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT | + VK_IMAGE_USAGE_SAMPLED_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + }); + buffer_commits[i] = memory_allocator.Commit(images[i], MemoryUsage::DeviceLocal); + image_views[i] = device.GetLogical().CreateImageView(VkImageViewCreateInfo{ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .image = *images[i], + .viewType = VK_IMAGE_VIEW_TYPE_2D, + .format = VK_FORMAT_R16G16B16A16_SFLOAT, + .components = + { + .r = VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }, + .subresourceRange = + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1, + }, + }); + } +} + +void FSR::CreatePipelineLayout() { + VkPushConstantRange push_const{ + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = sizeof(std::array), + }; + VkPipelineLayoutCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .setLayoutCount = 1, + .pSetLayouts = descriptor_set_layout.address(), + .pushConstantRangeCount = 1, + .pPushConstantRanges = &push_const, + }; + + pipeline_layout = device.GetLogical().CreatePipelineLayout(ci); +} + +void FSR::UpdateDescriptorSet(std::size_t image_index, VkImageView image_view) const { + const auto fsr_image_view = *image_views[image_index]; + const auto blit_image_view = *image_views[image_count + image_index]; + + const VkDescriptorImageInfo image_info{ + .imageView = image_view, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + const VkDescriptorImageInfo fsr_image_info{ + .imageView = fsr_image_view, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + const VkDescriptorImageInfo blit_image_info{ + .imageView = blit_image_view, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + + VkWriteDescriptorSet sampler_write{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .pNext = nullptr, + .dstSet = descriptor_sets[image_index * 2], + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .pImageInfo = &image_info, + .pBufferInfo = nullptr, + .pTexelBufferView = nullptr, + }; + + VkWriteDescriptorSet output_write{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .pNext = nullptr, + .dstSet = descriptor_sets[image_index * 2], + .dstBinding = 1, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .pImageInfo = &fsr_image_info, + .pBufferInfo = nullptr, + .pTexelBufferView = nullptr, + }; + + device.GetLogical().UpdateDescriptorSets(std::array{sampler_write, output_write}, {}); + + sampler_write.dstSet = descriptor_sets[image_index * 2 + 1]; + sampler_write.pImageInfo = &fsr_image_info; + output_write.dstSet = descriptor_sets[image_index * 2 + 1]; + output_write.pImageInfo = &blit_image_info; + + device.GetLogical().UpdateDescriptorSets(std::array{sampler_write, output_write}, {}); +} + +void FSR::CreateSampler() { + const VkSamplerCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .magFilter = VK_FILTER_LINEAR, + .minFilter = VK_FILTER_LINEAR, + .mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR, + .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, + .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, + .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, + .mipLodBias = 0.0f, + .anisotropyEnable = VK_FALSE, + .maxAnisotropy = 0.0f, + .compareEnable = VK_FALSE, + .compareOp = VK_COMPARE_OP_NEVER, + .minLod = 0.0f, + .maxLod = 0.0f, + .borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK, + .unnormalizedCoordinates = VK_FALSE, + }; + + sampler = device.GetLogical().CreateSampler(ci); +} + +void FSR::CreateShaders() { + easu_shader = BuildShader(device, VULKAN_FIDELITYFX_FSR_EASU_COMP_SPV); + rcas_shader = BuildShader(device, VULKAN_FIDELITYFX_FSR_EASU_COMP_SPV); +} + +void FSR::CreatePipeline() { + VkPipelineShaderStageCreateInfo shader_stage{ + + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .pName = "main", + .pSpecializationInfo = nullptr, + }; + + VkComputePipelineCreateInfo pipeline_ci{ + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .layout = *pipeline_layout, + .basePipelineIndex = 0, + }; + + shader_stage.module = *easu_shader; + pipeline_ci.stage = shader_stage; + easu_pipeline = device.GetLogical().CreateComputePipeline(pipeline_ci); + + shader_stage.module = *rcas_shader; + pipeline_ci.stage = shader_stage; + rcas_pipeline = device.GetLogical().CreateComputePipeline(pipeline_ci); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_fsr.h b/src/video_core/renderer_vulkan/vk_fsr.h new file mode 100644 index 000000000..8391e2e58 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_fsr.h @@ -0,0 +1,54 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "common/math_util.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Vulkan { + +class Device; +class VKScheduler; + +class FSR { +public: + explicit FSR(const Device& device, MemoryAllocator& memory_allocator, size_t image_count, + VkExtent2D output_size); + VkImageView Draw(VKScheduler& scheduler, size_t image_index, VkImageView image_view, + const Common::Rectangle& crop_rect); + +private: + void CreateDescriptorPool(); + void CreateDescriptorSetLayout(); + void CreateDescriptorSets(); + void CreateImages(); + void CreateSampler(); + void CreateShaders(); + void CreatePipeline(); + void CreatePipelineLayout(); + + void UpdateDescriptorSet(std::size_t image_index, VkImageView image_view) const; + + const Device& device; + MemoryAllocator& memory_allocator; + size_t image_count; + VkExtent2D output_size; + + vk::DescriptorPool descriptor_pool; + vk::DescriptorSetLayout descriptor_set_layout; + vk::DescriptorSets descriptor_sets; + vk::PipelineLayout pipeline_layout; + vk::ShaderModule easu_shader; + vk::ShaderModule rcas_shader; + vk::Pipeline easu_pipeline; + vk::Pipeline rcas_pipeline; + vk::Sampler sampler; + std::vector images; + std::vector image_views; + std::vector buffer_commits; +}; + +} // namespace Vulkan diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui index d5e0d4e89..014ca6683 100644 --- a/src/yuzu/configuration/configure_graphics.ui +++ b/src/yuzu/configuration/configure_graphics.ui @@ -402,6 +402,11 @@ ScaleForce + + + FidelityFX Super Resolution + + From 77b0812d69c05e5df998b03ffa57655973a452d3 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 23 Oct 2021 23:35:02 -0400 Subject: [PATCH 104/156] externals: Add only included ffx-fsr headers The submodule adds a lot of unneeded bloat due its addition of samples that contain large media files that are difficult to compress. --- externals/FidelityFX-FSR/ffx-fsr/ffx_a.h | 2656 +++++++++++++++++++ externals/FidelityFX-FSR/ffx-fsr/ffx_fsr1.h | 1199 +++++++++ externals/FidelityFX-FSR/license.txt | 19 + 3 files changed, 3874 insertions(+) create mode 100644 externals/FidelityFX-FSR/ffx-fsr/ffx_a.h create mode 100644 externals/FidelityFX-FSR/ffx-fsr/ffx_fsr1.h create mode 100644 externals/FidelityFX-FSR/license.txt diff --git a/externals/FidelityFX-FSR/ffx-fsr/ffx_a.h b/externals/FidelityFX-FSR/ffx-fsr/ffx_a.h new file mode 100644 index 000000000..d04bff55c --- /dev/null +++ b/externals/FidelityFX-FSR/ffx-fsr/ffx_a.h @@ -0,0 +1,2656 @@ +//============================================================================================================================== +// +// [A] SHADER PORTABILITY 1.20210629 +// +//============================================================================================================================== +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +// MIT LICENSE +// =========== +// Copyright (c) 2014 Michal Drobot (for concepts used in "FLOAT APPROXIMATIONS"). +// ----------- +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// ----------- +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +// Software. +// ----------- +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// Common central point for high-level shading language and C portability for various shader headers. +//------------------------------------------------------------------------------------------------------------------------------ +// DEFINES +// ======= +// A_CPU ..... Include the CPU related code. +// A_GPU ..... Include the GPU related code. +// A_GLSL .... Using GLSL. +// A_HLSL .... Using HLSL. +// A_HLSL_6_2 Using HLSL 6.2 with new 'uint16_t' and related types (requires '-enable-16bit-types'). +// A_NO_16_BIT_CAST Don't use instructions that are not availabe in SPIR-V (needed for running A_HLSL_6_2 on Vulkan) +// A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default). +// ======= +// A_BYTE .... Support 8-bit integer. +// A_HALF .... Support 16-bit integer and floating point. +// A_LONG .... Support 64-bit integer. +// A_DUBL .... Support 64-bit floating point. +// ======= +// A_WAVE .... Support wave-wide operations. +//------------------------------------------------------------------------------------------------------------------------------ +// To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'. +//------------------------------------------------------------------------------------------------------------------------------ +// SIMPLIFIED TYPE SYSTEM +// ====================== +// - All ints will be unsigned with exception of when signed is required. +// - Type naming simplified and shortened "A<#components>", +// - H = 16-bit float (half) +// - F = 32-bit float (float) +// - D = 64-bit float (double) +// - P = 1-bit integer (predicate, not using bool because 'B' is used for byte) +// - B = 8-bit integer (byte) +// - W = 16-bit integer (word) +// - U = 32-bit integer (unsigned) +// - L = 64-bit integer (long) +// - Using "AS<#components>" for signed when required. +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops). +//------------------------------------------------------------------------------------------------------------------------------ +// CHANGE LOG +// ========== +// 20200914 - Expanded wave ops and prx code. +// 20200713 - Added [ZOL] section, fixed serious bugs in sRGB and Rec.709 color conversion code, etc. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// COMMON +//============================================================================================================================== +#define A_2PI 6.28318530718 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// CPU +// +// +//============================================================================================================================== +#ifdef A_CPU + // Supporting user defined overrides. + #ifndef A_RESTRICT + #define A_RESTRICT __restrict + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifndef A_STATIC + #define A_STATIC static + #endif +//------------------------------------------------------------------------------------------------------------------------------ + // Same types across CPU and GPU. + // Predicate uses 32-bit integer (C friendly bool). + typedef uint32_t AP1; + typedef float AF1; + typedef double AD1; + typedef uint8_t AB1; + typedef uint16_t AW1; + typedef uint32_t AU1; + typedef uint64_t AL1; + typedef int8_t ASB1; + typedef int16_t ASW1; + typedef int32_t ASU1; + typedef int64_t ASL1; +//------------------------------------------------------------------------------------------------------------------------------ + #define AD1_(a) ((AD1)(a)) + #define AF1_(a) ((AF1)(a)) + #define AL1_(a) ((AL1)(a)) + #define AU1_(a) ((AU1)(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASL1_(a) ((ASL1)(a)) + #define ASU1_(a) ((ASU1)(a)) +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;} +//------------------------------------------------------------------------------------------------------------------------------ + #define A_TRUE 1 + #define A_FALSE 0 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// CPU/GPU PORTING +// +//------------------------------------------------------------------------------------------------------------------------------ +// Get CPU and GPU to share all setup code, without duplicate code paths. +// This uses a lower-case prefix for special vector constructs. +// - In C restrict pointers are used. +// - In the shading language, in/inout/out arguments are used. +// This depends on the ability to access a vector value in both languages via array syntax (aka color[2]). +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY +//============================================================================================================================== + #define retAD2 AD1 *A_RESTRICT + #define retAD3 AD1 *A_RESTRICT + #define retAD4 AD1 *A_RESTRICT + #define retAF2 AF1 *A_RESTRICT + #define retAF3 AF1 *A_RESTRICT + #define retAF4 AF1 *A_RESTRICT + #define retAL2 AL1 *A_RESTRICT + #define retAL3 AL1 *A_RESTRICT + #define retAL4 AL1 *A_RESTRICT + #define retAU2 AU1 *A_RESTRICT + #define retAU3 AU1 *A_RESTRICT + #define retAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define inAD2 AD1 *A_RESTRICT + #define inAD3 AD1 *A_RESTRICT + #define inAD4 AD1 *A_RESTRICT + #define inAF2 AF1 *A_RESTRICT + #define inAF3 AF1 *A_RESTRICT + #define inAF4 AF1 *A_RESTRICT + #define inAL2 AL1 *A_RESTRICT + #define inAL3 AL1 *A_RESTRICT + #define inAL4 AL1 *A_RESTRICT + #define inAU2 AU1 *A_RESTRICT + #define inAU3 AU1 *A_RESTRICT + #define inAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define inoutAD2 AD1 *A_RESTRICT + #define inoutAD3 AD1 *A_RESTRICT + #define inoutAD4 AD1 *A_RESTRICT + #define inoutAF2 AF1 *A_RESTRICT + #define inoutAF3 AF1 *A_RESTRICT + #define inoutAF4 AF1 *A_RESTRICT + #define inoutAL2 AL1 *A_RESTRICT + #define inoutAL3 AL1 *A_RESTRICT + #define inoutAL4 AL1 *A_RESTRICT + #define inoutAU2 AU1 *A_RESTRICT + #define inoutAU3 AU1 *A_RESTRICT + #define inoutAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define outAD2 AD1 *A_RESTRICT + #define outAD3 AD1 *A_RESTRICT + #define outAD4 AD1 *A_RESTRICT + #define outAF2 AF1 *A_RESTRICT + #define outAF3 AF1 *A_RESTRICT + #define outAF4 AF1 *A_RESTRICT + #define outAL2 AL1 *A_RESTRICT + #define outAL3 AL1 *A_RESTRICT + #define outAL4 AL1 *A_RESTRICT + #define outAU2 AU1 *A_RESTRICT + #define outAU3 AU1 *A_RESTRICT + #define outAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define varAD2(x) AD1 x[2] + #define varAD3(x) AD1 x[3] + #define varAD4(x) AD1 x[4] + #define varAF2(x) AF1 x[2] + #define varAF3(x) AF1 x[3] + #define varAF4(x) AF1 x[4] + #define varAL2(x) AL1 x[2] + #define varAL3(x) AL1 x[3] + #define varAL4(x) AL1 x[4] + #define varAU2(x) AU1 x[2] + #define varAU3(x) AU1 x[3] + #define varAU4(x) AU1 x[4] +//------------------------------------------------------------------------------------------------------------------------------ + #define initAD2(x,y) {x,y} + #define initAD3(x,y,z) {x,y,z} + #define initAD4(x,y,z,w) {x,y,z,w} + #define initAF2(x,y) {x,y} + #define initAF3(x,y,z) {x,y,z} + #define initAF4(x,y,z,w) {x,y,z,w} + #define initAL2(x,y) {x,y} + #define initAL3(x,y,z) {x,y,z} + #define initAL4(x,y,z,w) {x,y,z,w} + #define initAU2(x,y) {x,y} + #define initAU3(x,y,z) {x,y,z} + #define initAU4(x,y,z,w) {x,y,z,w} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Replace transcendentals with manual versions. +//============================================================================================================================== + #ifdef A_GCC + A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_llabs(ASL1_(a)));} + #else + A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(labs((long)ASL1_(a)));} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);} + #else + A_STATIC AD1 ACosD1(AD1 a){return cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return cosf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} + A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);} + #else + A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);} + #else + A_STATIC AD1 AFloorD1(AD1 a){return floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);} + A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);} + #else + A_STATIC AD1 ALog2D1(AD1 a){return log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;} + A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;} + A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;} + A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + // These follow the convention that A integer types don't have signage, until they are operated on. + A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;} + A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a>ASL1_(b));} + A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);} + A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);} + #else + A_STATIC AD1 ASinD1(AD1 a){return sin(a);} + A_STATIC AF1 ASinF1(AF1 a){return sinf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);} + A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);} + #else + A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);} + A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS - DEPENDENT +//============================================================================================================================== + A_STATIC AD1 AClampD1(AD1 x,AD1 n,AD1 m){return AMaxD1(n,AMinD1(x,m));} + A_STATIC AF1 AClampF1(AF1 x,AF1 n,AF1 m){return AMaxF1(n,AMinF1(x,m));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);} + A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));} + A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));} + A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));} + A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR OPS +//------------------------------------------------------------------------------------------------------------------------------ +// These are added as needed for production or prototyping, so not necessarily a complete set. +// They follow a convention of taking in a destination and also returning the destination value to increase utility. +//============================================================================================================================== + A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;} + A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;} + A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;} + A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;} + A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} + A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} + A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} + A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} + A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;} + A_STATIC retAD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;} + A_STATIC retAD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;} + A_STATIC retAF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;} + A_STATIC retAF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;} +//============================================================================================================================== + A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;} + A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} + A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;} + A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} + A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;} + A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;} + A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;} + A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;} + A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;} + A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;} + A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;} + A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;} + A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;} + A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;} + A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;} + A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;} + A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;} + A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;} + A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;} + A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;} + A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} + A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} + A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} + A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} + A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} + A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;} + A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} + A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;} + A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;} +//============================================================================================================================== + A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){d[0]=-a[0];d[1]=-a[1];return d;} + A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;} + A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){d[0]=-a[0];d[1]=-a[1];return d;} + A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;} + A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);return d;} + A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);return d;} + A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);d[3]=ARcpD1(a[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);return d;} + A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);return d;} + A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);d[3]=ARcpF1(a[3]);return d;} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HALF FLOAT PACKING +//============================================================================================================================== + // Convert float to half (in lower 16-bits of output). + // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf + // Supports denormals. + // Conversion rules are to make computations possibly "safer" on the GPU, + // -INF & -NaN -> -65504 + // +INF & +NaN -> +65504 + A_STATIC AU1 AU1_AH1_AF1(AF1 f){ + static AW1 base[512]={ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100, + 0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00, + 0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100, + 0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00, + 0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff}; + static AB1 shift[512]={ + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, + 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, + 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, + 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, + 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18}; + union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);} +//------------------------------------------------------------------------------------------------------------------------------ + // Used to output packed constant. + A_STATIC AU1 AU1_AH2_AF2(inAF2 a){return AU1_AH1_AF1(a[0])+(AU1_AH1_AF1(a[1])<<16);} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GLSL +// +// +//============================================================================================================================== +#if defined(A_GLSL) && defined(A_GPU) + #ifndef A_SKIP_EXT + #ifdef A_HALF + #extension GL_EXT_shader_16bit_storage:require + #extension GL_EXT_shader_explicit_arithmetic_types:require + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_LONG + #extension GL_ARB_gpu_shader_int64:require + #extension GL_NV_shader_atomic_int64:require + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_WAVE + #extension GL_KHR_shader_subgroup_arithmetic:require + #extension GL_KHR_shader_subgroup_ballot:require + #extension GL_KHR_shader_subgroup_quad:require + #extension GL_KHR_shader_subgroup_shuffle:require + #endif + #endif +//============================================================================================================================== + #define AP1 bool + #define AP2 bvec2 + #define AP3 bvec3 + #define AP4 bvec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float + #define AF2 vec2 + #define AF3 vec3 + #define AF4 vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint + #define AU2 uvec2 + #define AU3 uvec3 + #define AU4 uvec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int + #define ASU2 ivec2 + #define ASU3 ivec3 + #define ASU4 ivec4 +//============================================================================================================================== + #define AF1_AU1(x) uintBitsToFloat(AU1(x)) + #define AF2_AU2(x) uintBitsToFloat(AU2(x)) + #define AF3_AU3(x) uintBitsToFloat(AU3(x)) + #define AF4_AU4(x) uintBitsToFloat(AU4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AF1(x) floatBitsToUint(AF1(x)) + #define AU2_AF2(x) floatBitsToUint(AF2(x)) + #define AU3_AF3(x) floatBitsToUint(AF3(x)) + #define AU4_AF4(x) floatBitsToUint(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH1_AF1_x(AF1 a){return packHalf2x16(AF2(a,0.0));} + #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AH2_AF2 packHalf2x16 + #define AU1_AW2Unorm_AF2 packUnorm2x16 + #define AU1_AB4Unorm_AF4 packUnorm4x8 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF2_AH2_AU1 unpackHalf2x16 + #define AF2_AW2Unorm_AU1 unpackUnorm2x16 + #define AF4_AB4Unorm_AU1 unpackUnorm4x8 +//============================================================================================================================== + AF1 AF1_x(AF1 a){return AF1(a);} + AF2 AF2_x(AF1 a){return AF2(a,a);} + AF3 AF3_x(AF1 a){return AF3(a,a,a);} + AF4 AF4_x(AF1 a){return AF4(a,a,a,a);} + #define AF1_(a) AF1_x(AF1(a)) + #define AF2_(a) AF2_x(AF1(a)) + #define AF3_(a) AF3_x(AF1(a)) + #define AF4_(a) AF4_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_x(AU1 a){return AU1(a);} + AU2 AU2_x(AU1 a){return AU2(a,a);} + AU3 AU3_x(AU1 a){return AU3(a,a,a);} + AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} + #define AU1_(a) AU1_x(AU1(a)) + #define AU2_(a) AU2_x(AU1(a)) + #define AU3_(a) AU3_x(AU1(a)) + #define AU4_(a) AU4_x(AU1(a)) +//============================================================================================================================== + AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} + AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} + AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} + AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));} + AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} + // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<>ASU1(b));} + AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} + AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} + AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL BYTE +//============================================================================================================================== + #ifdef A_BYTE + #define AB1 uint8_t + #define AB2 u8vec2 + #define AB3 u8vec3 + #define AB4 u8vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASB1 int8_t + #define ASB2 i8vec2 + #define ASB3 i8vec3 + #define ASB4 i8vec4 +//------------------------------------------------------------------------------------------------------------------------------ + AB1 AB1_x(AB1 a){return AB1(a);} + AB2 AB2_x(AB1 a){return AB2(a,a);} + AB3 AB3_x(AB1 a){return AB3(a,a,a);} + AB4 AB4_x(AB1 a){return AB4(a,a,a,a);} + #define AB1_(a) AB1_x(AB1(a)) + #define AB2_(a) AB2_x(AB1(a)) + #define AB3_(a) AB3_x(AB1(a)) + #define AB4_(a) AB4_x(AB1(a)) + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL HALF +//============================================================================================================================== + #ifdef A_HALF + #define AH1 float16_t + #define AH2 f16vec2 + #define AH3 f16vec3 + #define AH4 f16vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 uint16_t + #define AW2 u16vec2 + #define AW3 u16vec3 + #define AW4 u16vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 int16_t + #define ASW2 i16vec2 + #define ASW3 i16vec3 + #define ASW4 i16vec4 +//============================================================================================================================== + #define AH2_AU1(x) unpackFloat2x16(AU1(x)) + AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));} + #define AH4_AU2(x) AH4_AU2_x(AU2(x)) + #define AW2_AU1(x) unpackUint2x16(AU1(x)) + #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x))) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AH2(x) packFloat2x16(AH2(x)) + AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));} + #define AU2_AH4(x) AU2_AH4_x(AH4(x)) + #define AU1_AW2(x) packUint2x16(AW2(x)) + #define AU2_AW4(x) unpack32(packUint4x16(AW4(x))) +//============================================================================================================================== + #define AW1_AH1(x) halfBitsToUint16(AH1(x)) + #define AW2_AH2(x) halfBitsToUint16(AH2(x)) + #define AW3_AH3(x) halfBitsToUint16(AH3(x)) + #define AW4_AH4(x) halfBitsToUint16(AH4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AH1_AW1(x) uint16BitsToHalf(AW1(x)) + #define AH2_AW2(x) uint16BitsToHalf(AW2(x)) + #define AH3_AW3(x) uint16BitsToHalf(AW3(x)) + #define AH4_AW4(x) uint16BitsToHalf(AW4(x)) +//============================================================================================================================== + AH1 AH1_x(AH1 a){return AH1(a);} + AH2 AH2_x(AH1 a){return AH2(a,a);} + AH3 AH3_x(AH1 a){return AH3(a,a,a);} + AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} + #define AH1_(a) AH1_x(AH1(a)) + #define AH2_(a) AH2_x(AH1(a)) + #define AH3_(a) AH3_x(AH1(a)) + #define AH4_(a) AH4_x(AH1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AW1_x(AW1 a){return AW1(a);} + AW2 AW2_x(AW1 a){return AW2(a,a);} + AW3 AW3_x(AW1 a){return AW3(a,a,a);} + AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} + #define AW1_(a) AW1_x(AW1(a)) + #define AW2_(a) AW2_x(AW1(a)) + #define AW3_(a) AW3_x(AW1(a)) + #define AW4_(a) AW4_x(AW1(a)) +//============================================================================================================================== + AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} + AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} + AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} + AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AClampH1(AH1 x,AH1 n,AH1 m){return clamp(x,n,m);} + AH2 AClampH2(AH2 x,AH2 n,AH2 m){return clamp(x,n,m);} + AH3 AClampH3(AH3 x,AH3 n,AH3 m){return clamp(x,n,m);} + AH4 AClampH4(AH4 x,AH4 n,AH4 m){return clamp(x,n,m);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFractH1(AH1 x){return fract(x);} + AH2 AFractH2(AH2 x){return fract(x);} + AH3 AFractH3(AH3 x){return fract(x);} + AH4 AFractH4(AH4 x){return fract(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);} + AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);} + AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);} + AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + // No packed version of max3. + AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} + AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} + AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} + AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));} + AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));} + AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));} + AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + // No packed version of min3. + AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} + AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} + AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} + AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));} + AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));} + AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));} + AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;} + AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;} + AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;} + AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);} + AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);} + AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);} + AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));} + AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));} + AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));} + AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} + AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} + AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} + AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL DOUBLE +//============================================================================================================================== + #ifdef A_DUBL + #define AD1 double + #define AD2 dvec2 + #define AD3 dvec3 + #define AD4 dvec4 +//------------------------------------------------------------------------------------------------------------------------------ + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + AD1 AFractD1(AD1 x){return fract(x);} + AD2 AFractD2(AD2 x){return fract(x);} + AD3 AFractD3(AD3 x){return fract(x);} + AD4 AFractD4(AD4 x){return fract(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return mix(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return mix(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return mix(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return mix(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return AD1_(1.0)/x;} + AD2 ARcpD2(AD2 x){return AD2_(1.0)/x;} + AD3 ARcpD3(AD3 x){return AD3_(1.0)/x;} + AD4 ARcpD4(AD4 x){return AD4_(1.0)/x;} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARsqD1(AD1 x){return AD1_(1.0)/sqrt(x);} + AD2 ARsqD2(AD2 x){return AD2_(1.0)/sqrt(x);} + AD3 ARsqD3(AD3 x){return AD3_(1.0)/sqrt(x);} + AD4 ARsqD4(AD4 x){return AD4_(1.0)/sqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return clamp(x,AD1_(0.0),AD1_(1.0));} + AD2 ASatD2(AD2 x){return clamp(x,AD2_(0.0),AD2_(1.0));} + AD3 ASatD3(AD3 x){return clamp(x,AD3_(0.0),AD3_(1.0));} + AD4 ASatD4(AD4 x){return clamp(x,AD4_(0.0),AD4_(1.0));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL LONG +//============================================================================================================================== + #ifdef A_LONG + #define AL1 uint64_t + #define AL2 u64vec2 + #define AL3 u64vec3 + #define AL4 u64vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASL1 int64_t + #define ASL2 i64vec2 + #define ASL3 i64vec3 + #define ASL4 i64vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AL1_AU2(x) packUint2x32(AU2(x)) + #define AU2_AL1(x) unpackUint2x32(AL1(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AL1_x(AL1 a){return AL1(a);} + AL2 AL2_x(AL1 a){return AL2(a,a);} + AL3 AL3_x(AL1 a){return AL3(a,a,a);} + AL4 AL4_x(AL1 a){return AL4(a,a,a,a);} + #define AL1_(a) AL1_x(AL1(a)) + #define AL2_(a) AL2_x(AL1(a)) + #define AL3_(a) AL3_x(AL1(a)) + #define AL4_(a) AL4_x(AL1(a)) +//============================================================================================================================== + AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));} + AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));} + AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));} + AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASU1(a),ASU1(b)));} + AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASU2(a),ASU2(b)));} + AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASU3(a),ASU3(b)));} + AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASU1(a),ASU1(b)));} + AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASU2(a),ASU2(b)));} + AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASU3(a),ASU3(b)));} + AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASU4(a),ASU4(b)));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// WAVE OPERATIONS +//============================================================================================================================== + #ifdef A_WAVE + // Where 'x' must be a compile time literal. + AF1 AWaveXorF1(AF1 v,AU1 x){return subgroupShuffleXor(v,x);} + AF2 AWaveXorF2(AF2 v,AU1 x){return subgroupShuffleXor(v,x);} + AF3 AWaveXorF3(AF3 v,AU1 x){return subgroupShuffleXor(v,x);} + AF4 AWaveXorF4(AF4 v,AU1 x){return subgroupShuffleXor(v,x);} + AU1 AWaveXorU1(AU1 v,AU1 x){return subgroupShuffleXor(v,x);} + AU2 AWaveXorU2(AU2 v,AU1 x){return subgroupShuffleXor(v,x);} + AU3 AWaveXorU3(AU3 v,AU1 x){return subgroupShuffleXor(v,x);} + AU4 AWaveXorU4(AU4 v,AU1 x){return subgroupShuffleXor(v,x);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(subgroupShuffleXor(AU1_AH2(v),x));} + AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(subgroupShuffleXor(AU2_AH4(v),x));} + AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(subgroupShuffleXor(AU1_AW2(v),x));} + AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(subgroupShuffleXor(AU2_AW4(v),x));} + #endif + #endif +//============================================================================================================================== +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// HLSL +// +// +//============================================================================================================================== +#if defined(A_HLSL) && defined(A_GPU) + #ifdef A_HLSL_6_2 + #define AP1 bool + #define AP2 bool2 + #define AP3 bool3 + #define AP4 bool4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float32_t + #define AF2 float32_t2 + #define AF3 float32_t3 + #define AF4 float32_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint32_t + #define AU2 uint32_t2 + #define AU3 uint32_t3 + #define AU4 uint32_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int32_t + #define ASU2 int32_t2 + #define ASU3 int32_t3 + #define ASU4 int32_t4 + #else + #define AP1 bool + #define AP2 bool2 + #define AP3 bool3 + #define AP4 bool4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float + #define AF2 float2 + #define AF3 float3 + #define AF4 float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint + #define AU2 uint2 + #define AU3 uint3 + #define AU4 uint4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int + #define ASU2 int2 + #define ASU3 int3 + #define ASU4 int4 + #endif +//============================================================================================================================== + #define AF1_AU1(x) asfloat(AU1(x)) + #define AF2_AU2(x) asfloat(AU2(x)) + #define AF3_AU3(x) asfloat(AU3(x)) + #define AF4_AU4(x) asfloat(AU4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AF1(x) asuint(AF1(x)) + #define AU2_AF2(x) asuint(AF2(x)) + #define AU3_AF3(x) asuint(AF3(x)) + #define AU4_AF4(x) asuint(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH1_AF1_x(AF1 a){return f32tof16(a);} + #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);} + #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a)) + #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));} + #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x)) +//============================================================================================================================== + AF1 AF1_x(AF1 a){return AF1(a);} + AF2 AF2_x(AF1 a){return AF2(a,a);} + AF3 AF3_x(AF1 a){return AF3(a,a,a);} + AF4 AF4_x(AF1 a){return AF4(a,a,a,a);} + #define AF1_(a) AF1_x(AF1(a)) + #define AF2_(a) AF2_x(AF1(a)) + #define AF3_(a) AF3_x(AF1(a)) + #define AF4_(a) AF4_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_x(AU1 a){return AU1(a);} + AU2 AU2_x(AU1 a){return AU2(a,a);} + AU3 AU3_x(AU1 a){return AU3(a,a,a);} + AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} + #define AU1_(a) AU1_x(AU1(a)) + #define AU2_(a) AU2_x(AU1(a)) + #define AU3_(a) AU3_x(AU1(a)) + #define AU4_(a) AU4_x(AU1(a)) +//============================================================================================================================== + AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} + AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} + AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} + AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1u<>off)&mask;} + AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} + AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1u<>ASU1(b));} + AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} + AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} + AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL BYTE +//============================================================================================================================== + #ifdef A_BYTE + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL HALF +//============================================================================================================================== + #ifdef A_HALF + #ifdef A_HLSL_6_2 + #define AH1 float16_t + #define AH2 float16_t2 + #define AH3 float16_t3 + #define AH4 float16_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 uint16_t + #define AW2 uint16_t2 + #define AW3 uint16_t3 + #define AW4 uint16_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 int16_t + #define ASW2 int16_t2 + #define ASW3 int16_t3 + #define ASW4 int16_t4 + #else + #define AH1 min16float + #define AH2 min16float2 + #define AH3 min16float3 + #define AH4 min16float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 min16uint + #define AW2 min16uint2 + #define AW3 min16uint3 + #define AW4 min16uint4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 min16int + #define ASW2 min16int2 + #define ASW3 min16int3 + #define ASW4 min16int4 + #endif +//============================================================================================================================== + // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly). + // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/ + AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);} + AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));} + AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);} + AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));} + #define AH2_AU1(x) AH2_AU1_x(AU1(x)) + #define AH4_AU2(x) AH4_AU2_x(AU2(x)) + #define AW2_AU1(x) AW2_AU1_x(AU1(x)) + #define AW4_AU2(x) AW4_AU2_x(AU2(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);} + AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));} + AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);} + AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));} + #define AU1_AH2(x) AU1_AH2_x(AH2(x)) + #define AU2_AH4(x) AU2_AH4_x(AH4(x)) + #define AU1_AW2(x) AU1_AW2_x(AW2(x)) + #define AU2_AW4(x) AU2_AW4_x(AW4(x)) +//============================================================================================================================== + #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) + #define AW1_AH1(x) asuint16(x) + #define AW2_AH2(x) asuint16(x) + #define AW3_AH3(x) asuint16(x) + #define AW4_AH4(x) asuint16(x) + #else + #define AW1_AH1(a) AW1(f32tof16(AF1(a))) + #define AW2_AH2(a) AW2(AW1_AH1((a).x),AW1_AH1((a).y)) + #define AW3_AH3(a) AW3(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z)) + #define AW4_AH4(a) AW4(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z),AW1_AH1((a).w)) + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) + #define AH1_AW1(x) asfloat16(x) + #define AH2_AW2(x) asfloat16(x) + #define AH3_AW3(x) asfloat16(x) + #define AH4_AW4(x) asfloat16(x) + #else + #define AH1_AW1(a) AH1(f16tof32(AU1(a))) + #define AH2_AW2(a) AH2(AH1_AW1((a).x),AH1_AW1((a).y)) + #define AH3_AW3(a) AH3(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z)) + #define AH4_AW4(a) AH4(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z),AH1_AW1((a).w)) + #endif +//============================================================================================================================== + AH1 AH1_x(AH1 a){return AH1(a);} + AH2 AH2_x(AH1 a){return AH2(a,a);} + AH3 AH3_x(AH1 a){return AH3(a,a,a);} + AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} + #define AH1_(a) AH1_x(AH1(a)) + #define AH2_(a) AH2_x(AH1(a)) + #define AH3_(a) AH3_x(AH1(a)) + #define AH4_(a) AH4_x(AH1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AW1_x(AW1 a){return AW1(a);} + AW2 AW2_x(AW1 a){return AW2(a,a);} + AW3 AW3_x(AW1 a){return AW3(a,a,a);} + AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} + #define AW1_(a) AW1_x(AW1(a)) + #define AW2_(a) AW2_x(AW1(a)) + #define AW3_(a) AW3_x(AW1(a)) + #define AW4_(a) AW4_x(AW1(a)) +//============================================================================================================================== + AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} + AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} + AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} + AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AClampH1(AH1 x,AH1 n,AH1 m){return max(n,min(x,m));} + AH2 AClampH2(AH2 x,AH2 n,AH2 m){return max(n,min(x,m));} + AH3 AClampH3(AH3 x,AH3 n,AH3 m){return max(n,min(x,m));} + AH4 AClampH4(AH4 x,AH4 n,AH4 m){return max(n,min(x,m));} +//------------------------------------------------------------------------------------------------------------------------------ + // V_FRACT_F16 (note DX frac() is different). + AH1 AFractH1(AH1 x){return x-floor(x);} + AH2 AFractH2(AH2 x){return x-floor(x);} + AH3 AFractH3(AH3 x){return x-floor(x);} + AH4 AFractH4(AH4 x){return x-floor(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);} + AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);} + AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);} + AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} + AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} + AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} + AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));} + AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));} + AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));} + AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} + AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} + AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} + AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));} + AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));} + AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));} + AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARcpH1(AH1 x){return rcp(x);} + AH2 ARcpH2(AH2 x){return rcp(x);} + AH3 ARcpH3(AH3 x){return rcp(x);} + AH4 ARcpH4(AH4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARsqH1(AH1 x){return rsqrt(x);} + AH2 ARsqH2(AH2 x){return rsqrt(x);} + AH3 ARsqH3(AH3 x){return rsqrt(x);} + AH4 ARsqH4(AH4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASatH1(AH1 x){return saturate(x);} + AH2 ASatH2(AH2 x){return saturate(x);} + AH3 ASatH3(AH3 x){return saturate(x);} + AH4 ASatH4(AH4 x){return saturate(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} + AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} + AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} + AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL DOUBLE +//============================================================================================================================== + #ifdef A_DUBL + #ifdef A_HLSL_6_2 + #define AD1 float64_t + #define AD2 float64_t2 + #define AD3 float64_t3 + #define AD4 float64_t4 + #else + #define AD1 double + #define AD2 double2 + #define AD3 double3 + #define AD4 double4 + #endif +//------------------------------------------------------------------------------------------------------------------------------ + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + AD1 AFractD1(AD1 a){return a-floor(a);} + AD2 AFractD2(AD2 a){return a-floor(a);} + AD3 AFractD3(AD3 a){return a-floor(a);} + AD4 AFractD4(AD4 a){return a-floor(a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return rcp(x);} + AD2 ARcpD2(AD2 x){return rcp(x);} + AD3 ARcpD3(AD3 x){return rcp(x);} + AD4 ARcpD4(AD4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARsqD1(AD1 x){return rsqrt(x);} + AD2 ARsqD2(AD2 x){return rsqrt(x);} + AD3 ARsqD3(AD3 x){return rsqrt(x);} + AD4 ARsqD4(AD4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return saturate(x);} + AD2 ASatD2(AD2 x){return saturate(x);} + AD3 ASatD3(AD3 x){return saturate(x);} + AD4 ASatD4(AD4 x){return saturate(x);} + #endif +//============================================================================================================================== +// HLSL WAVE +//============================================================================================================================== + #ifdef A_WAVE + // Where 'x' must be a compile time literal. + AF1 AWaveXorF1(AF1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AF2 AWaveXorF2(AF2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AF3 AWaveXorF3(AF3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AF4 AWaveXorF4(AF4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU1 AWaveXorU1(AU1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU2 AWaveXorU1(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU3 AWaveXorU1(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU4 AWaveXorU1(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(WaveReadLaneAt(AU1_AH2(v),WaveGetLaneIndex()^x));} + AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(WaveReadLaneAt(AU2_AH4(v),WaveGetLaneIndex()^x));} + AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(WaveReadLaneAt(AU1_AW2(v),WaveGetLaneIndex()^x));} + AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU1(WaveReadLaneAt(AU1_AW4(v),WaveGetLaneIndex()^x));} + #endif + #endif +//============================================================================================================================== +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU COMMON +// +// +//============================================================================================================================== +#ifdef A_GPU + // Negative and positive infinity. + #define A_INFP_F AF1_AU1(0x7f800000u) + #define A_INFN_F AF1_AU1(0xff800000u) +//------------------------------------------------------------------------------------------------------------------------------ + // Copy sign from 's' to positive 'd'. + AF1 ACpySgnF1(AF1 d,AF1 s){return AF1_AU1(AU1_AF1(d)|(AU1_AF1(s)&AU1_(0x80000000u)));} + AF2 ACpySgnF2(AF2 d,AF2 s){return AF2_AU2(AU2_AF2(d)|(AU2_AF2(s)&AU2_(0x80000000u)));} + AF3 ACpySgnF3(AF3 d,AF3 s){return AF3_AU3(AU3_AF3(d)|(AU3_AF3(s)&AU3_(0x80000000u)));} + AF4 ACpySgnF4(AF4 d,AF4 s){return AF4_AU4(AU4_AF4(d)|(AU4_AF4(s)&AU4_(0x80000000u)));} +//------------------------------------------------------------------------------------------------------------------------------ + // Single operation to return (useful to create a mask to use in lerp for branch free logic), + // m=NaN := 0 + // m>=0 := 0 + // m<0 := 1 + // Uses the following useful floating point logic, + // saturate(+a*(-INF)==-INF) := 0 + // saturate( 0*(-INF)== NaN) := 0 + // saturate(-a*(-INF)==+INF) := 1 + AF1 ASignedF1(AF1 m){return ASatF1(m*AF1_(A_INFN_F));} + AF2 ASignedF2(AF2 m){return ASatF2(m*AF2_(A_INFN_F));} + AF3 ASignedF3(AF3 m){return ASatF3(m*AF3_(A_INFN_F));} + AF4 ASignedF4(AF4 m){return ASatF4(m*AF4_(A_INFN_F));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AGtZeroF1(AF1 m){return ASatF1(m*AF1_(A_INFP_F));} + AF2 AGtZeroF2(AF2 m){return ASatF2(m*AF2_(A_INFP_F));} + AF3 AGtZeroF3(AF3 m){return ASatF3(m*AF3_(A_INFP_F));} + AF4 AGtZeroF4(AF4 m){return ASatF4(m*AF4_(A_INFP_F));} +//============================================================================================================================== + #ifdef A_HALF + #ifdef A_HLSL_6_2 + #define A_INFP_H AH1_AW1((uint16_t)0x7c00u) + #define A_INFN_H AH1_AW1((uint16_t)0xfc00u) + #else + #define A_INFP_H AH1_AW1(0x7c00u) + #define A_INFN_H AH1_AW1(0xfc00u) + #endif + +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ACpySgnH1(AH1 d,AH1 s){return AH1_AW1(AW1_AH1(d)|(AW1_AH1(s)&AW1_(0x8000u)));} + AH2 ACpySgnH2(AH2 d,AH2 s){return AH2_AW2(AW2_AH2(d)|(AW2_AH2(s)&AW2_(0x8000u)));} + AH3 ACpySgnH3(AH3 d,AH3 s){return AH3_AW3(AW3_AH3(d)|(AW3_AH3(s)&AW3_(0x8000u)));} + AH4 ACpySgnH4(AH4 d,AH4 s){return AH4_AW4(AW4_AH4(d)|(AW4_AH4(s)&AW4_(0x8000u)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASignedH1(AH1 m){return ASatH1(m*AH1_(A_INFN_H));} + AH2 ASignedH2(AH2 m){return ASatH2(m*AH2_(A_INFN_H));} + AH3 ASignedH3(AH3 m){return ASatH3(m*AH3_(A_INFN_H));} + AH4 ASignedH4(AH4 m){return ASatH4(m*AH4_(A_INFN_H));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AGtZeroH1(AH1 m){return ASatH1(m*AH1_(A_INFP_H));} + AH2 AGtZeroH2(AH2 m){return ASatH2(m*AH2_(A_INFP_H));} + AH3 AGtZeroH3(AH3 m){return ASatH3(m*AH3_(A_INFP_H));} + AH4 AGtZeroH4(AH4 m){return ASatH4(m*AH4_(A_INFP_H));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [FIS] FLOAT INTEGER SORTABLE +//------------------------------------------------------------------------------------------------------------------------------ +// Float to integer sortable. +// - If sign bit=0, flip the sign bit (positives). +// - If sign bit=1, flip all bits (negatives). +// Integer sortable to float. +// - If sign bit=1, flip the sign bit (positives). +// - If sign bit=0, flip all bits (negatives). +// Has nice side effects. +// - Larger integers are more positive values. +// - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage). +// Burns 3 ops for conversion {shift,or,xor}. +//============================================================================================================================== + AU1 AFisToU1(AU1 x){return x^(( AShrSU1(x,AU1_(31)))|AU1_(0x80000000));} + AU1 AFisFromU1(AU1 x){return x^((~AShrSU1(x,AU1_(31)))|AU1_(0x80000000));} +//------------------------------------------------------------------------------------------------------------------------------ + // Just adjust high 16-bit value (useful when upper part of 32-bit word is a 16-bit float value). + AU1 AFisToHiU1(AU1 x){return x^(( AShrSU1(x,AU1_(15)))|AU1_(0x80000000));} + AU1 AFisFromHiU1(AU1 x){return x^((~AShrSU1(x,AU1_(15)))|AU1_(0x80000000));} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + AW1 AFisToW1(AW1 x){return x^(( AShrSW1(x,AW1_(15)))|AW1_(0x8000));} + AW1 AFisFromW1(AW1 x){return x^((~AShrSW1(x,AW1_(15)))|AW1_(0x8000));} +//------------------------------------------------------------------------------------------------------------------------------ + AW2 AFisToW2(AW2 x){return x^(( AShrSW2(x,AW2_(15)))|AW2_(0x8000));} + AW2 AFisFromW2(AW2 x){return x^((~AShrSW2(x,AW2_(15)))|AW2_(0x8000));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [PERM] V_PERM_B32 +//------------------------------------------------------------------------------------------------------------------------------ +// Support for V_PERM_B32 started in the 3rd generation of GCN. +//------------------------------------------------------------------------------------------------------------------------------ +// yyyyxxxx - The 'i' input. +// 76543210 +// ======== +// HGFEDCBA - Naming on permutation. +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Make sure compiler optimizes this. +//============================================================================================================================== + #ifdef A_HALF + AU1 APerm0E0A(AU2 i){return((i.x )&0xffu)|((i.y<<16)&0xff0000u);} + AU1 APerm0F0B(AU2 i){return((i.x>> 8)&0xffu)|((i.y<< 8)&0xff0000u);} + AU1 APerm0G0C(AU2 i){return((i.x>>16)&0xffu)|((i.y )&0xff0000u);} + AU1 APerm0H0D(AU2 i){return((i.x>>24)&0xffu)|((i.y>> 8)&0xff0000u);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 APermHGFA(AU2 i){return((i.x )&0x000000ffu)|(i.y&0xffffff00u);} + AU1 APermHGFC(AU2 i){return((i.x>>16)&0x000000ffu)|(i.y&0xffffff00u);} + AU1 APermHGAE(AU2 i){return((i.x<< 8)&0x0000ff00u)|(i.y&0xffff00ffu);} + AU1 APermHGCE(AU2 i){return((i.x>> 8)&0x0000ff00u)|(i.y&0xffff00ffu);} + AU1 APermHAFE(AU2 i){return((i.x<<16)&0x00ff0000u)|(i.y&0xff00ffffu);} + AU1 APermHCFE(AU2 i){return((i.x )&0x00ff0000u)|(i.y&0xff00ffffu);} + AU1 APermAGFE(AU2 i){return((i.x<<24)&0xff000000u)|(i.y&0x00ffffffu);} + AU1 APermCGFE(AU2 i){return((i.x<< 8)&0xff000000u)|(i.y&0x00ffffffu);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 APermGCEA(AU2 i){return((i.x)&0x00ff00ffu)|((i.y<<8)&0xff00ff00u);} + AU1 APermGECA(AU2 i){return(((i.x)&0xffu)|((i.x>>8)&0xff00u)|((i.y<<16)&0xff0000u)|((i.y<<8)&0xff000000u));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [BUC] BYTE UNSIGNED CONVERSION +//------------------------------------------------------------------------------------------------------------------------------ +// Designed to use the optimal conversion, enables the scaling to possibly be factored into other computation. +// Works on a range of {0 to A_BUC_<32,16>}, for <32-bit, and 16-bit> respectively. +//------------------------------------------------------------------------------------------------------------------------------ +// OPCODE NOTES +// ============ +// GCN does not do UNORM or SNORM for bytes in opcodes. +// - V_CVT_F32_UBYTE{0,1,2,3} - Unsigned byte to float. +// - V_CVT_PKACC_U8_F32 - Float to unsigned byte (does bit-field insert into 32-bit integer). +// V_PERM_B32 does byte packing with ability to zero fill bytes as well. +// - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo. +//------------------------------------------------------------------------------------------------------------------------------ +// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U1() - Designed for V_CVT_F32_UBYTE* and V_CVT_PKACCUM_U8_F32 ops. +// ==== ===== +// 0 : 0 +// 1 : 1 +// ... +// 255 : 255 +// : 256 (just outside the encoding range) +//------------------------------------------------------------------------------------------------------------------------------ +// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32. +// ==== ===== +// 0 : 0 +// 1 : 1/512 +// 2 : 1/256 +// ... +// 64 : 1/8 +// 128 : 1/4 +// 255 : 255/512 +// : 1/2 (just outside the encoding range) +//------------------------------------------------------------------------------------------------------------------------------ +// OPTIMAL IMPLEMENTATIONS ON AMD ARCHITECTURES +// ============================================ +// r=ABuc0FromU1(i) +// V_CVT_F32_UBYTE0 r,i +// -------------------------------------------- +// r=ABuc0ToU1(d,i) +// V_CVT_PKACCUM_U8_F32 r,i,0,d +// -------------------------------------------- +// d=ABuc0FromU2(i) +// Where 'k0' is an SGPR with 0x0E0A +// Where 'k1' is an SGPR with {32768.0} packed into the lower 16-bits +// V_PERM_B32 d,i.x,i.y,k0 +// V_PK_FMA_F16 d,d,k1.x,0 +// -------------------------------------------- +// r=ABuc0ToU2(d,i) +// Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits +// Where 'k1' is an SGPR with 0x???? +// Where 'k2' is an SGPR with 0x???? +// V_PK_FMA_F16 i,i,k0.x,0 +// V_PERM_B32 r.x,i,i,k1 +// V_PERM_B32 r.y,i,i,k2 +//============================================================================================================================== + // Peak range for 32-bit and 16-bit operations. + #define A_BUC_32 (255.0) + #define A_BUC_16 (255.0/512.0) +//============================================================================================================================== + #if 1 + // Designed to be one V_CVT_PKACCUM_U8_F32. + // The extra min is required to pattern match to V_CVT_PKACCUM_U8_F32. + AU1 ABuc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i),255u) )&(0x000000ffu));} + AU1 ABuc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i),255u)<< 8)&(0x0000ff00u));} + AU1 ABuc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i),255u)<<16)&(0x00ff0000u));} + AU1 ABuc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i),255u)<<24)&(0xff000000u));} +//------------------------------------------------------------------------------------------------------------------------------ + // Designed to be one V_CVT_F32_UBYTE*. + AF1 ABuc0FromU1(AU1 i){return AF1((i )&255u);} + AF1 ABuc1FromU1(AU1 i){return AF1((i>> 8)&255u);} + AF1 ABuc2FromU1(AU1 i){return AF1((i>>16)&255u);} + AF1 ABuc3FromU1(AU1 i){return AF1((i>>24)&255u);} + #endif +//============================================================================================================================== + #ifdef A_HALF + // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}. + AW2 ABuc01ToW2(AH2 x,AH2 y){x*=AH2_(1.0/32768.0);y*=AH2_(1.0/32768.0); + return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));} +//------------------------------------------------------------------------------------------------------------------------------ + // Designed for 3 ops to do SOA to AOS and conversion. + AU2 ABuc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0))); + return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));} + AU2 ABuc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0))); + return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));} + AU2 ABuc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0))); + return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));} + AU2 ABuc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0))); + return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));} +//------------------------------------------------------------------------------------------------------------------------------ + // Designed for 2 ops to do both AOS to SOA, and conversion. + AH2 ABuc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0);} + AH2 ABuc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0);} + AH2 ABuc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0);} + AH2 ABuc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [BSC] BYTE SIGNED CONVERSION +//------------------------------------------------------------------------------------------------------------------------------ +// Similar to [BUC]. +// Works on a range of {-/+ A_BSC_<32,16>}, for <32-bit, and 16-bit> respectively. +//------------------------------------------------------------------------------------------------------------------------------ +// ENCODING (without zero-based encoding) +// ======== +// 0 = unused (can be used to mean something else) +// 1 = lowest value +// 128 = exact zero center (zero based encoding +// 255 = highest value +//------------------------------------------------------------------------------------------------------------------------------ +// Zero-based [Zb] flips the MSB bit of the byte (making 128 "exact zero" actually zero). +// This is useful if there is a desire for cleared values to decode as zero. +//------------------------------------------------------------------------------------------------------------------------------ +// BYTE : FLOAT - ABsc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32. +// ==== ===== +// 0 : -127/512 (unused) +// 1 : -126/512 +// 2 : -125/512 +// ... +// 128 : 0 +// ... +// 255 : 127/512 +// : 1/4 (just outside the encoding range) +//============================================================================================================================== + // Peak range for 32-bit and 16-bit operations. + #define A_BSC_32 (127.0) + #define A_BSC_16 (127.0/512.0) +//============================================================================================================================== + #if 1 + AU1 ABsc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i+128.0),255u) )&(0x000000ffu));} + AU1 ABsc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i+128.0),255u)<< 8)&(0x0000ff00u));} + AU1 ABsc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i+128.0),255u)<<16)&(0x00ff0000u));} + AU1 ABsc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i+128.0),255u)<<24)&(0xff000000u));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABsc0ToZbU1(AU1 d,AF1 i){return ((d&0xffffff00u)|((min(AU1(trunc(i)+128.0),255u) )&(0x000000ffu)))^0x00000080u;} + AU1 ABsc1ToZbU1(AU1 d,AF1 i){return ((d&0xffff00ffu)|((min(AU1(trunc(i)+128.0),255u)<< 8)&(0x0000ff00u)))^0x00008000u;} + AU1 ABsc2ToZbU1(AU1 d,AF1 i){return ((d&0xff00ffffu)|((min(AU1(trunc(i)+128.0),255u)<<16)&(0x00ff0000u)))^0x00800000u;} + AU1 ABsc3ToZbU1(AU1 d,AF1 i){return ((d&0x00ffffffu)|((min(AU1(trunc(i)+128.0),255u)<<24)&(0xff000000u)))^0x80000000u;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ABsc0FromU1(AU1 i){return AF1((i )&255u)-128.0;} + AF1 ABsc1FromU1(AU1 i){return AF1((i>> 8)&255u)-128.0;} + AF1 ABsc2FromU1(AU1 i){return AF1((i>>16)&255u)-128.0;} + AF1 ABsc3FromU1(AU1 i){return AF1((i>>24)&255u)-128.0;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ABsc0FromZbU1(AU1 i){return AF1(((i )&255u)^0x80u)-128.0;} + AF1 ABsc1FromZbU1(AU1 i){return AF1(((i>> 8)&255u)^0x80u)-128.0;} + AF1 ABsc2FromZbU1(AU1 i){return AF1(((i>>16)&255u)^0x80u)-128.0;} + AF1 ABsc3FromZbU1(AU1 i){return AF1(((i>>24)&255u)^0x80u)-128.0;} + #endif +//============================================================================================================================== + #ifdef A_HALF + // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}. + AW2 ABsc01ToW2(AH2 x,AH2 y){x=x*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);y=y*AH2_(1.0/32768.0)+AH2_(0.25/32768.0); + return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));} +//------------------------------------------------------------------------------------------------------------------------------ + AU2 ABsc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); + return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));} + AU2 ABsc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); + return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));} + AU2 ABsc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); + return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));} + AU2 ABsc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); + return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU2 ABsc0ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u; + return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));} + AU2 ABsc1ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u; + return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));} + AU2 ABsc2ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u; + return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));} + AU2 ABsc3ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u; + return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH2 ABsc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0)-AH2_(0.25);} +//------------------------------------------------------------------------------------------------------------------------------ + AH2 ABsc0FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc1FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc2FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc3FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HALF APPROXIMATIONS +//------------------------------------------------------------------------------------------------------------------------------ +// These support only positive inputs. +// Did not see value yet in specialization for range. +// Using quick testing, ended up mostly getting the same "best" approximation for various ranges. +// With hardware that can co-execute transcendentals, the value in approximations could be less than expected. +// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total. +// And co-execution would require a compiler interleaving a lot of independent work for packed usage. +//------------------------------------------------------------------------------------------------------------------------------ +// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total). +// Same with sqrt(), as this could be x*rsq() (7 ops). +//============================================================================================================================== + #ifdef A_HALF + // Minimize squared error across full positive range, 2 ops. + // The 0x1de2 based approximation maps {0 to 1} input maps to < 1 output. + AH1 APrxLoSqrtH1(AH1 a){return AH1_AW1((AW1_AH1(a)>>AW1_(1))+AW1_(0x1de2));} + AH2 APrxLoSqrtH2(AH2 a){return AH2_AW2((AW2_AH2(a)>>AW2_(1))+AW2_(0x1de2));} + AH3 APrxLoSqrtH3(AH3 a){return AH3_AW3((AW3_AH3(a)>>AW3_(1))+AW3_(0x1de2));} + AH4 APrxLoSqrtH4(AH4 a){return AH4_AW4((AW4_AH4(a)>>AW4_(1))+AW4_(0x1de2));} +//------------------------------------------------------------------------------------------------------------------------------ + // Lower precision estimation, 1 op. + // Minimize squared error across {smallest normal to 16384.0}. + AH1 APrxLoRcpH1(AH1 a){return AH1_AW1(AW1_(0x7784)-AW1_AH1(a));} + AH2 APrxLoRcpH2(AH2 a){return AH2_AW2(AW2_(0x7784)-AW2_AH2(a));} + AH3 APrxLoRcpH3(AH3 a){return AH3_AW3(AW3_(0x7784)-AW3_AH3(a));} + AH4 APrxLoRcpH4(AH4 a){return AH4_AW4(AW4_(0x7784)-AW4_AH4(a));} +//------------------------------------------------------------------------------------------------------------------------------ + // Medium precision estimation, one Newton Raphson iteration, 3 ops. + AH1 APrxMedRcpH1(AH1 a){AH1 b=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));return b*(-b*a+AH1_(2.0));} + AH2 APrxMedRcpH2(AH2 a){AH2 b=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));return b*(-b*a+AH2_(2.0));} + AH3 APrxMedRcpH3(AH3 a){AH3 b=AH3_AW3(AW3_(0x778d)-AW3_AH3(a));return b*(-b*a+AH3_(2.0));} + AH4 APrxMedRcpH4(AH4 a){AH4 b=AH4_AW4(AW4_(0x778d)-AW4_AH4(a));return b*(-b*a+AH4_(2.0));} +//------------------------------------------------------------------------------------------------------------------------------ + // Minimize squared error across {smallest normal to 16384.0}, 2 ops. + AH1 APrxLoRsqH1(AH1 a){return AH1_AW1(AW1_(0x59a3)-(AW1_AH1(a)>>AW1_(1)));} + AH2 APrxLoRsqH2(AH2 a){return AH2_AW2(AW2_(0x59a3)-(AW2_AH2(a)>>AW2_(1)));} + AH3 APrxLoRsqH3(AH3 a){return AH3_AW3(AW3_(0x59a3)-(AW3_AH3(a)>>AW3_(1)));} + AH4 APrxLoRsqH4(AH4 a){return AH4_AW4(AW4_(0x59a3)-(AW4_AH4(a)>>AW4_(1)));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// FLOAT APPROXIMATIONS +//------------------------------------------------------------------------------------------------------------------------------ +// Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN", +// - Idea dates back to SGI, then to Quake 3, etc. +// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +// - sqrt(x)=rsqrt(x)*x +// - rcp(x)=rsqrt(x)*rsqrt(x) for positive x +// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +//------------------------------------------------------------------------------------------------------------------------------ +// These below are from perhaps less complete searching for optimal. +// Used FP16 normal range for testing with +4096 32-bit step size for sampling error. +// So these match up well with the half approximations. +//============================================================================================================================== + AF1 APrxLoSqrtF1(AF1 a){return AF1_AU1((AU1_AF1(a)>>AU1_(1))+AU1_(0x1fbc4639));} + AF1 APrxLoRcpF1(AF1 a){return AF1_AU1(AU1_(0x7ef07ebb)-AU1_AF1(a));} + AF1 APrxMedRcpF1(AF1 a){AF1 b=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));return b*(-b*a+AF1_(2.0));} + AF1 APrxLoRsqF1(AF1 a){return AF1_AU1(AU1_(0x5f347d74)-(AU1_AF1(a)>>AU1_(1)));} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 APrxLoSqrtF2(AF2 a){return AF2_AU2((AU2_AF2(a)>>AU2_(1))+AU2_(0x1fbc4639));} + AF2 APrxLoRcpF2(AF2 a){return AF2_AU2(AU2_(0x7ef07ebb)-AU2_AF2(a));} + AF2 APrxMedRcpF2(AF2 a){AF2 b=AF2_AU2(AU2_(0x7ef19fff)-AU2_AF2(a));return b*(-b*a+AF2_(2.0));} + AF2 APrxLoRsqF2(AF2 a){return AF2_AU2(AU2_(0x5f347d74)-(AU2_AF2(a)>>AU2_(1)));} +//------------------------------------------------------------------------------------------------------------------------------ + AF3 APrxLoSqrtF3(AF3 a){return AF3_AU3((AU3_AF3(a)>>AU3_(1))+AU3_(0x1fbc4639));} + AF3 APrxLoRcpF3(AF3 a){return AF3_AU3(AU3_(0x7ef07ebb)-AU3_AF3(a));} + AF3 APrxMedRcpF3(AF3 a){AF3 b=AF3_AU3(AU3_(0x7ef19fff)-AU3_AF3(a));return b*(-b*a+AF3_(2.0));} + AF3 APrxLoRsqF3(AF3 a){return AF3_AU3(AU3_(0x5f347d74)-(AU3_AF3(a)>>AU3_(1)));} +//------------------------------------------------------------------------------------------------------------------------------ + AF4 APrxLoSqrtF4(AF4 a){return AF4_AU4((AU4_AF4(a)>>AU4_(1))+AU4_(0x1fbc4639));} + AF4 APrxLoRcpF4(AF4 a){return AF4_AU4(AU4_(0x7ef07ebb)-AU4_AF4(a));} + AF4 APrxMedRcpF4(AF4 a){AF4 b=AF4_AU4(AU4_(0x7ef19fff)-AU4_AF4(a));return b*(-b*a+AF4_(2.0));} + AF4 APrxLoRsqF4(AF4 a){return AF4_AU4(AU4_(0x5f347d74)-(AU4_AF4(a)>>AU4_(1)));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PQ APPROXIMATIONS +//------------------------------------------------------------------------------------------------------------------------------ +// PQ is very close to x^(1/8). The functions below Use the fast float approximation method to do +// PQ<~>Gamma2 (4th power and fast 4th root) and PQ<~>Linear (8th power and fast 8th root). Maximum error is ~0.2%. +//============================================================================================================================== +// Helpers + AF1 Quart(AF1 a) { a = a * a; return a * a;} + AF1 Oct(AF1 a) { a = a * a; a = a * a; return a * a; } + AF2 Quart(AF2 a) { a = a * a; return a * a; } + AF2 Oct(AF2 a) { a = a * a; a = a * a; return a * a; } + AF3 Quart(AF3 a) { a = a * a; return a * a; } + AF3 Oct(AF3 a) { a = a * a; a = a * a; return a * a; } + AF4 Quart(AF4 a) { a = a * a; return a * a; } + AF4 Oct(AF4 a) { a = a * a; a = a * a; return a * a; } + //------------------------------------------------------------------------------------------------------------------------------ + AF1 APrxPQToGamma2(AF1 a) { return Quart(a); } + AF1 APrxPQToLinear(AF1 a) { return Oct(a); } + AF1 APrxLoGamma2ToPQ(AF1 a) { return AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); } + AF1 APrxMedGamma2ToPQ(AF1 a) { AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); AF1 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } + AF1 APrxHighGamma2ToPQ(AF1 a) { return sqrt(sqrt(a)); } + AF1 APrxLoLinearToPQ(AF1 a) { return AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); } + AF1 APrxMedLinearToPQ(AF1 a) { AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); AF1 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } + AF1 APrxHighLinearToPQ(AF1 a) { return sqrt(sqrt(sqrt(a))); } + //------------------------------------------------------------------------------------------------------------------------------ + AF2 APrxPQToGamma2(AF2 a) { return Quart(a); } + AF2 APrxPQToLinear(AF2 a) { return Oct(a); } + AF2 APrxLoGamma2ToPQ(AF2 a) { return AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); } + AF2 APrxMedGamma2ToPQ(AF2 a) { AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); AF2 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } + AF2 APrxHighGamma2ToPQ(AF2 a) { return sqrt(sqrt(a)); } + AF2 APrxLoLinearToPQ(AF2 a) { return AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); } + AF2 APrxMedLinearToPQ(AF2 a) { AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); AF2 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } + AF2 APrxHighLinearToPQ(AF2 a) { return sqrt(sqrt(sqrt(a))); } + //------------------------------------------------------------------------------------------------------------------------------ + AF3 APrxPQToGamma2(AF3 a) { return Quart(a); } + AF3 APrxPQToLinear(AF3 a) { return Oct(a); } + AF3 APrxLoGamma2ToPQ(AF3 a) { return AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); } + AF3 APrxMedGamma2ToPQ(AF3 a) { AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); AF3 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } + AF3 APrxHighGamma2ToPQ(AF3 a) { return sqrt(sqrt(a)); } + AF3 APrxLoLinearToPQ(AF3 a) { return AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); } + AF3 APrxMedLinearToPQ(AF3 a) { AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); AF3 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } + AF3 APrxHighLinearToPQ(AF3 a) { return sqrt(sqrt(sqrt(a))); } + //------------------------------------------------------------------------------------------------------------------------------ + AF4 APrxPQToGamma2(AF4 a) { return Quart(a); } + AF4 APrxPQToLinear(AF4 a) { return Oct(a); } + AF4 APrxLoGamma2ToPQ(AF4 a) { return AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); } + AF4 APrxMedGamma2ToPQ(AF4 a) { AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); AF4 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } + AF4 APrxHighGamma2ToPQ(AF4 a) { return sqrt(sqrt(a)); } + AF4 APrxLoLinearToPQ(AF4 a) { return AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); } + AF4 APrxMedLinearToPQ(AF4 a) { AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); AF4 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } + AF4 APrxHighLinearToPQ(AF4 a) { return sqrt(sqrt(sqrt(a))); } +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PARABOLIC SIN & COS +//------------------------------------------------------------------------------------------------------------------------------ +// Approximate answers to transcendental questions. +//------------------------------------------------------------------------------------------------------------------------------ +//============================================================================================================================== + #if 1 + // Valid input range is {-1 to 1} representing {0 to 2 pi}. + // Output range is {-1/4 to 1/4} representing {-1 to 1}. + AF1 APSinF1(AF1 x){return x*abs(x)-x;} // MAD. + AF2 APSinF2(AF2 x){return x*abs(x)-x;} + AF1 APCosF1(AF1 x){x=AFractF1(x*AF1_(0.5)+AF1_(0.75));x=x*AF1_(2.0)-AF1_(1.0);return APSinF1(x);} // 3x MAD, FRACT + AF2 APCosF2(AF2 x){x=AFractF2(x*AF2_(0.5)+AF2_(0.75));x=x*AF2_(2.0)-AF2_(1.0);return APSinF2(x);} + AF2 APSinCosF1(AF1 x){AF1 y=AFractF1(x*AF1_(0.5)+AF1_(0.75));y=y*AF1_(2.0)-AF1_(1.0);return APSinF2(AF2(x,y));} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + // For a packed {sin,cos} pair, + // - Native takes 16 clocks and 4 issue slots (no packed transcendentals). + // - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed). + AH1 APSinH1(AH1 x){return x*abs(x)-x;} + AH2 APSinH2(AH2 x){return x*abs(x)-x;} // AND,FMA + AH1 APCosH1(AH1 x){x=AFractH1(x*AH1_(0.5)+AH1_(0.75));x=x*AH1_(2.0)-AH1_(1.0);return APSinH1(x);} + AH2 APCosH2(AH2 x){x=AFractH2(x*AH2_(0.5)+AH2_(0.75));x=x*AH2_(2.0)-AH2_(1.0);return APSinH2(x);} // 3x FMA, 2xFRACT, AND + AH2 APSinCosH1(AH1 x){AH1 y=AFractH1(x*AH1_(0.5)+AH1_(0.75));y=y*AH1_(2.0)-AH1_(1.0);return APSinH2(AH2(x,y));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [ZOL] ZERO ONE LOGIC +//------------------------------------------------------------------------------------------------------------------------------ +// Conditional free logic designed for easy 16-bit packing, and backwards porting to 32-bit. +//------------------------------------------------------------------------------------------------------------------------------ +// 0 := false +// 1 := true +//------------------------------------------------------------------------------------------------------------------------------ +// AndNot(x,y) -> !(x&y) .... One op. +// AndOr(x,y,z) -> (x&y)|z ... One op. +// GtZero(x) -> x>0.0 ..... One op. +// Sel(x,y,z) -> x?y:z ..... Two ops, has no precision loss. +// Signed(x) -> x<0.0 ..... One op. +// ZeroPass(x,y) -> x?0:y ..... Two ops, 'y' is a pass through safe for aliasing as integer. +//------------------------------------------------------------------------------------------------------------------------------ +// OPTIMIZATION NOTES +// ================== +// - On Vega to use 2 constants in a packed op, pass in as one AW2 or one AH2 'k.xy' and use as 'k.xx' and 'k.yy'. +// For example 'a.xy*k.xx+k.yy'. +//============================================================================================================================== + #if 1 + AU1 AZolAndU1(AU1 x,AU1 y){return min(x,y);} + AU2 AZolAndU2(AU2 x,AU2 y){return min(x,y);} + AU3 AZolAndU3(AU3 x,AU3 y){return min(x,y);} + AU4 AZolAndU4(AU4 x,AU4 y){return min(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AZolNotU1(AU1 x){return x^AU1_(1);} + AU2 AZolNotU2(AU2 x){return x^AU2_(1);} + AU3 AZolNotU3(AU3 x){return x^AU3_(1);} + AU4 AZolNotU4(AU4 x){return x^AU4_(1);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AZolOrU1(AU1 x,AU1 y){return max(x,y);} + AU2 AZolOrU2(AU2 x,AU2 y){return max(x,y);} + AU3 AZolOrU3(AU3 x,AU3 y){return max(x,y);} + AU4 AZolOrU4(AU4 x,AU4 y){return max(x,y);} +//============================================================================================================================== + AU1 AZolF1ToU1(AF1 x){return AU1(x);} + AU2 AZolF2ToU2(AF2 x){return AU2(x);} + AU3 AZolF3ToU3(AF3 x){return AU3(x);} + AU4 AZolF4ToU4(AF4 x){return AU4(x);} +//------------------------------------------------------------------------------------------------------------------------------ + // 2 ops, denormals don't work in 32-bit on PC (and if they are enabled, OMOD is disabled). + AU1 AZolNotF1ToU1(AF1 x){return AU1(AF1_(1.0)-x);} + AU2 AZolNotF2ToU2(AF2 x){return AU2(AF2_(1.0)-x);} + AU3 AZolNotF3ToU3(AF3 x){return AU3(AF3_(1.0)-x);} + AU4 AZolNotF4ToU4(AF4 x){return AU4(AF4_(1.0)-x);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolU1ToF1(AU1 x){return AF1(x);} + AF2 AZolU2ToF2(AU2 x){return AF2(x);} + AF3 AZolU3ToF3(AU3 x){return AF3(x);} + AF4 AZolU4ToF4(AU4 x){return AF4(x);} +//============================================================================================================================== + AF1 AZolAndF1(AF1 x,AF1 y){return min(x,y);} + AF2 AZolAndF2(AF2 x,AF2 y){return min(x,y);} + AF3 AZolAndF3(AF3 x,AF3 y){return min(x,y);} + AF4 AZolAndF4(AF4 x,AF4 y){return min(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ASolAndNotF1(AF1 x,AF1 y){return (-x)*y+AF1_(1.0);} + AF2 ASolAndNotF2(AF2 x,AF2 y){return (-x)*y+AF2_(1.0);} + AF3 ASolAndNotF3(AF3 x,AF3 y){return (-x)*y+AF3_(1.0);} + AF4 ASolAndNotF4(AF4 x,AF4 y){return (-x)*y+AF4_(1.0);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolAndOrF1(AF1 x,AF1 y,AF1 z){return ASatF1(x*y+z);} + AF2 AZolAndOrF2(AF2 x,AF2 y,AF2 z){return ASatF2(x*y+z);} + AF3 AZolAndOrF3(AF3 x,AF3 y,AF3 z){return ASatF3(x*y+z);} + AF4 AZolAndOrF4(AF4 x,AF4 y,AF4 z){return ASatF4(x*y+z);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolGtZeroF1(AF1 x){return ASatF1(x*AF1_(A_INFP_F));} + AF2 AZolGtZeroF2(AF2 x){return ASatF2(x*AF2_(A_INFP_F));} + AF3 AZolGtZeroF3(AF3 x){return ASatF3(x*AF3_(A_INFP_F));} + AF4 AZolGtZeroF4(AF4 x){return ASatF4(x*AF4_(A_INFP_F));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolNotF1(AF1 x){return AF1_(1.0)-x;} + AF2 AZolNotF2(AF2 x){return AF2_(1.0)-x;} + AF3 AZolNotF3(AF3 x){return AF3_(1.0)-x;} + AF4 AZolNotF4(AF4 x){return AF4_(1.0)-x;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolOrF1(AF1 x,AF1 y){return max(x,y);} + AF2 AZolOrF2(AF2 x,AF2 y){return max(x,y);} + AF3 AZolOrF3(AF3 x,AF3 y){return max(x,y);} + AF4 AZolOrF4(AF4 x,AF4 y){return max(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolSelF1(AF1 x,AF1 y,AF1 z){AF1 r=(-x)*z+z;return x*y+r;} + AF2 AZolSelF2(AF2 x,AF2 y,AF2 z){AF2 r=(-x)*z+z;return x*y+r;} + AF3 AZolSelF3(AF3 x,AF3 y,AF3 z){AF3 r=(-x)*z+z;return x*y+r;} + AF4 AZolSelF4(AF4 x,AF4 y,AF4 z){AF4 r=(-x)*z+z;return x*y+r;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolSignedF1(AF1 x){return ASatF1(x*AF1_(A_INFN_F));} + AF2 AZolSignedF2(AF2 x){return ASatF2(x*AF2_(A_INFN_F));} + AF3 AZolSignedF3(AF3 x){return ASatF3(x*AF3_(A_INFN_F));} + AF4 AZolSignedF4(AF4 x){return ASatF4(x*AF4_(A_INFN_F));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolZeroPassF1(AF1 x,AF1 y){return AF1_AU1((AU1_AF1(x)!=AU1_(0))?AU1_(0):AU1_AF1(y));} + AF2 AZolZeroPassF2(AF2 x,AF2 y){return AF2_AU2((AU2_AF2(x)!=AU2_(0))?AU2_(0):AU2_AF2(y));} + AF3 AZolZeroPassF3(AF3 x,AF3 y){return AF3_AU3((AU3_AF3(x)!=AU3_(0))?AU3_(0):AU3_AF3(y));} + AF4 AZolZeroPassF4(AF4 x,AF4 y){return AF4_AU4((AU4_AF4(x)!=AU4_(0))?AU4_(0):AU4_AF4(y));} + #endif +//============================================================================================================================== + #ifdef A_HALF + AW1 AZolAndW1(AW1 x,AW1 y){return min(x,y);} + AW2 AZolAndW2(AW2 x,AW2 y){return min(x,y);} + AW3 AZolAndW3(AW3 x,AW3 y){return min(x,y);} + AW4 AZolAndW4(AW4 x,AW4 y){return min(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AZolNotW1(AW1 x){return x^AW1_(1);} + AW2 AZolNotW2(AW2 x){return x^AW2_(1);} + AW3 AZolNotW3(AW3 x){return x^AW3_(1);} + AW4 AZolNotW4(AW4 x){return x^AW4_(1);} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AZolOrW1(AW1 x,AW1 y){return max(x,y);} + AW2 AZolOrW2(AW2 x,AW2 y){return max(x,y);} + AW3 AZolOrW3(AW3 x,AW3 y){return max(x,y);} + AW4 AZolOrW4(AW4 x,AW4 y){return max(x,y);} +//============================================================================================================================== + // Uses denormal trick. + AW1 AZolH1ToW1(AH1 x){return AW1_AH1(x*AH1_AW1(AW1_(1)));} + AW2 AZolH2ToW2(AH2 x){return AW2_AH2(x*AH2_AW2(AW2_(1)));} + AW3 AZolH3ToW3(AH3 x){return AW3_AH3(x*AH3_AW3(AW3_(1)));} + AW4 AZolH4ToW4(AH4 x){return AW4_AH4(x*AH4_AW4(AW4_(1)));} +//------------------------------------------------------------------------------------------------------------------------------ + // AMD arch lacks a packed conversion opcode. + AH1 AZolW1ToH1(AW1 x){return AH1_AW1(x*AW1_AH1(AH1_(1.0)));} + AH2 AZolW2ToH2(AW2 x){return AH2_AW2(x*AW2_AH2(AH2_(1.0)));} + AH3 AZolW1ToH3(AW3 x){return AH3_AW3(x*AW3_AH3(AH3_(1.0)));} + AH4 AZolW2ToH4(AW4 x){return AH4_AW4(x*AW4_AH4(AH4_(1.0)));} +//============================================================================================================================== + AH1 AZolAndH1(AH1 x,AH1 y){return min(x,y);} + AH2 AZolAndH2(AH2 x,AH2 y){return min(x,y);} + AH3 AZolAndH3(AH3 x,AH3 y){return min(x,y);} + AH4 AZolAndH4(AH4 x,AH4 y){return min(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASolAndNotH1(AH1 x,AH1 y){return (-x)*y+AH1_(1.0);} + AH2 ASolAndNotH2(AH2 x,AH2 y){return (-x)*y+AH2_(1.0);} + AH3 ASolAndNotH3(AH3 x,AH3 y){return (-x)*y+AH3_(1.0);} + AH4 ASolAndNotH4(AH4 x,AH4 y){return (-x)*y+AH4_(1.0);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolAndOrH1(AH1 x,AH1 y,AH1 z){return ASatH1(x*y+z);} + AH2 AZolAndOrH2(AH2 x,AH2 y,AH2 z){return ASatH2(x*y+z);} + AH3 AZolAndOrH3(AH3 x,AH3 y,AH3 z){return ASatH3(x*y+z);} + AH4 AZolAndOrH4(AH4 x,AH4 y,AH4 z){return ASatH4(x*y+z);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolGtZeroH1(AH1 x){return ASatH1(x*AH1_(A_INFP_H));} + AH2 AZolGtZeroH2(AH2 x){return ASatH2(x*AH2_(A_INFP_H));} + AH3 AZolGtZeroH3(AH3 x){return ASatH3(x*AH3_(A_INFP_H));} + AH4 AZolGtZeroH4(AH4 x){return ASatH4(x*AH4_(A_INFP_H));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolNotH1(AH1 x){return AH1_(1.0)-x;} + AH2 AZolNotH2(AH2 x){return AH2_(1.0)-x;} + AH3 AZolNotH3(AH3 x){return AH3_(1.0)-x;} + AH4 AZolNotH4(AH4 x){return AH4_(1.0)-x;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolOrH1(AH1 x,AH1 y){return max(x,y);} + AH2 AZolOrH2(AH2 x,AH2 y){return max(x,y);} + AH3 AZolOrH3(AH3 x,AH3 y){return max(x,y);} + AH4 AZolOrH4(AH4 x,AH4 y){return max(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolSelH1(AH1 x,AH1 y,AH1 z){AH1 r=(-x)*z+z;return x*y+r;} + AH2 AZolSelH2(AH2 x,AH2 y,AH2 z){AH2 r=(-x)*z+z;return x*y+r;} + AH3 AZolSelH3(AH3 x,AH3 y,AH3 z){AH3 r=(-x)*z+z;return x*y+r;} + AH4 AZolSelH4(AH4 x,AH4 y,AH4 z){AH4 r=(-x)*z+z;return x*y+r;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolSignedH1(AH1 x){return ASatH1(x*AH1_(A_INFN_H));} + AH2 AZolSignedH2(AH2 x){return ASatH2(x*AH2_(A_INFN_H));} + AH3 AZolSignedH3(AH3 x){return ASatH3(x*AH3_(A_INFN_H));} + AH4 AZolSignedH4(AH4 x){return ASatH4(x*AH4_(A_INFN_H));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// COLOR CONVERSIONS +//------------------------------------------------------------------------------------------------------------------------------ +// These are all linear to/from some other space (where 'linear' has been shortened out of the function name). +// So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'. +// These are branch free implementations. +// The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion. +//------------------------------------------------------------------------------------------------------------------------------ +// TRANSFER FUNCTIONS +// ================== +// 709 ..... Rec709 used for some HDTVs +// Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native +// Pq ...... PQ native for HDR10 +// Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type +// Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations) +// Three ... Gamma 3.0, less fast, but good for HDR. +//------------------------------------------------------------------------------------------------------------------------------ +// KEEPING TO SPEC +// =============== +// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times. +// (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range). +// (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range). +// Also there is a slight step in the transition regions. +// Precision of the coefficients in the spec being the likely cause. +// Main usage case of the sRGB code is to do the linear->sRGB converstion in a compute shader before store. +// This is to work around lack of hardware (typically only ROP does the conversion for free). +// To "correct" the linear segment, would be to introduce error, because hardware decode of sRGB->linear is fixed (and free). +// So this header keeps with the spec. +// For linear->sRGB transforms, the linear segment in some respects reduces error, because rounding in that region is linear. +// Rounding in the curved region in hardware (and fast software code) introduces error due to rounding in non-linear. +//------------------------------------------------------------------------------------------------------------------------------ +// FOR PQ +// ====== +// Both input and output is {0.0-1.0}, and where output 1.0 represents 10000.0 cd/m^2. +// All constants are only specified to FP32 precision. +// External PQ source reference, +// - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl +//------------------------------------------------------------------------------------------------------------------------------ +// PACKED VERSIONS +// =============== +// These are the A*H2() functions. +// There is no PQ functions as FP16 seemed to not have enough precision for the conversion. +// The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors. +// Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least). +//------------------------------------------------------------------------------------------------------------------------------ +// NOTES +// ===== +// Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case. +//============================================================================================================================== + #if 1 + AF1 ATo709F1(AF1 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099); + return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );} + AF2 ATo709F2(AF2 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099); + return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );} + AF3 ATo709F3(AF3 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099); + return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);} +//------------------------------------------------------------------------------------------------------------------------------ + // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma(). + AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,AF1_(rcpX));} + AF2 AToGammaF2(AF2 c,AF1 rcpX){return pow(c,AF2_(rcpX));} + AF3 AToGammaF3(AF3 c,AF1 rcpX){return pow(c,AF3_(rcpX));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302)); + return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));} + AF2 AToPqF1(AF2 x){AF2 p=pow(x,AF2_(0.159302)); + return pow((AF2_(0.835938)+AF2_(18.8516)*p)/(AF2_(1.0)+AF2_(18.6875)*p),AF2_(78.8438));} + AF3 AToPqF1(AF3 x){AF3 p=pow(x,AF3_(0.159302)); + return pow((AF3_(0.835938)+AF3_(18.8516)*p)/(AF3_(1.0)+AF3_(18.6875)*p),AF3_(78.8438));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToSrgbF1(AF1 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055); + return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );} + AF2 AToSrgbF2(AF2 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055); + return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );} + AF3 AToSrgbF3(AF3 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055); + return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToTwoF1(AF1 c){return sqrt(c);} + AF2 AToTwoF2(AF2 c){return sqrt(c);} + AF3 AToTwoF3(AF3 c){return sqrt(c);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToThreeF1(AF1 c){return pow(c,AF1_(1.0/3.0));} + AF2 AToThreeF2(AF2 c){return pow(c,AF2_(1.0/3.0));} + AF3 AToThreeF3(AF3 c){return pow(c,AF3_(1.0/3.0));} + #endif +//============================================================================================================================== + #if 1 + // Unfortunately median won't work here. + AF1 AFrom709F1(AF1 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099); + return AZolSelF1(AZolSignedF1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));} + AF2 AFrom709F2(AF2 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099); + return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} + AF3 AFrom709F3(AF3 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099); + return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,AF1_(x));} + AF2 AFromGammaF2(AF2 c,AF1 x){return pow(c,AF2_(x));} + AF3 AFromGammaF3(AF3 c,AF1 x){return pow(c,AF3_(x));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833)); + return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));} + AF2 AFromPqF1(AF2 x){AF2 p=pow(x,AF2_(0.0126833)); + return pow(ASatF2(p-AF2_(0.835938))/(AF2_(18.8516)-AF2_(18.6875)*p),AF2_(6.27739));} + AF3 AFromPqF1(AF3 x){AF3 p=pow(x,AF3_(0.0126833)); + return pow(ASatF3(p-AF3_(0.835938))/(AF3_(18.8516)-AF3_(18.6875)*p),AF3_(6.27739));} +//------------------------------------------------------------------------------------------------------------------------------ + // Unfortunately median won't work here. + AF1 AFromSrgbF1(AF1 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055); + return AZolSelF1(AZolSignedF1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));} + AF2 AFromSrgbF2(AF2 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055); + return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} + AF3 AFromSrgbF3(AF3 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055); + return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromTwoF1(AF1 c){return c*c;} + AF2 AFromTwoF2(AF2 c){return c*c;} + AF3 AFromTwoF3(AF3 c){return c*c;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromThreeF1(AF1 c){return c*c*c;} + AF2 AFromThreeF2(AF2 c){return c*c*c;} + AF3 AFromThreeF3(AF3 c){return c*c*c;} + #endif +//============================================================================================================================== + #ifdef A_HALF + AH1 ATo709H1(AH1 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099); + return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );} + AH2 ATo709H2(AH2 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099); + return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );} + AH3 ATo709H3(AH3 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099); + return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AToGammaH1(AH1 c,AH1 rcpX){return pow(c,AH1_(rcpX));} + AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));} + AH3 AToGammaH3(AH3 c,AH1 rcpX){return pow(c,AH3_(rcpX));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AToSrgbH1(AH1 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055); + return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );} + AH2 AToSrgbH2(AH2 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055); + return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );} + AH3 AToSrgbH3(AH3 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055); + return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AToTwoH1(AH1 c){return sqrt(c);} + AH2 AToTwoH2(AH2 c){return sqrt(c);} + AH3 AToTwoH3(AH3 c){return sqrt(c);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AToThreeF1(AH1 c){return pow(c,AH1_(1.0/3.0));} + AH2 AToThreeF2(AH2 c){return pow(c,AH2_(1.0/3.0));} + AH3 AToThreeF3(AH3 c){return pow(c,AH3_(1.0/3.0));} + #endif +//============================================================================================================================== + #ifdef A_HALF + AH1 AFrom709H1(AH1 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099); + return AZolSelH1(AZolSignedH1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));} + AH2 AFrom709H2(AH2 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099); + return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} + AH3 AFrom709H3(AH3 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099); + return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFromGammaH1(AH1 c,AH1 x){return pow(c,AH1_(x));} + AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));} + AH3 AFromGammaH3(AH3 c,AH1 x){return pow(c,AH3_(x));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AHromSrgbF1(AH1 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); + return AZolSelH1(AZolSignedH1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));} + AH2 AHromSrgbF2(AH2 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); + return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} + AH3 AHromSrgbF3(AH3 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); + return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFromTwoH1(AH1 c){return c*c;} + AH2 AFromTwoH2(AH2 c){return c*c;} + AH3 AFromTwoH3(AH3 c){return c*c;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFromThreeH1(AH1 c){return c*c*c;} + AH2 AFromThreeH2(AH2 c){return c*c*c;} + AH3 AFromThreeH3(AH3 c){return c*c*c;} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CS REMAP +//============================================================================================================================== + // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear. + // 543210 + // ====== + // ..xxx. + // yy...y + AU2 ARmp8x8(AU1 a){return AU2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));} +//============================================================================================================================== + // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions. + // 543210 + // ====== + // .xx..x + // y..yy. + // Details, + // LANE TO 8x8 MAPPING + // =================== + // 00 01 08 09 10 11 18 19 + // 02 03 0a 0b 12 13 1a 1b + // 04 05 0c 0d 14 15 1c 1d + // 06 07 0e 0f 16 17 1e 1f + // 20 21 28 29 30 31 38 39 + // 22 23 2a 2b 32 33 3a 3b + // 24 25 2c 2d 34 35 3c 3d + // 26 27 2e 2f 36 37 3e 3f + AU2 ARmpRed8x8(AU1 a){return AU2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));} +//============================================================================================================================== + #ifdef A_HALF + AW2 ARmp8x8H(AU1 a){return AW2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));} + AW2 ARmpRed8x8H(AU1 a){return AW2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));} + #endif +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// REFERENCE +// +//------------------------------------------------------------------------------------------------------------------------------ +// IEEE FLOAT RULES +// ================ +// - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1 +// - {+/-}0 * {+/-}INF = NaN +// - -INF + (+INF) = NaN +// - {+/-}0 / {+/-}0 = NaN +// - {+/-}INF / {+/-}INF = NaN +// - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN) +// - 0 == -0 +// - 4/0 = +INF +// - 4/-0 = -INF +// - 4+INF = +INF +// - 4-INF = -INF +// - 4*(+INF) = +INF +// - 4*(-INF) = -INF +// - -4*(+INF) = -INF +// - sqrt(+INF) = +INF +//------------------------------------------------------------------------------------------------------------------------------ +// FP16 ENCODING +// ============= +// fedcba9876543210 +// ---------------- +// ......mmmmmmmmmm 10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals) +// .eeeee.......... 5-bit exponent +// .00000.......... denormals +// .00001.......... -14 exponent +// .11110.......... 15 exponent +// .111110000000000 infinity +// .11111nnnnnnnnnn NaN with n!=0 +// s............... sign +//------------------------------------------------------------------------------------------------------------------------------ +// FP16/INT16 ALIASING DENORMAL +// ============================ +// 11-bit unsigned integers alias with half float denormal/normal values, +// 1 = 2^(-24) = 1/16777216 ....................... first denormal value +// 2 = 2^(-23) +// ... +// 1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value +// 1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers +// 2047 .............................................. last normal value that still maps to integers +// Scaling limits, +// 2^15 = 32768 ...................................... largest power of 2 scaling +// Largest pow2 conversion mapping is at *32768, +// 1 : 2^(-9) = 1/512 +// 2 : 1/256 +// 4 : 1/128 +// 8 : 1/64 +// 16 : 1/32 +// 32 : 1/16 +// 64 : 1/8 +// 128 : 1/4 +// 256 : 1/2 +// 512 : 1 +// 1024 : 2 +// 2047 : a little less than 4 +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU/CPU PORTABILITY +// +// +//------------------------------------------------------------------------------------------------------------------------------ +// This is the GPU implementation. +// See the CPU implementation for docs. +//============================================================================================================================== +#ifdef A_GPU + #define A_TRUE true + #define A_FALSE false + #define A_STATIC +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY +//============================================================================================================================== + #define retAD2 AD2 + #define retAD3 AD3 + #define retAD4 AD4 + #define retAF2 AF2 + #define retAF3 AF3 + #define retAF4 AF4 + #define retAL2 AL2 + #define retAL3 AL3 + #define retAL4 AL4 + #define retAU2 AU2 + #define retAU3 AU3 + #define retAU4 AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define inAD2 in AD2 + #define inAD3 in AD3 + #define inAD4 in AD4 + #define inAF2 in AF2 + #define inAF3 in AF3 + #define inAF4 in AF4 + #define inAL2 in AL2 + #define inAL3 in AL3 + #define inAL4 in AL4 + #define inAU2 in AU2 + #define inAU3 in AU3 + #define inAU4 in AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define inoutAD2 inout AD2 + #define inoutAD3 inout AD3 + #define inoutAD4 inout AD4 + #define inoutAF2 inout AF2 + #define inoutAF3 inout AF3 + #define inoutAF4 inout AF4 + #define inoutAL2 inout AL2 + #define inoutAL3 inout AL3 + #define inoutAL4 inout AL4 + #define inoutAU2 inout AU2 + #define inoutAU3 inout AU3 + #define inoutAU4 inout AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define outAD2 out AD2 + #define outAD3 out AD3 + #define outAD4 out AD4 + #define outAF2 out AF2 + #define outAF3 out AF3 + #define outAF4 out AF4 + #define outAL2 out AL2 + #define outAL3 out AL3 + #define outAL4 out AL4 + #define outAU2 out AU2 + #define outAU3 out AU3 + #define outAU4 out AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define varAD2(x) AD2 x + #define varAD3(x) AD3 x + #define varAD4(x) AD4 x + #define varAF2(x) AF2 x + #define varAF3(x) AF3 x + #define varAF4(x) AF4 x + #define varAL2(x) AL2 x + #define varAL3(x) AL3 x + #define varAL4(x) AL4 x + #define varAU2(x) AU2 x + #define varAU3(x) AU3 x + #define varAU4(x) AU4 x +//------------------------------------------------------------------------------------------------------------------------------ + #define initAD2(x,y) AD2(x,y) + #define initAD3(x,y,z) AD3(x,y,z) + #define initAD4(x,y,z,w) AD4(x,y,z,w) + #define initAF2(x,y) AF2(x,y) + #define initAF3(x,y,z) AF3(x,y,z) + #define initAF4(x,y,z,w) AF4(x,y,z,w) + #define initAL2(x,y) AL2(x,y) + #define initAL3(x,y,z) AL3(x,y,z) + #define initAL4(x,y,z,w) AL4(x,y,z,w) + #define initAU2(x,y) AU2(x,y) + #define initAU3(x,y,z) AU3(x,y,z) + #define initAU4(x,y,z,w) AU4(x,y,z,w) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS +//============================================================================================================================== + #define AAbsD1(a) abs(AD1(a)) + #define AAbsF1(a) abs(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ACosD1(a) cos(AD1(a)) + #define ACosF1(a) cos(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ADotD2(a,b) dot(AD2(a),AD2(b)) + #define ADotD3(a,b) dot(AD3(a),AD3(b)) + #define ADotD4(a,b) dot(AD4(a),AD4(b)) + #define ADotF2(a,b) dot(AF2(a),AF2(b)) + #define ADotF3(a,b) dot(AF3(a),AF3(b)) + #define ADotF4(a,b) dot(AF4(a),AF4(b)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AExp2D1(a) exp2(AD1(a)) + #define AExp2F1(a) exp2(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AFloorD1(a) floor(AD1(a)) + #define AFloorF1(a) floor(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ALog2D1(a) log2(AD1(a)) + #define ALog2F1(a) log2(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AMaxD1(a,b) max(a,b) + #define AMaxF1(a,b) max(a,b) + #define AMaxL1(a,b) max(a,b) + #define AMaxU1(a,b) max(a,b) +//------------------------------------------------------------------------------------------------------------------------------ + #define AMinD1(a,b) min(a,b) + #define AMinF1(a,b) min(a,b) + #define AMinL1(a,b) min(a,b) + #define AMinU1(a,b) min(a,b) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASinD1(a) sin(AD1(a)) + #define ASinF1(a) sin(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASqrtD1(a) sqrt(AD1(a)) + #define ASqrtF1(a) sqrt(AF1(a)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS - DEPENDENT +//============================================================================================================================== + #define APowD1(a,b) pow(AD1(a),AF1(b)) + #define APowF1(a,b) pow(AF1(a),AF1(b)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR OPS +//------------------------------------------------------------------------------------------------------------------------------ +// These are added as needed for production or prototyping, so not necessarily a complete set. +// They follow a convention of taking in a destination and also returning the destination value to increase utility. +//============================================================================================================================== + #ifdef A_DUBL + AD2 opAAbsD2(outAD2 d,inAD2 a){d=abs(a);return d;} + AD3 opAAbsD3(outAD3 d,inAD3 a){d=abs(a);return d;} + AD4 opAAbsD4(outAD4 d,inAD4 a){d=abs(a);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d=a+b;return d;} + AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d=a+b;return d;} + AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d=a+AD2_(b);return d;} + AD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d=a+AD3_(b);return d;} + AD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d=a+AD4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opACpyD2(outAD2 d,inAD2 a){d=a;return d;} + AD3 opACpyD3(outAD3 d,inAD3 a){d=a;return d;} + AD4 opACpyD4(outAD4 d,inAD4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d=ALerpD2(a,b,c);return d;} + AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d=ALerpD3(a,b,c);return d;} + AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d=ALerpD4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d=ALerpD2(a,b,AD2_(c));return d;} + AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d=ALerpD3(a,b,AD3_(c));return d;} + AD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d=ALerpD4(a,b,AD4_(c));return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d=max(a,b);return d;} + AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d=max(a,b);return d;} + AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d=min(a,b);return d;} + AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d=min(a,b);return d;} + AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d=min(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d=a*b;return d;} + AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d=a*b;return d;} + AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d=a*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d=a*AD2_(b);return d;} + AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d=a*AD3_(b);return d;} + AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d=a*AD4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opANegD2(outAD2 d,inAD2 a){d=-a;return d;} + AD3 opANegD3(outAD3 d,inAD3 a){d=-a;return d;} + AD4 opANegD4(outAD4 d,inAD4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opARcpD2(outAD2 d,inAD2 a){d=ARcpD2(a);return d;} + AD3 opARcpD3(outAD3 d,inAD3 a){d=ARcpD3(a);return d;} + AD4 opARcpD4(outAD4 d,inAD4 a){d=ARcpD4(a);return d;} + #endif +//============================================================================================================================== + AF2 opAAbsF2(outAF2 d,inAF2 a){d=abs(a);return d;} + AF3 opAAbsF3(outAF3 d,inAF3 a){d=abs(a);return d;} + AF4 opAAbsF4(outAF4 d,inAF4 a){d=abs(a);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d=a+b;return d;} + AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d=a+b;return d;} + AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d=a+AF2_(b);return d;} + AF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d=a+AF3_(b);return d;} + AF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d=a+AF4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opACpyF2(outAF2 d,inAF2 a){d=a;return d;} + AF3 opACpyF3(outAF3 d,inAF3 a){d=a;return d;} + AF4 opACpyF4(outAF4 d,inAF4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d=ALerpF2(a,b,c);return d;} + AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d=ALerpF3(a,b,c);return d;} + AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d=ALerpF4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d=ALerpF2(a,b,AF2_(c));return d;} + AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d=ALerpF3(a,b,AF3_(c));return d;} + AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d=ALerpF4(a,b,AF4_(c));return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d=max(a,b);return d;} + AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d=max(a,b);return d;} + AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d=min(a,b);return d;} + AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d=min(a,b);return d;} + AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d=min(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d=a*b;return d;} + AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d=a*b;return d;} + AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d=a*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d=a*AF2_(b);return d;} + AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d=a*AF3_(b);return d;} + AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d=a*AF4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opANegF2(outAF2 d,inAF2 a){d=-a;return d;} + AF3 opANegF3(outAF3 d,inAF3 a){d=-a;return d;} + AF4 opANegF4(outAF4 d,inAF4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opARcpF2(outAF2 d,inAF2 a){d=ARcpF2(a);return d;} + AF3 opARcpF3(outAF3 d,inAF3 a){d=ARcpF3(a);return d;} + AF4 opARcpF4(outAF4 d,inAF4 a){d=ARcpF4(a);return d;} +#endif diff --git a/externals/FidelityFX-FSR/ffx-fsr/ffx_fsr1.h b/externals/FidelityFX-FSR/ffx-fsr/ffx_fsr1.h new file mode 100644 index 000000000..15ecfde5c --- /dev/null +++ b/externals/FidelityFX-FSR/ffx-fsr/ffx_fsr1.h @@ -0,0 +1,1199 @@ +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// AMD FidelityFX SUPER RESOLUTION [FSR 1] ::: SPATIAL SCALING & EXTRAS - v1.20210629 +// +// +//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// FSR is a collection of algorithms relating to generating a higher resolution image. +// This specific header focuses on single-image non-temporal image scaling, and related tools. +// +// The core functions are EASU and RCAS: +// [EASU] Edge Adaptive Spatial Upsampling ....... 1x to 4x area range spatial scaling, clamped adaptive elliptical filter. +// [RCAS] Robust Contrast Adaptive Sharpening .... A non-scaling variation on CAS. +// RCAS needs to be applied after EASU as a separate pass. +// +// Optional utility functions are: +// [LFGA] Linear Film Grain Applicator ........... Tool to apply film grain after scaling. +// [SRTM] Simple Reversible Tone-Mapper .......... Linear HDR {0 to FP16_MAX} to {0 to 1} and back. +// [TEPD] Temporal Energy Preserving Dither ...... Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion. +// See each individual sub-section for inline documentation. +//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// FUNCTION PERMUTATIONS +// ===================== +// *F() ..... Single item computation with 32-bit. +// *H() ..... Single item computation with 16-bit, with packing (aka two 16-bit ops in parallel) when possible. +// *Hx2() ... Processing two items in parallel with 16-bit, easier packing. +// Not all interfaces in this file have a *Hx2() form. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [EASU] EDGE ADAPTIVE SPATIAL UPSAMPLING +// +//------------------------------------------------------------------------------------------------------------------------------ +// EASU provides a high quality spatial-only scaling at relatively low cost. +// Meaning EASU is appropiate for laptops and other low-end GPUs. +// Quality from 1x to 4x area scaling is good. +//------------------------------------------------------------------------------------------------------------------------------ +// The scalar uses a modified fast approximation to the standard lanczos(size=2) kernel. +// EASU runs in a single pass, so it applies a directionally and anisotropically adaptive radial lanczos. +// This is also kept as simple as possible to have minimum runtime. +//------------------------------------------------------------------------------------------------------------------------------ +// The lanzcos filter has negative lobes, so by itself it will introduce ringing. +// To remove all ringing, the algorithm uses the nearest 2x2 input texels as a neighborhood, +// and limits output to the minimum and maximum of that neighborhood. +//------------------------------------------------------------------------------------------------------------------------------ +// Input image requirements: +// +// Color needs to be encoded as 3 channel[red, green, blue](e.g.XYZ not supported) +// Each channel needs to be in the range[0, 1] +// Any color primaries are supported +// Display / tonemapping curve needs to be as if presenting to sRGB display or similar(e.g.Gamma 2.0) +// There should be no banding in the input +// There should be no high amplitude noise in the input +// There should be no noise in the input that is not at input pixel granularity +// For performance purposes, use 32bpp formats +//------------------------------------------------------------------------------------------------------------------------------ +// Best to apply EASU at the end of the frame after tonemapping +// but before film grain or composite of the UI. +//------------------------------------------------------------------------------------------------------------------------------ +// Example of including this header for D3D HLSL : +// +// #define A_GPU 1 +// #define A_HLSL 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of including this header for Vulkan GLSL : +// +// #define A_GPU 1 +// #define A_GLSL 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of including this header for Vulkan HLSL : +// +// #define A_GPU 1 +// #define A_HLSL 1 +// #define A_HLSL_6_2 1 +// #define A_NO_16_BIT_CAST 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of declaring the required input callbacks for GLSL : +// The callbacks need to gather4 for each color channel using the specified texture coordinate 'p'. +// EASU uses gather4 to reduce position computation logic and for free Arrays of Structures to Structures of Arrays conversion. +// +// AH4 FsrEasuRH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,0));} +// AH4 FsrEasuGH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,1));} +// AH4 FsrEasuBH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,2));} +// ... +// The FsrEasuCon function needs to be called from the CPU or GPU to set up constants. +// The difference in viewport and input image size is there to support Dynamic Resolution Scaling. +// To use FsrEasuCon() on the CPU, define A_CPU before including ffx_a and ffx_fsr1. +// Including a GPU example here, the 'con0' through 'con3' values would be stored out to a constant buffer. +// AU4 con0,con1,con2,con3; +// FsrEasuCon(con0,con1,con2,con3, +// 1920.0,1080.0, // Viewport size (top left aligned) in the input image which is to be scaled. +// 3840.0,2160.0, // The size of the input image. +// 2560.0,1440.0); // The output resolution. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CONSTANT SETUP +//============================================================================================================================== +// Call to setup required constant values (works on CPU or GPU). +A_STATIC void FsrEasuCon( +outAU4 con0, +outAU4 con1, +outAU4 con2, +outAU4 con3, +// This the rendered image resolution being upscaled +AF1 inputViewportInPixelsX, +AF1 inputViewportInPixelsY, +// This is the resolution of the resource containing the input image (useful for dynamic resolution) +AF1 inputSizeInPixelsX, +AF1 inputSizeInPixelsY, +// This is the display resolution which the input image gets upscaled to +AF1 outputSizeInPixelsX, +AF1 outputSizeInPixelsY){ + // Output integer position to a pixel position in viewport. + con0[0]=AU1_AF1(inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)); + con0[1]=AU1_AF1(inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)); + con0[2]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)-AF1_(0.5)); + con0[3]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)-AF1_(0.5)); + // Viewport pixel position to normalized image space. + // This is used to get upper-left of 'F' tap. + con1[0]=AU1_AF1(ARcpF1(inputSizeInPixelsX)); + con1[1]=AU1_AF1(ARcpF1(inputSizeInPixelsY)); + // Centers of gather4, first offset from upper-left of 'F'. + // +---+---+ + // | | | + // +--(0)--+ + // | b | c | + // +---F---+---+---+ + // | e | f | g | h | + // +--(1)--+--(2)--+ + // | i | j | k | l | + // +---+---+---+---+ + // | n | o | + // +--(3)--+ + // | | | + // +---+---+ + con1[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX)); + con1[3]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsY)); + // These are from (0) instead of 'F'. + con2[0]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsX)); + con2[1]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY)); + con2[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX)); + con2[3]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY)); + con3[0]=AU1_AF1(AF1_( 0.0)*ARcpF1(inputSizeInPixelsX)); + con3[1]=AU1_AF1(AF1_( 4.0)*ARcpF1(inputSizeInPixelsY)); + con3[2]=con3[3]=0;} + +//If the an offset into the input image resource +A_STATIC void FsrEasuConOffset( + outAU4 con0, + outAU4 con1, + outAU4 con2, + outAU4 con3, + // This the rendered image resolution being upscaled + AF1 inputViewportInPixelsX, + AF1 inputViewportInPixelsY, + // This is the resolution of the resource containing the input image (useful for dynamic resolution) + AF1 inputSizeInPixelsX, + AF1 inputSizeInPixelsY, + // This is the display resolution which the input image gets upscaled to + AF1 outputSizeInPixelsX, + AF1 outputSizeInPixelsY, + // This is the input image offset into the resource containing it (useful for dynamic resolution) + AF1 inputOffsetInPixelsX, + AF1 inputOffsetInPixelsY) { + FsrEasuCon(con0, con1, con2, con3, inputViewportInPixelsX, inputViewportInPixelsY, inputSizeInPixelsX, inputSizeInPixelsY, outputSizeInPixelsX, outputSizeInPixelsY); + con0[2] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsX * ARcpF1(outputSizeInPixelsX) - AF1_(0.5) + inputOffsetInPixelsX); + con0[3] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsY * ARcpF1(outputSizeInPixelsY) - AF1_(0.5) + inputOffsetInPixelsY); +} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 32-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(FSR_EASU_F) + // Input callback prototypes, need to be implemented by calling shader + AF4 FsrEasuRF(AF2 p); + AF4 FsrEasuGF(AF2 p); + AF4 FsrEasuBF(AF2 p); +//------------------------------------------------------------------------------------------------------------------------------ + // Filtering for a given tap for the scalar. + void FsrEasuTapF( + inout AF3 aC, // Accumulated color, with negative lobe. + inout AF1 aW, // Accumulated weight. + AF2 off, // Pixel offset from resolve position to tap. + AF2 dir, // Gradient direction. + AF2 len, // Length. + AF1 lob, // Negative lobe strength. + AF1 clp, // Clipping point. + AF3 c){ // Tap color. + // Rotate offset by direction. + AF2 v; + v.x=(off.x*( dir.x))+(off.y*dir.y); + v.y=(off.x*(-dir.y))+(off.y*dir.x); + // Anisotropy. + v*=len; + // Compute distance^2. + AF1 d2=v.x*v.x+v.y*v.y; + // Limit to the window as at corner, 2 taps can easily be outside. + d2=min(d2,clp); + // Approximation of lancos2 without sin() or rcp(), or sqrt() to get x. + // (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2 + // |_______________________________________| |_______________| + // base window + // The general form of the 'base' is, + // (a*(b*x^2-1)^2-(a-1)) + // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe. + AF1 wB=AF1_(2.0/5.0)*d2+AF1_(-1.0); + AF1 wA=lob*d2+AF1_(-1.0); + wB*=wB; + wA*=wA; + wB=AF1_(25.0/16.0)*wB+AF1_(-(25.0/16.0-1.0)); + AF1 w=wB*wA; + // Do weighted average. + aC+=c*w;aW+=w;} +//------------------------------------------------------------------------------------------------------------------------------ + // Accumulate direction and length. + void FsrEasuSetF( + inout AF2 dir, + inout AF1 len, + AF2 pp, + AP1 biS,AP1 biT,AP1 biU,AP1 biV, + AF1 lA,AF1 lB,AF1 lC,AF1 lD,AF1 lE){ + // Compute bilinear weight, branches factor out as predicates are compiler time immediates. + // s t + // u v + AF1 w = AF1_(0.0); + if(biS)w=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y); + if(biT)w= pp.x *(AF1_(1.0)-pp.y); + if(biU)w=(AF1_(1.0)-pp.x)* pp.y ; + if(biV)w= pp.x * pp.y ; + // Direction is the '+' diff. + // a + // b c d + // e + // Then takes magnitude from abs average of both sides of 'c'. + // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms. + AF1 dc=lD-lC; + AF1 cb=lC-lB; + AF1 lenX=max(abs(dc),abs(cb)); + lenX=APrxLoRcpF1(lenX); + AF1 dirX=lD-lB; + dir.x+=dirX*w; + lenX=ASatF1(abs(dirX)*lenX); + lenX*=lenX; + len+=lenX*w; + // Repeat for the y axis. + AF1 ec=lE-lC; + AF1 ca=lC-lA; + AF1 lenY=max(abs(ec),abs(ca)); + lenY=APrxLoRcpF1(lenY); + AF1 dirY=lE-lA; + dir.y+=dirY*w; + lenY=ASatF1(abs(dirY)*lenY); + lenY*=lenY; + len+=lenY*w;} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrEasuF( + out AF3 pix, + AU2 ip, // Integer pixel position in output. + AU4 con0, // Constants generated by FsrEasuCon(). + AU4 con1, + AU4 con2, + AU4 con3){ +//------------------------------------------------------------------------------------------------------------------------------ + // Get position of 'f'. + AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw); + AF2 fp=floor(pp); + pp-=fp; +//------------------------------------------------------------------------------------------------------------------------------ + // 12-tap kernel. + // b c + // e f g h + // i j k l + // n o + // Gather 4 ordering. + // a b + // r g + // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions, + // a b <- unused (z) + // r g + // a b a b + // r g r g + // a b + // r g <- unused (z) + // Allowing dead-code removal to remove the 'z's. + AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw); + // These are from p0 to avoid pulling two constants on pre-Navi hardware. + AF2 p1=p0+AF2_AU2(con2.xy); + AF2 p2=p0+AF2_AU2(con2.zw); + AF2 p3=p0+AF2_AU2(con3.xy); + AF4 bczzR=FsrEasuRF(p0); + AF4 bczzG=FsrEasuGF(p0); + AF4 bczzB=FsrEasuBF(p0); + AF4 ijfeR=FsrEasuRF(p1); + AF4 ijfeG=FsrEasuGF(p1); + AF4 ijfeB=FsrEasuBF(p1); + AF4 klhgR=FsrEasuRF(p2); + AF4 klhgG=FsrEasuGF(p2); + AF4 klhgB=FsrEasuBF(p2); + AF4 zzonR=FsrEasuRF(p3); + AF4 zzonG=FsrEasuGF(p3); + AF4 zzonB=FsrEasuBF(p3); +//------------------------------------------------------------------------------------------------------------------------------ + // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD). + AF4 bczzL=bczzB*AF4_(0.5)+(bczzR*AF4_(0.5)+bczzG); + AF4 ijfeL=ijfeB*AF4_(0.5)+(ijfeR*AF4_(0.5)+ijfeG); + AF4 klhgL=klhgB*AF4_(0.5)+(klhgR*AF4_(0.5)+klhgG); + AF4 zzonL=zzonB*AF4_(0.5)+(zzonR*AF4_(0.5)+zzonG); + // Rename. + AF1 bL=bczzL.x; + AF1 cL=bczzL.y; + AF1 iL=ijfeL.x; + AF1 jL=ijfeL.y; + AF1 fL=ijfeL.z; + AF1 eL=ijfeL.w; + AF1 kL=klhgL.x; + AF1 lL=klhgL.y; + AF1 hL=klhgL.z; + AF1 gL=klhgL.w; + AF1 oL=zzonL.z; + AF1 nL=zzonL.w; + // Accumulate for bilinear interpolation. + AF2 dir=AF2_(0.0); + AF1 len=AF1_(0.0); + FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL); + FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL); + FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL); + FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL); +//------------------------------------------------------------------------------------------------------------------------------ + // Normalize with approximation, and cleanup close to zero. + AF2 dir2=dir*dir; + AF1 dirR=dir2.x+dir2.y; + AP1 zro=dirR w = -m/(n+e+w+s) +// 1 == (w*(n+e+w+s)+m)/(4*w+1) -> w = (1-m)/(n+e+w+s-4*1) +// Then chooses the 'w' which results in no clipping, limits 'w', and multiplies by the 'sharp' amount. +// This solution above has issues with MSAA input as the steps along the gradient cause edge detection issues. +// So RCAS uses 4x the maximum and 4x the minimum (depending on equation)in place of the individual taps. +// As well as switching from 'm' to either the minimum or maximum (depending on side), to help in energy conservation. +// This stabilizes RCAS. +// RCAS does a simple highpass which is normalized against the local contrast then shaped, +// 0.25 +// 0.25 -1 0.25 +// 0.25 +// This is used as a noise detection filter, to reduce the effect of RCAS on grain, and focus on real edges. +// +// GLSL example for the required callbacks : +// +// AH4 FsrRcasLoadH(ASW2 p){return AH4(imageLoad(imgSrc,ASU2(p)));} +// void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b) +// { +// //do any simple input color conversions here or leave empty if none needed +// } +// +// FsrRcasCon need to be called from the CPU or GPU to set up constants. +// Including a GPU example here, the 'con' value would be stored out to a constant buffer. +// +// AU4 con; +// FsrRcasCon(con, +// 0.0); // The scale is {0.0 := maximum sharpness, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. +// --------------- +// RCAS sharpening supports a CAS-like pass-through alpha via, +// #define FSR_RCAS_PASSTHROUGH_ALPHA 1 +// RCAS also supports a define to enable a more expensive path to avoid some sharpening of noise. +// Would suggest it is better to apply film grain after RCAS sharpening (and after scaling) instead of using this define, +// #define FSR_RCAS_DENOISE 1 +//============================================================================================================================== +// This is set at the limit of providing unnatural results for sharpening. +#define FSR_RCAS_LIMIT (0.25-(1.0/16.0)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CONSTANT SETUP +//============================================================================================================================== +// Call to setup required constant values (works on CPU or GPU). +A_STATIC void FsrRcasCon( +outAU4 con, +// The scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. +AF1 sharpness){ + // Transform from stops to linear value. + sharpness=AExp2F1(-sharpness); + varAF2(hSharp)=initAF2(sharpness,sharpness); + con[0]=AU1_AF1(sharpness); + con[1]=AU1_AH2_AF2(hSharp); + con[2]=0; + con[3]=0;} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 32-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(FSR_RCAS_F) + // Input callback prototypes that need to be implemented by calling shader + AF4 FsrRcasLoadF(ASU2 p); + void FsrRcasInputF(inout AF1 r,inout AF1 g,inout AF1 b); +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasF( + out AF1 pixR, // Output values, non-vector so port between RcasFilter() and RcasFilterH() is easy. + out AF1 pixG, + out AF1 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AF1 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by RcasSetup(). + // Algorithm uses minimal 3x3 pixel neighborhood. + // b + // d e f + // h + ASU2 sp=ASU2(ip); + AF3 b=FsrRcasLoadF(sp+ASU2( 0,-1)).rgb; + AF3 d=FsrRcasLoadF(sp+ASU2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AF4 ee=FsrRcasLoadF(sp); + AF3 e=ee.rgb;pixA=ee.a; + #else + AF3 e=FsrRcasLoadF(sp).rgb; + #endif + AF3 f=FsrRcasLoadF(sp+ASU2( 1, 0)).rgb; + AF3 h=FsrRcasLoadF(sp+ASU2( 0, 1)).rgb; + // Rename (32-bit) or regroup (16-bit). + AF1 bR=b.r; + AF1 bG=b.g; + AF1 bB=b.b; + AF1 dR=d.r; + AF1 dG=d.g; + AF1 dB=d.b; + AF1 eR=e.r; + AF1 eG=e.g; + AF1 eB=e.b; + AF1 fR=f.r; + AF1 fG=f.g; + AF1 fB=f.b; + AF1 hR=h.r; + AF1 hG=h.g; + AF1 hB=h.b; + // Run optional input transform. + FsrRcasInputF(bR,bG,bB); + FsrRcasInputF(dR,dG,dB); + FsrRcasInputF(eR,eG,eB); + FsrRcasInputF(fR,fG,fB); + FsrRcasInputF(hR,hG,hB); + // Luma times 2. + AF1 bL=bB*AF1_(0.5)+(bR*AF1_(0.5)+bG); + AF1 dL=dB*AF1_(0.5)+(dR*AF1_(0.5)+dG); + AF1 eL=eB*AF1_(0.5)+(eR*AF1_(0.5)+eG); + AF1 fL=fB*AF1_(0.5)+(fR*AF1_(0.5)+fG); + AF1 hL=hB*AF1_(0.5)+(hR*AF1_(0.5)+hG); + // Noise detection. + AF1 nz=AF1_(0.25)*bL+AF1_(0.25)*dL+AF1_(0.25)*fL+AF1_(0.25)*hL-eL; + nz=ASatF1(abs(nz)*APrxMedRcpF1(AMax3F1(AMax3F1(bL,dL,eL),fL,hL)-AMin3F1(AMin3F1(bL,dL,eL),fL,hL))); + nz=AF1_(-0.5)*nz+AF1_(1.0); + // Min and max of ring. + AF1 mn4R=min(AMin3F1(bR,dR,fR),hR); + AF1 mn4G=min(AMin3F1(bG,dG,fG),hG); + AF1 mn4B=min(AMin3F1(bB,dB,fB),hB); + AF1 mx4R=max(AMax3F1(bR,dR,fR),hR); + AF1 mx4G=max(AMax3F1(bG,dG,fG),hG); + AF1 mx4B=max(AMax3F1(bB,dB,fB),hB); + // Immediate constants for peak range. + AF2 peakC=AF2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + AF1 hitMinR=mn4R*ARcpF1(AF1_(4.0)*mx4R); + AF1 hitMinG=mn4G*ARcpF1(AF1_(4.0)*mx4G); + AF1 hitMinB=mn4B*ARcpF1(AF1_(4.0)*mx4B); + AF1 hitMaxR=(peakC.x-mx4R)*ARcpF1(AF1_(4.0)*mn4R+peakC.y); + AF1 hitMaxG=(peakC.x-mx4G)*ARcpF1(AF1_(4.0)*mn4G+peakC.y); + AF1 hitMaxB=(peakC.x-mx4B)*ARcpF1(AF1_(4.0)*mn4B+peakC.y); + AF1 lobeR=max(-hitMinR,hitMaxR); + AF1 lobeG=max(-hitMinG,hitMaxG); + AF1 lobeB=max(-hitMinB,hitMaxB); + AF1 lobe=max(AF1_(-FSR_RCAS_LIMIT),min(AMax3F1(lobeR,lobeG,lobeB),AF1_(0.0)))*AF1_AU1(con.x); + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AF1 rcpL=APrxMedRcpF1(AF1_(4.0)*lobe+AF1_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL; + return;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_H) + // Input callback prototypes that need to be implemented by calling shader + AH4 FsrRcasLoadH(ASW2 p); + void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b); +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasH( + out AH1 pixR, // Output values, non-vector so port between RcasFilter() and RcasFilterH() is easy. + out AH1 pixG, + out AH1 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AH1 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by RcasSetup(). + // Sharpening algorithm uses minimal 3x3 pixel neighborhood. + // b + // d e f + // h + ASW2 sp=ASW2(ip); + AH3 b=FsrRcasLoadH(sp+ASW2( 0,-1)).rgb; + AH3 d=FsrRcasLoadH(sp+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee=FsrRcasLoadH(sp); + AH3 e=ee.rgb;pixA=ee.a; + #else + AH3 e=FsrRcasLoadH(sp).rgb; + #endif + AH3 f=FsrRcasLoadH(sp+ASW2( 1, 0)).rgb; + AH3 h=FsrRcasLoadH(sp+ASW2( 0, 1)).rgb; + // Rename (32-bit) or regroup (16-bit). + AH1 bR=b.r; + AH1 bG=b.g; + AH1 bB=b.b; + AH1 dR=d.r; + AH1 dG=d.g; + AH1 dB=d.b; + AH1 eR=e.r; + AH1 eG=e.g; + AH1 eB=e.b; + AH1 fR=f.r; + AH1 fG=f.g; + AH1 fB=f.b; + AH1 hR=h.r; + AH1 hG=h.g; + AH1 hB=h.b; + // Run optional input transform. + FsrRcasInputH(bR,bG,bB); + FsrRcasInputH(dR,dG,dB); + FsrRcasInputH(eR,eG,eB); + FsrRcasInputH(fR,fG,fB); + FsrRcasInputH(hR,hG,hB); + // Luma times 2. + AH1 bL=bB*AH1_(0.5)+(bR*AH1_(0.5)+bG); + AH1 dL=dB*AH1_(0.5)+(dR*AH1_(0.5)+dG); + AH1 eL=eB*AH1_(0.5)+(eR*AH1_(0.5)+eG); + AH1 fL=fB*AH1_(0.5)+(fR*AH1_(0.5)+fG); + AH1 hL=hB*AH1_(0.5)+(hR*AH1_(0.5)+hG); + // Noise detection. + AH1 nz=AH1_(0.25)*bL+AH1_(0.25)*dL+AH1_(0.25)*fL+AH1_(0.25)*hL-eL; + nz=ASatH1(abs(nz)*APrxMedRcpH1(AMax3H1(AMax3H1(bL,dL,eL),fL,hL)-AMin3H1(AMin3H1(bL,dL,eL),fL,hL))); + nz=AH1_(-0.5)*nz+AH1_(1.0); + // Min and max of ring. + AH1 mn4R=min(AMin3H1(bR,dR,fR),hR); + AH1 mn4G=min(AMin3H1(bG,dG,fG),hG); + AH1 mn4B=min(AMin3H1(bB,dB,fB),hB); + AH1 mx4R=max(AMax3H1(bR,dR,fR),hR); + AH1 mx4G=max(AMax3H1(bG,dG,fG),hG); + AH1 mx4B=max(AMax3H1(bB,dB,fB),hB); + // Immediate constants for peak range. + AH2 peakC=AH2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + AH1 hitMinR=mn4R*ARcpH1(AH1_(4.0)*mx4R); + AH1 hitMinG=mn4G*ARcpH1(AH1_(4.0)*mx4G); + AH1 hitMinB=mn4B*ARcpH1(AH1_(4.0)*mx4B); + AH1 hitMaxR=(peakC.x-mx4R)*ARcpH1(AH1_(4.0)*mn4R+peakC.y); + AH1 hitMaxG=(peakC.x-mx4G)*ARcpH1(AH1_(4.0)*mn4G+peakC.y); + AH1 hitMaxB=(peakC.x-mx4B)*ARcpH1(AH1_(4.0)*mn4B+peakC.y); + AH1 lobeR=max(-hitMinR,hitMaxR); + AH1 lobeG=max(-hitMinG,hitMaxG); + AH1 lobeB=max(-hitMinB,hitMaxB); + AH1 lobe=max(AH1_(-FSR_RCAS_LIMIT),min(AMax3H1(lobeR,lobeG,lobeB),AH1_(0.0)))*AH2_AU1(con.y).x; + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AH1 rcpL=APrxMedRcpH1(AH1_(4.0)*lobe+AH1_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_HX2) + // Input callback prototypes that need to be implemented by the calling shader + AH4 FsrRcasLoadHx2(ASW2 p); + void FsrRcasInputHx2(inout AH2 r,inout AH2 g,inout AH2 b); +//------------------------------------------------------------------------------------------------------------------------------ + // Can be used to convert from packed Structures of Arrays to Arrays of Structures for store. + void FsrRcasDepackHx2(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){ + #ifdef A_HLSL + // Invoke a slower path for DX only, since it won't allow uninitialized values. + pix0.a=pix1.a=0.0; + #endif + pix0.rgb=AH3(pixR.x,pixG.x,pixB.x); + pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasHx2( + // Output values are for 2 8x8 tiles in a 16x8 region. + // pix.x = left 8x8 tile + // pix.y = right 8x8 tile + // This enables later processing to easily be packed as well. + out AH2 pixR, + out AH2 pixG, + out AH2 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AH2 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by RcasSetup(). + // No scaling algorithm uses minimal 3x3 pixel neighborhood. + ASW2 sp0=ASW2(ip); + AH3 b0=FsrRcasLoadHx2(sp0+ASW2( 0,-1)).rgb; + AH3 d0=FsrRcasLoadHx2(sp0+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee0=FsrRcasLoadHx2(sp0); + AH3 e0=ee0.rgb;pixA.r=ee0.a; + #else + AH3 e0=FsrRcasLoadHx2(sp0).rgb; + #endif + AH3 f0=FsrRcasLoadHx2(sp0+ASW2( 1, 0)).rgb; + AH3 h0=FsrRcasLoadHx2(sp0+ASW2( 0, 1)).rgb; + ASW2 sp1=sp0+ASW2(8,0); + AH3 b1=FsrRcasLoadHx2(sp1+ASW2( 0,-1)).rgb; + AH3 d1=FsrRcasLoadHx2(sp1+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee1=FsrRcasLoadHx2(sp1); + AH3 e1=ee1.rgb;pixA.g=ee1.a; + #else + AH3 e1=FsrRcasLoadHx2(sp1).rgb; + #endif + AH3 f1=FsrRcasLoadHx2(sp1+ASW2( 1, 0)).rgb; + AH3 h1=FsrRcasLoadHx2(sp1+ASW2( 0, 1)).rgb; + // Arrays of Structures to Structures of Arrays conversion. + AH2 bR=AH2(b0.r,b1.r); + AH2 bG=AH2(b0.g,b1.g); + AH2 bB=AH2(b0.b,b1.b); + AH2 dR=AH2(d0.r,d1.r); + AH2 dG=AH2(d0.g,d1.g); + AH2 dB=AH2(d0.b,d1.b); + AH2 eR=AH2(e0.r,e1.r); + AH2 eG=AH2(e0.g,e1.g); + AH2 eB=AH2(e0.b,e1.b); + AH2 fR=AH2(f0.r,f1.r); + AH2 fG=AH2(f0.g,f1.g); + AH2 fB=AH2(f0.b,f1.b); + AH2 hR=AH2(h0.r,h1.r); + AH2 hG=AH2(h0.g,h1.g); + AH2 hB=AH2(h0.b,h1.b); + // Run optional input transform. + FsrRcasInputHx2(bR,bG,bB); + FsrRcasInputHx2(dR,dG,dB); + FsrRcasInputHx2(eR,eG,eB); + FsrRcasInputHx2(fR,fG,fB); + FsrRcasInputHx2(hR,hG,hB); + // Luma times 2. + AH2 bL=bB*AH2_(0.5)+(bR*AH2_(0.5)+bG); + AH2 dL=dB*AH2_(0.5)+(dR*AH2_(0.5)+dG); + AH2 eL=eB*AH2_(0.5)+(eR*AH2_(0.5)+eG); + AH2 fL=fB*AH2_(0.5)+(fR*AH2_(0.5)+fG); + AH2 hL=hB*AH2_(0.5)+(hR*AH2_(0.5)+hG); + // Noise detection. + AH2 nz=AH2_(0.25)*bL+AH2_(0.25)*dL+AH2_(0.25)*fL+AH2_(0.25)*hL-eL; + nz=ASatH2(abs(nz)*APrxMedRcpH2(AMax3H2(AMax3H2(bL,dL,eL),fL,hL)-AMin3H2(AMin3H2(bL,dL,eL),fL,hL))); + nz=AH2_(-0.5)*nz+AH2_(1.0); + // Min and max of ring. + AH2 mn4R=min(AMin3H2(bR,dR,fR),hR); + AH2 mn4G=min(AMin3H2(bG,dG,fG),hG); + AH2 mn4B=min(AMin3H2(bB,dB,fB),hB); + AH2 mx4R=max(AMax3H2(bR,dR,fR),hR); + AH2 mx4G=max(AMax3H2(bG,dG,fG),hG); + AH2 mx4B=max(AMax3H2(bB,dB,fB),hB); + // Immediate constants for peak range. + AH2 peakC=AH2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + AH2 hitMinR=mn4R*ARcpH2(AH2_(4.0)*mx4R); + AH2 hitMinG=mn4G*ARcpH2(AH2_(4.0)*mx4G); + AH2 hitMinB=mn4B*ARcpH2(AH2_(4.0)*mx4B); + AH2 hitMaxR=(peakC.x-mx4R)*ARcpH2(AH2_(4.0)*mn4R+peakC.y); + AH2 hitMaxG=(peakC.x-mx4G)*ARcpH2(AH2_(4.0)*mn4G+peakC.y); + AH2 hitMaxB=(peakC.x-mx4B)*ARcpH2(AH2_(4.0)*mn4B+peakC.y); + AH2 lobeR=max(-hitMinR,hitMaxR); + AH2 lobeG=max(-hitMinG,hitMaxG); + AH2 lobeB=max(-hitMinB,hitMaxB); + AH2 lobe=max(AH2_(-FSR_RCAS_LIMIT),min(AMax3H2(lobeR,lobeG,lobeB),AH2_(0.0)))*AH2_(AH2_AU1(con.y).x); + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AH2 rcpL=APrxMedRcpH2(AH2_(4.0)*lobe+AH2_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [LFGA] LINEAR FILM GRAIN APPLICATOR +// +//------------------------------------------------------------------------------------------------------------------------------ +// Adding output-resolution film grain after scaling is a good way to mask both rendering and scaling artifacts. +// Suggest using tiled blue noise as film grain input, with peak noise frequency set for a specific look and feel. +// The 'Lfga*()' functions provide a convenient way to introduce grain. +// These functions limit grain based on distance to signal limits. +// This is done so that the grain is temporally energy preserving, and thus won't modify image tonality. +// Grain application should be done in a linear colorspace. +// The grain should be temporally changing, but have a temporal sum per pixel that adds to zero (non-biased). +//------------------------------------------------------------------------------------------------------------------------------ +// Usage, +// FsrLfga*( +// color, // In/out linear colorspace color {0 to 1} ranged. +// grain, // Per pixel grain texture value {-0.5 to 0.5} ranged, input is 3-channel to support colored grain. +// amount); // Amount of grain (0 to 1} ranged. +//------------------------------------------------------------------------------------------------------------------------------ +// Example if grain texture is monochrome: 'FsrLfgaF(color,AF3_(grain),amount)' +//============================================================================================================================== +#if defined(A_GPU) + // Maximum grain is the minimum distance to the signal limit. + void FsrLfgaF(inout AF3 c,AF3 t,AF1 a){c+=(t*AF3_(a))*min(AF3_(1.0)-c,c);} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + // Half precision version (slower). + void FsrLfgaH(inout AH3 c,AH3 t,AH1 a){c+=(t*AH3_(a))*min(AH3_(1.0)-c,c);} +//------------------------------------------------------------------------------------------------------------------------------ + // Packed half precision version (faster). + void FsrLfgaHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 tR,AH2 tG,AH2 tB,AH1 a){ + cR+=(tR*AH2_(a))*min(AH2_(1.0)-cR,cR);cG+=(tG*AH2_(a))*min(AH2_(1.0)-cG,cG);cB+=(tB*AH2_(a))*min(AH2_(1.0)-cB,cB);} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [SRTM] SIMPLE REVERSIBLE TONE-MAPPER +// +//------------------------------------------------------------------------------------------------------------------------------ +// This provides a way to take linear HDR color {0 to FP16_MAX} and convert it into a temporary {0 to 1} ranged post-tonemapped linear. +// The tonemapper preserves RGB ratio, which helps maintain HDR color bleed during filtering. +//------------------------------------------------------------------------------------------------------------------------------ +// Reversible tonemapper usage, +// FsrSrtm*(color); // {0 to FP16_MAX} converted to {0 to 1}. +// FsrSrtmInv*(color); // {0 to 1} converted into {0 to 32768, output peak safe for FP16}. +//============================================================================================================================== +#if defined(A_GPU) + void FsrSrtmF(inout AF3 c){c*=AF3_(ARcpF1(AMax3F1(c.r,c.g,c.b)+AF1_(1.0)));} + // The extra max solves the c=1.0 case (which is a /0). + void FsrSrtmInvF(inout AF3 c){c*=AF3_(ARcpF1(max(AF1_(1.0/32768.0),AF1_(1.0)-AMax3F1(c.r,c.g,c.b))));} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + void FsrSrtmH(inout AH3 c){c*=AH3_(ARcpH1(AMax3H1(c.r,c.g,c.b)+AH1_(1.0)));} + void FsrSrtmInvH(inout AH3 c){c*=AH3_(ARcpH1(max(AH1_(1.0/32768.0),AH1_(1.0)-AMax3H1(c.r,c.g,c.b))));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrSrtmHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ + AH2 rcp=ARcpH2(AMax3H2(cR,cG,cB)+AH2_(1.0));cR*=rcp;cG*=rcp;cB*=rcp;} + void FsrSrtmInvHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ + AH2 rcp=ARcpH2(max(AH2_(1.0/32768.0),AH2_(1.0)-AMax3H2(cR,cG,cB)));cR*=rcp;cG*=rcp;cB*=rcp;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [TEPD] TEMPORAL ENERGY PRESERVING DITHER +// +//------------------------------------------------------------------------------------------------------------------------------ +// Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion. +// Gamma 2.0 is used so that the conversion back to linear is just to square the color. +// The conversion comes in 8-bit and 10-bit modes, designed for output to 8-bit UNORM or 10:10:10:2 respectively. +// Given good non-biased temporal blue noise as dither input, +// the output dither will temporally conserve energy. +// This is done by choosing the linear nearest step point instead of perceptual nearest. +// See code below for details. +//------------------------------------------------------------------------------------------------------------------------------ +// DX SPEC RULES FOR FLOAT->UNORM 8-BIT CONVERSION +// =============================================== +// - Output is 'uint(floor(saturate(n)*255.0+0.5))'. +// - Thus rounding is to nearest. +// - NaN gets converted to zero. +// - INF is clamped to {0.0 to 1.0}. +//============================================================================================================================== +#if defined(A_GPU) + // Hand tuned integer position to dither value, with more values than simple checkerboard. + // Only 32-bit has enough precision for this compddation. + // Output is {0 to <1}. + AF1 FsrTepdDitF(AU2 p,AU1 f){ + AF1 x=AF1_(p.x+f); + AF1 y=AF1_(p.y); + // The 1.61803 golden ratio. + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + // Number designed to provide a good visual pattern. + AF1 b=AF1_(1.0/3.69); + x=x*a+(y*b); + return AFractF1(x);} +//------------------------------------------------------------------------------------------------------------------------------ + // This version is 8-bit gamma 2.0. + // The 'c' input is {0 to 1}. + // Output is {0 to 1} ready for image store. + void FsrTepdC8F(inout AF3 c,AF1 dit){ + AF3 n=sqrt(c); + n=floor(n*AF3_(255.0))*AF3_(1.0/255.0); + AF3 a=n*n; + AF3 b=n+AF3_(1.0/255.0);b=b*b; + // Ratio of 'a' to 'b' required to produce 'c'. + // APrxLoRcpF1() won't work here (at least for very high dynamic ranges). + // APrxMedRcpF1() is an IADD,FMA,MUL. + AF3 r=(c-b)*APrxMedRcpF3(a-b); + // Use the ratio as a cutoff to choose 'a' or 'b'. + // AGtZeroF1() is a MUL. + c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + // This version is 10-bit gamma 2.0. + // The 'c' input is {0 to 1}. + // Output is {0 to 1} ready for image store. + void FsrTepdC10F(inout AF3 c,AF1 dit){ + AF3 n=sqrt(c); + n=floor(n*AF3_(1023.0))*AF3_(1.0/1023.0); + AF3 a=n*n; + AF3 b=n+AF3_(1.0/1023.0);b=b*b; + AF3 r=(c-b)*APrxMedRcpF3(a-b); + c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/1023.0));} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + AH1 FsrTepdDitH(AU2 p,AU1 f){ + AF1 x=AF1_(p.x+f); + AF1 y=AF1_(p.y); + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + AF1 b=AF1_(1.0/3.69); + x=x*a+(y*b); + return AH1(AFractF1(x));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC8H(inout AH3 c,AH1 dit){ + AH3 n=sqrt(c); + n=floor(n*AH3_(255.0))*AH3_(1.0/255.0); + AH3 a=n*n; + AH3 b=n+AH3_(1.0/255.0);b=b*b; + AH3 r=(c-b)*APrxMedRcpH3(a-b); + c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC10H(inout AH3 c,AH1 dit){ + AH3 n=sqrt(c); + n=floor(n*AH3_(1023.0))*AH3_(1.0/1023.0); + AH3 a=n*n; + AH3 b=n+AH3_(1.0/1023.0);b=b*b; + AH3 r=(c-b)*APrxMedRcpH3(a-b); + c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/1023.0));} +//============================================================================================================================== + // This computes dither for positions 'p' and 'p+{8,0}'. + AH2 FsrTepdDitHx2(AU2 p,AU1 f){ + AF2 x; + x.x=AF1_(p.x+f); + x.y=x.x+AF1_(8.0); + AF1 y=AF1_(p.y); + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + AF1 b=AF1_(1.0/3.69); + x=x*AF2_(a)+AF2_(y*b); + return AH2(AFractF2(x));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC8Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ + AH2 nR=sqrt(cR); + AH2 nG=sqrt(cG); + AH2 nB=sqrt(cB); + nR=floor(nR*AH2_(255.0))*AH2_(1.0/255.0); + nG=floor(nG*AH2_(255.0))*AH2_(1.0/255.0); + nB=floor(nB*AH2_(255.0))*AH2_(1.0/255.0); + AH2 aR=nR*nR; + AH2 aG=nG*nG; + AH2 aB=nB*nB; + AH2 bR=nR+AH2_(1.0/255.0);bR=bR*bR; + AH2 bG=nG+AH2_(1.0/255.0);bG=bG*bG; + AH2 bB=nB+AH2_(1.0/255.0);bB=bB*bB; + AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR); + AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG); + AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB); + cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/255.0)); + cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/255.0)); + cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC10Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ + AH2 nR=sqrt(cR); + AH2 nG=sqrt(cG); + AH2 nB=sqrt(cB); + nR=floor(nR*AH2_(1023.0))*AH2_(1.0/1023.0); + nG=floor(nG*AH2_(1023.0))*AH2_(1.0/1023.0); + nB=floor(nB*AH2_(1023.0))*AH2_(1.0/1023.0); + AH2 aR=nR*nR; + AH2 aG=nG*nG; + AH2 aB=nB*nB; + AH2 bR=nR+AH2_(1.0/1023.0);bR=bR*bR; + AH2 bG=nG+AH2_(1.0/1023.0);bG=bG*bG; + AH2 bB=nB+AH2_(1.0/1023.0);bB=bB*bB; + AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR); + AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG); + AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB); + cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/1023.0)); + cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/1023.0)); + cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/1023.0));} +#endif diff --git a/externals/FidelityFX-FSR/license.txt b/externals/FidelityFX-FSR/license.txt new file mode 100644 index 000000000..324cba594 --- /dev/null +++ b/externals/FidelityFX-FSR/license.txt @@ -0,0 +1,19 @@ +Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. From b60966041c5b1dccd9c5c5ca00fb02353c2151bb Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 17 Oct 2021 17:22:16 +0200 Subject: [PATCH 105/156] Presentation: add Nearest Neighbor filter. --- src/common/settings.h | 9 ++-- .../renderer_opengl/renderer_opengl.cpp | 19 ++++++++- .../renderer_opengl/renderer_opengl.h | 1 + .../renderer_vulkan/vk_blit_screen.cpp | 41 +++++++++++++++---- .../renderer_vulkan/vk_blit_screen.h | 4 +- src/yuzu/configuration/configure_graphics.ui | 7 +++- 6 files changed, 67 insertions(+), 14 deletions(-) diff --git a/src/common/settings.h b/src/common/settings.h index 9da447ce0..84dab5217 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -61,10 +61,11 @@ enum class ResolutionSetup : u32 { }; enum class ScalingFilter : u32 { - Bilinear = 0, - Bicubic = 1, - ScaleForce = 2, - Fsr = 3, + NearestNeighbor = 0, + Bilinear = 1, + Bicubic = 2, + ScaleForce = 3, + Fsr = 4, }; struct ResolutionScalingInfo { diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 955dbc744..68423601c 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -264,6 +264,10 @@ void RendererOpenGL::InitOpenGLObjects() { glSamplerParameteri(present_sampler.handle, GL_TEXTURE_MIN_FILTER, GL_LINEAR); glSamplerParameteri(present_sampler.handle, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + present_sampler_nn.Create(); + glSamplerParameteri(present_sampler_nn.handle, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glSamplerParameteri(present_sampler_nn.handle, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + // Generate VBO handle for drawing vertex_buffer.Create(); @@ -346,6 +350,9 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { GLuint fragment_handle; const auto filter = Settings::values.scaling_filter.GetValue(); switch (filter) { + case Settings::ScalingFilter::NearestNeighbor: + fragment_handle = present_bilinear_fragment.handle; + break; case Settings::ScalingFilter::Bilinear: fragment_handle = present_bilinear_fragment.handle; break; @@ -355,6 +362,12 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { case Settings::ScalingFilter::ScaleForce: fragment_handle = present_scaleforce_fragment.handle; break; + case Settings::ScalingFilter::Fsr: + LOG_WARNING( + Render_OpenGL, + "FidelityFX FSR Super Sampling is not supported in OpenGL, changing to ScaleForce"); + fragment_handle = present_scaleforce_fragment.handle; + break; default: fragment_handle = present_bilinear_fragment.handle; break; @@ -464,7 +477,11 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { } glBindTextureUnit(0, screen_info.display_texture); - glBindSampler(0, present_sampler.handle); + if (Settings::values.scaling_filter.GetValue() != Settings::ScalingFilter::NearestNeighbor) { + glBindSampler(0, present_sampler.handle); + } else { + glBindSampler(0, present_sampler_nn.handle); + } glClear(GL_COLOR_BUFFER_BIT); glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index bf3d3502c..504ddbe7b 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -109,6 +109,7 @@ private: // OpenGL object IDs OGLSampler present_sampler; + OGLSampler present_sampler_nn; OGLBuffer vertex_buffer; OGLProgram present_vertex; OGLProgram present_bilinear_fragment; diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 8ce60e874..334eeb92e 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -152,7 +152,9 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, use_accelerated ? screen_info.image_view : *raw_image_views[image_index]; if (!fsr) { - UpdateDescriptorSet(image_index, source_image_view); + const bool is_nn = + Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::NearestNeighbor; + UpdateDescriptorSet(image_index, source_image_view, is_nn); } BufferData data; @@ -247,7 +249,7 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, crop_rect = crop_rect.Scale(Settings::values.resolution_info.up_factor); VkImageView fsr_image_view = fsr->Draw(scheduler, image_index, source_image_view, crop_rect); - UpdateDescriptorSet(image_index, fsr_image_view); + UpdateDescriptorSet(image_index, fsr_image_view, true); } scheduler.Record( @@ -286,6 +288,9 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, const auto filter = Settings::values.scaling_filter.GetValue(); cmdbuf.BeginRenderPass(renderpass_bi, VK_SUBPASS_CONTENTS_INLINE); switch (filter) { + case Settings::ScalingFilter::NearestNeighbor: + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *bilinear_pipeline); + break; case Settings::ScalingFilter::Bilinear: cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *bilinear_pipeline); break; @@ -745,13 +750,33 @@ void VKBlitScreen::CreateGraphicsPipeline() { } void VKBlitScreen::CreateSampler() { - bool linear = Settings::values.scaling_filter.GetValue() != Settings::ScalingFilter::Fsr; const VkSamplerCreateInfo ci{ .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, .pNext = nullptr, .flags = 0, - .magFilter = linear ? VK_FILTER_LINEAR : VK_FILTER_NEAREST, - .minFilter = linear ? VK_FILTER_LINEAR : VK_FILTER_NEAREST, + .magFilter = VK_FILTER_LINEAR, + .minFilter = VK_FILTER_LINEAR, + .mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST, + .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, + .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, + .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, + .mipLodBias = 0.0f, + .anisotropyEnable = VK_FALSE, + .maxAnisotropy = 0.0f, + .compareEnable = VK_FALSE, + .compareOp = VK_COMPARE_OP_NEVER, + .minLod = 0.0f, + .maxLod = 0.0f, + .borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK, + .unnormalizedCoordinates = VK_FALSE, + }; + + const VkSamplerCreateInfo ci_nn{ + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .magFilter = VK_FILTER_NEAREST, + .minFilter = VK_FILTER_NEAREST, .mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST, .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, @@ -768,6 +793,7 @@ void VKBlitScreen::CreateSampler() { }; sampler = device.GetLogical().CreateSampler(ci); + nn_sampler = device.GetLogical().CreateSampler(ci_nn); } void VKBlitScreen::CreateFramebuffers() { @@ -862,7 +888,8 @@ void VKBlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) } } -void VKBlitScreen::UpdateDescriptorSet(std::size_t image_index, VkImageView image_view) const { +void VKBlitScreen::UpdateDescriptorSet(std::size_t image_index, VkImageView image_view, + bool nn) const { const VkDescriptorBufferInfo buffer_info{ .buffer = *buffer, .offset = offsetof(BufferData, uniform), @@ -883,7 +910,7 @@ void VKBlitScreen::UpdateDescriptorSet(std::size_t image_index, VkImageView imag }; const VkDescriptorImageInfo image_info{ - .sampler = *sampler, + .sampler = nn ? *nn_sampler : *sampler, .imageView = image_view, .imageLayout = VK_IMAGE_LAYOUT_GENERAL, }; diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index 337931468..448a2fbe6 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -90,7 +90,7 @@ private: void CreateStagingBuffer(const Tegra::FramebufferConfig& framebuffer); void CreateRawImages(const Tegra::FramebufferConfig& framebuffer); - void UpdateDescriptorSet(std::size_t image_index, VkImageView image_view) const; + void UpdateDescriptorSet(std::size_t image_index, VkImageView image_view, bool nn) const; void SetUniformData(BufferData& data, const Layout::FramebufferLayout layout) const; void SetVertexData(BufferData& data, const Tegra::FramebufferConfig& framebuffer, const Layout::FramebufferLayout layout) const; @@ -115,12 +115,14 @@ private: vk::DescriptorPool descriptor_pool; vk::DescriptorSetLayout descriptor_set_layout; vk::PipelineLayout pipeline_layout; + vk::Pipeline nearest_neightbor_pipeline; vk::Pipeline bilinear_pipeline; vk::Pipeline bicubic_pipeline; vk::Pipeline scaleforce_pipeline; vk::RenderPass renderpass; std::vector framebuffers; vk::DescriptorSets descriptor_sets; + vk::Sampler nn_sampler; vk::Sampler sampler; vk::Buffer buffer; diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui index 014ca6683..fe2f6bb7f 100644 --- a/src/yuzu/configuration/configure_graphics.ui +++ b/src/yuzu/configuration/configure_graphics.ui @@ -387,6 +387,11 @@ + + + Nearest Neighbor + + Bilinear @@ -404,7 +409,7 @@ - FidelityFX Super Resolution + FidelityFX Super Resolution [Vulkan Only] From 425ab9ef4b982213f4ee0d53196f5474e255374f Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 17 Oct 2021 18:01:18 +0200 Subject: [PATCH 106/156] Texture Cache: Fix downscaling and correct memory comsumption. --- .../renderer_opengl/gl_texture_cache.cpp | 41 ++++++-- .../renderer_opengl/gl_texture_cache.h | 2 +- .../renderer_vulkan/vk_texture_cache.cpp | 93 +++++++++++++++++-- .../renderer_vulkan/vk_texture_cache.h | 3 + src/video_core/texture_cache/image_base.cpp | 4 +- src/video_core/texture_cache/image_base.h | 5 + src/video_core/texture_cache/texture_cache.h | 31 ++++--- .../texture_cache/texture_cache_base.h | 2 +- 8 files changed, 146 insertions(+), 35 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 944a3aa65..34d3723e5 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -876,7 +876,7 @@ void Image::CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t b } } -bool Image::Scale() { +bool Image::Scale(bool up_scale) { const auto format_type = GetFormatType(info.format); const GLenum attachment = [format_type] { switch (format_type) { @@ -944,14 +944,25 @@ bool Image::Scale() { const GLuint draw_fbo = runtime->rescale_draw_fbos[fbo_index].handle; for (s32 layer = 0; layer < info.resources.layers; ++layer) { for (s32 level = 0; level < info.resources.levels; ++level) { - const u32 src_level_width = std::max(1u, original_width >> level); - const u32 src_level_height = std::max(1u, original_height >> level); - const u32 dst_level_width = std::max(1u, scaled_width >> level); - const u32 dst_level_height = std::max(1u, scaled_height >> level); + const u32 src_level_width = + std::max(1u, (up_scale ? original_width : scaled_width) >> level); + const u32 src_level_height = + std::max(1u, (up_scale ? original_height : scaled_height) >> level); + const u32 dst_level_width = + std::max(1u, (up_scale ? scaled_width : original_width) >> level); + const u32 dst_level_height = + std::max(1u, (up_scale ? scaled_height : original_height) >> level); + + if (up_scale) { + glNamedFramebufferTextureLayer(read_fbo, attachment, texture.handle, level, layer); + glNamedFramebufferTextureLayer(draw_fbo, attachment, upscaled_backup.handle, level, + layer); + } else { + glNamedFramebufferTextureLayer(read_fbo, attachment, upscaled_backup.handle, level, + layer); + glNamedFramebufferTextureLayer(draw_fbo, attachment, texture.handle, level, layer); + } - glNamedFramebufferTextureLayer(read_fbo, attachment, texture.handle, level, layer); - glNamedFramebufferTextureLayer(draw_fbo, attachment, upscaled_backup.handle, level, - layer); glBlitNamedFramebuffer(read_fbo, draw_fbo, 0, 0, src_level_width, src_level_height, 0, 0, dst_level_width, dst_level_height, mask, filter); } @@ -959,7 +970,12 @@ bool Image::Scale() { if (scissor_test != GL_FALSE) { glEnablei(GL_SCISSOR_TEST, 0); } - current_texture = upscaled_backup.handle; + if (up_scale) { + current_texture = upscaled_backup.handle; + } else { + current_texture = texture.handle; + } + return true; } @@ -981,6 +997,7 @@ bool Image::ScaleUp() { flags &= ~ImageFlagBits::Rescaled; return false; } + scale_count++; if (!Scale()) { flags &= ~ImageFlagBits::Rescaled; return false; @@ -996,7 +1013,11 @@ bool Image::ScaleDown() { if (!runtime->resolution.active) { return false; } - current_texture = texture.handle; + scale_count++; + if (!Scale(false)) { + flags &= ~ImageFlagBits::Rescaled; + return false; + } return true; } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index f90dbfe9e..81aaef3da 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -205,7 +205,7 @@ private: void CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset); - bool Scale(); + bool Scale(bool up_scale = true); OGLTexture texture; OGLTexture upscaled_backup; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 17c62e27d..51367c01d 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -592,7 +592,8 @@ struct RangedBarrierRange { } void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, const ImageInfo& info, - VkImageAspectFlags aspect_mask, const Settings::ResolutionScalingInfo& resolution) { + VkImageAspectFlags aspect_mask, const Settings::ResolutionScalingInfo& resolution, + bool up_scaling = true) { const bool is_2d = info.type == ImageType::e2D; const auto resources = info.resources; const VkExtent2D extent{ @@ -605,14 +606,16 @@ void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, con scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([dst_image, src_image, extent, resources, aspect_mask, resolution, is_2d, - vk_filter](vk::CommandBuffer cmdbuf) { + vk_filter, up_scaling](vk::CommandBuffer cmdbuf) { const VkOffset2D src_size{ - .x = static_cast(extent.width), - .y = static_cast(extent.height), + .x = static_cast(up_scaling ? extent.width : resolution.ScaleUp(extent.width)), + .y = static_cast(is_2d && up_scaling ? extent.height + : resolution.ScaleUp(extent.height)), }; const VkOffset2D dst_size{ - .x = static_cast(resolution.ScaleUp(extent.width)), - .y = static_cast(is_2d ? resolution.ScaleUp(extent.height) : extent.height), + .x = static_cast(up_scaling ? resolution.ScaleUp(extent.width) : extent.width), + .y = static_cast(is_2d && up_scaling ? resolution.ScaleUp(extent.height) + : extent.height), }; boost::container::small_vector regions; regions.reserve(resources.levels); @@ -1134,6 +1137,7 @@ bool Image::ScaleUp() { if (!resolution.active) { return false; } + scale_count++; const auto& device = runtime->device; const bool is_2d = info.type == ImageType::e2D; const u32 scaled_width = resolution.ScaleUp(info.size.width); @@ -1161,8 +1165,10 @@ bool Image::ScaleUp() { using namespace VideoCommon; static constexpr auto BLIT_OPERATION = Tegra::Engines::Fermi2D::Operation::SrcCopy; - const auto view_info = ImageViewInfo(ImageViewType::e2D, info.format); - scale_view = std::make_unique(*runtime, view_info, NULL_IMAGE_ID, *this); + if (!scale_view) { + const auto view_info = ImageViewInfo(ImageViewType::e2D, info.format); + scale_view = std::make_unique(*runtime, view_info, NULL_IMAGE_ID, *this); + } auto* view_ptr = scale_view.get(); const Region2D src_region{ @@ -1178,7 +1184,10 @@ bool Image::ScaleUp() { .height = scaled_height, }; if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT) { - scale_framebuffer = std::make_unique(*runtime, view_ptr, nullptr, extent); + if (!scale_framebuffer) { + scale_framebuffer = + std::make_unique(*runtime, view_ptr, nullptr, extent); + } const auto color_view = scale_view->Handle(Shader::TextureType::Color2D); runtime->blit_image_helper.BlitColor( @@ -1186,7 +1195,10 @@ bool Image::ScaleUp() { Tegra::Engines::Fermi2D::Filter::Bilinear, BLIT_OPERATION); } else if (!runtime->device.IsBlitDepthStencilSupported() && aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { - scale_framebuffer = std::make_unique(*runtime, nullptr, view_ptr, extent); + if (!scale_framebuffer) { + scale_framebuffer = + std::make_unique(*runtime, view_ptr, nullptr, extent); + } runtime->blit_image_helper.BlitDepthStencil( scale_framebuffer.get(), scale_view->DepthView(), scale_view->StencilView(), dst_region, src_region, Tegra::Engines::Fermi2D::Filter::Point, BLIT_OPERATION); @@ -1209,6 +1221,67 @@ bool Image::ScaleDown() { if (!resolution.active) { return false; } + const auto& device = runtime->device; + const bool is_2d = info.type == ImageType::e2D; + const u32 scaled_width = resolution.ScaleUp(info.size.width); + const u32 scaled_height = is_2d ? resolution.ScaleUp(info.size.height) : info.size.height; + if (aspect_mask == 0) { + aspect_mask = ImageAspectMask(info.format); + } + static constexpr auto OPTIMAL_FORMAT = FormatType::Optimal; + const PixelFormat format = StorageFormat(info.format); + const auto vk_format = MaxwellToVK::SurfaceFormat(device, OPTIMAL_FORMAT, false, format).format; + const auto blit_usage = VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; + if (device.IsFormatSupported(vk_format, blit_usage, OPTIMAL_FORMAT)) { + BlitScale(*scheduler, *scaled_image, *original_image, info, aspect_mask, resolution, false); + } else { + using namespace VideoCommon; + static constexpr auto BLIT_OPERATION = Tegra::Engines::Fermi2D::Operation::SrcCopy; + + if (!normal_view) { + const auto view_info = ImageViewInfo(ImageViewType::e2D, info.format); + normal_view = std::make_unique(*runtime, view_info, NULL_IMAGE_ID, *this); + } + auto* view_ptr = normal_view.get(); + + const Region2D src_region{ + .start = {0, 0}, + .end = {static_cast(scaled_width), static_cast(scaled_height)}, + }; + const Region2D dst_region{ + .start = {0, 0}, + .end = {static_cast(info.size.width), static_cast(info.size.height)}, + }; + const VkExtent2D extent{ + .width = scaled_width, + .height = scaled_height, + }; + if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT) { + if (!normal_framebuffer) { + normal_framebuffer = + std::make_unique(*runtime, view_ptr, nullptr, extent); + } + const auto color_view = normal_view->Handle(Shader::TextureType::Color2D); + + runtime->blit_image_helper.BlitColor( + normal_framebuffer.get(), color_view, dst_region, src_region, + Tegra::Engines::Fermi2D::Filter::Bilinear, BLIT_OPERATION); + } else if (!runtime->device.IsBlitDepthStencilSupported() && + aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { + if (!normal_framebuffer) { + normal_framebuffer = + std::make_unique(*runtime, view_ptr, nullptr, extent); + } + runtime->blit_image_helper.BlitDepthStencil( + normal_framebuffer.get(), normal_view->DepthView(), normal_view->StencilView(), + dst_region, src_region, Tegra::Engines::Fermi2D::Filter::Point, BLIT_OPERATION); + } else { + // TODO: Use helper blits where applicable + flags &= ~ImageFlagBits::Rescaled; + LOG_ERROR(Render_Vulkan, "Device does not support scaling format {}", format); + return false; + } + } ASSERT(info.type != ImageType::Linear); current_image = *original_image; return true; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 6dc190632..df854a20c 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -148,6 +148,9 @@ private: std::unique_ptr scale_framebuffer; std::unique_ptr scale_view; + + std::unique_ptr normal_framebuffer; + std::unique_ptr normal_view; }; class ImageView : public VideoCommon::ImageViewBase { diff --git a/src/video_core/texture_cache/image_base.cpp b/src/video_core/texture_cache/image_base.cpp index 1909c9ecb..3db2ec825 100644 --- a/src/video_core/texture_cache/image_base.cpp +++ b/src/video_core/texture_cache/image_base.cpp @@ -60,8 +60,8 @@ namespace { ImageBase::ImageBase(const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) : info{info_}, guest_size_bytes{CalculateGuestSizeInBytes(info)}, unswizzled_size_bytes{CalculateUnswizzledSizeBytes(info)}, - converted_size_bytes{CalculateConvertedSizeBytes(info)}, scale_rating{}, - scale_tick{}, gpu_addr{gpu_addr_}, cpu_addr{cpu_addr_}, + converted_size_bytes{CalculateConvertedSizeBytes(info)}, scale_rating{}, scale_tick{}, + scale_count{}, gpu_addr{gpu_addr_}, cpu_addr{cpu_addr_}, cpu_addr_end{cpu_addr + guest_size_bytes}, mip_level_offsets{CalculateMipLevelOffsets(info)} { if (info.type == ImageType::e3D) { slice_offsets = CalculateSliceOffsets(info); diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index bab290ac7..cd4b5f636 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -77,6 +77,10 @@ struct ImageBase { void CheckBadOverlapState(); void CheckAliasState(); + bool HasScaled() { + return scale_count > 0; + } + ImageInfo info; u32 guest_size_bytes = 0; @@ -84,6 +88,7 @@ struct ImageBase { u32 converted_size_bytes = 0; u32 scale_rating = 0; u64 scale_tick = 0; + u32 scale_count = 0; ImageFlagBits flags = ImageFlagBits::CpuModified; GPUVAddr gpu_addr = 0; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index a035d2b18..cf0d33a45 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -854,8 +854,8 @@ void TextureCache

::InvalidateScale(Image& image) { } template -u64 TextureCache

::GetScaledImageSizeBytes(Image& image) { - const f32 add_to_size = Settings::values.resolution_info.up_factor - 1.0f; +u64 TextureCache

::GetScaledImageSizeBytes(ImageBase& image) { + const f32 add_to_size = Settings::values.resolution_info.up_factor; const bool sign = std::signbit(add_to_size); const u32 image_size_bytes = std::max(image.guest_size_bytes, image.unswizzled_size_bytes); const u64 tentative_size = image_size_bytes * static_cast(std::abs(add_to_size)); @@ -865,11 +865,14 @@ u64 TextureCache

::GetScaledImageSizeBytes(Image& image) { template bool TextureCache

::ScaleUp(Image& image) { + const bool has_copy = image.HasScaled(); const bool rescaled = image.ScaleUp(); if (!rescaled) { return false; } - total_used_memory += GetScaledImageSizeBytes(image); + if (!has_copy) { + total_used_memory += GetScaledImageSizeBytes(image); + } InvalidateScale(image); return true; } @@ -880,7 +883,10 @@ bool TextureCache

::ScaleDown(Image& image) { if (!rescaled) { return false; } - total_used_memory -= GetScaledImageSizeBytes(image); + const bool has_copy = image.HasScaled(); + if (!has_copy) { + total_used_memory -= GetScaledImageSizeBytes(image); + } InvalidateScale(image); return true; } @@ -1391,13 +1397,6 @@ void TextureCache

::UnregisterImage(ImageId image_id) { "Trying to unregister an already registered image"); image.flags &= ~ImageFlagBits::Registered; image.flags &= ~ImageFlagBits::BadOverlap; - u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes); - if ((IsPixelFormatASTC(image.info.format) && - True(image.flags & ImageFlagBits::AcceleratedUpload)) || - True(image.flags & ImageFlagBits::Converted)) { - tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format); - } - total_used_memory -= Common::AlignUp(tentative_size, 1024); lru_cache.Free(image.lru_index); const auto& clear_page_table = [this, image_id]( @@ -1478,6 +1477,16 @@ template void TextureCache

::TrackImage(ImageBase& image, ImageId image_id) { ASSERT(False(image.flags & ImageFlagBits::Tracked)); image.flags |= ImageFlagBits::Tracked; + if (image.HasScaled()) { + total_used_memory -= GetScaledImageSizeBytes(image); + } + u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes); + if ((IsPixelFormatASTC(image.info.format) && + True(image.flags & ImageFlagBits::AcceleratedUpload)) || + True(image.flags & ImageFlagBits::Converted)) { + tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format); + } + total_used_memory -= Common::AlignUp(tentative_size, 1024); if (False(image.flags & ImageFlagBits::Sparse)) { rasterizer.UpdatePagesCachedCount(image.cpu_addr, image.guest_size_bytes, 1); return; diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 4dbe050af..e210393ba 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -331,7 +331,7 @@ private: void InvalidateScale(Image& image); bool ScaleUp(Image& image); bool ScaleDown(Image& image); - u64 GetScaledImageSizeBytes(Image& image); + u64 GetScaledImageSizeBytes(ImageBase& image); Runtime& runtime; VideoCore::RasterizerInterface& rasterizer; From 50b4c774cb30380761936f4cb897c31fc2d49075 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 17 Oct 2021 20:08:39 +0200 Subject: [PATCH 107/156] Vulkan: Fix Blit Depth Stencil --- src/video_core/renderer_vulkan/blit_image.cpp | 27 +++++++++++-------- src/video_core/renderer_vulkan/blit_image.h | 7 ++--- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/src/video_core/renderer_vulkan/blit_image.cpp b/src/video_core/renderer_vulkan/blit_image.cpp index bc3e4b93d..239698423 100644 --- a/src/video_core/renderer_vulkan/blit_image.cpp +++ b/src/video_core/renderer_vulkan/blit_image.cpp @@ -374,7 +374,7 @@ void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, VkImageView }; const VkPipelineLayout layout = *one_texture_pipeline_layout; const VkSampler sampler = is_linear ? *linear_sampler : *nearest_sampler; - const VkPipeline pipeline = FindOrEmplacePipeline(key); + const VkPipeline pipeline = FindOrEmplaceColorPipeline(key); scheduler.RequestRenderpass(dst_framebuffer); scheduler.Record([this, dst_region, src_region, pipeline, layout, sampler, src_view](vk::CommandBuffer cmdbuf) { @@ -397,10 +397,13 @@ void BlitImageHelper::BlitDepthStencil(const Framebuffer* dst_framebuffer, Tegra::Engines::Fermi2D::Operation operation) { ASSERT(filter == Tegra::Engines::Fermi2D::Filter::Point); ASSERT(operation == Tegra::Engines::Fermi2D::Operation::SrcCopy); - + const BlitImagePipelineKey key{ + .renderpass = dst_framebuffer->RenderPass(), + .operation = operation, + }; const VkPipelineLayout layout = *two_textures_pipeline_layout; const VkSampler sampler = *nearest_sampler; - const VkPipeline pipeline = BlitDepthStencilPipeline(dst_framebuffer->RenderPass()); + const VkPipeline pipeline = FindOrEmplaceDepthStencilPipeline(key); scheduler.RequestRenderpass(dst_framebuffer); scheduler.Record([dst_region, src_region, pipeline, layout, sampler, src_depth_view, src_stencil_view, this](vk::CommandBuffer cmdbuf) { @@ -492,7 +495,7 @@ void BlitImageHelper::Convert(VkPipeline pipeline, const Framebuffer* dst_frameb scheduler.InvalidateState(); } -VkPipeline BlitImageHelper::FindOrEmplacePipeline(const BlitImagePipelineKey& key) { +VkPipeline BlitImageHelper::FindOrEmplaceColorPipeline(const BlitImagePipelineKey& key) { const auto it = std::ranges::find(blit_color_keys, key); if (it != blit_color_keys.end()) { return *blit_color_pipelines[std::distance(blit_color_keys.begin(), it)]; @@ -546,12 +549,14 @@ VkPipeline BlitImageHelper::FindOrEmplacePipeline(const BlitImagePipelineKey& ke return *blit_color_pipelines.back(); } -VkPipeline BlitImageHelper::BlitDepthStencilPipeline(VkRenderPass renderpass) { - if (blit_depth_stencil_pipeline) { - return *blit_depth_stencil_pipeline; +VkPipeline BlitImageHelper::FindOrEmplaceDepthStencilPipeline(const BlitImagePipelineKey& key) { + const auto it = std::ranges::find(blit_depth_stencil_keys, key); + if (it != blit_depth_stencil_keys.end()) { + return *blit_depth_stencil_pipelines[std::distance(blit_depth_stencil_keys.begin(), it)]; } + blit_depth_stencil_keys.push_back(key); const std::array stages = MakeStages(*full_screen_vert, *blit_depth_stencil_frag); - blit_depth_stencil_pipeline = device.GetLogical().CreateGraphicsPipeline({ + blit_depth_stencil_pipelines.push_back(device.GetLogical().CreateGraphicsPipeline({ .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -567,12 +572,12 @@ VkPipeline BlitImageHelper::BlitDepthStencilPipeline(VkRenderPass renderpass) { .pColorBlendState = &PIPELINE_COLOR_BLEND_STATE_EMPTY_CREATE_INFO, .pDynamicState = &PIPELINE_DYNAMIC_STATE_CREATE_INFO, .layout = *two_textures_pipeline_layout, - .renderPass = renderpass, + .renderPass = key.renderpass, .subpass = 0, .basePipelineHandle = VK_NULL_HANDLE, .basePipelineIndex = 0, - }); - return *blit_depth_stencil_pipeline; + })); + return *blit_depth_stencil_pipelines.back(); } void BlitImageHelper::ConvertDepthToColorPipeline(vk::Pipeline& pipeline, VkRenderPass renderpass) { diff --git a/src/video_core/renderer_vulkan/blit_image.h b/src/video_core/renderer_vulkan/blit_image.h index c0f4a16a4..d77f76678 100644 --- a/src/video_core/renderer_vulkan/blit_image.h +++ b/src/video_core/renderer_vulkan/blit_image.h @@ -60,9 +60,9 @@ private: void Convert(VkPipeline pipeline, const Framebuffer* dst_framebuffer, const ImageView& src_image_view, u32 up_scale, u32 down_shift); - [[nodiscard]] VkPipeline FindOrEmplacePipeline(const BlitImagePipelineKey& key); + [[nodiscard]] VkPipeline FindOrEmplaceColorPipeline(const BlitImagePipelineKey& key); - [[nodiscard]] VkPipeline BlitDepthStencilPipeline(VkRenderPass renderpass); + [[nodiscard]] VkPipeline FindOrEmplaceDepthStencilPipeline(const BlitImagePipelineKey& key); void ConvertDepthToColorPipeline(vk::Pipeline& pipeline, VkRenderPass renderpass); @@ -88,7 +88,8 @@ private: std::vector blit_color_keys; std::vector blit_color_pipelines; - vk::Pipeline blit_depth_stencil_pipeline; + std::vector blit_depth_stencil_keys; + std::vector blit_depth_stencil_pipelines; vk::Pipeline convert_d32_to_r32_pipeline; vk::Pipeline convert_r32_to_d32_pipeline; vk::Pipeline convert_d16_to_r16_pipeline; From c2ca55c9d576940cfb37ba8569b1656b72c65569 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Mon, 18 Oct 2021 14:04:54 +0200 Subject: [PATCH 108/156] Texture Cache: ease the requirements of textures being blacklisted. --- src/video_core/texture_cache/texture_cache.h | 27 +++++-------------- .../texture_cache/texture_cache_base.h | 2 -- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index cf0d33a45..c885586e8 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -221,7 +221,6 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { scale_rating = 0; bool any_rescaled = false; bool can_rescale = true; - bool any_blacklisted = false; const auto check_rescale = [&](ImageViewId view_id, ImageId& id_save) { if (view_id != NULL_IMAGE_VIEW_ID && view_id != ImageViewId{}) { const auto& view = slot_image_views[view_id]; @@ -229,7 +228,6 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { id_save = image_id; auto& image = slot_images[image_id]; can_rescale &= ImageCanRescale(image); - any_blacklisted |= True(image.flags & ImageFlagBits::Blacklisted); any_rescaled |= True(image.flags & ImageFlagBits::Rescaled) || GetFormatType(image.info.format) != SurfaceType::ColorTexture; scale_rating = std::max(scale_rating, image.scale_tick <= frame_tick @@ -270,20 +268,17 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { } } else { rescaled = false; - const auto scale_down = [this, any_blacklisted](ImageId image_id) { + const auto scale_down = [this](ImageId image_id) { if (image_id != CORRUPT_ID) { Image& image = slot_images[image_id]; ScaleDown(image); - if (any_blacklisted) { - image.flags |= ImageFlagBits::Blacklisted; - } } }; for (size_t index = 0; index < NUM_RT; ++index) { scale_down(tmp_color_images[index]); } scale_down(tmp_depth_image); - scale_rating = 0; + scale_rating = 1; } } while (has_deleted_images); // Rescale End @@ -352,7 +347,10 @@ void TextureCache

::FillImageViews(DescriptorTable& table, if constexpr (has_blacklists) { if (view.blacklist && view.id != NULL_IMAGE_VIEW_ID) { const ImageViewBase& image_view{slot_image_views[view.id]}; - has_blacklisted |= BlackListImage(image_view.image_id); + auto& image = slot_images[image_view.image_id]; + image.flags |= ImageFlagBits::Blacklisted; + has_blacklisted |= ScaleDown(image); + image.scale_rating = 0; } } } @@ -783,20 +781,9 @@ ImageId TextureCache

::FindImage(const ImageInfo& info, GPUVAddr gpu_addr, return image_id; } -template -bool TextureCache

::BlackListImage(ImageId image_id) { - auto& image = slot_images[image_id]; - if (True(image.flags & ImageFlagBits::Blacklisted)) { - return false; - } - image.flags |= ImageFlagBits::Blacklisted; - ScaleDown(image); - return true; -} - template bool TextureCache

::ImageCanRescale(ImageBase& image) { - if (!image.info.rescaleable || True(image.flags & ImageFlagBits::Blacklisted)) { + if (!image.info.rescaleable) { return false; } if (Settings::values.resolution_info.downscale && !image.info.downscaleable) { diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index e210393ba..4f876b2f4 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -166,8 +166,6 @@ public: [[nodiscard]] bool IsRescaling(const ImageViewBase& image_view) const noexcept; - [[nodiscard]] bool BlackListImage(ImageId image_id); - std::mutex mutex; private: From 3b61de74e6dc7526ffa8f03c21d81e2c3566ce90 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Mon, 18 Oct 2021 22:56:36 +0200 Subject: [PATCH 109/156] Texture Cache: fix memory managment and optimize scaled downloads, uploads. --- .../renderer_opengl/gl_texture_cache.cpp | 24 +++++++++++++++---- .../renderer_opengl/gl_texture_cache.h | 4 ++-- .../renderer_vulkan/vk_texture_cache.cpp | 23 ++++++++++++++---- .../renderer_vulkan/vk_texture_cache.h | 4 ++-- src/video_core/texture_cache/image_base.cpp | 2 +- src/video_core/texture_cache/image_base.h | 6 ++--- src/video_core/texture_cache/texture_cache.h | 22 ++++++++--------- 7 files changed, 57 insertions(+), 28 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 34d3723e5..a6e9eb60b 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -697,7 +697,7 @@ void Image::UploadMemory(const ImageBufferMap& map, std::span copies) { const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { - ScaleDown(); + ScaleDown(true); } glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes); @@ -725,6 +725,10 @@ void Image::UploadMemory(const ImageBufferMap& map, void Image::DownloadMemory(ImageBufferMap& map, std::span copies) { + const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); + if (is_rescaled) { + ScaleDown(); + } glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer); glPixelStorei(GL_PACK_ALIGNMENT, 1); @@ -743,6 +747,9 @@ void Image::DownloadMemory(ImageBufferMap& map, } CopyImageToBuffer(copy, map.offset); } + if (is_rescaled) { + ScaleUp(true); + } } GLuint Image::StorageHandle() noexcept { @@ -979,7 +986,7 @@ bool Image::Scale(bool up_scale) { return true; } -bool Image::ScaleUp() { +bool Image::ScaleUp(bool ignore) { if (True(flags & ImageFlagBits::Rescaled)) { return false; } @@ -997,7 +1004,11 @@ bool Image::ScaleUp() { flags &= ~ImageFlagBits::Rescaled; return false; } - scale_count++; + has_scaled = true; + if (ignore) { + current_texture = upscaled_backup.handle; + return true; + } if (!Scale()) { flags &= ~ImageFlagBits::Rescaled; return false; @@ -1005,7 +1016,7 @@ bool Image::ScaleUp() { return true; } -bool Image::ScaleDown() { +bool Image::ScaleDown(bool ignore) { if (False(flags & ImageFlagBits::Rescaled)) { return false; } @@ -1013,7 +1024,10 @@ bool Image::ScaleDown() { if (!runtime->resolution.active) { return false; } - scale_count++; + if (ignore) { + current_texture = texture.handle; + return true; + } if (!Scale(false)) { flags &= ~ImageFlagBits::Rescaled; return false; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 81aaef3da..eeb5133d5 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -196,9 +196,9 @@ public: return gl_type; } - bool ScaleUp(); + bool ScaleUp(bool ignore = false); - bool ScaleDown(); + bool ScaleDown(bool ignore = false); private: void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 51367c01d..02aac3b98 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1055,7 +1055,7 @@ void Image::UploadMemory(const StagingBufferRef& map, std::spanRequestOutsideRenderPassOperationContext(); std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); @@ -1073,6 +1073,10 @@ void Image::UploadMemory(const StagingBufferRef& map, std::span copies) { + const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); + if (is_rescaled) { + ScaleDown(); + } std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); scheduler->RequestOutsideRenderPassOperationContext(); scheduler->Record([buffer = map.buffer, image = *original_image, aspect_mask = aspect_mask, @@ -1125,9 +1129,12 @@ void Image::DownloadMemory(const StagingBufferRef& map, std::spandevice; const bool is_2d = info.type == ImageType::e2D; const u32 scaled_width = resolution.ScaleUp(info.size.width); @@ -1149,8 +1156,12 @@ bool Image::ScaleUp() { scaled_image = MakeImage(device, scaled_info); auto& allocator = runtime->memory_allocator; scaled_commit = MemoryCommit(allocator.Commit(scaled_image, MemoryUsage::DeviceLocal)); + ignore = false; } current_image = *scaled_image; + if (ignore) { + return true; + } if (aspect_mask == 0) { aspect_mask = ImageAspectMask(info.format); @@ -1212,7 +1223,7 @@ bool Image::ScaleUp() { return true; } -bool Image::ScaleDown() { +bool Image::ScaleDown(bool ignore) { if (False(flags & ImageFlagBits::Rescaled)) { return false; } @@ -1221,6 +1232,10 @@ bool Image::ScaleDown() { if (!resolution.active) { return false; } + if (ignore) { + current_image = *original_image; + return true; + } const auto& device = runtime->device; const bool is_2d = info.type == ImageType::e2D; const u32 scaled_width = resolution.ScaleUp(info.size.width); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index df854a20c..8dbddfaf7 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -129,9 +129,9 @@ public: return std::exchange(initialized, true); } - bool ScaleUp(); + bool ScaleUp(bool ignore = false); - bool ScaleDown(); + bool ScaleDown(bool ignore = false); private: VKScheduler* scheduler{}; diff --git a/src/video_core/texture_cache/image_base.cpp b/src/video_core/texture_cache/image_base.cpp index 3db2ec825..3db2fdf34 100644 --- a/src/video_core/texture_cache/image_base.cpp +++ b/src/video_core/texture_cache/image_base.cpp @@ -61,7 +61,7 @@ ImageBase::ImageBase(const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_ : info{info_}, guest_size_bytes{CalculateGuestSizeInBytes(info)}, unswizzled_size_bytes{CalculateUnswizzledSizeBytes(info)}, converted_size_bytes{CalculateConvertedSizeBytes(info)}, scale_rating{}, scale_tick{}, - scale_count{}, gpu_addr{gpu_addr_}, cpu_addr{cpu_addr_}, + has_scaled{}, gpu_addr{gpu_addr_}, cpu_addr{cpu_addr_}, cpu_addr_end{cpu_addr + guest_size_bytes}, mip_level_offsets{CalculateMipLevelOffsets(info)} { if (info.type == ImageType::e3D) { slice_offsets = CalculateSliceOffsets(info); diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index cd4b5f636..02c669766 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -77,8 +77,8 @@ struct ImageBase { void CheckBadOverlapState(); void CheckAliasState(); - bool HasScaled() { - return scale_count > 0; + bool HasScaled() const { + return has_scaled; } ImageInfo info; @@ -88,7 +88,7 @@ struct ImageBase { u32 converted_size_bytes = 0; u32 scale_rating = 0; u64 scale_tick = 0; - u32 scale_count = 0; + bool has_scaled = false; ImageFlagBits flags = ImageFlagBits::CpuModified; GPUVAddr gpu_addr = 0; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index c885586e8..13914dc8b 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -60,7 +60,7 @@ TextureCache

::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface& // On OpenGL we can be more conservatives as the driver takes care. expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB; critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB; - minimum_memory = expected_memory; + minimum_memory = 0; } } @@ -1464,16 +1464,6 @@ template void TextureCache

::TrackImage(ImageBase& image, ImageId image_id) { ASSERT(False(image.flags & ImageFlagBits::Tracked)); image.flags |= ImageFlagBits::Tracked; - if (image.HasScaled()) { - total_used_memory -= GetScaledImageSizeBytes(image); - } - u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes); - if ((IsPixelFormatASTC(image.info.format) && - True(image.flags & ImageFlagBits::AcceleratedUpload)) || - True(image.flags & ImageFlagBits::Converted)) { - tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format); - } - total_used_memory -= Common::AlignUp(tentative_size, 1024); if (False(image.flags & ImageFlagBits::Sparse)) { rasterizer.UpdatePagesCachedCount(image.cpu_addr, image.guest_size_bytes, 1); return; @@ -1519,6 +1509,16 @@ void TextureCache

::UntrackImage(ImageBase& image, ImageId image_id) { template void TextureCache

::DeleteImage(ImageId image_id) { ImageBase& image = slot_images[image_id]; + if (image.HasScaled()) { + total_used_memory -= GetScaledImageSizeBytes(image); + } + u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes); + if ((IsPixelFormatASTC(image.info.format) && + True(image.flags & ImageFlagBits::AcceleratedUpload)) || + True(image.flags & ImageFlagBits::Converted)) { + tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format); + } + total_used_memory -= Common::AlignUp(tentative_size, 1024); const GPUVAddr gpu_addr = image.gpu_addr; const auto alloc_it = image_allocs_table.find(gpu_addr); if (alloc_it == image_allocs_table.end()) { From f3ff8bdc0e8c6c25c1725b82d6862b93a8df3c84 Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Tue, 19 Oct 2021 17:46:01 +0200 Subject: [PATCH 110/156] TextureCache: Fix blitting filter in Vulkan and correct viewport/scissor calculation when downscaling. --- .../renderer_vulkan/vk_rasterizer.cpp | 24 ++++++++--- .../renderer_vulkan/vk_texture_cache.cpp | 40 ++++++++++++------- 2 files changed, 44 insertions(+), 20 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 1ceffa718..a9334e101 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -60,10 +60,19 @@ struct DrawParams { VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t index, float scale) { const auto& src = regs.viewport_transform[index]; - const float x = (src.translate_x - src.scale_x) * scale; - const float width = src.scale_x * 2.0f * scale; - float y = (src.translate_y - src.scale_y) * scale; - float height = src.scale_y * 2.0f * scale; + const auto conv = [scale](float value) { + float new_value = value * scale; + if (scale < 1.0f) { + bool sign = std::signbit(new_value); + new_value = std::round(std::abs(new_value)); + new_value = sign ? -new_value : new_value; + } + return new_value; + }; + const float x = conv(src.translate_x - src.scale_x); + const float width = conv(src.scale_x * 2.0f); + float y = conv(src.translate_y - src.scale_y); + float height = conv(src.scale_y * 2.0f); if (regs.screen_y_control.y_negate) { y += height; height = -height; @@ -91,8 +100,13 @@ VkRect2D GetScissorState(const Maxwell& regs, size_t index, u32 up_scale = 1, u3 if (value == 0) { return 0U; } + const u32 upset = value * up_scale; + u32 acumm = 0; + if ((up_scale >> down_shift) == 0) { + acumm = upset & 0x1; + } const u32 converted_value = (value * up_scale) >> down_shift; - return std::max(converted_value, 1U); + return std::max(converted_value + acumm, 1U); }; if (src.enable) { scissor.offset.x = static_cast(scale_up(src.min_x)); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 02aac3b98..84ec803ba 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -593,7 +593,7 @@ struct RangedBarrierRange { void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, const ImageInfo& info, VkImageAspectFlags aspect_mask, const Settings::ResolutionScalingInfo& resolution, - bool up_scaling = true) { + bool is_bilinear, bool up_scaling = true) { const bool is_2d = info.type == ImageType::e2D; const auto resources = info.resources; const VkExtent2D extent{ @@ -602,7 +602,7 @@ void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, con }; const bool is_zeta = (aspect_mask & VK_IMAGE_ASPECT_DEPTH_BIT) != 0; const bool is_int_format = IsPixelFormatInteger(info.format); - const VkFilter vk_filter = (is_zeta || is_int_format) ? VK_FILTER_NEAREST : VK_FILTER_LINEAR; + const VkFilter vk_filter = is_bilinear ? VK_FILTER_LINEAR : VK_FILTER_NEAREST; scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([dst_image, src_image, extent, resources, aspect_mask, resolution, is_2d, @@ -1160,7 +1160,7 @@ bool Image::ScaleUp(bool ignore) { } current_image = *scaled_image; if (ignore) { - return true; + return true; } if (aspect_mask == 0) { @@ -1170,11 +1170,18 @@ bool Image::ScaleUp(bool ignore) { const PixelFormat format = StorageFormat(info.format); const auto vk_format = MaxwellToVK::SurfaceFormat(device, OPTIMAL_FORMAT, false, format).format; const auto blit_usage = VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; + const bool is_color{aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT}; + const bool is_bilinear{is_color && !IsPixelFormatInteger(info.format)}; if (device.IsFormatSupported(vk_format, blit_usage, OPTIMAL_FORMAT)) { - BlitScale(*scheduler, *original_image, *scaled_image, info, aspect_mask, resolution); + BlitScale(*scheduler, *original_image, *scaled_image, info, aspect_mask, resolution, + device.IsFormatSupported(vk_format, + VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT, + OPTIMAL_FORMAT)); } else { using namespace VideoCommon; static constexpr auto BLIT_OPERATION = Tegra::Engines::Fermi2D::Operation::SrcCopy; + const auto operation = is_bilinear ? Tegra::Engines::Fermi2D::Filter::Bilinear + : Tegra::Engines::Fermi2D::Filter::Point; if (!scale_view) { const auto view_info = ImageViewInfo(ImageViewType::e2D, info.format); @@ -1201,9 +1208,8 @@ bool Image::ScaleUp(bool ignore) { } const auto color_view = scale_view->Handle(Shader::TextureType::Color2D); - runtime->blit_image_helper.BlitColor( - scale_framebuffer.get(), color_view, dst_region, src_region, - Tegra::Engines::Fermi2D::Filter::Bilinear, BLIT_OPERATION); + runtime->blit_image_helper.BlitColor(scale_framebuffer.get(), color_view, dst_region, + src_region, operation, BLIT_OPERATION); } else if (!runtime->device.IsBlitDepthStencilSupported() && aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { if (!scale_framebuffer) { @@ -1212,7 +1218,7 @@ bool Image::ScaleUp(bool ignore) { } runtime->blit_image_helper.BlitDepthStencil( scale_framebuffer.get(), scale_view->DepthView(), scale_view->StencilView(), - dst_region, src_region, Tegra::Engines::Fermi2D::Filter::Point, BLIT_OPERATION); + dst_region, src_region, operation, BLIT_OPERATION); } else { // TODO: Use helper blits where applicable flags &= ~ImageFlagBits::Rescaled; @@ -1233,8 +1239,8 @@ bool Image::ScaleDown(bool ignore) { return false; } if (ignore) { - current_image = *original_image; - return true; + current_image = *original_image; + return true; } const auto& device = runtime->device; const bool is_2d = info.type == ImageType::e2D; @@ -1247,11 +1253,16 @@ bool Image::ScaleDown(bool ignore) { const PixelFormat format = StorageFormat(info.format); const auto vk_format = MaxwellToVK::SurfaceFormat(device, OPTIMAL_FORMAT, false, format).format; const auto blit_usage = VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; + const bool is_color{aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT}; + const bool is_bilinear{is_color && !IsPixelFormatInteger(info.format)}; if (device.IsFormatSupported(vk_format, blit_usage, OPTIMAL_FORMAT)) { - BlitScale(*scheduler, *scaled_image, *original_image, info, aspect_mask, resolution, false); + BlitScale(*scheduler, *scaled_image, *original_image, info, aspect_mask, resolution, + is_bilinear, false); } else { using namespace VideoCommon; static constexpr auto BLIT_OPERATION = Tegra::Engines::Fermi2D::Operation::SrcCopy; + const auto operation = is_bilinear ? Tegra::Engines::Fermi2D::Filter::Bilinear + : Tegra::Engines::Fermi2D::Filter::Point; if (!normal_view) { const auto view_info = ImageViewInfo(ImageViewType::e2D, info.format); @@ -1278,9 +1289,8 @@ bool Image::ScaleDown(bool ignore) { } const auto color_view = normal_view->Handle(Shader::TextureType::Color2D); - runtime->blit_image_helper.BlitColor( - normal_framebuffer.get(), color_view, dst_region, src_region, - Tegra::Engines::Fermi2D::Filter::Bilinear, BLIT_OPERATION); + runtime->blit_image_helper.BlitColor(normal_framebuffer.get(), color_view, dst_region, + src_region, operation, BLIT_OPERATION); } else if (!runtime->device.IsBlitDepthStencilSupported() && aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { if (!normal_framebuffer) { @@ -1289,7 +1299,7 @@ bool Image::ScaleDown(bool ignore) { } runtime->blit_image_helper.BlitDepthStencil( normal_framebuffer.get(), normal_view->DepthView(), normal_view->StencilView(), - dst_region, src_region, Tegra::Engines::Fermi2D::Filter::Point, BLIT_OPERATION); + dst_region, src_region, operation, BLIT_OPERATION); } else { // TODO: Use helper blits where applicable flags &= ~ImageFlagBits::Rescaled; From 150bc45401c7c6e5cbcf936371269f0c1d2a0e83 Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Tue, 19 Oct 2021 18:32:08 +0200 Subject: [PATCH 111/156] Texture cache: fix Intel with rescaler. --- src/video_core/renderer_vulkan/vk_texture_cache.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 84ec803ba..7c8732ec1 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1214,7 +1214,7 @@ bool Image::ScaleUp(bool ignore) { aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { if (!scale_framebuffer) { scale_framebuffer = - std::make_unique(*runtime, view_ptr, nullptr, extent); + std::make_unique(*runtime, nullptr, view_ptr, extent); } runtime->blit_image_helper.BlitDepthStencil( scale_framebuffer.get(), scale_view->DepthView(), scale_view->StencilView(), @@ -1295,7 +1295,7 @@ bool Image::ScaleDown(bool ignore) { aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { if (!normal_framebuffer) { normal_framebuffer = - std::make_unique(*runtime, view_ptr, nullptr, extent); + std::make_unique(*runtime, nullptr, view_ptr, extent); } runtime->blit_image_helper.BlitDepthStencil( normal_framebuffer.get(), normal_view->DepthView(), normal_view->StencilView(), From 826a350e2b6aadb4f123189c28f065b0e7926264 Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Tue, 19 Oct 2021 19:41:57 +0200 Subject: [PATCH 112/156] Vulkan Rasterizer: Fix clears on integer textures. --- .../renderer_vulkan/vk_rasterizer.cpp | 34 +++++++++++++- src/video_core/surface.cpp | 47 +++++++++++++++++++ src/video_core/surface.h | 4 ++ 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index a9334e101..ff75d14a1 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -211,6 +211,8 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { EndTransformFeedback(); } +#pragma optimize("", off) + void RasterizerVulkan::Clear() { MICROPROFILE_SCOPE(Vulkan_Clearing); @@ -260,7 +262,37 @@ void RasterizerVulkan::Clear() { const u32 color_attachment = regs.clear_buffers.RT; if (use_color && framebuffer->HasAspectColorBit(color_attachment)) { VkClearValue clear_value; - std::memcpy(clear_value.color.float32, regs.clear_color, sizeof(regs.clear_color)); + bool is_integer = false; + bool is_signed = false; + size_t int_size = 8; + for (std::size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; ++i) { + const auto& this_rt = regs.rt[i]; + if (this_rt.Address() == 0) { + continue; + } + if (this_rt.format == Tegra::RenderTargetFormat::NONE) { + continue; + } + const auto format = + VideoCore::Surface::PixelFormatFromRenderTargetFormat(this_rt.format); + is_integer = IsPixelFormatInteger(format); + is_signed = IsPixelFormatSignedInteger(format); + int_size = PixelComponentSizeBitsInteger(format); + break; + } + if (!is_integer) { + std::memcpy(clear_value.color.float32, regs.clear_color, sizeof(regs.clear_color)); + } else if (!is_signed) { + for (size_t i = 0; i < 4; i++) { + clear_value.color.uint32[i] = + static_cast(static_cast(int_size << 1U) * regs.clear_color[i]); + } + } else { + for (size_t i = 0; i < 4; i++) { + clear_value.color.int32[i] = static_cast( + (static_cast(int_size - 1) << 1) * (regs.clear_color[i] - 0.5f)); + } + } scheduler.Record([color_attachment, clear_value, clear_rect](vk::CommandBuffer cmdbuf) { const VkClearAttachment attachment{ diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 64941a486..58d262446 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -306,6 +306,53 @@ bool IsPixelFormatInteger(PixelFormat format) { } } +bool IsPixelFormatSignedInteger(PixelFormat format) { + switch (format) { + case PixelFormat::A8B8G8R8_SINT: + case PixelFormat::R8_SINT: + case PixelFormat::R16G16B16A16_SINT: + case PixelFormat::R32G32B32A32_SINT: + case PixelFormat::R32G32_SINT: + case PixelFormat::R16_SINT: + case PixelFormat::R16G16_SINT: + case PixelFormat::R8G8_SINT: + case PixelFormat::R32_SINT: + return true; + default: + return false; + } +} + +size_t PixelComponentSizeBitsInteger(PixelFormat format) { + switch (format) { + case PixelFormat::A8B8G8R8_SINT: + case PixelFormat::A8B8G8R8_UINT: + case PixelFormat::R8_SINT: + case PixelFormat::R8_UINT: + case PixelFormat::R8G8_SINT: + case PixelFormat::R8G8_UINT: + return 8; + case PixelFormat::A2B10G10R10_UINT: + return 10; + case PixelFormat::R16G16B16A16_SINT: + case PixelFormat::R16G16B16A16_UINT: + case PixelFormat::R16_UINT: + case PixelFormat::R16_SINT: + case PixelFormat::R16G16_UINT: + case PixelFormat::R16G16_SINT: + return 16; + case PixelFormat::R32G32B32A32_UINT: + case PixelFormat::R32G32B32A32_SINT: + case PixelFormat::R32G32_SINT: + case PixelFormat::R32G32_UINT: + case PixelFormat::R32_UINT: + case PixelFormat::R32_SINT: + return 32; + default: + return 0; + } +} + std::pair GetASTCBlockSize(PixelFormat format) { return {DefaultBlockWidth(format), DefaultBlockHeight(format)}; } diff --git a/src/video_core/surface.h b/src/video_core/surface.h index 3bb24abb7..2ce7c7d33 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -462,6 +462,10 @@ bool IsPixelFormatSRGB(PixelFormat format); bool IsPixelFormatInteger(PixelFormat format); +bool IsPixelFormatSignedInteger(PixelFormat format); + +size_t PixelComponentSizeBitsInteger(PixelFormat format); + std::pair GetASTCBlockSize(PixelFormat format); u64 EstimatedDecompressedSize(u64 base_size, PixelFormat format); From 4ad22c7d2b9d8fdfffc380f0b52f4ba943599bef Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Wed, 20 Oct 2021 00:33:03 +0200 Subject: [PATCH 113/156] Video Core: fix building for GCC. --- src/common/math_util.h | 4 +- .../renderer_vulkan/pipeline_helper.h | 2 +- src/video_core/renderer_vulkan/vk_fsr.cpp | 49 +++++++++++++------ .../renderer_vulkan/vk_rasterizer.cpp | 11 ++--- .../renderer_vulkan/vk_texture_cache.cpp | 2 - 5 files changed, 43 insertions(+), 25 deletions(-) diff --git a/src/common/math_util.h b/src/common/math_util.h index 4c38d8040..510c4e56d 100644 --- a/src/common/math_util.h +++ b/src/common/math_util.h @@ -48,8 +48,8 @@ struct Rectangle { } [[nodiscard]] Rectangle Scale(const float s) const { - return Rectangle{left, top, static_cast(left + GetWidth() * s), - static_cast(top + GetHeight() * s)}; + return Rectangle{left, top, static_cast(static_cast(left + GetWidth()) * s), + static_cast(static_cast(top + GetHeight()) * s)}; } }; diff --git a/src/video_core/renderer_vulkan/pipeline_helper.h b/src/video_core/renderer_vulkan/pipeline_helper.h index ae5e66ef4..11c160570 100644 --- a/src/video_core/renderer_vulkan/pipeline_helper.h +++ b/src/video_core/renderer_vulkan/pipeline_helper.h @@ -77,7 +77,7 @@ public: .stageFlags = static_cast( is_compute ? VK_SHADER_STAGE_COMPUTE_BIT : VK_SHADER_STAGE_ALL_GRAPHICS), .offset = 0, - .size = sizeof(RescalingLayout) - size_offset, + .size = static_cast(sizeof(RescalingLayout)) - size_offset, }; return device->GetLogical().CreatePipelineLayout({ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, diff --git a/src/video_core/renderer_vulkan/vk_fsr.cpp b/src/video_core/renderer_vulkan/vk_fsr.cpp index fd0a4aa42..1f60974be 100644 --- a/src/video_core/renderer_vulkan/vk_fsr.cpp +++ b/src/video_core/renderer_vulkan/vk_fsr.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include "common/common_types.h" #include "common/div_ceil.h" #include "video_core/host_shaders/vulkan_fidelityfx_fsr_easu_comp_spv.h" #include "video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_comp_spv.h" @@ -12,10 +13,10 @@ namespace Vulkan { -FSR::FSR(const Device& device, MemoryAllocator& memory_allocator, size_t image_count, - VkExtent2D output_size) - : device{device}, memory_allocator{memory_allocator}, image_count{image_count}, - output_size{output_size} { +FSR::FSR(const Device& device_, MemoryAllocator& memory_allocator_, size_t image_count_, + VkExtent2D output_size_) + : device{device_}, memory_allocator{memory_allocator_}, image_count{image_count_}, + output_size{output_size_} { CreateImages(); CreateSampler(); @@ -266,14 +267,17 @@ void FSR::UpdateDescriptorSet(std::size_t image_index, VkImageView image_view) c const auto blit_image_view = *image_views[image_count + image_index]; const VkDescriptorImageInfo image_info{ + .sampler = VK_NULL_HANDLE, .imageView = image_view, .imageLayout = VK_IMAGE_LAYOUT_GENERAL, }; const VkDescriptorImageInfo fsr_image_info{ + .sampler = VK_NULL_HANDLE, .imageView = fsr_image_view, .imageLayout = VK_IMAGE_LAYOUT_GENERAL, }; const VkDescriptorImageInfo blit_image_info{ + .sampler = VK_NULL_HANDLE, .imageView = blit_image_view, .imageLayout = VK_IMAGE_LAYOUT_GENERAL, }; @@ -341,35 +345,52 @@ void FSR::CreateSampler() { void FSR::CreateShaders() { easu_shader = BuildShader(device, VULKAN_FIDELITYFX_FSR_EASU_COMP_SPV); - rcas_shader = BuildShader(device, VULKAN_FIDELITYFX_FSR_EASU_COMP_SPV); + rcas_shader = BuildShader(device, VULKAN_FIDELITYFX_FSR_RCAS_COMP_SPV); } void FSR::CreatePipeline() { - VkPipelineShaderStageCreateInfo shader_stage{ - + VkPipelineShaderStageCreateInfo shader_stage_easu{ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, .pNext = nullptr, .flags = 0, .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = *easu_shader, .pName = "main", .pSpecializationInfo = nullptr, }; - VkComputePipelineCreateInfo pipeline_ci{ + VkPipelineShaderStageCreateInfo shader_stage_rcas{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = *rcas_shader, + .pName = "main", + .pSpecializationInfo = nullptr, + }; + + VkComputePipelineCreateInfo pipeline_ci_easu{ .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, .pNext = nullptr, .flags = 0, + .stage = shader_stage_easu, .layout = *pipeline_layout, + .basePipelineHandle = VK_NULL_HANDLE, .basePipelineIndex = 0, }; - shader_stage.module = *easu_shader; - pipeline_ci.stage = shader_stage; - easu_pipeline = device.GetLogical().CreateComputePipeline(pipeline_ci); + VkComputePipelineCreateInfo pipeline_ci_rcas{ + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = shader_stage_rcas, + .layout = *pipeline_layout, + .basePipelineHandle = VK_NULL_HANDLE, + .basePipelineIndex = 0, + }; - shader_stage.module = *rcas_shader; - pipeline_ci.stage = shader_stage; - rcas_pipeline = device.GetLogical().CreateComputePipeline(pipeline_ci); + easu_pipeline = device.GetLogical().CreateComputePipeline(pipeline_ci_easu); + rcas_pipeline = device.GetLogical().CreateComputePipeline(pipeline_ci_rcas); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index ff75d14a1..5ca67c413 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -211,8 +211,6 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { EndTransformFeedback(); } -#pragma optimize("", off) - void RasterizerVulkan::Clear() { MICROPROFILE_SCOPE(Vulkan_Clearing); @@ -284,13 +282,14 @@ void RasterizerVulkan::Clear() { std::memcpy(clear_value.color.float32, regs.clear_color, sizeof(regs.clear_color)); } else if (!is_signed) { for (size_t i = 0; i < 4; i++) { - clear_value.color.uint32[i] = - static_cast(static_cast(int_size << 1U) * regs.clear_color[i]); + clear_value.color.uint32[i] = static_cast( + static_cast(static_cast(int_size) << 1U) * regs.clear_color[i]); } } else { for (size_t i = 0; i < 4; i++) { - clear_value.color.int32[i] = static_cast( - (static_cast(int_size - 1) << 1) * (regs.clear_color[i] - 0.5f)); + clear_value.color.int32[i] = + static_cast(static_cast(static_cast(int_size - 1) << 1) * + (regs.clear_color[i] - 0.5f)); } } diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 7c8732ec1..413d472cd 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -600,8 +600,6 @@ void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, con .width = info.size.width, .height = info.size.height, }; - const bool is_zeta = (aspect_mask & VK_IMAGE_ASPECT_DEPTH_BIT) != 0; - const bool is_int_format = IsPixelFormatInteger(info.format); const VkFilter vk_filter = is_bilinear ? VK_FILTER_LINEAR : VK_FILTER_NEAREST; scheduler.RequestOutsideRenderPassOperationContext(); From 7506ac4118c6065b2557fe3cb94bab52c099ca37 Mon Sep 17 00:00:00 2001 From: Marshall Mohror Date: Tue, 19 Oct 2021 21:20:28 -0500 Subject: [PATCH 114/156] Presentation: Fix turning FSR on and off in settings --- src/video_core/renderer_vulkan/vk_blit_screen.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 334eeb92e..ccf721008 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -362,6 +362,17 @@ void VKBlitScreen::CreateDynamicResources() { } void VKBlitScreen::RefreshResources(const Tegra::FramebufferConfig& framebuffer) { + if (Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::Fsr) { + if (!fsr) { + const auto& layout = render_window.GetFramebufferLayout(); + fsr = std::make_unique( + device, memory_allocator, image_count, + VkExtent2D{.width = layout.screen.GetWidth(), .height = layout.screen.GetHeight()}); + } + } else { + fsr.reset(); + } + if (framebuffer.width == raw_width && framebuffer.height == raw_height && !raw_images.empty()) { return; } From d37d10e7a7b9037a259b27923716e5ce3084d6c3 Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Wed, 20 Oct 2021 18:27:25 +0200 Subject: [PATCH 115/156] TextureCache: fix rescaling in aliases and overlap joins. --- .../renderer_vulkan/vk_rasterizer.cpp | 19 +++++---- src/video_core/texture_cache/texture_cache.h | 41 ++++++++++++++----- src/video_core/texture_cache/util.cpp | 8 +++- src/video_core/texture_cache/util.h | 3 +- 4 files changed, 48 insertions(+), 23 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 5ca67c413..fd334a146 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -63,7 +63,7 @@ VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t in const auto conv = [scale](float value) { float new_value = value * scale; if (scale < 1.0f) { - bool sign = std::signbit(new_value); + const bool sign = std::signbit(value); new_value = std::round(std::abs(new_value)); new_value = sign ? -new_value : new_value; } @@ -96,21 +96,22 @@ VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t in VkRect2D GetScissorState(const Maxwell& regs, size_t index, u32 up_scale = 1, u32 down_shift = 0) { const auto& src = regs.scissor_test[index]; VkRect2D scissor; - const auto scale_up = [&](u32 value) -> u32 { + const auto scale_up = [&](s32 value) -> s32 { if (value == 0) { return 0U; } - const u32 upset = value * up_scale; - u32 acumm = 0; + const s32 upset = value * up_scale; + s32 acumm = 0; if ((up_scale >> down_shift) == 0) { - acumm = upset & 0x1; + acumm = upset % 2; } - const u32 converted_value = (value * up_scale) >> down_shift; - return std::max(converted_value + acumm, 1U); + const s32 converted_value = (value * up_scale) >> down_shift; + return value < 0 ? std::min(converted_value - acumm, -1) + : std::max(converted_value + acumm, 1); }; if (src.enable) { - scissor.offset.x = static_cast(scale_up(src.min_x)); - scissor.offset.y = static_cast(scale_up(src.min_y)); + scissor.offset.x = scale_up(static_cast(src.min_x)); + scissor.offset.y = scale_up(static_cast(src.min_y)); scissor.extent.width = scale_up(src.max_x - src.min_x); scissor.extent.height = scale_up(src.max_y - src.min_y); } else { diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 13914dc8b..a32c11d04 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -1037,8 +1037,11 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA if (overlap.info.num_samples != new_image.info.num_samples) { LOG_WARNING(HW_GPU, "Copying between images with different samples is not implemented"); } else { + const auto& resolution = Settings::values.resolution_info; const SubresourceBase base = new_image.TryFindBase(overlap.gpu_addr).value(); - auto copies = MakeShrinkImageCopies(new_info, overlap.info, base); + const u32 up_scale = can_rescale ? resolution.up_scale : 1; + const u32 down_shift = can_rescale ? resolution.down_shift : 0; + auto copies = MakeShrinkImageCopies(new_info, overlap.info, base, up_scale, down_shift); runtime.CopyImage(new_image, overlap, std::move(copies)); } if (True(overlap.flags & ImageFlagBits::Tracked)) { @@ -1659,19 +1662,35 @@ void TextureCache

::SynchronizeAliases(ImageId image_id) { const ImageBase& rhs_image = slot_images[rhs->id]; return lhs_image.modification_tick < rhs_image.modification_tick; }); + const auto& resolution = Settings::values.resolution_info; for (const AliasedImage* const aliased : aliased_images) { - if (any_rescaled) { - Image& aliased_image = slot_images[aliased->id]; - if (can_rescale) { - ScaleUp(aliased_image); - } else { - ScaleDown(aliased_image); - if (any_blacklisted) { - aliased_image.flags |= ImageFlagBits::Blacklisted; - } + if (!resolution.active | !any_rescaled) { + CopyImage(image_id, aliased->id, aliased->copies); + continue; + } + Image& aliased_image = slot_images[aliased->id]; + if (!can_rescale) { + ScaleDown(aliased_image); + if (any_blacklisted) { + aliased_image.flags |= ImageFlagBits::Blacklisted; + } + CopyImage(image_id, aliased->id, aliased->copies); + continue; + } + ScaleUp(aliased_image); + + const bool both_2d{image.info.type == ImageType::e2D && + aliased_image.info.type == ImageType::e2D}; + auto copies = aliased->copies; + for (auto copy : copies) { + copy.extent.width = std::max( + (copy.extent.width * resolution.up_scale) >> resolution.down_shift, 1); + if (both_2d) { + copy.extent.height = std::max( + (copy.extent.height * resolution.up_scale) >> resolution.down_shift, 1); } } - CopyImage(image_id, aliased->id, aliased->copies); + CopyImage(image_id, aliased->id, copies); } } diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index 59cf2f561..9922aa0cc 100644 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp @@ -723,7 +723,7 @@ ImageViewType RenderTargetImageViewType(const ImageInfo& info) noexcept { } std::vector MakeShrinkImageCopies(const ImageInfo& dst, const ImageInfo& src, - SubresourceBase base) { + SubresourceBase base, u32 up_scale, u32 down_shift) { ASSERT(dst.resources.levels >= src.resources.levels); ASSERT(dst.num_samples == src.num_samples); @@ -732,7 +732,7 @@ std::vector MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn ASSERT(src.type == ImageType::e3D); ASSERT(src.resources.levels == 1); } - + const bool both_2d{src.type == ImageType::e2D && dst.type == ImageType::e2D}; std::vector copies; copies.reserve(src.resources.levels); for (s32 level = 0; level < src.resources.levels; ++level) { @@ -762,6 +762,10 @@ std::vector MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn if (is_dst_3d) { copy.extent.depth = src.size.depth; } + copy.extent.width = std::max((copy.extent.width * up_scale) >> down_shift, 1); + if (both_2d) { + copy.extent.height = std::max((copy.extent.height * up_scale) >> down_shift, 1); + } } return copies; } diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h index 766502908..7af52de2e 100644 --- a/src/video_core/texture_cache/util.h +++ b/src/video_core/texture_cache/util.h @@ -55,7 +55,8 @@ struct OverlapResult { [[nodiscard]] std::vector MakeShrinkImageCopies(const ImageInfo& dst, const ImageInfo& src, - SubresourceBase base); + SubresourceBase base, u32 up_scale = 1, + u32 down_shift = 0); [[nodiscard]] bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config); From e7fc60406ef5309268d77edb5e5266febe147e53 Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Wed, 20 Oct 2021 18:56:34 +0200 Subject: [PATCH 116/156] VideoCore: Add more rescaling option. --- src/common/settings.cpp | 13 ++++++++++++ src/common/settings.h | 11 ++++++---- src/yuzu/configuration/configure_graphics.ui | 21 +++++++++++++++++--- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/src/common/settings.cpp b/src/common/settings.cpp index bc2c8c7d7..84ac937e5 100644 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp @@ -116,6 +116,11 @@ void UpdateRescalingInfo() { info.down_shift = 1; info.downscale = true; break; + case ResolutionSetup::Res3_4X: + info.up_scale = 3; + info.down_shift = 2; + info.downscale = true; + break; case ResolutionSetup::Res1X: info.up_scale = 1; info.down_shift = 0; @@ -132,6 +137,14 @@ void UpdateRescalingInfo() { info.up_scale = 4; info.down_shift = 0; break; + case ResolutionSetup::Res5X: + info.up_scale = 5; + info.down_shift = 0; + break; + case ResolutionSetup::Res6X: + info.up_scale = 6; + info.down_shift = 0; + break; default: UNREACHABLE(); info.up_scale = 1; diff --git a/src/common/settings.h b/src/common/settings.h index 84dab5217..f6acf5bdf 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -54,10 +54,13 @@ enum class NvdecEmulation : u32 { enum class ResolutionSetup : u32 { Res1_2X = 0, - Res1X = 1, - Res2X = 2, - Res3X = 3, - Res4X = 4, + Res3_4X = 1, + Res1X = 2, + Res2X = 3, + Res3X = 4, + Res4X = 5, + Res5X = 6, + Res6X = 7, }; enum class ScalingFilter : u32 { diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui index fe2f6bb7f..c1d7e8349 100644 --- a/src/yuzu/configuration/configure_graphics.ui +++ b/src/yuzu/configuration/configure_graphics.ui @@ -335,7 +335,12 @@ - 0.5X (360p/540p) + 0.5X (360p/540p)[MAY BREAK] + + + + + 0.75X (540p/810p)[MAY BREAK] @@ -350,12 +355,22 @@ - 3X (2160p[4K]/3240p[6K]) + 3X (2160p[4K]/3240p) - 4X (2880p/4320p[8K]) + 4X (2880p/4320p) + + + + + 5X (3600p/5400p) + + + + + 6X (4320p/6480p) From 916b882ea8870e695d50e8ca8c8e4c35fb1895d5 Mon Sep 17 00:00:00 2001 From: Marshall Mohror Date: Wed, 20 Oct 2021 13:35:59 -0500 Subject: [PATCH 117/156] Update scaleforce to use FP16 --- .../host_shaders/present_scaleforce.frag | 159 +++++++----------- 1 file changed, 63 insertions(+), 96 deletions(-) diff --git a/src/video_core/host_shaders/present_scaleforce.frag b/src/video_core/host_shaders/present_scaleforce.frag index 1829a9be8..ebc0d9b90 100644 --- a/src/video_core/host_shaders/present_scaleforce.frag +++ b/src/video_core/host_shaders/present_scaleforce.frag @@ -24,6 +24,9 @@ #version 460 +#extension GL_AMD_gpu_shader_half_float : enable +#extension GL_NV_gpu_shader5 : enable + #ifdef VULKAN #define BINDING_COLOR_TEXTURE 1 @@ -40,106 +43,70 @@ layout (location = 0) out vec4 frag_color; layout (binding = BINDING_COLOR_TEXTURE) uniform sampler2D input_texture; -vec2 tex_size; -vec2 inv_tex_size; +const bool ignore_alpha = true; -vec4 cubic(float v) { - vec3 n = vec3(1.0, 2.0, 3.0) - v; - vec3 s = n * n * n; - float x = s.x; - float y = s.y - 4.0 * s.x; - float z = s.z - 4.0 * s.y + 6.0 * s.x; - float w = 6.0 - x - y - z; - return vec4(x, y, z, w) / 6.0; -} - -// Bicubic interpolation -vec4 textureBicubic(vec2 tex_coords) { - tex_coords = tex_coords * tex_size - 0.5; - - vec2 fxy = modf(tex_coords, tex_coords); - - vec4 xcubic = cubic(fxy.x); - vec4 ycubic = cubic(fxy.y); - - vec4 c = tex_coords.xxyy + vec2(-0.5, +1.5).xyxy; - - vec4 s = vec4(xcubic.xz + xcubic.yw, ycubic.xz + ycubic.yw); - vec4 offset = c + vec4(xcubic.yw, ycubic.yw) / s; - - offset *= inv_tex_size.xxyy; - - vec4 sample0 = textureLod(input_texture, offset.xz, 0.0); - vec4 sample1 = textureLod(input_texture, offset.yz, 0.0); - vec4 sample2 = textureLod(input_texture, offset.xw, 0.0); - vec4 sample3 = textureLod(input_texture, offset.yw, 0.0); - - float sx = s.x / (s.x + s.y); - float sy = s.z / (s.z + s.w); - - return mix(mix(sample3, sample2, sx), mix(sample1, sample0, sx), sy); -} - -mat4x3 center_matrix; -vec4 center_alpha; - -// Finds the distance between four colors and cc in YCbCr space -vec4 ColorDist(vec4 A, vec4 B, vec4 C, vec4 D) { +float16_t ColorDist1(f16vec4 a, f16vec4 b) { // https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.2020_conversion - const vec3 K = vec3(0.2627, 0.6780, 0.0593); - const float LUMINANCE_WEIGHT = .6; - const mat3 YCBCR_MATRIX = - mat3(K * LUMINANCE_WEIGHT, -.5 * K.r / (1.0 - K.b), -.5 * K.g / (1.0 - K.b), .5, .5, - -.5 * K.g / (1.0 - K.r), -.5 * K.b / (1.0 - K.r)); + const f16vec3 K = f16vec3(0.2627, 0.6780, 0.0593); + const float16_t scaleB = float16_t(0.5) / (float16_t(1.0) - K.b); + const float16_t scaleR = float16_t(0.5) / (float16_t(1.0) - K.r); + f16vec4 diff = a - b; + float16_t Y = dot(diff.rgb, K); + float16_t Cb = scaleB * (diff.b - Y); + float16_t Cr = scaleR * (diff.r - Y); + f16vec3 YCbCr = f16vec3(Y, Cb, Cr); + float16_t d = length(YCbCr); + if (ignore_alpha) { + return d; + } + return sqrt(a.a * b.a * d * d + diff.a * diff.a); +} - mat4x3 colors = mat4x3(A.rgb, B.rgb, C.rgb, D.rgb) - center_matrix; - mat4x3 YCbCr = YCBCR_MATRIX * colors; - vec4 color_dist = vec3(1.0) * YCbCr; - color_dist *= color_dist; - vec4 alpha = vec4(A.a, B.a, C.a, D.a); +f16vec4 ColorDist(f16vec4 ref, f16vec4 A, f16vec4 B, f16vec4 C, f16vec4 D) { + return f16vec4( + ColorDist1(ref, A), + ColorDist1(ref, B), + ColorDist1(ref, C), + ColorDist1(ref, D) + ); +} - return sqrt((color_dist + abs(center_alpha - alpha)) * alpha * center_alpha); +vec4 Scaleforce(sampler2D tex, vec2 tex_coord) { + f16vec4 bl = f16vec4(textureOffset(tex, tex_coord, ivec2(-1, -1))); + f16vec4 bc = f16vec4(textureOffset(tex, tex_coord, ivec2(0, -1))); + f16vec4 br = f16vec4(textureOffset(tex, tex_coord, ivec2(1, -1))); + f16vec4 cl = f16vec4(textureOffset(tex, tex_coord, ivec2(-1, 0))); + f16vec4 cc = f16vec4(texture(tex, tex_coord)); + f16vec4 cr = f16vec4(textureOffset(tex, tex_coord, ivec2(1, 0))); + f16vec4 tl = f16vec4(textureOffset(tex, tex_coord, ivec2(-1, 1))); + f16vec4 tc = f16vec4(textureOffset(tex, tex_coord, ivec2(0, 1))); + f16vec4 tr = f16vec4(textureOffset(tex, tex_coord, ivec2(1, 1))); + + f16vec4 offset_tl = ColorDist(cc, tl, tc, tr, cr); + f16vec4 offset_br = ColorDist(cc, br, bc, bl, cl); + + // Calculate how different cc is from the texels around it + const float16_t plus_weight = float16_t(1.5); + const float16_t cross_weight = float16_t(1.5); + float16_t total_dist = dot(offset_tl + offset_br, f16vec4(cross_weight, plus_weight, cross_weight, plus_weight)); + + if (total_dist == float16_t(0.0)) { + return cc; + } else { + // Add together all the distances with direction taken into account + f16vec4 tmp = offset_tl - offset_br; + f16vec2 total_offset = tmp.wy * plus_weight + (tmp.zz + f16vec2(-tmp.x, tmp.x)) * cross_weight; + + // When the image has thin points, they tend to split apart. + // This is because the texels all around are different and total_offset reaches into clear areas. + // This works pretty well to keep the offset in bounds for these cases. + float16_t clamp_val = length(total_offset) / total_dist; + f16vec2 final_offset = clamp(total_offset, -clamp_val, clamp_val) / f16vec2(textureSize(tex, 0)); + + return texture(tex, tex_coord - final_offset); + } } void main() { - vec4 bl = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(-1, -1)); - vec4 bc = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(0, -1)); - vec4 br = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(1, -1)); - vec4 cl = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(-1, 0)); - vec4 cc = textureLod(input_texture, tex_coord, 0.0); - vec4 cr = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(1, 0)); - vec4 tl = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(-1, 1)); - vec4 tc = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(0, 1)); - vec4 tr = textureLodOffset(input_texture, tex_coord, 0.0, ivec2(1, 1)); - - - tex_size = vec2(textureSize(input_texture, 0)); - inv_tex_size = 1.0 / tex_size; - center_matrix = mat4x3(cc.rgb, cc.rgb, cc.rgb, cc.rgb); - center_alpha = cc.aaaa; - - vec4 offset_tl = ColorDist(tl, tc, tr, cr); - vec4 offset_br = ColorDist(br, bc, bl, cl); - - // Calculate how different cc is from the texels around it - float total_dist = dot(offset_tl + offset_br, vec4(1.0)); - - // Add together all the distances with direction taken into account - vec4 tmp = offset_tl - offset_br; - vec2 total_offset = tmp.wy + tmp.zz + vec2(-tmp.x, tmp.x); - - if (total_dist == 0.0) { - // Doing bicubic filtering just past the edges where the offset is 0 causes black floaters - // and it doesn't really matter which filter is used when the colors aren't changing. - frag_color = vec4(cc.rgb, 1.0f); - } else { - // When the image has thin points, they tend to split apart. - // This is because the texels all around are different - // and total_offset reaches into clear areas. - // This works pretty well to keep the offset in bounds for these cases. - float clamp_val = length(total_offset) / total_dist; - vec2 final_offset = clamp(total_offset, -clamp_val, clamp_val) * inv_tex_size; - - frag_color = vec4(textureBicubic(tex_coord - final_offset).rgb, 1.0f); - } -} + frag_color = Scaleforce(input_texture, tex_coord); +} \ No newline at end of file From bb3e95133d49878e75d800e2fc405ed5cee29318 Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Wed, 20 Oct 2021 23:13:29 +0200 Subject: [PATCH 118/156] Vulkan: fix waiting on semaphore. --- src/video_core/renderer_vulkan/vk_master_semaphore.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h index 0886b7da8..9be9c9bed 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.h +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h @@ -70,7 +70,9 @@ public: return; } // If none of the above is hit, fallback to a regular wait - semaphore.Wait(tick); + while (!semaphore.Wait(tick)) { + } + Refresh(); } private: From bf01b7993dca835a516abdc2142be96bc0f216ec Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Wed, 20 Oct 2021 23:21:52 +0200 Subject: [PATCH 119/156] TextureCache: Improve Reaper. --- src/video_core/texture_cache/texture_cache.h | 38 ++++++++++++------- .../texture_cache/texture_cache_base.h | 2 +- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index a32c11d04..f1254ef62 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -51,8 +51,8 @@ TextureCache

::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface& if constexpr (HAS_DEVICE_MEMORY_INFO) { const auto device_memory = runtime.GetDeviceLocalMemory(); - const u64 possible_expected_memory = (device_memory * 3) / 10; - const u64 possible_critical_memory = (device_memory * 6) / 10; + const u64 possible_expected_memory = (device_memory * 4) / 10; + const u64 possible_critical_memory = (device_memory * 7) / 10; expected_memory = std::max(possible_expected_memory, DEFAULT_EXPECTED_MEMORY); critical_memory = std::max(possible_critical_memory, DEFAULT_CRITICAL_MEMORY); minimum_memory = 0; @@ -69,7 +69,7 @@ void TextureCache

::RunGarbageCollector() { const bool high_priority_mode = total_used_memory >= expected_memory; const bool aggressive_mode = total_used_memory >= critical_memory; const u64 ticks_to_destroy = aggressive_mode ? 10ULL : high_priority_mode ? 25ULL : 100ULL; - size_t num_iterations = aggressive_mode ? 10000 : (high_priority_mode ? 100 : 5); + size_t num_iterations = aggressive_mode ? 300 : (high_priority_mode ? 50 : 10); const auto clean_up = [this, &num_iterations, high_priority_mode](ImageId image_id) { if (num_iterations == 0) { return true; @@ -91,7 +91,7 @@ void TextureCache

::RunGarbageCollector() { UntrackImage(image, image_id); } UnregisterImage(image_id); - DeleteImage(image_id); + DeleteImage(image_id, image.scale_tick > frame_tick + 5); return false; }; lru_cache.ForEachItemBelow(frame_tick - ticks_to_destroy, clean_up); @@ -287,7 +287,9 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { if (image_id != CORRUPT_ID) { Image& image = slot_images[image_id]; image.scale_rating = scale_rating; - image.scale_tick = frame_tick + 1; + if (image.scale_tick <= frame_tick) { + image.scale_tick = frame_tick + 1; + } } }; for (size_t index = 0; index < NUM_RT; ++index) { @@ -810,6 +812,9 @@ bool TextureCache

::ImageCanRescale(ImageBase& image) { template void TextureCache

::InvalidateScale(Image& image) { + if (image.scale_tick <= frame_tick) { + image.scale_tick = frame_tick + 1; + } const std::span image_view_ids = image.image_view_ids; auto& dirty = maxwell3d.dirty.flags; dirty[Dirty::RenderTargets] = true; @@ -842,12 +847,15 @@ void TextureCache

::InvalidateScale(Image& image) { template u64 TextureCache

::GetScaledImageSizeBytes(ImageBase& image) { - const f32 add_to_size = Settings::values.resolution_info.up_factor; - const bool sign = std::signbit(add_to_size); - const u32 image_size_bytes = std::max(image.guest_size_bytes, image.unswizzled_size_bytes); - const u64 tentative_size = image_size_bytes * static_cast(std::abs(add_to_size)); + const u64 scale_up = static_cast(Settings::values.resolution_info.up_scale * + Settings::values.resolution_info.up_scale); + const u64 down_shift = static_cast(Settings::values.resolution_info.down_shift + + Settings::values.resolution_info.down_shift); + const u64 image_size_bytes = + static_cast(std::max(image.guest_size_bytes, image.unswizzled_size_bytes)); + const u64 tentative_size = (image_size_bytes * scale_up) >> down_shift; const u64 fitted_size = Common::AlignUp(tentative_size, 1024); - return sign ? -fitted_size : fitted_size; + return fitted_size; } template @@ -1510,7 +1518,7 @@ void TextureCache

::UntrackImage(ImageBase& image, ImageId image_id) { } template -void TextureCache

::DeleteImage(ImageId image_id) { +void TextureCache

::DeleteImage(ImageId image_id, bool immediate_delete) { ImageBase& image = slot_images[image_id]; if (image.HasScaled()) { total_used_memory -= GetScaledImageSizeBytes(image); @@ -1576,10 +1584,14 @@ void TextureCache

::DeleteImage(ImageId image_id) { num_removed_overlaps); } for (const ImageViewId image_view_id : image_view_ids) { - sentenced_image_view.Push(std::move(slot_image_views[image_view_id])); + if (!immediate_delete) { + sentenced_image_view.Push(std::move(slot_image_views[image_view_id])); + } slot_image_views.erase(image_view_id); } - sentenced_images.Push(std::move(slot_images[image_id])); + if (!immediate_delete) { + sentenced_images.Push(std::move(slot_images[image_id])); + } slot_images.erase(image_id); alloc_images.erase(alloc_image_it); diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 4f876b2f4..eea589269 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -292,7 +292,7 @@ private: void UntrackImage(ImageBase& image, ImageId image_id); /// Delete image from the cache - void DeleteImage(ImageId image); + void DeleteImage(ImageId image, bool immediate_delete = false); /// Remove image views references from the cache void RemoveImageViewReferences(std::span removed_views); From 9e065b9c7d3b25ddfe20afa4a945cca6e9767fa9 Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Thu, 21 Oct 2021 01:27:54 +0200 Subject: [PATCH 120/156] VideoCore: Add gaussian filtering. --- src/common/settings.h | 5 +- src/video_core/host_shaders/CMakeLists.txt | 1 + .../host_shaders/present_gaussian.frag | 74 +++++++++++++++++++ .../renderer_opengl/renderer_opengl.cpp | 5 ++ .../renderer_opengl/renderer_opengl.h | 1 + .../renderer_vulkan/vk_blit_screen.cpp | 49 ++++++++++++ .../renderer_vulkan/vk_blit_screen.h | 2 + src/yuzu/configuration/configure_graphics.ui | 5 ++ 8 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 src/video_core/host_shaders/present_gaussian.frag diff --git a/src/common/settings.h b/src/common/settings.h index f6acf5bdf..830030efd 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -67,8 +67,9 @@ enum class ScalingFilter : u32 { NearestNeighbor = 0, Bilinear = 1, Bicubic = 2, - ScaleForce = 3, - Fsr = 4, + Gaussian = 3, + ScaleForce = 4, + Fsr = 5, }; struct ResolutionScalingInfo { diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 32e2ab500..b0e15773c 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -19,6 +19,7 @@ set(SHADER_FILES pitch_unswizzle.comp present_scaleforce.frag present_bicubic.frag + present_gaussian.frag vulkan_blit_color_float.frag vulkan_blit_depth_stencil.frag vulkan_fidelityfx_fsr_easu.comp diff --git a/src/video_core/host_shaders/present_gaussian.frag b/src/video_core/host_shaders/present_gaussian.frag new file mode 100644 index 000000000..d5e2b1781 --- /dev/null +++ b/src/video_core/host_shaders/present_gaussian.frag @@ -0,0 +1,74 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 460 core + +#ifdef VULKAN + +#define BINDING_COLOR_TEXTURE 1 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#define BINDING_COLOR_TEXTURE 0 + +#endif + +layout (location = 0) in vec2 frag_tex_coord; + +layout (location = 0) out vec4 color; + +layout (binding = BINDING_COLOR_TEXTURE) uniform sampler2D color_texture; + +const float offset[3] = float[](0.0, 1.3846153846, 3.2307692308); +const float weight[3] = float[](0.2270270270, 0.3162162162, 0.0702702703); + +vec4 blurVertical(sampler2D textureSampler, vec2 coord, vec2 norm) { + vec4 result = vec4(0.0f); + for (int i=1; i<3; i++) { + result += + texture(textureSampler, vec2(coord) + (vec2(0.0, offset[i]) * norm)) + * weight[i]; + result += + texture(textureSampler, vec2(coord) - (vec2(0.0, offset[i]) * norm)) + * weight[i]; + } + return result; +} + +vec4 blurHorizontal(sampler2D textureSampler, vec2 coord, vec2 norm) { + vec4 result = vec4(0.0f); + for (int i=1; i<3; i++) { + result += + texture(textureSampler, vec2(coord) + (vec2(offset[i], 0.0) * norm)) + * weight[i]; + result += + texture(textureSampler, vec2(coord) - (vec2(offset[i], 0.0) * norm)) + * weight[i]; + } + return result; +} + +vec4 blurDiagonal(sampler2D textureSampler, vec2 coord, vec2 norm) { + vec4 result = vec4(0.0f); + for (int i=1; i<3; i++) { + result += + texture(textureSampler, vec2(coord) + (vec2(offset[i], offset[i]) * norm)) + * weight[i]; + result += + texture(textureSampler, vec2(coord) - (vec2(offset[i], offset[i]) * norm)) + * weight[i]; + } + return result; +} + +void main() { + vec3 base = texture(color_texture, vec2(frag_tex_coord)).rgb * weight[0]; + vec2 tex_offset = 1.0f / textureSize(color_texture, 0); + vec3 horizontal = blurHorizontal(color_texture, frag_tex_coord, tex_offset).rgb; + vec3 vertical = blurVertical(color_texture, frag_tex_coord, tex_offset).rgb; + vec3 diagonalA = blurVertical(color_texture, frag_tex_coord, tex_offset).rgb; + vec3 diagonalB = blurVertical(color_texture, frag_tex_coord, -tex_offset).rgb; + vec3 combination = mix(mix(horizontal, vertical, 0.5f), mix(diagonalA, diagonalB, 0.5f), 0.5f); + color = vec4(combination + base, 1.0f); +} diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 68423601c..6132b3c49 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -256,6 +256,8 @@ void RendererOpenGL::InitOpenGLObjects() { present_vertex = CreateProgram(HostShaders::OPENGL_PRESENT_VERT, GL_VERTEX_SHADER); present_bilinear_fragment = CreateProgram(HostShaders::OPENGL_PRESENT_FRAG, GL_FRAGMENT_SHADER); present_bicubic_fragment = CreateProgram(HostShaders::PRESENT_BICUBIC_FRAG, GL_FRAGMENT_SHADER); + present_gaussian_fragment = + CreateProgram(HostShaders::PRESENT_GAUSSIAN_FRAG, GL_FRAGMENT_SHADER); present_scaleforce_fragment = CreateProgram(HostShaders::PRESENT_SCALEFORCE_FRAG, GL_FRAGMENT_SHADER); @@ -359,6 +361,9 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { case Settings::ScalingFilter::Bicubic: fragment_handle = present_bicubic_fragment.handle; break; + case Settings::ScalingFilter::Gaussian: + fragment_handle = present_gaussian_fragment.handle; + break; case Settings::ScalingFilter::ScaleForce: fragment_handle = present_scaleforce_fragment.handle; break; diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 504ddbe7b..62a746e41 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -114,6 +114,7 @@ private: OGLProgram present_vertex; OGLProgram present_bilinear_fragment; OGLProgram present_bicubic_fragment; + OGLProgram present_gaussian_fragment; OGLProgram present_scaleforce_fragment; OGLFramebuffer screenshot_framebuffer; diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index ccf721008..0d6bce214 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -18,6 +18,7 @@ #include "core/memory.h" #include "video_core/gpu.h" #include "video_core/host_shaders/present_bicubic_frag_spv.h" +#include "video_core/host_shaders/present_gaussian_frag_spv.h" #include "video_core/host_shaders/present_scaleforce_frag_spv.h" #include "video_core/host_shaders/vulkan_present_frag_spv.h" #include "video_core/host_shaders/vulkan_present_vert_spv.h" @@ -297,6 +298,9 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, case Settings::ScalingFilter::Bicubic: cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *bicubic_pipeline); break; + case Settings::ScalingFilter::Gaussian: + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *gaussian_pipeline); + break; case Settings::ScalingFilter::ScaleForce: cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *scaleforce_pipeline); break; @@ -388,6 +392,7 @@ void VKBlitScreen::CreateShaders() { vertex_shader = BuildShader(device, VULKAN_PRESENT_VERT_SPV); bilinear_fragment_shader = BuildShader(device, VULKAN_PRESENT_FRAG_SPV); bicubic_fragment_shader = BuildShader(device, PRESENT_BICUBIC_FRAG_SPV); + gaussian_fragment_shader = BuildShader(device, PRESENT_GAUSSIAN_FRAG_SPV); scaleforce_fragment_shader = BuildShader(device, PRESENT_SCALEFORCE_FRAG_SPV); } @@ -574,6 +579,27 @@ void VKBlitScreen::CreateGraphicsPipeline() { }, }}; + const std::array gaussian_shader_stages{{ + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = *vertex_shader, + .pName = "main", + .pSpecializationInfo = nullptr, + }, + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = *gaussian_fragment_shader, + .pName = "main", + .pSpecializationInfo = nullptr, + }, + }}; + const std::array scaleforce_shader_stages{{ { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, @@ -733,6 +759,28 @@ void VKBlitScreen::CreateGraphicsPipeline() { .basePipelineIndex = 0, }; + const VkGraphicsPipelineCreateInfo gaussian_pipeline_ci{ + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stageCount = static_cast(gaussian_shader_stages.size()), + .pStages = gaussian_shader_stages.data(), + .pVertexInputState = &vertex_input_ci, + .pInputAssemblyState = &input_assembly_ci, + .pTessellationState = nullptr, + .pViewportState = &viewport_state_ci, + .pRasterizationState = &rasterization_ci, + .pMultisampleState = &multisampling_ci, + .pDepthStencilState = nullptr, + .pColorBlendState = &color_blend_ci, + .pDynamicState = &dynamic_state_ci, + .layout = *pipeline_layout, + .renderPass = *renderpass, + .subpass = 0, + .basePipelineHandle = 0, + .basePipelineIndex = 0, + }; + const VkGraphicsPipelineCreateInfo scaleforce_pipeline_ci{ .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, .pNext = nullptr, @@ -757,6 +805,7 @@ void VKBlitScreen::CreateGraphicsPipeline() { bilinear_pipeline = device.GetLogical().CreateGraphicsPipeline(bilinear_pipeline_ci); bicubic_pipeline = device.GetLogical().CreateGraphicsPipeline(bicubic_pipeline_ci); + gaussian_pipeline = device.GetLogical().CreateGraphicsPipeline(gaussian_pipeline_ci); scaleforce_pipeline = device.GetLogical().CreateGraphicsPipeline(scaleforce_pipeline_ci); } diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index 448a2fbe6..96a5598ad 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -111,6 +111,7 @@ private: vk::ShaderModule vertex_shader; vk::ShaderModule bilinear_fragment_shader; vk::ShaderModule bicubic_fragment_shader; + vk::ShaderModule gaussian_fragment_shader; vk::ShaderModule scaleforce_fragment_shader; vk::DescriptorPool descriptor_pool; vk::DescriptorSetLayout descriptor_set_layout; @@ -118,6 +119,7 @@ private: vk::Pipeline nearest_neightbor_pipeline; vk::Pipeline bilinear_pipeline; vk::Pipeline bicubic_pipeline; + vk::Pipeline gaussian_pipeline; vk::Pipeline scaleforce_pipeline; vk::RenderPass renderpass; std::vector framebuffers; diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui index c1d7e8349..848ee2d08 100644 --- a/src/yuzu/configuration/configure_graphics.ui +++ b/src/yuzu/configuration/configure_graphics.ui @@ -417,6 +417,11 @@ Bicubic + + + Gaussian + + ScaleForce From 2eff80b47f578b346bb80c5afa2271859cd7943b Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Thu, 21 Oct 2021 02:05:16 +0200 Subject: [PATCH 121/156] QtGUI: Add buttton to toggle the filter. --- src/common/settings.h | 1 + .../renderer_opengl/renderer_opengl.cpp | 1 + src/yuzu/configuration/configure_graphics.ui | 2 +- src/yuzu/main.cpp | 56 +++++++++++++++++++ src/yuzu/main.h | 2 + 5 files changed, 61 insertions(+), 1 deletion(-) diff --git a/src/common/settings.h b/src/common/settings.h index 830030efd..e926a3268 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -70,6 +70,7 @@ enum class ScalingFilter : u32 { Gaussian = 3, ScaleForce = 4, Fsr = 5, + LastFilter = Fsr, }; struct ResolutionScalingInfo { diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 6132b3c49..227697c4f 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -24,6 +24,7 @@ #include "video_core/host_shaders/opengl_present_frag.h" #include "video_core/host_shaders/opengl_present_vert.h" #include "video_core/host_shaders/present_bicubic_frag.h" +#include "video_core/host_shaders/present_gaussian_frag.h" #include "video_core/host_shaders/present_scaleforce_frag.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_manager.h" diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui index 848ee2d08..7cfb2860d 100644 --- a/src/yuzu/configuration/configure_graphics.ui +++ b/src/yuzu/configuration/configure_graphics.ui @@ -429,7 +429,7 @@ - FidelityFX Super Resolution [Vulkan Only] + AMD's FidelityFX™️ Super Resolution [Vulkan Only] diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index 3cb146982..379bd0b17 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -774,6 +774,34 @@ void GMainWindow::InitializeWidgets() { tas_label->setFocusPolicy(Qt::NoFocus); statusBar()->insertPermanentWidget(0, tas_label); + // Setup Filter button + filter_status_button = new QPushButton(); + filter_status_button->setObjectName(QStringLiteral("TogglableStatusBarButton")); + filter_status_button->setFocusPolicy(Qt::NoFocus); + connect(filter_status_button, &QPushButton::clicked, [&] { + auto filter = Settings::values.scaling_filter.GetValue(); + if (filter == Settings::ScalingFilter::LastFilter) { + filter = Settings::ScalingFilter::NearestNeighbor; + } else { + filter = static_cast(static_cast(filter) + 1); + } + if (Settings::values.renderer_backend.GetValue() == Settings::RendererBackend::OpenGL && + filter == Settings::ScalingFilter::Fsr) { + filter = Settings::ScalingFilter::NearestNeighbor; + } + Settings::values.scaling_filter.SetValue(filter); + filter_status_button->setChecked(true); + UpdateFilterText(); + }); + auto filter = Settings::values.scaling_filter.GetValue(); + if (Settings::values.renderer_backend.GetValue() == Settings::RendererBackend::OpenGL && + filter == Settings::ScalingFilter::Fsr) { + Settings::values.scaling_filter.SetValue(Settings::ScalingFilter::NearestNeighbor); + } + UpdateFilterText(); + filter_status_button->setCheckable(true); + statusBar()->insertPermanentWidget(0, filter_status_button); + // Setup Dock button dock_status_button = new QPushButton(); dock_status_button->setObjectName(QStringLiteral("TogglableStatusBarButton")); @@ -3033,11 +3061,39 @@ void GMainWindow::UpdateGPUAccuracyButton() { } } +void GMainWindow::UpdateFilterText() { + const auto filter = Settings::values.scaling_filter.GetValue(); + switch (filter) { + case Settings::ScalingFilter::NearestNeighbor: + filter_status_button->setText(tr("NEAREST")); + break; + case Settings::ScalingFilter::Bilinear: + filter_status_button->setText(tr("BILINEAR")); + break; + case Settings::ScalingFilter::Bicubic: + filter_status_button->setText(tr("BICUBIC")); + break; + case Settings::ScalingFilter::Gaussian: + filter_status_button->setText(tr("GAUSSIAN")); + break; + case Settings::ScalingFilter::ScaleForce: + filter_status_button->setText(tr("SCALEFORCE")); + break; + case Settings::ScalingFilter::Fsr: + filter_status_button->setText(tr("AMD'S FIDELITYFX SR")); + break; + default: + filter_status_button->setText(tr("BILINEAR")); + break; + } +} + void GMainWindow::UpdateStatusButtons() { dock_status_button->setChecked(Settings::values.use_docked_mode.GetValue()); renderer_status_button->setChecked(Settings::values.renderer_backend.GetValue() == Settings::RendererBackend::Vulkan); UpdateGPUAccuracyButton(); + UpdateFilterText(); } void GMainWindow::UpdateUISettings() { diff --git a/src/yuzu/main.h b/src/yuzu/main.h index beb4f2984..d4d2f3d58 100644 --- a/src/yuzu/main.h +++ b/src/yuzu/main.h @@ -302,6 +302,7 @@ private: void MigrateConfigFiles(); void UpdateWindowTitle(std::string_view title_name = {}, std::string_view title_version = {}, std::string_view gpu_vendor = {}); + void UpdateFilterText(); void UpdateStatusBar(); void UpdateGPUAccuracyButton(); void UpdateStatusButtons(); @@ -336,6 +337,7 @@ private: QPushButton* gpu_accuracy_button = nullptr; QPushButton* renderer_status_button = nullptr; QPushButton* dock_status_button = nullptr; + QPushButton* filter_status_button = nullptr; QTimer status_bar_update_timer; std::unique_ptr config; From 510caeefb3e3bf4b365f08a97cdfeffaaf8a80ce Mon Sep 17 00:00:00 2001 From: Marshall Mohror Date: Wed, 20 Oct 2021 18:32:11 -0500 Subject: [PATCH 122/156] Settings: Add anti-aliasing method setting --- src/common/settings.cpp | 1 + src/common/settings.h | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/src/common/settings.cpp b/src/common/settings.cpp index 84ac937e5..3bcaa072f 100644 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp @@ -49,6 +49,7 @@ void LogSettings() { log_setting("CPU_Accuracy", values.cpu_accuracy.GetValue()); log_setting("Renderer_UseResolutionScaling", values.resolution_setup.GetValue()); log_setting("Renderer_ScalingFilter", values.scaling_filter.GetValue()); + log_setting("Renderer_AntiAliasing", values.anti_aliasing.GetValue()); log_setting("Renderer_UseSpeedLimit", values.use_speed_limit.GetValue()); log_setting("Renderer_SpeedLimit", values.speed_limit.GetValue()); log_setting("Renderer_UseDiskShaderCache", values.use_disk_shader_cache.GetValue()); diff --git a/src/common/settings.h b/src/common/settings.h index e926a3268..ca1c3c1aa 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -73,6 +73,11 @@ enum class ScalingFilter : u32 { LastFilter = Fsr, }; +enum class AntiAliasing : u32 { + None = 0, + Fxaa = 1, +}; + struct ResolutionScalingInfo { u32 up_scale{1}; u32 down_shift{0}; @@ -498,6 +503,7 @@ struct Values { ResolutionScalingInfo resolution_info{}; Setting resolution_setup{ResolutionSetup::Res1X, "resolution_setup"}; Setting scaling_filter{ScalingFilter::Bilinear, "scaling_filter"}; + Setting anti_aliasing{AntiAliasing::None, "anti_aliasing"}; // *nix platforms may have issues with the borderless windowed fullscreen mode. // Default to exclusive fullscreen on these platforms for now. RangedSetting fullscreen_mode{ From 74e39ed6eeb2362316044ad92e3de9d75034ca58 Mon Sep 17 00:00:00 2001 From: Marshall Mohror Date: Wed, 20 Oct 2021 18:33:07 -0500 Subject: [PATCH 123/156] Frontend: Add anti-aliasing method setting --- src/yuzu/configuration/config.cpp | 5 +++ src/yuzu/configuration/config.h | 1 + src/yuzu/configuration/configure_graphics.cpp | 24 ++++++++++++ src/yuzu/configuration/configure_graphics.ui | 39 +++++++++++++++++++ src/yuzu_cmd/config.cpp | 1 + 5 files changed, 70 insertions(+) diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index 4c296a94d..8227d06bc 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -826,6 +826,7 @@ void Config::ReadRendererValues() { ReadGlobalSetting(Settings::values.aspect_ratio); ReadGlobalSetting(Settings::values.resolution_setup); ReadGlobalSetting(Settings::values.scaling_filter); + ReadGlobalSetting(Settings::values.anti_aliasing); ReadGlobalSetting(Settings::values.max_anisotropy); ReadGlobalSetting(Settings::values.use_speed_limit); ReadGlobalSetting(Settings::values.speed_limit); @@ -1374,6 +1375,10 @@ void Config::SaveRendererValues() { static_cast(Settings::values.scaling_filter.GetValue(global)), static_cast(Settings::values.scaling_filter.GetDefault()), Settings::values.scaling_filter.UsingGlobal()); + WriteSetting(QString::fromStdString(Settings::values.anti_aliasing.GetLabel()), + static_cast(Settings::values.anti_aliasing.GetValue(global)), + static_cast(Settings::values.anti_aliasing.GetDefault()), + Settings::values.anti_aliasing.UsingGlobal()); WriteGlobalSetting(Settings::values.max_anisotropy); WriteGlobalSetting(Settings::values.use_speed_limit); WriteGlobalSetting(Settings::values.speed_limit); diff --git a/src/yuzu/configuration/config.h b/src/yuzu/configuration/config.h index 97dc1bb47..d673c1cdc 100644 --- a/src/yuzu/configuration/config.h +++ b/src/yuzu/configuration/config.h @@ -191,5 +191,6 @@ Q_DECLARE_METATYPE(Settings::FullscreenMode); Q_DECLARE_METATYPE(Settings::NvdecEmulation); Q_DECLARE_METATYPE(Settings::ResolutionSetup); Q_DECLARE_METATYPE(Settings::ScalingFilter); +Q_DECLARE_METATYPE(Settings::AntiAliasing); Q_DECLARE_METATYPE(Settings::RendererBackend); Q_DECLARE_METATYPE(Settings::ShaderBackend); diff --git a/src/yuzu/configuration/configure_graphics.cpp b/src/yuzu/configuration/configure_graphics.cpp index 02498fad7..59f975a6e 100644 --- a/src/yuzu/configuration/configure_graphics.cpp +++ b/src/yuzu/configuration/configure_graphics.cpp @@ -107,6 +107,8 @@ void ConfigureGraphics::SetConfiguration() { static_cast(Settings::values.resolution_setup.GetValue())); ui->scaling_filter_combobox->setCurrentIndex( static_cast(Settings::values.scaling_filter.GetValue())); + ui->anti_aliasing_combobox->setCurrentIndex( + static_cast(Settings::values.anti_aliasing.GetValue())); } else { ConfigurationShared::SetPerGameSetting(ui->api, &Settings::values.renderer_backend); ConfigurationShared::SetHighlight(ui->api_widget, @@ -137,6 +139,11 @@ void ConfigureGraphics::SetConfiguration() { ConfigurationShared::SetHighlight(ui->scaling_filter_label, !Settings::values.scaling_filter.UsingGlobal()); + ConfigurationShared::SetPerGameSetting(ui->anti_aliasing_combobox, + &Settings::values.anti_aliasing); + ConfigurationShared::SetHighlight(ui->anti_aliasing_label, + !Settings::values.anti_aliasing.UsingGlobal()); + ui->bg_combobox->setCurrentIndex(Settings::values.bg_red.UsingGlobal() ? 0 : 1); ui->bg_button->setEnabled(!Settings::values.bg_red.UsingGlobal()); ConfigurationShared::SetHighlight(ui->bg_layout, !Settings::values.bg_red.UsingGlobal()); @@ -156,6 +163,10 @@ void ConfigureGraphics::ApplyConfiguration() { ui->scaling_filter_combobox->currentIndex() - ((Settings::IsConfiguringGlobal()) ? 0 : ConfigurationShared::USE_GLOBAL_OFFSET)); + const auto anti_aliasing = static_cast( + ui->anti_aliasing_combobox->currentIndex() - + ((Settings::IsConfiguringGlobal()) ? 0 : ConfigurationShared::USE_GLOBAL_OFFSET)); + ConfigurationShared::ApplyPerGameSetting(&Settings::values.fullscreen_mode, ui->fullscreen_mode_combobox); ConfigurationShared::ApplyPerGameSetting(&Settings::values.aspect_ratio, @@ -193,6 +204,9 @@ void ConfigureGraphics::ApplyConfiguration() { if (Settings::values.scaling_filter.UsingGlobal()) { Settings::values.scaling_filter.SetValue(scaling_filter); } + if (Settings::values.anti_aliasing.UsingGlobal()) { + Settings::values.anti_aliasing.SetValue(anti_aliasing); + } } else { if (ui->resolution_combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) { Settings::values.resolution_setup.SetGlobal(true); @@ -206,6 +220,12 @@ void ConfigureGraphics::ApplyConfiguration() { Settings::values.scaling_filter.SetGlobal(false); Settings::values.scaling_filter.SetValue(scaling_filter); } + if (ui->anti_aliasing_combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) { + Settings::values.anti_aliasing.SetGlobal(true); + } else { + Settings::values.anti_aliasing.SetGlobal(false); + Settings::values.anti_aliasing.SetValue(anti_aliasing); + } if (ui->api->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) { Settings::values.renderer_backend.SetGlobal(true); Settings::values.shader_backend.SetGlobal(true); @@ -354,6 +374,7 @@ void ConfigureGraphics::SetupPerGameUI() { ui->aspect_ratio_combobox->setEnabled(Settings::values.aspect_ratio.UsingGlobal()); ui->resolution_combobox->setEnabled(Settings::values.resolution_setup.UsingGlobal()); ui->scaling_filter_combobox->setEnabled(Settings::values.scaling_filter.UsingGlobal()); + ui->anti_aliasing_combobox->setEnabled(Settings::values.anti_aliasing.UsingGlobal()); ui->use_asynchronous_gpu_emulation->setEnabled( Settings::values.use_asynchronous_gpu_emulation.UsingGlobal()); ui->nvdec_emulation->setEnabled(Settings::values.nvdec_emulation.UsingGlobal()); @@ -388,6 +409,9 @@ void ConfigureGraphics::SetupPerGameUI() { ConfigurationShared::SetColoredComboBox( ui->scaling_filter_combobox, ui->scaling_filter_label, static_cast(Settings::values.scaling_filter.GetValue(true))); + ConfigurationShared::SetColoredComboBox( + ui->anti_aliasing_combobox, ui->anti_aliasing_label, + static_cast(Settings::values.anti_aliasing.GetValue(true))); ConfigurationShared::InsertGlobalItem( ui->api, static_cast(Settings::values.renderer_backend.GetValue(true))); ConfigurationShared::InsertGlobalItem( diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui index 7cfb2860d..0d2987fcf 100644 --- a/src/yuzu/configuration/configure_graphics.ui +++ b/src/yuzu/configuration/configure_graphics.ui @@ -437,6 +437,45 @@ + + + + + 0 + + + 0 + + + 0 + + + 0 + + + + + Anti-Aliasing Method: + + + + + + + + None + + + + + FXAA + + + + + + + diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp index 3c888d84e..33241ea98 100644 --- a/src/yuzu_cmd/config.cpp +++ b/src/yuzu_cmd/config.cpp @@ -453,6 +453,7 @@ void Config::ReadValues() { ReadSetting("Renderer", Settings::values.resolution_setup); ReadSetting("Renderer", Settings::values.scaling_filter); + ReadSetting("Renderer", Settings::values.anti_aliasing); ReadSetting("Renderer", Settings::values.fullscreen_mode); ReadSetting("Renderer", Settings::values.aspect_ratio); ReadSetting("Renderer", Settings::values.max_anisotropy); From 48cf3764626e1ed30450d15e00befb75a4eae329 Mon Sep 17 00:00:00 2001 From: Marshall Mohror Date: Wed, 20 Oct 2021 18:36:06 -0500 Subject: [PATCH 124/156] OpenGL: Implement FXAA --- src/video_core/host_shaders/CMakeLists.txt | 2 + src/video_core/host_shaders/fxaa.frag | 72 ++++++++++++ src/video_core/host_shaders/fxaa.vert | 40 +++++++ .../renderer_opengl/gl_resource_manager.cpp | 2 +- .../renderer_opengl/renderer_opengl.cpp | 109 ++++++++++++------ .../renderer_opengl/renderer_opengl.h | 4 + 6 files changed, 194 insertions(+), 35 deletions(-) create mode 100644 src/video_core/host_shaders/fxaa.frag create mode 100644 src/video_core/host_shaders/fxaa.vert diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index b0e15773c..6b5ea649a 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -13,6 +13,8 @@ set(SHADER_FILES convert_depth_to_float.frag convert_float_to_depth.frag full_screen_triangle.vert + fxaa.frag + fxaa.vert opengl_copy_bc4.comp opengl_present.frag opengl_present.vert diff --git a/src/video_core/host_shaders/fxaa.frag b/src/video_core/host_shaders/fxaa.frag new file mode 100644 index 000000000..23f910d4c --- /dev/null +++ b/src/video_core/host_shaders/fxaa.frag @@ -0,0 +1,72 @@ +// Adapted from +// https://www.geeks3d.com/20110405/fxaa-fast-approximate-anti-aliasing-demo-glsl-opengl-test-radeon-geforce/3/ + +#version 460 + +#ifdef VULKAN + +#define BINDING_COLOR_TEXTURE 1 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#define BINDING_COLOR_TEXTURE 0 + +#endif + +layout (location = 0) in vec4 posPos; + +layout (location = 0) out vec4 frag_color; + +layout (binding = BINDING_COLOR_TEXTURE) uniform sampler2D input_texture; + +const float FXAA_SPAN_MAX = 8.0; +const float FXAA_REDUCE_MUL = 1.0 / 8.0; +const float FXAA_REDUCE_MIN = 1.0 / 128.0; + +#define FxaaTexLod0(t, p) textureLod(t, p, 0.0) +#define FxaaTexOff(t, p, o) textureLodOffset(t, p, 0.0, o) + +vec3 FxaaPixelShader(vec4 posPos, sampler2D tex) { + + vec3 rgbNW = FxaaTexLod0(tex, posPos.zw).xyz; + vec3 rgbNE = FxaaTexOff(tex, posPos.zw, ivec2(1,0)).xyz; + vec3 rgbSW = FxaaTexOff(tex, posPos.zw, ivec2(0,1)).xyz; + vec3 rgbSE = FxaaTexOff(tex, posPos.zw, ivec2(1,1)).xyz; + vec3 rgbM = FxaaTexLod0(tex, posPos.xy).xyz; +/*---------------------------------------------------------*/ + vec3 luma = vec3(0.299, 0.587, 0.114); + float lumaNW = dot(rgbNW, luma); + float lumaNE = dot(rgbNE, luma); + float lumaSW = dot(rgbSW, luma); + float lumaSE = dot(rgbSE, luma); + float lumaM = dot(rgbM, luma); +/*---------------------------------------------------------*/ + float lumaMin = min(lumaM, min(min(lumaNW, lumaNE), min(lumaSW, lumaSE))); + float lumaMax = max(lumaM, max(max(lumaNW, lumaNE), max(lumaSW, lumaSE))); +/*---------------------------------------------------------*/ + vec2 dir; + dir.x = -((lumaNW + lumaNE) - (lumaSW + lumaSE)); + dir.y = ((lumaNW + lumaSW) - (lumaNE + lumaSE)); +/*---------------------------------------------------------*/ + float dirReduce = max( + (lumaNW + lumaNE + lumaSW + lumaSE) * (0.25 * FXAA_REDUCE_MUL), + FXAA_REDUCE_MIN); + float rcpDirMin = 1.0/(min(abs(dir.x), abs(dir.y)) + dirReduce); + dir = min(vec2( FXAA_SPAN_MAX, FXAA_SPAN_MAX), + max(vec2(-FXAA_SPAN_MAX, -FXAA_SPAN_MAX), + dir * rcpDirMin)) / textureSize(tex, 0); +/*--------------------------------------------------------*/ + vec3 rgbA = (1.0 / 2.0) * ( + FxaaTexLod0(tex, posPos.xy + dir * (1.0 / 3.0 - 0.5)).xyz + + FxaaTexLod0(tex, posPos.xy + dir * (2.0 / 3.0 - 0.5)).xyz); + vec3 rgbB = rgbA * (1.0 / 2.0) + (1.0 / 4.0) * ( + FxaaTexLod0(tex, posPos.xy + dir * (0.0 / 3.0 - 0.5)).xyz + + FxaaTexLod0(tex, posPos.xy + dir * (3.0 / 3.0 - 0.5)).xyz); + float lumaB = dot(rgbB, luma); + if((lumaB < lumaMin) || (lumaB > lumaMax)) return rgbA; + return rgbB; +} + +void main() { + frag_color = vec4(FxaaPixelShader(posPos, input_texture), 1.0); +} diff --git a/src/video_core/host_shaders/fxaa.vert b/src/video_core/host_shaders/fxaa.vert new file mode 100644 index 000000000..715fce462 --- /dev/null +++ b/src/video_core/host_shaders/fxaa.vert @@ -0,0 +1,40 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 460 + +out gl_PerVertex { + vec4 gl_Position; +}; + +const vec2 vertices[4] = + vec2[4](vec2(-1.0, 1.0), vec2(1.0, 1.0), vec2(-1.0, -1.0), vec2(1.0, -1.0)); + +layout (location = 0) out vec4 posPos; + +#ifdef VULKAN + +#define BINDING_COLOR_TEXTURE 1 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#define BINDING_COLOR_TEXTURE 0 + +#endif + +layout (binding = BINDING_COLOR_TEXTURE) uniform sampler2D input_texture; + +const float FXAA_SUBPIX_SHIFT = 0; + +void main() { +#ifdef VULKAN + vec2 vertex = vertices[gl_VertexIndex]; +#else + vec2 vertex = vertices[gl_VertexID]; +#endif + gl_Position = vec4(vertex, 0.0, 1.0); + vec2 vert_tex_coord = (vertex + 1.0) / 2.0; + posPos.xy = vert_tex_coord; + posPos.zw = vert_tex_coord - (0.5 + FXAA_SUBPIX_SHIFT) / textureSize(input_texture, 0); +} diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 8695c29e3..70947838c 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -166,7 +166,7 @@ void OGLFramebuffer::Create() { return; MICROPROFILE_SCOPE(OpenGL_ResourceCreation); - glGenFramebuffers(1, &handle); + glCreateFramebuffers(1, &handle); } void OGLFramebuffer::Release() { diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 227697c4f..dbe66a1b6 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -21,6 +21,8 @@ #include "core/memory.h" #include "core/perf_stats.h" #include "core/telemetry_session.h" +#include "video_core/host_shaders/fxaa_frag.h" +#include "video_core/host_shaders/fxaa_vert.h" #include "video_core/host_shaders/opengl_present_frag.h" #include "video_core/host_shaders/opengl_present_vert.h" #include "video_core/host_shaders/present_bicubic_frag.h" @@ -254,6 +256,8 @@ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color void RendererOpenGL::InitOpenGLObjects() { // Create shader programs + fxaa_vertex = CreateProgram(HostShaders::FXAA_VERT, GL_VERTEX_SHADER); + fxaa_fragment = CreateProgram(HostShaders::FXAA_FRAG, GL_FRAGMENT_SHADER); present_vertex = CreateProgram(HostShaders::OPENGL_PRESENT_VERT, GL_VERTEX_SHADER); present_bilinear_fragment = CreateProgram(HostShaders::OPENGL_PRESENT_FRAG, GL_FRAGMENT_SHADER); present_bicubic_fragment = CreateProgram(HostShaders::PRESENT_BICUBIC_FRAG, GL_FRAGMENT_SHADER); @@ -287,6 +291,8 @@ void RendererOpenGL::InitOpenGLObjects() { // Clear screen to black LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture); + + fxaa_framebuffer.Create(); } void RendererOpenGL::AddTelemetryFields() { @@ -338,14 +344,83 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, texture.resource.Release(); texture.resource.Create(GL_TEXTURE_2D); glTextureStorage2D(texture.resource.handle, 1, internal_format, texture.width, texture.height); + fxaa_texture.Release(); + fxaa_texture.Create(GL_TEXTURE_2D); + glTextureStorage2D(fxaa_texture.handle, 1, GL_RGBA16F, texture.width, texture.height); + glNamedFramebufferTexture(fxaa_framebuffer.handle, GL_COLOR_ATTACHMENT0, fxaa_texture.handle, + 0); } void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { + // TODO: Signal state tracker about these changes + state_tracker.NotifyScreenDrawVertexArray(); + state_tracker.NotifyPolygonModes(); + state_tracker.NotifyViewport0(); + state_tracker.NotifyScissor0(); + state_tracker.NotifyColorMask(0); + state_tracker.NotifyBlend0(); + state_tracker.NotifyFramebuffer(); + state_tracker.NotifyFrontFace(); + state_tracker.NotifyCullTest(); + state_tracker.NotifyDepthTest(); + state_tracker.NotifyStencilTest(); + state_tracker.NotifyPolygonOffset(); + state_tracker.NotifyRasterizeEnable(); + state_tracker.NotifyFramebufferSRGB(); + state_tracker.NotifyLogicOp(); + state_tracker.NotifyClipControl(); + state_tracker.NotifyAlphaTest(); + + state_tracker.ClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE); + // Update background color before drawing glClearColor(Settings::values.bg_red.GetValue() / 255.0f, Settings::values.bg_green.GetValue() / 255.0f, Settings::values.bg_blue.GetValue() / 255.0f, 1.0f); + glEnable(GL_CULL_FACE); + glDisable(GL_COLOR_LOGIC_OP); + glDisable(GL_DEPTH_TEST); + glDisable(GL_STENCIL_TEST); + glDisable(GL_POLYGON_OFFSET_FILL); + glDisable(GL_RASTERIZER_DISCARD); + glDisable(GL_ALPHA_TEST); + glDisablei(GL_BLEND, 0); + glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); + glCullFace(GL_BACK); + glFrontFace(GL_CW); + glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); + + glBindTextureUnit(0, screen_info.display_texture); + + if (Settings::values.anti_aliasing.GetValue() == Settings::AntiAliasing::Fxaa) { + program_manager.BindPresentPrograms(fxaa_vertex.handle, fxaa_fragment.handle); + + glEnablei(GL_SCISSOR_TEST, 0); + glScissorIndexed(0, 0, 0, + framebuffer_crop_rect.GetWidth() != 0 ? framebuffer_crop_rect.GetWidth() + : screen_info.texture.width, + framebuffer_crop_rect.GetHeight() != 0 ? framebuffer_crop_rect.GetHeight() + : screen_info.texture.height); + glViewportIndexedf(0, 0.0f, 0.0f, static_cast(screen_info.texture.width), + static_cast(screen_info.texture.height)); + glDepthRangeIndexed(0, 0.0, 0.0); + + glBindSampler(0, present_sampler.handle); + GLint old_read_fb; + GLint old_draw_fb; + glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &old_read_fb); + glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &old_draw_fb); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, fxaa_framebuffer.handle); + + glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); + + glBindFramebuffer(GL_READ_FRAMEBUFFER, old_read_fb); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, old_draw_fb); + + glBindTextureUnit(0, fxaa_texture.handle); + } + // Set projection matrix const std::array ortho_matrix = MakeOrthographicMatrix(static_cast(layout.width), static_cast(layout.height)); @@ -422,47 +497,14 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { }; glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), std::data(vertices)); - // TODO: Signal state tracker about these changes - state_tracker.NotifyScreenDrawVertexArray(); - state_tracker.NotifyPolygonModes(); - state_tracker.NotifyViewport0(); - state_tracker.NotifyScissor0(); - state_tracker.NotifyColorMask(0); - state_tracker.NotifyBlend0(); - state_tracker.NotifyFramebuffer(); - state_tracker.NotifyFrontFace(); - state_tracker.NotifyCullTest(); - state_tracker.NotifyDepthTest(); - state_tracker.NotifyStencilTest(); - state_tracker.NotifyPolygonOffset(); - state_tracker.NotifyRasterizeEnable(); - state_tracker.NotifyFramebufferSRGB(); - state_tracker.NotifyLogicOp(); - state_tracker.NotifyClipControl(); - state_tracker.NotifyAlphaTest(); - - state_tracker.ClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE); - glEnable(GL_CULL_FACE); if (screen_info.display_srgb) { glEnable(GL_FRAMEBUFFER_SRGB); } else { glDisable(GL_FRAMEBUFFER_SRGB); } - glDisable(GL_COLOR_LOGIC_OP); - glDisable(GL_DEPTH_TEST); - glDisable(GL_STENCIL_TEST); - glDisable(GL_POLYGON_OFFSET_FILL); - glDisable(GL_RASTERIZER_DISCARD); - glDisable(GL_ALPHA_TEST); - glDisablei(GL_BLEND, 0); glDisablei(GL_SCISSOR_TEST, 0); - glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); - glCullFace(GL_BACK); - glFrontFace(GL_CW); - glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); glViewportIndexedf(0, 0.0f, 0.0f, static_cast(layout.width), static_cast(layout.height)); - glDepthRangeIndexed(0, 0.0, 0.0); glEnableVertexAttribArray(PositionLocation); glEnableVertexAttribArray(TexCoordLocation); @@ -482,7 +524,6 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex)); } - glBindTextureUnit(0, screen_info.display_texture); if (Settings::values.scaling_filter.GetValue() != Settings::ScalingFilter::NearestNeighbor) { glBindSampler(0, present_sampler.handle); } else { diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 62a746e41..f6c66f804 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -111,6 +111,8 @@ private: OGLSampler present_sampler; OGLSampler present_sampler_nn; OGLBuffer vertex_buffer; + OGLProgram fxaa_vertex; + OGLProgram fxaa_fragment; OGLProgram present_vertex; OGLProgram present_bilinear_fragment; OGLProgram present_bicubic_fragment; @@ -123,6 +125,8 @@ private: /// Display information for Switch screen ScreenInfo screen_info; + OGLTexture fxaa_texture; + OGLFramebuffer fxaa_framebuffer; /// OpenGL framebuffer data std::vector gl_framebuffer_data; From 056894f07ae4c1ef295fefb1e8f120964f2e04b4 Mon Sep 17 00:00:00 2001 From: Marshall Mohror Date: Wed, 20 Oct 2021 21:40:02 -0500 Subject: [PATCH 125/156] OpenGL: fix FXAA with scaling --- .../renderer_opengl/renderer_opengl.cpp | 39 ++++++++++++++----- .../renderer_opengl/renderer_opengl.h | 1 + 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index dbe66a1b6..e63f0bdd8 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -213,7 +213,9 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf framebuffer_crop_rect = framebuffer.crop_rect; const VAddr framebuffer_addr{framebuffer.address + framebuffer.offset}; - if (rasterizer.AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride)) { + screen_info.was_accelerated = + rasterizer.AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride); + if (screen_info.was_accelerated) { return; } @@ -346,7 +348,9 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, glTextureStorage2D(texture.resource.handle, 1, internal_format, texture.width, texture.height); fxaa_texture.Release(); fxaa_texture.Create(GL_TEXTURE_2D); - glTextureStorage2D(fxaa_texture.handle, 1, GL_RGBA16F, texture.width, texture.height); + glTextureStorage2D(fxaa_texture.handle, 1, GL_RGBA16F, + Settings::values.resolution_info.ScaleUp(screen_info.texture.width), + Settings::values.resolution_info.ScaleUp(screen_info.texture.height)); glNamedFramebufferTexture(fxaa_framebuffer.handle, GL_COLOR_ATTACHMENT0, fxaa_texture.handle, 0); } @@ -397,13 +401,25 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { program_manager.BindPresentPrograms(fxaa_vertex.handle, fxaa_fragment.handle); glEnablei(GL_SCISSOR_TEST, 0); - glScissorIndexed(0, 0, 0, - framebuffer_crop_rect.GetWidth() != 0 ? framebuffer_crop_rect.GetWidth() - : screen_info.texture.width, - framebuffer_crop_rect.GetHeight() != 0 ? framebuffer_crop_rect.GetHeight() - : screen_info.texture.height); - glViewportIndexedf(0, 0.0f, 0.0f, static_cast(screen_info.texture.width), - static_cast(screen_info.texture.height)); + auto viewport_width = screen_info.texture.width; + auto scissor_width = framebuffer_crop_rect.GetWidth(); + if (scissor_width <= 0) { + scissor_width = viewport_width; + } + auto viewport_height = screen_info.texture.height; + auto scissor_height = framebuffer_crop_rect.GetHeight(); + if (scissor_height <= 0) { + scissor_height = viewport_height; + } + if (screen_info.was_accelerated) { + viewport_width = Settings::values.resolution_info.ScaleUp(viewport_width); + scissor_width = Settings::values.resolution_info.ScaleUp(scissor_width); + viewport_height = Settings::values.resolution_info.ScaleUp(viewport_height); + scissor_height = Settings::values.resolution_info.ScaleUp(scissor_height); + } + glScissorIndexed(0, 0, 0, scissor_width, scissor_height); + glViewportIndexedf(0, 0.0f, 0.0f, static_cast(viewport_width), + static_cast(viewport_height)); glDepthRangeIndexed(0, 0.0, 0.0); glBindSampler(0, present_sampler.handle); @@ -487,6 +503,11 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { scale_v = static_cast(framebuffer_crop_rect.GetHeight()) / static_cast(screen_info.texture.height); } + if (Settings::values.anti_aliasing.GetValue() == Settings::AntiAliasing::Fxaa && + !screen_info.was_accelerated) { + scale_u /= Settings::values.resolution_info.up_factor; + scale_v /= Settings::values.resolution_info.up_factor; + } const auto& screen = layout.screen; const std::array vertices = { diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index f6c66f804..cda333cad 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -50,6 +50,7 @@ struct TextureInfo { /// Structure used for storing information about the display target for the Switch screen struct ScreenInfo { GLuint display_texture{}; + bool was_accelerated = false; bool display_srgb{}; const Common::Rectangle display_texcoords{0.0f, 0.0f, 1.0f, 1.0f}; TextureInfo texture; From e6f1ed08fb1f11d86bb4cb7c03d83a9f443d6c12 Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Fri, 22 Oct 2021 19:22:34 +0200 Subject: [PATCH 126/156] Vulkan: Implement FXAA --- src/video_core/host_shaders/fxaa.vert | 2 +- .../renderer_vulkan/vk_blit_screen.cpp | 390 +++++++++++++++++- .../renderer_vulkan/vk_blit_screen.h | 17 + 3 files changed, 387 insertions(+), 22 deletions(-) diff --git a/src/video_core/host_shaders/fxaa.vert b/src/video_core/host_shaders/fxaa.vert index 715fce462..01d5ff4df 100644 --- a/src/video_core/host_shaders/fxaa.vert +++ b/src/video_core/host_shaders/fxaa.vert @@ -15,7 +15,7 @@ layout (location = 0) out vec4 posPos; #ifdef VULKAN -#define BINDING_COLOR_TEXTURE 1 +#define BINDING_COLOR_TEXTURE 0 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 0d6bce214..c0abcc17a 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -17,6 +17,8 @@ #include "core/frontend/emu_window.h" #include "core/memory.h" #include "video_core/gpu.h" +#include "video_core/host_shaders/fxaa_frag_spv.h" +#include "video_core/host_shaders/fxaa_vert_spv.h" #include "video_core/host_shaders/present_bicubic_frag_spv.h" #include "video_core/host_shaders/present_gaussian_frag_spv.h" #include "video_core/host_shaders/present_scaleforce_frag_spv.h" @@ -149,15 +151,9 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, scheduler.Wait(resource_ticks[image_index]); resource_ticks[image_index] = scheduler.CurrentTick(); - const VkImageView source_image_view = + VkImageView source_image_view = use_accelerated ? screen_info.image_view : *raw_image_views[image_index]; - if (!fsr) { - const bool is_nn = - Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::NearestNeighbor; - UpdateDescriptorSet(image_index, source_image_view, is_nn); - } - BufferData data; SetUniformData(data, layout); SetVertexData(data, framebuffer, layout); @@ -239,6 +235,68 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, }); } + const auto anti_alias_pass = Settings::values.anti_aliasing.GetValue(); + if (use_accelerated && anti_alias_pass != Settings::AntiAliasing::None) { + UpdateAADescriptorSet(image_index, source_image_view, false); + const u32 up_scale = Settings::values.resolution_info.up_scale; + const u32 down_shift = Settings::values.resolution_info.down_shift; + VkExtent2D size{ + .width = (up_scale * framebuffer.width) >> down_shift, + .height = (up_scale * framebuffer.height) >> down_shift, + }; + source_image_view = *aa_image_view; + scheduler.Record([this, image_index, size, anti_alias_pass](vk::CommandBuffer cmdbuf) { + const f32 bg_red = Settings::values.bg_red.GetValue() / 255.0f; + const f32 bg_green = Settings::values.bg_green.GetValue() / 255.0f; + const f32 bg_blue = Settings::values.bg_blue.GetValue() / 255.0f; + const VkClearValue clear_color{ + .color = {.float32 = {bg_red, bg_green, bg_blue, 1.0f}}, + }; + const VkRenderPassBeginInfo renderpass_bi{ + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .pNext = nullptr, + .renderPass = *aa_renderpass, + .framebuffer = *aa_framebuffer, + .renderArea = + { + .offset = {0, 0}, + .extent = size, + }, + .clearValueCount = 1, + .pClearValues = &clear_color, + }; + const VkViewport viewport{ + .x = 0.0f, + .y = 0.0f, + .width = static_cast(size.width), + .height = static_cast(size.height), + .minDepth = 0.0f, + .maxDepth = 1.0f, + }; + const VkRect2D scissor{ + .offset = {0, 0}, + .extent = size, + }; + cmdbuf.BeginRenderPass(renderpass_bi, VK_SUBPASS_CONTENTS_INLINE); + switch (anti_alias_pass) { + case Settings::AntiAliasing::Fxaa: + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *aa_pipeline); + break; + default: + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *aa_pipeline); + break; + } + cmdbuf.SetViewport(0, viewport); + cmdbuf.SetScissor(0, scissor); + + cmdbuf.BindVertexBuffer(0, *buffer, offsetof(BufferData, vertices)); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, *aa_pipeline_layout, 0, + aa_descriptor_sets[image_index], {}); + cmdbuf.Draw(4, 1, 0, 0); + cmdbuf.EndRenderPass(); + }); + } + if (fsr) { auto crop_rect = framebuffer.crop_rect; if (crop_rect.GetWidth() == 0) { @@ -251,6 +309,10 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, VkImageView fsr_image_view = fsr->Draw(scheduler, image_index, source_image_view, crop_rect); UpdateDescriptorSet(image_index, fsr_image_view, true); + } else { + const bool is_nn = + Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::NearestNeighbor; + UpdateDescriptorSet(image_index, source_image_view, is_nn); } scheduler.Record( @@ -329,11 +391,16 @@ VkSemaphore VKBlitScreen::DrawToSwapchain(const Tegra::FramebufferConfig& frameb } vk::Framebuffer VKBlitScreen::CreateFramebuffer(const VkImageView& image_view, VkExtent2D extent) { + return CreateFramebuffer(image_view, extent, renderpass); +} + +vk::Framebuffer VKBlitScreen::CreateFramebuffer(const VkImageView& image_view, VkExtent2D extent, + vk::RenderPass& rd) { return device.GetLogical().CreateFramebuffer(VkFramebufferCreateInfo{ .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, - .renderPass = *renderpass, + .renderPass = *rd, .attachmentCount = 1, .pAttachments = &image_view, .width = extent.width, @@ -390,6 +457,8 @@ void VKBlitScreen::RefreshResources(const Tegra::FramebufferConfig& framebuffer) void VKBlitScreen::CreateShaders() { vertex_shader = BuildShader(device, VULKAN_PRESENT_VERT_SPV); + fxaa_vertex_shader = BuildShader(device, FXAA_VERT_SPV); + fxaa_fragment_shader = BuildShader(device, FXAA_FRAG_SPV); bilinear_fragment_shader = BuildShader(device, VULKAN_PRESENT_FRAG_SPV); bicubic_fragment_shader = BuildShader(device, PRESENT_BICUBIC_FRAG_SPV); gaussian_fragment_shader = BuildShader(device, PRESENT_GAUSSIAN_FRAG_SPV); @@ -413,6 +482,13 @@ void VKBlitScreen::CreateDescriptorPool() { }, }}; + const std::array pool_sizes_aa{{ + { + .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .descriptorCount = static_cast(2 * image_count), + }, + }}; + const VkDescriptorPoolCreateInfo ci{ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, .pNext = nullptr, @@ -422,19 +498,33 @@ void VKBlitScreen::CreateDescriptorPool() { .pPoolSizes = pool_sizes.data(), }; descriptor_pool = device.GetLogical().CreateDescriptorPool(ci); + + const VkDescriptorPoolCreateInfo ci_aa{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, + .maxSets = static_cast(image_count), + .poolSizeCount = static_cast(pool_sizes_aa.size()), + .pPoolSizes = pool_sizes_aa.data(), + }; + aa_descriptor_pool = device.GetLogical().CreateDescriptorPool(ci_aa); } void VKBlitScreen::CreateRenderPass() { + renderpass = CreateRenderPassImpl(swapchain.GetImageViewFormat()); +} + +vk::RenderPass VKBlitScreen::CreateRenderPassImpl(VkFormat format, bool is_present) { const VkAttachmentDescription color_attachment{ .flags = 0, - .format = swapchain.GetImageViewFormat(), + .format = format, .samples = VK_SAMPLE_COUNT_1_BIT, .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR, .storeOp = VK_ATTACHMENT_STORE_OP_STORE, .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE, .stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE, .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, - .finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, + .finalLayout = is_present ? VK_IMAGE_LAYOUT_PRESENT_SRC_KHR : VK_IMAGE_LAYOUT_GENERAL, }; const VkAttachmentReference color_attachment_ref{ @@ -477,7 +567,7 @@ void VKBlitScreen::CreateRenderPass() { .pDependencies = &dependency, }; - renderpass = device.GetLogical().CreateRenderPass(renderpass_ci); + return device.GetLogical().CreateRenderPass(renderpass_ci); } void VKBlitScreen::CreateDescriptorSetLayout() { @@ -498,6 +588,23 @@ void VKBlitScreen::CreateDescriptorSetLayout() { }, }}; + const std::array layout_bindings_aa{{ + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_VERTEX_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT, + .pImmutableSamplers = nullptr, + }, + }}; + const VkDescriptorSetLayoutCreateInfo ci{ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, .pNext = nullptr, @@ -506,11 +613,21 @@ void VKBlitScreen::CreateDescriptorSetLayout() { .pBindings = layout_bindings.data(), }; + const VkDescriptorSetLayoutCreateInfo ci_aa{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .bindingCount = static_cast(layout_bindings_aa.size()), + .pBindings = layout_bindings_aa.data(), + }; + descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout(ci); + aa_descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout(ci_aa); } void VKBlitScreen::CreateDescriptorSets() { const std::vector layouts(image_count, *descriptor_set_layout); + const std::vector layouts_aa(image_count, *aa_descriptor_set_layout); const VkDescriptorSetAllocateInfo ai{ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, @@ -520,7 +637,16 @@ void VKBlitScreen::CreateDescriptorSets() { .pSetLayouts = layouts.data(), }; + const VkDescriptorSetAllocateInfo ai_aa{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .pNext = nullptr, + .descriptorPool = *aa_descriptor_pool, + .descriptorSetCount = static_cast(image_count), + .pSetLayouts = layouts_aa.data(), + }; + descriptor_sets = descriptor_pool.Allocate(ai); + aa_descriptor_sets = aa_descriptor_pool.Allocate(ai_aa); } void VKBlitScreen::CreatePipelineLayout() { @@ -533,7 +659,17 @@ void VKBlitScreen::CreatePipelineLayout() { .pushConstantRangeCount = 0, .pPushConstantRanges = nullptr, }; + const VkPipelineLayoutCreateInfo ci_aa{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .setLayoutCount = 1, + .pSetLayouts = aa_descriptor_set_layout.address(), + .pushConstantRangeCount = 0, + .pPushConstantRanges = nullptr, + }; pipeline_layout = device.GetLogical().CreatePipelineLayout(ci); + aa_pipeline_layout = device.GetLogical().CreatePipelineLayout(ci_aa); } void VKBlitScreen::CreateGraphicsPipeline() { @@ -862,7 +998,7 @@ void VKBlitScreen::CreateFramebuffers() { for (std::size_t i = 0; i < image_count; ++i) { const VkImageView image_view{swapchain.GetImageViewIndex(i)}; - framebuffers[i] = CreateFramebuffer(image_view, size); + framebuffers[i] = CreateFramebuffer(image_view, size, renderpass); } } @@ -872,6 +1008,11 @@ void VKBlitScreen::ReleaseRawImages() { } raw_images.clear(); raw_buffer_commits.clear(); + + aa_image_view.reset(); + aa_image.reset(); + aa_commit = MemoryCommit{}; + buffer.reset(); buffer_commit = MemoryCommit{}; } @@ -898,8 +1039,11 @@ void VKBlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) raw_image_views.resize(image_count); raw_buffer_commits.resize(image_count); - for (size_t i = 0; i < image_count; ++i) { - raw_images[i] = device.GetLogical().CreateImage(VkImageCreateInfo{ + const auto create_image = [&](bool used_on_framebuffer = false, u32 up_scale = 1, + u32 down_shift = 0) { + u32 extra_usages = used_on_framebuffer ? VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT + : VK_IMAGE_USAGE_TRANSFER_DST_BIT; + return device.GetLogical().CreateImage(VkImageCreateInfo{ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -907,26 +1051,30 @@ void VKBlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) .format = GetFormat(framebuffer), .extent = { - .width = framebuffer.width, - .height = framebuffer.height, + .width = (up_scale * framebuffer.width) >> down_shift, + .height = (up_scale * framebuffer.height) >> down_shift, .depth = 1, }, .mipLevels = 1, .arrayLayers = 1, .samples = VK_SAMPLE_COUNT_1_BIT, - .tiling = VK_IMAGE_TILING_LINEAR, - .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, + .tiling = used_on_framebuffer ? VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR, + .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | extra_usages, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, }); - raw_buffer_commits[i] = memory_allocator.Commit(raw_images[i], MemoryUsage::DeviceLocal); - raw_image_views[i] = device.GetLogical().CreateImageView(VkImageViewCreateInfo{ + }; + const auto create_commit = [&](vk::Image& image) { + return memory_allocator.Commit(image, MemoryUsage::DeviceLocal); + }; + const auto create_image_view = [&](vk::Image& image) { + return device.GetLogical().CreateImageView(VkImageViewCreateInfo{ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, .pNext = nullptr, .flags = 0, - .image = *raw_images[i], + .image = *image, .viewType = VK_IMAGE_VIEW_TYPE_2D, .format = GetFormat(framebuffer), .components = @@ -945,7 +1093,207 @@ void VKBlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) .layerCount = 1, }, }); + }; + + for (size_t i = 0; i < image_count; ++i) { + raw_images[i] = create_image(); + raw_buffer_commits[i] = create_commit(raw_images[i]); + raw_image_views[i] = create_image_view(raw_images[i]); } + + // AA Resources + const u32 up_scale = Settings::values.resolution_info.up_scale; + const u32 down_shift = Settings::values.resolution_info.down_shift; + aa_image = create_image(true, up_scale, down_shift); + aa_commit = create_commit(aa_image); + aa_image_view = create_image_view(aa_image); + VkExtent2D size{ + .width = (up_scale * framebuffer.width) >> down_shift, + .height = (up_scale * framebuffer.height) >> down_shift, + }; + if (aa_renderpass) { + aa_framebuffer = CreateFramebuffer(*aa_image_view, size, aa_renderpass); + return; + } + aa_renderpass = CreateRenderPassImpl(GetFormat(framebuffer), false); + aa_framebuffer = CreateFramebuffer(*aa_image_view, size, aa_renderpass); + + const std::array fxaa_shader_stages{{ + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = *fxaa_vertex_shader, + .pName = "main", + .pSpecializationInfo = nullptr, + }, + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = *fxaa_fragment_shader, + .pName = "main", + .pSpecializationInfo = nullptr, + }, + }}; + + const auto vertex_binding_description = ScreenRectVertex::GetDescription(); + const auto vertex_attrs_description = ScreenRectVertex::GetAttributes(); + + const VkPipelineVertexInputStateCreateInfo vertex_input_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = &vertex_binding_description, + .vertexAttributeDescriptionCount = u32{vertex_attrs_description.size()}, + .pVertexAttributeDescriptions = vertex_attrs_description.data(), + }; + + const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, + .primitiveRestartEnable = VK_FALSE, + }; + + const VkPipelineViewportStateCreateInfo viewport_state_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .viewportCount = 1, + .pViewports = nullptr, + .scissorCount = 1, + .pScissors = nullptr, + }; + + const VkPipelineRasterizationStateCreateInfo rasterization_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .depthClampEnable = VK_FALSE, + .rasterizerDiscardEnable = VK_FALSE, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .frontFace = VK_FRONT_FACE_CLOCKWISE, + .depthBiasEnable = VK_FALSE, + .depthBiasConstantFactor = 0.0f, + .depthBiasClamp = 0.0f, + .depthBiasSlopeFactor = 0.0f, + .lineWidth = 1.0f, + }; + + const VkPipelineMultisampleStateCreateInfo multisampling_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + .sampleShadingEnable = VK_FALSE, + .minSampleShading = 0.0f, + .pSampleMask = nullptr, + .alphaToCoverageEnable = VK_FALSE, + .alphaToOneEnable = VK_FALSE, + }; + + const VkPipelineColorBlendAttachmentState color_blend_attachment{ + .blendEnable = VK_FALSE, + .srcColorBlendFactor = VK_BLEND_FACTOR_ZERO, + .dstColorBlendFactor = VK_BLEND_FACTOR_ZERO, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .alphaBlendOp = VK_BLEND_OP_ADD, + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT, + }; + + const VkPipelineColorBlendStateCreateInfo color_blend_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .logicOpEnable = VK_FALSE, + .logicOp = VK_LOGIC_OP_COPY, + .attachmentCount = 1, + .pAttachments = &color_blend_attachment, + .blendConstants = {0.0f, 0.0f, 0.0f, 0.0f}, + }; + + static constexpr std::array dynamic_states{ + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, + }; + const VkPipelineDynamicStateCreateInfo dynamic_state_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .dynamicStateCount = static_cast(dynamic_states.size()), + .pDynamicStates = dynamic_states.data(), + }; + + const VkGraphicsPipelineCreateInfo fxaa_pipeline_ci{ + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stageCount = static_cast(fxaa_shader_stages.size()), + .pStages = fxaa_shader_stages.data(), + .pVertexInputState = &vertex_input_ci, + .pInputAssemblyState = &input_assembly_ci, + .pTessellationState = nullptr, + .pViewportState = &viewport_state_ci, + .pRasterizationState = &rasterization_ci, + .pMultisampleState = &multisampling_ci, + .pDepthStencilState = nullptr, + .pColorBlendState = &color_blend_ci, + .pDynamicState = &dynamic_state_ci, + .layout = *aa_pipeline_layout, + .renderPass = *aa_renderpass, + .subpass = 0, + .basePipelineHandle = 0, + .basePipelineIndex = 0, + }; + + // AA + aa_pipeline = device.GetLogical().CreateGraphicsPipeline(fxaa_pipeline_ci); +} + +void VKBlitScreen::UpdateAADescriptorSet(std::size_t image_index, VkImageView image_view, + bool nn) const { + const VkDescriptorImageInfo image_info{ + .sampler = nn ? *nn_sampler : *sampler, + .imageView = image_view, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + + const VkWriteDescriptorSet sampler_write{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .pNext = nullptr, + .dstSet = aa_descriptor_sets[image_index], + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .pImageInfo = &image_info, + .pBufferInfo = nullptr, + .pTexelBufferView = nullptr, + }; + + const VkWriteDescriptorSet sampler_write_2{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .pNext = nullptr, + .dstSet = aa_descriptor_sets[image_index], + .dstBinding = 1, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .pImageInfo = &image_info, + .pBufferInfo = nullptr, + .pTexelBufferView = nullptr, + }; + + device.GetLogical().UpdateDescriptorSets(std::array{sampler_write, sampler_write_2}, {}); } void VKBlitScreen::UpdateDescriptorSet(std::size_t image_index, VkImageView image_view, diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index 96a5598ad..e8737537e 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -68,6 +68,9 @@ public: [[nodiscard]] vk::Framebuffer CreateFramebuffer(const VkImageView& image_view, VkExtent2D extent); + [[nodiscard]] vk::Framebuffer CreateFramebuffer(const VkImageView& image_view, + VkExtent2D extent, vk::RenderPass& rd); + private: struct BufferData; @@ -76,6 +79,7 @@ private: void CreateSemaphores(); void CreateDescriptorPool(); void CreateRenderPass(); + vk::RenderPass CreateRenderPassImpl(VkFormat, bool is_present = true); void CreateDescriptorSetLayout(); void CreateDescriptorSets(); void CreatePipelineLayout(); @@ -91,6 +95,7 @@ private: void CreateRawImages(const Tegra::FramebufferConfig& framebuffer); void UpdateDescriptorSet(std::size_t image_index, VkImageView image_view, bool nn) const; + void UpdateAADescriptorSet(std::size_t image_index, VkImageView image_view, bool nn) const; void SetUniformData(BufferData& data, const Layout::FramebufferLayout layout) const; void SetVertexData(BufferData& data, const Tegra::FramebufferConfig& framebuffer, const Layout::FramebufferLayout layout) const; @@ -109,6 +114,8 @@ private: const VKScreenInfo& screen_info; vk::ShaderModule vertex_shader; + vk::ShaderModule fxaa_vertex_shader; + vk::ShaderModule fxaa_fragment_shader; vk::ShaderModule bilinear_fragment_shader; vk::ShaderModule bicubic_fragment_shader; vk::ShaderModule gaussian_fragment_shader; @@ -116,6 +123,7 @@ private: vk::DescriptorPool descriptor_pool; vk::DescriptorSetLayout descriptor_set_layout; vk::PipelineLayout pipeline_layout; + vk::Pipeline aa_pipeline; vk::Pipeline nearest_neightbor_pipeline; vk::Pipeline bilinear_pipeline; vk::Pipeline bicubic_pipeline; @@ -136,6 +144,15 @@ private: std::vector raw_images; std::vector raw_image_views; std::vector raw_buffer_commits; + vk::Image aa_image; + vk::ImageView aa_image_view; + MemoryCommit aa_commit; + vk::Framebuffer aa_framebuffer; + vk::RenderPass aa_renderpass; + vk::DescriptorSets aa_descriptor_sets; + vk::DescriptorPool aa_descriptor_pool; + vk::DescriptorSetLayout aa_descriptor_set_layout; + vk::PipelineLayout aa_pipeline_layout; u32 raw_width = 0; u32 raw_height = 0; From 6cdfaee7b4f3f68310e9cca755482e3e6993da10 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Thu, 21 Oct 2021 21:31:33 +0200 Subject: [PATCH 127/156] Texture Cache: Fix blitting. --- src/video_core/texture_cache/util.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index 9922aa0cc..ddc9fb13a 100644 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp @@ -1157,10 +1157,10 @@ void DeduceBlitImages(ImageInfo& dst_info, ImageInfo& src_info, const ImageBase* if (dst && GetFormatType(dst->info.format) != SurfaceType::ColorTexture) { dst_info.format = dst->info.format; } - if (!dst && src && GetFormatType(src->info.format) != SurfaceType::ColorTexture) { + if (src && GetFormatType(src->info.format) != SurfaceType::ColorTexture) { dst_info.format = src->info.format; } - if (!src && dst && GetFormatType(dst->info.format) != SurfaceType::ColorTexture) { + if (dst && GetFormatType(dst->info.format) != SurfaceType::ColorTexture) { src_info.format = dst->info.format; } } From 21a8ba0437989e4255b347c2b2cabbbc2a332fd3 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Fri, 22 Oct 2021 19:44:10 +0200 Subject: [PATCH 128/156] Vulkan: Fix FXAA in AMD. --- .../renderer_vulkan/vk_blit_screen.cpp | 42 ++++++++++++++++++- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index c0abcc17a..2bed4f3c5 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -244,8 +244,35 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, .width = (up_scale * framebuffer.width) >> down_shift, .height = (up_scale * framebuffer.height) >> down_shift, }; - source_image_view = *aa_image_view; scheduler.Record([this, image_index, size, anti_alias_pass](vk::CommandBuffer cmdbuf) { + const VkImageMemoryBarrier base_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = 0, + .dstAccessMask = 0, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = {}, + .subresourceRange = + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1, + }, + }; + + { + VkImageMemoryBarrier fsr_write_barrier = base_barrier; + fsr_write_barrier.image = *aa_image; + fsr_write_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, fsr_write_barrier); + } + const f32 bg_red = Settings::values.bg_red.GetValue() / 255.0f; const f32 bg_green = Settings::values.bg_green.GetValue() / 255.0f; const f32 bg_blue = Settings::values.bg_blue.GetValue() / 255.0f; @@ -294,7 +321,18 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, aa_descriptor_sets[image_index], {}); cmdbuf.Draw(4, 1, 0, 0); cmdbuf.EndRenderPass(); + + { + VkImageMemoryBarrier blit_read_barrier = base_barrier; + blit_read_barrier.image = *aa_image; + blit_read_barrier.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + blit_read_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT , + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, blit_read_barrier); + } }); + source_image_view = *aa_image_view; } if (fsr) { @@ -485,7 +523,7 @@ void VKBlitScreen::CreateDescriptorPool() { const std::array pool_sizes_aa{{ { .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - .descriptorCount = static_cast(2 * image_count), + .descriptorCount = static_cast(image_count * 2), }, }}; From a96c9c803be9aca0b9775c37c1e77e13cca56c80 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Fri, 22 Oct 2021 22:56:08 +0200 Subject: [PATCH 129/156] Yuzu UI: Add button for Anti Alias --- src/common/settings.h | 1 + src/yuzu/main.cpp | 42 ++++++++++++++++++++++++++++++++++++++++++ src/yuzu/main.h | 2 ++ 3 files changed, 45 insertions(+) diff --git a/src/common/settings.h b/src/common/settings.h index ca1c3c1aa..c7610ef1c 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -76,6 +76,7 @@ enum class ScalingFilter : u32 { enum class AntiAliasing : u32 { None = 0, Fxaa = 1, + LastAA = Fxaa, }; struct ResolutionScalingInfo { diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index 379bd0b17..d057dc889 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -774,6 +774,26 @@ void GMainWindow::InitializeWidgets() { tas_label->setFocusPolicy(Qt::NoFocus); statusBar()->insertPermanentWidget(0, tas_label); + // setup AA button + aa_status_button = new QPushButton(); + aa_status_button->setObjectName(QStringLiteral("TogglableStatusBarButton")); + aa_status_button->setFocusPolicy(Qt::NoFocus); + connect(aa_status_button, &QPushButton::clicked, [&] { + auto aa_mode = Settings::values.anti_aliasing.GetValue(); + if (aa_mode == Settings::AntiAliasing::LastAA) { + aa_mode = Settings::AntiAliasing::None; + } else { + aa_mode = static_cast(static_cast(aa_mode) + 1); + } + Settings::values.anti_aliasing.SetValue(aa_mode); + aa_status_button->setChecked(true); + UpdateAAText(); + }); + UpdateAAText(); + aa_status_button->setCheckable(true); + aa_status_button->setChecked(true); + statusBar()->insertPermanentWidget(0, aa_status_button); + // Setup Filter button filter_status_button = new QPushButton(); filter_status_button->setObjectName(QStringLiteral("TogglableStatusBarButton")); @@ -800,6 +820,7 @@ void GMainWindow::InitializeWidgets() { } UpdateFilterText(); filter_status_button->setCheckable(true); + filter_status_button->setChecked(true); statusBar()->insertPermanentWidget(0, filter_status_button); // Setup Dock button @@ -872,6 +893,11 @@ void GMainWindow::InitializeWidgets() { Settings::values.renderer_backend.SetValue(Settings::RendererBackend::Vulkan); } else { Settings::values.renderer_backend.SetValue(Settings::RendererBackend::OpenGL); + const auto filter = Settings::values.scaling_filter.GetValue(); + if (filter == Settings::ScalingFilter::Fsr) { + Settings::values.scaling_filter.SetValue(Settings::ScalingFilter::NearestNeighbor); + UpdateFilterText(); + } } system->ApplySettings(); @@ -3088,12 +3114,28 @@ void GMainWindow::UpdateFilterText() { } } +void GMainWindow::UpdateAAText() { + const auto aa_mode = Settings::values.anti_aliasing.GetValue(); + switch (aa_mode) { + case Settings::AntiAliasing::Fxaa: + aa_status_button->setText(tr("FXAA")); + break; + case Settings::AntiAliasing::None: + aa_status_button->setText(tr("NO AA")); + break; + default: + aa_status_button->setText(tr("FXAA")); + break; + } +} + void GMainWindow::UpdateStatusButtons() { dock_status_button->setChecked(Settings::values.use_docked_mode.GetValue()); renderer_status_button->setChecked(Settings::values.renderer_backend.GetValue() == Settings::RendererBackend::Vulkan); UpdateGPUAccuracyButton(); UpdateFilterText(); + UpdateAAText(); } void GMainWindow::UpdateUISettings() { diff --git a/src/yuzu/main.h b/src/yuzu/main.h index d4d2f3d58..24633ff2d 100644 --- a/src/yuzu/main.h +++ b/src/yuzu/main.h @@ -303,6 +303,7 @@ private: void UpdateWindowTitle(std::string_view title_name = {}, std::string_view title_version = {}, std::string_view gpu_vendor = {}); void UpdateFilterText(); + void UpdateAAText(); void UpdateStatusBar(); void UpdateGPUAccuracyButton(); void UpdateStatusButtons(); @@ -338,6 +339,7 @@ private: QPushButton* renderer_status_button = nullptr; QPushButton* dock_status_button = nullptr; QPushButton* filter_status_button = nullptr; + QPushButton* aa_status_button = nullptr; QTimer status_bar_update_timer; std::unique_ptr config; From 99547d2656ee8e84b684794fa8e013b146f15284 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 23 Oct 2021 00:23:50 +0200 Subject: [PATCH 130/156] HostShader: Fix gaussian and add attribution. --- .../host_shaders/present_gaussian.frag | 42 +++++++++---------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/src/video_core/host_shaders/present_gaussian.frag b/src/video_core/host_shaders/present_gaussian.frag index d5e2b1781..a9558548f 100644 --- a/src/video_core/host_shaders/present_gaussian.frag +++ b/src/video_core/host_shaders/present_gaussian.frag @@ -2,6 +2,10 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +// Code obtained from this 2 sources: +// - https://learnopengl.com/Advanced-Lighting/Bloom +// - https://www.rastergrid.com/blog/2010/09/efficient-gaussian-blur-with-linear-sampling/ + #version 460 core #ifdef VULKAN @@ -14,50 +18,40 @@ #endif -layout (location = 0) in vec2 frag_tex_coord; +layout(location = 0) in vec2 frag_tex_coord; -layout (location = 0) out vec4 color; +layout(location = 0) out vec4 color; -layout (binding = BINDING_COLOR_TEXTURE) uniform sampler2D color_texture; +layout(binding = BINDING_COLOR_TEXTURE) uniform sampler2D color_texture; const float offset[3] = float[](0.0, 1.3846153846, 3.2307692308); const float weight[3] = float[](0.2270270270, 0.3162162162, 0.0702702703); vec4 blurVertical(sampler2D textureSampler, vec2 coord, vec2 norm) { vec4 result = vec4(0.0f); - for (int i=1; i<3; i++) { - result += - texture(textureSampler, vec2(coord) + (vec2(0.0, offset[i]) * norm)) - * weight[i]; - result += - texture(textureSampler, vec2(coord) - (vec2(0.0, offset[i]) * norm)) - * weight[i]; + for (int i = 1; i < 3; i++) { + result += texture(textureSampler, vec2(coord) + (vec2(0.0, offset[i]) * norm)) * weight[i]; + result += texture(textureSampler, vec2(coord) - (vec2(0.0, offset[i]) * norm)) * weight[i]; } return result; } vec4 blurHorizontal(sampler2D textureSampler, vec2 coord, vec2 norm) { vec4 result = vec4(0.0f); - for (int i=1; i<3; i++) { - result += - texture(textureSampler, vec2(coord) + (vec2(offset[i], 0.0) * norm)) - * weight[i]; - result += - texture(textureSampler, vec2(coord) - (vec2(offset[i], 0.0) * norm)) - * weight[i]; + for (int i = 1; i < 3; i++) { + result += texture(textureSampler, vec2(coord) + (vec2(offset[i], 0.0) * norm)) * weight[i]; + result += texture(textureSampler, vec2(coord) - (vec2(offset[i], 0.0) * norm)) * weight[i]; } return result; } vec4 blurDiagonal(sampler2D textureSampler, vec2 coord, vec2 norm) { vec4 result = vec4(0.0f); - for (int i=1; i<3; i++) { + for (int i = 1; i < 3; i++) { result += - texture(textureSampler, vec2(coord) + (vec2(offset[i], offset[i]) * norm)) - * weight[i]; + texture(textureSampler, vec2(coord) + (vec2(offset[i], offset[i]) * norm)) * weight[i]; result += - texture(textureSampler, vec2(coord) - (vec2(offset[i], offset[i]) * norm)) - * weight[i]; + texture(textureSampler, vec2(coord) - (vec2(offset[i], offset[i]) * norm)) * weight[i]; } return result; } @@ -65,10 +59,12 @@ vec4 blurDiagonal(sampler2D textureSampler, vec2 coord, vec2 norm) { void main() { vec3 base = texture(color_texture, vec2(frag_tex_coord)).rgb * weight[0]; vec2 tex_offset = 1.0f / textureSize(color_texture, 0); + + // TODO(Blinkhawk): This code can be optimized through shader group instructions. vec3 horizontal = blurHorizontal(color_texture, frag_tex_coord, tex_offset).rgb; vec3 vertical = blurVertical(color_texture, frag_tex_coord, tex_offset).rgb; vec3 diagonalA = blurVertical(color_texture, frag_tex_coord, tex_offset).rgb; - vec3 diagonalB = blurVertical(color_texture, frag_tex_coord, -tex_offset).rgb; + vec3 diagonalB = blurVertical(color_texture, frag_tex_coord, tex_offset * vec2(1.0, -1.0)).rgb; vec3 combination = mix(mix(horizontal, vertical, 0.5f), mix(diagonalA, diagonalB, 0.5f), 0.5f); color = vec4(combination + base, 1.0f); } From c5dbd93adb0566f0b2b09657b4340cc3da59d703 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 23 Oct 2021 00:25:19 +0200 Subject: [PATCH 131/156] VulkanBufferCache: Avoid adding barriers between multiple copies. --- src/video_core/buffer_cache/buffer_cache.h | 4 +- .../renderer_vulkan/vk_buffer_cache.cpp | 38 +++++++++++++++++-- .../renderer_vulkan/vk_buffer_cache.h | 6 ++- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index d350c9b36..43bed63ac 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -853,12 +853,14 @@ void BufferCache

::CommitAsyncFlushesHigh() { } if constexpr (USE_MEMORY_MAPS) { auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); + runtime.PreCopyBarrier(); for (auto& [copy, buffer_id] : downloads) { // Have in mind the staging buffer offset for the copy copy.dst_offset += download_staging.offset; const std::array copies{copy}; - runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies); + runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies, false); } + runtime.PostCopyBarrier(); runtime.Finish(); for (const auto& [copy, buffer_id] : downloads) { const Buffer& buffer = slot_buffers[buffer_id]; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 8ac58bc2f..5ffd93499 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -146,7 +146,7 @@ void BufferCacheRuntime::Finish() { } void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer, - std::span copies) { + std::span copies, bool barrier) { static constexpr VkMemoryBarrier READ_BARRIER{ .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, .pNext = nullptr, @@ -163,10 +163,42 @@ void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer, boost::container::small_vector vk_copies(copies.size()); std::ranges::transform(copies, vk_copies.begin(), MakeBufferCopy); scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) { + scheduler.Record([src_buffer, dst_buffer, vk_copies, barrier](vk::CommandBuffer cmdbuf) { + if (barrier) { + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, 0, READ_BARRIER); + } + cmdbuf.CopyBuffer(src_buffer, dst_buffer, vk_copies); + if (barrier) { + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER); + } + }); +} + +void BufferCacheRuntime::PreCopyBarrier() { + static constexpr VkMemoryBarrier READ_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, + }; + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([](vk::CommandBuffer cmdbuf) { cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, READ_BARRIER); - cmdbuf.CopyBuffer(src_buffer, dst_buffer, vk_copies); + }); +} + +void BufferCacheRuntime::PostCopyBarrier() { + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, + }; + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([](vk::CommandBuffer cmdbuf) { cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER); }); diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index c27402ff0..1ee0d8420 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -69,8 +69,12 @@ public: [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size); + void PreCopyBarrier(); + void CopyBuffer(VkBuffer src_buffer, VkBuffer dst_buffer, - std::span copies); + std::span copies, bool barrier = true); + + void PostCopyBarrier(); void ClearBuffer(VkBuffer dest_buffer, u32 offset, size_t size, u32 value); From 5c6fa8893589fd70bc743c0d0b77c0c375b24bd3 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 23 Oct 2021 01:52:34 +0200 Subject: [PATCH 132/156] OpenGlTextureCache: Fix state invalidation on rescaling. --- src/video_core/renderer_opengl/gl_texture_cache.cpp | 11 +++++++++++ src/video_core/renderer_opengl/gl_texture_cache.h | 4 ++++ src/video_core/texture_cache/texture_cache.h | 4 ++-- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index a6e9eb60b..6e7f66ef0 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -942,10 +942,21 @@ bool Image::Scale(bool up_scale) { dst_info.size.height = scaled_height; upscaled_backup = MakeImage(dst_info, gl_internal_format); } + auto& state_tracker = runtime->GetStateTracker(); + state_tracker.NotifyViewport0(); + state_tracker.NotifyScissor0(); // TODO (ameerj): Investigate other GL states that affect blitting. GLboolean scissor_test; glGetBooleani_v(GL_SCISSOR_TEST, 0, &scissor_test); glDisablei(GL_SCISSOR_TEST, 0); + if (up_scale) { + glViewportIndexedf(0, 0.0f, 0.0f, static_cast(scaled_width), + static_cast(scaled_height)); + } else { + glViewportIndexedf(0, 0.0f, 0.0f, static_cast(original_width), + static_cast(original_height)); + } + const GLuint read_fbo = runtime->rescale_read_fbos[fbo_index].handle; const GLuint draw_fbo = runtime->rescale_draw_fbos[fbo_index].handle; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index eeb5133d5..8161e6b72 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -116,6 +116,10 @@ public: void TickFrame() {} + StateTracker& GetStateTracker() { + return state_tracker; + } + private: struct StagingBuffers { explicit StagingBuffers(GLenum storage_flags_, GLenum map_flags_); diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index f1254ef62..dd9553806 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -1810,8 +1810,8 @@ void TextureCache

::BindRenderTarget(ImageViewId* old_id, ImageViewId new_id) if (*old_id == new_id) { return; } - if (*old_id) { - const ImageViewBase& old_view = slot_image_views[*old_id]; + if (new_id) { + const ImageViewBase& old_view = slot_image_views[new_id]; if (True(old_view.flags & ImageViewFlagBits::PreemtiveDownload)) { uncommitted_downloads.push_back(old_view.image_id); } From 172d4f1e3b08901548ae1d2e7e476f97e8032585 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Fri, 22 Oct 2021 22:11:23 -0400 Subject: [PATCH 133/156] gl_texture_cache: Simplify scaling procedures --- .../renderer_opengl/gl_texture_cache.cpp | 83 ++++++------------- .../renderer_opengl/gl_texture_cache.h | 2 +- 2 files changed, 28 insertions(+), 57 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 6e7f66ef0..6841b5450 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -883,7 +883,7 @@ void Image::CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t b } } -bool Image::Scale(bool up_scale) { +void Image::Scale(bool up_scale) { const auto format_type = GetFormatType(info.format); const GLenum attachment = [format_type] { switch (format_type) { @@ -942,77 +942,54 @@ bool Image::Scale(bool up_scale) { dst_info.size.height = scaled_height; upscaled_backup = MakeImage(dst_info, gl_internal_format); } - auto& state_tracker = runtime->GetStateTracker(); - state_tracker.NotifyViewport0(); - state_tracker.NotifyScissor0(); - // TODO (ameerj): Investigate other GL states that affect blitting. - GLboolean scissor_test; - glGetBooleani_v(GL_SCISSOR_TEST, 0, &scissor_test); - glDisablei(GL_SCISSOR_TEST, 0); - if (up_scale) { - glViewportIndexedf(0, 0.0f, 0.0f, static_cast(scaled_width), - static_cast(scaled_height)); - } else { - glViewportIndexedf(0, 0.0f, 0.0f, static_cast(original_width), - static_cast(original_height)); - } + const u32 src_width = up_scale ? original_width : scaled_width; + const u32 src_height = up_scale ? original_height : scaled_height; + const u32 dst_width = up_scale ? scaled_width : original_width; + const u32 dst_height = up_scale ? scaled_height : original_height; + const auto src_handle = up_scale ? texture.handle : upscaled_backup.handle; + const auto dst_handle = up_scale ? upscaled_backup.handle : texture.handle; + // TODO (ameerj): Investigate other GL states that affect blitting. + glDisablei(GL_SCISSOR_TEST, 0); + glViewportIndexedf(0, 0.0f, 0.0f, static_cast(dst_width), + static_cast(dst_height)); const GLuint read_fbo = runtime->rescale_read_fbos[fbo_index].handle; const GLuint draw_fbo = runtime->rescale_draw_fbos[fbo_index].handle; for (s32 layer = 0; layer < info.resources.layers; ++layer) { for (s32 level = 0; level < info.resources.levels; ++level) { - const u32 src_level_width = - std::max(1u, (up_scale ? original_width : scaled_width) >> level); - const u32 src_level_height = - std::max(1u, (up_scale ? original_height : scaled_height) >> level); - const u32 dst_level_width = - std::max(1u, (up_scale ? scaled_width : original_width) >> level); - const u32 dst_level_height = - std::max(1u, (up_scale ? scaled_height : original_height) >> level); + const u32 src_level_width = std::max(1u, src_width >> level); + const u32 src_level_height = std::max(1u, src_height >> level); + const u32 dst_level_width = std::max(1u, dst_width >> level); + const u32 dst_level_height = std::max(1u, dst_height >> level); - if (up_scale) { - glNamedFramebufferTextureLayer(read_fbo, attachment, texture.handle, level, layer); - glNamedFramebufferTextureLayer(draw_fbo, attachment, upscaled_backup.handle, level, - layer); - } else { - glNamedFramebufferTextureLayer(read_fbo, attachment, upscaled_backup.handle, level, - layer); - glNamedFramebufferTextureLayer(draw_fbo, attachment, texture.handle, level, layer); - } + glNamedFramebufferTextureLayer(read_fbo, attachment, src_handle, level, layer); + glNamedFramebufferTextureLayer(draw_fbo, attachment, dst_handle, level, layer); glBlitNamedFramebuffer(read_fbo, draw_fbo, 0, 0, src_level_width, src_level_height, 0, 0, dst_level_width, dst_level_height, mask, filter); } } - if (scissor_test != GL_FALSE) { - glEnablei(GL_SCISSOR_TEST, 0); - } - if (up_scale) { - current_texture = upscaled_backup.handle; - } else { - current_texture = texture.handle; - } - - return true; + current_texture = dst_handle; + auto& state_tracker = runtime->GetStateTracker(); + state_tracker.NotifyViewport0(); + state_tracker.NotifyScissor0(); } bool Image::ScaleUp(bool ignore) { if (True(flags & ImageFlagBits::Rescaled)) { return false; } - flags |= ImageFlagBits::Rescaled; - if (!runtime->resolution.active) { - return false; - } if (gl_format == 0 && gl_type == 0) { // compressed textures - flags &= ~ImageFlagBits::Rescaled; return false; } if (info.type == ImageType::Linear) { UNREACHABLE(); - flags &= ~ImageFlagBits::Rescaled; + return false; + } + flags |= ImageFlagBits::Rescaled; + if (!runtime->resolution.active) { return false; } has_scaled = true; @@ -1020,10 +997,7 @@ bool Image::ScaleUp(bool ignore) { current_texture = upscaled_backup.handle; return true; } - if (!Scale()) { - flags &= ~ImageFlagBits::Rescaled; - return false; - } + Scale(true); return true; } @@ -1039,10 +1013,7 @@ bool Image::ScaleDown(bool ignore) { current_texture = texture.handle; return true; } - if (!Scale(false)) { - flags &= ~ImageFlagBits::Rescaled; - return false; - } + Scale(false); return true; } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 8161e6b72..c51a7428d 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -209,7 +209,7 @@ private: void CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset); - bool Scale(bool up_scale = true); + void Scale(bool up_scale); OGLTexture texture; OGLTexture upscaled_backup; From 93c9eb196f4444495f50220c3e6b89d2f0b582db Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Fri, 22 Oct 2021 22:59:30 -0400 Subject: [PATCH 134/156] gl_rasterizer: Fix ScissorTest and Clear when scaling --- src/video_core/renderer_opengl/gl_rasterizer.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index bb24a0656..4df8a684a 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -184,6 +184,9 @@ void RasterizerOpenGL::Clear() { SyncRasterizeEnable(); SyncStencilTestState(); + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.UpdateRenderTargets(true); + state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); if (regs.clear_flags.scissor) { SyncScissorTest(); } else { @@ -192,10 +195,6 @@ void RasterizerOpenGL::Clear() { } UNIMPLEMENTED_IF(regs.clear_flags.viewport); - std::scoped_lock lock{texture_cache.mutex}; - texture_cache.UpdateRenderTargets(true); - state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); - if (use_color) { glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color); } @@ -925,12 +924,9 @@ void RasterizerOpenGL::SyncScissorTest() { const auto& regs = maxwell3d.regs; const auto& resolution = Settings::values.resolution_info; - const auto scale_up = [resolution](u32 value) -> u32 { - if (value == 0) { - return 0U; - } - const u32 converted_value = (value * resolution.up_scale) >> resolution.down_shift; - return std::max(converted_value, 1U); + const bool is_rescaling{texture_cache.IsRescaling()}; + const auto scale_up = [resolution, is_rescaling](u32 value) { + return is_rescaling ? resolution.ScaleUp(value) : value; }; for (std::size_t index = 0; index < Maxwell::NumViewports; ++index) { if (!force && !flags[Dirty::Scissor0 + index]) { From 282a4501d9b8c3e68e1c4545778097888caa7a88 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Fri, 22 Oct 2021 23:46:21 -0400 Subject: [PATCH 135/156] vk_texture_cache: Refactor 3D scaling helpers --- .../renderer_vulkan/vk_texture_cache.cpp | 187 +++++++----------- .../renderer_vulkan/vk_texture_cache.h | 2 + 2 files changed, 75 insertions(+), 114 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 413d472cd..85a1d520b 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -593,13 +593,16 @@ struct RangedBarrierRange { void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, const ImageInfo& info, VkImageAspectFlags aspect_mask, const Settings::ResolutionScalingInfo& resolution, - bool is_bilinear, bool up_scaling = true) { + bool up_scaling = true) { const bool is_2d = info.type == ImageType::e2D; const auto resources = info.resources; const VkExtent2D extent{ .width = info.size.width, .height = info.size.height, }; + // Depth and integer formats must use NEAREST filter for blits. + const bool is_color{aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT}; + const bool is_bilinear{is_color && !IsPixelFormatInteger(info.format)}; const VkFilter vk_filter = is_bilinear ? VK_FILTER_LINEAR : VK_FILTER_NEAREST; scheduler.RequestOutsideRenderPassOperationContext(); @@ -1144,10 +1147,10 @@ bool Image::ScaleUp(bool ignore) { } has_scaled = true; const auto& device = runtime->device; - const bool is_2d = info.type == ImageType::e2D; - const u32 scaled_width = resolution.ScaleUp(info.size.width); - const u32 scaled_height = is_2d ? resolution.ScaleUp(info.size.height) : info.size.height; if (!scaled_image) { + const bool is_2d = info.type == ImageType::e2D; + const u32 scaled_width = resolution.ScaleUp(info.size.width); + const u32 scaled_height = is_2d ? resolution.ScaleUp(info.size.height) : info.size.height; auto scaled_info = info; scaled_info.size.width = scaled_width; scaled_info.size.height = scaled_height; @@ -1168,61 +1171,10 @@ bool Image::ScaleUp(bool ignore) { const PixelFormat format = StorageFormat(info.format); const auto vk_format = MaxwellToVK::SurfaceFormat(device, OPTIMAL_FORMAT, false, format).format; const auto blit_usage = VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; - const bool is_color{aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT}; - const bool is_bilinear{is_color && !IsPixelFormatInteger(info.format)}; if (device.IsFormatSupported(vk_format, blit_usage, OPTIMAL_FORMAT)) { - BlitScale(*scheduler, *original_image, *scaled_image, info, aspect_mask, resolution, - device.IsFormatSupported(vk_format, - VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT, - OPTIMAL_FORMAT)); + BlitScale(*scheduler, *original_image, *scaled_image, info, aspect_mask, resolution); } else { - using namespace VideoCommon; - static constexpr auto BLIT_OPERATION = Tegra::Engines::Fermi2D::Operation::SrcCopy; - const auto operation = is_bilinear ? Tegra::Engines::Fermi2D::Filter::Bilinear - : Tegra::Engines::Fermi2D::Filter::Point; - - if (!scale_view) { - const auto view_info = ImageViewInfo(ImageViewType::e2D, info.format); - scale_view = std::make_unique(*runtime, view_info, NULL_IMAGE_ID, *this); - } - auto* view_ptr = scale_view.get(); - - const Region2D src_region{ - .start = {0, 0}, - .end = {static_cast(info.size.width), static_cast(info.size.height)}, - }; - const Region2D dst_region{ - .start = {0, 0}, - .end = {static_cast(scaled_width), static_cast(scaled_height)}, - }; - const VkExtent2D extent{ - .width = scaled_width, - .height = scaled_height, - }; - if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT) { - if (!scale_framebuffer) { - scale_framebuffer = - std::make_unique(*runtime, view_ptr, nullptr, extent); - } - const auto color_view = scale_view->Handle(Shader::TextureType::Color2D); - - runtime->blit_image_helper.BlitColor(scale_framebuffer.get(), color_view, dst_region, - src_region, operation, BLIT_OPERATION); - } else if (!runtime->device.IsBlitDepthStencilSupported() && - aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { - if (!scale_framebuffer) { - scale_framebuffer = - std::make_unique(*runtime, nullptr, view_ptr, extent); - } - runtime->blit_image_helper.BlitDepthStencil( - scale_framebuffer.get(), scale_view->DepthView(), scale_view->StencilView(), - dst_region, src_region, operation, BLIT_OPERATION); - } else { - // TODO: Use helper blits where applicable - flags &= ~ImageFlagBits::Rescaled; - LOG_ERROR(Render_Vulkan, "Device does not support scaling format {}", format); - return false; - } + return BlitScaleHelper(true); } return true; } @@ -1231,82 +1183,89 @@ bool Image::ScaleDown(bool ignore) { if (False(flags & ImageFlagBits::Rescaled)) { return false; } + ASSERT(info.type != ImageType::Linear); flags &= ~ImageFlagBits::Rescaled; const auto& resolution = runtime->resolution; if (!resolution.active) { return false; } + current_image = *original_image; if (ignore) { - current_image = *original_image; return true; } - const auto& device = runtime->device; - const bool is_2d = info.type == ImageType::e2D; - const u32 scaled_width = resolution.ScaleUp(info.size.width); - const u32 scaled_height = is_2d ? resolution.ScaleUp(info.size.height) : info.size.height; if (aspect_mask == 0) { aspect_mask = ImageAspectMask(info.format); } static constexpr auto OPTIMAL_FORMAT = FormatType::Optimal; const PixelFormat format = StorageFormat(info.format); + const auto& device = runtime->device; const auto vk_format = MaxwellToVK::SurfaceFormat(device, OPTIMAL_FORMAT, false, format).format; const auto blit_usage = VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; + if (device.IsFormatSupported(vk_format, blit_usage, OPTIMAL_FORMAT)) { + BlitScale(*scheduler, *scaled_image, *original_image, info, aspect_mask, resolution, false); + } else { + return BlitScaleHelper(false); + } + return true; +} + +bool Image::BlitScaleHelper(bool scale_up) { + using namespace VideoCommon; + static constexpr auto BLIT_OPERATION = Tegra::Engines::Fermi2D::Operation::SrcCopy; const bool is_color{aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT}; const bool is_bilinear{is_color && !IsPixelFormatInteger(info.format)}; - if (device.IsFormatSupported(vk_format, blit_usage, OPTIMAL_FORMAT)) { - BlitScale(*scheduler, *scaled_image, *original_image, info, aspect_mask, resolution, - is_bilinear, false); - } else { - using namespace VideoCommon; - static constexpr auto BLIT_OPERATION = Tegra::Engines::Fermi2D::Operation::SrcCopy; - const auto operation = is_bilinear ? Tegra::Engines::Fermi2D::Filter::Bilinear - : Tegra::Engines::Fermi2D::Filter::Point; + const auto operation = is_bilinear ? Tegra::Engines::Fermi2D::Filter::Bilinear + : Tegra::Engines::Fermi2D::Filter::Point; - if (!normal_view) { - const auto view_info = ImageViewInfo(ImageViewType::e2D, info.format); - normal_view = std::make_unique(*runtime, view_info, NULL_IMAGE_ID, *this); - } - auto* view_ptr = normal_view.get(); - - const Region2D src_region{ - .start = {0, 0}, - .end = {static_cast(scaled_width), static_cast(scaled_height)}, - }; - const Region2D dst_region{ - .start = {0, 0}, - .end = {static_cast(info.size.width), static_cast(info.size.height)}, - }; - const VkExtent2D extent{ - .width = scaled_width, - .height = scaled_height, - }; - if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT) { - if (!normal_framebuffer) { - normal_framebuffer = - std::make_unique(*runtime, view_ptr, nullptr, extent); - } - const auto color_view = normal_view->Handle(Shader::TextureType::Color2D); - - runtime->blit_image_helper.BlitColor(normal_framebuffer.get(), color_view, dst_region, - src_region, operation, BLIT_OPERATION); - } else if (!runtime->device.IsBlitDepthStencilSupported() && - aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { - if (!normal_framebuffer) { - normal_framebuffer = - std::make_unique(*runtime, nullptr, view_ptr, extent); - } - runtime->blit_image_helper.BlitDepthStencil( - normal_framebuffer.get(), normal_view->DepthView(), normal_view->StencilView(), - dst_region, src_region, operation, BLIT_OPERATION); - } else { - // TODO: Use helper blits where applicable - flags &= ~ImageFlagBits::Rescaled; - LOG_ERROR(Render_Vulkan, "Device does not support scaling format {}", format); - return false; - } + const bool is_2d = info.type == ImageType::e2D; + const auto& resolution = runtime->resolution; + const u32 scaled_width = resolution.ScaleUp(info.size.width); + const u32 scaled_height = is_2d ? resolution.ScaleUp(info.size.height) : info.size.height; + if (!scale_view) { + const auto view_info = ImageViewInfo(ImageViewType::e2D, info.format); + scale_view = std::make_unique(*runtime, view_info, NULL_IMAGE_ID, *this); + } + + const u32 src_width = scale_up ? info.size.width : scaled_width; + const u32 src_height = scale_up ? info.size.height : scaled_height; + const u32 dst_width = scale_up ? scaled_width : info.size.width; + const u32 dst_height = scale_up ? scaled_height : info.size.height; + const Region2D src_region{ + .start = {0, 0}, + .end = {static_cast(src_width), static_cast(src_height)}, + }; + const Region2D dst_region{ + .start = {0, 0}, + .end = {static_cast(dst_width), static_cast(dst_height)}, + }; + const VkExtent2D extent{ + .width = scaled_width, + .height = scaled_height, + }; + + auto* view_ptr = scale_view.get(); + if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT) { + if (!scale_framebuffer) { + scale_framebuffer = std::make_unique(*runtime, view_ptr, nullptr, extent); + } + const auto color_view = scale_view->Handle(Shader::TextureType::Color2D); + + runtime->blit_image_helper.BlitColor(scale_framebuffer.get(), color_view, dst_region, + src_region, operation, BLIT_OPERATION); + } else if (!runtime->device.IsBlitDepthStencilSupported() && + aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { + if (!scale_framebuffer) { + scale_framebuffer = std::make_unique(*runtime, nullptr, view_ptr, extent); + } + runtime->blit_image_helper.BlitDepthStencil( + scale_framebuffer.get(), scale_view->DepthView(), scale_view->StencilView(), dst_region, + src_region, operation, BLIT_OPERATION); + } else { + // TODO: Use helper blits where applicable + flags &= ~ImageFlagBits::Rescaled; + LOG_ERROR(Render_Vulkan, "Device does not support scaling format {}", info.format); + return false; } - ASSERT(info.type != ImageType::Linear); - current_image = *original_image; return true; } diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 8dbddfaf7..9d149d306 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -134,6 +134,8 @@ public: bool ScaleDown(bool ignore = false); private: + bool BlitScaleHelper(bool scale_up); + VKScheduler* scheduler{}; TextureCacheRuntime* runtime{}; From a39e867c73140955120700b170dbdb4773d32745 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 23 Oct 2021 00:02:19 -0400 Subject: [PATCH 136/156] renderer_vulkan/blit_image: Use generic color state on Depth to Color blits Fixes Bayonetta 2 on AMD --- src/video_core/renderer_vulkan/blit_image.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/video_core/renderer_vulkan/blit_image.cpp b/src/video_core/renderer_vulkan/blit_image.cpp index 239698423..b3884a4f5 100644 --- a/src/video_core/renderer_vulkan/blit_image.cpp +++ b/src/video_core/renderer_vulkan/blit_image.cpp @@ -569,7 +569,7 @@ VkPipeline BlitImageHelper::FindOrEmplaceDepthStencilPipeline(const BlitImagePip .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO, .pMultisampleState = &PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, .pDepthStencilState = &PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, - .pColorBlendState = &PIPELINE_COLOR_BLEND_STATE_EMPTY_CREATE_INFO, + .pColorBlendState = &PIPELINE_COLOR_BLEND_STATE_GENERIC_CREATE_INFO, .pDynamicState = &PIPELINE_DYNAMIC_STATE_CREATE_INFO, .layout = *two_textures_pipeline_layout, .renderPass = key.renderpass, From dcc5b4f6b005a2c89bb4e77bca4cfe8705734021 Mon Sep 17 00:00:00 2001 From: Marshall Mohror Date: Fri, 22 Oct 2021 23:09:29 -0500 Subject: [PATCH 137/156] Presentation: Only use FP16 in scaling shaders on supported devices in Vulkan --- externals/CMakeLists.txt | 3 + src/video_core/CMakeLists.txt | 1 + src/video_core/host_shaders/CMakeLists.txt | 10 +- .../host_shaders/fidelityfx_fsr.comp | 106 +++++++++--------- ...ce.frag => opengl_present_scaleforce.frag} | 84 ++++++++------ ...p => vulkan_fidelityfx_fsr_easu_fp16.comp} | 4 +- .../vulkan_fidelityfx_fsr_easu_fp32.comp | 10 ++ ...p => vulkan_fidelityfx_fsr_rcas_fp16.comp} | 4 +- .../vulkan_fidelityfx_fsr_rcas_fp32.comp | 10 ++ .../vulkan_present_scaleforce_fp16.frag | 7 ++ .../vulkan_present_scaleforce_fp32.frag | 5 + .../renderer_opengl/renderer_opengl.cpp | 5 +- .../renderer_vulkan/vk_blit_screen.cpp | 17 ++- src/video_core/renderer_vulkan/vk_fsr.cpp | 46 +++++--- src/video_core/renderer_vulkan/vk_fsr.h | 2 +- 15 files changed, 199 insertions(+), 115 deletions(-) rename src/video_core/host_shaders/{present_scaleforce.frag => opengl_present_scaleforce.frag} (56%) rename src/video_core/host_shaders/{vulkan_fidelityfx_fsr_easu.comp => vulkan_fidelityfx_fsr_easu_fp16.comp} (67%) create mode 100644 src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32.comp rename src/video_core/host_shaders/{vulkan_fidelityfx_fsr_rcas.comp => vulkan_fidelityfx_fsr_rcas_fp16.comp} (67%) create mode 100644 src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32.comp create mode 100644 src/video_core/host_shaders/vulkan_present_scaleforce_fp16.frag create mode 100644 src/video_core/host_shaders/vulkan_present_scaleforce_fp32.frag diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index a76a3d800..7ff2ccc24 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -25,6 +25,9 @@ if (ARCHITECTURE_x86_64) add_subdirectory(dynarmic) endif() +add_library(ffx-fsr INTERFACE) +target_include_directories(ffx-fsr INTERFACE FidelityFX-FSR/ffx-fsr) + # getopt if (MSVC) add_subdirectory(getopt) diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 91a30fef7..07b94dcc8 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -237,6 +237,7 @@ target_include_directories(video_core PRIVATE ${FFmpeg_INCLUDE_DIR}) target_link_libraries(video_core PRIVATE ${FFmpeg_LIBRARIES}) target_link_options(video_core PRIVATE ${FFmpeg_LDFLAGS}) +target_link_libraries(video_core PRIVATE ffx-fsr) add_dependencies(video_core host_shaders) target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE}) target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include) diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 6b5ea649a..d779a967a 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -18,16 +18,20 @@ set(SHADER_FILES opengl_copy_bc4.comp opengl_present.frag opengl_present.vert + opengl_present_scaleforce.frag pitch_unswizzle.comp - present_scaleforce.frag present_bicubic.frag present_gaussian.frag vulkan_blit_color_float.frag vulkan_blit_depth_stencil.frag - vulkan_fidelityfx_fsr_easu.comp - vulkan_fidelityfx_fsr_rcas.comp + vulkan_fidelityfx_fsr_easu_fp16.comp + vulkan_fidelityfx_fsr_easu_fp32.comp + vulkan_fidelityfx_fsr_rcas_fp16.comp + vulkan_fidelityfx_fsr_rcas_fp32.comp vulkan_present.frag vulkan_present.vert + vulkan_present_scaleforce_fp16.frag + vulkan_present_scaleforce_fp32.frag vulkan_quad_indexed.comp vulkan_uint8.comp ) diff --git a/src/video_core/host_shaders/fidelityfx_fsr.comp b/src/video_core/host_shaders/fidelityfx_fsr.comp index cbb601580..6b97f789d 100644 --- a/src/video_core/host_shaders/fidelityfx_fsr.comp +++ b/src/video_core/host_shaders/fidelityfx_fsr.comp @@ -28,80 +28,82 @@ // THE SOFTWARE. layout( push_constant ) uniform constants { - u32vec2 input_size; + uvec4 Const0; + uvec4 Const1; + uvec4 Const2; + uvec4 Const3; }; -uvec4 Const0; -uvec4 Const1; -uvec4 Const2; -uvec4 Const3; +layout(set=0,binding=0) uniform sampler2D InputTexture; +layout(set=0,binding=1,rgba16f) uniform image2D OutputTexture; #define A_GPU 1 #define A_GLSL 1 -#define A_HALF -#include "ffx_a.h" +#ifndef YUZU_USE_FP16 + #include "ffx_a.h" -f16vec4 LinearToSRGB(f16vec4 linear) { - bvec4 selector = greaterThan(linear, f16vec4(0.00313066844250063)); - f16vec4 low = linear * float16_t(12.92); - f16vec4 high = float16_t(1.055) * pow(linear, f16vec4(1 / 2.4)) - float16_t(0.055); - return mix(low, high, selector); -} + #if USE_EASU + #define FSR_EASU_F 1 + AF4 FsrEasuRF(AF2 p) { AF4 res = textureGather(InputTexture, p, 0); return res; } + AF4 FsrEasuGF(AF2 p) { AF4 res = textureGather(InputTexture, p, 1); return res; } + AF4 FsrEasuBF(AF2 p) { AF4 res = textureGather(InputTexture, p, 2); return res; } + #endif + #if USE_RCAS + #define FSR_RCAS_F 1 + AF4 FsrRcasLoadF(ASU2 p) { return texelFetch(InputTexture, ASU2(p), 0); } + void FsrRcasInputF(inout AF1 r, inout AF1 g, inout AF1 b) {} + #endif +#else + #define A_HALF + #include "ffx_a.h" -f16vec4 SRGBToLinear(f16vec4 srgb) { - bvec4 selector = greaterThan(srgb, f16vec4(0.0404482362771082)); - f16vec4 low = srgb * float16_t(1.0 / 12.92); - f16vec4 high = pow((srgb + float16_t(0.055)) * float16_t(1.0 / 1.055), f16vec4(2.4)); - return mix(low, high, selector); -} - -#if USE_EASU - #define FSR_EASU_H 1 - f16vec4 FsrEasuRH(vec2 p) { f16vec4 res = f16vec4(textureGather(InputTexture, p, 0)); return res; } - f16vec4 FsrEasuGH(vec2 p) { f16vec4 res = f16vec4(textureGather(InputTexture, p, 1)); return res; } - f16vec4 FsrEasuBH(vec2 p) { f16vec4 res = f16vec4(textureGather(InputTexture, p, 2)); return res; } -#endif -#if USE_RCAS - #define FSR_RCAS_H 1 - f16vec4 FsrRcasLoadH(ASW2 p) { return f16vec4(texelFetch(InputTexture, ASU2(p), 0)); } - void FsrRcasInputH(inout float16_t r, inout float16_t g, inout float16_t b) {} + #if USE_EASU + #define FSR_EASU_H 1 + AH4 FsrEasuRH(AF2 p) { AH4 res = AH4(textureGather(InputTexture, p, 0)); return res; } + AH4 FsrEasuGH(AF2 p) { AH4 res = AH4(textureGather(InputTexture, p, 1)); return res; } + AH4 FsrEasuBH(AF2 p) { AH4 res = AH4(textureGather(InputTexture, p, 2)); return res; } + #endif + #if USE_RCAS + #define FSR_RCAS_H 1 + AH4 FsrRcasLoadH(ASW2 p) { return AH4(texelFetch(InputTexture, ASU2(p), 0)); } + void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b){} + #endif #endif #include "ffx_fsr1.h" -void CurrFilter(u32vec2 pos) { - // For debugging +void CurrFilter(AU2 pos) { #if USE_BILINEAR - vec2 pp = (vec2(pos) * vec2_AU2(Const0.xy) + vec2_AU2(Const0.zw)) * vec2_AU2(Const1.xy) + vec2(0.5, -0.5) * vec2_AU2(Const1.zw); - imageStore(OutputTexture, ivec2(pos), textureLod(InputTexture, pp, 0.0)); + AF2 pp = (AF2(pos) * AF2_AU2(Const0.xy) + AF2_AU2(Const0.zw)) * AF2_AU2(Const1.xy) + AF2(0.5, -0.5) * AF2_AU2(Const1.zw); + imageStore(OutputTexture, ASU2(pos), textureLod(InputTexture, pp, 0.0)); #endif #if USE_EASU - f16vec3 c; - FsrEasuH(c, pos, Const0, Const1, Const2, Const3); - imageStore(OutputTexture, ivec2(pos), f16vec4(c, 1)); + #ifndef YUZU_USE_FP16 + AF3 c; + FsrEasuF(c, pos, Const0, Const1, Const2, Const3); + imageStore(OutputTexture, ASU2(pos), AF4(c, 1)); + #else + AH3 c; + FsrEasuH(c, pos, Const0, Const1, Const2, Const3); + imageStore(OutputTexture, ASU2(pos), AH4(c, 1)); + #endif #endif #if USE_RCAS - f16vec3 c; - FsrRcasH(c.r, c.g, c.b, pos, Const0); - imageStore(OutputTexture, ivec2(pos), f16vec4(c, 1)); + #ifndef YUZU_USE_FP16 + AF3 c; + FsrRcasF(c.r, c.g, c.b, pos, Const0); + imageStore(OutputTexture, ASU2(pos), AF4(c, 1)); + #else + AH3 c; + FsrRcasH(c.r, c.g, c.b, pos, Const0); + imageStore(OutputTexture, ASU2(pos), AH4(c, 1)); + #endif #endif - } layout(local_size_x=64) in; void main() { - -#if USE_EASU || USE_BILINEAR - vec2 ires = vec2(input_size); - vec2 tres = textureSize(InputTexture, 0); - vec2 ores = imageSize(OutputTexture); - FsrEasuCon(Const0, Const1, Const2, Const3, ires.x, ires.y, tres.x, tres.y, ores.x, ores.y); -#endif -#if USE_RCAS - FsrRcasCon(Const0, 0.25f); -#endif - // Do remapping of local xy in workgroup for a more PS-like swizzle pattern. AU2 gxy = ARmp8x8(gl_LocalInvocationID.x) + AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u); CurrFilter(gxy); diff --git a/src/video_core/host_shaders/present_scaleforce.frag b/src/video_core/host_shaders/opengl_present_scaleforce.frag similarity index 56% rename from src/video_core/host_shaders/present_scaleforce.frag rename to src/video_core/host_shaders/opengl_present_scaleforce.frag index ebc0d9b90..71ff9e1e3 100644 --- a/src/video_core/host_shaders/present_scaleforce.frag +++ b/src/video_core/host_shaders/opengl_present_scaleforce.frag @@ -22,11 +22,29 @@ // Adapted from https://github.com/BreadFish64/ScaleFish/tree/master/scaleforce -#version 460 +//! #version 460 + +#extension GL_ARB_separate_shader_objects : enable + +#ifdef YUZU_USE_FP16 #extension GL_AMD_gpu_shader_half_float : enable #extension GL_NV_gpu_shader5 : enable +#define lfloat float16_t +#define lvec2 f16vec2 +#define lvec3 f16vec3 +#define lvec4 f16vec4 + +#else + +#define lfloat float +#define lvec2 vec2 +#define lvec3 vec3 +#define lvec4 vec4 + +#endif + #ifdef VULKAN #define BINDING_COLOR_TEXTURE 1 @@ -45,25 +63,25 @@ layout (binding = BINDING_COLOR_TEXTURE) uniform sampler2D input_texture; const bool ignore_alpha = true; -float16_t ColorDist1(f16vec4 a, f16vec4 b) { +lfloat ColorDist1(lvec4 a, lvec4 b) { // https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.2020_conversion - const f16vec3 K = f16vec3(0.2627, 0.6780, 0.0593); - const float16_t scaleB = float16_t(0.5) / (float16_t(1.0) - K.b); - const float16_t scaleR = float16_t(0.5) / (float16_t(1.0) - K.r); - f16vec4 diff = a - b; - float16_t Y = dot(diff.rgb, K); - float16_t Cb = scaleB * (diff.b - Y); - float16_t Cr = scaleR * (diff.r - Y); - f16vec3 YCbCr = f16vec3(Y, Cb, Cr); - float16_t d = length(YCbCr); + const lvec3 K = lvec3(0.2627, 0.6780, 0.0593); + const lfloat scaleB = lfloat(0.5) / (lfloat(1.0) - K.b); + const lfloat scaleR = lfloat(0.5) / (lfloat(1.0) - K.r); + lvec4 diff = a - b; + lfloat Y = dot(diff.rgb, K); + lfloat Cb = scaleB * (diff.b - Y); + lfloat Cr = scaleR * (diff.r - Y); + lvec3 YCbCr = lvec3(Y, Cb, Cr); + lfloat d = length(YCbCr); if (ignore_alpha) { return d; } return sqrt(a.a * b.a * d * d + diff.a * diff.a); } -f16vec4 ColorDist(f16vec4 ref, f16vec4 A, f16vec4 B, f16vec4 C, f16vec4 D) { - return f16vec4( +lvec4 ColorDist(lvec4 ref, lvec4 A, lvec4 B, lvec4 C, lvec4 D) { + return lvec4( ColorDist1(ref, A), ColorDist1(ref, B), ColorDist1(ref, C), @@ -72,36 +90,36 @@ f16vec4 ColorDist(f16vec4 ref, f16vec4 A, f16vec4 B, f16vec4 C, f16vec4 D) { } vec4 Scaleforce(sampler2D tex, vec2 tex_coord) { - f16vec4 bl = f16vec4(textureOffset(tex, tex_coord, ivec2(-1, -1))); - f16vec4 bc = f16vec4(textureOffset(tex, tex_coord, ivec2(0, -1))); - f16vec4 br = f16vec4(textureOffset(tex, tex_coord, ivec2(1, -1))); - f16vec4 cl = f16vec4(textureOffset(tex, tex_coord, ivec2(-1, 0))); - f16vec4 cc = f16vec4(texture(tex, tex_coord)); - f16vec4 cr = f16vec4(textureOffset(tex, tex_coord, ivec2(1, 0))); - f16vec4 tl = f16vec4(textureOffset(tex, tex_coord, ivec2(-1, 1))); - f16vec4 tc = f16vec4(textureOffset(tex, tex_coord, ivec2(0, 1))); - f16vec4 tr = f16vec4(textureOffset(tex, tex_coord, ivec2(1, 1))); + lvec4 bl = lvec4(textureOffset(tex, tex_coord, ivec2(-1, -1))); + lvec4 bc = lvec4(textureOffset(tex, tex_coord, ivec2(0, -1))); + lvec4 br = lvec4(textureOffset(tex, tex_coord, ivec2(1, -1))); + lvec4 cl = lvec4(textureOffset(tex, tex_coord, ivec2(-1, 0))); + lvec4 cc = lvec4(texture(tex, tex_coord)); + lvec4 cr = lvec4(textureOffset(tex, tex_coord, ivec2(1, 0))); + lvec4 tl = lvec4(textureOffset(tex, tex_coord, ivec2(-1, 1))); + lvec4 tc = lvec4(textureOffset(tex, tex_coord, ivec2(0, 1))); + lvec4 tr = lvec4(textureOffset(tex, tex_coord, ivec2(1, 1))); - f16vec4 offset_tl = ColorDist(cc, tl, tc, tr, cr); - f16vec4 offset_br = ColorDist(cc, br, bc, bl, cl); + lvec4 offset_tl = ColorDist(cc, tl, tc, tr, cr); + lvec4 offset_br = ColorDist(cc, br, bc, bl, cl); // Calculate how different cc is from the texels around it - const float16_t plus_weight = float16_t(1.5); - const float16_t cross_weight = float16_t(1.5); - float16_t total_dist = dot(offset_tl + offset_br, f16vec4(cross_weight, plus_weight, cross_weight, plus_weight)); + const lfloat plus_weight = lfloat(1.5); + const lfloat cross_weight = lfloat(1.5); + lfloat total_dist = dot(offset_tl + offset_br, lvec4(cross_weight, plus_weight, cross_weight, plus_weight)); - if (total_dist == float16_t(0.0)) { + if (total_dist == lfloat(0.0)) { return cc; } else { // Add together all the distances with direction taken into account - f16vec4 tmp = offset_tl - offset_br; - f16vec2 total_offset = tmp.wy * plus_weight + (tmp.zz + f16vec2(-tmp.x, tmp.x)) * cross_weight; + lvec4 tmp = offset_tl - offset_br; + lvec2 total_offset = tmp.wy * plus_weight + (tmp.zz + lvec2(-tmp.x, tmp.x)) * cross_weight; // When the image has thin points, they tend to split apart. // This is because the texels all around are different and total_offset reaches into clear areas. // This works pretty well to keep the offset in bounds for these cases. - float16_t clamp_val = length(total_offset) / total_dist; - f16vec2 final_offset = clamp(total_offset, -clamp_val, clamp_val) / f16vec2(textureSize(tex, 0)); + lfloat clamp_val = length(total_offset) / total_dist; + vec2 final_offset = vec2(clamp(total_offset, -clamp_val, clamp_val)) / textureSize(tex, 0); return texture(tex, tex_coord - final_offset); } @@ -109,4 +127,4 @@ vec4 Scaleforce(sampler2D tex, vec2 tex_coord) { void main() { frag_color = Scaleforce(input_texture, tex_coord); -} \ No newline at end of file +} diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu.comp b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16.comp similarity index 67% rename from src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu.comp rename to src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16.comp index 6525eeeb5..1c96a7905 100644 --- a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu.comp +++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16.comp @@ -5,9 +5,7 @@ #version 460 core #extension GL_GOOGLE_include_directive : enable -layout(set=0,binding=0) uniform sampler2D InputTexture; -layout(set=0,binding=1,rgba16f) uniform image2D OutputTexture; - +#define YUZU_USE_FP16 #define USE_EASU 1 #include "fidelityfx_fsr.comp" diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32.comp b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32.comp new file mode 100644 index 000000000..f4daff739 --- /dev/null +++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32.comp @@ -0,0 +1,10 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 460 core +#extension GL_GOOGLE_include_directive : enable + +#define USE_EASU 1 + +#include "fidelityfx_fsr.comp" diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas.comp b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16.comp similarity index 67% rename from src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas.comp rename to src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16.comp index 9463ed842..6b6796dd1 100644 --- a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas.comp +++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16.comp @@ -5,9 +5,7 @@ #version 460 core #extension GL_GOOGLE_include_directive : enable -layout(set=0,binding=0) uniform sampler2D InputTexture; -layout(set=0,binding=1,rgba16f) uniform image2D OutputTexture; - +#define YUZU_USE_FP16 #define USE_RCAS 1 #include "fidelityfx_fsr.comp" diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32.comp b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32.comp new file mode 100644 index 000000000..f785eebf3 --- /dev/null +++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32.comp @@ -0,0 +1,10 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 460 core +#extension GL_GOOGLE_include_directive : enable + +#define USE_RCAS 1 + +#include "fidelityfx_fsr.comp" diff --git a/src/video_core/host_shaders/vulkan_present_scaleforce_fp16.frag b/src/video_core/host_shaders/vulkan_present_scaleforce_fp16.frag new file mode 100644 index 000000000..924c03060 --- /dev/null +++ b/src/video_core/host_shaders/vulkan_present_scaleforce_fp16.frag @@ -0,0 +1,7 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : enable + +#define YUZU_USE_FP16 + +#include "opengl_present_scaleforce.frag" diff --git a/src/video_core/host_shaders/vulkan_present_scaleforce_fp32.frag b/src/video_core/host_shaders/vulkan_present_scaleforce_fp32.frag new file mode 100644 index 000000000..a594b83ca --- /dev/null +++ b/src/video_core/host_shaders/vulkan_present_scaleforce_fp32.frag @@ -0,0 +1,5 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : enable + +#include "opengl_present_scaleforce.frag" diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index e63f0bdd8..28daacd82 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -24,10 +24,10 @@ #include "video_core/host_shaders/fxaa_frag.h" #include "video_core/host_shaders/fxaa_vert.h" #include "video_core/host_shaders/opengl_present_frag.h" +#include "video_core/host_shaders/opengl_present_scaleforce_frag.h" #include "video_core/host_shaders/opengl_present_vert.h" #include "video_core/host_shaders/present_bicubic_frag.h" #include "video_core/host_shaders/present_gaussian_frag.h" -#include "video_core/host_shaders/present_scaleforce_frag.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_shader_util.h" @@ -266,7 +266,8 @@ void RendererOpenGL::InitOpenGLObjects() { present_gaussian_fragment = CreateProgram(HostShaders::PRESENT_GAUSSIAN_FRAG, GL_FRAGMENT_SHADER); present_scaleforce_fragment = - CreateProgram(HostShaders::PRESENT_SCALEFORCE_FRAG, GL_FRAGMENT_SHADER); + CreateProgram(fmt::format("#version 460\n{}", HostShaders::OPENGL_PRESENT_SCALEFORCE_FRAG), + GL_FRAGMENT_SHADER); // Generate presentation sampler present_sampler.Create(); diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 2bed4f3c5..9dfc508bc 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -21,8 +21,9 @@ #include "video_core/host_shaders/fxaa_vert_spv.h" #include "video_core/host_shaders/present_bicubic_frag_spv.h" #include "video_core/host_shaders/present_gaussian_frag_spv.h" -#include "video_core/host_shaders/present_scaleforce_frag_spv.h" #include "video_core/host_shaders/vulkan_present_frag_spv.h" +#include "video_core/host_shaders/vulkan_present_scaleforce_fp16_frag_spv.h" +#include "video_core/host_shaders/vulkan_present_scaleforce_fp32_frag_spv.h" #include "video_core/host_shaders/vulkan_present_vert_spv.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" @@ -328,7 +329,7 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, blit_read_barrier.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; blit_read_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT , + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, blit_read_barrier); } }); @@ -344,8 +345,12 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, crop_rect.bottom = framebuffer.height; } crop_rect = crop_rect.Scale(Settings::values.resolution_info.up_factor); + VkExtent2D fsr_input_size{ + .width = Settings::values.resolution_info.ScaleUp(framebuffer.width), + .height = Settings::values.resolution_info.ScaleUp(framebuffer.height), + }; VkImageView fsr_image_view = - fsr->Draw(scheduler, image_index, source_image_view, crop_rect); + fsr->Draw(scheduler, image_index, source_image_view, fsr_input_size, crop_rect); UpdateDescriptorSet(image_index, fsr_image_view, true); } else { const bool is_nn = @@ -500,7 +505,11 @@ void VKBlitScreen::CreateShaders() { bilinear_fragment_shader = BuildShader(device, VULKAN_PRESENT_FRAG_SPV); bicubic_fragment_shader = BuildShader(device, PRESENT_BICUBIC_FRAG_SPV); gaussian_fragment_shader = BuildShader(device, PRESENT_GAUSSIAN_FRAG_SPV); - scaleforce_fragment_shader = BuildShader(device, PRESENT_SCALEFORCE_FRAG_SPV); + if (device.IsFloat16Supported()) { + scaleforce_fragment_shader = BuildShader(device, VULKAN_PRESENT_SCALEFORCE_FP16_FRAG_SPV); + } else { + scaleforce_fragment_shader = BuildShader(device, VULKAN_PRESENT_SCALEFORCE_FP32_FRAG_SPV); + } } void VKBlitScreen::CreateSemaphores() { diff --git a/src/video_core/renderer_vulkan/vk_fsr.cpp b/src/video_core/renderer_vulkan/vk_fsr.cpp index 1f60974be..9288aa7c2 100644 --- a/src/video_core/renderer_vulkan/vk_fsr.cpp +++ b/src/video_core/renderer_vulkan/vk_fsr.cpp @@ -4,13 +4,19 @@ #include "common/common_types.h" #include "common/div_ceil.h" -#include "video_core/host_shaders/vulkan_fidelityfx_fsr_easu_comp_spv.h" -#include "video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_comp_spv.h" +#include "video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16_comp_spv.h" +#include "video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32_comp_spv.h" +#include "video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16_comp_spv.h" +#include "video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32_comp_spv.h" #include "video_core/renderer_vulkan/vk_fsr.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_util.h" #include "video_core/vulkan_common/vulkan_device.h" +#define A_CPU +#include +#include + namespace Vulkan { FSR::FSR(const Device& device_, MemoryAllocator& memory_allocator_, size_t image_count_, @@ -29,11 +35,11 @@ FSR::FSR(const Device& device_, MemoryAllocator& memory_allocator_, size_t image } VkImageView FSR::Draw(VKScheduler& scheduler, size_t image_index, VkImageView image_view, - const Common::Rectangle& crop_rect) { + VkExtent2D input_image_extent, const Common::Rectangle& crop_rect) { UpdateDescriptorSet(image_index, image_view); - scheduler.Record([this, image_index, crop_rect](vk::CommandBuffer cmdbuf) { + scheduler.Record([this, image_index, input_image_extent, crop_rect](vk::CommandBuffer cmdbuf) { const VkImageMemoryBarrier base_barrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .pNext = nullptr, @@ -54,13 +60,18 @@ VkImageView FSR::Draw(VKScheduler& scheduler, size_t image_index, VkImageView im }, }; - // TODO: Support clear color cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *easu_pipeline); - cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, - VkExtent2D{ - .width = static_cast(crop_rect.GetWidth()), - .height = static_cast(crop_rect.GetHeight()), - }); + + std::array push_constants; + FsrEasuConOffset( + push_constants.data() + 0, push_constants.data() + 4, push_constants.data() + 8, + push_constants.data() + 12, + + static_cast(crop_rect.GetWidth()), static_cast(crop_rect.GetHeight()), + static_cast(input_image_extent.width), static_cast(input_image_extent.height), + static_cast(output_size.width), static_cast(output_size.height), + static_cast(crop_rect.left), static_cast(crop_rect.top)); + cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, push_constants); { VkImageMemoryBarrier fsr_write_barrier = base_barrier; @@ -77,7 +88,9 @@ VkImageView FSR::Draw(VKScheduler& scheduler, size_t image_index, VkImageView im Common::DivCeil(output_size.height, 16u), 1); cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *rcas_pipeline); - cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, output_size); + + FsrRcasCon(push_constants.data(), 0.25f); + cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, push_constants); { std::array barriers; @@ -247,7 +260,7 @@ void FSR::CreatePipelineLayout() { VkPushConstantRange push_const{ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .offset = 0, - .size = sizeof(std::array), + .size = sizeof(std::array), }; VkPipelineLayoutCreateInfo ci{ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, @@ -344,8 +357,13 @@ void FSR::CreateSampler() { } void FSR::CreateShaders() { - easu_shader = BuildShader(device, VULKAN_FIDELITYFX_FSR_EASU_COMP_SPV); - rcas_shader = BuildShader(device, VULKAN_FIDELITYFX_FSR_RCAS_COMP_SPV); + if (device.IsFloat16Supported()) { + easu_shader = BuildShader(device, VULKAN_FIDELITYFX_FSR_EASU_FP16_COMP_SPV); + rcas_shader = BuildShader(device, VULKAN_FIDELITYFX_FSR_RCAS_FP16_COMP_SPV); + } else { + easu_shader = BuildShader(device, VULKAN_FIDELITYFX_FSR_EASU_FP32_COMP_SPV); + rcas_shader = BuildShader(device, VULKAN_FIDELITYFX_FSR_RCAS_FP32_COMP_SPV); + } } void FSR::CreatePipeline() { diff --git a/src/video_core/renderer_vulkan/vk_fsr.h b/src/video_core/renderer_vulkan/vk_fsr.h index 8391e2e58..6bbec3d36 100644 --- a/src/video_core/renderer_vulkan/vk_fsr.h +++ b/src/video_core/renderer_vulkan/vk_fsr.h @@ -18,7 +18,7 @@ public: explicit FSR(const Device& device, MemoryAllocator& memory_allocator, size_t image_count, VkExtent2D output_size); VkImageView Draw(VKScheduler& scheduler, size_t image_index, VkImageView image_view, - const Common::Rectangle& crop_rect); + VkExtent2D input_image_extent, const Common::Rectangle& crop_rect); private: void CreateDescriptorPool(); From 47369faaabee9fa47208890ed2945e54f3251f8a Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 23 Oct 2021 00:15:19 -0400 Subject: [PATCH 138/156] vk_blit_screen: Fix AA destruction order --- .../renderer_vulkan/vk_blit_screen.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index e8737537e..ad0cd8ee1 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -34,12 +34,11 @@ namespace Vulkan { struct ScreenInfo; class Device; +class FSR; class RasterizerVulkan; class VKScheduler; class VKSwapchain; -class FSR; - struct VKScreenInfo { VkImageView image_view{}; u32 width{}; @@ -123,7 +122,6 @@ private: vk::DescriptorPool descriptor_pool; vk::DescriptorSetLayout descriptor_set_layout; vk::PipelineLayout pipeline_layout; - vk::Pipeline aa_pipeline; vk::Pipeline nearest_neightbor_pipeline; vk::Pipeline bilinear_pipeline; vk::Pipeline bicubic_pipeline; @@ -144,15 +142,18 @@ private: std::vector raw_images; std::vector raw_image_views; std::vector raw_buffer_commits; - vk::Image aa_image; - vk::ImageView aa_image_view; - MemoryCommit aa_commit; - vk::Framebuffer aa_framebuffer; - vk::RenderPass aa_renderpass; - vk::DescriptorSets aa_descriptor_sets; + vk::DescriptorPool aa_descriptor_pool; vk::DescriptorSetLayout aa_descriptor_set_layout; vk::PipelineLayout aa_pipeline_layout; + vk::Pipeline aa_pipeline; + vk::RenderPass aa_renderpass; + vk::Framebuffer aa_framebuffer; + vk::DescriptorSets aa_descriptor_sets; + vk::Image aa_image; + vk::ImageView aa_image_view; + MemoryCommit aa_commit; + u32 raw_width = 0; u32 raw_height = 0; From bb0367548591a66cd963b14373ed5b9c61adabb1 Mon Sep 17 00:00:00 2001 From: Marshall Mohror Date: Sat, 23 Oct 2021 00:11:12 -0500 Subject: [PATCH 139/156] Vulkan: Reimplement FSR constant generation functions to avoid GCC warnings --- externals/CMakeLists.txt | 3 - src/video_core/CMakeLists.txt | 1 - src/video_core/renderer_vulkan/vk_fsr.cpp | 153 ++++++++++++++++++++-- 3 files changed, 145 insertions(+), 12 deletions(-) diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index 7ff2ccc24..a76a3d800 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -25,9 +25,6 @@ if (ARCHITECTURE_x86_64) add_subdirectory(dynarmic) endif() -add_library(ffx-fsr INTERFACE) -target_include_directories(ffx-fsr INTERFACE FidelityFX-FSR/ffx-fsr) - # getopt if (MSVC) add_subdirectory(getopt) diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 07b94dcc8..91a30fef7 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -237,7 +237,6 @@ target_include_directories(video_core PRIVATE ${FFmpeg_INCLUDE_DIR}) target_link_libraries(video_core PRIVATE ${FFmpeg_LIBRARIES}) target_link_options(video_core PRIVATE ${FFmpeg_LDFLAGS}) -target_link_libraries(video_core PRIVATE ffx-fsr) add_dependencies(video_core host_shaders) target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE}) target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include) diff --git a/src/video_core/renderer_vulkan/vk_fsr.cpp b/src/video_core/renderer_vulkan/vk_fsr.cpp index 9288aa7c2..2feaa9d37 100644 --- a/src/video_core/renderer_vulkan/vk_fsr.cpp +++ b/src/video_core/renderer_vulkan/vk_fsr.cpp @@ -13,9 +13,146 @@ #include "video_core/renderer_vulkan/vk_shader_util.h" #include "video_core/vulkan_common/vulkan_device.h" -#define A_CPU -#include -#include +// Reimplementations of the constant generating functions in ffx_fsr1.h +// GCC generated a lot of warnings when using the official header. +static u32 AU1_AH1_AF1(f32 f) { + static constexpr u32 base[512]{ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, + 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000, 0x1400, 0x1800, 0x1c00, 0x2000, + 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, + 0x5000, 0x5400, 0x5800, 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, + 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, + 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, + 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, + 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, + 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, + 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, + 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, + 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, + 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, + 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, + 0x7bff, 0x7bff, 0x7bff, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, + 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, + 0x9800, 0x9c00, 0xa000, 0xa400, 0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, + 0xc400, 0xc800, 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, + 0xf000, 0xf400, 0xf800, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, + 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, + 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, + 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, + 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, + 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, + 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, + 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, + 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, + 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, + 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, + }; + static constexpr s8 shift[512]{ + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, + 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, + 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, + }; + auto u = std::bit_cast(f); + u32 i = u >> 23; + return base[i] + ((u & 0x7fffff) >> shift[i]); +} + +static u32 AU1_AH2_AF2(f32 a[2]) { + return AU1_AH1_AF1(a[0]) + (AU1_AH1_AF1(a[1]) << 16); +} + +static void FsrEasuCon(u32 con0[4], u32 con1[4], u32 con2[4], u32 con3[4], + f32 inputViewportInPixelsX, f32 inputViewportInPixelsY, + f32 inputSizeInPixelsX, f32 inputSizeInPixelsY, f32 outputSizeInPixelsX, + f32 outputSizeInPixelsY) { + con0[0] = std::bit_cast(inputViewportInPixelsX / outputSizeInPixelsX); + con0[1] = std::bit_cast(inputViewportInPixelsY / outputSizeInPixelsY); + con0[2] = std::bit_cast(0.5f * inputViewportInPixelsX / outputSizeInPixelsX - 0.5f); + con0[3] = std::bit_cast(0.5f * inputViewportInPixelsY / outputSizeInPixelsY - 0.5f); + con1[0] = std::bit_cast(1.0f / inputSizeInPixelsX); + con1[1] = std::bit_cast(1.0f / inputSizeInPixelsY); + con1[2] = std::bit_cast(1.0f / inputSizeInPixelsX); + con1[3] = std::bit_cast(-1.0f / inputSizeInPixelsY); + con2[0] = std::bit_cast(-1.0f / inputSizeInPixelsX); + con2[1] = std::bit_cast(2.0f / inputSizeInPixelsY); + con2[2] = std::bit_cast(1.0f / inputSizeInPixelsX); + con2[3] = std::bit_cast(2.0f / inputSizeInPixelsY); + con3[0] = std::bit_cast(0.0f / inputSizeInPixelsX); + con3[1] = std::bit_cast(4.0f / inputSizeInPixelsY); + con3[2] = con3[3] = 0; +} + +static void FsrEasuConOffset(u32 con0[4], u32 con1[4], u32 con2[4], u32 con3[4], + f32 inputViewportInPixelsX, f32 inputViewportInPixelsY, + f32 inputSizeInPixelsX, f32 inputSizeInPixelsY, + f32 outputSizeInPixelsX, f32 outputSizeInPixelsY, + f32 inputOffsetInPixelsX, f32 inputOffsetInPixelsY) { + FsrEasuCon(con0, con1, con2, con3, inputViewportInPixelsX, inputViewportInPixelsY, + inputSizeInPixelsX, inputSizeInPixelsY, outputSizeInPixelsX, outputSizeInPixelsY); + con0[2] = std::bit_cast(0.5f * inputViewportInPixelsX / outputSizeInPixelsX - 0.5f + + inputOffsetInPixelsX); + con0[3] = std::bit_cast(0.5f * inputViewportInPixelsY / outputSizeInPixelsY - 0.5f + + inputOffsetInPixelsY); +} + +static void FsrRcasCon(u32* con, f32 sharpness) { + sharpness = std::exp2f(-sharpness); + f32 hSharp[2]{sharpness, sharpness}; + con[0] = std::bit_cast(sharpness); + con[1] = AU1_AH2_AF2(hSharp); + con[2] = 0; + con[3] = 0; +} namespace Vulkan { @@ -62,15 +199,15 @@ VkImageView FSR::Draw(VKScheduler& scheduler, size_t image_index, VkImageView im cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *easu_pipeline); - std::array push_constants; + std::array push_constants; FsrEasuConOffset( push_constants.data() + 0, push_constants.data() + 4, push_constants.data() + 8, push_constants.data() + 12, - static_cast(crop_rect.GetWidth()), static_cast(crop_rect.GetHeight()), - static_cast(input_image_extent.width), static_cast(input_image_extent.height), - static_cast(output_size.width), static_cast(output_size.height), - static_cast(crop_rect.left), static_cast(crop_rect.top)); + static_cast(crop_rect.GetWidth()), static_cast(crop_rect.GetHeight()), + static_cast(input_image_extent.width), static_cast(input_image_extent.height), + static_cast(output_size.width), static_cast(output_size.height), + static_cast(crop_rect.left), static_cast(crop_rect.top)); cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, push_constants); { From 99124b72618285114ac3ff820732a510bbf7aae4 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 23 Oct 2021 01:29:44 -0400 Subject: [PATCH 140/156] FSR: Fix GCC build errors --- .../renderer_vulkan/vk_blit_screen.cpp | 19 ++--- .../renderer_vulkan/vk_blit_screen.h | 2 + src/video_core/renderer_vulkan/vk_fsr.cpp | 72 ++++++++++--------- 3 files changed, 50 insertions(+), 43 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 9dfc508bc..1e447e621 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -468,20 +468,14 @@ void VKBlitScreen::CreateDynamicResources() { CreateGraphicsPipeline(); fsr.reset(); if (Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::Fsr) { - const auto& layout = render_window.GetFramebufferLayout(); - fsr = std::make_unique( - device, memory_allocator, image_count, - VkExtent2D{.width = layout.screen.GetWidth(), .height = layout.screen.GetHeight()}); + CreateFSR(); } } void VKBlitScreen::RefreshResources(const Tegra::FramebufferConfig& framebuffer) { if (Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::Fsr) { if (!fsr) { - const auto& layout = render_window.GetFramebufferLayout(); - fsr = std::make_unique( - device, memory_allocator, image_count, - VkExtent2D{.width = layout.screen.GetWidth(), .height = layout.screen.GetHeight()}); + CreateFSR(); } } else { fsr.reset(); @@ -1443,6 +1437,15 @@ void VKBlitScreen::SetVertexData(BufferData& data, const Tegra::FramebufferConfi data.vertices[3] = ScreenRectVertex(x + w, y + h, texcoords.bottom * scale_u, right * scale_v); } +void VKBlitScreen::CreateFSR() { + const auto& layout = render_window.GetFramebufferLayout(); + const VkExtent2D fsr_size{ + .width = layout.screen.GetWidth(), + .height = layout.screen.GetHeight(), + }; + fsr = std::make_unique(device, memory_allocator, image_count, fsr_size); +} + u64 VKBlitScreen::CalculateBufferSize(const Tegra::FramebufferConfig& framebuffer) const { return sizeof(BufferData) + GetSizeInBytes(framebuffer) * image_count; } diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index ad0cd8ee1..bbca71af3 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -99,6 +99,8 @@ private: void SetVertexData(BufferData& data, const Tegra::FramebufferConfig& framebuffer, const Layout::FramebufferLayout layout) const; + void CreateFSR(); + u64 CalculateBufferSize(const Tegra::FramebufferConfig& framebuffer) const; u64 GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer, std::size_t image_index) const; diff --git a/src/video_core/renderer_vulkan/vk_fsr.cpp b/src/video_core/renderer_vulkan/vk_fsr.cpp index 2feaa9d37..73629d229 100644 --- a/src/video_core/renderer_vulkan/vk_fsr.cpp +++ b/src/video_core/renderer_vulkan/vk_fsr.cpp @@ -2,8 +2,11 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include +#include "common/bit_cast.h" #include "common/common_types.h" #include "common/div_ceil.h" + #include "video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16_comp_spv.h" #include "video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32_comp_spv.h" #include "video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16_comp_spv.h" @@ -13,9 +16,11 @@ #include "video_core/renderer_vulkan/vk_shader_util.h" #include "video_core/vulkan_common/vulkan_device.h" +namespace Vulkan { +namespace { // Reimplementations of the constant generating functions in ffx_fsr1.h // GCC generated a lot of warnings when using the official header. -static u32 AU1_AH1_AF1(f32 f) { +u32 AU1_AH1_AF1(f32 f) { static constexpr u32 base[512]{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, @@ -102,59 +107,56 @@ static u32 AU1_AH1_AF1(f32 f) { 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, }; - auto u = std::bit_cast(f); - u32 i = u >> 23; + const u32 u = Common::BitCast(f); + const u32 i = u >> 23; return base[i] + ((u & 0x7fffff) >> shift[i]); } -static u32 AU1_AH2_AF2(f32 a[2]) { +u32 AU1_AH2_AF2(f32 a[2]) { return AU1_AH1_AF1(a[0]) + (AU1_AH1_AF1(a[1]) << 16); } -static void FsrEasuCon(u32 con0[4], u32 con1[4], u32 con2[4], u32 con3[4], - f32 inputViewportInPixelsX, f32 inputViewportInPixelsY, - f32 inputSizeInPixelsX, f32 inputSizeInPixelsY, f32 outputSizeInPixelsX, - f32 outputSizeInPixelsY) { - con0[0] = std::bit_cast(inputViewportInPixelsX / outputSizeInPixelsX); - con0[1] = std::bit_cast(inputViewportInPixelsY / outputSizeInPixelsY); - con0[2] = std::bit_cast(0.5f * inputViewportInPixelsX / outputSizeInPixelsX - 0.5f); - con0[3] = std::bit_cast(0.5f * inputViewportInPixelsY / outputSizeInPixelsY - 0.5f); - con1[0] = std::bit_cast(1.0f / inputSizeInPixelsX); - con1[1] = std::bit_cast(1.0f / inputSizeInPixelsY); - con1[2] = std::bit_cast(1.0f / inputSizeInPixelsX); - con1[3] = std::bit_cast(-1.0f / inputSizeInPixelsY); - con2[0] = std::bit_cast(-1.0f / inputSizeInPixelsX); - con2[1] = std::bit_cast(2.0f / inputSizeInPixelsY); - con2[2] = std::bit_cast(1.0f / inputSizeInPixelsX); - con2[3] = std::bit_cast(2.0f / inputSizeInPixelsY); - con3[0] = std::bit_cast(0.0f / inputSizeInPixelsX); - con3[1] = std::bit_cast(4.0f / inputSizeInPixelsY); +void FsrEasuCon(u32 con0[4], u32 con1[4], u32 con2[4], u32 con3[4], f32 inputViewportInPixelsX, + f32 inputViewportInPixelsY, f32 inputSizeInPixelsX, f32 inputSizeInPixelsY, + f32 outputSizeInPixelsX, f32 outputSizeInPixelsY) { + con0[0] = Common::BitCast(inputViewportInPixelsX / outputSizeInPixelsX); + con0[1] = Common::BitCast(inputViewportInPixelsY / outputSizeInPixelsY); + con0[2] = Common::BitCast(0.5f * inputViewportInPixelsX / outputSizeInPixelsX - 0.5f); + con0[3] = Common::BitCast(0.5f * inputViewportInPixelsY / outputSizeInPixelsY - 0.5f); + con1[0] = Common::BitCast(1.0f / inputSizeInPixelsX); + con1[1] = Common::BitCast(1.0f / inputSizeInPixelsY); + con1[2] = Common::BitCast(1.0f / inputSizeInPixelsX); + con1[3] = Common::BitCast(-1.0f / inputSizeInPixelsY); + con2[0] = Common::BitCast(-1.0f / inputSizeInPixelsX); + con2[1] = Common::BitCast(2.0f / inputSizeInPixelsY); + con2[2] = Common::BitCast(1.0f / inputSizeInPixelsX); + con2[3] = Common::BitCast(2.0f / inputSizeInPixelsY); + con3[0] = Common::BitCast(0.0f / inputSizeInPixelsX); + con3[1] = Common::BitCast(4.0f / inputSizeInPixelsY); con3[2] = con3[3] = 0; } -static void FsrEasuConOffset(u32 con0[4], u32 con1[4], u32 con2[4], u32 con3[4], - f32 inputViewportInPixelsX, f32 inputViewportInPixelsY, - f32 inputSizeInPixelsX, f32 inputSizeInPixelsY, - f32 outputSizeInPixelsX, f32 outputSizeInPixelsY, - f32 inputOffsetInPixelsX, f32 inputOffsetInPixelsY) { +void FsrEasuConOffset(u32 con0[4], u32 con1[4], u32 con2[4], u32 con3[4], + f32 inputViewportInPixelsX, f32 inputViewportInPixelsY, + f32 inputSizeInPixelsX, f32 inputSizeInPixelsY, f32 outputSizeInPixelsX, + f32 outputSizeInPixelsY, f32 inputOffsetInPixelsX, f32 inputOffsetInPixelsY) { FsrEasuCon(con0, con1, con2, con3, inputViewportInPixelsX, inputViewportInPixelsY, inputSizeInPixelsX, inputSizeInPixelsY, outputSizeInPixelsX, outputSizeInPixelsY); - con0[2] = std::bit_cast(0.5f * inputViewportInPixelsX / outputSizeInPixelsX - 0.5f + - inputOffsetInPixelsX); - con0[3] = std::bit_cast(0.5f * inputViewportInPixelsY / outputSizeInPixelsY - 0.5f + - inputOffsetInPixelsY); + con0[2] = Common::BitCast(0.5f * inputViewportInPixelsX / outputSizeInPixelsX - 0.5f + + inputOffsetInPixelsX); + con0[3] = Common::BitCast(0.5f * inputViewportInPixelsY / outputSizeInPixelsY - 0.5f + + inputOffsetInPixelsY); } -static void FsrRcasCon(u32* con, f32 sharpness) { +void FsrRcasCon(u32* con, f32 sharpness) { sharpness = std::exp2f(-sharpness); f32 hSharp[2]{sharpness, sharpness}; - con[0] = std::bit_cast(sharpness); + con[0] = Common::BitCast(sharpness); con[1] = AU1_AH2_AF2(hSharp); con[2] = 0; con[3] = 0; } - -namespace Vulkan { +} // Anonymous namespace FSR::FSR(const Device& device_, MemoryAllocator& memory_allocator_, size_t image_count_, VkExtent2D output_size_) From 864f2e0b8144a0814f2555d06fffe1e7439854fe Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 23 Oct 2021 01:49:41 -0400 Subject: [PATCH 141/156] configure_graphics.ui: Cleanup scaling options and fix duplicate name warning --- src/yuzu/configuration/configure_graphics.ui | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui index 0d2987fcf..660b68c1c 100644 --- a/src/yuzu/configuration/configure_graphics.ui +++ b/src/yuzu/configuration/configure_graphics.ui @@ -335,12 +335,12 @@ - 0.5X (360p/540p)[MAY BREAK] + 0.5X (360p/540p) [EXPERIMENTAL] - 0.75X (540p/810p)[MAY BREAK] + 0.75X (540p/810p) [EXPERIMENTAL] @@ -350,12 +350,12 @@ - 2X (1440p/2160[4K]p) + 2X (1440p/2160p) - 3X (2160p[4K]/3240p) + 3X (2160p/3240p) @@ -380,7 +380,7 @@ - + 0 From 87abab71fff2189c549ae247eb6c281c8f618acd Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 23 Oct 2021 02:40:02 -0400 Subject: [PATCH 142/156] host_shaders: Misc copyright/style changes --- src/video_core/host_shaders/fxaa.frag | 6 +++++- src/video_core/host_shaders/fxaa.vert | 10 ++++------ src/video_core/host_shaders/present_bicubic.frag | 2 +- src/video_core/host_shaders/present_gaussian.frag | 4 ++-- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/video_core/host_shaders/fxaa.frag b/src/video_core/host_shaders/fxaa.frag index 23f910d4c..02f4068d1 100644 --- a/src/video_core/host_shaders/fxaa.frag +++ b/src/video_core/host_shaders/fxaa.frag @@ -1,4 +1,8 @@ -// Adapted from +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +// Source code is adapted from // https://www.geeks3d.com/20110405/fxaa-fast-approximate-anti-aliasing-demo-glsl-opengl-test-radeon-geforce/3/ #version 460 diff --git a/src/video_core/host_shaders/fxaa.vert b/src/video_core/host_shaders/fxaa.vert index 01d5ff4df..ac20c04e9 100644 --- a/src/video_core/host_shaders/fxaa.vert +++ b/src/video_core/host_shaders/fxaa.vert @@ -1,4 +1,4 @@ -// Copyright 2019 yuzu Emulator Project +// Copyright 2021 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. @@ -16,10 +16,12 @@ layout (location = 0) out vec4 posPos; #ifdef VULKAN #define BINDING_COLOR_TEXTURE 0 +#define VERTEX_ID gl_VertexIndex #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv #define BINDING_COLOR_TEXTURE 0 +#define VERTEX_ID gl_VertexID #endif @@ -28,11 +30,7 @@ layout (binding = BINDING_COLOR_TEXTURE) uniform sampler2D input_texture; const float FXAA_SUBPIX_SHIFT = 0; void main() { -#ifdef VULKAN - vec2 vertex = vertices[gl_VertexIndex]; -#else - vec2 vertex = vertices[gl_VertexID]; -#endif + vec2 vertex = vertices[VERTEX_ID]; gl_Position = vec4(vertex, 0.0, 1.0); vec2 vert_tex_coord = (vertex + 1.0) / 2.0; posPos.xy = vert_tex_coord; diff --git a/src/video_core/host_shaders/present_bicubic.frag b/src/video_core/host_shaders/present_bicubic.frag index f3e5410e7..902b70c2b 100644 --- a/src/video_core/host_shaders/present_bicubic.frag +++ b/src/video_core/host_shaders/present_bicubic.frag @@ -1,4 +1,4 @@ -// Copyright 2019 yuzu Emulator Project +// Copyright 2021 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. diff --git a/src/video_core/host_shaders/present_gaussian.frag b/src/video_core/host_shaders/present_gaussian.frag index a9558548f..72a300dac 100644 --- a/src/video_core/host_shaders/present_gaussian.frag +++ b/src/video_core/host_shaders/present_gaussian.frag @@ -1,8 +1,8 @@ -// Copyright 2019 yuzu Emulator Project +// Copyright 2021 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -// Code obtained from this 2 sources: +// Code adapted from the following sources: // - https://learnopengl.com/Advanced-Lighting/Bloom // - https://www.rastergrid.com/blog/2010/09/efficient-gaussian-blur-with-linear-sampling/ From c97c46747d07756075ce6a498a51126189d2be0d Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Sat, 23 Oct 2021 15:56:44 +0200 Subject: [PATCH 143/156] Vulkan: fix regression. --- .../renderer_vulkan/vk_texture_cache.cpp | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 85a1d520b..1c0741250 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1221,9 +1221,12 @@ bool Image::BlitScaleHelper(bool scale_up) { const auto& resolution = runtime->resolution; const u32 scaled_width = resolution.ScaleUp(info.size.width); const u32 scaled_height = is_2d ? resolution.ScaleUp(info.size.height) : info.size.height; - if (!scale_view) { + std::unique_ptr& blit_view = scale_up ? scale_view : normal_view; + std::unique_ptr& blit_framebuffer = + scale_up ? scale_framebuffer : normal_framebuffer; + if (!blit_view) { const auto view_info = ImageViewInfo(ImageViewType::e2D, info.format); - scale_view = std::make_unique(*runtime, view_info, NULL_IMAGE_ID, *this); + blit_view = std::make_unique(*runtime, view_info, NULL_IMAGE_ID, *this); } const u32 src_width = scale_up ? info.size.width : scaled_width; @@ -1239,27 +1242,27 @@ bool Image::BlitScaleHelper(bool scale_up) { .end = {static_cast(dst_width), static_cast(dst_height)}, }; const VkExtent2D extent{ - .width = scaled_width, - .height = scaled_height, + .width = std::max(scaled_width, info.size.width), + .height = std::max(scaled_height, info.size.width), }; - auto* view_ptr = scale_view.get(); + auto* view_ptr = blit_view.get(); if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT) { - if (!scale_framebuffer) { - scale_framebuffer = std::make_unique(*runtime, view_ptr, nullptr, extent); + if (!blit_framebuffer) { + blit_framebuffer = std::make_unique(*runtime, view_ptr, nullptr, extent); } - const auto color_view = scale_view->Handle(Shader::TextureType::Color2D); + const auto color_view = blit_view->Handle(Shader::TextureType::Color2D); - runtime->blit_image_helper.BlitColor(scale_framebuffer.get(), color_view, dst_region, + runtime->blit_image_helper.BlitColor(blit_framebuffer.get(), color_view, dst_region, src_region, operation, BLIT_OPERATION); } else if (!runtime->device.IsBlitDepthStencilSupported() && aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { - if (!scale_framebuffer) { - scale_framebuffer = std::make_unique(*runtime, nullptr, view_ptr, extent); + if (!blit_framebuffer) { + blit_framebuffer = std::make_unique(*runtime, nullptr, view_ptr, extent); } - runtime->blit_image_helper.BlitDepthStencil( - scale_framebuffer.get(), scale_view->DepthView(), scale_view->StencilView(), dst_region, - src_region, operation, BLIT_OPERATION); + runtime->blit_image_helper.BlitDepthStencil(blit_framebuffer.get(), blit_view->DepthView(), + blit_view->StencilView(), dst_region, + src_region, operation, BLIT_OPERATION); } else { // TODO: Use helper blits where applicable flags &= ~ImageFlagBits::Rescaled; From 9189aacfe28d0114fbfff0e3f89e8912f5377454 Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Sat, 23 Oct 2021 15:57:05 +0200 Subject: [PATCH 144/156] OpenGL: Fix viewport/Scissor scaling on downscaling. --- .../renderer_opengl/gl_rasterizer.cpp | 34 +++++++++++++++---- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 4df8a684a..d8ac46d2a 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -187,6 +187,7 @@ void RasterizerOpenGL::Clear() { std::scoped_lock lock{texture_cache.mutex}; texture_cache.UpdateRenderTargets(true); state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); + SyncViewport(); if (regs.clear_flags.scissor) { SyncScissorTest(); } else { @@ -571,6 +572,15 @@ void RasterizerOpenGL::SyncViewport() { } const bool is_rescaling{texture_cache.IsRescaling()}; const float scale = is_rescaling ? Settings::values.resolution_info.up_factor : 1.0f; + const auto conv = [scale](float value) -> GLfloat { + float new_value = value * scale; + if (scale < 1.0f) { + const bool sign = std::signbit(value); + new_value = std::round(std::abs(new_value)); + new_value = sign ? -new_value : new_value; + } + return static_cast(new_value); + }; if (dirty_viewport) { flags[Dirty::Viewports] = false; @@ -586,10 +596,11 @@ void RasterizerOpenGL::SyncViewport() { flags[Dirty::Viewport0 + index] = false; const auto& src = regs.viewport_transform[index]; - GLfloat x = (src.translate_x - src.scale_x) * scale; - GLfloat y = (src.translate_y - src.scale_y) * scale; - GLfloat width = src.scale_x * 2.0f * scale; - GLfloat height = src.scale_y * 2.0f * scale; + GLfloat x = conv(src.translate_x - src.scale_x); + GLfloat y = conv(src.translate_y - src.scale_y); + GLfloat width = conv(src.scale_x * 2.0f); + GLfloat height = conv(src.scale_y * 2.0f); + if (height < 0) { y += height; height = -height; @@ -925,8 +936,19 @@ void RasterizerOpenGL::SyncScissorTest() { const auto& resolution = Settings::values.resolution_info; const bool is_rescaling{texture_cache.IsRescaling()}; - const auto scale_up = [resolution, is_rescaling](u32 value) { - return is_rescaling ? resolution.ScaleUp(value) : value; + const u32 up_scale = is_rescaling ? resolution.up_scale : 1U; + const u32 down_shift = is_rescaling ? resolution.down_shift : 0U; + const auto scale_up = [up_scale, down_shift](u32 value) -> u32 { + if (value == 0) { + return 0U; + } + const u32 upset = value * up_scale; + u32 acumm{}; + if ((up_scale >> down_shift) == 0) { + acumm = upset % 2; + } + const u32 converted_value = upset >> down_shift; + return std::max(converted_value + acumm, 1U); }; for (std::size_t index = 0; index < Maxwell::NumViewports; ++index) { if (!force && !flags[Dirty::Scissor0 + index]) { From 099b0b3167d6dc47b764331f40aa935e8a9ef86a Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Sat, 23 Oct 2021 17:17:02 +0200 Subject: [PATCH 145/156] Texture Cache: Fix memory usage on ScaleDown. --- src/video_core/texture_cache/texture_cache.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index dd9553806..26ab857c9 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -878,10 +878,6 @@ bool TextureCache

::ScaleDown(Image& image) { if (!rescaled) { return false; } - const bool has_copy = image.HasScaled(); - if (!has_copy) { - total_used_memory -= GetScaledImageSizeBytes(image); - } InvalidateScale(image); return true; } From 9fc1fa1b0dd015db15c6eaafe68206943bf4cbc1 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sun, 24 Oct 2021 22:52:43 -0400 Subject: [PATCH 146/156] gl_resource_manager: Ensure non EXT_framebuffer objects are created --- .../renderer_opengl/gl_resource_manager.cpp | 7 ++++++- .../renderer_opengl/gl_texture_cache.cpp | 14 ++------------ 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 70947838c..5e7101d28 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -166,7 +166,12 @@ void OGLFramebuffer::Create() { return; MICROPROFILE_SCOPE(OpenGL_ResourceCreation); - glCreateFramebuffers(1, &handle); + // Bind to READ_FRAMEBUFFER to stop Nvidia's driver from creating an EXT_framebuffer instead of + // a core framebuffer. EXT framebuffer attachments have to match in size and can be shared + // across contexts. yuzu doesn't share framebuffers across contexts and we need attachments with + // mismatching size, this is why core framebuffers are preferred. + glGenFramebuffers(1, &handle); + glBindFramebuffer(GL_READ_FRAMEBUFFER, handle); } void OGLFramebuffer::Release() { diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 6841b5450..00610ea2c 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -478,10 +478,6 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& for (size_t i = 0; i < rescale_draw_fbos.size(); ++i) { rescale_draw_fbos[i].Create(); rescale_read_fbos[i].Create(); - - // Make sure the framebuffer is created without DSA - glBindFramebuffer(GL_READ_FRAMEBUFFER, rescale_draw_fbos[i].handle); - glBindFramebuffer(GL_READ_FRAMEBUFFER, rescale_read_fbos[i].handle); } } } @@ -1224,13 +1220,8 @@ Sampler::Sampler(TextureCacheRuntime& runtime, const TSCEntry& config) { Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span color_buffers, ImageView* depth_buffer, const VideoCommon::RenderTargets& key) { - // Bind to READ_FRAMEBUFFER to stop Nvidia's driver from creating an EXT_framebuffer instead of - // a core framebuffer. EXT framebuffer attachments have to match in size and can be shared - // across contexts. yuzu doesn't share framebuffers across contexts and we need attachments with - // mismatching size, this is why core framebuffers are preferred. - GLuint handle; - glGenFramebuffers(1, &handle); - glBindFramebuffer(GL_READ_FRAMEBUFFER, handle); + framebuffer.Create(); + GLuint handle = framebuffer.handle; GLsizei num_buffers = 0; std::array gl_draw_buffers; @@ -1278,7 +1269,6 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span(name.size()), name.data()); } - framebuffer.handle = handle; } void BGRCopyPass::CopyBGR(Image& dst_image, Image& src_image, From 917b2466ad996cae75d9a0ca31226597b256acf9 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sun, 24 Oct 2021 23:07:15 -0400 Subject: [PATCH 147/156] texture_cache: Refactor Render Target scaling function --- src/video_core/texture_cache/texture_cache.h | 33 +++++++++++-------- .../texture_cache/texture_cache_base.h | 5 +++ 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 26ab857c9..c8031b695 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -192,19 +192,8 @@ void TextureCache

::SynchronizeComputeDescriptors() { } template -void TextureCache

::UpdateRenderTargets(bool is_clear) { - using namespace VideoCommon::Dirty; +bool TextureCache

::RescaleRenderTargets(bool is_clear) { auto& flags = maxwell3d.dirty.flags; - if (!flags[Dirty::RenderTargets]) { - for (size_t index = 0; index < NUM_RT; ++index) { - ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index]; - PrepareImageView(color_buffer_id, true, is_clear && IsFullClear(color_buffer_id)); - } - const ImageViewId depth_buffer_id = render_targets.depth_buffer_id; - PrepareImageView(depth_buffer_id, true, is_clear && IsFullClear(depth_buffer_id)); - return; - } - u32 scale_rating = 0; bool rescaled = false; std::array tmp_color_images{}; @@ -281,8 +270,6 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { scale_rating = 1; } } while (has_deleted_images); - // Rescale End - const auto set_rating = [this, scale_rating](ImageId image_id) { if (image_id != CORRUPT_ID) { Image& image = slot_images[image_id]; @@ -297,6 +284,24 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { } set_rating(tmp_depth_image); + return rescaled; +} + +template +void TextureCache

::UpdateRenderTargets(bool is_clear) { + using namespace VideoCommon::Dirty; + auto& flags = maxwell3d.dirty.flags; + if (!flags[Dirty::RenderTargets]) { + for (size_t index = 0; index < NUM_RT; ++index) { + ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index]; + PrepareImageView(color_buffer_id, true, is_clear && IsFullClear(color_buffer_id)); + } + const ImageViewId depth_buffer_id = render_targets.depth_buffer_id; + PrepareImageView(depth_buffer_id, true, is_clear && IsFullClear(depth_buffer_id)); + return; + } + + const bool rescaled = RescaleRenderTargets(is_clear); if (is_rescaling != rescaled) { flags[Dirty::RescaleViewports] = true; flags[Dirty::RescaleScissors] = true; diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index eea589269..643ad811c 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -119,6 +119,11 @@ public: /// Refresh the state for compute image view and sampler descriptors void SynchronizeComputeDescriptors(); + /// Updates the Render Targets if they can be rescaled + /// @param is_clear True when the render targets are being used for clears + /// @retval True if the Render Targets have been rescaled. + bool RescaleRenderTargets(bool is_clear); + /// Update bound render targets and upload memory if necessary /// @param is_clear True when the render targets are being used for clears void UpdateRenderTargets(bool is_clear); From de1c8c5c2c3131bb122351e676014cdc7c442e78 Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Fri, 29 Oct 2021 17:02:57 +0200 Subject: [PATCH 148/156] Texture Cahe/Shader decompiler: Resize PointSize on rescaling, refactor and make reaper more agressive on 4Gb GPUs. --- .../ir_opt/rescaling_pass.cpp | 21 +++++++++++++++++++ .../renderer_opengl/gl_rasterizer.cpp | 5 +++-- src/video_core/texture_cache/image_base.h | 1 - src/video_core/texture_cache/image_info.cpp | 6 +++--- src/video_core/texture_cache/texture_cache.h | 18 ++-------------- 5 files changed, 29 insertions(+), 22 deletions(-) diff --git a/src/shader_recompiler/ir_opt/rescaling_pass.cpp b/src/shader_recompiler/ir_opt/rescaling_pass.cpp index a5fa4ee83..81098c038 100644 --- a/src/shader_recompiler/ir_opt/rescaling_pass.cpp +++ b/src/shader_recompiler/ir_opt/rescaling_pass.cpp @@ -75,6 +75,14 @@ void PatchFragCoord(IR::Block& block, IR::Inst& inst) { inst.ReplaceUsesWith(downscaled_frag_coord); } +void PatchPointSize(IR::Block& block, IR::Inst& inst) { + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + const IR::F32 point_value{inst.Arg(1)}; + const IR::F32 up_factor{ir.FPRecip(ir.ResolutionDownFactor())}; + const IR::F32 upscaled_point_value{ir.FPMul(point_value, up_factor)}; + inst.SetArg(1, upscaled_point_value); +} + [[nodiscard]] IR::U32 Scale(IR::IREmitter& ir, const IR::U1& is_scaled, const IR::U32& value) { IR::U32 scaled_value{value}; if (const u32 up_scale = Settings::values.resolution_info.up_scale; up_scale != 1) { @@ -253,6 +261,19 @@ void Visit(const IR::Program& program, IR::Block& block, IR::Inst& inst) { } break; } + case IR::Opcode::SetAttribute: { + const IR::Attribute attr{inst.Arg(0).Attribute()}; + switch (attr) { + case IR::Attribute::PointSize: + if (inst.Flags() != 0xDEADBEEF) { + PatchPointSize(block, inst); + } + break; + default: + break; + } + break; + } case IR::Opcode::ImageQueryDimensions: PatchImageQueryDimensions(block, inst); break; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index d8ac46d2a..9b516c64f 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -976,8 +976,9 @@ void RasterizerOpenGL::SyncPointState() { oglEnable(GL_POINT_SPRITE, maxwell3d.regs.point_sprite_enable); oglEnable(GL_PROGRAM_POINT_SIZE, maxwell3d.regs.vp_point_size.enable); - - glPointSize(std::max(1.0f, maxwell3d.regs.point_size)); + const bool is_rescaling{texture_cache.IsRescaling()}; + const float scale = is_rescaling ? Settings::values.resolution_info.up_factor : 1.0f; + glPointSize(std::max(1.0f, maxwell3d.regs.point_size * scale)); } void RasterizerOpenGL::SyncLineState() { diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index 02c669766..89c111c00 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -38,7 +38,6 @@ enum class ImageFlagBits : u32 { Rescaled = 1 << 12, CheckingRescalable = 1 << 13, IsRescalable = 1 << 14, - Blacklisted = 1 << 15, }; DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits) diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp index d8e414247..015a2d33d 100644 --- a/src/video_core/texture_cache/image_info.cpp +++ b/src/video_core/texture_cache/image_info.cpp @@ -135,7 +135,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) type = ImageType::e3D; size.depth = rt.depth; } else { - rescaleable = block.depth == 0 && size.height > 256; + rescaleable = block.depth == 0; downscaleable = size.height > 512; type = ImageType::e2D; resources.layers = rt.depth; @@ -165,7 +165,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept { type = ImageType::e3D; size.depth = regs.zeta_depth; } else { - rescaleable = block.depth == 0 && size.height > 256; + rescaleable = block.depth == 0; downscaleable = size.height > 512; type = ImageType::e2D; resources.layers = regs.zeta_depth; @@ -199,7 +199,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept { .height = config.height, .depth = 1, }; - rescaleable = block.depth == 0 && size.height > 256; + rescaleable = block.depth == 0; downscaleable = size.height > 512; } } diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index c8031b695..aec130a32 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -53,8 +53,8 @@ TextureCache

::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface& const auto device_memory = runtime.GetDeviceLocalMemory(); const u64 possible_expected_memory = (device_memory * 4) / 10; const u64 possible_critical_memory = (device_memory * 7) / 10; - expected_memory = std::max(possible_expected_memory, DEFAULT_EXPECTED_MEMORY); - critical_memory = std::max(possible_critical_memory, DEFAULT_CRITICAL_MEMORY); + expected_memory = std::max(possible_expected_memory, DEFAULT_EXPECTED_MEMORY - 256_MiB); + critical_memory = std::max(possible_critical_memory, DEFAULT_CRITICAL_MEMORY - 512_MiB); minimum_memory = 0; } else { // On OpenGL we can be more conservatives as the driver takes care. @@ -355,7 +355,6 @@ void TextureCache

::FillImageViews(DescriptorTable& table, if (view.blacklist && view.id != NULL_IMAGE_VIEW_ID) { const ImageViewBase& image_view{slot_image_views[view.id]}; auto& image = slot_images[image_view.image_id]; - image.flags |= ImageFlagBits::Blacklisted; has_blacklisted |= ScaleDown(image); image.scale_rating = 0; } @@ -985,7 +984,6 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA bool can_rescale = info.rescaleable; bool any_rescaled = false; - bool any_blacklisted = false; for (const ImageId sibling_id : all_siblings) { if (!can_rescale) { break; @@ -993,7 +991,6 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA Image& sibling = slot_images[sibling_id]; can_rescale &= ImageCanRescale(sibling); any_rescaled |= True(sibling.flags & ImageFlagBits::Rescaled); - any_blacklisted |= True(sibling.flags & ImageFlagBits::Blacklisted); } can_rescale &= any_rescaled; @@ -1007,9 +1004,6 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA for (const ImageId sibling_id : all_siblings) { Image& sibling = slot_images[sibling_id]; ScaleDown(sibling); - if (any_blacklisted) { - sibling.flags |= ImageFlagBits::Blacklisted; - } } } @@ -1644,7 +1638,6 @@ void TextureCache

::SynchronizeAliases(ImageId image_id) { boost::container::small_vector aliased_images; Image& image = slot_images[image_id]; bool any_rescaled = True(image.flags & ImageFlagBits::Rescaled); - bool any_blacklisted = True(image.flags & ImageFlagBits::Blacklisted); u64 most_recent_tick = image.modification_tick; for (const AliasedImage& aliased : image.aliased_images) { ImageBase& aliased_image = slot_images[aliased.id]; @@ -1652,7 +1645,6 @@ void TextureCache

::SynchronizeAliases(ImageId image_id) { most_recent_tick = std::max(most_recent_tick, aliased_image.modification_tick); aliased_images.push_back(&aliased); any_rescaled |= True(aliased_image.flags & ImageFlagBits::Rescaled); - any_blacklisted |= True(aliased_image.flags & ImageFlagBits::Blacklisted); } } if (aliased_images.empty()) { @@ -1664,9 +1656,6 @@ void TextureCache

::SynchronizeAliases(ImageId image_id) { ScaleUp(image); } else { ScaleDown(image); - if (any_blacklisted) { - image.flags |= ImageFlagBits::Blacklisted; - } } } image.modification_tick = most_recent_tick; @@ -1684,9 +1673,6 @@ void TextureCache

::SynchronizeAliases(ImageId image_id) { Image& aliased_image = slot_images[aliased->id]; if (!can_rescale) { ScaleDown(aliased_image); - if (any_blacklisted) { - aliased_image.flags |= ImageFlagBits::Blacklisted; - } CopyImage(image_id, aliased->id, aliased->copies); continue; } From d46a71e786b42ecfe702ae00e01e6dcdf9121875 Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Fri, 29 Oct 2021 17:45:02 +0200 Subject: [PATCH 149/156] HostShader: fix Gaussian filter. --- src/video_core/host_shaders/present_gaussian.frag | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/video_core/host_shaders/present_gaussian.frag b/src/video_core/host_shaders/present_gaussian.frag index 72a300dac..66fed3238 100644 --- a/src/video_core/host_shaders/present_gaussian.frag +++ b/src/video_core/host_shaders/present_gaussian.frag @@ -63,8 +63,8 @@ void main() { // TODO(Blinkhawk): This code can be optimized through shader group instructions. vec3 horizontal = blurHorizontal(color_texture, frag_tex_coord, tex_offset).rgb; vec3 vertical = blurVertical(color_texture, frag_tex_coord, tex_offset).rgb; - vec3 diagonalA = blurVertical(color_texture, frag_tex_coord, tex_offset).rgb; - vec3 diagonalB = blurVertical(color_texture, frag_tex_coord, tex_offset * vec2(1.0, -1.0)).rgb; + vec3 diagonalA = blurDiagonal(color_texture, frag_tex_coord, tex_offset).rgb; + vec3 diagonalB = blurDiagonal(color_texture, frag_tex_coord, tex_offset * vec2(1.0, -1.0)).rgb; vec3 combination = mix(mix(horizontal, vertical, 0.5f), mix(diagonalA, diagonalB, 0.5f), 0.5f); color = vec4(combination + base, 1.0f); } From 6f9869096378a5100620b3f2b685d61bf8ccae16 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 30 Oct 2021 01:47:18 +0200 Subject: [PATCH 150/156] ShaderCache: Better fix for Shuffling gl_FragCoord --- src/shader_recompiler/ir_opt/rescaling_pass.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/shader_recompiler/ir_opt/rescaling_pass.cpp b/src/shader_recompiler/ir_opt/rescaling_pass.cpp index 81098c038..c28500dd1 100644 --- a/src/shader_recompiler/ir_opt/rescaling_pass.cpp +++ b/src/shader_recompiler/ir_opt/rescaling_pass.cpp @@ -30,7 +30,7 @@ namespace { return false; } -void VisitMark(const IR::Inst& inst) { +void VisitMark(IR::Block& block, IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::ShuffleIndex: case IR::Opcode::ShuffleUp: @@ -49,19 +49,30 @@ void VisitMark(const IR::Inst& inst) { break; } IR::Inst* const bitcast_inst{bitcast_arg.InstRecursive()}; + bool must_patch_outside = false; if (bitcast_inst->GetOpcode() == IR::Opcode::GetAttribute) { const IR::Attribute attr{bitcast_inst->Arg(0).Attribute()}; switch (attr) { case IR::Attribute::PositionX: case IR::Attribute::PositionY: bitcast_inst->SetFlags(0xDEADBEEF); + must_patch_outside = true; break; default: break; } } + if (must_patch_outside) { + const auto it{IR::Block::InstructionList::s_iterator_to(inst)}; + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + const IR::F32 new_inst{&*block.PrependNewInst(it, inst)}; + const IR::F32 up_factor{ir.FPRecip(ir.ResolutionDownFactor())}; + const IR::Value converted{ir.FPMul(new_inst, up_factor)}; + inst.ReplaceUsesWith(converted); + } break; } + default: break; } @@ -302,7 +313,7 @@ void RescalingPass(IR::Program& program) { if (is_fragment_shader) { for (IR::Block* const block : program.post_order_blocks) { for (IR::Inst& inst : block->Instructions()) { - VisitMark(inst); + VisitMark(*block, inst); } } } From 6c97ab571a3d169d1d2a5472040d4373ea61184d Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 30 Oct 2021 01:50:32 +0200 Subject: [PATCH 151/156] Texture Cache: revert Image changes. --- src/video_core/texture_cache/image_info.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp index 015a2d33d..afb94082b 100644 --- a/src/video_core/texture_cache/image_info.cpp +++ b/src/video_core/texture_cache/image_info.cpp @@ -16,6 +16,7 @@ namespace VideoCommon { using Tegra::Texture::TextureType; using Tegra::Texture::TICEntry; using VideoCore::Surface::PixelFormat; +using VideoCore::Surface::SurfaceType; ImageInfo::ImageInfo(const TICEntry& config) noexcept { format = PixelFormatFromTextureInfo(config.format, config.r_type, config.g_type, config.b_type, @@ -102,6 +103,7 @@ ImageInfo::ImageInfo(const TICEntry& config) noexcept { layer_stride = CalculateLayerStride(*this); maybe_unaligned_layer_stride = CalculateLayerSize(*this); rescaleable &= (block.depth == 0) && resources.levels == 1; + rescaleable &= size.height > 256 || GetFormatType(format) != SurfaceType::ColorTexture; downscaleable = size.height > 512; } } @@ -136,6 +138,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) size.depth = rt.depth; } else { rescaleable = block.depth == 0; + rescaleable &= size.height > 256; downscaleable = size.height > 512; type = ImageType::e2D; resources.layers = rt.depth; @@ -200,6 +203,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept { .depth = 1, }; rescaleable = block.depth == 0; + rescaleable &= size.height > 256; downscaleable = size.height > 512; } } From 5230378709470da56927e85c50d0524f9ce3f81b Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 30 Oct 2021 01:52:11 +0200 Subject: [PATCH 152/156] TextureCache: Make a better Anisotropic setter. --- .../renderer_opengl/gl_texture_cache.cpp | 9 ++++++++- .../renderer_vulkan/vk_texture_cache.cpp | 9 ++++++++- src/video_core/textures/texture.cpp | 19 +------------------ .../configure_graphics_advanced.ui | 8 ++++---- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 00610ea2c..c2668fee6 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -1201,7 +1201,14 @@ Sampler::Sampler(TextureCacheRuntime& runtime, const TSCEntry& config) { glSamplerParameterfv(handle, GL_TEXTURE_BORDER_COLOR, config.BorderColor().data()); if (GLAD_GL_ARB_texture_filter_anisotropic || GLAD_GL_EXT_texture_filter_anisotropic) { - glSamplerParameterf(handle, GL_TEXTURE_MAX_ANISOTROPY, config.MaxAnisotropy()); + const f32 setting_anisotropic = + static_cast(1U << Settings::values.max_anisotropy.GetValue()); + const f32 game_anisotropic = std::clamp(config.MaxAnisotropy(), 1.0f, 16.0f); + const bool aument_anisotropic = + game_anisotropic > 1.0f || config.mipmap_filter == TextureMipmapFilter::Linear; + const f32 max_anisotropy = + aument_anisotropic ? std::max(game_anisotropic, setting_anisotropic) : game_anisotropic; + glSamplerParameterf(handle, GL_TEXTURE_MAX_ANISOTROPY, max_anisotropy); } else { LOG_WARNING(Render_OpenGL, "GL_ARB_texture_filter_anisotropic is required"); } diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 1c0741250..7db561ca0 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1448,7 +1448,14 @@ Sampler::Sampler(TextureCacheRuntime& runtime, const Tegra::Texture::TSCEntry& t LOG_WARNING(Render_Vulkan, "VK_EXT_sampler_filter_minmax is required"); } // Some games have samplers with garbage. Sanitize them here. - const float max_anisotropy = std::clamp(tsc.MaxAnisotropy(), 1.0f, 16.0f); + const f32 setting_anisotropic = + static_cast(1U << Settings::values.max_anisotropy.GetValue()); + const f32 game_anisotropic = std::clamp(tsc.MaxAnisotropy(), 1.0f, 16.0f); + const bool aument_anisotropic = + game_anisotropic > 1.0f || tsc.mipmap_filter == TextureMipmapFilter::Linear; + const f32 max_anisotropy = + aument_anisotropic ? std::max(game_anisotropic, setting_anisotropic) : game_anisotropic; + sampler = device.GetLogical().CreateSampler(VkSamplerCreateInfo{ .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, .pNext = pnext, diff --git a/src/video_core/textures/texture.cpp b/src/video_core/textures/texture.cpp index a552543ed..b2d5bb03e 100644 --- a/src/video_core/textures/texture.cpp +++ b/src/video_core/textures/texture.cpp @@ -6,7 +6,6 @@ #include #include "common/cityhash.h" -#include "common/settings.h" #include "video_core/textures/texture.h" using Tegra::Texture::TICEntry; @@ -51,22 +50,6 @@ constexpr std::array SRGB_CONVERSION_LUT = { 0.917104f, 0.929242f, 0.941493f, 0.953859f, 0.966338f, 1.000000f, 1.000000f, 1.000000f, }; -unsigned SettingsMinimumAnisotropy() noexcept { - switch (static_cast(Settings::values.max_anisotropy.GetValue())) { - default: - case Anisotropy::Default: - return 1U; - case Anisotropy::Filter2x: - return 2U; - case Anisotropy::Filter4x: - return 4U; - case Anisotropy::Filter8x: - return 8U; - case Anisotropy::Filter16x: - return 16U; - } -} - } // Anonymous namespace std::array TSCEntry::BorderColor() const noexcept { @@ -78,7 +61,7 @@ std::array TSCEntry::BorderColor() const noexcept { } float TSCEntry::MaxAnisotropy() const noexcept { - return static_cast(std::max(1U << max_anisotropy, SettingsMinimumAnisotropy())); + return static_cast(1U << max_anisotropy); } } // namespace Tegra::Texture diff --git a/src/yuzu/configuration/configure_graphics_advanced.ui b/src/yuzu/configuration/configure_graphics_advanced.ui index d06b45f17..cbbcd45a0 100644 --- a/src/yuzu/configuration/configure_graphics_advanced.ui +++ b/src/yuzu/configuration/configure_graphics_advanced.ui @@ -130,22 +130,22 @@ - 2x (WILL BREAK THINGS) + 2x - 4x (WILL BREAK THINGS) + 4x - 8x (WILL BREAK THINGS) + 8x - 16x (WILL BREAK THINGS) + 16x From 282e04bffb4962dcc1d8aee2cb0fd2a1a45c86e6 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 16 Nov 2021 23:07:17 +0100 Subject: [PATCH 153/156] TextureCache: Add automatic anisotropic filtering and refactor code. --- src/common/settings.h | 2 +- .../renderer_opengl/gl_texture_cache.cpp | 8 +------- .../renderer_vulkan/vk_texture_cache.cpp | 8 +------- src/video_core/textures/texture.cpp | 15 ++++++++++++++- .../configuration/configure_graphics_advanced.ui | 5 +++++ 5 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/common/settings.h b/src/common/settings.h index c7610ef1c..42f8b4a7d 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -515,7 +515,7 @@ struct Values { #endif FullscreenMode::Borderless, FullscreenMode::Exclusive, "fullscreen_mode"}; RangedSetting aspect_ratio{0, 0, 3, "aspect_ratio"}; - RangedSetting max_anisotropy{0, 0, 4, "max_anisotropy"}; + RangedSetting max_anisotropy{0, 0, 5, "max_anisotropy"}; Setting use_speed_limit{true, "use_speed_limit"}; RangedSetting speed_limit{100, 0, 9999, "speed_limit"}; Setting use_disk_shader_cache{true, "use_disk_shader_cache"}; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index c2668fee6..1d3f193af 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -1201,13 +1201,7 @@ Sampler::Sampler(TextureCacheRuntime& runtime, const TSCEntry& config) { glSamplerParameterfv(handle, GL_TEXTURE_BORDER_COLOR, config.BorderColor().data()); if (GLAD_GL_ARB_texture_filter_anisotropic || GLAD_GL_EXT_texture_filter_anisotropic) { - const f32 setting_anisotropic = - static_cast(1U << Settings::values.max_anisotropy.GetValue()); - const f32 game_anisotropic = std::clamp(config.MaxAnisotropy(), 1.0f, 16.0f); - const bool aument_anisotropic = - game_anisotropic > 1.0f || config.mipmap_filter == TextureMipmapFilter::Linear; - const f32 max_anisotropy = - aument_anisotropic ? std::max(game_anisotropic, setting_anisotropic) : game_anisotropic; + const f32 max_anisotropy = std::clamp(config.MaxAnisotropy(), 1.0f, 16.0f); glSamplerParameterf(handle, GL_TEXTURE_MAX_ANISOTROPY, max_anisotropy); } else { LOG_WARNING(Render_OpenGL, "GL_ARB_texture_filter_anisotropic is required"); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 7db561ca0..daf26f380 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1448,13 +1448,7 @@ Sampler::Sampler(TextureCacheRuntime& runtime, const Tegra::Texture::TSCEntry& t LOG_WARNING(Render_Vulkan, "VK_EXT_sampler_filter_minmax is required"); } // Some games have samplers with garbage. Sanitize them here. - const f32 setting_anisotropic = - static_cast(1U << Settings::values.max_anisotropy.GetValue()); - const f32 game_anisotropic = std::clamp(tsc.MaxAnisotropy(), 1.0f, 16.0f); - const bool aument_anisotropic = - game_anisotropic > 1.0f || tsc.mipmap_filter == TextureMipmapFilter::Linear; - const f32 max_anisotropy = - aument_anisotropic ? std::max(game_anisotropic, setting_anisotropic) : game_anisotropic; + const f32 max_anisotropy = std::clamp(tsc.MaxAnisotropy(), 1.0f, 16.0f); sampler = device.GetLogical().CreateSampler(VkSamplerCreateInfo{ .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, diff --git a/src/video_core/textures/texture.cpp b/src/video_core/textures/texture.cpp index b2d5bb03e..ba066f98f 100644 --- a/src/video_core/textures/texture.cpp +++ b/src/video_core/textures/texture.cpp @@ -6,6 +6,7 @@ #include #include "common/cityhash.h" +#include "common/settings.h" #include "video_core/textures/texture.h" using Tegra::Texture::TICEntry; @@ -61,7 +62,19 @@ std::array TSCEntry::BorderColor() const noexcept { } float TSCEntry::MaxAnisotropy() const noexcept { - return static_cast(1U << max_anisotropy); + if (max_anisotropy == 0 && mipmap_filter != TextureMipmapFilter::Linear) { + return 1.0f; + } + const auto anisotropic_settings = Settings::values.max_anisotropy.GetValue(); + u32 new_max_anisotropic{}; + if (anisotropic_settings == 0) { + const auto anisotropic_based_onscale = Settings::values.resolution_info.up_scale >> + Settings::values.resolution_info.down_shift; + new_max_anisotropic = std::max(anisotropic_based_onscale + 1U, 1U); + } else { + new_max_anisotropic = Settings::values.max_anisotropy.GetValue(); + } + return static_cast(1U << std::min(max_anisotropy + anisotropic_settings - 1, 31U)); } } // namespace Tegra::Texture diff --git a/src/yuzu/configuration/configure_graphics_advanced.ui b/src/yuzu/configuration/configure_graphics_advanced.ui index cbbcd45a0..96de0b3d1 100644 --- a/src/yuzu/configuration/configure_graphics_advanced.ui +++ b/src/yuzu/configuration/configure_graphics_advanced.ui @@ -123,6 +123,11 @@ + + + Automatic + + Default From 978f598ff64d3bd0299d06c47e6cbd63a496122c Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Wed, 17 Nov 2021 00:59:46 +0100 Subject: [PATCH 154/156] TextureCache: Fix OGL cleaning --- .../renderer_opengl/gl_texture_cache.cpp | 4 ++++ .../renderer_opengl/gl_texture_cache.h | 16 ++++++++++++++++ .../renderer_vulkan/vk_texture_cache.cpp | 4 ++++ .../renderer_vulkan/vk_texture_cache.h | 16 ++++++++++++++++ src/video_core/texture_cache/texture_cache.h | 3 +++ 5 files changed, 43 insertions(+) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 1d3f193af..30dfcfa6a 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -1117,6 +1117,8 @@ ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::NullImageViewParams& params) : VideoCommon::ImageViewBase{params}, views{runtime.null_image_views} {} +ImageView::~ImageView() = default; + GLuint ImageView::StorageView(Shader::TextureType texture_type, Shader::ImageFormat image_format) { if (image_format == Shader::ImageFormat::Typeless) { return Handle(texture_type); @@ -1272,6 +1274,8 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span copies) { static constexpr VideoCommon::Offset3D zero_offset{0, 0, 0}; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index c51a7428d..a717cf8c8 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -232,6 +232,14 @@ public: const VideoCommon::ImageViewInfo& view_info); explicit ImageView(TextureCacheRuntime&, const VideoCommon::NullImageViewParams&); + ~ImageView(); + + ImageView(const ImageView&) = delete; + ImageView& operator=(const ImageView&) = delete; + + ImageView(ImageView&&) = default; + ImageView& operator=(ImageView&&) = default; + [[nodiscard]] GLuint StorageView(Shader::TextureType texture_type, Shader::ImageFormat image_format); @@ -300,6 +308,14 @@ public: explicit Framebuffer(TextureCacheRuntime&, std::span color_buffers, ImageView* depth_buffer, const VideoCommon::RenderTargets& key); + ~Framebuffer(); + + Framebuffer(const Framebuffer&) = delete; + Framebuffer& operator=(const Framebuffer&) = delete; + + Framebuffer(Framebuffer&&) = default; + Framebuffer& operator=(Framebuffer&&) = default; + [[nodiscard]] GLuint Handle() const noexcept { return framebuffer.handle; } diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index daf26f380..407fd2a15 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1366,6 +1366,8 @@ ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::NullImageViewParams& params) : VideoCommon::ImageViewBase{params} {} +ImageView::~ImageView() = default; + VkImageView ImageView::DepthView() { if (depth_view) { return *depth_view; @@ -1492,6 +1494,8 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, ImageView* color_buffer, CreateFramebuffer(runtime, color_buffers, depth_buffer); } +Framebuffer::~Framebuffer() = default; + void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime, std::span color_buffers, ImageView* depth_buffer) { diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 9d149d306..ff28b4e96 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -162,6 +162,14 @@ public: const VideoCommon::ImageViewInfo&, GPUVAddr); explicit ImageView(TextureCacheRuntime&, const VideoCommon::NullImageViewParams&); + ~ImageView(); + + ImageView(const ImageView&) = delete; + ImageView& operator=(const ImageView&) = delete; + + ImageView(ImageView&&) = default; + ImageView& operator=(ImageView&&) = default; + [[nodiscard]] VkImageView DepthView(); [[nodiscard]] VkImageView StencilView(); @@ -235,6 +243,14 @@ public: explicit Framebuffer(TextureCacheRuntime& runtime, ImageView* color_buffer, ImageView* depth_buffer, VkExtent2D extent); + ~Framebuffer(); + + Framebuffer(const Framebuffer&) = delete; + Framebuffer& operator=(const Framebuffer&) = delete; + + Framebuffer(Framebuffer&&) = default; + Framebuffer& operator=(Framebuffer&&) = default; + void CreateFramebuffer(TextureCacheRuntime& runtime, std::span color_buffers, ImageView* depth_buffer); diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index aec130a32..4d2874bf2 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -1620,6 +1620,9 @@ void TextureCache

::RemoveFramebuffers(std::span removed_vi auto it = framebuffers.begin(); while (it != framebuffers.end()) { if (it->first.Contains(removed_views)) { + auto framebuffer_id = it->second; + ASSERT(framebuffer_id); + sentenced_framebuffers.Push(std::move(slot_framebuffers[framebuffer_id])); it = framebuffers.erase(it); } else { ++it; From 1128cc35b97b70bd597e353a11ca3ec22784a58a Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Wed, 17 Nov 2021 01:40:49 +0100 Subject: [PATCH 155/156] TextureCache: OGL query device memory if possible. --- src/video_core/renderer_opengl/gl_texture_cache.cpp | 12 +++++++++++- src/video_core/renderer_opengl/gl_texture_cache.h | 4 +++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 30dfcfa6a..2f7d98d8b 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -9,8 +9,8 @@ #include +#include "common/literals.h" #include "common/settings.h" - #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state_tracker.h" @@ -42,6 +42,7 @@ using VideoCore::Surface::IsPixelFormatSRGB; using VideoCore::Surface::MaxPixelFormat; using VideoCore::Surface::PixelFormat; using VideoCore::Surface::SurfaceType; +using namespace Common::Literals; struct CopyOrigin { GLint level; @@ -496,6 +497,15 @@ ImageBufferMap TextureCacheRuntime::DownloadStagingBuffer(size_t size) { return download_buffers.RequestMap(size, false); } +u64 TextureCacheRuntime::GetDeviceLocalMemory() const { + if (GLAD_GL_NVX_gpu_memory_info) { + GLint cur_avail_mem_kb = 0; + glGetIntegerv(GL_GPU_MEMORY_INFO_CURRENT_AVAILABLE_VIDMEM_NVX, &cur_avail_mem_kb); + return static_cast(cur_avail_mem_kb) * 1_KiB; + } + return 2_GiB; // Return minimum requirements +} + void TextureCacheRuntime::CopyImage(Image& dst_image, Image& src_image, std::span copies) { const GLuint dst_name = dst_image.Handle(); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index a717cf8c8..1bb762568 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -82,6 +82,8 @@ public: ImageBufferMap DownloadStagingBuffer(size_t size); + u64 GetDeviceLocalMemory() const; + void CopyImage(Image& dst, Image& src, std::span copies); void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view, bool rescaled) { @@ -333,7 +335,7 @@ struct TextureCacheParams { static constexpr bool ENABLE_VALIDATION = true; static constexpr bool FRAMEBUFFER_BLITS = true; static constexpr bool HAS_EMULATED_COPIES = true; - static constexpr bool HAS_DEVICE_MEMORY_INFO = false; + static constexpr bool HAS_DEVICE_MEMORY_INFO = true; using Runtime = OpenGL::TextureCacheRuntime; using Image = OpenGL::Image; From 1c8a3d8d2916e8d43808c9b4e75c756f162890e8 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Wed, 17 Nov 2021 02:22:21 +0100 Subject: [PATCH 156/156] TextureCache: Fix Automatic Anisotropic. --- src/video_core/textures/texture.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/video_core/textures/texture.cpp b/src/video_core/textures/texture.cpp index ba066f98f..06954963d 100644 --- a/src/video_core/textures/texture.cpp +++ b/src/video_core/textures/texture.cpp @@ -66,15 +66,14 @@ float TSCEntry::MaxAnisotropy() const noexcept { return 1.0f; } const auto anisotropic_settings = Settings::values.max_anisotropy.GetValue(); - u32 new_max_anisotropic{}; + u32 added_anisotropic{}; if (anisotropic_settings == 0) { - const auto anisotropic_based_onscale = Settings::values.resolution_info.up_scale >> - Settings::values.resolution_info.down_shift; - new_max_anisotropic = std::max(anisotropic_based_onscale + 1U, 1U); + added_anisotropic = Settings::values.resolution_info.up_scale >> + Settings::values.resolution_info.down_shift; } else { - new_max_anisotropic = Settings::values.max_anisotropy.GetValue(); + added_anisotropic = Settings::values.max_anisotropy.GetValue() - 1U; } - return static_cast(1U << std::min(max_anisotropy + anisotropic_settings - 1, 31U)); + return static_cast(1U << (max_anisotropy + added_anisotropic)); } } // namespace Tegra::Texture