From 82107b33a2251eb4f55ab2006a8fc0cb47cc39e8 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sun, 25 Jun 2023 18:43:23 -0400 Subject: [PATCH 1/4] OpenGL: Add Local Memory warmup shader --- src/video_core/host_shaders/CMakeLists.txt | 1 + .../host_shaders/opengl_lmem_warmup.comp | 47 +++++++++++++++++++ .../renderer_opengl/gl_rasterizer.cpp | 2 + .../renderer_opengl/gl_shader_manager.cpp | 10 +++- .../renderer_opengl/gl_shader_manager.h | 3 ++ 5 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 src/video_core/host_shaders/opengl_lmem_warmup.comp diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 2442c3c29..e61d9af80 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -33,6 +33,7 @@ set(SHADER_FILES opengl_fidelityfx_fsr.frag opengl_fidelityfx_fsr_easu.frag opengl_fidelityfx_fsr_rcas.frag + opengl_lmem_warmup.comp opengl_present.frag opengl_present.vert opengl_present_scaleforce.frag diff --git a/src/video_core/host_shaders/opengl_lmem_warmup.comp b/src/video_core/host_shaders/opengl_lmem_warmup.comp new file mode 100644 index 000000000..518268477 --- /dev/null +++ b/src/video_core/host_shaders/opengl_lmem_warmup.comp @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +// This shader is a workaround for a quirk in NVIDIA OpenGL drivers +// Shaders using local memory see a great performance benefit if a shader that was dispatched +// before it had more local memory allocated. +// This shader allocates the maximum local memory allowed on NVIDIA drivers to ensure that +// subsequent shaders see the performance boost. + +// NOTE: This shader does no actual meaningful work and returns immediately, +// it is simply a means to have the driver expect a shader using lots of local memory. + +#version 450 + +layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; + +layout(location = 0) uniform uint uniform_data; + +layout(binding = 0, rgba8) uniform writeonly restrict image2DArray dest_image; + +#define MAX_LMEM_SIZE 4080 // Size chosen to avoid errors in Nvidia's GLSL compiler +#define NUM_LMEM_CONSTANTS 1 +#define ARRAY_SIZE MAX_LMEM_SIZE - NUM_LMEM_CONSTANTS + +uint lmem_0[ARRAY_SIZE]; +const uvec4 constant_values[NUM_LMEM_CONSTANTS] = uvec4[](uvec4(0)); + +void main() { + const uint global_id = gl_GlobalInvocationID.x; + if (global_id <= 128) { + // Since the shader is called with a dispatch of 1x1x1 + // This should always be the case, and this shader will not actually execute + return; + } + for (uint t = 0; t < uniform_data; t++) { + const uint offset = (t * uniform_data); + lmem_0[offset] = t; + } + const uint offset = (gl_GlobalInvocationID.y * uniform_data + gl_GlobalInvocationID.x); + const uint value = lmem_0[offset]; + const uint const_value = constant_values[offset / 4][offset % 4]; + const uvec4 color = uvec4(value + const_value); + + // A "side-effect" is needed so the variables don't get optimized out, + // but this should never execute so there should be no clobbering of previously bound state. + imageStore(dest_image, ivec3(gl_GlobalInvocationID), color); +} diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index fc711c44a..d03288516 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -222,6 +222,7 @@ void RasterizerOpenGL::PrepareDraw(bool is_indexed, Func&& draw_func) { gpu.TickWork(); std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; + program_manager.LocalMemoryWarmup(); pipeline->SetEngine(maxwell3d, gpu_memory); pipeline->Configure(is_indexed); @@ -371,6 +372,7 @@ void RasterizerOpenGL::DispatchCompute() { if (!pipeline) { return; } + program_manager.LocalMemoryWarmup(); pipeline->SetEngine(kepler_compute, gpu_memory); pipeline->Configure(); const auto& qmd{kepler_compute->launch_description}; diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 98841ae65..2f6ba6823 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -3,7 +3,9 @@ #include +#include "video_core/host_shaders/opengl_lmem_warmup_comp.h" #include "video_core/renderer_opengl/gl_shader_manager.h" +#include "video_core/renderer_opengl/gl_shader_util.h" namespace OpenGL { @@ -12,7 +14,8 @@ static constexpr std::array ASSEMBLY_PROGRAM_ENUMS{ GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV, }; -ProgramManager::ProgramManager(const Device& device) { +ProgramManager::ProgramManager(const Device& device) + : lmem_warmup_program(CreateProgram(HostShaders::OPENGL_LMEM_WARMUP_COMP, GL_COMPUTE_SHADER)) { glCreateProgramPipelines(1, &pipeline.handle); if (device.UseAssemblyShaders()) { glEnable(GL_COMPUTE_PROGRAM_NV); @@ -98,6 +101,11 @@ void ProgramManager::BindAssemblyPrograms(std::span current_programs{}; GLuint current_assembly_compute_program = 0; + OGLProgram lmem_warmup_program; }; } // namespace OpenGL From b198339580f1a54c4c670eb58593eb64e2ef945c Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sun, 25 Jun 2023 18:43:52 -0400 Subject: [PATCH 2/4] emit_glasm: Fix lmem size computation --- src/shader_recompiler/backend/glasm/emit_glasm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/shader_recompiler/backend/glasm/emit_glasm.cpp b/src/shader_recompiler/backend/glasm/emit_glasm.cpp index fd4a61a4d..b795c0179 100644 --- a/src/shader_recompiler/backend/glasm/emit_glasm.cpp +++ b/src/shader_recompiler/backend/glasm/emit_glasm.cpp @@ -461,7 +461,7 @@ std::string EmitGLASM(const Profile& profile, const RuntimeInfo& runtime_info, I header += fmt::format("R{},", index); } if (program.local_memory_size > 0) { - header += fmt::format("lmem[{}],", program.local_memory_size); + header += fmt::format("lmem[{}],", Common::DivCeil(program.local_memory_size, 4U)); } if (program.info.uses_fswzadd) { header += "FSWZA[4],FSWZB[4],"; From 405eae3734dd6bfb259df0afceecf4de1f1262ce Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sun, 25 Jun 2023 18:59:33 -0400 Subject: [PATCH 3/4] shaders: Track local memory usage --- src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp | 4 ++++ src/shader_recompiler/shader_info.h | 1 + src/video_core/renderer_opengl/gl_compute_pipeline.cpp | 1 + src/video_core/renderer_opengl/gl_compute_pipeline.h | 5 +++++ src/video_core/renderer_opengl/gl_graphics_pipeline.cpp | 1 + src/video_core/renderer_opengl/gl_graphics_pipeline.h | 5 +++++ src/video_core/renderer_opengl/gl_rasterizer.cpp | 8 ++++++-- 7 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp index 5a4195217..70292686f 100644 --- a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp +++ b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp @@ -424,6 +424,10 @@ void VisitUsages(Info& info, IR::Inst& inst) { info.used_constant_buffer_types |= IR::Type::U32 | IR::Type::U32x2; info.used_storage_buffer_types |= IR::Type::U32 | IR::Type::U32x2 | IR::Type::U32x4; break; + case IR::Opcode::LoadLocal: + case IR::Opcode::WriteLocal: + info.uses_local_memory = true; + break; default: break; } diff --git a/src/shader_recompiler/shader_info.h b/src/shader_recompiler/shader_info.h index d308db942..b4b4afd37 100644 --- a/src/shader_recompiler/shader_info.h +++ b/src/shader_recompiler/shader_info.h @@ -172,6 +172,7 @@ struct Info { bool stores_indexed_attributes{}; bool stores_global_memory{}; + bool uses_local_memory{}; bool uses_fp16{}; bool uses_fp64{}; diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp index 3151c0db8..f9ca55c36 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp @@ -63,6 +63,7 @@ ComputePipeline::ComputePipeline(const Device& device, TextureCache& texture_cac writes_global_memory = !use_storage_buffers && std::ranges::any_of(info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; }); + uses_local_memory = info.uses_local_memory; if (force_context_flush) { std::scoped_lock lock{built_mutex}; built_fence.Create(); diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.h b/src/video_core/renderer_opengl/gl_compute_pipeline.h index 9bcc72b59..c26b4fa5e 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.h +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.h @@ -59,6 +59,10 @@ public: return writes_global_memory; } + [[nodiscard]] bool UsesLocalMemory() const noexcept { + return uses_local_memory; + } + void SetEngine(Tegra::Engines::KeplerCompute* kepler_compute_, Tegra::MemoryManager* gpu_memory_) { kepler_compute = kepler_compute_; @@ -84,6 +88,7 @@ private: bool use_storage_buffers{}; bool writes_global_memory{}; + bool uses_local_memory{}; std::mutex built_mutex; std::condition_variable built_condvar; diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index c58f760b8..23a48c6fe 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp @@ -215,6 +215,7 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c writes_global_memory |= std::ranges::any_of( info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; }); + uses_local_memory |= info.uses_local_memory; } ASSERT(num_textures <= MAX_TEXTURES); ASSERT(num_images <= MAX_IMAGES); diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.h b/src/video_core/renderer_opengl/gl_graphics_pipeline.h index 7bab3be0a..7b3d7eae8 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.h +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.h @@ -98,6 +98,10 @@ public: return writes_global_memory; } + [[nodiscard]] bool UsesLocalMemory() const noexcept { + return uses_local_memory; + } + [[nodiscard]] bool IsBuilt() noexcept; template @@ -146,6 +150,7 @@ private: bool use_storage_buffers{}; bool writes_global_memory{}; + bool uses_local_memory{}; static constexpr std::size_t XFB_ENTRY_STRIDE = 3; GLsizei num_xfb_attribs{}; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index d03288516..edf527f2d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -222,7 +222,9 @@ void RasterizerOpenGL::PrepareDraw(bool is_indexed, Func&& draw_func) { gpu.TickWork(); std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; - program_manager.LocalMemoryWarmup(); + if (pipeline->UsesLocalMemory()) { + program_manager.LocalMemoryWarmup(); + } pipeline->SetEngine(maxwell3d, gpu_memory); pipeline->Configure(is_indexed); @@ -372,7 +374,9 @@ void RasterizerOpenGL::DispatchCompute() { if (!pipeline) { return; } - program_manager.LocalMemoryWarmup(); + if (pipeline->UsesLocalMemory()) { + program_manager.LocalMemoryWarmup(); + } pipeline->SetEngine(kepler_compute, gpu_memory); pipeline->Configure(); const auto& qmd{kepler_compute->launch_description}; From 4f160633d369b702a45ace9b6ff133312761c5f8 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sun, 25 Jun 2023 19:06:51 -0400 Subject: [PATCH 4/4] OpenGL: Limit lmem warmup to NVIDIA :frog: --- src/video_core/renderer_opengl/gl_device.cpp | 1 + src/video_core/renderer_opengl/gl_device.h | 5 +++++ .../renderer_opengl/gl_shader_manager.cpp | 13 +++++++++---- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 03d234f2f..33e63c17d 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -194,6 +194,7 @@ Device::Device(Core::Frontend::EmuWindow& emu_window) { has_bool_ref_bug = true; } } + has_lmem_perf_bug = is_nvidia; strict_context_required = emu_window.StrictContextRequired(); // Blocks AMD and Intel OpenGL drivers on Windows from using asynchronous shader compilation. diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index ad27264e5..a5a6bbbba 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -192,6 +192,10 @@ public: return supports_conditional_barriers; } + bool HasLmemPerfBug() const { + return has_lmem_perf_bug; + } + private: static bool TestVariableAoffi(); static bool TestPreciseBug(); @@ -238,6 +242,7 @@ private: bool can_report_memory{}; bool strict_context_required{}; bool supports_conditional_barriers{}; + bool has_lmem_perf_bug{}; std::string vendor_name; }; diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 2f6ba6823..03d4b9d06 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -14,12 +14,15 @@ static constexpr std::array ASSEMBLY_PROGRAM_ENUMS{ GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV, }; -ProgramManager::ProgramManager(const Device& device) - : lmem_warmup_program(CreateProgram(HostShaders::OPENGL_LMEM_WARMUP_COMP, GL_COMPUTE_SHADER)) { +ProgramManager::ProgramManager(const Device& device) { glCreateProgramPipelines(1, &pipeline.handle); if (device.UseAssemblyShaders()) { glEnable(GL_COMPUTE_PROGRAM_NV); } + if (device.HasLmemPerfBug()) { + lmem_warmup_program = + CreateProgram(HostShaders::OPENGL_LMEM_WARMUP_COMP, GL_COMPUTE_SHADER); + } } void ProgramManager::BindComputeProgram(GLuint program) { @@ -102,8 +105,10 @@ void ProgramManager::BindAssemblyPrograms(std::span