From 76ca2a5f82f4df64cb839af42c93acb6705411ae Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sat, 2 Nov 2019 04:08:31 -0300 Subject: [PATCH 1/2] gl_rasterizer: Upload constant buffers with glNamedBufferSubData Nvidia's OpenGL driver maps gl(Named)BufferSubData with some requirements to a fast. This path has an extra memcpy but updates the buffer without orphaning or waiting for previous calls. It can be seen as a better model for "push constants" that can upload a whole UBO instead of 256 bytes. This path has some requirements established here: http://on-demand.gputechconf.com/gtc/2014/presentations/S4379-opengl-44-scene-rendering-techniques.pdf#page=24 Instead of using the stream buffer, this commits moves constant buffers uploads to calls of glNamedBufferSubData and from my testing it brings a performance improvement. This is disabled when the vendor is not Nvidia since it brings performance regressions. --- src/video_core/buffer_cache/buffer_cache.h | 14 +++++++-- .../renderer_opengl/gl_buffer_cache.cpp | 31 ++++++++++++++++--- .../renderer_opengl/gl_buffer_cache.h | 20 ++++++++++-- src/video_core/renderer_opengl/gl_device.cpp | 4 +++ src/video_core/renderer_opengl/gl_device.h | 5 +++ .../renderer_opengl/gl_rasterizer.cpp | 29 +++++++++++------ 6 files changed, 84 insertions(+), 19 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 2442ddfd6..63b3a8205 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -30,7 +30,7 @@ public: using BufferInfo = std::pair; BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, - bool is_written = false) { + bool is_written = false, bool use_fast_cbuf = false) { std::lock_guard lock{mutex}; auto& memory_manager = system.GPU().MemoryManager(); @@ -43,9 +43,13 @@ public: // Cache management is a big overhead, so only cache entries with a given size. // TODO: Figure out which size is the best for given games. constexpr std::size_t max_stream_size = 0x800; - if (size < max_stream_size) { + if (use_fast_cbuf || size < max_stream_size) { if (!is_written && !IsRegionWritten(cache_addr, cache_addr + size - 1)) { - return StreamBufferUpload(host_ptr, size, alignment); + if (use_fast_cbuf) { + return ConstBufferUpload(host_ptr, size); + } else { + return StreamBufferUpload(host_ptr, size, alignment); + } } } @@ -152,6 +156,10 @@ protected: virtual void CopyBlock(const TBuffer& src, const TBuffer& dst, std::size_t src_offset, std::size_t dst_offset, std::size_t size) = 0; + virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { + return {}; + } + /// Register an object into the cache void Register(const MapInterval& new_map, bool inherit_written = false) { const CacheAddr cache_ptr = new_map->GetStart(); diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index f8a807c84..0375fca17 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -8,13 +8,17 @@ #include "common/assert.h" #include "common/microprofile.h" +#include "video_core/engines/maxwell_3d.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_resource_manager.h" namespace OpenGL { +using Maxwell = Tegra::Engines::Maxwell3D::Regs; + MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t size) @@ -26,11 +30,22 @@ CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t siz CachedBufferBlock::~CachedBufferBlock() = default; OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, - std::size_t stream_size) - : VideoCommon::BufferCache{ - rasterizer, system, std::make_unique(stream_size, true)} {} + const Device& device, std::size_t stream_size) + : GenericBufferCache{rasterizer, system, std::make_unique(stream_size, true)} { + if (!device.HasFastBufferSubData()) { + return; + } -OGLBufferCache::~OGLBufferCache() = default; + static constexpr auto size = static_cast(Maxwell::MaxConstBufferSize); + glCreateBuffers(static_cast(std::size(cbufs)), std::data(cbufs)); + for (const GLuint cbuf : cbufs) { + glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW); + } +} + +OGLBufferCache::~OGLBufferCache() { + glDeleteBuffers(static_cast(std::size(cbufs)), std::data(cbufs)); +} Buffer OGLBufferCache::CreateBlock(CacheAddr cache_addr, std::size_t size) { return std::make_shared(cache_addr, size); @@ -69,4 +84,12 @@ void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t static_cast(size)); } +OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer, + std::size_t size) { + DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); + const GLuint& cbuf = cbufs[cbuf_cursor++]; + glNamedBufferSubData(cbuf, 0, static_cast(size), raw_pointer); + return {&cbuf, 0}; +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index 022e7bfa9..8c7145443 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -4,10 +4,12 @@ #pragma once +#include #include #include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/engines/maxwell_3d.h" #include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" @@ -18,12 +20,14 @@ class System; namespace OpenGL { +class Device; class OGLStreamBuffer; class RasterizerOpenGL; class CachedBufferBlock; using Buffer = std::shared_ptr; +using GenericBufferCache = VideoCommon::BufferCache; class CachedBufferBlock : public VideoCommon::BufferBlock { public: @@ -38,14 +42,18 @@ private: OGLBuffer gl_buffer{}; }; -class OGLBufferCache final : public VideoCommon::BufferCache { +class OGLBufferCache final : public GenericBufferCache { public: explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, - std::size_t stream_size); + const Device& device, std::size_t stream_size); ~OGLBufferCache(); const GLuint* GetEmptyBuffer(std::size_t) override; + void Acquire() noexcept { + cbuf_cursor = 0; + } + protected: Buffer CreateBlock(CacheAddr cache_addr, std::size_t size) override; @@ -61,6 +69,14 @@ protected: void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, std::size_t dst_offset, std::size_t size) override; + + BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override; + +private: + std::size_t cbuf_cursor = 0; + std::array + cbufs; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 64de7e425..c65b24c69 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -51,8 +51,11 @@ bool HasExtension(const std::vector& images, std::string_view } // Anonymous namespace Device::Device() { + const std::string_view vendor = reinterpret_cast(glGetString(GL_VENDOR)); const std::vector extensions = GetExtensions(); + const bool is_nvidia = vendor == "NVIDIA Corporation"; + uniform_buffer_alignment = GetInteger(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); shader_storage_alignment = GetInteger(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); max_vertex_attributes = GetInteger(GL_MAX_VERTEX_ATTRIBS); @@ -64,6 +67,7 @@ Device::Device() { has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = TestComponentIndexingBug(); has_precise_bug = TestPreciseBug(); + has_fast_buffer_sub_data = is_nvidia; LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi); LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index bb273c3d6..bf35bd0b6 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -54,6 +54,10 @@ public: return has_precise_bug; } + bool HasFastBufferSubData() const { + return has_fast_buffer_sub_data; + } + private: static bool TestVariableAoffi(); static bool TestComponentIndexingBug(); @@ -69,6 +73,7 @@ private: bool has_variable_aoffi{}; bool has_component_indexing_bug{}; bool has_precise_bug{}; + bool has_fast_buffer_sub_data{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 6a4d2c83a..28fa8a8be 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -67,7 +67,7 @@ static std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buf RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, ScreenInfo& info) : texture_cache{system, *this, device}, shader_cache{*this, system, emu_window, device}, - system{system}, screen_info{info}, buffer_cache{*this, system, STREAM_BUFFER_SIZE} { + system{system}, screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { shader_program_manager = std::make_unique(); state.draw.shader_program = 0; state.Apply(); @@ -558,6 +558,8 @@ void RasterizerOpenGL::DrawPrelude() { SyncPolygonOffset(); SyncAlphaTest(); + buffer_cache.Acquire(); + // Draw the vertex batch const bool is_indexed = accelerate_draw == AccelDraw::Indexed; @@ -573,9 +575,11 @@ void RasterizerOpenGL::DrawPrelude() { (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage; - // Add space for at least 18 constant buffers - buffer_size += Maxwell::MaxConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); + if (!device.HasFastBufferSubData()) { + // Add space for at least 18 constant buffers + buffer_size += Maxwell::MaxConstBuffers * + (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); + } // Prepare the vertex array. buffer_cache.Map(buffer_size); @@ -739,10 +743,12 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { state.draw.shader_program = program; state.draw.program_pipeline = 0; - const std::size_t buffer_size = - Tegra::Engines::KeplerCompute::NumConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); - buffer_cache.Map(buffer_size); + if (!device.HasFastBufferSubData()) { + const std::size_t buffer_size = + Tegra::Engines::KeplerCompute::NumConstBuffers * + (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); + buffer_cache.Map(buffer_size); + } bind_ubo_pushbuffer.Setup(0); bind_ssbo_pushbuffer.Setup(0); @@ -750,7 +756,9 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { SetupComputeConstBuffers(kernel); SetupComputeGlobalMemory(kernel); - buffer_cache.Unmap(); + if (!device.HasFastBufferSubData()) { + buffer_cache.Unmap(); + } bind_ubo_pushbuffer.Bind(); bind_ssbo_pushbuffer.Bind(); @@ -879,7 +887,8 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); const auto alignment = device.GetUniformBufferAlignment(); - const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment); + const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false, + device.HasFastBufferSubData()); bind_ubo_pushbuffer.Push(cbuf, offset, size); } From 442a1cc0211131cb237b5291fd49dbd2f37399e9 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sat, 2 Nov 2019 13:19:07 -0300 Subject: [PATCH 2/2] gl_rasterizer: Re-enable stream buffer memory due to global memory Global memory is still using the stream buffer when it shouldn't. As a temporary fix re-enable the stream buffer on compute. --- .../renderer_opengl/gl_rasterizer.cpp | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 28fa8a8be..e9abd901e 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -575,11 +575,9 @@ void RasterizerOpenGL::DrawPrelude() { (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage; - if (!device.HasFastBufferSubData()) { - // Add space for at least 18 constant buffers - buffer_size += Maxwell::MaxConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); - } + // Add space for at least 18 constant buffers + buffer_size += Maxwell::MaxConstBuffers * + (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); // Prepare the vertex array. buffer_cache.Map(buffer_size); @@ -743,12 +741,10 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { state.draw.shader_program = program; state.draw.program_pipeline = 0; - if (!device.HasFastBufferSubData()) { - const std::size_t buffer_size = - Tegra::Engines::KeplerCompute::NumConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); - buffer_cache.Map(buffer_size); - } + const std::size_t buffer_size = + Tegra::Engines::KeplerCompute::NumConstBuffers * + (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); + buffer_cache.Map(buffer_size); bind_ubo_pushbuffer.Setup(0); bind_ssbo_pushbuffer.Setup(0); @@ -756,9 +752,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { SetupComputeConstBuffers(kernel); SetupComputeGlobalMemory(kernel); - if (!device.HasFastBufferSubData()) { - buffer_cache.Unmap(); - } + buffer_cache.Unmap(); bind_ubo_pushbuffer.Bind(); bind_ssbo_pushbuffer.Bind();