From 7b574f406b25c02a0e0efd8b7ec13d68ecb55497 Mon Sep 17 00:00:00 2001 From: bunnei Date: Wed, 23 Jan 2019 22:17:55 -0500 Subject: [PATCH] gpu: Move command processing to another thread. --- .../service/nvdrv/devices/nvhost_as_gpu.cpp | 2 +- src/core/memory.cpp | 8 +- src/video_core/CMakeLists.txt | 2 + src/video_core/engines/kepler_memory.cpp | 2 +- src/video_core/engines/maxwell_dma.cpp | 4 +- src/video_core/gpu.cpp | 44 ++++- src/video_core/gpu.h | 22 ++- src/video_core/gpu_thread.cpp | 154 ++++++++++++++++++ src/video_core/gpu_thread.h | 135 +++++++++++++++ 9 files changed, 358 insertions(+), 15 deletions(-) create mode 100644 src/video_core/gpu_thread.cpp create mode 100644 src/video_core/gpu_thread.h diff --git a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp index 466db7ccd..a34b9e753 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp @@ -178,7 +178,7 @@ u32 nvhost_as_gpu::UnmapBuffer(const std::vector& input, std::vector& ou auto& gpu = system_instance.GPU(); auto cpu_addr = gpu.MemoryManager().GpuToCpuAddress(params.offset); ASSERT(cpu_addr); - system_instance.Renderer().Rasterizer().FlushAndInvalidateRegion(*cpu_addr, itr->second.size); + gpu.FlushAndInvalidateRegion(*cpu_addr, itr->second.size); params.offset = gpu.MemoryManager().UnmapBuffer(params.offset, itr->second.size); diff --git a/src/core/memory.cpp b/src/core/memory.cpp index ec279cef8..6591c45d2 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -356,16 +356,16 @@ void RasterizerFlushVirtualRegion(VAddr start, u64 size, FlushMode mode) { const VAddr overlap_end = std::min(end, region_end); const VAddr overlap_size = overlap_end - overlap_start; - auto& rasterizer = system_instance.Renderer().Rasterizer(); + auto& gpu = system_instance.GPU(); switch (mode) { case FlushMode::Flush: - rasterizer.FlushRegion(overlap_start, overlap_size); + gpu.FlushRegion(overlap_start, overlap_size); break; case FlushMode::Invalidate: - rasterizer.InvalidateRegion(overlap_start, overlap_size); + gpu.InvalidateRegion(overlap_start, overlap_size); break; case FlushMode::FlushAndInvalidate: - rasterizer.FlushAndInvalidateRegion(overlap_start, overlap_size); + gpu.FlushAndInvalidateRegion(overlap_start, overlap_size); break; } }; diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 3e9d2b3be..3bb5d0ed7 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -17,6 +17,8 @@ add_library(video_core STATIC engines/shader_header.h gpu.cpp gpu.h + gpu_thread.cpp + gpu_thread.h macro_interpreter.cpp macro_interpreter.h memory_manager.cpp diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 4f6126116..aae2a4019 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -48,7 +48,7 @@ void KeplerMemory::ProcessData(u32 data) { // We have to invalidate the destination region to evict any outdated surfaces from the cache. // We do this before actually writing the new data because the destination address might contain // a dirty surface that will have to be written back to memory. - rasterizer.InvalidateRegion(*dest_address, sizeof(u32)); + Core::System::GetInstance().GPU().InvalidateRegion(*dest_address, sizeof(u32)); Memory::Write32(*dest_address, data); system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 0474c7ba3..9dfea5999 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -92,12 +92,12 @@ void MaxwellDMA::HandleCopy() { const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) { // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated // copying. - rasterizer.FlushRegion(*source_cpu, src_size); + Core::System::GetInstance().GPU().FlushRegion(*source_cpu, src_size); // We have to invalidate the destination region to evict any outdated surfaces from the // cache. We do this before actually writing the new data because the destination address // might contain a dirty surface that will have to be written back to memory. - rasterizer.InvalidateRegion(*dest_cpu, dst_size); + Core::System::GetInstance().GPU().InvalidateRegion(*dest_cpu, dst_size); }; if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) { diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index b0f3310e5..0d7a052dd 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -6,12 +6,14 @@ #include "core/core.h" #include "core/core_timing.h" #include "core/memory.h" +#include "core/settings.h" #include "video_core/engines/fermi_2d.h" #include "video_core/engines/kepler_compute.h" #include "video_core/engines/kepler_memory.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/maxwell_dma.h" #include "video_core/gpu.h" +#include "video_core/gpu_thread.h" #include "video_core/renderer_base.h" namespace Tegra { @@ -37,6 +39,10 @@ GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{ren kepler_compute = std::make_unique(*memory_manager); maxwell_dma = std::make_unique(system, rasterizer, *memory_manager); kepler_memory = std::make_unique(system, rasterizer, *memory_manager); + + if (Settings::values.use_asynchronous_gpu_emulation) { + gpu_thread = std::make_unique(renderer, *dma_pusher); + } } GPU::~GPU() = default; @@ -66,13 +72,45 @@ const DmaPusher& GPU::DmaPusher() const { } void GPU::PushGPUEntries(Tegra::CommandList&& entries) { - dma_pusher->Push(std::move(entries)); - dma_pusher->DispatchCalls(); + if (Settings::values.use_asynchronous_gpu_emulation) { + gpu_thread->SubmitList(std::move(entries)); + } else { + dma_pusher->Push(std::move(entries)); + dma_pusher->DispatchCalls(); + } } void GPU::SwapBuffers( std::optional> framebuffer) { - renderer.SwapBuffers(std::move(framebuffer)); + if (Settings::values.use_asynchronous_gpu_emulation) { + gpu_thread->SwapBuffers(std::move(framebuffer)); + } else { + renderer.SwapBuffers(std::move(framebuffer)); + } +} + +void GPU::FlushRegion(VAddr addr, u64 size) { + if (Settings::values.use_asynchronous_gpu_emulation) { + gpu_thread->FlushRegion(addr, size); + } else { + renderer.Rasterizer().FlushRegion(addr, size); + } +} + +void GPU::InvalidateRegion(VAddr addr, u64 size) { + if (Settings::values.use_asynchronous_gpu_emulation) { + gpu_thread->InvalidateRegion(addr, size); + } else { + renderer.Rasterizer().InvalidateRegion(addr, size); + } +} + +void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) { + if (Settings::values.use_asynchronous_gpu_emulation) { + gpu_thread->FlushAndInvalidateRegion(addr, size); + } else { + renderer.Rasterizer().FlushAndInvalidateRegion(addr, size); + } } u32 RenderTargetBytesPerPixel(RenderTargetFormat format) { diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 62649bd6e..3f3098bf1 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -19,6 +19,10 @@ namespace VideoCore { class RendererBase; } // namespace VideoCore +namespace VideoCommon::GPUThread { +class ThreadManager; +} // namespace VideoCommon::GPUThread + namespace Tegra { enum class RenderTargetFormat : u32 { @@ -200,7 +204,7 @@ public: std::array reg_array; }; } regs{}; - + /// Push GPU command entries to be processed void PushGPUEntries(Tegra::CommandList&& entries); @@ -208,6 +212,15 @@ public: void SwapBuffers( std::optional> framebuffer); + /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory + void FlushRegion(VAddr addr, u64 size); + + /// Notify rasterizer that any caches of the specified region should be invalidated + void InvalidateRegion(VAddr addr, u64 size); + + /// Notify rasterizer that any caches of the specified region should be flushed and invalidated + void FlushAndInvalidateRegion(VAddr addr, u64 size); + private: void ProcessBindMethod(const MethodCall& method_call); void ProcessSemaphoreTriggerMethod(); @@ -216,17 +229,18 @@ private: /// Calls a GPU puller method. void CallPullerMethod(const MethodCall& method_call); - + /// Calls a GPU engine method. void CallEngineMethod(const MethodCall& method_call); - + /// Determines where the method should be executed. bool ExecuteMethodOnEngine(const MethodCall& method_call); private: std::unique_ptr dma_pusher; std::unique_ptr memory_manager; - + std::unique_ptr gpu_thread; + VideoCore::RendererBase& renderer; /// Mapping of command subchannels to their bound engine ids. diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp new file mode 100644 index 000000000..22c4cca4d --- /dev/null +++ b/src/video_core/gpu_thread.cpp @@ -0,0 +1,154 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "common/microprofile.h" +#include "core/frontend/scope_acquire_window_context.h" +#include "core/settings.h" +#include "video_core/dma_pusher.h" +#include "video_core/gpu.h" +#include "video_core/gpu_thread.h" +#include "video_core/renderer_base.h" + +namespace VideoCommon::GPUThread { + +/// Executes a single GPU thread command +static void ExecuteCommand(CommandData* command, VideoCore::RendererBase& renderer, + Tegra::DmaPusher& dma_pusher) { + if (const auto submit_list = std::get_if(command)) { + dma_pusher.Push(std::move(submit_list->entries)); + dma_pusher.DispatchCalls(); + } else if (const auto data = std::get_if(command)) { + renderer.SwapBuffers(data->framebuffer); + } else if (const auto data = std::get_if(command)) { + renderer.Rasterizer().FlushRegion(data->addr, data->size); + } else if (const auto data = std::get_if(command)) { + renderer.Rasterizer().InvalidateRegion(data->addr, data->size); + } else if (const auto data = std::get_if(command)) { + renderer.Rasterizer().FlushAndInvalidateRegion(data->addr, data->size); + } else { + UNREACHABLE(); + } +} + +/// Runs the GPU thread +static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher, + SynchState& state) { + + MicroProfileOnThreadCreate("GpuThread"); + + auto WaitForWakeup = [&]() { + std::unique_lock lock{state.signal_mutex}; + state.signal_condition.wait(lock, [&] { return !state.IsIdle() || !state.is_running; }); + }; + + // Wait for first GPU command before acquiring the window context + WaitForWakeup(); + + // If emulation was stopped during disk shader loading, abort before trying to acquire context + if (!state.is_running) { + return; + } + + Core::Frontend::ScopeAcquireWindowContext acquire_context{renderer.GetRenderWindow()}; + + while (state.is_running) { + if (!state.is_running) { + return; + } + + { + // Thread has been woken up, so make the previous write queue the next read queue + std::lock_guard lock{state.signal_mutex}; + std::swap(state.push_queue, state.pop_queue); + } + + // Execute all of the GPU commands + while (!state.pop_queue->empty()) { + ExecuteCommand(&state.pop_queue->front(), renderer, dma_pusher); + state.pop_queue->pop(); + } + + // Signal that the GPU thread has finished processing commands + if (state.IsIdle()) { + state.idle_condition.notify_one(); + } + + // Wait for CPU thread to send more GPU commands + WaitForWakeup(); + } +} + +ThreadManager::ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher) + : renderer{renderer}, dma_pusher{dma_pusher}, thread{RunThread, std::ref(renderer), + std::ref(dma_pusher), std::ref(state)}, + thread_id{thread.get_id()} {} + +ThreadManager::~ThreadManager() { + { + // Notify GPU thread that a shutdown is pending + std::lock_guard lock{state.signal_mutex}; + state.is_running = false; + } + + state.signal_condition.notify_one(); + thread.join(); +} + +void ThreadManager::SubmitList(Tegra::CommandList&& entries) { + if (entries.empty()) { + return; + } + + PushCommand(SubmitListCommand(std::move(entries)), false, false); +} + +void ThreadManager::SwapBuffers( + std::optional> framebuffer) { + PushCommand(SwapBuffersCommand(std::move(framebuffer)), true, false); +} + +void ThreadManager::FlushRegion(VAddr addr, u64 size) { + if (Settings::values.use_accurate_gpu_emulation) { + PushCommand(FlushRegionCommand(addr, size), true, false); + } +} + +void ThreadManager::InvalidateRegion(VAddr addr, u64 size) { + PushCommand(InvalidateRegionCommand(addr, size), true, true); +} + +void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) { + if (Settings::values.use_accurate_gpu_emulation) { + PushCommand(FlushAndInvalidateRegionCommand(addr, size), true, false); + } else { + InvalidateRegion(addr, size); + } +} + +void ThreadManager::PushCommand(CommandData&& command_data, bool wait_for_idle, bool allow_on_cpu) { + { + std::lock_guard lock{state.signal_mutex}; + + if ((allow_on_cpu && state.IsIdle()) || IsGpuThread()) { + // Execute the command synchronously on the current thread + ExecuteCommand(&command_data, renderer, dma_pusher); + return; + } + + // Push the command to the GPU thread + state.push_queue->emplace(command_data); + } + + // Signal the GPU thread that commands are pending + state.signal_condition.notify_one(); + + if (wait_for_idle) { + // Wait for the GPU to be idle (all commands to be executed) + std::unique_lock lock{state.idle_mutex}; + state.idle_condition.wait(lock, [this] { return state.IsIdle(); }); + } +} + +} // namespace VideoCommon::GPUThread diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h new file mode 100644 index 000000000..ad9f9462b --- /dev/null +++ b/src/video_core/gpu_thread.h @@ -0,0 +1,135 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Tegra { +struct FramebufferConfig; +class DmaPusher; +} // namespace Tegra + +namespace VideoCore { +class RendererBase; +} // namespace VideoCore + +namespace VideoCommon::GPUThread { + +/// Command to signal to the GPU thread that a command list is ready for processing +struct SubmitListCommand final { + explicit SubmitListCommand(Tegra::CommandList&& entries) : entries{std::move(entries)} {} + + Tegra::CommandList entries; +}; + +/// Command to signal to the GPU thread that a swap buffers is pending +struct SwapBuffersCommand final { + explicit SwapBuffersCommand(std::optional framebuffer) + : framebuffer{std::move(framebuffer)} {} + + std::optional framebuffer; +}; + +/// Command to signal to the GPU thread to flush a region +struct FlushRegionCommand final { + explicit constexpr FlushRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {} + + const VAddr addr; + const u64 size; +}; + +/// Command to signal to the GPU thread to invalidate a region +struct InvalidateRegionCommand final { + explicit constexpr InvalidateRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {} + + const VAddr addr; + const u64 size; +}; + +/// Command to signal to the GPU thread to flush and invalidate a region +struct FlushAndInvalidateRegionCommand final { + explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr, u64 size) + : addr{addr}, size{size} {} + + const VAddr addr; + const u64 size; +}; + +using CommandData = std::variant; + +/// Struct used to synchronize the GPU thread +struct SynchState final { + std::atomic is_running{true}; + std::condition_variable signal_condition; + std::mutex signal_mutex; + std::condition_variable idle_condition; + std::mutex idle_mutex; + + // We use two queues for sending commands to the GPU thread, one for writing (push_queue) to and + // one for reading from (pop_queue). These are swapped whenever the current pop_queue becomes + // empty. This allows for efficient thread-safe access, as it does not require any copies. + + using CommandQueue = std::queue; + std::array command_queues; + CommandQueue* push_queue{&command_queues[0]}; + CommandQueue* pop_queue{&command_queues[1]}; + + /// Returns true if the GPU thread should be idle, meaning there are no commands to process + bool IsIdle() const { + return command_queues[0].empty() && command_queues[1].empty(); + } +}; + +/// Class used to manage the GPU thread +class ThreadManager final { +public: + explicit ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher); + ~ThreadManager(); + + /// Push GPU command entries to be processed + void SubmitList(Tegra::CommandList&& entries); + + /// Swap buffers (render frame) + void SwapBuffers( + std::optional> framebuffer); + + /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory + void FlushRegion(VAddr addr, u64 size); + + /// Notify rasterizer that any caches of the specified region should be invalidated + void InvalidateRegion(VAddr addr, u64 size); + + /// Notify rasterizer that any caches of the specified region should be flushed and invalidated + void FlushAndInvalidateRegion(VAddr addr, u64 size); + + /// Waits the caller until the GPU thread is idle, used for synchronization + void WaitForIdle(); + +private: + /// Pushes a command to be executed by the GPU thread + void PushCommand(CommandData&& command_data, bool wait_for_idle, bool allow_on_cpu); + + /// Returns true if this is called by the GPU thread + bool IsGpuThread() const { + return std::this_thread::get_id() == thread_id; + } + +private: + SynchState state; + std::thread thread; + std::thread::id thread_id; + VideoCore::RendererBase& renderer; + Tegra::DmaPusher& dma_pusher; +}; + +} // namespace VideoCommon::GPUThread