From 4aec060f6de410698d5b0a5bffd42d4327b258e4 Mon Sep 17 00:00:00 2001
From: Markus Wick <markus@selfnet.de>
Date: Tue, 6 Apr 2021 20:30:22 +0200
Subject: [PATCH 1/4] common/threadsafe_queue: Provide Wait() method.

It shall block until there is something to consume in the queue.

Use it in the GPU thread instead of the spin loop.
The spin loop only runs while booting the emulator, but in BOTW that takes about 1 second.
---
 src/common/threadsafe_queue.h | 10 +++++++++-
 src/video_core/gpu_thread.cpp |  3 +--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/common/threadsafe_queue.h b/src/common/threadsafe_queue.h
index a4647314a..ad04df8ca 100644
--- a/src/common/threadsafe_queue.h
+++ b/src/common/threadsafe_queue.h
@@ -83,11 +83,15 @@ public:
         return true;
     }
 
-    T PopWait() {
+    void Wait() {
         if (Empty()) {
             std::unique_lock lock{cv_mutex};
             cv.wait(lock, [this]() { return !Empty(); });
         }
+    }
+
+    T PopWait() {
+        Wait();
         T t;
         Pop(t);
         return t;
@@ -156,6 +160,10 @@ public:
         return spsc_queue.Pop(t);
     }
 
+    void Wait() {
+        spsc_queue.Wait();
+    }
+
     T PopWait() {
         return spsc_queue.PopWait();
     }
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 99353f15f..cd59a7faf 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -29,8 +29,7 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
     system.RegisterHostThread();
 
     // Wait for first GPU command before acquiring the window context
-    while (state.queue.Empty())
-        ;
+    state.queue.Wait();
 
     // If emulation was stopped during disk shader loading, abort before trying to acquire context
     if (!state.is_running) {

From 5145133a604f626c05f832465ac22019b003c32a Mon Sep 17 00:00:00 2001
From: Markus Wick <markus@selfnet.de>
Date: Wed, 7 Apr 2021 08:42:54 +0200
Subject: [PATCH 2/4] video_core/gpu_thread: Implement a ShutDown method.

This was implicitly done by `is_powered_on = false`, but the explicit method allows us to block until the GPU is actually gone.

This should fix a race condition where the other subsystems are removed while the GPU is still active.
---
 src/core/core.cpp             |  2 +-
 src/video_core/gpu.cpp        |  4 ++--
 src/video_core/gpu.h          |  4 ++--
 src/video_core/gpu_thread.cpp | 26 ++++++++++++++++++--------
 src/video_core/gpu_thread.h   |  7 +++++--
 5 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/src/core/core.cpp b/src/core/core.cpp
index 305f56ff1..56b47e671 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -296,7 +296,7 @@ struct System::Impl {
         exit_lock = false;
 
         if (gpu_core) {
-            gpu_core->WaitIdle();
+            gpu_core->ShutDown();
         }
 
         services.reset();
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index c61f44619..009c6f574 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -517,8 +517,8 @@ void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
     interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
 }
 
-void GPU::WaitIdle() const {
-    gpu_thread.WaitIdle();
+void GPU::ShutDown() {
+    gpu_thread.ShutDown();
 }
 
 void GPU::OnCommandListEnd() {
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index b2ee45496..ecab35d3b 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -219,8 +219,8 @@ public:
         return *shader_notify;
     }
 
-    // Waits for the GPU to finish working
-    void WaitIdle() const;
+    // Stops the GPU execution and waits for the GPU to finish working
+    void ShutDown();
 
     /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
     void WaitFence(u32 syncpoint_id, u32 value);
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index cd59a7faf..6b8f06f78 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -68,13 +68,7 @@ ThreadManager::ThreadManager(Core::System& system_, bool is_async_)
     : system{system_}, is_async{is_async_} {}
 
 ThreadManager::~ThreadManager() {
-    if (!thread.joinable()) {
-        return;
-    }
-
-    // Notify GPU thread that a shutdown is pending
-    PushCommand(EndProcessingCommand());
-    thread.join();
+    ShutDown();
 }
 
 void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
@@ -132,10 +126,26 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
 
 void ThreadManager::WaitIdle() const {
     while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed) &&
-           system.IsPoweredOn()) {
+           state.is_running) {
     }
 }
 
+void ThreadManager::ShutDown() {
+    if (!state.is_running) {
+        return;
+    }
+
+    state.is_running = false;
+
+    if (!thread.joinable()) {
+        return;
+    }
+
+    // Notify GPU thread that a shutdown is pending
+    PushCommand(EndProcessingCommand());
+    thread.join();
+}
+
 void ThreadManager::OnCommandListEnd() {
     PushCommand(OnCommandListEndCommand());
 }
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 18269e51c..d384164de 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -132,8 +132,8 @@ public:
     /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
     void FlushAndInvalidateRegion(VAddr addr, u64 size);
 
-    // Wait until the gpu thread is idle.
-    void WaitIdle() const;
+    // Stops the GPU execution and waits for the GPU to finish working
+    void ShutDown();
 
     void OnCommandListEnd();
 
@@ -141,6 +141,9 @@ private:
     /// Pushes a command to be executed by the GPU thread
     u64 PushCommand(CommandData&& command_data);
 
+    // Wait until the gpu thread is idle.
+    void WaitIdle() const;
+
     Core::System& system;
     const bool is_async;
     VideoCore::RasterizerInterface* rasterizer = nullptr;

From e6fb49fa4bb2864702abcefc14f6bb62eaba7a7e Mon Sep 17 00:00:00 2001
From: Markus Wick <markus@selfnet.de>
Date: Wed, 7 Apr 2021 13:57:49 +0200
Subject: [PATCH 3/4] video_core/gpu_thread: Keep the write lock for allocating
 the fence.

Otherwise the fences might get submitted out-of-order into the queue, which makes testing them pointless.
Overhead should be tiny as the mutex is just moved from the queue to the writing code.
---
 src/video_core/gpu_thread.cpp | 2 ++
 src/video_core/gpu_thread.h   | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 6b8f06f78..9488bf544 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -151,11 +151,13 @@ void ThreadManager::OnCommandListEnd() {
 }
 
 u64 ThreadManager::PushCommand(CommandData&& command_data) {
+    std::unique_lock lk(state.write_lock);
     const u64 fence{++state.last_fence};
     state.queue.Push(CommandDataContainer(std::move(command_data), fence));
 
     if (!is_async) {
         // In synchronous GPU mode, block the caller until the command has executed
+        lk.unlock();
         WaitIdle();
     }
 
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index d384164de..cb901c22a 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -101,7 +101,8 @@ struct CommandDataContainer {
 struct SynchState final {
     std::atomic_bool is_running{true};
 
-    using CommandQueue = Common::MPSCQueue<CommandDataContainer>;
+    using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
+    std::mutex write_lock;
     CommandQueue queue;
     u64 last_fence{};
     std::atomic<u64> signaled_fence{};

From e8bd9aed8bf0f60455d0ae6a8f6f3abf92dd8305 Mon Sep 17 00:00:00 2001
From: Markus Wick <markus@selfnet.de>
Date: Wed, 7 Apr 2021 11:41:31 +0200
Subject: [PATCH 4/4] video_core: Use a CV for blocking commands.

There is no need for a busy loop here. Let's just use a condition variable to save some power.
---
 src/video_core/gpu_thread.cpp | 45 +++++++++++++++++++++--------------
 src/video_core/gpu_thread.h   | 11 ++++-----
 2 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 9488bf544..7addfbc7b 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -56,11 +56,17 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
         } else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) {
             rasterizer->OnCPUWrite(invalidate->addr, invalidate->size);
         } else if (std::holds_alternative<EndProcessingCommand>(next.data)) {
-            return;
+            ASSERT(state.is_running == false);
         } else {
             UNREACHABLE();
         }
         state.signaled_fence.store(next.fence);
+        if (next.block) {
+            // We have to take the write_lock so that this notification cannot slip in between
+            // the waiter's predicate check and its wait on the condition_variable.
+            std::lock_guard lk(state.write_lock);
+            state.cv.notify_all();
+        }
     }
 }
 
@@ -105,9 +111,8 @@ void ThreadManager::FlushRegion(VAddr addr, u64 size) {
     case Settings::GPUAccuracy::Extreme: {
         auto& gpu = system.GPU();
         u64 fence = gpu.RequestFlush(addr, size);
-        PushCommand(GPUTickCommand());
-        while (fence > gpu.CurrentFlushRequestFence()) {
-        }
+        PushCommand(GPUTickCommand(), true);
+        ASSERT(fence <= gpu.CurrentFlushRequestFence());
         break;
     }
     default:
@@ -124,18 +129,16 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
     rasterizer->OnCPUWrite(addr, size);
 }
 
-void ThreadManager::WaitIdle() const {
-    while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed) &&
-           state.is_running) {
-    }
-}
-
 void ThreadManager::ShutDown() {
     if (!state.is_running) {
         return;
     }
 
-    state.is_running = false;
+    {
+        std::lock_guard lk(state.write_lock);
+        state.is_running = false;
+        state.cv.notify_all();
+    }
 
     if (!thread.joinable()) {
         return;
@@ -150,15 +153,21 @@ void ThreadManager::OnCommandListEnd() {
     PushCommand(OnCommandListEndCommand());
 }
 
-u64 ThreadManager::PushCommand(CommandData&& command_data) {
-    std::unique_lock lk(state.write_lock);
-    const u64 fence{++state.last_fence};
-    state.queue.Push(CommandDataContainer(std::move(command_data), fence));
-
+u64 ThreadManager::PushCommand(CommandData&& command_data, bool block) {
     if (!is_async) {
         // In synchronous GPU mode, block the caller until the command has executed
-        lk.unlock();
-        WaitIdle();
+        block = true;
+    }
+
+    std::unique_lock lk(state.write_lock);
+    const u64 fence{++state.last_fence};
+    state.queue.Push(CommandDataContainer(std::move(command_data), fence, block));
+
+    if (block) {
+        state.cv.wait(lk, [this, fence] {
+            return fence <= state.signaled_fence.load(std::memory_order_relaxed) ||
+                   !state.is_running;
+        });
     }
 
     return fence;
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index cb901c22a..11a648f38 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -90,11 +90,12 @@ using CommandData =
 struct CommandDataContainer {
     CommandDataContainer() = default;
 
-    explicit CommandDataContainer(CommandData&& data_, u64 next_fence_)
-        : data{std::move(data_)}, fence{next_fence_} {}
+    explicit CommandDataContainer(CommandData&& data_, u64 next_fence_, bool block_)
+        : data{std::move(data_)}, fence{next_fence_}, block(block_) {}
 
     CommandData data;
     u64 fence{};
+    bool block{};
 };
 
 /// Struct used to synchronize the GPU thread
@@ -106,6 +107,7 @@ struct SynchState final {
     CommandQueue queue;
     u64 last_fence{};
     std::atomic<u64> signaled_fence{};
+    std::condition_variable cv;
 };
 
 /// Class used to manage the GPU thread
@@ -140,10 +142,7 @@ public:
 
 private:
     /// Pushes a command to be executed by the GPU thread
-    u64 PushCommand(CommandData&& command_data);
-
-    // Wait until the gpu thread is idle.
-    void WaitIdle() const;
+    u64 PushCommand(CommandData&& command_data, bool block = false);
 
     Core::System& system;
     const bool is_async;