gl_query_cache: Implement host queries using a deferred cache

Instead of waiting immediately for executed commands, defer the query
until the guest CPU reads it. This way we get closer to what the guest
program is doing.

To achieve this we have to build a dependency queue, because host APIs
(like OpenGL and Vulkan) use ranged queries instead of counters like
NVN.

Waiting for queries implicitly uses fences and this requires a command
being queued, otherwise the driver will lock waiting until a timeout. To
fix this when there are no commands queued, we explicitly call glFlush.
This commit is contained in:
ReinUsesLisp 2019-11-26 18:52:15 -03:00
parent ef9920e164
commit aae8c180cb
7 changed files with 328 additions and 86 deletions

View file

@ -556,23 +556,13 @@ void Maxwell3D::ProcessQueryGet() {
// matches the current payload.
UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE");
break;
case Regs::QueryOperation::Counter: {
u64 result;
switch (regs.query.query_get.select) {
case Regs::QuerySelect::Zero:
result = 0;
break;
case Regs::QuerySelect::SamplesPassed:
result = rasterizer.Query(VideoCore::QueryType::SamplesPassed);
break;
default:
result = 1;
UNIMPLEMENTED_MSG("Unimplemented query select type {}",
static_cast<u32>(regs.query.query_get.select.Value()));
case Regs::QueryOperation::Counter:
if (const std::optional<u64> result = GetQueryResult()) {
// If the query returns an empty optional it means it's cached and deferred.
// In this case we have a non-empty result, so we stamp it immediately.
StampQueryResult(*result, regs.query.query_get.short_query == 0);
}
StampQueryResult(result, regs.query.query_get.short_query == 0);
break;
}
case Regs::QueryOperation::Trap:
UNIMPLEMENTED_MSG("Unimplemented query operation TRAP");
break;
@ -595,20 +585,20 @@ void Maxwell3D::ProcessQueryCondition() {
}
case Regs::ConditionMode::ResNonZero: {
Regs::QueryCompare cmp;
memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));
execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U;
break;
}
case Regs::ConditionMode::Equal: {
Regs::QueryCompare cmp;
memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));
execute_on =
cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode;
break;
}
case Regs::ConditionMode::NotEqual: {
Regs::QueryCompare cmp;
memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));
execute_on =
cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode;
break;
@ -674,6 +664,21 @@ void Maxwell3D::DrawArrays() {
}
}
/// Resolves the value selected by the QUERY_GET register.
/// @return The value to stamp right away, or an empty optional when the query was handed to the
///         rasterizer and will be written later.
std::optional<u64> Maxwell3D::GetQueryResult() {
    const auto select = regs.query.query_get.select.Value();
    if (select == Regs::QuerySelect::Zero) {
        return 0;
    }
    if (select == Regs::QuerySelect::SamplesPassed) {
        // Deferred: the rasterizer records the query against the guest address.
        rasterizer.Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed);
        return {};
    }
    UNIMPLEMENTED_MSG("Unimplemented query select type {}",
                      static_cast<u32>(regs.query.query_get.select.Value()));
    return 1;
}
void Maxwell3D::ProcessCBBind(std::size_t stage_index) {
// Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage.
auto& shader = state.shader_stages[stage_index];

View file

@ -6,6 +6,7 @@
#include <array>
#include <bitset>
#include <optional>
#include <type_traits>
#include <unordered_map>
#include <vector>
@ -1462,6 +1463,9 @@ private:
// Handles an instance drawcall from MME
void StepInstance(MMEDrawMode expected_mode, u32 count);
/// Returns a query's value or an empty object if the value will be deferred through a cache.
std::optional<u64> GetQueryResult();
};
#define ASSERT_REG_POSITION(field_name, position) \

View file

@ -20,6 +20,7 @@ namespace VideoCore {
/// Kinds of GPU queries a rasterizer can be asked about.
enum class QueryType {
SamplesPassed,
};
/// Count of QueryType enumerators.
// NOTE(review): presumably has to be kept in sync with QueryType by hand — confirm when adding types.
constexpr std::size_t NumQueryTypes = 1;
enum class LoadCallbackStage {
Prepare,
@ -48,8 +49,8 @@ public:
/// Resets the counter of a query
virtual void ResetCounter(QueryType type) = 0;
/// Returns the value of a GPU query
virtual u64 Query(QueryType type) = 0;
/// Records a GPU query and caches it
virtual void Query(GPUVAddr gpu_addr, QueryType type) = 0;
/// Notify rasterizer that all caches should be flushed to Switch memory
virtual void FlushAll() = 0;

View file

@ -2,58 +2,203 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cstring>
#include <memory>
#include <utility>
#include <vector>
#include <glad/glad.h>
#include "common/assert.h"
#include "core/core.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/memory_manager.h"
#include "video_core/renderer_opengl/gl_query_cache.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
namespace OpenGL {
HostCounter::HostCounter(GLenum target) {
query.Create(target);
using VideoCore::QueryType;
namespace {
constexpr std::array<GLenum, VideoCore::NumQueryTypes> QueryTargets = {GL_SAMPLES_PASSED};
/// Looks up the OpenGL query target that corresponds to a VideoCore query type.
constexpr GLenum GetTarget(QueryType type) {
    const auto index = static_cast<std::size_t>(type);
    return QueryTargets[index];
}
HostCounter::~HostCounter() = default;
} // Anonymous namespace
void HostCounter::UpdateState(bool enabled) {
CounterStream::CounterStream(QueryCache& cache, QueryType type)
: cache{cache}, type{type}, target{GetTarget(type)} {}
CounterStream::~CounterStream() = default;
void CounterStream::Update(bool enabled, bool any_command_queued) {
if (enabled) {
Enable();
} else {
Disable();
if (!current) {
current = cache.GetHostCounter(last, type);
}
return;
}
if (current) {
EndQuery(any_command_queued);
}
last = std::exchange(current, nullptr);
}
void HostCounter::Reset() {
counter = 0;
Disable();
void CounterStream::Reset(bool any_command_queued) {
if (current) {
EndQuery(any_command_queued);
}
current = nullptr;
last = nullptr;
}
u64 HostCounter::Query() {
if (!is_beginned) {
return counter;
std::shared_ptr<HostCounter> CounterStream::GetCurrent(bool any_command_queued) {
if (!current) {
return nullptr;
}
Disable();
u64 value;
glGetQueryObjectui64v(query.handle, GL_QUERY_RESULT, &value);
Enable();
EndQuery(any_command_queued);
last = std::move(current);
current = cache.GetHostCounter(last, type);
return last;
}
counter += value;
/// Ends the OpenGL query open on this stream's target.
/// @param any_command_queued False when nothing has been queued; a glFlush is then issued first.
void CounterStream::EndQuery(bool any_command_queued) {
    // A query may end up being waited on without any command (glDraw, glClear, glDispatch)
    // having been queued, and that causes a lock. glFlush is considered a command, so inserting
    // one into the OpenGL command stream makes the later wait safe.
    const bool needs_flush = !any_command_queued;
    if (needs_flush) {
        glFlush();
    }
    glEndQuery(target);
}
/// Builds the cache and its counter streams; the only stream created here is SamplesPassed.
QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& rasterizer)
: RasterizerCache{rasterizer}, system{system},
rasterizer{rasterizer}, streams{{CounterStream{*this, QueryType::SamplesPassed}}} {}
QueryCache::~QueryCache() = default;
/// Records a query for the guest address `gpu_addr`.
/// The cached query object is created and registered on first use; it is then pointed at the
/// stream's current host counter and marked as modified.
void QueryCache::Query(GPUVAddr gpu_addr, QueryType type) {
    auto& gpu_memory = system.GPU().MemoryManager();
    const auto host_ptr = gpu_memory.GetPointer(gpu_addr);

    auto cached = TryGet(host_ptr);
    if (!cached) {
        // Nothing cached at this address yet: build and register a new entry.
        const auto cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
        ASSERT_OR_EXECUTE(cpu_addr, return;);
        cached = std::make_shared<CachedQuery>(type, *cpu_addr, host_ptr);
        Register(cached);
    }

    auto host_counter = GetStream(type).GetCurrent(rasterizer.AnyCommandQueued());
    cached->SetCounter(std::move(host_counter));
    cached->MarkAsModified(true, *this);
}
/// Brings the SamplesPassed stream in line with the guest's samplecnt_enable register.
void QueryCache::UpdateCounters() {
    const auto& regs = system.GPU().Maxwell3D().regs;
    const bool any_queued = rasterizer.AnyCommandQueued();
    GetStream(QueryType::SamplesPassed).Update(regs.samplecnt_enable, any_queued);
}
/// Resets the counter stream that belongs to `type`.
void QueryCache::ResetCounter(QueryType type) {
    CounterStream& stream = GetStream(type);
    stream.Reset(rasterizer.AnyCommandQueued());
}
/// Stores an OpenGL query object in the per-type reserve so a later host counter can reuse it.
void QueryCache::Reserve(QueryType type, OGLQuery&& query) {
    auto& pool = reserved_queries[static_cast<std::size_t>(type)];
    pool.push_back(std::move(query));
}
/// Creates a host counter chained to `dependency`.
/// A query object from the per-type reserve is reused when one is available; otherwise the new
/// counter creates its own.
std::shared_ptr<HostCounter> QueryCache::GetHostCounter(std::shared_ptr<HostCounter> dependency,
                                                        QueryType type) {
    auto& pool = reserved_queries[static_cast<std::size_t>(type)];
    if (!pool.empty()) {
        // Recycle the most recently reserved query object.
        auto recycled = std::make_shared<HostCounter>(*this, std::move(dependency), type,
                                                      std::move(pool.back()));
        pool.pop_back();
        return recycled;
    }
    return std::make_shared<HostCounter>(*this, std::move(dependency), type);
}
void HostCounter::Enable() {
if (is_beginned) {
return;
void QueryCache::FlushObjectInner(const std::shared_ptr<CachedQuery>& counter_) {
auto& counter = *counter_;
auto& stream = GetStream(counter.GetType());
// Waiting for a query while another query of the same target is enabled locks Nvidia's driver.
// To avoid this disable and re-enable keeping the dependency stream.
const bool is_enabled = stream.IsEnabled();
if (is_enabled) {
stream.Update(false, false);
}
counter.Flush();
if (is_enabled) {
stream.Update(true, false);
}
is_beginned = true;
glBeginQuery(GL_SAMPLES_PASSED, query.handle);
}
void HostCounter::Disable() {
if (!is_beginned) {
return;
/// Returns the counter stream that belongs to `type`.
CounterStream& QueryCache::GetStream(QueryType type) {
    const auto index = static_cast<std::size_t>(type);
    return streams[index];
}
/// Creates a new OpenGL query object for `type` and begins it immediately.
/// @param cache Cache that receives the query object back when this counter is destroyed.
/// @param dependency Counter queued before this one; may be null.
/// @param type Query type that selects the OpenGL target.
HostCounter::HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, QueryType type)
    : cache{cache}, type{type}, dependency{std::move(dependency)}, result{0} {
    // `result` starts at zero: Query() returns it untouched when the query handle is zero, so it
    // must never be read uninitialized.
    const GLenum target = GetTarget(type);
    query.Create(target);
    glBeginQuery(target, query.handle);
}
/// Takes ownership of an already created OpenGL query object and begins it immediately.
/// @param cache Cache that receives the query object back when this counter is destroyed.
/// @param dependency Counter queued before this one; may be null.
/// @param type Query type that selects the OpenGL target.
/// @param query_ Existing query object to reuse.
HostCounter::HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, QueryType type,
                         OGLQuery&& query_)
    : cache{cache}, type{type}, dependency{std::move(dependency)}, query{std::move(query_)},
      result{0} {
    // `result` starts at zero: Query() returns it untouched when the query handle is zero, so it
    // must never be read uninitialized.
    glBeginQuery(GetTarget(type), query.handle);
}
/// Hands the OpenGL query object back to the cache's reserve instead of letting it be destroyed here.
// NOTE(review): this touches `cache` during destruction, so the cache has to outlive every
// counter — confirm the teardown order.
HostCounter::~HostCounter() {
cache.Reserve(type, std::move(query));
}
u64 HostCounter::Query() {
if (query.handle == 0) {
return result;
}
glEndQuery(GL_SAMPLES_PASSED);
is_beginned = false;
glGetQueryObjectui64v(query.handle, GL_QUERY_RESULT, &result);
if (dependency) {
result += dependency->Query();
}
return result;
}
/// Stores the query type and the guest CPU address / host pointer pair it will be written to.
/// No host counter is attached yet; see SetCounter.
CachedQuery::CachedQuery(QueryType type, VAddr cpu_addr, u8* host_ptr)
: RasterizerCacheObject{host_ptr}, type{type}, cpu_addr{cpu_addr}, host_ptr{host_ptr} {}
CachedQuery::~CachedQuery() = default;
/// Writes the value of the attached host counter to host memory.
/// Zero is written when no counter is attached.
void CachedQuery::Flush() {
    // The counter is null when the stream had no active counter at the time the query was
    // recorded (CounterStream::GetCurrent returns nullptr), or when SetCounter was never called.
    // Dereferencing it would crash, so report zero samples instead.
    const u64 value = counter ? counter->Query() : 0;
    std::memcpy(host_ptr, &value, sizeof(value));
}
/// Replaces the host counter whose value Flush() will write; the previous one is released.
void CachedQuery::SetCounter(std::shared_ptr<HostCounter> counter_) {
counter = std::move(counter_);
}
/// Returns the query type given at construction.
QueryType CachedQuery::GetType() const {
return type;
}
} // namespace OpenGL

View file

@ -4,38 +4,131 @@
#pragma once
#include <array>
#include <memory>
#include <optional>
#include <vector>
#include <glad/glad.h>
#include "common/common_types.h"
#include "video_core/rasterizer_cache.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
namespace Core {
class System;
}
namespace OpenGL {
class CachedQuery;
class HostCounter;
class RasterizerOpenGL;
class QueryCache;
/// Tracks the host counters recorded for one query type.
class CounterStream final {
public:
explicit CounterStream(QueryCache& cache, VideoCore::QueryType type);
~CounterStream();
/// Applies the requested enabled state to the stream.
void Update(bool enabled, bool any_command_queued);
/// Ends the active query, if there is one, and drops both the current and the last counter.
void Reset(bool any_command_queued);
/// Returns nullptr when no counter is active on this stream.
std::shared_ptr<HostCounter> GetCurrent(bool any_command_queued);
/// Returns true while a host counter is active on this stream.
bool IsEnabled() const {
return current != nullptr;
}
private:
/// Ends the OpenGL query on `target`; a glFlush is issued first when no command is queued.
void EndQuery(bool any_command_queued);
QueryCache& cache; ///< Cache that hands out host counters.
std::shared_ptr<HostCounter> current; ///< Active counter, or nullptr when disabled.
std::shared_ptr<HostCounter> last; ///< Passed as the dependency when a new counter is requested.
VideoCore::QueryType type; ///< Query type served by this stream.
GLenum target; ///< OpenGL query target looked up from `type`.
};
/// Rasterizer cache of guest query objects, backed by one host counter stream per query type.
class QueryCache final : public RasterizerCache<std::shared_ptr<CachedQuery>> {
public:
explicit QueryCache(Core::System& system, RasterizerOpenGL& rasterizer);
~QueryCache();
/// Points the cached query at `gpu_addr` to the stream's current host counter and marks it modified.
void Query(GPUVAddr gpu_addr, VideoCore::QueryType type);
/// Syncs the SamplesPassed stream with the guest's samplecnt_enable register.
void UpdateCounters();
/// Resets the stream that belongs to `type`.
void ResetCounter(VideoCore::QueryType type);
/// Stores an OpenGL query object so that GetHostCounter can reuse it.
void Reserve(VideoCore::QueryType type, OGLQuery&& query);
/// Creates a host counter chained to `dependency`, reusing a reserved query object when available.
std::shared_ptr<HostCounter> GetHostCounter(std::shared_ptr<HostCounter> dependency,
VideoCore::QueryType type);
protected:
/// Flushes one cached query; an enabled stream is disabled for the flush and re-enabled after it.
void FlushObjectInner(const std::shared_ptr<CachedQuery>& counter) override;
private:
/// Returns the stream that belongs to `type`.
CounterStream& GetStream(VideoCore::QueryType type);
Core::System& system;
RasterizerOpenGL& rasterizer;
// NOTE(review): HostCounter's destructor calls Reserve(), which pushes into reserved_queries.
// reserved_queries is declared last, so it is destroyed before `streams` and before the base
// class — confirm no HostCounter is destroyed after it during teardown.
std::array<CounterStream, VideoCore::NumQueryTypes> streams;
std::array<std::vector<OGLQuery>, VideoCore::NumQueryTypes> reserved_queries;
};
class HostCounter final {
public:
explicit HostCounter(GLenum target);
explicit HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency,
VideoCore::QueryType type);
explicit HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency,
VideoCore::QueryType type, OGLQuery&& query);
~HostCounter();
/// Enables or disables the counter as required.
void UpdateState(bool enabled);
/// Resets the counter disabling it if needed.
void Reset();
/// Returns the current value of the query.
/// @note It may harm precision of future queries if the counter is not disabled.
u64 Query();
private:
/// Enables the counter when disabled.
void Enable();
QueryCache& cache;
VideoCore::QueryType type;
/// Disables the counter when enabled.
void Disable();
std::shared_ptr<HostCounter> dependency; ///< Counter queued before this one.
OGLQuery query; ///< OpenGL query.
u64 result; ///< Added values of the counter.
};
OGLQuery query; ///< OpenGL query.
u64 counter{}; ///< Added values of the counter.
bool is_beginned{}; ///< True when the OpenGL query is beginned.
/// Cache entry for one guest query: a u64-sized slot in guest memory plus the host counter whose
/// value is written there on flush.
class CachedQuery final : public RasterizerCacheObject {
public:
explicit CachedQuery(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr);
~CachedQuery();
/// Writes the counter value to host memory.
void Flush();
/// Updates the counter this cached query registered in guest memory will write when requested.
void SetCounter(std::shared_ptr<HostCounter> counter);
/// Returns the query type.
VideoCore::QueryType GetType() const;
/// Returns the guest CPU address given at construction.
VAddr GetCpuAddr() const override {
return cpu_addr;
}
/// A cached query always covers exactly one u64.
std::size_t GetSizeInBytes() const override {
return sizeof(u64);
}
private:
VideoCore::QueryType type; ///< Query type given at construction.
VAddr cpu_addr; ///< Guest CPU address.
u8* host_ptr; ///< Writable host pointer.
std::shared_ptr<HostCounter> counter; ///< Host counter to query, owns the dependency tree.
};
} // namespace OpenGL

View file

@ -25,6 +25,7 @@
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/shader_type.h"
#include "video_core/memory_manager.h"
#include "video_core/renderer_opengl/gl_query_cache.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_shader_cache.h"
#include "video_core/renderer_opengl/gl_shader_gen.h"
@ -92,8 +93,8 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
ScreenInfo& info)
: RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device},
shader_cache{*this, system, emu_window, device}, system{system}, screen_info{info},
buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} {
shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, system{system},
screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} {
shader_program_manager = std::make_unique<GLShader::ProgramManager>();
state.draw.shader_program = 0;
state.Apply();
@ -548,9 +549,9 @@ void RasterizerOpenGL::Clear() {
void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
MICROPROFILE_SCOPE(OpenGL_Drawing);
auto& gpu = system.GPU().Maxwell3D();
const auto& regs = gpu.regs;
samples_passed.UpdateState(regs.samplecnt_enable);
query_cache.UpdateCounters();
SyncRasterizeEnable(state);
SyncColorMask();
@ -718,24 +719,11 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
}
void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) {
switch (type) {
case VideoCore::QueryType::SamplesPassed:
samples_passed.Reset();
break;
default:
UNIMPLEMENTED_MSG("type={}", static_cast<u32>(type));
break;
}
query_cache.ResetCounter(type);
}
u64 RasterizerOpenGL::Query(VideoCore::QueryType type) {
switch (type) {
case VideoCore::QueryType::SamplesPassed:
return samples_passed.Query();
default:
UNIMPLEMENTED_MSG("type={}", static_cast<u32>(type));
return 1;
}
/// Forwards the query to the query cache, which records it against `gpu_addr`.
void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type) {
query_cache.Query(gpu_addr, type);
}
void RasterizerOpenGL::FlushAll() {}
@ -747,6 +735,7 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) {
}
texture_cache.FlushRegion(addr, size);
buffer_cache.FlushRegion(addr, size);
query_cache.FlushRegion(addr, size);
}
void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
@ -757,6 +746,7 @@ void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
texture_cache.InvalidateRegion(addr, size);
shader_cache.InvalidateRegion(addr, size);
buffer_cache.InvalidateRegion(addr, size);
query_cache.InvalidateRegion(addr, size);
}
void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {

View file

@ -63,7 +63,7 @@ public:
void Clear() override;
void DispatchCompute(GPUVAddr code_addr) override;
void ResetCounter(VideoCore::QueryType type) override;
u64 Query(VideoCore::QueryType type) override;
void Query(GPUVAddr gpu_addr, VideoCore::QueryType type) override;
void FlushAll() override;
void FlushRegion(CacheAddr addr, u64 size) override;
void InvalidateRegion(CacheAddr addr, u64 size) override;
@ -78,6 +78,11 @@ public:
void LoadDiskResources(const std::atomic_bool& stop_loading,
const VideoCore::DiskResourceLoadCallback& callback) override;
/// Returns true when at least one command is queued to the OpenGL server.
bool AnyCommandQueued() const {
    return num_queued_commands != 0;
}
private:
/// Configures the color and depth framebuffer states.
void ConfigureFramebuffers();
@ -207,6 +212,7 @@ private:
ShaderCacheOpenGL shader_cache;
SamplerCacheOpenGL sampler_cache;
FramebufferCacheOpenGL framebuffer_cache;
QueryCache query_cache;
Core::System& system;
ScreenInfo& screen_info;
@ -224,8 +230,6 @@ private:
BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};
HostCounter samples_passed{GL_SAMPLES_PASSED};
/// Number of commands queued to the OpenGL driver. Reset on flush.
std::size_t num_queued_commands = 0;
};