From bdc01254a9b3ce8359f8f007c2102cb2d112418e Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Fri, 4 Aug 2023 03:31:52 +0200 Subject: [PATCH 01/10] Query Cache: Setup Base rework --- src/video_core/CMakeLists.txt | 6 + src/video_core/query_cache/bank_base.h | 106 ++++ src/video_core/query_cache/query_base.h | 72 +++ src/video_core/query_cache/query_cache.h | 543 ++++++++++++++++++ src/video_core/query_cache/query_cache_base.h | 181 ++++++ src/video_core/query_cache/query_stream.h | 125 ++++ src/video_core/query_cache/types.h | 74 +++ 7 files changed, 1107 insertions(+) create mode 100644 src/video_core/query_cache/bank_base.h create mode 100644 src/video_core/query_cache/query_base.h create mode 100644 src/video_core/query_cache/query_cache.h create mode 100644 src/video_core/query_cache/query_cache_base.h create mode 100644 src/video_core/query_cache/query_stream.h create mode 100644 src/video_core/query_cache/types.h diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 9b13ccbab..cf9266d54 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -95,6 +95,12 @@ add_library(video_core STATIC memory_manager.h precompiled_headers.h pte_kind.h + query_cache/bank_base.h + query_cache/query_base.h + query_cache/query_cache_base.h + query_cache/query_cache.h + query_cache/query_stream.h + query_cache/types.h query_cache.h rasterizer_accelerated.cpp rasterizer_accelerated.h diff --git a/src/video_core/query_cache/bank_base.h b/src/video_core/query_cache/bank_base.h new file mode 100644 index 000000000..4246a609d --- /dev/null +++ b/src/video_core/query_cache/bank_base.h @@ -0,0 +1,106 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include +#include +#include + + +#include "common/common_types.h" + +namespace VideoCommon { + +class BankBase { +protected: + const size_t base_bank_size; + size_t bank_size; + std::atomic 
references; + size_t current_slot; + +public: + BankBase(size_t bank_size_) + : base_bank_size{bank_size_}, bank_size(bank_size_), references(0), current_slot(0) {} + + virtual ~BankBase() = default; + + virtual std::pair Reserve() { + if (IsClosed()) { + return {false, bank_size}; + } + const size_t result = current_slot++; + return {true, result}; + } + + virtual void Reset() { + current_slot = 0; + references = 0; + bank_size = base_bank_size; + } + + size_t Size() const { + return bank_size; + } + + void AddReference(size_t how_many = 1) { + references.fetch_add(how_many, std::memory_order_relaxed); + } + + void CloseReference(size_t how_many = 1) { + if (how_many > references.load(std::memory_order_relaxed)) { + UNREACHABLE(); + } + references.fetch_sub(how_many, std::memory_order_relaxed); + } + + void Close() { + bank_size = current_slot; + } + + constexpr bool IsClosed() { + return current_slot >= bank_size; + } + + bool IsDead() { + return IsClosed() && references == 0; + } +}; + +template +class BankPool { +private: + std::deque bank_pool; + std::deque bank_indices; + +public: + BankPool() = default; + ~BankPool() = default; + + // Reserve a bank from the pool and return its index + template + size_t ReserveBank(Func&& builder) { + if (!bank_indices.empty() && bank_pool[bank_indices.front()].IsDead()) { + size_t new_index = bank_indices.front(); + bank_indices.pop_front(); + bank_pool[new_index].Reset(); + return new_index; + } + size_t new_index = bank_pool.size(); + builder(bank_pool, new_index); + bank_indices.push_back(new_index); + return new_index; + } + + // Get a reference to a bank using its index + BankType& GetBank(size_t index) { + return bank_pool[index]; + } + + // Get the total number of banks in the pool + size_t BankCount() const { + return bank_pool.size(); + } +}; + +} // namespace VideoCommon diff --git a/src/video_core/query_cache/query_base.h b/src/video_core/query_cache/query_base.h new file mode 100644 index 000000000..485ed669c 
--- /dev/null +++ b/src/video_core/query_cache/query_base.h @@ -0,0 +1,72 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace VideoCommon { + +enum class QueryFlagBits : u32 { + HasTimestamp = 1 << 0, ///< Indicates if this query has a timestamp. + IsFinalValueSynced = 1 << 1, ///< Indicates if the final value of the query has been synced (NOTE(review): original text duplicated IsHostSynced; confirm exact meaning) + IsHostSynced = 1 << 2, ///< Indicates if the query has been synced in the host + IsGuestSynced = 1 << 3, ///< Indicates if the query has been synced with the guest. + IsHostManaged = 1 << 4, ///< Indicates if this query points to a host query + IsRewritten = 1 << 5, ///< Indicates if this query was rewritten by another query + IsInvalidated = 1 << 6, ///< Indicates the value of the query has been nullified. + IsOrphan = 1 << 7, ///< Indicates the query has not been set by a guest query. + IsFence = 1 << 8, ///< Indicates the query is a fence. 
+}; +DECLARE_ENUM_FLAG_OPERATORS(QueryFlagBits) + +class QueryBase { +public: + VAddr guest_address; + QueryFlagBits flags; + u64 value; + +protected: + // Default constructor + QueryBase() : guest_address(0), flags{}, value{} {} + + // Parameterized constructor + QueryBase(VAddr address, QueryFlagBits flags_, u64 value_) + : guest_address(address), flags(flags_), value{value_} {} +}; + +class GuestQuery : public QueryBase { +public: + // Parameterized constructor + GuestQuery(bool isLong, VAddr address, u64 queryValue) + : QueryBase(address, QueryFlagBits::IsFinalValueSynced, queryValue) { + if (isLong) { + flags |= QueryFlagBits::HasTimestamp; + } + } +}; + +class HostQueryBase : public QueryBase { +public: + // Default constructor + HostQueryBase() + : QueryBase(0, QueryFlagBits::IsHostManaged | QueryFlagBits::IsOrphan, 0), start_bank_id{}, + size_banks{}, start_slot{}, size_slots{} {} + + // Parameterized constructor + HostQueryBase(bool isLong, VAddr address) + : QueryBase(address, QueryFlagBits::IsHostManaged, 0), start_bank_id{}, size_banks{}, + start_slot{}, size_slots{} { + if (isLong) { + flags |= QueryFlagBits::HasTimestamp; + } + } + + u32 start_bank_id; + u32 size_banks; + size_t start_slot; + size_t size_slots; +}; + +} // namespace VideoCommon \ No newline at end of file diff --git a/src/video_core/query_cache/query_cache.h b/src/video_core/query_cache/query_cache.h new file mode 100644 index 000000000..f6af48d14 --- /dev/null +++ b/src/video_core/query_cache/query_cache.h @@ -0,0 +1,543 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "common/assert.h" +#include "common/common_types.h" +#include "common/logging/log.h" +#include "common/scope_exit.h" +#include "common/settings.h" +#include "core/memory.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include 
"video_core/memory_manager.h" +#include "video_core/query_cache/bank_base.h" +#include "video_core/query_cache/query_base.h" +#include "video_core/query_cache/query_cache_base.h" +#include "video_core/query_cache/query_stream.h" +#include "video_core/query_cache/types.h" + +namespace VideoCommon { + +using Maxwell = Tegra::Engines::Maxwell3D; + +struct SyncValuesStruct { + VAddr address; + u64 value; + u64 size; + + static constexpr bool GeneratesBaseBuffer = true; +}; + +template +class GuestStreamer : public SimpleStreamer { +public: + using RuntimeType = typename Traits::RuntimeType; + + GuestStreamer(size_t id_, RuntimeType& runtime_) + : SimpleStreamer(id_), runtime{runtime_} {} + + virtual ~GuestStreamer() = default; + + size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, + std::optional subreport = std::nullopt) override { + auto new_id = BuildQuery(has_timestamp, address, static_cast(value)); + pending_sync.push_back(new_id); + return new_id; + } + + bool HasPendingSync() override { + return !pending_sync.empty(); + } + + void SyncWrites() override { + if (pending_sync.empty()) { + return; + } + std::vector sync_values; + sync_values.reserve(pending_sync.size()); + for (size_t pending_id : pending_sync) { + auto& query = slot_queries[pending_id]; + if (True(query.flags & QueryFlagBits::IsRewritten) || + True(query.flags & QueryFlagBits::IsInvalidated)) { + continue; + } + query.flags |= QueryFlagBits::IsHostSynced; + sync_values.emplace_back(query.guest_address, query.value, + True(query.flags & QueryFlagBits::HasTimestamp) ? 
8 : 4); + } + pending_sync.clear(); + if (sync_values.size() > 0) { + runtime.template SyncValues(sync_values); + } + } + +private: + RuntimeType& runtime; + std::deque pending_sync; +}; + +template +class StubStreamer : public GuestStreamer { +public: + using RuntimeType = typename Traits::RuntimeType; + + StubStreamer(size_t id_, RuntimeType& runtime_) : GuestStreamer(id_, runtime_) {} + + ~StubStreamer() override = default; + + size_t WriteCounter(VAddr address, bool has_timestamp, [[maybe_unused]] u32 value, + std::optional subreport = std::nullopt) override { + size_t new_id = GuestStreamer::WriteCounter(address, has_timestamp, 1U, subreport); + return new_id; + } +}; + +template +struct QueryCacheBase::QueryCacheBaseImpl { + using RuntimeType = typename Traits::RuntimeType; + + QueryCacheBaseImpl(QueryCacheBase* owner_, VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_, Tegra::GPU& gpu_) + : owner{owner_}, rasterizer{rasterizer_}, + cpu_memory{cpu_memory_}, runtime{runtime_}, gpu{gpu_} { + streamer_mask = 0; + for (size_t i = 0; i < static_cast(QueryType::MaxQueryTypes); i++) { + streamers[i] = runtime.GetStreamerInterface(static_cast(i)); + if (streamers[i]) { + streamer_mask |= 1ULL << i; + } + } + } + + template + void ForEachStreamerIn(u64 mask, Func&& func) { + static constexpr bool RETURNS_BOOL = + std::is_same_v, bool>; + while (mask != 0) { + size_t position = std::countr_zero(mask); + mask &= ~(1ULL << position); + if constexpr (RETURNS_BOOL) { + if (func(streamers[position])) { + return; + } + } else { + func(streamers[position]); + } + } + } + + template + void ForEachStreamer(Func&& func) { + ForEachStreamerIn(streamer_mask, func); + } + + QueryBase* ObtainQuery(QueryCacheBase::QueryLocation location) { + size_t which_stream = location.stream_id.Value(); + auto* streamer = streamers[which_stream]; + if (!streamer) { + return nullptr; + } + return streamer->GetQuery(location.query_id.Value()); + 
} + + QueryCacheBase* owner; + VideoCore::RasterizerInterface& rasterizer; + Core::Memory::Memory& cpu_memory; + Traits::RuntimeType& runtime; + Tegra::GPU& gpu; + std::array(QueryType::MaxQueryTypes)> streamers; + u64 streamer_mask; + std::mutex flush_guard; + std::deque flushes_pending; + std::vector::QueryLocation> pending_unregister; +}; + +template +QueryCacheBase::QueryCacheBase(Tegra::GPU& gpu_, + VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_) + : cached_queries{} { + impl = std::make_unique::QueryCacheBaseImpl>( + this, rasterizer_, cpu_memory_, runtime_, gpu_); +} + +template +QueryCacheBase::~QueryCacheBase() = default; + +template +void QueryCacheBase::CounterEnable(QueryType counter_type, bool is_enabled) { + size_t index = static_cast(counter_type); + StreamerInterface* streamer = impl->streamers[index]; + if (!streamer) [[unlikely]] { + UNREACHABLE(); + return; + } + if (is_enabled) { + streamer->StartCounter(); + } else { + streamer->PauseCounter(); + } +} + +template +void QueryCacheBase::CounterClose(QueryType counter_type) { + size_t index = static_cast(counter_type); + StreamerInterface* streamer = impl->streamers[index]; + if (!streamer) [[unlikely]] { + UNREACHABLE(); + return; + } + streamer->CloseCounter(); +} + +template +void QueryCacheBase::CounterReset(QueryType counter_type) { + size_t index = static_cast(counter_type); + StreamerInterface* streamer = impl->streamers[index]; + if (!streamer) [[unlikely]] { + UNIMPLEMENTED(); + return; + } + streamer->ResetCounter(); +} + +template +void QueryCacheBase::BindToChannel(s32 id) { + VideoCommon::ChannelSetupCaches::BindToChannel(id); + impl->runtime.Bind3DEngine(maxwell3d); +} + +template +void QueryCacheBase::CounterReport(GPUVAddr addr, QueryType counter_type, + QueryPropertiesFlags flags, u32 payload, u32 subreport) { + const bool has_timestamp = True(flags & QueryPropertiesFlags::HasTimeout); + const bool is_fence = True(flags & 
QueryPropertiesFlags::IsAFence); + size_t streamer_id = static_cast(counter_type); + auto* streamer = impl->streamers[streamer_id]; + if (!streamer) [[unlikely]] { /* No streamer for this query type: write a stub value of 1 straight to GPU memory. */ + if (has_timestamp) { + u64 timestamp = impl->gpu.GetTicks(); + gpu_memory->Write(addr + 8, timestamp); + gpu_memory->Write(addr, 1ULL); + } else { + gpu_memory->Write(addr, 1U); + } + return; + } + auto cpu_addr_opt = gpu_memory->GpuToCpuAddress(addr); + if (!cpu_addr_opt) [[unlikely]] { + return; + } + VAddr cpu_addr = *cpu_addr_opt; + const size_t new_query_id = streamer->WriteCounter(cpu_addr, has_timestamp, payload, subreport); + auto* query = streamer->GetQuery(new_query_id); + if (is_fence) { + query->flags |= QueryFlagBits::IsFence; + } + QueryLocation query_location{}; + query_location.stream_id.Assign(static_cast(streamer_id)); + query_location.query_id.Assign(static_cast(new_query_id)); + const auto gen_caching_indexing = [](VAddr cur_addr) { /* Splits an address into (page number, offset within page). */ + return std::make_pair(cur_addr >> Core::Memory::YUZU_PAGEBITS, + static_cast(cur_addr & Core::Memory::YUZU_PAGEMASK)); + }; + u8* pointer = impl->cpu_memory.GetPointer(cpu_addr); + u8* pointer_timestamp = impl->cpu_memory.GetPointer(cpu_addr + 8); + bool is_synced = !Settings::IsGPULevelHigh() && is_fence; + std::function operation( /* Deferred write-back of the query result; handed to the rasterizer below. */ + [this, is_synced, query_base = query, query_location, pointer, pointer_timestamp] { + if (True(query_base->flags & QueryFlagBits::IsInvalidated)) { + if (!is_synced) [[likely]] { + impl->pending_unregister.push_back(query_location); + } + return; + } + if (False(query_base->flags & QueryFlagBits::IsFinalValueSynced)) [[unlikely]] { + UNREACHABLE(); + return; + } + if (True(query_base->flags & QueryFlagBits::HasTimestamp)) { /* Timestamped report: 64-bit value at the address, tick count 8 bytes after it. */ + u64 timestamp = impl->gpu.GetTicks(); + std::memcpy(pointer_timestamp, &timestamp, sizeof(timestamp)); + std::memcpy(pointer, &query_base->value, sizeof(query_base->value)); + } else { + u32 value = static_cast(query_base->value); + std::memcpy(pointer, &value, sizeof(value)); + } + if (!is_synced) 
[[likely]] { + impl->pending_unregister.push_back(query_location); + } + }); + if (is_fence) { + impl->rasterizer.SignalFence(std::move(operation)); + } else { + impl->rasterizer.SyncOperation(std::move(operation)); + } + if (is_synced) { + streamer->Free(new_query_id); + return; + } + auto [cont_addr, base] = gen_caching_indexing(cpu_addr); + { + std::scoped_lock lock(cache_mutex); + auto it1 = cached_queries.try_emplace(cont_addr); + auto& sub_container = it1.first->second; + auto it_current = sub_container.find(base); + if (it_current == sub_container.end()) { + sub_container.insert_or_assign(base, query_location); + return; + } + auto* old_query = impl->ObtainQuery(it_current->second); + old_query->flags |= QueryFlagBits::IsRewritten; + sub_container.insert_or_assign(base, query_location); + } +} + +template +void QueryCacheBase::UnregisterPending() { + const auto gen_caching_indexing = [](VAddr cur_addr) { + return std::make_pair(cur_addr >> Core::Memory::YUZU_PAGEBITS, + static_cast(cur_addr & Core::Memory::YUZU_PAGEMASK)); + }; + std::scoped_lock lock(cache_mutex); + for (QueryLocation loc : impl->pending_unregister) { + const auto [streamer_id, query_id] = loc.unpack(); + auto* streamer = impl->streamers[streamer_id]; + if (!streamer) [[unlikely]] { + continue; + } + auto* query = streamer->GetQuery(query_id); + auto [cont_addr, base] = gen_caching_indexing(query->guest_address); + auto it1 = cached_queries.find(cont_addr); + if (it1 != cached_queries.end()) { + auto it2 = it1->second.find(base); + if (it2 != it1->second.end()) { + if (it2->second.raw == loc.raw) { + it1->second.erase(it2); + } + } + } + streamer->Free(query_id); + } + impl->pending_unregister.clear(); +} + +template +void QueryCacheBase::NotifyWFI() { + bool should_sync = false; + impl->ForEachStreamer( + [&should_sync](StreamerInterface* streamer) { should_sync |= streamer->HasPendingSync(); }); + if (!should_sync) { + return; + } + + impl->ForEachStreamer([](StreamerInterface* streamer) 
{ streamer->PresyncWrites(); }); + impl->runtime.Barriers(true); + impl->ForEachStreamer([](StreamerInterface* streamer) { streamer->SyncWrites(); }); + impl->runtime.Barriers(false); +} + +template +void QueryCacheBase::NotifySegment(bool resume) { + if (resume) { + impl->runtime.ResumeHostConditionalRendering(); + } else { + impl->runtime.PauseHostConditionalRendering(); + CounterClose(VideoCommon::QueryType::ZPassPixelCount64); + CounterClose(VideoCommon::QueryType::StreamingByteCount); + } +} + +template +bool QueryCacheBase::AccelerateHostConditionalRendering() { + bool qc_dirty = false; + const auto gen_lookup = [this, &qc_dirty](GPUVAddr address) -> VideoCommon::LookupData { /* Resolves a GPU address to its cached query, if any; also accumulates qc_dirty. */ + auto cpu_addr_opt = gpu_memory->GpuToCpuAddress(address); + if (!cpu_addr_opt) [[unlikely]] { + return VideoCommon::LookupData{ + .address = 0, + .found_query = nullptr, + }; + } + VAddr cpu_addr = *cpu_addr_opt; + std::scoped_lock lock(cache_mutex); + auto it1 = cached_queries.find(cpu_addr >> Core::Memory::YUZU_PAGEBITS); + if (it1 == cached_queries.end()) { + return VideoCommon::LookupData{ + .address = cpu_addr, + .found_query = nullptr, + }; + } + auto& sub_container = it1->second; + auto it_current = sub_container.find(cpu_addr & Core::Memory::YUZU_PAGEMASK); + + if (it_current == sub_container.end()) { /* Nothing at the exact offset: retry 4 bytes further and use that entry if it exists. */ + it_current = sub_container.find((cpu_addr & Core::Memory::YUZU_PAGEMASK) + 4); + if (it_current == sub_container.end()) { + return VideoCommon::LookupData{ + .address = cpu_addr, + .found_query = nullptr, + }; + } + } + auto* query = impl->ObtainQuery(it_current->second); /* it_current is guaranteed valid here (either lookup succeeded). */ + qc_dirty |= True(query->flags & QueryFlagBits::IsHostManaged) && + False(query->flags & QueryFlagBits::IsGuestSynced); + return VideoCommon::LookupData{ + .address = cpu_addr, + .found_query = query, + }; + }; + + auto& regs = maxwell3d->regs; + if (regs.render_enable_override != Maxwell::Regs::RenderEnable::Override::UseRenderEnable) { + impl->runtime.EndHostConditionalRendering(); + return false; + 
} + /*if (!Settings::IsGPULevelHigh()) { + impl->runtime.EndHostConditionalRendering(); + return gpu_memory->IsMemoryDirty(regs.render_enable.Address(), 24, + VideoCommon::CacheType::BufferCache | + VideoCommon::CacheType::QueryCache); + }*/ + const ComparisonMode mode = static_cast(regs.render_enable.mode); + const GPUVAddr address = regs.render_enable.Address(); + switch (mode) { + case ComparisonMode::True: + impl->runtime.EndHostConditionalRendering(); + return false; + case ComparisonMode::False: + impl->runtime.EndHostConditionalRendering(); + return false; + case ComparisonMode::Conditional: { + VideoCommon::LookupData object_1{gen_lookup(address)}; + return impl->runtime.HostConditionalRenderingCompareValue(object_1, qc_dirty); + } + case ComparisonMode::IfEqual: { + VideoCommon::LookupData object_1{gen_lookup(address)}; + VideoCommon::LookupData object_2{gen_lookup(address + 16)}; + return impl->runtime.HostConditionalRenderingCompareValues(object_1, object_2, qc_dirty, + true); + } + case ComparisonMode::IfNotEqual: { + VideoCommon::LookupData object_1{gen_lookup(address)}; + VideoCommon::LookupData object_2{gen_lookup(address + 16)}; + return impl->runtime.HostConditionalRenderingCompareValues(object_1, object_2, qc_dirty, + false); + } + default: + return false; + } +} + +// Async downloads +template +void QueryCacheBase::CommitAsyncFlushes() { + u64 mask{}; + { + std::scoped_lock lk(impl->flush_guard); + impl->ForEachStreamer([&mask](StreamerInterface* streamer) { + bool local_result = streamer->HasUnsyncedQueries(); + if (local_result) { + mask |= 1ULL << streamer->GetId(); + } + }); + impl->flushes_pending.push_back(mask); + } + std::function func([this] { UnregisterPending(); }); + impl->rasterizer.SyncOperation(std::move(func)); + if (mask == 0) { + return; + } + impl->ForEachStreamerIn(mask, + [](StreamerInterface* streamer) { streamer->PushUnsyncedQueries(); }); +} + +template +bool QueryCacheBase::HasUncommittedFlushes() const { + bool result = 
false; + impl->ForEachStreamer([&result](StreamerInterface* streamer) { + result |= streamer->HasUnsyncedQueries(); + return result; + }); + return result; +} + +template +bool QueryCacheBase::ShouldWaitAsyncFlushes() { + std::scoped_lock lk(impl->flush_guard); + return !impl->flushes_pending.empty() && impl->flushes_pending.front() != 0ULL; +} + +template +void QueryCacheBase::PopAsyncFlushes() { + u64 mask; + { + std::scoped_lock lk(impl->flush_guard); + mask = impl->flushes_pending.front(); + impl->flushes_pending.pop_front(); + } + if (mask == 0) { + return; + } + impl->ForEachStreamerIn(mask, + [](StreamerInterface* streamer) { streamer->PopUnsyncedQueries(); }); +} + +// Invalidation + +template +void QueryCacheBase::InvalidateQuery(QueryCacheBase::QueryLocation location) { + auto* query_base = impl->ObtainQuery(location); + if (!query_base) { + return; + } + query_base->flags |= QueryFlagBits::IsInvalidated; +} + +template +bool QueryCacheBase::IsQueryDirty(QueryCacheBase::QueryLocation location) { + auto* query_base = impl->ObtainQuery(location); + if (!query_base) { + return false; + } + return True(query_base->flags & QueryFlagBits::IsHostManaged) && + False(query_base->flags & QueryFlagBits::IsGuestSynced); +} + +template +bool QueryCacheBase::SemiFlushQueryDirty(QueryCacheBase::QueryLocation location) { + auto* query_base = impl->ObtainQuery(location); + if (!query_base) { + return false; + } + if (True(query_base->flags & QueryFlagBits::IsFinalValueSynced) && + False(query_base->flags & QueryFlagBits::IsGuestSynced)) { + auto* ptr = impl->cpu_memory.GetPointer(query_base->guest_address); + if (True(query_base->flags & QueryFlagBits::HasTimestamp)) { + std::memcpy(ptr, &query_base->value, sizeof(query_base->value)); + return false; + } + u32 value_l = static_cast(query_base->value); + std::memcpy(ptr, &value_l, sizeof(value_l)); + return false; + } + return True(query_base->flags & QueryFlagBits::IsHostManaged) && + False(query_base->flags & 
QueryFlagBits::IsGuestSynced); +} + +template +void QueryCacheBase::RequestGuestHostSync() { + impl->rasterizer.ReleaseFences(); +} + +} // namespace VideoCommon diff --git a/src/video_core/query_cache/query_cache_base.h b/src/video_core/query_cache/query_cache_base.h new file mode 100644 index 000000000..55f508dd1 --- /dev/null +++ b/src/video_core/query_cache/query_cache_base.h @@ -0,0 +1,181 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "common/assert.h" +#include "common/bit_field.h" +#include "common/common_types.h" +#include "core/memory.h" +#include "video_core/control/channel_state_cache.h" +#include "video_core/query_cache/query_base.h" +#include "video_core/query_cache/types.h" + +namespace Core::Memory { +class Memory; +} + +namespace VideoCore { +class RasterizerInterface; +} + +namespace Tegra { +class GPU; +} + +namespace VideoCommon { + +struct LookupData { + VAddr address; + QueryBase* found_query; +}; + +template +class QueryCacheBase : public VideoCommon::ChannelSetupCaches { + using RuntimeType = typename Traits::RuntimeType; + +public: + union QueryLocation { + BitField<27, 5, u32> stream_id; + BitField<0, 27, u32> query_id; + u32 raw; + + std::pair unpack() { + return {static_cast(stream_id.Value()), static_cast(query_id.Value())}; + } + }; + + explicit QueryCacheBase(Tegra::GPU& gpu, VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_); + + ~QueryCacheBase(); + + void InvalidateRegion(VAddr addr, std::size_t size) { + IterateCache(addr, size, + [this](QueryLocation location) { InvalidateQuery(location); }); + } + + void FlushRegion(VAddr addr, std::size_t size) { + bool result = false; + IterateCache(addr, size, [this, &result](QueryLocation location) { + result |= SemiFlushQueryDirty(location); + return result; + }); + if (result) { + 
RequestGuestHostSync(); + } + } + + static u64 BuildMask(std::span types) { + u64 mask = 0; + for (auto query_type : types) { + mask |= 1ULL << (static_cast(query_type)); + } + return mask; + } + + /// Return true when a CPU region is modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size) { + bool result = false; + IterateCache(addr, size, [this, &result](QueryLocation location) { + result |= IsQueryDirty(location); + return result; + }); + return result; + } + + void CounterEnable(QueryType counter_type, bool is_enabled); + + void CounterReset(QueryType counter_type); + + void CounterClose(QueryType counter_type); + + void CounterReport(GPUVAddr addr, QueryType counter_type, QueryPropertiesFlags flags, + u32 payload, u32 subreport); + + void NotifyWFI(); + + bool AccelerateHostConditionalRendering(); + + // Async downloads + void CommitAsyncFlushes(); + + bool HasUncommittedFlushes() const; + + bool ShouldWaitAsyncFlushes(); + + void PopAsyncFlushes(); + + void NotifySegment(bool resume); + + void BindToChannel(s32 id) override; + +protected: + template + void IterateCache(VAddr addr, std::size_t size, Func&& func) { + static constexpr bool RETURNS_BOOL = + std::is_same_v, bool>; + const u64 addr_begin = addr; + const u64 addr_end = addr_begin + size; + + const u64 page_end = addr_end >> Core::Memory::YUZU_PAGEBITS; + std::scoped_lock lock(cache_mutex); + for (u64 page = addr_begin >> Core::Memory::YUZU_PAGEBITS; page <= page_end; ++page) { + const u64 page_start = page << Core::Memory::YUZU_PAGEBITS; + const auto in_range = [page_start, addr_begin, addr_end](const u32 query_location) { + const u64 cache_begin = page_start + query_location; + const u64 cache_end = cache_begin + sizeof(u32); + return cache_begin < addr_end && addr_begin < cache_end; + }; + const auto& it = cached_queries.find(page); + if (it == std::end(cached_queries)) { + continue; + } + auto& contents = it->second; + for (auto& query : contents) { + if 
(!in_range(query.first)) { + continue; + } + if constexpr (RETURNS_BOOL) { + if (func(query.second)) { + return; + } + } else { + func(query.second); + } + } + if constexpr (remove_from_cache) { + const auto in_range2 = [&](const std::pair& pair) { + return in_range(pair.first); + }; + std::erase_if(contents, in_range2); + } + } + } + + using ContentCache = typename std::unordered_map>; + + void InvalidateQuery(QueryLocation location); + bool IsQueryDirty(QueryLocation location); + bool SemiFlushQueryDirty(QueryLocation location); + void RequestGuestHostSync(); + void UnregisterPending(); + + std::unordered_map> cached_queries; + std::mutex cache_mutex; + + struct QueryCacheBaseImpl; + friend struct QueryCacheBaseImpl; + friend RuntimeType; + + std::unique_ptr impl; +}; + +} // namespace VideoCommon \ No newline at end of file diff --git a/src/video_core/query_cache/query_stream.h b/src/video_core/query_cache/query_stream.h new file mode 100644 index 000000000..dd5f95b3c --- /dev/null +++ b/src/video_core/query_cache/query_stream.h @@ -0,0 +1,125 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include +#include +#include + +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/query_cache/bank_base.h" +#include "video_core/query_cache/query_base.h" + +namespace VideoCommon { + +class StreamerInterface { +public: + StreamerInterface(size_t id_, u64 dependance_mask_ = 0) : id{id_}, dependance_mask{dependance_mask_} {} + virtual ~StreamerInterface() = default; + + virtual QueryBase* GetQuery(size_t id) = 0; + + virtual void StartCounter() { + /* Do Nothing */ + } + + virtual void PauseCounter() { + /* Do Nothing */ + } + + virtual void ResetCounter() { + /* Do Nothing */ + } + + virtual void CloseCounter() { + /* Do Nothing */ + } + + virtual bool HasPendingSync() { + return false; + } + + virtual void PresyncWrites() { + /* Do Nothing */ + } + + virtual 
void SyncWrites() { + /* Do Nothing */ + } + + virtual size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, + std::optional subreport = std::nullopt) = 0; + + virtual bool HasUnsyncedQueries() { + return false; + } + + virtual void PushUnsyncedQueries() { + /* Do Nothing */ + } + + virtual void PopUnsyncedQueries() { + /* Do Nothing */ + } + + virtual void Free(size_t query_id) = 0; + + size_t GetId() const { + return id; + } + +protected: + const size_t id; + const u64 dependance_mask; +}; + +template +class SimpleStreamer : public StreamerInterface { +public: + SimpleStreamer(size_t id_) : StreamerInterface{id_} {} + virtual ~SimpleStreamer() = default; + +protected: + virtual QueryType* GetQuery(size_t query_id) override { + if (query_id < slot_queries.size()) { + return &slot_queries[query_id]; + } + return nullptr; + } + + virtual void Free(size_t query_id) override { + std::scoped_lock lk(guard); + ReleaseQuery(query_id); + } + + template ()...))> + size_t BuildQuery(Args&&... 
args) { + std::scoped_lock lk(guard); + if (!old_queries.empty()) { + size_t new_id = old_queries.front(); + old_queries.pop_front(); + new (&slot_queries[new_id]) QueryType(std::forward(args)...); + return new_id; + } + size_t new_id = slot_queries.size(); + slot_queries.emplace_back(std::forward(args)...); + return new_id; + } + + void ReleaseQuery(size_t query_id) { + + if (query_id < slot_queries.size()) { + old_queries.push_back(query_id); + return; + } + UNREACHABLE(); + } + + std::mutex guard; + std::deque slot_queries; + std::deque old_queries; +}; + +} // namespace VideoCommon \ No newline at end of file diff --git a/src/video_core/query_cache/types.h b/src/video_core/query_cache/types.h new file mode 100644 index 000000000..e9226bbfc --- /dev/null +++ b/src/video_core/query_cache/types.h @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace VideoCommon { + +enum class QueryPropertiesFlags : u32 { + HasTimeout = 1 << 0, + IsAFence = 1 << 1, +}; +DECLARE_ENUM_FLAG_OPERATORS(QueryPropertiesFlags) + +// This should always be equivalent to maxwell3d Report Semaphore Reports +enum class QueryType : u32 { + Payload = 0, // "None" in docs, but confirmed via hardware to return the payload + VerticesGenerated = 1, + ZPassPixelCount = 2, + PrimitivesGenerated = 3, + AlphaBetaClocks = 4, + VertexShaderInvocations = 5, + StreamingPrimitivesNeededMinusSucceeded = 6, + GeometryShaderInvocations = 7, + GeometryShaderPrimitivesGenerated = 9, + ZCullStats0 = 10, + StreamingPrimitivesSucceeded = 11, + ZCullStats1 = 12, + StreamingPrimitivesNeeded = 13, + ZCullStats2 = 14, + ClipperInvocations = 15, + ZCullStats3 = 16, + ClipperPrimitivesGenerated = 17, + VtgPrimitivesOut = 18, + PixelShaderInvocations = 19, + ZPassPixelCount64 = 21, + IEEECleanColorTarget = 24, + IEEECleanZetaTarget = 25, + 
StreamingByteCount = 26, + TessellationInitInvocations = 27, + BoundingRectangle = 28, + TessellationShaderInvocations = 29, + TotalStreamingPrimitivesNeededMinusSucceeded = 30, + TessellationShaderPrimitivesGenerated = 31, + // max. + MaxQueryTypes, +}; + +// Comparison modes for Host Conditional Rendering +enum class ComparisonMode : u32 { + False = 0, + True = 1, + Conditional = 2, + IfEqual = 3, + IfNotEqual = 4, + MaxComparisonMode, +}; + +// Reduction ops. +enum class ReductionOp : u32 { + RedAdd = 0, + RedMin = 1, + RedMax = 2, + RedInc = 3, + RedDec = 4, + RedAnd = 5, + RedOr = 6, + RedXor = 7, + MaxReductionOp, +}; + +} // namespace VideoCommon \ No newline at end of file From f1a2e367113518b277f34ffbb04499882c3b6051 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Fri, 4 Aug 2023 03:32:30 +0200 Subject: [PATCH 02/10] Query Cache: Fully rework Vulkan's query cache --- src/common/settings.cpp | 10 +- src/common/settings.h | 2 + src/video_core/buffer_cache/buffer_cache.h | 19 +- .../buffer_cache/buffer_cache_base.h | 12 + src/video_core/control/channel_state_cache.h | 2 +- src/video_core/engines/maxwell_3d.cpp | 74 +- src/video_core/engines/maxwell_3d.h | 3 - src/video_core/engines/maxwell_dma.cpp | 12 +- src/video_core/engines/puller.cpp | 11 +- src/video_core/fence_manager.h | 23 +- src/video_core/gpu.cpp | 4 +- src/video_core/host_shaders/CMakeLists.txt | 1 + .../resolve_conditional_render.comp | 20 + src/video_core/macro/macro_hle.cpp | 24 + src/video_core/query_cache.h | 13 +- src/video_core/rasterizer_interface.h | 12 +- .../renderer_null/null_rasterizer.cpp | 18 +- .../renderer_null/null_rasterizer.h | 6 +- .../renderer_opengl/gl_query_cache.cpp | 2 +- .../renderer_opengl/gl_query_cache.h | 2 +- .../renderer_opengl/gl_rasterizer.cpp | 32 +- .../renderer_opengl/gl_rasterizer.h | 6 +- .../renderer_vulkan/vk_buffer_cache.cpp | 3 + .../renderer_vulkan/vk_compute_pass.cpp | 47 + .../renderer_vulkan/vk_compute_pass.h | 13 +
.../renderer_vulkan/vk_fence_manager.h | 2 +- .../renderer_vulkan/vk_query_cache.cpp | 1286 +++++++++++++++-- .../renderer_vulkan/vk_query_cache.h | 119 +- .../renderer_vulkan/vk_rasterizer.cpp | 98 +- .../renderer_vulkan/vk_rasterizer.h | 13 +- .../renderer_vulkan/vk_scheduler.cpp | 9 +- src/video_core/renderer_vulkan/vk_scheduler.h | 2 +- src/video_core/vulkan_common/vulkan_device.h | 6 + .../vulkan_common/vulkan_wrapper.cpp | 3 + src/video_core/vulkan_common/vulkan_wrapper.h | 19 + 35 files changed, 1573 insertions(+), 355 deletions(-) create mode 100644 src/video_core/host_shaders/resolve_conditional_render.comp diff --git a/src/common/settings.cpp b/src/common/settings.cpp index 4ecaf550b..3fde3cae6 100644 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp @@ -130,13 +130,17 @@ void LogSettings() { log_path("DataStorage_SDMCDir", Common::FS::GetYuzuPath(Common::FS::YuzuPath::SDMCDir)); } +void UpdateGPUAccuracy() { + values.current_gpu_accuracy = values.gpu_accuracy.GetValue(); +} + bool IsGPULevelExtreme() { - return values.gpu_accuracy.GetValue() == GpuAccuracy::Extreme; + return values.current_gpu_accuracy == GpuAccuracy::Extreme; } bool IsGPULevelHigh() { - return values.gpu_accuracy.GetValue() == GpuAccuracy::Extreme || - values.gpu_accuracy.GetValue() == GpuAccuracy::High; + return values.current_gpu_accuracy == GpuAccuracy::Extreme || + values.current_gpu_accuracy == GpuAccuracy::High; } bool IsFastmemEnabled() { diff --git a/src/common/settings.h b/src/common/settings.h index 82ec9077e..ae5e5d2b8 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -307,6 +307,7 @@ struct Values { Specialization::Default, true, true}; + GpuAccuracy current_gpu_accuracy{GpuAccuracy::High}; SwitchableSetting max_anisotropy{ linkage, AnisotropyMode::Automatic, AnisotropyMode::Automatic, AnisotropyMode::X16, "max_anisotropy", Category::RendererAdvanced}; @@ -522,6 +523,7 @@ struct Values { extern Values values; +void UpdateGPUAccuracy(); bool 
IsGPULevelExtreme(); bool IsGPULevelHigh(); diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 8be7bd594..f91b7d1e4 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -272,13 +272,20 @@ std::pair BufferCache

::ObtainBuffer(GPUVAddr gpu_ad if (!cpu_addr) { return {&slot_buffers[NULL_BUFFER_ID], 0}; } - const BufferId buffer_id = FindBuffer(*cpu_addr, size); + return ObtainCPUBuffer(*cpu_addr, size, sync_info, post_op); +} + +template +std::pair BufferCache

::ObtainCPUBuffer(VAddr cpu_addr, u32 size, + ObtainBufferSynchronize sync_info, + ObtainBufferOperation post_op) { + const BufferId buffer_id = FindBuffer(cpu_addr, size); Buffer& buffer = slot_buffers[buffer_id]; // synchronize op switch (sync_info) { case ObtainBufferSynchronize::FullSynchronize: - SynchronizeBuffer(buffer, *cpu_addr, size); + SynchronizeBuffer(buffer, cpu_addr, size); break; default: break; @@ -286,11 +293,11 @@ std::pair BufferCache

::ObtainBuffer(GPUVAddr gpu_ad switch (post_op) { case ObtainBufferOperation::MarkAsWritten: - MarkWrittenBuffer(buffer_id, *cpu_addr, size); + MarkWrittenBuffer(buffer_id, cpu_addr, size); break; case ObtainBufferOperation::DiscardWrite: { - VAddr cpu_addr_start = Common::AlignDown(*cpu_addr, 64); - VAddr cpu_addr_end = Common::AlignUp(*cpu_addr + size, 64); + VAddr cpu_addr_start = Common::AlignDown(cpu_addr, 64); + VAddr cpu_addr_end = Common::AlignUp(cpu_addr + size, 64); IntervalType interval{cpu_addr_start, cpu_addr_end}; ClearDownload(interval); common_ranges.subtract(interval); @@ -300,7 +307,7 @@ std::pair BufferCache

::ObtainBuffer(GPUVAddr gpu_ad break; } - return {&buffer, buffer.Offset(*cpu_addr)}; + return {&buffer, buffer.Offset(cpu_addr)}; } template diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index 0b7135d49..9507071e5 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -295,6 +295,10 @@ public: [[nodiscard]] std::pair ObtainBuffer(GPUVAddr gpu_addr, u32 size, ObtainBufferSynchronize sync_info, ObtainBufferOperation post_op); + + [[nodiscard]] std::pair ObtainCPUBuffer(VAddr gpu_addr, u32 size, + ObtainBufferSynchronize sync_info, + ObtainBufferOperation post_op); void FlushCachedWrites(); /// Return true when there are uncommitted buffers to be downloaded @@ -335,6 +339,14 @@ public: [[nodiscard]] std::pair GetDrawIndirectBuffer(); + template + void BufferOperations(Func&& func) { + do { + channel_state->has_deleted_buffers = false; + func(); + } while (channel_state->has_deleted_buffers); + } + std::recursive_mutex mutex; Runtime& runtime; diff --git a/src/video_core/control/channel_state_cache.h b/src/video_core/control/channel_state_cache.h index 46bc9e322..5574e1fba 100644 --- a/src/video_core/control/channel_state_cache.h +++ b/src/video_core/control/channel_state_cache.h @@ -51,7 +51,7 @@ public: virtual void CreateChannel(Tegra::Control::ChannelState& channel); /// Bind a channel for execution. - void BindToChannel(s32 id); + virtual void BindToChannel(s32 id); /// Erase channel's state. void EraseChannel(s32 id); diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 06e349e43..922c399e6 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -20,8 +20,6 @@ namespace Tegra::Engines { -using VideoCore::QueryType; - /// First register id that is actually a Macro call. 
constexpr u32 MacroRegistersStart = 0xE00; @@ -500,27 +498,21 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) { } void Maxwell3D::ProcessQueryGet() { + VideoCommon::QueryPropertiesFlags flags{}; + if (regs.report_semaphore.query.short_query == 0) { + flags |= VideoCommon::QueryPropertiesFlags::HasTimeout; + } + const GPUVAddr sequence_address{regs.report_semaphore.Address()}; + const VideoCommon::QueryType query_type = + static_cast(regs.report_semaphore.query.report.Value()); + const u32 payload = regs.report_semaphore.payload; + const u32 subreport = regs.report_semaphore.query.sub_report; switch (regs.report_semaphore.query.operation) { case Regs::ReportSemaphore::Operation::Release: if (regs.report_semaphore.query.short_query != 0) { - const GPUVAddr sequence_address{regs.report_semaphore.Address()}; - const u32 payload = regs.report_semaphore.payload; - std::function operation([this, sequence_address, payload] { - memory_manager.Write(sequence_address, payload); - }); - rasterizer->SignalFence(std::move(operation)); - } else { - struct LongQueryResult { - u64_le value; - u64_le timestamp; - }; - const GPUVAddr sequence_address{regs.report_semaphore.Address()}; - const u32 payload = regs.report_semaphore.payload; - [this, sequence_address, payload] { - memory_manager.Write(sequence_address + sizeof(u64), system.GPU().GetTicks()); - memory_manager.Write(sequence_address, payload); - }(); + flags |= VideoCommon::QueryPropertiesFlags::IsAFence; } + rasterizer->Query(sequence_address, query_type, flags, payload, subreport); break; case Regs::ReportSemaphore::Operation::Acquire: // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that @@ -528,11 +520,7 @@ void Maxwell3D::ProcessQueryGet() { UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE"); break; case Regs::ReportSemaphore::Operation::ReportOnly: - if (const std::optional result = GetQueryResult()) { - // If the query returns an empty optional it means 
it's cached and deferred. - // In this case we have a non-empty result, so we stamp it immediately. - StampQueryResult(*result, regs.report_semaphore.query.short_query == 0); - } + rasterizer->Query(sequence_address, query_type, flags, payload, subreport); break; case Regs::ReportSemaphore::Operation::Trap: UNIMPLEMENTED_MSG("Unimplemented query operation TRAP"); @@ -544,6 +532,10 @@ void Maxwell3D::ProcessQueryGet() { } void Maxwell3D::ProcessQueryCondition() { + if (rasterizer->AccelerateConditionalRendering()) { + execute_on = true; + return; + } const GPUVAddr condition_address{regs.render_enable.Address()}; switch (regs.render_enable_override) { case Regs::RenderEnable::Override::AlwaysRender: @@ -553,10 +545,6 @@ void Maxwell3D::ProcessQueryCondition() { execute_on = false; break; case Regs::RenderEnable::Override::UseRenderEnable: { - if (rasterizer->AccelerateConditionalRendering()) { - execute_on = true; - return; - } switch (regs.render_enable.mode) { case Regs::RenderEnable::Mode::True: { execute_on = true; @@ -606,7 +594,13 @@ void Maxwell3D::ProcessCounterReset() { #endif switch (regs.clear_report_value) { case Regs::ClearReport::ZPassPixelCount: - rasterizer->ResetCounter(QueryType::SamplesPassed); + rasterizer->ResetCounter(VideoCommon::QueryType::ZPassPixelCount64); + break; + case Regs::ClearReport::PrimitivesGenerated: + rasterizer->ResetCounter(VideoCommon::QueryType::StreamingByteCount); + break; + case Regs::ClearReport::VtgPrimitivesOut: + rasterizer->ResetCounter(VideoCommon::QueryType::StreamingByteCount); break; default: LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.clear_report_value); @@ -620,28 +614,6 @@ void Maxwell3D::ProcessSyncPoint() { rasterizer->SignalSyncPoint(sync_point); } -std::optional Maxwell3D::GetQueryResult() { - switch (regs.report_semaphore.query.report) { - case Regs::ReportSemaphore::Report::Payload: - return regs.report_semaphore.payload; - case Regs::ReportSemaphore::Report::ZPassPixelCount64: -#if 
ANDROID - if (!Settings::IsGPULevelHigh()) { - // This is problematic on Android, disable on GPU Normal. - return 120; - } -#endif - // Deferred. - rasterizer->Query(regs.report_semaphore.Address(), QueryType::SamplesPassed, - system.GPU().GetTicks()); - return std::nullopt; - default: - LOG_DEBUG(HW_GPU, "Unimplemented query report type {}", - regs.report_semaphore.query.report.Value()); - return 1; - } -} - void Maxwell3D::ProcessCBBind(size_t stage_index) { // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader // stage. diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 6c19354e1..17faacc37 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -3182,9 +3182,6 @@ private: /// Handles writes to syncing register. void ProcessSyncPoint(); - /// Returns a query's value or an empty object if the value will be deferred through a cache. - std::optional GetQueryResult(); - void RefreshParametersImpl(); bool IsMethodExecutable(u32 method); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 279f0daa1..422d4d859 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -362,21 +362,17 @@ void MaxwellDMA::ReleaseSemaphore() { const auto type = regs.launch_dma.semaphore_type; const GPUVAddr address = regs.semaphore.address; const u32 payload = regs.semaphore.payload; + VideoCommon::QueryPropertiesFlags flags{VideoCommon::QueryPropertiesFlags::IsAFence}; switch (type) { case LaunchDMA::SemaphoreType::NONE: break; case LaunchDMA::SemaphoreType::RELEASE_ONE_WORD_SEMAPHORE: { - std::function operation( - [this, address, payload] { memory_manager.Write(address, payload); }); - rasterizer->SignalFence(std::move(operation)); + rasterizer->Query(address, VideoCommon::QueryType::Payload, flags, payload, 0); break; } case LaunchDMA::SemaphoreType::RELEASE_FOUR_WORD_SEMAPHORE: { - 
std::function operation([this, address, payload] { - memory_manager.Write(address + sizeof(u64), system.GPU().GetTicks()); - memory_manager.Write(address, payload); - }); - rasterizer->SignalFence(std::move(operation)); + rasterizer->Query(address, VideoCommon::QueryType::Payload, + flags | VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0); break; } default: diff --git a/src/video_core/engines/puller.cpp b/src/video_core/engines/puller.cpp index 6de2543b7..582738234 100644 --- a/src/video_core/engines/puller.cpp +++ b/src/video_core/engines/puller.cpp @@ -82,10 +82,7 @@ void Puller::ProcessSemaphoreTriggerMethod() { if (op == GpuSemaphoreOperation::WriteLong) { const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()}; const u32 payload = regs.semaphore_sequence; - [this, sequence_address, payload] { - memory_manager.Write(sequence_address + sizeof(u64), gpu.GetTicks()); - memory_manager.Write(sequence_address, payload); - }(); + rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload, VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0); } else { do { const u32 word{memory_manager.Read(regs.semaphore_address.SemaphoreAddress())}; @@ -120,10 +117,7 @@ void Puller::ProcessSemaphoreTriggerMethod() { void Puller::ProcessSemaphoreRelease() { const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()}; const u32 payload = regs.semaphore_release; - std::function operation([this, sequence_address, payload] { - memory_manager.Write(sequence_address, payload); - }); - rasterizer->SignalFence(std::move(operation)); + rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload, VideoCommon::QueryPropertiesFlags::IsAFence, payload, 0); } void Puller::ProcessSemaphoreAcquire() { @@ -132,7 +126,6 @@ void Puller::ProcessSemaphoreAcquire() { while (word != value) { regs.acquire_active = true; regs.acquire_value = value; - std::this_thread::sleep_for(std::chrono::milliseconds(1)); rasterizer->ReleaseFences(); word = 
memory_manager.Read(regs.semaphore_address.SemaphoreAddress()); // TODO(kemathe73) figure out how to do the acquire_timeout diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index ab20ff30f..8459a3092 100644 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h @@ -104,9 +104,28 @@ public: SignalFence(std::move(func)); } - void WaitPendingFences() { + void WaitPendingFences(bool force) { if constexpr (!can_async_check) { - TryReleasePendingFences(); + if (force) { + TryReleasePendingFences(); + } else { + TryReleasePendingFences(); + } + } else { + if (!force) { + return; + } + std::mutex wait_mutex; + std::condition_variable wait_cv; + std::atomic wait_finished{}; + std::function func([&] { + std::scoped_lock lk(wait_mutex); + wait_finished.store(true, std::memory_order_relaxed); + wait_cv.notify_all(); + }); + SignalFence(std::move(func)); + std::unique_lock lk(wait_mutex); + wait_cv.wait(lk, [&wait_finished] { return wait_finished.load(std::memory_order_relaxed); }); } } diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index c192e33b2..11549d448 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -102,7 +102,8 @@ struct GPU::Impl { /// Signal the ending of command list. void OnCommandListEnd() { - rasterizer->ReleaseFences(); + rasterizer->ReleaseFences(false); + Settings::UpdateGPUAccuracy(); } /// Request a host GPU memory flush from the CPU. @@ -220,6 +221,7 @@ struct GPU::Impl { /// This can be used to launch any necessary threads and register any necessary /// core timing events. 
void Start() { + Settings::UpdateGPUAccuracy(); gpu_thread.StartThread(*renderer, renderer->Context(), *scheduler); } diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index c4d459077..fb24b6532 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -41,6 +41,7 @@ set(SHADER_FILES pitch_unswizzle.comp present_bicubic.frag present_gaussian.frag + resolve_conditional_render.comp smaa_edge_detection.vert smaa_edge_detection.frag smaa_blending_weight_calculation.vert diff --git a/src/video_core/host_shaders/resolve_conditional_render.comp b/src/video_core/host_shaders/resolve_conditional_render.comp new file mode 100644 index 000000000..307e77d1a --- /dev/null +++ b/src/video_core/host_shaders/resolve_conditional_render.comp @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#version 450 + +layout(local_size_x = 1) in; + +layout(std430, binding = 0) buffer Query { + uvec2 initial; + uvec2 unknown; + uvec2 current; +}; + +layout(std430, binding = 1) buffer Result { + uint result; +}; + +void main() { + result = all(equal(initial, current)) ? 
1 : 0; +} diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp index 6272a4652..e980af171 100644 --- a/src/video_core/macro/macro_hle.cpp +++ b/src/video_core/macro/macro_hle.cpp @@ -319,6 +319,25 @@ private: } }; +class HLE_DrawIndirectByteCount final : public HLEMacroImpl { +public: + explicit HLE_DrawIndirectByteCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} + + void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { + maxwell3d.RefreshParameters(); + + maxwell3d.regs.draw.begin = parameters[0]; + maxwell3d.regs.draw_auto_stride = parameters[1]; + maxwell3d.regs.draw_auto_byte_count = parameters[2]; + + if (maxwell3d.ShouldExecute()) { + maxwell3d.draw_manager->DrawArray( + maxwell3d.regs.draw.topology, 0, + maxwell3d.regs.draw_auto_byte_count / maxwell3d.regs.draw_auto_stride, 0, 1); + } + } +}; + class HLE_C713C83D8F63CCF3 final : public HLEMacroImpl { public: explicit HLE_C713C83D8F63CCF3(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} @@ -536,6 +555,11 @@ HLEMacro::HLEMacro(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} { [](Maxwell3D& maxwell3d__) -> std::unique_ptr { return std::make_unique(maxwell3d__); })); + builders.emplace(0xB5F74EDB717278ECULL, + std::function(Maxwell3D&)>( + [](Maxwell3D& maxwell3d__) -> std::unique_ptr { + return std::make_unique(maxwell3d__); + })); } HLEMacro::~HLEMacro() = default; diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index 7047e2e63..9fcaeeac7 100644 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h @@ -25,6 +25,13 @@ #include "video_core/rasterizer_interface.h" #include "video_core/texture_cache/slot_vector.h" +namespace VideoCore { +enum class QueryType { + SamplesPassed, +}; +constexpr std::size_t NumQueryTypes = 1; +} // namespace VideoCore + namespace VideoCommon { using AsyncJobId = SlotId; @@ -98,10 +105,10 @@ private: }; template -class QueryCacheBase : public 
VideoCommon::ChannelSetupCaches { +class QueryCacheLegacy : public VideoCommon::ChannelSetupCaches { public: - explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_, - Core::Memory::Memory& cpu_memory_) + explicit QueryCacheLegacy(VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_) : rasterizer{rasterizer_}, // Use reinterpret_cast instead of static_cast as workaround for // UBSan bug (https://github.com/llvm/llvm-project/issues/59060) diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index cb8029a4f..2ba7cbb0d 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -9,6 +9,7 @@ #include #include "common/common_types.h" #include "common/polyfill_thread.h" +#include "video_core/query_cache/types.h" #include "video_core/cache_types.h" #include "video_core/engines/fermi_2d.h" #include "video_core/gpu.h" @@ -26,11 +27,6 @@ struct ChannelState; namespace VideoCore { -enum class QueryType { - SamplesPassed, -}; -constexpr std::size_t NumQueryTypes = 1; - enum class LoadCallbackStage { Prepare, Build, @@ -58,10 +54,10 @@ public: virtual void DispatchCompute() = 0; /// Resets the counter of a query - virtual void ResetCounter(QueryType type) = 0; + virtual void ResetCounter(VideoCommon::QueryType type) = 0; /// Records a GPU query and caches it - virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional timestamp) = 0; + virtual void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) = 0; /// Signal an uniform buffer binding virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, @@ -83,7 +79,7 @@ public: virtual void SignalReference() = 0; /// Release all pending fences. 
- virtual void ReleaseFences() = 0; + virtual void ReleaseFences(bool force = true) = 0; /// Notify rasterizer that all caches should be flushed to Switch memory virtual void FlushAll() = 0; diff --git a/src/video_core/renderer_null/null_rasterizer.cpp b/src/video_core/renderer_null/null_rasterizer.cpp index 92ecf6682..65cd5aa06 100644 --- a/src/video_core/renderer_null/null_rasterizer.cpp +++ b/src/video_core/renderer_null/null_rasterizer.cpp @@ -26,16 +26,18 @@ void RasterizerNull::Draw(bool is_indexed, u32 instance_count) {} void RasterizerNull::DrawTexture() {} void RasterizerNull::Clear(u32 layer_count) {} void RasterizerNull::DispatchCompute() {} -void RasterizerNull::ResetCounter(VideoCore::QueryType type) {} -void RasterizerNull::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, - std::optional timestamp) { +void RasterizerNull::ResetCounter(VideoCommon::QueryType type) {} +void RasterizerNull::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, + VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { if (!gpu_memory) { return; } - - gpu_memory->Write(gpu_addr, u64{0}); - if (timestamp) { - gpu_memory->Write(gpu_addr + 8, *timestamp); + if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { + u64 ticks = m_gpu.GetTicks(); + gpu_memory->Write(gpu_addr + 8, ticks); + gpu_memory->Write(gpu_addr, static_cast(payload)); + } else { + gpu_memory->Write(gpu_addr, payload); } } void RasterizerNull::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, @@ -74,7 +76,7 @@ void RasterizerNull::SignalSyncPoint(u32 value) { syncpoint_manager.IncrementHost(value); } void RasterizerNull::SignalReference() {} -void RasterizerNull::ReleaseFences() {} +void RasterizerNull::ReleaseFences(bool) {} void RasterizerNull::FlushAndInvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType) {} void RasterizerNull::WaitForIdle() {} void RasterizerNull::FragmentBarrier() {} diff --git a/src/video_core/renderer_null/null_rasterizer.h 
b/src/video_core/renderer_null/null_rasterizer.h index 93b9a6971..57a8c4c85 100644 --- a/src/video_core/renderer_null/null_rasterizer.h +++ b/src/video_core/renderer_null/null_rasterizer.h @@ -42,8 +42,8 @@ public: void DrawTexture() override; void Clear(u32 layer_count) override; void DispatchCompute() override; - void ResetCounter(VideoCore::QueryType type) override; - void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional timestamp) override; + void ResetCounter(VideoCommon::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; void FlushAll() override; @@ -63,7 +63,7 @@ public: void SyncOperation(std::function&& func) override; void SignalSyncPoint(u32 value) override; void SignalReference() override; - void ReleaseFences() override; + void ReleaseFences(bool force) override; void FlushAndInvalidateRegion( VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; void WaitForIdle() override; diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp index 99d7347f5..ec142d48e 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.cpp +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -27,7 +27,7 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) { } // Anonymous namespace QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_) - : QueryCacheBase(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {} + : QueryCacheLegacy(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {} QueryCache::~QueryCache() = default; diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h index 
872513f22..0721e0b3d 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.h +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -26,7 +26,7 @@ class RasterizerOpenGL; using CounterStream = VideoCommon::CounterStreamBase; class QueryCache final - : public VideoCommon::QueryCacheBase { + : public VideoCommon::QueryCacheLegacy { public: explicit QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_); ~QueryCache(); diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index dd03efecd..a975bbe75 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -396,13 +396,31 @@ void RasterizerOpenGL::DispatchCompute() { has_written_global_memory |= pipeline->WritesGlobalMemory(); } -void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) { - query_cache.ResetCounter(type); +void RasterizerOpenGL::ResetCounter(VideoCommon::QueryType type) { + if (type == VideoCommon::QueryType::ZPassPixelCount64) { + query_cache.ResetCounter(VideoCore::QueryType::SamplesPassed); + } } -void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, - std::optional timestamp) { - query_cache.Query(gpu_addr, type, timestamp); +void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, + VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { + if (type == VideoCommon::QueryType::ZPassPixelCount64) { + std::optional timestamp{True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout) + ? 
std::make_optional(gpu.GetTicks()) : std:: nullopt }; + if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { + query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, {gpu.GetTicks()}); + } else { + query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, std::nullopt); + } + return; + } + if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { + u64 ticks = gpu.GetTicks(); + gpu_memory->Write(gpu_addr + 8, ticks); + gpu_memory->Write(gpu_addr, static_cast(payload)); + } else { + gpu_memory->Write(gpu_addr, payload); + } } void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, @@ -573,8 +591,8 @@ void RasterizerOpenGL::SignalReference() { fence_manager.SignalOrdering(); } -void RasterizerOpenGL::ReleaseFences() { - fence_manager.WaitPendingFences(); +void RasterizerOpenGL::ReleaseFences(bool force) { + fence_manager.WaitPendingFences(force); } void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size, diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 8eda2ddba..05e048e15 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -86,8 +86,8 @@ public: void DrawTexture() override; void Clear(u32 layer_count) override; void DispatchCompute() override; - void ResetCounter(VideoCore::QueryType type) override; - void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional timestamp) override; + void ResetCounter(VideoCommon::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; void FlushAll() override; @@ -107,7 +107,7 @@ public: void SyncOperation(std::function&& func) override; 
void SignalSyncPoint(u32 value) override; void SignalReference() override; - void ReleaseFences() override; + void ReleaseFences(bool force = true) override; void FlushAndInvalidateRegion( VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; void WaitForIdle() override; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index e15865d16..d8148e89a 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -61,6 +61,9 @@ vk::Buffer CreateBuffer(const Device& device, const MemoryAllocator& memory_allo if (device.IsExtTransformFeedbackSupported()) { flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; } + if (device.IsExtConditionalRendering()) { + flags |= VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT; + } const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 54ee030ce..97cd4521d 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -12,6 +12,7 @@ #include "common/common_types.h" #include "common/div_ceil.h" #include "video_core/host_shaders/astc_decoder_comp_spv.h" +#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" @@ -302,6 +303,52 @@ std::pair QuadIndexedPass::Assemble( return {staging.buffer, staging.offset}; } +ConditionalRenderingResolvePass::ConditionalRenderingResolvePass(const Device& device_, + Scheduler& scheduler_, + DescriptorPool& descriptor_pool_, ComputePassDescriptorQueue& compute_pass_descriptor_queue_) + : ComputePass(device_, 
descriptor_pool_, INPUT_OUTPUT_DESCRIPTOR_SET_BINDINGS,
+                  INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE, INPUT_OUTPUT_BANK_INFO, nullptr,
+                  RESOLVE_CONDITIONAL_RENDER_COMP_SPV),
+      scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
+
+/// Records one dispatch of the conditional-rendering resolve compute shader.
+///
+/// Two buffer ranges are queued as descriptors, in this order:
+///   1. `src_buffer`, starting at `src_offset`, `compare_size` bytes long
+///   2. `dst_buffer`, starting at offset 0, `sizeof(u32)` bytes long
+/// The dispatch is bracketed by a barrier before it (prior writes -> compute shader access) and
+/// a barrier after it (compute shader writes -> conditional rendering reads).
+///
+/// @param dst_buffer      Buffer bound as the second descriptor (first `sizeof(u32)` bytes).
+/// @param src_buffer      Buffer bound as the first descriptor.
+/// @param src_offset      Byte offset into `src_buffer` where the bound range starts.
+/// @param compare_to_zero Selects how many source bytes are bound: 8 when true, 24 when false.
+void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer,
+                                              u32 src_offset, bool compare_to_zero) {
+    scheduler.RequestOutsideRenderPassOperationContext();
+
+    // NOTE(review): presumably 8 bytes is a single 64-bit value and 24 bytes covers a pair of
+    // values that the shader compares - confirm against the resolve shader source.
+    const size_t compare_size = compare_to_zero ? 8 : 24;
+
+    compute_pass_descriptor_queue.Acquire();
+    compute_pass_descriptor_queue.AddBuffer(src_buffer, src_offset, compare_size);
+    compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, sizeof(u32));
+    const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()};
+
+    // NOTE(review): the same request is already made at the top of this function; this second
+    // call looks redundant - confirm it is a no-op here before removing it.
+    scheduler.RequestOutsideRenderPassOperationContext();
+    scheduler.Record([this, descriptor_data](vk::CommandBuffer cmdbuf) {
+        // srcAccessMask/dstAccessMask are VkAccessFlags fields, so they must be given
+        // VK_ACCESS_* values, never VK_PIPELINE_STAGE_* values. The stages themselves are
+        // passed to PipelineBarrier below.
+        static constexpr VkMemoryBarrier read_barrier{
+            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+            .pNext = nullptr,
+            .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
+            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
+        };
+        static constexpr VkMemoryBarrier write_barrier{
+            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+            .pNext = nullptr,
+            .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
+            .dstAccessMask = VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT,
+        };
+        // The descriptor set is committed and filled when the recorded command executes, using
+        // the payload pointer captured above.
+        const VkDescriptorSet set = descriptor_allocator.Commit();
+        device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
+
+        // Make earlier memory writes available to the compute shader before it runs.
+        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+                               VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier);
+        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
+        cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
+        cmdbuf.Dispatch(1, 1, 1);
+        // Make the shader's writes available to the conditional rendering stage.
+        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+                               VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier);
+    });
+}
+
ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, StagingBufferPool& staging_buffer_pool_, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index dd3927376..c62f30d30 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -82,6 +82,19 @@ private: ComputePassDescriptorQueue& compute_pass_descriptor_queue; }; +class ConditionalRenderingResolvePass final : public ComputePass { +public: + explicit ConditionalRenderingResolvePass( + const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_); + + void Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, u32 src_offset, bool compare_to_zero); + +private: + Scheduler& scheduler; + ComputePassDescriptorQueue& compute_pass_descriptor_queue; +}; + class ASTCDecoderPass final : public ComputePass { public: explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h index 145359d4e..14fc5ad71 100644 --- a/src/video_core/renderer_vulkan/vk_fence_manager.h +++ b/src/video_core/renderer_vulkan/vk_fence_manager.h @@ -8,6 +8,7 @@ #include "video_core/fence_manager.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" namespace Core { class System; @@ -20,7 +21,6 @@ class RasterizerInterface; namespace Vulkan { class Device; -class QueryCache; class Scheduler; class InnerFence : public VideoCommon::FenceBase { diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 29e0b797b..42f571007 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ 
b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -1,139 +1,1223 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later -#include #include +#include +#include +#include +#include +#include +#include #include #include +#include +#include + +#include "common/common_types.h" +#include "core/memory.h" +#include "video_core/query_cache/query_cache.h" +#include "video_core/renderer_vulkan/vk_buffer_cache.h" +#include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_resource_pool.h" #include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -using VideoCore::QueryType; +using VideoCommon::QueryType; namespace { +class SamplesQueryBank : public VideoCommon::BankBase { +public: + static constexpr size_t BANK_SIZE = 256; + static constexpr size_t QUERY_SIZE = 8; + SamplesQueryBank(const Device& device_, size_t index_) + : BankBase(BANK_SIZE), device{device_}, index{index_} { + const auto& dev = device.GetLogical(); + query_pool = dev.CreateQueryPool({ + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .queryType = VK_QUERY_TYPE_OCCLUSION, + .queryCount = BANK_SIZE, + .pipelineStatistics = 0, + }); + Reset(); + } -constexpr std::array QUERY_TARGETS = {VK_QUERY_TYPE_OCCLUSION}; + ~SamplesQueryBank() = default; -constexpr VkQueryType GetTarget(QueryType type) { - return QUERY_TARGETS[static_cast(type)]; -} + void Reset() override { + ASSERT(references == 0); + 
VideoCommon::BankBase::Reset(); + const auto& dev = device.GetLogical(); + dev.ResetQueryPool(*query_pool, 0, BANK_SIZE); + host_results.fill(0ULL); + next_bank = 0; + } -} // Anonymous namespace + void Sync(size_t start, size_t size) { + const auto& dev = device.GetLogical(); + const VkResult query_result = dev.GetQueryResults( + *query_pool, static_cast(start), static_cast(size), sizeof(u64) * size, + &host_results[start], sizeof(u64), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); + switch (query_result) { + case VK_SUCCESS: + return; + case VK_ERROR_DEVICE_LOST: + device.ReportLoss(); + [[fallthrough]]; + default: + throw vk::Exception(query_result); + } + } -QueryPool::QueryPool(const Device& device_, Scheduler& scheduler, QueryType type_) - : ResourcePool{scheduler.GetMasterSemaphore(), GROW_STEP}, device{device_}, type{type_} {} + VkQueryPool GetInnerPool() { + return *query_pool; + } -QueryPool::~QueryPool() = default; + size_t GetIndex() const { + return index; + } -std::pair QueryPool::Commit() { - std::size_t index; - do { - index = CommitResource(); - } while (usage[index]); - usage[index] = true; + const std::array& GetResults() const { + return host_results; + } - return {*pools[index / GROW_STEP], static_cast(index % GROW_STEP)}; -} + size_t next_bank; -void QueryPool::Allocate(std::size_t begin, std::size_t end) { - usage.resize(end); +private: + const Device& device; + const size_t index; + vk::QueryPool query_pool; + std::array host_results; +}; - pools.push_back(device.GetLogical().CreateQueryPool({ - .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .queryType = GetTarget(type), - .queryCount = static_cast(end - begin), - .pipelineStatistics = 0, - })); -} +using BaseStreamer = VideoCommon::SimpleStreamer; -void QueryPool::Reserve(std::pair query) { - const auto it = - std::find_if(pools.begin(), pools.end(), [query_pool = query.first](vk::QueryPool& pool) { - return query_pool == *pool; +struct 
HostSyncValues { + VAddr address; + size_t size; + size_t offset; + + static constexpr bool GeneratesBaseBuffer = false; +}; + +template +class SamplesStreamer : public BaseStreamer { +public: + SamplesStreamer(size_t id, QueryCacheRuntime& runtime_, const Device& device_, + Scheduler& scheduler_, const MemoryAllocator& memory_allocator_) + : BaseStreamer(id), runtime{runtime_}, device{device_}, scheduler{scheduler_}, + memory_allocator{memory_allocator_} { + BuildResolveBuffer(); + current_bank = nullptr; + current_query = nullptr; + } + + void StartCounter() override { + if (has_started) { + return; + } + ReserveHostQuery(); + scheduler.Record([query_pool = current_query_pool, + query_index = current_bank_slot](vk::CommandBuffer cmdbuf) { + const bool use_precise = Settings::IsGPULevelHigh(); + cmdbuf.BeginQuery(query_pool, static_cast(query_index), + use_precise ? VK_QUERY_CONTROL_PRECISE_BIT : 0); + }); + has_started = true; + } + + void PauseCounter() override { + if (!has_started) { + return; + } + scheduler.Record([query_pool = current_query_pool, + query_index = current_bank_slot](vk::CommandBuffer cmdbuf) { + cmdbuf.EndQuery(query_pool, static_cast(query_index)); + }); + has_started = false; + } + + void ResetCounter() override { + if (has_started) { + PauseCounter(); + } + AbandonCurrentQuery(); + } + + void CloseCounter() override { + PauseCounter(); + } + + bool HasPendingSync() override { + return !pending_sync.empty(); + } + + void SyncWrites() override { + if (sync_values_stash.empty()) { + return; + } + + for (size_t i = 0; i < sync_values_stash.size(); i++) { + runtime.template SyncValues(sync_values_stash[i], *resolve_buffers[i]); + } + + sync_values_stash.clear(); + } + + void PresyncWrites() override { + if (pending_sync.empty()) { + return; + } + PauseCounter(); + sync_values_stash.clear(); + sync_values_stash.emplace_back(); + std::vector* sync_values = &sync_values_stash.back(); + sync_values->reserve(resolve_slots * 
SamplesQueryBank::BANK_SIZE); + std::unordered_map> offsets; + size_t this_bank_slot = std::numeric_limits::max(); + size_t resolve_slots_remaining = resolve_slots; + size_t resolve_buffer_index = 0; + ApplyBanksWideOp(pending_sync, [&](SamplesQueryBank* bank, size_t start, + size_t amount) { + size_t bank_id = bank->GetIndex(); + if (this_bank_slot != bank_id) { + this_bank_slot = bank_id; + if (resolve_slots_remaining == 0) { + resolve_buffer_index++; + if (resolve_buffer_index >= resolve_buffers.size()) { + BuildResolveBuffer(); + } + resolve_slots_remaining = resolve_slots; + sync_values_stash.emplace_back(); + sync_values = sync_values = &sync_values_stash.back(); + sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE); + } + resolve_slots_remaining--; + } + auto& resolve_buffer = resolve_buffers[resolve_buffer_index]; + const size_t base_offset = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * + (resolve_slots - resolve_slots_remaining - 1); + VkQueryPool query_pool = bank->GetInnerPool(); + scheduler.Record([start, amount, base_offset, query_pool, + buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) { + size_t final_offset = base_offset + start * SamplesQueryBank::QUERY_SIZE; + const VkBufferMemoryBarrier copy_query_pool_barrier{ + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = buffer, + .offset = final_offset, + .size = amount * SamplesQueryBank::QUERY_SIZE, + }; + + cmdbuf.CopyQueryPoolResults( + query_pool, static_cast(start), static_cast(amount), buffer, + static_cast(final_offset), SamplesQueryBank::QUERY_SIZE, + VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier); + }); + 
offsets[bank_id] = {sync_values_stash.size() - 1, base_offset}; }); - if (it != std::end(pools)) { - const std::ptrdiff_t pool_index = std::distance(std::begin(pools), it); - usage[pool_index * GROW_STEP + static_cast(query.second)] = false; + // Convert queries + for (auto q : pending_sync) { + auto* query = GetQuery(q); + if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) { + continue; + } + if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) { + continue; + } + if (query->size_slots > 1) { + // This is problematic. + UNIMPLEMENTED(); + } + query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; + auto loc_data = offsets[query->start_bank_id]; + sync_values_stash[loc_data.first].emplace_back(HostSyncValues{ + .address = query->guest_address, + .size = SamplesQueryBank::QUERY_SIZE, + .offset = loc_data.second + query->start_slot * SamplesQueryBank::QUERY_SIZE, + }); + } + + AbandonCurrentQuery(); + pending_sync.clear(); } -} -QueryCache::QueryCache(VideoCore::RasterizerInterface& rasterizer_, - Core::Memory::Memory& cpu_memory_, const Device& device_, - Scheduler& scheduler_) - : QueryCacheBase{rasterizer_, cpu_memory_}, device{device_}, scheduler{scheduler_}, - query_pools{ - QueryPool{device_, scheduler_, QueryType::SamplesPassed}, - } {} - -QueryCache::~QueryCache() { - // TODO(Rodrigo): This is a hack to destroy all HostCounter instances before the base class - // destructor is called. The query cache should be redesigned to have a proper ownership model - // instead of using shared pointers. 
- for (size_t query_type = 0; query_type < VideoCore::NumQueryTypes; ++query_type) { - auto& stream = Stream(static_cast(query_type)); - stream.Update(false); - stream.Reset(); + size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, + [[maybe_unused]] std::optional subreport) override { + auto index = BuildQuery(); + auto* new_query = GetQuery(index); + new_query->guest_address = address; + new_query->value = 100; + new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan; + if (has_timestamp) { + new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp; + } + if (!current_query) { + new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + return index; + } + new_query->start_bank_id = current_query->start_bank_id; + new_query->size_banks = current_query->size_banks; + new_query->start_slot = current_query->start_slot; + new_query->size_slots = current_query->size_slots; + ApplyBankOp(new_query, [](SamplesQueryBank* bank, size_t start, size_t amount) { + bank->AddReference(amount); + }); + pending_sync.push_back(index); + pending_flush_queries.push_back(index); + return index; } -} -std::pair QueryCache::AllocateQuery(QueryType type) { - return query_pools[static_cast(type)].Commit(); -} - -void QueryCache::Reserve(QueryType type, std::pair query) { - query_pools[static_cast(type)].Reserve(query); -} - -HostCounter::HostCounter(QueryCache& cache_, std::shared_ptr dependency_, - QueryType type_) - : HostCounterBase{std::move(dependency_)}, cache{cache_}, type{type_}, - query{cache_.AllocateQuery(type_)}, tick{cache_.GetScheduler().CurrentTick()} { - const vk::Device* logical = &cache.GetDevice().GetLogical(); - cache.GetScheduler().Record([logical, query_ = query](vk::CommandBuffer cmdbuf) { - const bool use_precise = Settings::IsGPULevelHigh(); - logical->ResetQueryPool(query_.first, query_.second, 1); - cmdbuf.BeginQuery(query_.first, query_.second, - use_precise ? 
VK_QUERY_CONTROL_PRECISE_BIT : 0); - }); -} - -HostCounter::~HostCounter() { - cache.Reserve(type, query); -} - -void HostCounter::EndQuery() { - cache.GetScheduler().Record([query_ = query](vk::CommandBuffer cmdbuf) { - cmdbuf.EndQuery(query_.first, query_.second); - }); -} - -u64 HostCounter::BlockingQuery(bool async) const { - if (!async) { - cache.GetScheduler().Wait(tick); + bool HasUnsyncedQueries() override { + return !pending_flush_queries.empty(); } - u64 data; - const VkResult query_result = cache.GetDevice().GetLogical().GetQueryResults( - query.first, query.second, 1, sizeof(data), &data, sizeof(data), - VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); - switch (query_result) { - case VK_SUCCESS: - return data; - case VK_ERROR_DEVICE_LOST: - cache.GetDevice().ReportLoss(); - [[fallthrough]]; + void PushUnsyncedQueries() override { + PauseCounter(); + { + std::scoped_lock lk(flush_guard); + pending_flush_sets.emplace_back(std::move(pending_flush_queries)); + } + } + + void PopUnsyncedQueries() override { + std::vector current_flush_queries; + { + std::scoped_lock lk(flush_guard); + current_flush_queries = std::move(pending_flush_sets.front()); + pending_flush_sets.pop_front(); + } + ApplyBanksWideOp( + current_flush_queries, + [](SamplesQueryBank* bank, size_t start, size_t amount) { bank->Sync(start, amount); }); + for (auto q : current_flush_queries) { + auto* query = GetQuery(q); + u64 total = 0; + ApplyBankOp(query, [&total](SamplesQueryBank* bank, size_t start, size_t amount) { + const auto& results = bank->GetResults(); + for (size_t i = 0; i < amount; i++) { + total += results[start + i]; + } + }); + query->value = total; + query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + } + } + +private: + template + void ApplyBankOp(VideoCommon::HostQueryBase* query, Func&& func) { + size_t size_slots = query->size_slots; + if (size_slots == 0) { + return; + } + size_t bank_id = query->start_bank_id; + size_t banks_set = query->size_banks; + 
size_t start_slot = query->start_slot; + for (size_t i = 0; i < banks_set; i++) { + auto& the_bank = bank_pool.GetBank(bank_id); + size_t amount = std::min(the_bank.Size() - start_slot, size_slots); + func(&the_bank, start_slot, amount); + bank_id = the_bank.next_bank - 1; + start_slot = 0; + size_slots -= amount; + } + } + + template + void ApplyBanksWideOp(std::vector& queries, Func&& func) { + std::conditional_t>, + std::unordered_map>> + indexer; + for (auto q : queries) { + auto* query = GetQuery(q); + ApplyBankOp(query, [&indexer](SamplesQueryBank* bank, size_t start, size_t amount) { + auto id = bank->GetIndex(); + auto pair = indexer.try_emplace(id, std::numeric_limits::max(), + std::numeric_limits::min()); + auto& current_pair = pair.first->second; + current_pair.first = std::min(current_pair.first, start); + current_pair.second = std::max(current_pair.second, amount + start); + }); + } + for (auto& cont : indexer) { + func(&bank_pool.GetBank(cont.first), cont.second.first, + cont.second.second - cont.second.first); + } + } + + void ReserveBank() { + current_bank_id = + bank_pool.ReserveBank([this](std::deque& queue, size_t index) { + queue.emplace_back(device, index); + }); + if (current_bank) { + current_bank->next_bank = current_bank_id + 1; + } + current_bank = &bank_pool.GetBank(current_bank_id); + current_query_pool = current_bank->GetInnerPool(); + } + + size_t ReserveBankSlot() { + if (!current_bank || current_bank->IsClosed()) { + ReserveBank(); + } + auto [built, index] = current_bank->Reserve(); + current_bank_slot = index; + return index; + } + + void ReserveHostQuery() { + size_t new_slot = ReserveBankSlot(); + current_bank->AddReference(1); + if (current_query) { + size_t bank_id = current_query->start_bank_id; + size_t banks_set = current_query->size_banks - 1; + bool found = bank_id == current_bank_id; + while (!found && banks_set > 0) { + SamplesQueryBank& some_bank = bank_pool.GetBank(bank_id); + bank_id = some_bank.next_bank - 1; + found 
= bank_id == current_bank_id; + banks_set--; + } + if (!found) { + current_query->size_banks++; + } + current_query->size_slots++; + } else { + current_query_id = BuildQuery(); + current_query = GetQuery(current_query_id); + current_query->start_bank_id = static_cast(current_bank_id); + current_query->size_banks = 1; + current_query->start_slot = new_slot; + current_query->size_slots = 1; + } + } + + void Free(size_t query_id) override { + std::scoped_lock lk(guard); + auto* query = GetQuery(query_id); + ApplyBankOp(query, [](SamplesQueryBank* bank, size_t start, size_t amount) { + bank->CloseReference(amount); + }); + ReleaseQuery(query_id); + } + + void AbandonCurrentQuery() { + if (!current_query) { + return; + } + Free(current_query_id); + current_query = nullptr; + current_query_id = 0; + } + + void BuildResolveBuffer() { + const VkBufferCreateInfo buffer_ci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * resolve_slots, + .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + resolve_buffers.emplace_back( + std::move(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal))); + } + + static constexpr size_t resolve_slots = 8; + + QueryCacheRuntime& runtime; + const Device& device; + Scheduler& scheduler; + const MemoryAllocator& memory_allocator; + VideoCommon::BankPool bank_pool; + std::deque resolve_buffers; + std::deque> sync_values_stash; + + // syncing queue + std::vector pending_sync; + + // flush levels + std::vector pending_flush_queries; + std::deque> pending_flush_sets; + + // State Machine + size_t current_bank_slot; + size_t current_bank_id; + SamplesQueryBank* current_bank; + VkQueryPool current_query_pool; + size_t current_query_id; + VideoCommon::HostQueryBase* current_query; + bool 
has_started{}; + std::mutex flush_guard; +}; + +// Transform feedback queries +class TFBQueryBank : public VideoCommon::BankBase { +public: + static constexpr size_t BANK_SIZE = 1024; + static constexpr size_t QUERY_SIZE = 4; + TFBQueryBank(Scheduler& scheduler_, const MemoryAllocator& memory_allocator, size_t index_) + : BankBase(BANK_SIZE), scheduler{scheduler_}, index{index_} { + const VkBufferCreateInfo buffer_ci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = QUERY_SIZE * BANK_SIZE, + .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); + } + + ~TFBQueryBank() = default; + + void Reset() override { + ASSERT(references == 0); + VideoCommon::BankBase::Reset(); + } + + void Sync(StagingBufferRef& stagging_buffer, size_t extra_offset, size_t start, size_t size) { + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([this, dst_buffer = stagging_buffer.buffer, extra_offset, start, + size](vk::CommandBuffer cmdbuf) { + std::array copy{VkBufferCopy{ + .srcOffset = start * QUERY_SIZE, + .dstOffset = extra_offset, + .size = size * QUERY_SIZE, + }}; + cmdbuf.CopyBuffer(*buffer, dst_buffer, copy); + }); + } + + size_t GetIndex() const { + return index; + } + + VkBuffer GetBuffer() const { + return *buffer; + } + +private: + Scheduler& scheduler; + const size_t index; + vk::Buffer buffer; +}; + +template +class TFBCounterStreamer : public BaseStreamer { +public: + TFBCounterStreamer(size_t id, QueryCacheRuntime& runtime_, const Device& device_, + Scheduler& scheduler_, const MemoryAllocator& memory_allocator_, + StagingBufferPool& staging_pool_) + : BaseStreamer(id), runtime{runtime_}, device{device_}, scheduler{scheduler_}, + memory_allocator{memory_allocator_}, staging_pool{staging_pool_} 
{ + buffers_count = 0; + current_bank = nullptr; + counter_buffers.fill(VK_NULL_HANDLE); + offsets.fill(0); + const VkBufferCreateInfo buffer_ci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = TFBQueryBank::QUERY_SIZE * NUM_STREAMS, + .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_COUNTER_BUFFER_BIT_EXT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + + counters_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); + for (auto& c : counter_buffers) { + c = *counters_buffer; + } + size_t base_offset = 0; + for (auto& o : offsets) { + o = base_offset; + base_offset += TFBQueryBank::QUERY_SIZE; + } + } + + void StartCounter() override { + FlushBeginTFB(); + has_started = true; + } + + void PauseCounter() override { + CloseCounter(); + } + + void ResetCounter() override { + CloseCounter(); + } + + void CloseCounter() override { + if (has_flushed_end_pending) { + FlushEndTFB(); + } + runtime.View3DRegs([this](Tegra::Engines::Maxwell3D::Regs& regs) { + if (regs.transform_feedback_enabled == 0) { + streams_mask = 0; + has_started = false; + } + }); + } + + bool HasPendingSync() override { + return !pending_sync.empty(); + } + + void SyncWrites() override { + CloseCounter(); + std::unordered_map> sync_values_stash; + for (auto q : pending_sync) { + auto* query = GetQuery(q); + if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) { + continue; + } + if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) { + continue; + } + query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; + sync_values_stash.try_emplace(query->start_bank_id); + sync_values_stash[query->start_bank_id].emplace_back(HostSyncValues{ + .address = query->guest_address, + .size = TFBQueryBank::QUERY_SIZE, + .offset = query->start_slot * TFBQueryBank::QUERY_SIZE, + }); + } + 
for (auto& p : sync_values_stash) { + auto& bank = bank_pool.GetBank(p.first); + runtime.template SyncValues(p.second, bank.GetBuffer()); + } + pending_sync.clear(); + } + + size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, + std::optional subreport_) override { + auto index = BuildQuery(); + auto* new_query = GetQuery(index); + new_query->guest_address = address; + new_query->value = 0; + new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan; + if (has_timestamp) { + new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp; + } + if (!subreport_) { + new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + return index; + } + const size_t subreport = static_cast(*subreport_); + UpdateBuffers(); + if ((streams_mask & (1ULL << subreport)) == 0) { + new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + return index; + } + CloseCounter(); + auto [bank_slot, data_slot] = ProduceCounterBuffer(subreport); + new_query->start_bank_id = static_cast(bank_slot); + new_query->size_banks = 1; + new_query->start_slot = static_cast(data_slot); + new_query->size_slots = 1; + pending_sync.push_back(index); + pending_flush_queries.push_back(index); + return index; + } + + bool HasUnsyncedQueries() override { + return !pending_flush_queries.empty(); + } + + void PushUnsyncedQueries() override { + CloseCounter(); + auto staging_ref = staging_pool.Request( + pending_flush_queries.size() * TFBQueryBank::QUERY_SIZE, MemoryUsage::Download, true); + size_t offset_base = staging_ref.offset; + for (auto q : pending_flush_queries) { + auto* query = GetQuery(q); + auto& bank = bank_pool.GetBank(query->start_bank_id); + bank.Sync(staging_ref, offset_base, query->start_slot, 1); + offset_base += TFBQueryBank::QUERY_SIZE; + bank.CloseReference(); + } + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = 
VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, + }; + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([](vk::CommandBuffer cmdbuf) { + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER); + }); + + std::scoped_lock lk(flush_guard); + for (auto& str : free_queue) { + staging_pool.FreeDeferred(str); + } + free_queue.clear(); + download_buffers.emplace_back(staging_ref); + pending_flush_sets.emplace_back(std::move(pending_flush_queries)); + } + + void PopUnsyncedQueries() override { + StagingBufferRef staging_ref; + std::vector flushed_queries; + { + std::scoped_lock lk(flush_guard); + staging_ref = download_buffers.front(); + flushed_queries = std::move(pending_flush_sets.front()); + download_buffers.pop_front(); + pending_flush_sets.pop_front(); + } + + size_t offset_base = staging_ref.offset; + for (auto q : flushed_queries) { + auto* query = GetQuery(q); + u32 result = 0; + std::memcpy(&result, staging_ref.mapped_span.data() + offset_base, sizeof(u32)); + query->value = static_cast(result); + query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + offset_base += TFBQueryBank::QUERY_SIZE; + } + + { + std::scoped_lock lk(flush_guard); + free_queue.emplace_back(staging_ref); + } + } + +private: + void FlushBeginTFB() { + if (has_flushed_end_pending) [[unlikely]] { + return; + } + has_flushed_end_pending = true; + if (!has_started || buffers_count == 0) { + scheduler.Record([](vk::CommandBuffer cmdbuf) { + cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); + }); + UpdateBuffers(); + return; + } + scheduler.Record([this, total = static_cast(buffers_count)](vk::CommandBuffer cmdbuf) { + cmdbuf.BeginTransformFeedbackEXT(0, total, counter_buffers.data(), offsets.data()); + }); + UpdateBuffers(); + } + + void FlushEndTFB() { + if (!has_flushed_end_pending) [[unlikely]] { + UNREACHABLE(); + return; + } + has_flushed_end_pending = false; + + if (buffers_count == 0) { 
+ scheduler.Record([](vk::CommandBuffer cmdbuf) { + cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); + }); + } else { + scheduler.Record([this, total = static_cast(buffers_count)](vk::CommandBuffer cmdbuf) { + cmdbuf.EndTransformFeedbackEXT(0, total, counter_buffers.data(), offsets.data()); + }); + } + } + + void UpdateBuffers() { + runtime.View3DRegs([this](Tegra::Engines::Maxwell3D::Regs& regs) { + buffers_count = 0; + for (size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers; + i++) { + const auto& tf = regs.transform_feedback; + if (tf.buffers[i].enable == 0) { + continue; + } + const size_t stream = tf.controls[i].stream; + streams_mask |= 1ULL << stream; + buffers_count = std::max(buffers_count, stream + 1); + } + }); + } + + std::pair ProduceCounterBuffer(size_t stream) { + if (current_bank == nullptr || current_bank->IsClosed()) { + current_bank_id = + bank_pool.ReserveBank([this](std::deque& queue, size_t index) { + queue.emplace_back(scheduler, memory_allocator, index); + }); + current_bank = &bank_pool.GetBank(current_bank_id); + } + auto [dont_care, slot] = current_bank->Reserve(); + current_bank->AddReference(); + + static constexpr VkMemoryBarrier READ_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + }; + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT, + }; + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([dst_buffer = current_bank->GetBuffer(), + src_buffer = counter_buffers[stream], src_offset = offsets[stream], + slot](vk::CommandBuffer cmdbuf) { + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT, + VK_PIPELINE_STAGE_TRANSFER_BIT, 0, READ_BARRIER); + std::array 
copy{VkBufferCopy{ + .srcOffset = src_offset, + .dstOffset = slot * TFBQueryBank::QUERY_SIZE, + .size = TFBQueryBank::QUERY_SIZE, + }}; + cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, WRITE_BARRIER); + }); + return {current_bank_id, slot}; + } + + static constexpr size_t NUM_STREAMS = 4; + static constexpr size_t STREAMS_MASK = (1ULL << NUM_STREAMS) - 1ULL; + + QueryCacheRuntime& runtime; + const Device& device; + Scheduler& scheduler; + const MemoryAllocator& memory_allocator; + StagingBufferPool& staging_pool; + VideoCommon::BankPool bank_pool; + size_t current_bank_id; + TFBQueryBank* current_bank; + vk::Buffer counters_buffer; + + // syncing queue + std::vector pending_sync; + + // flush levels + std::vector pending_flush_queries; + std::deque download_buffers; + std::deque> pending_flush_sets; + std::vector free_queue; + std::mutex flush_guard; + + // state machine + bool has_started{}; + bool has_flushed_end_pending{}; + size_t buffers_count{}; + std::array counter_buffers{}; + std::array offsets{}; + u64 streams_mask; +}; + +} // namespace + +struct QueryCacheRuntimeImpl { + QueryCacheRuntimeImpl(QueryCacheRuntime& runtime, VideoCore::RasterizerInterface* rasterizer_, + Core::Memory::Memory& cpu_memory_, Vulkan::BufferCache& buffer_cache_, + const Device& device_, const MemoryAllocator& memory_allocator_, + Scheduler& scheduler_, StagingBufferPool& staging_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue, + DescriptorPool& descriptor_pool) + : rasterizer{rasterizer_}, cpu_memory{cpu_memory_}, + buffer_cache{buffer_cache_}, device{device_}, + memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_}, + guest_streamer(0, runtime), + sample_streamer(static_cast(QueryType::ZPassPixelCount64), runtime, device, + scheduler, memory_allocator), + tfb_streamer(static_cast(QueryType::StreamingByteCount), runtime, device, + 
scheduler, memory_allocator, staging_pool), + hcr_setup{}, hcr_is_set{}, is_hcr_running{} { + + hcr_setup.sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT; + hcr_setup.pNext = nullptr; + hcr_setup.flags = 0; + + conditional_resolve_pass = std::make_unique( + device, scheduler, descriptor_pool, compute_pass_descriptor_queue); + + const VkBufferCreateInfo buffer_ci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = sizeof(u32), + .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + hcr_resolve_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); + } + + VideoCore::RasterizerInterface* rasterizer; + Core::Memory::Memory& cpu_memory; + Vulkan::BufferCache& buffer_cache; + + const Device& device; + const MemoryAllocator& memory_allocator; + Scheduler& scheduler; + StagingBufferPool& staging_pool; + + // Streamers + VideoCommon::GuestStreamer guest_streamer; + SamplesStreamer sample_streamer; + TFBCounterStreamer tfb_streamer; + + std::vector> little_cache; + std::vector> buffers_to_upload_to; + std::vector redirect_cache; + std::vector> copies_setup; + + // Host conditional rendering data + std::unique_ptr conditional_resolve_pass; + vk::Buffer hcr_resolve_buffer; + VkConditionalRenderingBeginInfoEXT hcr_setup; + VkBuffer hcr_buffer; + size_t hcr_offset; + bool hcr_is_set; + bool is_hcr_running; + + // maxwell3d + Tegra::Engines::Maxwell3D* maxwell3d; +}; + +QueryCacheRuntime::QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer, + Core::Memory::Memory& cpu_memory_, + Vulkan::BufferCache& buffer_cache_, const Device& device_, + const MemoryAllocator& memory_allocator_, + Scheduler& scheduler_, StagingBufferPool& staging_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue, + 
DescriptorPool& descriptor_pool) { + impl = std::make_unique( + *this, rasterizer, cpu_memory_, buffer_cache_, device_, memory_allocator_, scheduler_, + staging_pool_, compute_pass_descriptor_queue, descriptor_pool); +} + +void QueryCacheRuntime::Bind3DEngine(Tegra::Engines::Maxwell3D* maxwell3d) { + impl->maxwell3d = maxwell3d; +} + +template +void QueryCacheRuntime::View3DRegs(Func&& func) { + func(impl->maxwell3d->regs); +} + +void QueryCacheRuntime::EndHostConditionalRendering() { + PauseHostConditionalRendering(); + impl->hcr_is_set = false; + impl->is_hcr_running = false; + impl->hcr_buffer = nullptr; + impl->hcr_offset = 0; +} + +void QueryCacheRuntime::PauseHostConditionalRendering() { + if (!impl->hcr_is_set) { + return; + } + if (impl->is_hcr_running) { + impl->scheduler.Record( + [](vk::CommandBuffer cmdbuf) { cmdbuf.EndConditionalRenderingEXT(); }); + } + impl->is_hcr_running = false; +} + +void QueryCacheRuntime::ResumeHostConditionalRendering() { + if (!impl->hcr_is_set) { + return; + } + if (!impl->is_hcr_running) { + impl->scheduler.Record([hcr_setup = impl->hcr_setup](vk::CommandBuffer cmdbuf) { + cmdbuf.BeginConditionalRenderingEXT(hcr_setup); + }); + } + impl->is_hcr_running = true; +} + +void QueryCacheRuntime::HostConditionalRenderingCompareValueImpl(VideoCommon::LookupData object, + bool is_equal) { + { + std::scoped_lock lk(impl->buffer_cache.mutex); + static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; + const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; + const auto [buffer, offset] = + impl->buffer_cache.ObtainCPUBuffer(object.address, 8, sync_info, post_op); + impl->hcr_buffer = buffer->Handle(); + impl->hcr_offset = offset; + } + if (impl->hcr_is_set) { + if (impl->hcr_setup.buffer == impl->hcr_buffer && + impl->hcr_setup.offset == impl->hcr_offset) { + ResumeHostConditionalRendering(); + return; + } + PauseHostConditionalRendering(); + } + impl->hcr_setup.buffer = impl->hcr_buffer; + 
impl->hcr_setup.offset = impl->hcr_offset; + impl->hcr_setup.flags = is_equal ? VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT : 0; + impl->hcr_is_set = true; + impl->is_hcr_running = false; + ResumeHostConditionalRendering(); +} + +void QueryCacheRuntime::HostConditionalRenderingCompareBCImpl(VAddr address, bool is_equal) { + VkBuffer to_resolve; + u32 to_resolve_offset; + { + std::scoped_lock lk(impl->buffer_cache.mutex); + static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::NoSynchronize; + const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; + const auto [buffer, offset] = + impl->buffer_cache.ObtainCPUBuffer(address, 24, sync_info, post_op); + to_resolve = buffer->Handle(); + to_resolve_offset = static_cast(offset); + } + if (impl->is_hcr_running) { + PauseHostConditionalRendering(); + } + impl->conditional_resolve_pass->Resolve(*impl->hcr_resolve_buffer, to_resolve, + to_resolve_offset, false); + impl->hcr_setup.buffer = *impl->hcr_resolve_buffer; + impl->hcr_setup.offset = 0; + impl->hcr_setup.flags = is_equal ? 
0 : VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT; + impl->hcr_is_set = true; + impl->is_hcr_running = false; + ResumeHostConditionalRendering(); +} + +bool QueryCacheRuntime::HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, + [[maybe_unused]] bool qc_dirty) { + if (!impl->device.IsExtConditionalRendering()) { + return false; + } + HostConditionalRenderingCompareValueImpl(object_1, false); + return true; +} + +bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1, + VideoCommon::LookupData object_2, + bool qc_dirty, bool equal_check) { + if (!impl->device.IsExtConditionalRendering()) { + return false; + } + + const auto check_in_bc = [&](VAddr address) { + return impl->buffer_cache.IsRegionGpuModified(address, 8); + }; + const auto check_value = [&](VAddr address) { + u8* ptr = impl->cpu_memory.GetPointer(address); + u64 value{}; + std::memcpy(&value, ptr, sizeof(value)); + return value == 0; + }; + std::array objects{&object_1, &object_2}; + std::array is_in_bc{}; + std::array is_in_qc{}; + std::array is_in_ac{}; + std::array is_null{}; + { + std::scoped_lock lk(impl->buffer_cache.mutex); + for (size_t i = 0; i < 2; i++) { + is_in_qc[i] = objects[i]->found_query != nullptr; + is_in_bc[i] = !is_in_qc[i] && check_in_bc(objects[i]->address); + is_in_ac[i] = is_in_qc[i] || is_in_bc[i]; + } + } + + if (!is_in_ac[0] && !is_in_ac[1]) { + EndHostConditionalRendering(); + return false; + } + + if (!qc_dirty && !is_in_bc[0] && !is_in_bc[1]) { + EndHostConditionalRendering(); + return false; + } + + for (size_t i = 0; i < 2; i++) { + is_null[i] = !is_in_ac[i] && check_value(objects[i]->address); + } + + for (size_t i = 0; i < 2; i++) { + if (is_null[i]) { + size_t j = (i + 1) % 2; + HostConditionalRenderingCompareValueImpl(*objects[j], equal_check); + return true; + } + } + HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); + return true; +} + +QueryCacheRuntime::~QueryCacheRuntime() = default; + 
+VideoCommon::StreamerInterface* QueryCacheRuntime::GetStreamerInterface(QueryType query_type) { + switch (query_type) { + case QueryType::Payload: + return &impl->guest_streamer; + case QueryType::ZPassPixelCount64: + return &impl->sample_streamer; + case QueryType::StreamingByteCount: + return &impl->tfb_streamer; default: - throw vk::Exception(query_result); + return nullptr; } } +void QueryCacheRuntime::Barriers(bool is_prebarrier) { + static constexpr VkMemoryBarrier READ_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, + }; + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, + }; + if (is_prebarrier) { + impl->scheduler.Record([](vk::CommandBuffer cmdbuf) { + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, 0, READ_BARRIER); + }); + } else { + impl->scheduler.Record([](vk::CommandBuffer cmdbuf) { + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER); + }); + } +} + +template +void QueryCacheRuntime::SyncValues(std::span values, VkBuffer base_src_buffer) { + if (values.size() == 0) { + return; + } + impl->redirect_cache.clear(); + impl->little_cache.clear(); + size_t total_size = 0; + for (auto& sync_val : values) { + total_size += sync_val.size; + bool found = false; + VAddr base = Common::AlignDown(sync_val.address, Core::Memory::YUZU_PAGESIZE); + VAddr base_end = base + Core::Memory::YUZU_PAGESIZE; + for (size_t i = 0; i < impl->little_cache.size(); i++) { + const auto set_found = [&] { + impl->redirect_cache.push_back(i); + found = true; + }; + auto& loc = impl->little_cache[i]; + if (base < loc.second && loc.first < 
base_end) { + set_found(); + break; + } + if (loc.first == base_end) { + loc.first = base; + set_found(); + break; + } + if (loc.second == base) { + loc.second = base_end; + set_found(); + break; + } + } + if (!found) { + impl->redirect_cache.push_back(impl->little_cache.size()); + impl->little_cache.emplace_back(base, base_end); + } + } + + // Vulkan part. + std::scoped_lock lk(impl->buffer_cache.mutex); + impl->buffer_cache.BufferOperations([&] { + impl->buffers_to_upload_to.clear(); + for (auto& pair : impl->little_cache) { + static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; + const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; + const auto [buffer, offset] = impl->buffer_cache.ObtainCPUBuffer( + pair.first, static_cast(pair.second - pair.first), sync_info, post_op); + impl->buffers_to_upload_to.emplace_back(buffer->Handle(), offset); + } + }); + + VkBuffer src_buffer; + [[maybe_unused]] StagingBufferRef ref; + impl->copies_setup.clear(); + impl->copies_setup.resize(impl->little_cache.size()); + if constexpr (SyncValuesType::GeneratesBaseBuffer) { + ref = impl->staging_pool.Request(total_size, MemoryUsage::Upload); + size_t current_offset = ref.offset; + size_t accumulated_size = 0; + for (size_t i = 0; i < values.size(); i++) { + size_t which_copy = impl->redirect_cache[i]; + impl->copies_setup[which_copy].emplace_back(VkBufferCopy{ + .srcOffset = current_offset + accumulated_size, + .dstOffset = impl->buffers_to_upload_to[which_copy].second + values[i].address - + impl->little_cache[which_copy].first, + .size = values[i].size, + }); + std::memcpy(ref.mapped_span.data() + accumulated_size, &values[i].value, + values[i].size); + accumulated_size += values[i].size; + } + src_buffer = ref.buffer; + } else { + for (size_t i = 0; i < values.size(); i++) { + size_t which_copy = impl->redirect_cache[i]; + impl->copies_setup[which_copy].emplace_back(VkBufferCopy{ + .srcOffset = values[i].offset, + .dstOffset = 
impl->buffers_to_upload_to[which_copy].second + values[i].address - + impl->little_cache[which_copy].first, + .size = values[i].size, + }); + } + src_buffer = base_src_buffer; + } + + impl->scheduler.RequestOutsideRenderPassOperationContext(); + impl->scheduler.Record([src_buffer, dst_buffers = std::move(impl->buffers_to_upload_to), + vk_copies = std::move(impl->copies_setup)](vk::CommandBuffer cmdbuf) { + size_t size = dst_buffers.size(); + for (size_t i = 0; i < size; i++) { + cmdbuf.CopyBuffer(src_buffer, dst_buffers[i].first, vk_copies[i]); + } + }); +} + } // namespace Vulkan + +namespace VideoCommon { + +template class QueryCacheBase; + +} // namespace VideoCommon diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h index c1b9552eb..9ad2929d7 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.h +++ b/src/video_core/renderer_vulkan/vk_query_cache.h @@ -1,101 +1,74 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later #pragma once -#include #include -#include -#include -#include "common/common_types.h" -#include "video_core/query_cache.h" -#include "video_core/renderer_vulkan/vk_resource_pool.h" -#include "video_core/vulkan_common/vulkan_wrapper.h" +#include "video_core/query_cache/query_cache_base.h" +#include "video_core/renderer_vulkan/vk_buffer_cache.h" namespace VideoCore { class RasterizerInterface; } +namespace VideoCommon { +class StreamerInterface; +} + namespace Vulkan { -class CachedQuery; class Device; -class HostCounter; -class QueryCache; class Scheduler; +class StagingBufferPool; -using CounterStream = VideoCommon::CounterStreamBase; +struct QueryCacheRuntimeImpl; -class QueryPool final : public ResourcePool { +class QueryCacheRuntime { public: - explicit QueryPool(const Device& device, Scheduler& scheduler, 
VideoCore::QueryType type); - ~QueryPool() override; + explicit QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer, + Core::Memory::Memory& cpu_memory_, + Vulkan::BufferCache& buffer_cache_, const Device& device_, + const MemoryAllocator& memory_allocator_, Scheduler& scheduler_, + StagingBufferPool& staging_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue, + DescriptorPool& descriptor_pool); + ~QueryCacheRuntime(); - std::pair Commit(); + template + void SyncValues(std::span values, VkBuffer base_src_buffer = nullptr); - void Reserve(std::pair query); + void Barriers(bool is_prebarrier); -protected: - void Allocate(std::size_t begin, std::size_t end) override; + void EndHostConditionalRendering(); + + void PauseHostConditionalRendering(); + + void ResumeHostConditionalRendering(); + + bool HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, bool qc_dirty); + + bool HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1, + VideoCommon::LookupData object_2, bool qc_dirty, bool equal_check); + + VideoCommon::StreamerInterface* GetStreamerInterface(VideoCommon::QueryType query_type); + + void Bind3DEngine(Tegra::Engines::Maxwell3D* maxwell3d); + + template + void View3DRegs(Func&& func); private: - static constexpr std::size_t GROW_STEP = 512; - - const Device& device; - const VideoCore::QueryType type; - - std::vector pools; - std::vector usage; + void HostConditionalRenderingCompareValueImpl(VideoCommon::LookupData object, bool is_equal); + void HostConditionalRenderingCompareBCImpl(VAddr address, bool is_equal); + friend struct QueryCacheRuntimeImpl; + std::unique_ptr impl; }; -class QueryCache final - : public VideoCommon::QueryCacheBase { -public: - explicit QueryCache(VideoCore::RasterizerInterface& rasterizer_, - Core::Memory::Memory& cpu_memory_, const Device& device_, - Scheduler& scheduler_); - ~QueryCache(); - - std::pair AllocateQuery(VideoCore::QueryType type); - - void 
Reserve(VideoCore::QueryType type, std::pair query); - - const Device& GetDevice() const noexcept { - return device; - } - - Scheduler& GetScheduler() const noexcept { - return scheduler; - } - -private: - const Device& device; - Scheduler& scheduler; - std::array query_pools; +struct QueryCacheParams { + using RuntimeType = Vulkan::QueryCacheRuntime; }; -class HostCounter final : public VideoCommon::HostCounterBase { -public: - explicit HostCounter(QueryCache& cache_, std::shared_ptr dependency_, - VideoCore::QueryType type_); - ~HostCounter(); - - void EndQuery(); - -private: - u64 BlockingQuery(bool async = false) const override; - - QueryCache& cache; - const VideoCore::QueryType type; - const std::pair query; - const u64 tick; -}; - -class CachedQuery : public VideoCommon::CachedQueryBase { -public: - explicit CachedQuery(QueryCache&, VideoCore::QueryType, VAddr cpu_addr_, u8* host_ptr_) - : CachedQueryBase{cpu_addr_, host_ptr_} {} -}; +using QueryCache = VideoCommon::QueryCacheBase; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 01e76a82c..e8862ba04 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -24,6 +24,7 @@ #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" @@ -170,9 +171,11 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool, guest_descriptor_queue, compute_pass_descriptor_queue, descriptor_pool), buffer_cache(*this, cpu_memory_, 
buffer_cache_runtime), + query_cache_runtime(this, cpu_memory_, buffer_cache, device, memory_allocator, scheduler, + staging_pool, compute_pass_descriptor_queue, descriptor_pool), + query_cache(gpu, *this, cpu_memory_, query_cache_runtime), pipeline_cache(*this, device, scheduler, descriptor_pool, guest_descriptor_queue, render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), - query_cache{*this, cpu_memory_, device, scheduler}, accelerate_dma(buffer_cache, texture_cache, scheduler), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), wfi_event(device.GetLogical().CreateEvent()) { @@ -189,13 +192,15 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { FlushWork(); gpu_memory->FlushCaching(); + query_cache.NotifySegment(true); + #if ANDROID if (Settings::IsGPULevelHigh()) { // This is problematic on Android, disable on GPU Normal. - query_cache.UpdateCounters(); + // query_cache.UpdateCounters(); } #else - query_cache.UpdateCounters(); + // query_cache.UpdateCounters(); #endif GraphicsPipeline* const pipeline{pipeline_cache.CurrentGraphicsPipeline()}; @@ -207,13 +212,12 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { pipeline->SetEngine(maxwell3d, gpu_memory); pipeline->Configure(is_indexed); - BeginTransformFeedback(); - UpdateDynamicStates(); + HandleTransformFeedback(); + query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, + maxwell3d->regs.zpass_pixel_count_enable); draw_func(); - - EndTransformFeedback(); } void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) { @@ -241,6 +245,14 @@ void RasterizerVulkan::DrawIndirect() { const auto indirect_buffer = buffer_cache.GetDrawIndirectBuffer(); const auto& buffer = indirect_buffer.first; const auto& offset = indirect_buffer.second; + if (params.is_byte_count) { + scheduler.Record([buffer_obj = buffer->Handle(), offset, + stride = params.stride](vk::CommandBuffer cmdbuf) { + 
cmdbuf.DrawIndirectByteCountEXT(1, 0, buffer_obj, offset, 0, + static_cast(stride)); + }); + return; + } if (params.include_count) { const auto count = buffer_cache.GetDrawIndirectCount(); const auto& draw_buffer = count.first; @@ -280,13 +292,15 @@ void RasterizerVulkan::DrawTexture() { SCOPE_EXIT({ gpu.TickWork(); }); FlushWork(); + query_cache.NotifySegment(true); + #if ANDROID if (Settings::IsGPULevelHigh()) { // This is problematic on Android, disable on GPU Normal. - query_cache.UpdateCounters(); + // query_cache.UpdateCounters(); } #else - query_cache.UpdateCounters(); + // query_cache.UpdateCounters(); #endif texture_cache.SynchronizeGraphicsDescriptors(); @@ -294,6 +308,8 @@ void RasterizerVulkan::DrawTexture() { UpdateDynamicStates(); + query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, + maxwell3d->regs.zpass_pixel_count_enable); const auto& draw_texture_state = maxwell3d->draw_manager->GetDrawTextureState(); const auto& sampler = texture_cache.GetGraphicsSampler(draw_texture_state.src_sampler); const auto& texture = texture_cache.GetImageView(draw_texture_state.src_texture); @@ -319,12 +335,16 @@ void RasterizerVulkan::Clear(u32 layer_count) { #if ANDROID if (Settings::IsGPULevelHigh()) { // This is problematic on Android, disable on GPU Normal. 
- query_cache.UpdateCounters(); + // query_cache.UpdateCounters(); } #else - query_cache.UpdateCounters(); + // query_cache.UpdateCounters(); #endif + query_cache.NotifySegment(true); + query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, + maxwell3d->regs.zpass_pixel_count_enable); + auto& regs = maxwell3d->regs; const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B || regs.clear_surface.A; @@ -482,13 +502,13 @@ void RasterizerVulkan::DispatchCompute() { scheduler.Record([dim](vk::CommandBuffer cmdbuf) { cmdbuf.Dispatch(dim[0], dim[1], dim[2]); }); } -void RasterizerVulkan::ResetCounter(VideoCore::QueryType type) { - query_cache.ResetCounter(type); +void RasterizerVulkan::ResetCounter(VideoCommon::QueryType type) { + query_cache.CounterReset(type); } -void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, - std::optional timestamp) { - query_cache.Query(gpu_addr, type, timestamp); +void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, + VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { + query_cache.CounterReport(gpu_addr, type, flags, payload, subreport); } void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, @@ -669,8 +689,8 @@ void RasterizerVulkan::SignalReference() { fence_manager.SignalReference(); } -void RasterizerVulkan::ReleaseFences() { - fence_manager.WaitPendingFences(); +void RasterizerVulkan::ReleaseFences(bool force) { + fence_manager.WaitPendingFences(force); } void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size, @@ -694,6 +714,8 @@ void RasterizerVulkan::WaitForIdle() { flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; } + query_cache.NotifyWFI(); + scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) { cmdbuf.SetEvent(event, flags); @@ -737,19 +759,7 @@ void RasterizerVulkan::TickFrame() { bool 
RasterizerVulkan::AccelerateConditionalRendering() { gpu_memory->FlushCaching(); - if (Settings::IsGPULevelHigh()) { - // TODO(Blinkhawk): Reimplement Host conditional rendering. - return false; - } - // Medium / Low Hack: stub any checks on queries written into the buffer cache. - const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()}; - Maxwell::ReportSemaphore::Compare cmp; - if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp), - VideoCommon::CacheType::BufferCache | - VideoCommon::CacheType::QueryCache)) { - return true; - } - return false; + return query_cache.AccelerateHostConditionalRendering(); } bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, @@ -795,6 +805,7 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config, if (!image_view) { return false; } + query_cache.NotifySegment(false); screen_info.image = image_view->ImageHandle(); screen_info.image_view = image_view->Handle(Shader::TextureType::Color2D); screen_info.width = image_view->size.width; @@ -933,31 +944,18 @@ void RasterizerVulkan::UpdateDynamicStates() { } } -void RasterizerVulkan::BeginTransformFeedback() { +void RasterizerVulkan::HandleTransformFeedback() { const auto& regs = maxwell3d->regs; - if (regs.transform_feedback_enabled == 0) { - return; - } if (!device.IsExtTransformFeedbackSupported()) { LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); return; } - UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) || - regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation)); - scheduler.Record( - [](vk::CommandBuffer cmdbuf) { cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); }); -} - -void RasterizerVulkan::EndTransformFeedback() { - const auto& regs = maxwell3d->regs; - if (regs.transform_feedback_enabled == 0) { - return; + query_cache.CounterEnable(VideoCommon::QueryType::StreamingByteCount, + regs.transform_feedback_enabled); + 
if (regs.transform_feedback_enabled != 0) { + UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) || + regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation)); } - if (!device.IsExtTransformFeedbackSupported()) { - return; - } - scheduler.Record( - [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); }); } void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index b31982485..ffd44c68d 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -84,8 +84,8 @@ public: void DrawTexture() override; void Clear(u32 layer_count) override; void DispatchCompute() override; - void ResetCounter(VideoCore::QueryType type) override; - void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional timestamp) override; + void ResetCounter(VideoCommon::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; void FlushAll() override; @@ -106,7 +106,7 @@ public: void SyncOperation(std::function&& func) override; void SignalSyncPoint(u32 value) override; void SignalReference() override; - void ReleaseFences() override; + void ReleaseFences(bool force = true) override; void FlushAndInvalidateRegion( VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; void WaitForIdle() override; @@ -146,9 +146,7 @@ private: void UpdateDynamicStates(); - void BeginTransformFeedback(); - - void EndTransformFeedback(); + void HandleTransformFeedback(); void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); void 
UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); @@ -195,8 +193,9 @@ private: TextureCache texture_cache; BufferCacheRuntime buffer_cache_runtime; BufferCache buffer_cache; - PipelineCache pipeline_cache; + QueryCacheRuntime query_cache_runtime; QueryCache query_cache; + PipelineCache pipeline_cache; AccelerateDMA accelerate_dma; FenceManager fence_manager; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 89fd31b4f..3be7837f4 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -243,10 +243,10 @@ void Scheduler::AllocateNewContext() { #if ANDROID if (Settings::IsGPULevelHigh()) { // This is problematic on Android, disable on GPU Normal. - query_cache->UpdateCounters(); + query_cache->NotifySegment(true); } #else - query_cache->UpdateCounters(); + query_cache->NotifySegment(true); #endif } } @@ -261,11 +261,12 @@ void Scheduler::EndPendingOperations() { #if ANDROID if (Settings::IsGPULevelHigh()) { // This is problematic on Android, disable on GPU Normal. 
- query_cache->DisableStreams(); + // query_cache->DisableStreams(); } #else - query_cache->DisableStreams(); + // query_cache->DisableStreams(); #endif + query_cache->NotifySegment(false); EndRenderPass(); } diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 475c682eb..c87e5fb07 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -15,6 +15,7 @@ #include "common/common_types.h" #include "common/polyfill_thread.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { @@ -24,7 +25,6 @@ class Device; class Framebuffer; class GraphicsPipeline; class StateTracker; -class QueryCache; /// The scheduler abstracts command buffer and fence management with an interface that's able to do /// OpenGL-like operations on Vulkan command buffers. diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 6c7fa34e5..16f0425be 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -61,6 +61,7 @@ VK_DEFINE_HANDLE(VmaAllocator) // Define miscellaneous extensions which may be used by the implementation here. #define FOR_EACH_VK_EXTENSION(EXTENSION) \ + EXTENSION(EXT, CONDITIONAL_RENDERING, conditional_rendering) \ EXTENSION(EXT, CONSERVATIVE_RASTERIZATION, conservative_rasterization) \ EXTENSION(EXT, DEPTH_RANGE_UNRESTRICTED, depth_range_unrestricted) \ EXTENSION(EXT, MEMORY_BUDGET, memory_budget) \ @@ -93,6 +94,7 @@ VK_DEFINE_HANDLE(VmaAllocator) // Define extensions where the absence of the extension may result in a degraded experience. 
#define FOR_EACH_VK_RECOMMENDED_EXTENSION(EXTENSION_NAME) \ + EXTENSION_NAME(VK_EXT_CONDITIONAL_RENDERING_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_CONSERVATIVE_RASTERIZATION_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME) \ @@ -536,6 +538,10 @@ public: return extensions.shader_atomic_int64; } + bool IsExtConditionalRendering() const { + return extensions.conditional_rendering; + } + bool HasTimelineSemaphore() const; /// Returns the minimum supported version of SPIR-V. diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index c3f388d89..5a08a92e1 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -75,6 +75,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkBeginCommandBuffer); X(vkBindBufferMemory); X(vkBindImageMemory); + X(vkCmdBeginConditionalRenderingEXT); X(vkCmdBeginQuery); X(vkCmdBeginRenderPass); X(vkCmdBeginTransformFeedbackEXT); @@ -91,6 +92,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdCopyBufferToImage); X(vkCmdCopyImage); X(vkCmdCopyImageToBuffer); + X(vkCmdCopyQueryPoolResults); X(vkCmdDispatch); X(vkCmdDispatchIndirect); X(vkCmdDraw); @@ -99,6 +101,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdDrawIndexedIndirect); X(vkCmdDrawIndirectCount); X(vkCmdDrawIndexedIndirectCount); + X(vkCmdEndConditionalRenderingEXT); X(vkCmdEndQuery); X(vkCmdEndRenderPass); X(vkCmdEndTransformFeedbackEXT); diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 049fa8038..27d94a7d5 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -185,6 +185,7 @@ struct DeviceDispatch : InstanceDispatch { PFN_vkBeginCommandBuffer vkBeginCommandBuffer{}; PFN_vkBindBufferMemory 
vkBindBufferMemory{}; PFN_vkBindImageMemory vkBindImageMemory{}; + PFN_vkCmdBeginConditionalRenderingEXT vkCmdBeginConditionalRenderingEXT{}; PFN_vkCmdBeginDebugUtilsLabelEXT vkCmdBeginDebugUtilsLabelEXT{}; PFN_vkCmdBeginQuery vkCmdBeginQuery{}; PFN_vkCmdBeginRenderPass vkCmdBeginRenderPass{}; @@ -202,6 +203,7 @@ struct DeviceDispatch : InstanceDispatch { PFN_vkCmdCopyBufferToImage vkCmdCopyBufferToImage{}; PFN_vkCmdCopyImage vkCmdCopyImage{}; PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer{}; + PFN_vkCmdCopyQueryPoolResults vkCmdCopyQueryPoolResults{}; PFN_vkCmdDispatch vkCmdDispatch{}; PFN_vkCmdDispatchIndirect vkCmdDispatchIndirect{}; PFN_vkCmdDraw vkCmdDraw{}; @@ -210,6 +212,7 @@ struct DeviceDispatch : InstanceDispatch { PFN_vkCmdDrawIndexedIndirect vkCmdDrawIndexedIndirect{}; PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{}; PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{}; + PFN_vkCmdEndConditionalRenderingEXT vkCmdEndConditionalRenderingEXT{}; PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{}; PFN_vkCmdEndQuery vkCmdEndQuery{}; PFN_vkCmdEndRenderPass vkCmdEndRenderPass{}; @@ -1270,6 +1273,13 @@ public: regions.data()); } + void CopyQueryPoolResults(VkQueryPool query_pool, u32 first_query, u32 query_count, + VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize stride, + VkQueryResultFlags flags) const noexcept { + dld->vkCmdCopyQueryPoolResults(handle, query_pool, first_query, query_count, dst_buffer, + dst_offset, stride, flags); + } + void FillBuffer(VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize size, u32 data) const noexcept { dld->vkCmdFillBuffer(handle, dst_buffer, dst_offset, size, data); @@ -1448,6 +1458,15 @@ public: counter_buffers, counter_buffer_offsets); } + void BeginConditionalRenderingEXT( + const VkConditionalRenderingBeginInfoEXT& info) const noexcept { + dld->vkCmdBeginConditionalRenderingEXT(handle, &info); + } + + void EndConditionalRenderingEXT() const noexcept { + 
dld->vkCmdEndConditionalRenderingEXT(handle); + } + void BeginDebugUtilsLabelEXT(const char* label, std::span color) const noexcept { const VkDebugUtilsLabelEXT label_info{ .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, From 57401589c2e94d49b03fd68ae0ad5b2e36aac795 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Fri, 4 Aug 2023 03:33:04 +0200 Subject: [PATCH 03/10] Macro HLE: Add DrawIndirectByteCount --- src/video_core/engines/draw_manager.h | 1 + src/video_core/macro/macro_hle.cpp | 35 ++++++++++++++++--- .../vulkan_common/vulkan_wrapper.cpp | 1 + src/video_core/vulkan_common/vulkan_wrapper.h | 8 +++++ 4 files changed, 40 insertions(+), 5 deletions(-) diff --git a/src/video_core/engines/draw_manager.h b/src/video_core/engines/draw_manager.h index 7c22c49f1..18d959143 100644 --- a/src/video_core/engines/draw_manager.h +++ b/src/video_core/engines/draw_manager.h @@ -46,6 +46,7 @@ public: }; struct IndirectParams { + bool is_byte_count; bool is_indexed; bool include_count; GPUVAddr count_start_address; diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp index e980af171..046c8085e 100644 --- a/src/video_core/macro/macro_hle.cpp +++ b/src/video_core/macro/macro_hle.cpp @@ -67,6 +67,7 @@ public: } auto& params = maxwell3d.draw_manager->GetIndirectParams(); + params.is_byte_count = false; params.is_indexed = false; params.include_count = false; params.count_start_address = 0; @@ -161,6 +162,7 @@ public: 0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance); } auto& params = maxwell3d.draw_manager->GetIndirectParams(); + params.is_byte_count = false; params.is_indexed = true; params.include_count = false; params.count_start_address = 0; @@ -256,6 +258,7 @@ public: const u32 estimate = static_cast(maxwell3d.EstimateIndexBufferSize()); maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; auto& params = maxwell3d.draw_manager->GetIndirectParams(); + params.is_byte_count = false; params.is_indexed = true; 
params.include_count = true; params.count_start_address = maxwell3d.GetMacroAddress(4); @@ -324,17 +327,39 @@ public: explicit HLE_DrawIndirectByteCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { + auto topology = static_cast(parameters[0] & 0xFFFFU); + if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) { + Fallback(parameters); + return; + } + + auto& params = maxwell3d.draw_manager->GetIndirectParams(); + params.is_byte_count = true; + params.is_indexed = false; + params.include_count = false; + params.count_start_address = 0; + params.indirect_start_address = maxwell3d.GetMacroAddress(2); + params.buffer_size = 4; + params.max_draw_counts = 1; + params.stride = parameters[1]; + maxwell3d.regs.draw.begin = parameters[0]; + maxwell3d.regs.draw_auto_stride = parameters[1]; + maxwell3d.regs.draw_auto_byte_count = parameters[2]; + + maxwell3d.draw_manager->DrawArrayIndirect(topology); + } + +private: + void Fallback(const std::vector& parameters) { maxwell3d.RefreshParameters(); maxwell3d.regs.draw.begin = parameters[0]; maxwell3d.regs.draw_auto_stride = parameters[1]; maxwell3d.regs.draw_auto_byte_count = parameters[2]; - if (maxwell3d.ShouldExecute()) { - maxwell3d.draw_manager->DrawArray( - maxwell3d.regs.draw.topology, 0, - maxwell3d.regs.draw_auto_byte_count / maxwell3d.regs.draw_auto_stride, 0, 1); - } + maxwell3d.draw_manager->DrawArray( + maxwell3d.regs.draw.topology, 0, + maxwell3d.regs.draw_auto_byte_count / maxwell3d.regs.draw_auto_stride, 0, 1); } }; diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index 5a08a92e1..5afba365c 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -101,6 +101,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdDrawIndexedIndirect); X(vkCmdDrawIndirectCount); 
X(vkCmdDrawIndexedIndirectCount); + X(vkCmdDrawIndirectByteCountEXT); X(vkCmdEndConditionalRenderingEXT); X(vkCmdEndQuery); X(vkCmdEndRenderPass); diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 27d94a7d5..0d4bbe7f7 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -212,6 +212,7 @@ struct DeviceDispatch : InstanceDispatch { PFN_vkCmdDrawIndexedIndirect vkCmdDrawIndexedIndirect{}; PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{}; PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{}; + PFN_vkCmdDrawIndirectByteCountEXT vkCmdDrawIndirectByteCountEXT{}; PFN_vkCmdEndConditionalRenderingEXT vkCmdEndConditionalRenderingEXT{}; PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{}; PFN_vkCmdEndQuery vkCmdEndQuery{}; @@ -1185,6 +1186,13 @@ public: count_offset, draw_count, stride); } + void DrawIndirectByteCountEXT(u32 instance_count, u32 first_instance, VkBuffer counter_buffer, + VkDeviceSize counter_buffer_offset, u32 counter_offset, + u32 stride) { + dld->vkCmdDrawIndirectByteCountEXT(handle, instance_count, first_instance, counter_buffer, + counter_buffer_offset, counter_offset, stride); + } + void ClearAttachments(Span attachments, Span rects) const noexcept { dld->vkCmdClearAttachments(handle, attachments.size(), attachments.data(), rects.size(), From aa6587d854e4953876b02ca71278a665bcae8179 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Fri, 4 Aug 2023 13:38:49 +0200 Subject: [PATCH 04/10] QueryCache: Implement dependant queries. 
--- src/video_core/query_cache/query_base.h | 1 + src/video_core/query_cache/query_cache.h | 18 +- src/video_core/query_cache/query_stream.h | 6 +- .../renderer_vulkan/vk_query_cache.cpp | 160 +++++++++++++++++- 4 files changed, 180 insertions(+), 5 deletions(-) diff --git a/src/video_core/query_cache/query_base.h b/src/video_core/query_cache/query_base.h index 485ed669c..0ae23af9f 100644 --- a/src/video_core/query_cache/query_base.h +++ b/src/video_core/query_cache/query_base.h @@ -18,6 +18,7 @@ enum class QueryFlagBits : u32 { IsInvalidated = 1 << 6, ///< Indicates the value of th query has been nullified. IsOrphan = 1 << 7, ///< Indicates the query has not been set by a guest query. IsFence = 1 << 8, ///< Indicates the query is a fence. + IsQueuedForAsyncFlush = 1 <<9,///< Indicates that the query can be flushed at any moment }; DECLARE_ENUM_FLAG_OPERATORS(QueryFlagBits) diff --git a/src/video_core/query_cache/query_cache.h b/src/video_core/query_cache/query_cache.h index f6af48d14..f1393d5c7 100644 --- a/src/video_core/query_cache/query_cache.h +++ b/src/video_core/query_cache/query_cache.h @@ -489,8 +489,22 @@ void QueryCacheBase::PopAsyncFlushes() { if (mask == 0) { return; } - impl->ForEachStreamerIn(mask, - [](StreamerInterface* streamer) { streamer->PopUnsyncedQueries(); }); + u64 ran_mask = 0; + u64 next_phase = 0; + while (mask) { + impl->ForEachStreamerIn(mask, [&mask, &ran_mask, &next_phase](StreamerInterface* streamer) { + u64 dep_mask = streamer->GetDependenceMask(); + if ((dep_mask & ~ran_mask) != 0) { + next_phase |= dep_mask; + return; + } + u64 index = streamer->GetId(); + ran_mask |= (1ULL << index); + mask &= ~(1ULL << index); + streamer->PopUnsyncedQueries(); + }); + ran_mask |= next_phase; + } } // Invalidation diff --git a/src/video_core/query_cache/query_stream.h b/src/video_core/query_cache/query_stream.h index dd5f95b3c..0e9275565 100644 --- a/src/video_core/query_cache/query_stream.h +++ b/src/video_core/query_cache/query_stream.h @@ 
-70,6 +70,10 @@ public: return id; } + u64 GetDependenceMask() const { + return dependance_mask; + } + protected: const size_t id; const u64 dependance_mask; @@ -78,7 +82,7 @@ protected: template class SimpleStreamer : public StreamerInterface { public: - SimpleStreamer(size_t id_) : StreamerInterface{id_} {} + SimpleStreamer(size_t id_, u64 dependance_mask_ = 0) : StreamerInterface{id_, dependance_mask_} {} virtual ~SimpleStreamer() = default; protected: diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 42f571007..ef891e26b 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -525,6 +525,9 @@ private: vk::Buffer buffer; }; +template +class PrimitivesSucceededStreamer; + template class TFBCounterStreamer : public BaseStreamer { public: @@ -537,6 +540,7 @@ public: current_bank = nullptr; counter_buffers.fill(VK_NULL_HANDLE); offsets.fill(0); + last_queries.fill(0); const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, @@ -630,7 +634,7 @@ public: return index; } const size_t subreport = static_cast(*subreport_); - UpdateBuffers(); + last_queries[subreport] = address; if ((streams_mask & (1ULL << subreport)) == 0) { new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; return index; @@ -646,6 +650,13 @@ public: return index; } + std::optional GetLastQueryStream(size_t stream) { + if (last_queries[stream] != 0) { + return {last_queries[stream]}; + } + return std::nullopt; + } + bool HasUnsyncedQueries() override { return !pending_flush_queries.empty(); } @@ -657,6 +668,7 @@ public: size_t offset_base = staging_ref.offset; for (auto q : pending_flush_queries) { auto* query = GetQuery(q); + query->flags |= VideoCommon::QueryFlagBits::IsQueuedForAsyncFlush; auto& bank = bank_pool.GetBank(query->start_bank_id); bank.Sync(staging_ref, offset_base, query->start_slot, 1); 
offset_base += TFBQueryBank::QUERY_SIZE; @@ -741,13 +753,15 @@ private: cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); }); } else { - scheduler.Record([this, total = static_cast(buffers_count)](vk::CommandBuffer cmdbuf) { + scheduler.Record([this, + total = static_cast(buffers_count)](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, total, counter_buffers.data(), offsets.data()); }); } } void UpdateBuffers() { + last_queries.fill(0); runtime.View3DRegs([this](Tegra::Engines::Maxwell3D::Regs& regs) { buffers_count = 0; for (size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers; @@ -804,6 +818,9 @@ private: return {current_bank_id, slot}; } + template + friend class PrimitivesSucceededStreamer; + static constexpr size_t NUM_STREAMS = 4; static constexpr size_t STREAMS_MASK = (1ULL << NUM_STREAMS) - 1ULL; @@ -833,9 +850,143 @@ private: size_t buffers_count{}; std::array counter_buffers{}; std::array offsets{}; + std::array last_queries; u64 streams_mask; }; +class PrimitivesQueryBase : public VideoCommon::QueryBase { +public: + // Default constructor + PrimitivesQueryBase() + : VideoCommon::QueryBase(0, VideoCommon::QueryFlagBits::IsHostManaged, 0), stride{}, + dependant_index{}, dependant_manage{} {} + + // Parameterized constructor + PrimitivesQueryBase(bool is_long, VAddr address) + : VideoCommon::QueryBase(address, VideoCommon::QueryFlagBits::IsHostManaged, 0), stride{}, + dependant_index{}, dependant_manage{} { + if (is_long) { + flags |= VideoCommon::QueryFlagBits::HasTimestamp; + } + } + + u64 stride; + VAddr dependant_address; + size_t dependant_index; + bool dependant_manage; +}; + +template +class PrimitivesSucceededStreamer : public VideoCommon::SimpleStreamer { +public: + PrimitivesSucceededStreamer(size_t id, QueryCacheRuntime& runtime_, + TFBCounterStreamer& tfb_streamer_, Core::Memory::Memory& cpu_memory_) + : VideoCommon::SimpleStreamer( + id, 1ULL << 
static_cast(VideoCommon::QueryType::StreamingByteCount)), + runtime{runtime_}, tfb_streamer{tfb_streamer_}, cpu_memory{cpu_memory_} {} + + size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, + std::optional subreport_) override { + auto index = BuildQuery(); + auto* new_query = GetQuery(index); + new_query->guest_address = address; + new_query->value = 0; + if (has_timestamp) { + new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp; + } + if (!subreport_) { + new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + return index; + } + const size_t subreport = static_cast(*subreport_); + auto dependant_address_opt = tfb_streamer.GetLastQueryStream(subreport); + bool must_manage_dependance = false; + if (dependant_address_opt) { + new_query->dependant_address = *dependant_address_opt; + } else { + new_query->dependant_index = + tfb_streamer.WriteCounter(address, has_timestamp, value, subreport_); + auto* dependant_query = tfb_streamer.GetQuery(new_query->dependant_index); + dependant_query->flags |= VideoCommon::QueryFlagBits::IsInvalidated; + must_manage_dependance = true; + if (True(dependant_query->flags & VideoCommon::QueryFlagBits::IsFinalValueSynced)) { + new_query->value = 0; + new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + if (must_manage_dependance) { + tfb_streamer.Free(new_query->dependant_index); + } + return index; + } + } + + new_query->dependant_manage = must_manage_dependance; + runtime.View3DRegs([new_query, subreport](Tegra::Engines::Maxwell3D::Regs& regs) { + for (size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers; + i++) { + const auto& tf = regs.transform_feedback; + if (tf.controls[i].stream != subreport) { + continue; + } + new_query->stride = tf.controls[i].stride; + break; + } + }); + pending_flush_queries.push_back(index); + return index; + } + + bool HasUnsyncedQueries() override { + return !pending_flush_queries.empty(); + } + + void PushUnsyncedQueries() 
override { + std::scoped_lock lk(flush_guard); + pending_flush_sets.emplace_back(std::move(pending_flush_queries)); + pending_flush_queries.clear(); + } + + void PopUnsyncedQueries() override { + std::vector flushed_queries; + { + std::scoped_lock lk(flush_guard); + flushed_queries = std::move(pending_flush_sets.front()); + pending_flush_sets.pop_front(); + } + + for (auto q : flushed_queries) { + auto* query = GetQuery(q); + if (True(query->flags & VideoCommon::QueryFlagBits::IsFinalValueSynced)) { + continue; + } + + query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + if (query->dependant_manage) { + auto* dependant_query = tfb_streamer.GetQuery(query->dependant_index); + query->value = dependant_query->value / query->stride; + tfb_streamer.Free(query->dependant_index); + } else { + u8* pointer = cpu_memory.GetPointer(query->dependant_address); + u32 result; + std::memcpy(&result, pointer, sizeof(u32)); + query->value = static_cast(result) / query->stride; + } + } + } + +private: + QueryCacheRuntime& runtime; + TFBCounterStreamer& tfb_streamer; + Core::Memory::Memory& cpu_memory; + + // syncing queue + std::vector pending_sync; + + // flush levels + std::vector pending_flush_queries; + std::deque> pending_flush_sets; + std::mutex flush_guard; +}; + } // namespace struct QueryCacheRuntimeImpl { @@ -853,6 +1004,8 @@ struct QueryCacheRuntimeImpl { scheduler, memory_allocator), tfb_streamer(static_cast(QueryType::StreamingByteCount), runtime, device, scheduler, memory_allocator, staging_pool), + primitives_succeeded_streamer( + static_cast(QueryType::StreamingPrimitivesSucceeded), runtime, tfb_streamer, cpu_memory_), hcr_setup{}, hcr_is_set{}, is_hcr_running{} { hcr_setup.sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT; @@ -889,6 +1042,7 @@ struct QueryCacheRuntimeImpl { VideoCommon::GuestStreamer guest_streamer; SamplesStreamer sample_streamer; TFBCounterStreamer tfb_streamer; + PrimitivesSucceededStreamer primitives_succeeded_streamer; 
std::vector> little_cache; std::vector> buffers_to_upload_to; @@ -1086,6 +1240,8 @@ VideoCommon::StreamerInterface* QueryCacheRuntime::GetStreamerInterface(QueryTyp return &impl->sample_streamer; case QueryType::StreamingByteCount: return &impl->tfb_streamer; + case QueryType::StreamingPrimitivesSucceeded: + return &impl->primitives_succeeded_streamer; default: return nullptr; } From 282ae8fa51e060e6d4ef026b734aa871b1b9331e Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 6 Aug 2023 09:38:16 +0200 Subject: [PATCH 05/10] Query Cache: address issues --- src/video_core/buffer_cache/buffer_cache.h | 5 +- .../buffer_cache/buffer_cache_base.h | 4 +- src/video_core/engines/maxwell_3d.cpp | 6 - src/video_core/engines/puller.cpp | 6 +- src/video_core/fence_manager.h | 14 +- src/video_core/query_cache/bank_base.h | 16 +- src/video_core/query_cache/query_base.h | 44 ++-- src/video_core/query_cache/query_cache.h | 66 +++--- src/video_core/query_cache/query_cache_base.h | 8 +- src/video_core/query_cache/query_stream.h | 22 +- src/video_core/rasterizer_interface.h | 5 +- .../renderer_null/null_rasterizer.h | 3 +- .../renderer_opengl/gl_rasterizer.cpp | 24 ++- .../renderer_opengl/gl_rasterizer.h | 3 +- .../renderer_vulkan/vk_compute_pass.cpp | 6 +- .../renderer_vulkan/vk_fence_manager.h | 2 +- .../renderer_vulkan/vk_query_cache.cpp | 203 +++++++++++------- .../renderer_vulkan/vk_query_cache.h | 5 +- .../renderer_vulkan/vk_rasterizer.cpp | 27 --- .../renderer_vulkan/vk_rasterizer.h | 3 +- src/video_core/renderer_vulkan/vk_scheduler.h | 12 +- 21 files changed, 270 insertions(+), 214 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index f91b7d1e4..9e90c587c 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -276,9 +276,8 @@ std::pair BufferCache

::ObtainBuffer(GPUVAddr gpu_ad } template -std::pair BufferCache

::ObtainCPUBuffer(VAddr cpu_addr, u32 size, - ObtainBufferSynchronize sync_info, - ObtainBufferOperation post_op) { +std::pair BufferCache

::ObtainCPUBuffer( + VAddr cpu_addr, u32 size, ObtainBufferSynchronize sync_info, ObtainBufferOperation post_op) { const BufferId buffer_id = FindBuffer(cpu_addr, size); Buffer& buffer = slot_buffers[buffer_id]; diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index 9507071e5..c4f6e8d12 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -297,8 +297,8 @@ public: ObtainBufferOperation post_op); [[nodiscard]] std::pair ObtainCPUBuffer(VAddr gpu_addr, u32 size, - ObtainBufferSynchronize sync_info, - ObtainBufferOperation post_op); + ObtainBufferSynchronize sync_info, + ObtainBufferOperation post_op); void FlushCachedWrites(); /// Return true when there are uncommitted buffers to be downloaded diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 922c399e6..46b9c548a 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -596,12 +596,6 @@ void Maxwell3D::ProcessCounterReset() { case Regs::ClearReport::ZPassPixelCount: rasterizer->ResetCounter(VideoCommon::QueryType::ZPassPixelCount64); break; - case Regs::ClearReport::PrimitivesGenerated: - rasterizer->ResetCounter(VideoCommon::QueryType::StreamingByteCount); - break; - case Regs::ClearReport::VtgPrimitivesOut: - rasterizer->ResetCounter(VideoCommon::QueryType::StreamingByteCount); - break; default: LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.clear_report_value); break; diff --git a/src/video_core/engines/puller.cpp b/src/video_core/engines/puller.cpp index 582738234..8dd34c04a 100644 --- a/src/video_core/engines/puller.cpp +++ b/src/video_core/engines/puller.cpp @@ -82,7 +82,8 @@ void Puller::ProcessSemaphoreTriggerMethod() { if (op == GpuSemaphoreOperation::WriteLong) { const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()}; const u32 payload = 
regs.semaphore_sequence; - rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload, VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0); + rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload, + VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0); } else { do { const u32 word{memory_manager.Read(regs.semaphore_address.SemaphoreAddress())}; @@ -117,7 +118,8 @@ void Puller::ProcessSemaphoreTriggerMethod() { void Puller::ProcessSemaphoreRelease() { const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()}; const u32 payload = regs.semaphore_release; - rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload, VideoCommon::QueryPropertiesFlags::IsAFence, payload, 0); + rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload, + VideoCommon::QueryPropertiesFlags::IsAFence, payload, 0); } void Puller::ProcessSemaphoreAcquire() { diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index 8459a3092..805a89900 100644 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h @@ -55,6 +55,9 @@ public: // Unlike other fences, this one doesn't void SignalOrdering() { + if constexpr (!can_async_check) { + TryReleasePendingFences(); + } std::scoped_lock lock{buffer_cache.mutex}; buffer_cache.AccumulateFlushes(); } @@ -104,13 +107,9 @@ public: SignalFence(std::move(func)); } - void WaitPendingFences(bool force) { + void WaitPendingFences([[maybe_unused]] bool force) { if constexpr (!can_async_check) { - if (force) { - TryReleasePendingFences(); - } else { - TryReleasePendingFences(); - } + TryReleasePendingFences(); } else { if (!force) { return; @@ -125,7 +124,8 @@ public: }); SignalFence(std::move(func)); std::unique_lock lk(wait_mutex); - wait_cv.wait(lk, [&wait_finished] { return wait_finished.load(std::memory_order_relaxed); }); + wait_cv.wait( + lk, [&wait_finished] { return wait_finished.load(std::memory_order_relaxed); }); } } diff --git 
a/src/video_core/query_cache/bank_base.h b/src/video_core/query_cache/bank_base.h index 4246a609d..420927091 100644 --- a/src/video_core/query_cache/bank_base.h +++ b/src/video_core/query_cache/bank_base.h @@ -7,21 +7,19 @@ #include #include - #include "common/common_types.h" namespace VideoCommon { class BankBase { protected: - const size_t base_bank_size; - size_t bank_size; - std::atomic references; - size_t current_slot; + const size_t base_bank_size{}; + size_t bank_size{}; + std::atomic references{}; + size_t current_slot{}; public: - BankBase(size_t bank_size_) - : base_bank_size{bank_size_}, bank_size(bank_size_), references(0), current_slot(0) {} + explicit BankBase(size_t bank_size_) : base_bank_size{bank_size_}, bank_size(bank_size_) {} virtual ~BankBase() = default; @@ -58,11 +56,11 @@ public: bank_size = current_slot; } - constexpr bool IsClosed() { + bool IsClosed() const { return current_slot >= bank_size; } - bool IsDead() { + bool IsDead() const { return IsClosed() && references == 0; } }; diff --git a/src/video_core/query_cache/query_base.h b/src/video_core/query_cache/query_base.h index 0ae23af9f..993a13eac 100644 --- a/src/video_core/query_cache/query_base.h +++ b/src/video_core/query_cache/query_base.h @@ -9,28 +9,28 @@ namespace VideoCommon { enum class QueryFlagBits : u32 { - HasTimestamp = 1 << 0, ///< Indicates if this query has a tiemstamp. - IsFinalValueSynced = 1 << 1, ///< Indicates if the query has been synced in the host - IsHostSynced = 1 << 2, ///< Indicates if the query has been synced in the host - IsGuestSynced = 1 << 3, ///< Indicates if the query has been synced with the guest. - IsHostManaged = 1 << 4, ///< Indicates if this query points to a host query - IsRewritten = 1 << 5, ///< Indicates if this query was rewritten by another query - IsInvalidated = 1 << 6, ///< Indicates the value of th query has been nullified. - IsOrphan = 1 << 7, ///< Indicates the query has not been set by a guest query. 
- IsFence = 1 << 8, ///< Indicates the query is a fence. - IsQueuedForAsyncFlush = 1 <<9,///< Indicates that the query can be flushed at any moment + HasTimestamp = 1 << 0, ///< Indicates if this query has a timestamp. + IsFinalValueSynced = 1 << 1, ///< Indicates if the query has been synced in the host + IsHostSynced = 1 << 2, ///< Indicates if the query has been synced in the host + IsGuestSynced = 1 << 3, ///< Indicates if the query has been synced with the guest. + IsHostManaged = 1 << 4, ///< Indicates if this query points to a host query + IsRewritten = 1 << 5, ///< Indicates if this query was rewritten by another query + IsInvalidated = 1 << 6, ///< Indicates the value of th query has been nullified. + IsOrphan = 1 << 7, ///< Indicates the query has not been set by a guest query. + IsFence = 1 << 8, ///< Indicates the query is a fence. + IsQueuedForAsyncFlush = 1 << 9, ///< Indicates that the query can be flushed at any moment }; DECLARE_ENUM_FLAG_OPERATORS(QueryFlagBits) class QueryBase { public: - VAddr guest_address; - QueryFlagBits flags; - u64 value; + VAddr guest_address{}; + QueryFlagBits flags{}; + u64 value{}; protected: // Default constructor - QueryBase() : guest_address(0), flags{}, value{} {} + QueryBase() = default; // Parameterized constructor QueryBase(VAddr address, QueryFlagBits flags_, u64 value_) @@ -51,23 +51,21 @@ public: class HostQueryBase : public QueryBase { public: // Default constructor - HostQueryBase() - : QueryBase(0, QueryFlagBits::IsHostManaged | QueryFlagBits::IsOrphan, 0), start_bank_id{}, - size_banks{}, start_slot{}, size_slots{} {} + HostQueryBase() : QueryBase(0, QueryFlagBits::IsHostManaged | QueryFlagBits::IsOrphan, 0) {} // Parameterized constructor - HostQueryBase(bool isLong, VAddr address) + HostQueryBase(bool has_timestamp, VAddr address) : QueryBase(address, QueryFlagBits::IsHostManaged, 0), start_bank_id{}, size_banks{}, start_slot{}, size_slots{} { - if (isLong) { + if (has_timestamp) { flags |= 
QueryFlagBits::HasTimestamp; } } - u32 start_bank_id; - u32 size_banks; - size_t start_slot; - size_t size_slots; + u32 start_bank_id{}; + u32 size_banks{}; + size_t start_slot{}; + size_t size_slots{}; }; } // namespace VideoCommon \ No newline at end of file diff --git a/src/video_core/query_cache/query_cache.h b/src/video_core/query_cache/query_cache.h index f1393d5c7..042af053c 100644 --- a/src/video_core/query_cache/query_cache.h +++ b/src/video_core/query_cache/query_cache.h @@ -54,7 +54,7 @@ public: return new_id; } - bool HasPendingSync() override { + bool HasPendingSync() const override { return !pending_sync.empty(); } @@ -71,8 +71,10 @@ public: continue; } query.flags |= QueryFlagBits::IsHostSynced; - sync_values.emplace_back(query.guest_address, query.value, - True(query.flags & QueryFlagBits::HasTimestamp) ? 8 : 4); + sync_values.emplace_back(SyncValuesStruct{ + .address = query.guest_address, + .value = query.value, + .size = static_cast(True(query.flags & QueryFlagBits::HasTimestamp) ? 
8 : 4)}); } pending_sync.clear(); if (sync_values.size() > 0) { @@ -90,15 +92,20 @@ class StubStreamer : public GuestStreamer { public: using RuntimeType = typename Traits::RuntimeType; - StubStreamer(size_t id_, RuntimeType& runtime_) : GuestStreamer(id_, runtime_) {} + StubStreamer(size_t id_, RuntimeType& runtime_, u32 stub_value_) + : GuestStreamer(id_, runtime_), stub_value{stub_value_} {} ~StubStreamer() override = default; size_t WriteCounter(VAddr address, bool has_timestamp, [[maybe_unused]] u32 value, std::optional subreport = std::nullopt) override { - size_t new_id = GuestStreamer::WriteCounter(address, has_timestamp, 1U, subreport); + size_t new_id = + GuestStreamer::WriteCounter(address, has_timestamp, stub_value, subreport); return new_id; } + +private: + u32 stub_value; }; template @@ -113,7 +120,7 @@ struct QueryCacheBase::QueryCacheBaseImpl { for (size_t i = 0; i < static_cast(QueryType::MaxQueryTypes); i++) { streamers[i] = runtime.GetStreamerInterface(static_cast(i)); if (streamers[i]) { - streamer_mask |= 1ULL << i; + streamer_mask |= 1ULL << streamers[i]->GetId(); } } } @@ -152,7 +159,7 @@ struct QueryCacheBase::QueryCacheBaseImpl { QueryCacheBase* owner; VideoCore::RasterizerInterface& rasterizer; Core::Memory::Memory& cpu_memory; - Traits::RuntimeType& runtime; + RuntimeType& runtime; Tegra::GPU& gpu; std::array(QueryType::MaxQueryTypes)> streamers; u64 streamer_mask; @@ -223,15 +230,11 @@ void QueryCacheBase::CounterReport(GPUVAddr addr, QueryType counter_type const bool is_fence = True(flags & QueryPropertiesFlags::IsAFence); size_t streamer_id = static_cast(counter_type); auto* streamer = impl->streamers[streamer_id]; - if (!streamer) [[unlikely]] { - if (has_timestamp) { - u64 timestamp = impl->gpu.GetTicks(); - gpu_memory->Write(addr + 8, timestamp); - gpu_memory->Write(addr, 1ULL); - } else { - gpu_memory->Write(addr, 1U); - } - return; + if (streamer == nullptr) [[unlikely]] { + counter_type = QueryType::Payload; + payload = 1U; + 
streamer_id = static_cast(counter_type); + streamer = impl->streamers[streamer_id]; } auto cpu_addr_opt = gpu_memory->GpuToCpuAddress(addr); if (!cpu_addr_opt) [[unlikely]] { @@ -403,12 +406,6 @@ bool QueryCacheBase::AccelerateHostConditionalRendering() { impl->runtime.EndHostConditionalRendering(); return false; } - /*if (!Settings::IsGPULevelHigh()) { - impl->runtime.EndHostConditionalRendering(); - return gpu_memory->IsMemoryDirty(regs.render_enable.Address(), 24, - VideoCommon::CacheType::BufferCache | - VideoCommon::CacheType::QueryCache); - }*/ const ComparisonMode mode = static_cast(regs.render_enable.mode); const GPUVAddr address = regs.render_enable.Address(); switch (mode) { @@ -442,6 +439,9 @@ bool QueryCacheBase::AccelerateHostConditionalRendering() { // Async downloads template void QueryCacheBase::CommitAsyncFlushes() { + // Make sure to have the results synced in Host. + NotifyWFI(); + u64 mask{}; { std::scoped_lock lk(impl->flush_guard); @@ -458,8 +458,19 @@ void QueryCacheBase::CommitAsyncFlushes() { if (mask == 0) { return; } - impl->ForEachStreamerIn(mask, - [](StreamerInterface* streamer) { streamer->PushUnsyncedQueries(); }); + u64 ran_mask = ~mask; + while (mask) { + impl->ForEachStreamerIn(mask, [&mask, &ran_mask](StreamerInterface* streamer) { + u64 dep_mask = streamer->GetDependentMask(); + if ((dep_mask & ~ran_mask) != 0) { + return; + } + u64 index = streamer->GetId(); + ran_mask |= (1ULL << index); + mask &= ~(1ULL << index); + streamer->PushUnsyncedQueries(); + }); + } } template @@ -489,13 +500,11 @@ void QueryCacheBase::PopAsyncFlushes() { if (mask == 0) { return; } - u64 ran_mask = 0; - u64 next_phase = 0; + u64 ran_mask = ~mask; while (mask) { - impl->ForEachStreamerIn(mask, [&mask, &ran_mask, &next_phase](StreamerInterface* streamer) { + impl->ForEachStreamerIn(mask, [&mask, &ran_mask](StreamerInterface* streamer) { u64 dep_mask = streamer->GetDependenceMask(); if ((dep_mask & ~ran_mask) != 0) { - next_phase |= dep_mask; return; } 
u64 index = streamer->GetId(); @@ -503,7 +512,6 @@ void QueryCacheBase::PopAsyncFlushes() { mask &= ~(1ULL << index); streamer->PopUnsyncedQueries(); }); - ran_mask |= next_phase; } } diff --git a/src/video_core/query_cache/query_cache_base.h b/src/video_core/query_cache/query_cache_base.h index 55f508dd1..07be421c6 100644 --- a/src/video_core/query_cache/query_cache_base.h +++ b/src/video_core/query_cache/query_cache_base.h @@ -47,7 +47,7 @@ public: BitField<0, 27, u32> query_id; u32 raw; - std::pair unpack() { + std::pair unpack() const { return {static_cast(stream_id.Value()), static_cast(query_id.Value())}; } }; @@ -73,7 +73,7 @@ public: } } - static u64 BuildMask(std::span types) { + static u64 BuildMask(std::span types) { u64 mask = 0; for (auto query_type : types) { mask |= 1ULL << (static_cast(query_type)); @@ -160,7 +160,7 @@ protected: } } - using ContentCache = typename std::unordered_map>; + using ContentCache = std::unordered_map>; void InvalidateQuery(QueryLocation location); bool IsQueryDirty(QueryLocation location); @@ -175,7 +175,7 @@ protected: friend struct QueryCacheBaseImpl; friend RuntimeType; - std::unique_ptr impl; + std::unique_ptr impl; }; } // namespace VideoCommon \ No newline at end of file diff --git a/src/video_core/query_cache/query_stream.h b/src/video_core/query_cache/query_stream.h index 0e9275565..e7aac955b 100644 --- a/src/video_core/query_cache/query_stream.h +++ b/src/video_core/query_cache/query_stream.h @@ -16,7 +16,7 @@ namespace VideoCommon { class StreamerInterface { public: - StreamerInterface(size_t id_, u64 dependance_mask_ = 0) : id{id_}, dependance_mask{dependance_mask_} {} + explicit StreamerInterface(size_t id_) : id{id_}, dependence_mask{}, dependent_mask{} {} virtual ~StreamerInterface() = default; virtual QueryBase* GetQuery(size_t id) = 0; @@ -37,7 +37,7 @@ public: /* Do Nothing */ } - virtual bool HasPendingSync() { + virtual bool HasPendingSync() const { return false; } @@ -52,7 +52,7 @@ public: virtual 
size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, std::optional subreport = std::nullopt) = 0; - virtual bool HasUnsyncedQueries() { + virtual bool HasUnsyncedQueries() const { return false; } @@ -71,18 +71,28 @@ public: } u64 GetDependenceMask() const { - return dependance_mask; + return dependence_mask; + } + + u64 GetDependentMask() const { + return dependent_mask; } protected: + void MakeDependent(StreamerInterface* depend_on) { + dependence_mask |= 1ULL << depend_on->id; + depend_on->dependent_mask |= 1ULL << id; + } + const size_t id; - const u64 dependance_mask; + u64 dependence_mask; + u64 dependent_mask; }; template class SimpleStreamer : public StreamerInterface { public: - SimpleStreamer(size_t id_, u64 dependance_mask_ = 0) : StreamerInterface{id_, dependance_mask_} {} + explicit SimpleStreamer(size_t id_) : StreamerInterface{id_} {} virtual ~SimpleStreamer() = default; protected: diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 2ba7cbb0d..af1469147 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -9,10 +9,10 @@ #include #include "common/common_types.h" #include "common/polyfill_thread.h" -#include "video_core/query_cache/types.h" #include "video_core/cache_types.h" #include "video_core/engines/fermi_2d.h" #include "video_core/gpu.h" +#include "video_core/query_cache/types.h" #include "video_core/rasterizer_download_area.h" namespace Tegra { @@ -57,7 +57,8 @@ public: virtual void ResetCounter(VideoCommon::QueryType type) = 0; /// Records a GPU query and caches it - virtual void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) = 0; + virtual void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, + VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) = 0; /// Signal an uniform buffer binding virtual void BindGraphicsUniformBuffer(size_t stage, u32 
index, GPUVAddr gpu_addr, diff --git a/src/video_core/renderer_null/null_rasterizer.h b/src/video_core/renderer_null/null_rasterizer.h index 57a8c4c85..23001eeb8 100644 --- a/src/video_core/renderer_null/null_rasterizer.h +++ b/src/video_core/renderer_null/null_rasterizer.h @@ -43,7 +43,8 @@ public: void Clear(u32 layer_count) override; void DispatchCompute() override; void ResetCounter(VideoCommon::QueryType type) override; - void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; + void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, + VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; void FlushAll() override; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index a975bbe75..27e2de1bf 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -405,8 +405,6 @@ void RasterizerOpenGL::ResetCounter(VideoCommon::QueryType type) { void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { if (type == VideoCommon::QueryType::ZPassPixelCount64) { - std::optional timestamp{True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout) - ? 
std::make_optional(gpu.GetTicks()) : std:: nullopt }; if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, {gpu.GetTicks()}); } else { @@ -414,13 +412,23 @@ void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, } return; } - if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { - u64 ticks = gpu.GetTicks(); - gpu_memory->Write(gpu_addr + 8, ticks); - gpu_memory->Write(gpu_addr, static_cast(payload)); - } else { - gpu_memory->Write(gpu_addr, payload); + if (type != VideoCommon::QueryType::Payload) { + payload = 1u; } + std::function func([this, gpu_addr, flags, memory_manager = gpu_memory, payload]() { + if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { + u64 ticks = gpu.GetTicks(); + memory_manager->Write(gpu_addr + 8, ticks); + memory_manager->Write(gpu_addr, static_cast(payload)); + } else { + memory_manager->Write(gpu_addr, payload); + } + }); + if (True(flags & VideoCommon::QueryPropertiesFlags::IsAFence)) { + SignalFence(std::move(func)); + return; + } + func(); } void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 05e048e15..ceffe1f1e 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -87,7 +87,8 @@ public: void Clear(u32 layer_count) override; void DispatchCompute() override; void ResetCounter(VideoCommon::QueryType type) override; - void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; + void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, + VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void 
DisableGraphicsUniformBuffer(size_t stage, u32 index) override; void FlushAll() override; diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 97cd4521d..039dc95e1 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -303,9 +303,9 @@ std::pair QuadIndexedPass::Assemble( return {staging.buffer, staging.offset}; } -ConditionalRenderingResolvePass::ConditionalRenderingResolvePass(const Device& device_, - Scheduler& scheduler_, - DescriptorPool& descriptor_pool_, ComputePassDescriptorQueue& compute_pass_descriptor_queue_) +ConditionalRenderingResolvePass::ConditionalRenderingResolvePass( + const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_) : ComputePass(device_, descriptor_pool_, INPUT_OUTPUT_DESCRIPTOR_SET_BINDINGS, INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE, INPUT_OUTPUT_BANK_INFO, nullptr, RESOLVE_CONDITIONAL_RENDER_COMP_SPV), diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h index 14fc5ad71..336573574 100644 --- a/src/video_core/renderer_vulkan/vk_fence_manager.h +++ b/src/video_core/renderer_vulkan/vk_fence_manager.h @@ -7,8 +7,8 @@ #include "video_core/fence_manager.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" -#include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_query_cache.h" +#include "video_core/renderer_vulkan/vk_texture_cache.h" namespace Core { class System; diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index ef891e26b..add0c6fb3 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -11,11 +11,9 @@ #include #include -#include -#include - #include "common/common_types.h" 
#include "core/memory.h" +#include "video_core/engines/draw_manager.h" #include "video_core/query_cache/query_cache.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" @@ -30,6 +28,7 @@ namespace Vulkan { +using Tegra::Engines::Maxwell3D; using VideoCommon::QueryType; namespace { @@ -37,7 +36,7 @@ class SamplesQueryBank : public VideoCommon::BankBase { public: static constexpr size_t BANK_SIZE = 256; static constexpr size_t QUERY_SIZE = 8; - SamplesQueryBank(const Device& device_, size_t index_) + explicit SamplesQueryBank(const Device& device_, size_t index_) : BankBase(BANK_SIZE), device{device_}, index{index_} { const auto& dev = device.GetLogical(); query_pool = dev.CreateQueryPool({ @@ -109,18 +108,19 @@ struct HostSyncValues { static constexpr bool GeneratesBaseBuffer = false; }; -template class SamplesStreamer : public BaseStreamer { public: - SamplesStreamer(size_t id, QueryCacheRuntime& runtime_, const Device& device_, - Scheduler& scheduler_, const MemoryAllocator& memory_allocator_) - : BaseStreamer(id), runtime{runtime_}, device{device_}, scheduler{scheduler_}, + explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_, const Device& device_, + Scheduler& scheduler_, const MemoryAllocator& memory_allocator_) + : BaseStreamer(id_), runtime{runtime_}, device{device_}, scheduler{scheduler_}, memory_allocator{memory_allocator_} { BuildResolveBuffer(); current_bank = nullptr; current_query = nullptr; } + ~SamplesStreamer() = default; + void StartCounter() override { if (has_started) { return; @@ -157,7 +157,7 @@ public: PauseCounter(); } - bool HasPendingSync() override { + bool HasPendingSync() const override { return !pending_sync.empty(); } @@ -198,7 +198,7 @@ public: } resolve_slots_remaining = resolve_slots; sync_values_stash.emplace_back(); - sync_values = sync_values = &sync_values_stash.back(); + sync_values = &sync_values_stash.back(); sync_values->reserve(resolve_slots * 
SamplesQueryBank::BANK_SIZE); } resolve_slots_remaining--; @@ -207,6 +207,7 @@ public: const size_t base_offset = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * (resolve_slots - resolve_slots_remaining - 1); VkQueryPool query_pool = bank->GetInnerPool(); + scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([start, amount, base_offset, query_pool, buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) { size_t final_offset = base_offset + start * SamplesQueryBank::QUERY_SIZE; @@ -284,7 +285,7 @@ public: return index; } - bool HasUnsyncedQueries() override { + bool HasUnsyncedQueries() const override { return !pending_flush_queries.empty(); } @@ -348,8 +349,8 @@ private: for (auto q : queries) { auto* query = GetQuery(q); ApplyBankOp(query, [&indexer](SamplesQueryBank* bank, size_t start, size_t amount) { - auto id = bank->GetIndex(); - auto pair = indexer.try_emplace(id, std::numeric_limits::max(), + auto id_ = bank->GetIndex(); + auto pair = indexer.try_emplace(id_, std::numeric_limits::max(), std::numeric_limits::min()); auto& current_pair = pair.first->second; current_pair.first = std::min(current_pair.first, start); @@ -434,13 +435,14 @@ private: .pNext = nullptr, .flags = 0, .size = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * resolve_slots, - .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, }; resolve_buffers.emplace_back( - std::move(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal))); + memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal)); } static constexpr size_t resolve_slots = 8; @@ -476,7 +478,8 @@ class TFBQueryBank : public VideoCommon::BankBase { public: static constexpr size_t BANK_SIZE = 1024; static constexpr size_t 
QUERY_SIZE = 4; - TFBQueryBank(Scheduler& scheduler_, const MemoryAllocator& memory_allocator, size_t index_) + explicit TFBQueryBank(Scheduler& scheduler_, const MemoryAllocator& memory_allocator, + size_t index_) : BankBase(BANK_SIZE), scheduler{scheduler_}, index{index_} { const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, @@ -525,22 +528,21 @@ private: vk::Buffer buffer; }; -template class PrimitivesSucceededStreamer; -template class TFBCounterStreamer : public BaseStreamer { public: - TFBCounterStreamer(size_t id, QueryCacheRuntime& runtime_, const Device& device_, - Scheduler& scheduler_, const MemoryAllocator& memory_allocator_, - StagingBufferPool& staging_pool_) - : BaseStreamer(id), runtime{runtime_}, device{device_}, scheduler{scheduler_}, + explicit TFBCounterStreamer(size_t id_, QueryCacheRuntime& runtime_, const Device& device_, + Scheduler& scheduler_, const MemoryAllocator& memory_allocator_, + StagingBufferPool& staging_pool_) + : BaseStreamer(id_), runtime{runtime_}, device{device_}, scheduler{scheduler_}, memory_allocator{memory_allocator_}, staging_pool{staging_pool_} { buffers_count = 0; current_bank = nullptr; counter_buffers.fill(VK_NULL_HANDLE); offsets.fill(0); last_queries.fill(0); + last_queries_stride.fill(1); const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, @@ -564,6 +566,8 @@ public: } } + ~TFBCounterStreamer() = default; + void StartCounter() override { FlushBeginTFB(); has_started = true; @@ -581,15 +585,15 @@ public: if (has_flushed_end_pending) { FlushEndTFB(); } - runtime.View3DRegs([this](Tegra::Engines::Maxwell3D::Regs& regs) { - if (regs.transform_feedback_enabled == 0) { + runtime.View3DRegs([this](Maxwell3D& maxwell3d) { + if (maxwell3d.regs.transform_feedback_enabled == 0) { streams_mask = 0; has_started = false; } }); } - bool HasPendingSync() override { + bool HasPendingSync() const override { return !pending_sync.empty(); } @@ 
-650,14 +654,19 @@ public: return index; } - std::optional GetLastQueryStream(size_t stream) { + std::optional> GetLastQueryStream(size_t stream) { if (last_queries[stream] != 0) { - return {last_queries[stream]}; + std::pair result(last_queries[stream], last_queries_stride[stream]); + return result; } return std::nullopt; } - bool HasUnsyncedQueries() override { + Maxwell3D::Regs::PrimitiveTopology GetOutputTopology() const { + return out_topology; + } + + bool HasUnsyncedQueries() const override { return !pending_flush_queries.empty(); } @@ -762,15 +771,17 @@ private: void UpdateBuffers() { last_queries.fill(0); - runtime.View3DRegs([this](Tegra::Engines::Maxwell3D::Regs& regs) { + last_queries_stride.fill(1); + runtime.View3DRegs([this](Maxwell3D& maxwell3d) { buffers_count = 0; - for (size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers; - i++) { - const auto& tf = regs.transform_feedback; + out_topology = maxwell3d.draw_manager->GetDrawState().topology; + for (size_t i = 0; i < Maxwell3D::Regs::NumTransformFeedbackBuffers; i++) { + const auto& tf = maxwell3d.regs.transform_feedback; if (tf.buffers[i].enable == 0) { continue; } const size_t stream = tf.controls[i].stream; + last_queries_stride[stream] = tf.controls[i].stride; streams_mask |= 1ULL << stream; buffers_count = std::max(buffers_count, stream + 1); } @@ -785,7 +796,8 @@ private: }); current_bank = &bank_pool.GetBank(current_bank_id); } - auto [dont_care, slot] = current_bank->Reserve(); + auto [dont_care, other] = current_bank->Reserve(); + const size_t slot = other; // workaround to compile bug. 
current_bank->AddReference(); static constexpr VkMemoryBarrier READ_BARRIER{ @@ -818,11 +830,9 @@ private: return {current_bank_id, slot}; } - template friend class PrimitivesSucceededStreamer; static constexpr size_t NUM_STREAMS = 4; - static constexpr size_t STREAMS_MASK = (1ULL << NUM_STREAMS) - 1ULL; QueryCacheRuntime& runtime; const Device& device; @@ -851,6 +861,8 @@ private: std::array counter_buffers{}; std::array offsets{}; std::array last_queries; + std::array last_queries_stride; + Maxwell3D::Regs::PrimitiveTopology out_topology; u64 streams_mask; }; @@ -858,32 +870,34 @@ class PrimitivesQueryBase : public VideoCommon::QueryBase { public: // Default constructor PrimitivesQueryBase() - : VideoCommon::QueryBase(0, VideoCommon::QueryFlagBits::IsHostManaged, 0), stride{}, - dependant_index{}, dependant_manage{} {} + : VideoCommon::QueryBase(0, VideoCommon::QueryFlagBits::IsHostManaged, 0) {} // Parameterized constructor - PrimitivesQueryBase(bool is_long, VAddr address) - : VideoCommon::QueryBase(address, VideoCommon::QueryFlagBits::IsHostManaged, 0), stride{}, - dependant_index{}, dependant_manage{} { - if (is_long) { + PrimitivesQueryBase(bool has_timestamp, VAddr address) + : VideoCommon::QueryBase(address, VideoCommon::QueryFlagBits::IsHostManaged, 0) { + if (has_timestamp) { flags |= VideoCommon::QueryFlagBits::HasTimestamp; } } - u64 stride; - VAddr dependant_address; - size_t dependant_index; - bool dependant_manage; + u64 stride{}; + VAddr dependant_address{}; + Maxwell3D::Regs::PrimitiveTopology topology{Maxwell3D::Regs::PrimitiveTopology::Points}; + size_t dependant_index{}; + bool dependant_manage{}; }; -template class PrimitivesSucceededStreamer : public VideoCommon::SimpleStreamer { public: - PrimitivesSucceededStreamer(size_t id, QueryCacheRuntime& runtime_, - TFBCounterStreamer& tfb_streamer_, Core::Memory::Memory& cpu_memory_) - : VideoCommon::SimpleStreamer( - id, 1ULL << static_cast(VideoCommon::QueryType::StreamingByteCount)), - 
runtime{runtime_}, tfb_streamer{tfb_streamer_}, cpu_memory{cpu_memory_} {} + explicit PrimitivesSucceededStreamer(size_t id_, QueryCacheRuntime& runtime_, + TFBCounterStreamer& tfb_streamer_, + Core::Memory::Memory& cpu_memory_) + : VideoCommon::SimpleStreamer(id_), runtime{runtime_}, + tfb_streamer{tfb_streamer_}, cpu_memory{cpu_memory_} { + MakeDependent(&tfb_streamer); + } + + ~PrimitivesSucceededStreamer() = default; size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, std::optional subreport_) override { @@ -901,8 +915,11 @@ public: const size_t subreport = static_cast(*subreport_); auto dependant_address_opt = tfb_streamer.GetLastQueryStream(subreport); bool must_manage_dependance = false; + new_query->topology = tfb_streamer.GetOutputTopology(); if (dependant_address_opt) { - new_query->dependant_address = *dependant_address_opt; + auto [dep_address, stride] = *dependant_address_opt; + new_query->dependant_address = dep_address; + new_query->stride = stride; } else { new_query->dependant_index = tfb_streamer.WriteCounter(address, has_timestamp, value, subreport_); @@ -917,25 +934,28 @@ public: } return index; } + new_query->stride = 1; + runtime.View3DRegs([new_query, subreport](Maxwell3D& maxwell3d) { + for (size_t i = 0; i < Maxwell3D::Regs::NumTransformFeedbackBuffers; i++) { + const auto& tf = maxwell3d.regs.transform_feedback; + if (tf.buffers[i].enable == 0) { + continue; + } + if (tf.controls[i].stream != subreport) { + continue; + } + new_query->stride = tf.controls[i].stride; + break; + } + }); } new_query->dependant_manage = must_manage_dependance; - runtime.View3DRegs([new_query, subreport](Tegra::Engines::Maxwell3D::Regs& regs) { - for (size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers; - i++) { - const auto& tf = regs.transform_feedback; - if (tf.controls[i].stream != subreport) { - continue; - } - new_query->stride = tf.controls[i].stride; - break; - } - }); pending_flush_queries.push_back(index); 
return index; } - bool HasUnsyncedQueries() override { + bool HasUnsyncedQueries() const override { return !pending_flush_queries.empty(); } @@ -960,22 +980,49 @@ public: } query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + u64 num_vertices = 0; if (query->dependant_manage) { auto* dependant_query = tfb_streamer.GetQuery(query->dependant_index); - query->value = dependant_query->value / query->stride; + num_vertices = dependant_query->value / query->stride; tfb_streamer.Free(query->dependant_index); } else { u8* pointer = cpu_memory.GetPointer(query->dependant_address); u32 result; std::memcpy(&result, pointer, sizeof(u32)); - query->value = static_cast(result) / query->stride; + num_vertices = static_cast(result) / query->stride; } + query->value = [&]() -> u64 { + switch (query->topology) { + case Maxwell3D::Regs::PrimitiveTopology::Points: + return num_vertices; + case Maxwell3D::Regs::PrimitiveTopology::Lines: + return num_vertices / 2; + case Maxwell3D::Regs::PrimitiveTopology::LineLoop: + return (num_vertices / 2) + 1; + case Maxwell3D::Regs::PrimitiveTopology::LineStrip: + return num_vertices - 1; + case Maxwell3D::Regs::PrimitiveTopology::Patches: + case Maxwell3D::Regs::PrimitiveTopology::Triangles: + case Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency: + return num_vertices / 3; + case Maxwell3D::Regs::PrimitiveTopology::TriangleFan: + case Maxwell3D::Regs::PrimitiveTopology::TriangleStrip: + case Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency: + return num_vertices - 2; + case Maxwell3D::Regs::PrimitiveTopology::Quads: + return num_vertices / 4; + case Maxwell3D::Regs::PrimitiveTopology::Polygon: + return 1U; + default: + return num_vertices; + } + }(); } } private: QueryCacheRuntime& runtime; - TFBCounterStreamer& tfb_streamer; + TFBCounterStreamer& tfb_streamer; Core::Memory::Memory& cpu_memory; // syncing queue @@ -1005,7 +1052,10 @@ struct QueryCacheRuntimeImpl { 
tfb_streamer(static_cast(QueryType::StreamingByteCount), runtime, device, scheduler, memory_allocator, staging_pool), primitives_succeeded_streamer( - static_cast(QueryType::StreamingPrimitivesSucceeded), runtime, tfb_streamer, cpu_memory_), + static_cast(QueryType::StreamingPrimitivesSucceeded), runtime, tfb_streamer, + cpu_memory_), + primitives_needed_minus_suceeded_streamer( + static_cast(QueryType::StreamingPrimitivesNeededMinusSucceeded), runtime, 0u), hcr_setup{}, hcr_is_set{}, is_hcr_running{} { hcr_setup.sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT; @@ -1040,9 +1090,10 @@ struct QueryCacheRuntimeImpl { // Streamers VideoCommon::GuestStreamer guest_streamer; - SamplesStreamer sample_streamer; - TFBCounterStreamer tfb_streamer; - PrimitivesSucceededStreamer primitives_succeeded_streamer; + SamplesStreamer sample_streamer; + TFBCounterStreamer tfb_streamer; + PrimitivesSucceededStreamer primitives_succeeded_streamer; + VideoCommon::StubStreamer primitives_needed_minus_suceeded_streamer; std::vector> little_cache; std::vector> buffers_to_upload_to; @@ -1059,7 +1110,7 @@ struct QueryCacheRuntimeImpl { bool is_hcr_running; // maxwell3d - Tegra::Engines::Maxwell3D* maxwell3d; + Maxwell3D* maxwell3d; }; QueryCacheRuntime::QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer, @@ -1074,13 +1125,13 @@ QueryCacheRuntime::QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer, staging_pool_, compute_pass_descriptor_queue, descriptor_pool); } -void QueryCacheRuntime::Bind3DEngine(Tegra::Engines::Maxwell3D* maxwell3d) { +void QueryCacheRuntime::Bind3DEngine(Maxwell3D* maxwell3d) { impl->maxwell3d = maxwell3d; } template void QueryCacheRuntime::View3DRegs(Func&& func) { - func(impl->maxwell3d->regs); + func(*impl->maxwell3d); } void QueryCacheRuntime::EndHostConditionalRendering() { @@ -1240,8 +1291,12 @@ VideoCommon::StreamerInterface* QueryCacheRuntime::GetStreamerInterface(QueryTyp return &impl->sample_streamer; case 
QueryType::StreamingByteCount: return &impl->tfb_streamer; + case QueryType::StreamingPrimitivesNeeded: + case QueryType::VtgPrimitivesOut: case QueryType::StreamingPrimitivesSucceeded: return &impl->primitives_succeeded_streamer; + case QueryType::StreamingPrimitivesNeededMinusSucceeded: + return &impl->primitives_needed_minus_suceeded_streamer; default: return nullptr; } diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h index 9ad2929d7..e9a1ea169 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.h +++ b/src/video_core/renderer_vulkan/vk_query_cache.h @@ -49,7 +49,8 @@ public: bool HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, bool qc_dirty); bool HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1, - VideoCommon::LookupData object_2, bool qc_dirty, bool equal_check); + VideoCommon::LookupData object_2, bool qc_dirty, + bool equal_check); VideoCommon::StreamerInterface* GetStreamerInterface(VideoCommon::QueryType query_type); @@ -66,7 +67,7 @@ private: }; struct QueryCacheParams { - using RuntimeType = Vulkan::QueryCacheRuntime; + using RuntimeType = typename Vulkan::QueryCacheRuntime; }; using QueryCache = VideoCommon::QueryCacheBase; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index e8862ba04..c7ce7c312 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -194,15 +194,6 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { query_cache.NotifySegment(true); -#if ANDROID - if (Settings::IsGPULevelHigh()) { - // This is problematic on Android, disable on GPU Normal. 
- // query_cache.UpdateCounters(); - } -#else - // query_cache.UpdateCounters(); -#endif - GraphicsPipeline* const pipeline{pipeline_cache.CurrentGraphicsPipeline()}; if (!pipeline) { return; @@ -294,15 +285,6 @@ void RasterizerVulkan::DrawTexture() { query_cache.NotifySegment(true); -#if ANDROID - if (Settings::IsGPULevelHigh()) { - // This is problematic on Android, disable on GPU Normal. - // query_cache.UpdateCounters(); - } -#else - // query_cache.UpdateCounters(); -#endif - texture_cache.SynchronizeGraphicsDescriptors(); texture_cache.UpdateRenderTargets(false); @@ -332,15 +314,6 @@ void RasterizerVulkan::Clear(u32 layer_count) { FlushWork(); gpu_memory->FlushCaching(); -#if ANDROID - if (Settings::IsGPULevelHigh()) { - // This is problematic on Android, disable on GPU Normal. - // query_cache.UpdateCounters(); - } -#else - // query_cache.UpdateCounters(); -#endif - query_cache.NotifySegment(true); query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, maxwell3d->regs.zpass_pixel_count_enable); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index ffd44c68d..ad069556c 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -85,7 +85,8 @@ public: void Clear(u32 layer_count) override; void DispatchCompute() override; void ResetCounter(VideoCommon::QueryType type) override; - void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; + void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, + VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; void FlushAll() override; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h 
b/src/video_core/renderer_vulkan/vk_scheduler.h index c87e5fb07..da03803aa 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -15,9 +15,13 @@ #include "common/common_types.h" #include "common/polyfill_thread.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h" -#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/vulkan_common/vulkan_wrapper.h" +namespace VideoCommon { +template +class QueryCacheBase; +} + namespace Vulkan { class CommandPool; @@ -26,6 +30,8 @@ class Framebuffer; class GraphicsPipeline; class StateTracker; +struct QueryCacheParams; + /// The scheduler abstracts command buffer and fence management with an interface that's able to do /// OpenGL-like operations on Vulkan command buffers. class Scheduler { @@ -63,7 +69,7 @@ public: void InvalidateState(); /// Assigns the query cache. - void SetQueryCache(QueryCache& query_cache_) { + void SetQueryCache(VideoCommon::QueryCacheBase& query_cache_) { query_cache = &query_cache_; } @@ -219,7 +225,7 @@ private: std::unique_ptr master_semaphore; std::unique_ptr command_pool; - QueryCache* query_cache = nullptr; + VideoCommon::QueryCacheBase* query_cache = nullptr; vk::CommandBuffer current_cmdbuf; From 2fea1b8407b66dd0e9ed1776c34dad043e1becf4 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 19 Aug 2023 21:49:38 +0200 Subject: [PATCH 06/10] Query Cache: Fix guest side sample counting --- src/video_core/engines/maxwell_3d.cpp | 6 -- src/video_core/query_cache/query_base.h | 19 +++--- src/video_core/query_cache/query_cache.h | 46 +++++++------- src/video_core/query_cache/query_stream.h | 10 +++ .../renderer_vulkan/vk_query_cache.cpp | 62 ++++++++++++++++--- 5 files changed, 97 insertions(+), 46 deletions(-) diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 46b9c548a..32d767d85 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ 
b/src/video_core/engines/maxwell_3d.cpp @@ -586,12 +586,6 @@ void Maxwell3D::ProcessQueryCondition() { } void Maxwell3D::ProcessCounterReset() { -#if ANDROID - if (!Settings::IsGPULevelHigh()) { - // This is problematic on Android, disable on GPU Normal. - return; - } -#endif switch (regs.clear_report_value) { case Regs::ClearReport::ZPassPixelCount: rasterizer->ResetCounter(VideoCommon::QueryType::ZPassPixelCount64); diff --git a/src/video_core/query_cache/query_base.h b/src/video_core/query_cache/query_base.h index 993a13eac..1d786b3a7 100644 --- a/src/video_core/query_cache/query_base.h +++ b/src/video_core/query_cache/query_base.h @@ -9,16 +9,15 @@ namespace VideoCommon { enum class QueryFlagBits : u32 { - HasTimestamp = 1 << 0, ///< Indicates if this query has a timestamp. - IsFinalValueSynced = 1 << 1, ///< Indicates if the query has been synced in the host - IsHostSynced = 1 << 2, ///< Indicates if the query has been synced in the host - IsGuestSynced = 1 << 3, ///< Indicates if the query has been synced with the guest. - IsHostManaged = 1 << 4, ///< Indicates if this query points to a host query - IsRewritten = 1 << 5, ///< Indicates if this query was rewritten by another query - IsInvalidated = 1 << 6, ///< Indicates the value of th query has been nullified. - IsOrphan = 1 << 7, ///< Indicates the query has not been set by a guest query. - IsFence = 1 << 8, ///< Indicates the query is a fence. - IsQueuedForAsyncFlush = 1 << 9, ///< Indicates that the query can be flushed at any moment + HasTimestamp = 1 << 0, ///< Indicates if this query has a timestamp. + IsFinalValueSynced = 1 << 1, ///< Indicates if the query has been synced in the host + IsHostSynced = 1 << 2, ///< Indicates if the query has been synced in the host + IsGuestSynced = 1 << 3, ///< Indicates if the query has been synced with the guest. 
+ IsHostManaged = 1 << 4, ///< Indicates if this query points to a host query + IsRewritten = 1 << 5, ///< Indicates if this query was rewritten by another query + IsInvalidated = 1 << 6, ///< Indicates the value of the query has been nullified. + IsOrphan = 1 << 7, ///< Indicates the query has not been set by a guest query. + IsFence = 1 << 8, ///< Indicates the query is a fence. }; DECLARE_ENUM_FLAG_OPERATORS(QueryFlagBits) diff --git a/src/video_core/query_cache/query_cache.h b/src/video_core/query_cache/query_cache.h index 042af053c..4b89b5bf6 100644 --- a/src/video_core/query_cache/query_cache.h +++ b/src/video_core/query_cache/query_cache.h @@ -256,30 +256,32 @@ void QueryCacheBase::CounterReport(GPUVAddr addr, QueryType counter_type u8* pointer = impl->cpu_memory.GetPointer(cpu_addr); u8* pointer_timestamp = impl->cpu_memory.GetPointer(cpu_addr + 8); bool is_synced = !Settings::IsGPULevelHigh() && is_fence; - std::function<void()> operation( - [this, is_synced, query_base = query, query_location, pointer, pointer_timestamp] { - if (True(query_base->flags & QueryFlagBits::IsInvalidated)) { - if (!is_synced) [[likely]] { - impl->pending_unregister.push_back(query_location); - } - return; - } - if (False(query_base->flags & QueryFlagBits::IsFinalValueSynced)) [[unlikely]] { - UNREACHABLE(); - return; - } - if (True(query_base->flags & QueryFlagBits::HasTimestamp)) { - u64 timestamp = impl->gpu.GetTicks(); - std::memcpy(pointer_timestamp, &timestamp, sizeof(timestamp)); - std::memcpy(pointer, &query_base->value, sizeof(query_base->value)); - } else { - u32 value = static_cast<u32>(query_base->value); - std::memcpy(pointer, &value, sizeof(value)); - } + std::function<void()> operation([this, is_synced, streamer, query_base = query, query_location, + pointer, pointer_timestamp] { + if (True(query_base->flags & QueryFlagBits::IsInvalidated)) { if (!is_synced) [[likely]] { impl->pending_unregister.push_back(query_location); } - }); + return; + } + if (False(query_base->flags & 
QueryFlagBits::IsFinalValueSynced)) [[unlikely]] { + UNREACHABLE(); + return; + } + query_base->value += streamer->GetAmmendValue(); + streamer->SetAccumulationValue(query_base->value); + if (True(query_base->flags & QueryFlagBits::HasTimestamp)) { + u64 timestamp = impl->gpu.GetTicks(); + std::memcpy(pointer_timestamp, &timestamp, sizeof(timestamp)); + std::memcpy(pointer, &query_base->value, sizeof(query_base->value)); + } else { + u32 value = static_cast<u32>(query_base->value); + std::memcpy(pointer, &value, sizeof(value)); + } + if (!is_synced) [[likely]] { + impl->pending_unregister.push_back(query_location); + } + }); if (is_fence) { impl->rasterizer.SignalFence(std::move(operation)); } else { @@ -354,9 +356,9 @@ void QueryCacheBase::NotifySegment(bool resume) { if (resume) { impl->runtime.ResumeHostConditionalRendering(); } else { - impl->runtime.PauseHostConditionalRendering(); CounterClose(VideoCommon::QueryType::ZPassPixelCount64); CounterClose(VideoCommon::QueryType::StreamingByteCount); + impl->runtime.PauseHostConditionalRendering(); } } diff --git a/src/video_core/query_cache/query_stream.h b/src/video_core/query_cache/query_stream.h index e7aac955b..39da6ac07 100644 --- a/src/video_core/query_cache/query_stream.h +++ b/src/video_core/query_cache/query_stream.h @@ -78,6 +78,14 @@ public: return dependence_mask; } + u64 GetAmmendValue() const { + return ammend_value; + } + + void SetAccumulationValue(u64 new_value) { + acumulation_value = new_value; + } + protected: void MakeDependent(StreamerInterface* depend_on) { dependence_mask |= 1ULL << depend_on->id; @@ -87,6 +95,8 @@ protected: const size_t id; u64 dependence_mask; u64 dependent_mask; + u64 ammend_value{}; + u64 acumulation_value{}; }; template diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index add0c6fb3..2147776f8 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ 
-110,13 +110,16 @@ struct HostSyncValues { class SamplesStreamer : public BaseStreamer { public: - explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_, const Device& device_, + explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_, + VideoCore::RasterizerInterface* rasterizer_, const Device& device_, Scheduler& scheduler_, const MemoryAllocator& memory_allocator_) - : BaseStreamer(id_), runtime{runtime_}, device{device_}, scheduler{scheduler_}, - memory_allocator{memory_allocator_} { + : BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_}, + scheduler{scheduler_}, memory_allocator{memory_allocator_} { BuildResolveBuffer(); current_bank = nullptr; current_query = nullptr; + ammend_value = 0; + acumulation_value = 0; } ~SamplesStreamer() = default; @@ -151,6 +154,11 @@ public: PauseCounter(); } AbandonCurrentQuery(); + std::function func([this, counts = pending_flush_queries.size()] { + ammend_value = 0; + acumulation_value = 0; + }); + rasterizer->SyncOperation(std::move(func)); } void CloseCounter() override { @@ -244,7 +252,7 @@ public: } if (query->size_slots > 1) { // This is problematic. 
- UNIMPLEMENTED(); + // UNIMPLEMENTED(); } query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; auto loc_data = offsets[query->start_bank_id]; @@ -255,16 +263,20 @@ public: }); } + ReplicateCurrentQueryIfNeeded(); + std::function func([this] { ammend_value = acumulation_value; }); + rasterizer->SyncOperation(std::move(func)); AbandonCurrentQuery(); pending_sync.clear(); } size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, [[maybe_unused]] std::optional subreport) override { + PauseCounter(); auto index = BuildQuery(); auto* new_query = GetQuery(index); new_query->guest_address = address; - new_query->value = 100; + new_query->value = 0; new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan; if (has_timestamp) { new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp; @@ -291,6 +303,7 @@ public: void PushUnsyncedQueries() override { PauseCounter(); + current_bank->Close(); { std::scoped_lock lk(flush_guard); pending_flush_sets.emplace_back(std::move(pending_flush_queries)); @@ -429,6 +442,34 @@ private: current_query_id = 0; } + void ReplicateCurrentQueryIfNeeded() { + if (pending_sync.empty()) { + return; + } + if (!current_query) { + return; + } + auto index = BuildQuery(); + auto* new_query = GetQuery(index); + new_query->guest_address = 0; + new_query->value = 0; + new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan; + new_query->start_bank_id = current_query->start_bank_id; + new_query->size_banks = current_query->size_banks; + new_query->start_slot = current_query->start_slot; + new_query->size_slots = current_query->size_slots; + ApplyBankOp(new_query, [](SamplesQueryBank* bank, size_t start, size_t amount) { + bank->AddReference(amount); + }); + pending_flush_queries.push_back(index); + std::function func([this, index] { + auto* query = GetQuery(index); + query->value += GetAmmendValue(); + SetAccumulationValue(query->value); + Free(index); + }); + } + void BuildResolveBuffer() { const VkBufferCreateInfo buffer_ci = { 
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, @@ -448,6 +489,7 @@ private: static constexpr size_t resolve_slots = 8; QueryCacheRuntime& runtime; + VideoCore::RasterizerInterface* rasterizer; const Device& device; Scheduler& scheduler; const MemoryAllocator& memory_allocator; @@ -470,6 +512,7 @@ private: size_t current_query_id; VideoCommon::HostQueryBase* current_query; bool has_started{}; + bool current_unset{}; std::mutex flush_guard; }; @@ -677,7 +720,6 @@ public: size_t offset_base = staging_ref.offset; for (auto q : pending_flush_queries) { auto* query = GetQuery(q); - query->flags |= VideoCommon::QueryFlagBits::IsQueuedForAsyncFlush; auto& bank = bank_pool.GetBank(query->start_bank_id); bank.Sync(staging_ref, offset_base, query->start_slot, 1); offset_base += TFBQueryBank::QUERY_SIZE; @@ -1047,8 +1089,8 @@ struct QueryCacheRuntimeImpl { buffer_cache{buffer_cache_}, device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_}, guest_streamer(0, runtime), - sample_streamer(static_cast(QueryType::ZPassPixelCount64), runtime, device, - scheduler, memory_allocator), + sample_streamer(static_cast(QueryType::ZPassPixelCount64), runtime, rasterizer, + device, scheduler, memory_allocator), tfb_streamer(static_cast(QueryType::StreamingByteCount), runtime, device, scheduler, memory_allocator, staging_pool), primitives_succeeded_streamer( @@ -1277,6 +1319,10 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku return true; } } + if (!is_in_bc[0] && !is_in_bc[1]) { + // Both queries are in query cache, it's best to just flush. + return false; + } HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); return true; } From c8237d5c312485394389b2520451ef720604ea9a Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 20 Aug 2023 17:53:08 +0200 Subject: [PATCH 07/10] Query Cache: Implement host side sample counting. 
--- src/video_core/host_shaders/CMakeLists.txt | 1 + .../host_shaders/queries_prefix_scan_sum.comp | 124 +++++++++++++++ .../renderer_vulkan/vk_compute_pass.cpp | 110 ++++++++++++- .../renderer_vulkan/vk_compute_pass.h | 14 ++ .../renderer_vulkan/vk_query_cache.cpp | 147 ++++++++++++------ 5 files changed, 348 insertions(+), 48 deletions(-) create mode 100644 src/video_core/host_shaders/queries_prefix_scan_sum.comp diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index fb24b6532..8218ec4c8 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -41,6 +41,7 @@ set(SHADER_FILES pitch_unswizzle.comp present_bicubic.frag present_gaussian.frag + queries_prefix_scan_sum.comp resolve_conditional_render.comp smaa_edge_detection.vert smaa_edge_detection.frag diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp new file mode 100644 index 000000000..dce1279fe --- /dev/null +++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp @@ -0,0 +1,124 @@ +// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel +// SPDX-License-Identifier: MIT + +// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and +// Nicholas Haemel. 
Modified to suit needs and optimize for subgroup + +#version 460 core + +#ifdef VULKAN + +#extension GL_KHR_shader_subgroup_arithmetic : enable +#define HAS_EXTENDED_TYPES 1 +#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { +#define END_PUSH_CONSTANTS \ + } \ + ; +#define UNIFORM(n) +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 1 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#extension GL_KHR_shader_subgroup_arithmetic : enable +#extension GL_NV_gpu_shader5 : enable +#ifdef GL_NV_gpu_shader5 +#define HAS_EXTENDED_TYPES 1 +#else +#define HAS_EXTENDED_TYPES 0 +#endif +#define BEGIN_PUSH_CONSTANTS +#define END_PUSH_CONSTANTS +#define UNIFORM(n) layout(location = n) uniform +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 0 + +#endif + +BEGIN_PUSH_CONSTANTS +UNIFORM(0) uint max_accumulation_base; +UNIFORM(1) uint accumulation_limit; +END_PUSH_CONSTANTS + +layout(local_size_x = 32) in; + +layout(std430, binding = 0) readonly buffer block1 { + uvec2 input_data[gl_WorkGroupSize.x]; +}; + +layout(std430, binding = 1) writeonly coherent buffer block2 { + uvec2 output_data[gl_WorkGroupSize.x]; +}; + +layout(std430, binding = 2) coherent buffer block3 { + uvec2 accumulated_data; +}; + +shared uvec2 shared_data[gl_WorkGroupSize.x * 2]; + +uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { + uint carry = 0; + uvec2 result; + result.x = uaddCarry(value_1.x, value_2.x, carry); + result.y = value_1.y + value_2.y + carry; + return result; +} + +void main(void) { + uint id = gl_LocalInvocationID.x; + uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0); + uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? 
accumulated_data : uvec2(0); + uint work_size = gl_WorkGroupSize.x; + uint rd_id; + uint wr_id; + uint mask; + uvec2 input_1 = input_data[id * 2]; + uvec2 input_2 = input_data[id * 2 + 1]; + // The number of steps is the log base 2 of the + // work group size, which should be a power of 2 + const uint steps = uint(log2(work_size)) + 1; + uint step = 0; + + // Each invocation is responsible for the content of + // two elements of the output array + shared_data[id * 2] = input_1; + shared_data[id * 2 + 1] = input_2; + // Synchronize to make sure that everyone has initialized + // their elements of shared_data[] with data loaded from + // the input arrays + barrier(); + memoryBarrierShared(); + // For each step... + for (step = 0; step < steps; step++) { + // Calculate the read and write index in the + // shared array + mask = (1 << step) - 1; + rd_id = ((id >> step) << (step + 1)) + mask; + wr_id = rd_id + 1 + (id & mask); + // Accumulate the read data into our element + + shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]); + // Synchronize again to make sure that everyone + // has caught up with us + barrier(); + memoryBarrierShared(); + } + // Add the accumulation + shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1); + shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2); + barrier(); + memoryBarrierShared(); + + // Finally write our data back to the output buffer + output_data[id * 2] = shared_data[id * 2]; + output_data[id * 2 + 1] = shared_data[id * 2 + 1]; + if (id == 0) { + if (max_accumulation_base >= accumulation_limit + 1) { + accumulated_data = shared_data[accumulation_limit]; + return; + } + uvec2 value_1 = shared_data[max_accumulation_base]; + uvec2 value_2 = shared_data[accumulation_limit]; + accumulated_data = AddUint64(value_1, -value_2); + } +} \ No newline at end of file diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 
039dc95e1..a1af08cda 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -12,6 +12,7 @@ #include "common/common_types.h" #include "common/div_ceil.h" #include "video_core/host_shaders/astc_decoder_comp_spv.h" +#include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h" #include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" @@ -58,6 +59,30 @@ constexpr std::array INPUT_OUTPUT_DESCRIPTOR_SE }, }}; +constexpr std::array QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS{{ + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = 2, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, +}}; + constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ .uniform_buffers = 0, .storage_buffers = 2, @@ -68,6 +93,16 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ .score = 2, }; +constexpr DescriptorBankInfo QUERIES_SCAN_BANK_INFO{ + .uniform_buffers = 0, + .storage_buffers = 3, + .texture_buffers = 0, + .image_buffers = 0, + .textures = 0, + .images = 0, + .score = 3, +}; + constexpr std::array ASTC_DESCRIPTOR_SET_BINDINGS{{ { .binding = ASTC_BINDING_INPUT_BUFFER, @@ -104,6 +139,15 @@ constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLAT .stride = sizeof(DescriptorUpdateEntry), }; +constexpr VkDescriptorUpdateTemplateEntry QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE{ + .dstBinding = 0, + .dstArrayElement = 0, + 
.descriptorCount = 3, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = 0, + .stride = sizeof(DescriptorUpdateEntry), +}; + constexpr std::array ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{ { @@ -132,6 +176,11 @@ struct AstcPushConstants { u32 block_height; u32 block_height_mask; }; + +struct QueriesPrefixScanPushConstants { + u32 max_accumulation_base; + u32 accumulation_limit; +}; } // Anonymous namespace ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, @@ -313,8 +362,6 @@ ConditionalRenderingResolvePass::ConditionalRenderingResolvePass( void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, u32 src_offset, bool compare_to_zero) { - scheduler.RequestOutsideRenderPassOperationContext(); - const size_t compare_size = compare_to_zero ? 8 : 24; compute_pass_descriptor_queue.Acquire(); @@ -327,7 +374,7 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_ static constexpr VkMemoryBarrier read_barrier{ .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, .pNext = nullptr, - .srcAccessMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, }; static constexpr VkMemoryBarrier write_barrier{ @@ -349,6 +396,63 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_ }); } +QueriesPrefixScanPass::QueriesPrefixScanPass( + const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_) + : ComputePass(device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, + QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, + COMPUTE_PUSH_CONSTANT_RANGE, + QUERIES_PREFIX_SCAN_SUM_COMP_SPV), + scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} + +void QueriesPrefixScanPass::Run(VkBuffer 
accumulation_buffer, VkBuffer dst_buffer, + VkBuffer src_buffer, size_t number_of_sums, + size_t max_accumulation_limit) { + size_t aligned_runs = Common::AlignUp(number_of_sums, 32); + + compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, aligned_runs * sizeof(u64)); + compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, aligned_runs * sizeof(u64)); + compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64)); + const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([this, descriptor_data, max_accumulation_limit, number_of_sums, + aligned_runs](vk::CommandBuffer cmdbuf) { + static constexpr VkMemoryBarrier read_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + }; + static constexpr VkMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT | + VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | + VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | + VK_ACCESS_UNIFORM_READ_BIT | + VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, + }; + const QueriesPrefixScanPushConstants uniforms{ + .max_accumulation_base = static_cast(max_accumulation_limit), + .accumulation_limit = static_cast(number_of_sums - 1), + }; + const VkDescriptorSet set = descriptor_allocator.Commit(); + device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); + + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 
0, set, {}); + cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); + cmdbuf.Dispatch(static_cast(aligned_runs / 32U), 1, 1); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier); + }); +} + ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, StagingBufferPool& staging_buffer_pool_, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index c62f30d30..e6ff86e9a 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -95,6 +95,20 @@ private: ComputePassDescriptorQueue& compute_pass_descriptor_queue; }; +class QueriesPrefixScanPass final : public ComputePass { +public: + explicit QueriesPrefixScanPass(const Device& device_, Scheduler& scheduler_, + DescriptorPool& descriptor_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_); + + void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer, + size_t number_of_sums, size_t max_accumulation_limit); + +private: + Scheduler& scheduler; + ComputePassDescriptorQueue& compute_pass_descriptor_queue; +}; + class ASTCDecoderPass final : public ComputePass { public: explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 2147776f8..ded190ae0 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -11,6 +11,7 @@ #include #include +#include "common/bit_util.h" #include "common/common_types.h" #include "core/memory.h" #include "video_core/engines/draw_manager.h" @@ -112,14 +113,34 @@ class SamplesStreamer : public BaseStreamer { public: explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_, 
VideoCore::RasterizerInterface* rasterizer_, const Device& device_, - Scheduler& scheduler_, const MemoryAllocator& memory_allocator_) + Scheduler& scheduler_, const MemoryAllocator& memory_allocator_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue, + DescriptorPool& descriptor_pool) : BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_}, scheduler{scheduler_}, memory_allocator{memory_allocator_} { - BuildResolveBuffer(); current_bank = nullptr; current_query = nullptr; ammend_value = 0; acumulation_value = 0; + queries_prefix_scan_pass = std::make_unique( + device, scheduler, descriptor_pool, compute_pass_descriptor_queue); + + const VkBufferCreateInfo buffer_ci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = 8, + .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + accumulation_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, 8, 0); + }); } ~SamplesStreamer() = default; @@ -159,6 +180,8 @@ public: acumulation_value = 0; }); rasterizer->SyncOperation(std::move(func)); + accumulation_since_last_sync = false; + last_accumulation_checkpoint = std::min(last_accumulation_checkpoint, num_slots_used); } void CloseCounter() override { @@ -175,7 +198,8 @@ public: } for (size_t i = 0; i < sync_values_stash.size(); i++) { - runtime.template SyncValues(sync_values_stash[i], *resolve_buffers[i]); + runtime.template SyncValues(sync_values_stash[i], + *buffers[resolve_buffers[i]]); } sync_values_stash.clear(); @@ -189,36 +213,21 @@ public: sync_values_stash.clear(); sync_values_stash.emplace_back(); std::vector* sync_values = 
&sync_values_stash.back(); - sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE); + sync_values->reserve(num_slots_used); std::unordered_map> offsets; - size_t this_bank_slot = std::numeric_limits::max(); - size_t resolve_slots_remaining = resolve_slots; - size_t resolve_buffer_index = 0; + resolve_buffers.clear(); + size_t resolve_buffer_index = ObtainBuffer(num_slots_used); + resolve_buffers.push_back(resolve_buffer_index); + size_t base_offset = 0; + ApplyBanksWideOp(pending_sync, [&](SamplesQueryBank* bank, size_t start, size_t amount) { size_t bank_id = bank->GetIndex(); - if (this_bank_slot != bank_id) { - this_bank_slot = bank_id; - if (resolve_slots_remaining == 0) { - resolve_buffer_index++; - if (resolve_buffer_index >= resolve_buffers.size()) { - BuildResolveBuffer(); - } - resolve_slots_remaining = resolve_slots; - sync_values_stash.emplace_back(); - sync_values = &sync_values_stash.back(); - sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE); - } - resolve_slots_remaining--; - } - auto& resolve_buffer = resolve_buffers[resolve_buffer_index]; - const size_t base_offset = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * - (resolve_slots - resolve_slots_remaining - 1); + auto& resolve_buffer = buffers[resolve_buffer_index]; VkQueryPool query_pool = bank->GetInnerPool(); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([start, amount, base_offset, query_pool, buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) { - size_t final_offset = base_offset + start * SamplesQueryBank::QUERY_SIZE; const VkBufferMemoryBarrier copy_query_pool_barrier{ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, .pNext = nullptr, @@ -227,39 +236,60 @@ public: .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .buffer = buffer, - .offset = final_offset, + .offset = base_offset, .size = amount * SamplesQueryBank::QUERY_SIZE, }; cmdbuf.CopyQueryPoolResults( query_pool, 
static_cast(start), static_cast(amount), buffer, - static_cast(final_offset), SamplesQueryBank::QUERY_SIZE, + static_cast(base_offset), SamplesQueryBank::QUERY_SIZE, VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT); cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier); }); - offsets[bank_id] = {sync_values_stash.size() - 1, base_offset}; + offsets[bank_id] = {start, base_offset}; + base_offset += amount * SamplesQueryBank::QUERY_SIZE; }); // Convert queries + bool has_multi_queries = false; for (auto q : pending_sync) { auto* query = GetQuery(q); + size_t sync_value_slot = 0; if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) { continue; } if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) { continue; } - if (query->size_slots > 1) { - // This is problematic. - // UNIMPLEMENTED(); + if (accumulation_since_last_sync || query->size_slots > 1) { + if (!has_multi_queries) { + has_multi_queries = true; + sync_values_stash.emplace_back(); + } + sync_value_slot = 1; } query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; auto loc_data = offsets[query->start_bank_id]; - sync_values_stash[loc_data.first].emplace_back(HostSyncValues{ + sync_values_stash[sync_value_slot].emplace_back(HostSyncValues{ .address = query->guest_address, .size = SamplesQueryBank::QUERY_SIZE, - .offset = loc_data.second + query->start_slot * SamplesQueryBank::QUERY_SIZE, + .offset = + loc_data.second + (query->start_slot - loc_data.first + query->size_slots - 1) * + SamplesQueryBank::QUERY_SIZE, + }); + } + + if (has_multi_queries) { + size_t intermediary_buffer_index = ObtainBuffer(num_slots_used); + resolve_buffers.push_back(intermediary_buffer_index); + queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index], + *buffers[resolve_buffer_index], num_slots_used, + std::min(last_accumulation_checkpoint, num_slots_used)); + } else { + 
scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, 8, 0); }); } @@ -267,6 +297,9 @@ public: std::function func([this] { ammend_value = acumulation_value; }); rasterizer->SyncOperation(std::move(func)); AbandonCurrentQuery(); + num_slots_used = 0; + last_accumulation_checkpoint = std::numeric_limits::max(); + accumulation_since_last_sync = has_multi_queries; pending_sync.clear(); } @@ -400,6 +433,7 @@ private: void ReserveHostQuery() { size_t new_slot = ReserveBankSlot(); current_bank->AddReference(1); + num_slots_used++; if (current_query) { size_t bank_id = current_query->start_bank_id; size_t banks_set = current_query->size_banks - 1; @@ -470,32 +504,50 @@ private: }); } - void BuildResolveBuffer() { + template + size_t ObtainBuffer(size_t num_needed) { + const size_t log_2 = std::max(6U, Common::Log2Ceil64(num_needed)); + if constexpr (is_resolve) { + if (resolve_table[log_2] != 0) { + return resolve_table[log_2] - 1; + } + } else { + if (intermediary_table[log_2] != 0) { + return intermediary_table[log_2] - 1; + } + } const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, - .size = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * resolve_slots, + .size = SamplesQueryBank::QUERY_SIZE * (1ULL << log_2), .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, }; - resolve_buffers.emplace_back( - memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal)); + buffers.emplace_back(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal)); + if constexpr (is_resolve) { + resolve_table[log_2] = buffers.size(); + } else { + intermediary_table[log_2] = buffers.size(); + } + return buffers.size() - 1; } - static 
constexpr size_t resolve_slots = 8; - QueryCacheRuntime& runtime; VideoCore::RasterizerInterface* rasterizer; const Device& device; Scheduler& scheduler; const MemoryAllocator& memory_allocator; VideoCommon::BankPool bank_pool; - std::deque resolve_buffers; + std::deque buffers; + std::array resolve_table{}; + std::array intermediary_table{}; + vk::Buffer accumulation_buffer; std::deque> sync_values_stash; + std::vector resolve_buffers; // syncing queue std::vector pending_sync; @@ -510,10 +562,14 @@ private: SamplesQueryBank* current_bank; VkQueryPool current_query_pool; size_t current_query_id; + size_t num_slots_used{}; + size_t last_accumulation_checkpoint{}; + bool accumulation_since_last_sync{}; VideoCommon::HostQueryBase* current_query; bool has_started{}; - bool current_unset{}; std::mutex flush_guard; + + std::unique_ptr queries_prefix_scan_pass; }; // Transform feedback queries @@ -1090,7 +1146,8 @@ struct QueryCacheRuntimeImpl { memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_}, guest_streamer(0, runtime), sample_streamer(static_cast(QueryType::ZPassPixelCount64), runtime, rasterizer, - device, scheduler, memory_allocator), + device, scheduler, memory_allocator, compute_pass_descriptor_queue, + descriptor_pool), tfb_streamer(static_cast(QueryType::StreamingByteCount), runtime, device, scheduler, memory_allocator, staging_pool), primitives_succeeded_streamer( @@ -1319,10 +1376,10 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku return true; } } - if (!is_in_bc[0] && !is_in_bc[1]) { + /*if (!is_in_bc[0] && !is_in_bc[1]) { // Both queries are in query cache, it's best to just flush. 
- return false; - } + return true; + }*/ HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); return true; } From a07c88e686fb9b65924876d472a8184f1f1849df Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 22 Aug 2023 12:28:25 +0200 Subject: [PATCH 08/10] Query Cache: Simplify Prefix Sum compute shader --- src/video_core/host_shaders/CMakeLists.txt | 4 +- .../host_shaders/queries_prefix_scan_sum.comp | 166 +++++++++++------- .../queries_prefix_scan_sum_nosubgroups.comp | 120 +++++++++++++ .../renderer_vulkan/vk_compute_pass.cpp | 27 ++- .../renderer_vulkan/vk_compute_pass.h | 4 +- .../renderer_vulkan/vk_query_cache.cpp | 4 +- 6 files changed, 252 insertions(+), 73 deletions(-) create mode 100644 src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 8218ec4c8..6b912027f 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -42,6 +42,7 @@ set(SHADER_FILES present_bicubic.frag present_gaussian.frag queries_prefix_scan_sum.comp + queries_prefix_scan_sum_nosubgroups.comp resolve_conditional_render.comp smaa_edge_detection.vert smaa_edge_detection.frag @@ -72,6 +73,7 @@ if ("${GLSLANGVALIDATOR}" STREQUAL "GLSLANGVALIDATOR-NOTFOUND") endif() set(GLSL_FLAGS "") +set(SPIR_V_VERSION "spirv1.3") set(QUIET_FLAG "--quiet") set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) @@ -125,7 +127,7 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES}) OUTPUT ${SPIRV_HEADER_FILE} COMMAND - ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} + ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} --target-env ${SPIR_V_VERSION} MAIN_DEPENDENCY ${SOURCE_FILE} ) diff --git 
a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp index dce1279fe..8f10e248e 100644 --- a/src/video_core/host_shaders/queries_prefix_scan_sum.comp +++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp @@ -1,26 +1,24 @@ -// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel -// SPDX-License-Identifier: MIT - -// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and -// Nicholas Haemel. Modified to suit needs and optimize for subgroup +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later #version 460 core +#extension GL_KHR_shader_subgroup_basic : require +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_KHR_shader_subgroup_shuffle_relative : require +#extension GL_KHR_shader_subgroup_arithmetic : require + #ifdef VULKAN -#extension GL_KHR_shader_subgroup_arithmetic : enable #define HAS_EXTENDED_TYPES 1 #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { -#define END_PUSH_CONSTANTS \ - } \ - ; +#define END_PUSH_CONSTANTS }; #define UNIFORM(n) #define BINDING_INPUT_BUFFER 0 #define BINDING_OUTPUT_IMAGE 1 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv -#extension GL_KHR_shader_subgroup_arithmetic : enable #extension GL_NV_gpu_shader5 : enable #ifdef GL_NV_gpu_shader5 #define HAS_EXTENDED_TYPES 1 @@ -43,19 +41,20 @@ END_PUSH_CONSTANTS layout(local_size_x = 32) in; layout(std430, binding = 0) readonly buffer block1 { - uvec2 input_data[gl_WorkGroupSize.x]; + uvec2 input_data[]; }; -layout(std430, binding = 1) writeonly coherent buffer block2 { - uvec2 output_data[gl_WorkGroupSize.x]; +layout(std430, binding = 1) coherent buffer block2 { + uvec2 output_data[]; }; layout(std430, binding = 2) coherent buffer block3 { uvec2 accumulated_data; }; -shared uvec2 shared_data[gl_WorkGroupSize.x * 2]; +shared uvec2 shared_data[2]; 
+// Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64 uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { uint carry = 0; uvec2 result; @@ -64,61 +63,102 @@ uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { return result; } -void main(void) { - uint id = gl_LocalInvocationID.x; - uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0); - uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0); - uint work_size = gl_WorkGroupSize.x; - uint rd_id; - uint wr_id; - uint mask; - uvec2 input_1 = input_data[id * 2]; - uvec2 input_2 = input_data[id * 2 + 1]; - // The number of steps is the log base 2 of the - // work group size, which should be a power of 2 - const uint steps = uint(log2(work_size)) + 1; - uint step = 0; +// do subgroup Prefix Sum using Hillis and Steele's algorithm +uvec2 subgroupInclusiveAddUint64(uvec2 value) { + uvec2 result = value; + for (uint i = 1; i < gl_SubgroupSize; i *= 2) { + if (i <= gl_SubgroupInvocationID) { + uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i; + result = AddUint64(result, other); + } + } + return result; +} - // Each invocation is responsible for the content of - // two elements of the output array - shared_data[id * 2] = input_1; - shared_data[id * 2 + 1] = input_2; - // Synchronize to make sure that everyone has initialized - // their elements of shared_data[] with data loaded from - // the input arrays +// Writes down the results to the output buffer and to the accumulation buffer +void WriteResults(uvec2 result) { + uint current_global_id = gl_GlobalInvocationID.x; + uvec2 base_data = current_global_id < max_accumulation_base ? 
accumulated_data : uvec2(0); + output_data[current_global_id] = result + base_data; + if (max_accumulation_base >= accumulation_limit + 1) { + if (current_global_id == accumulation_limit) { + accumulated_data = result; + } + return; + } + // We have that ugly case in which the accumulation data is reset in the middle somewhere. + barrier(); + groupMemoryBarrier(); + if (current_global_id == accumulation_limit) { + uvec2 value_1 = output_data[max_accumulation_base]; + accumulated_data = AddUint64(result, -value_1); + } +} + +void main() { + uint subgroup_inv_id = gl_SubgroupInvocationID; + uint subgroup_id = gl_SubgroupID; + uint last_subgroup_id = subgroupMax(subgroup_inv_id); + uint current_global_id = gl_GlobalInvocationID.x; + uint total_work = gl_NumWorkGroups.x * gl_WorkGroupSize.x; + uvec2 data = input_data[current_global_id]; + // make sure all input data has been loaded + subgroupBarrier(); + subgroupMemoryBarrier(); + + uvec2 result = subgroupInclusiveAddUint64(data); + + // if we had less queries than our subgroup, just write down the results. + if (total_work <= gl_SubgroupSize) { // This condition is constant per dispatch. + WriteResults(result); + return; + } + + // We now have more, so lets write the last result into shared memory. + // Only pick the last subgroup. + if (subgroup_inv_id == last_subgroup_id) { + shared_data[subgroup_id] = result; + } + // wait until everyone loaded their stuffs barrier(); memoryBarrierShared(); - // For each step... 
- for (step = 0; step < steps; step++) { - // Calculate the read and write index in the - // shared array - mask = (1 << step) - 1; - rd_id = ((id >> step) << (step + 1)) + mask; - wr_id = rd_id + 1 + (id & mask); - // Accumulate the read data into our element - shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]); - // Synchronize again to make sure that everyone - // has caught up with us + // Case 1: the total work for the grouped results can be calculated in a single subgroup + // operation (about 1024 queries). + uint total_extra_work = gl_NumSubgroups * gl_NumWorkGroups.x; + if (total_extra_work <= gl_SubgroupSize) { // This condition is constant per dispatch. + if (subgroup_id != 0) { + uvec2 tmp = shared_data[subgroup_inv_id]; + subgroupBarrier(); + subgroupMemoryBarrierShared(); + tmp = subgroupInclusiveAddUint64(tmp); + result = AddUint64(result, subgroupShuffle(tmp, subgroup_id - 1)); + } + + WriteResults(result); + return; + } + + // Case 2: our work amount is huge, so lets do it in O(log n) steps. + const uint extra = (total_extra_work ^ (total_extra_work - 1)) != 0 ? 1 : 0; + const uint steps = 1 << (findMSB(total_extra_work) + extra); + uint step; + // Hillis and Steele's algorithm + for (step = 1; step < steps; step *= 2) { + if (current_global_id < steps && current_global_id >= step) { + uvec2 current = shared_data[current_global_id]; + uvec2 other = shared_data[current_global_id - step]; + shared_data[current_global_id] = AddUint64(current, other); + } + // steps is constant, so this will always execute in ever workgroup's thread. 
barrier(); memoryBarrierShared(); } - // Add the accumulation - shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1); - shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2); - barrier(); - memoryBarrierShared(); - - // Finally write our data back to the output buffer - output_data[id * 2] = shared_data[id * 2]; - output_data[id * 2 + 1] = shared_data[id * 2 + 1]; - if (id == 0) { - if (max_accumulation_base >= accumulation_limit + 1) { - accumulated_data = shared_data[accumulation_limit]; - return; - } - uvec2 value_1 = shared_data[max_accumulation_base]; - uvec2 value_2 = shared_data[accumulation_limit]; - accumulated_data = AddUint64(value_1, -value_2); + // Only add results for groups higher than 0 + if (subgroup_id != 0) { + result = AddUint64(result, shared_data[subgroup_id - 1]); } + + // Just write the final results. We are done + WriteResults(result); } \ No newline at end of file diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp new file mode 100644 index 000000000..8021476ed --- /dev/null +++ b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp @@ -0,0 +1,120 @@ +// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel +// SPDX-License-Identifier: MIT + +// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and +// Nicholas Haemel. Modified to suit needs. 
+ +#version 460 core + +#ifdef VULKAN + +#define HAS_EXTENDED_TYPES 1 +#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { +#define END_PUSH_CONSTANTS }; +#define UNIFORM(n) +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 1 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#extension GL_NV_gpu_shader5 : enable +#ifdef GL_NV_gpu_shader5 +#define HAS_EXTENDED_TYPES 1 +#else +#define HAS_EXTENDED_TYPES 0 +#endif +#define BEGIN_PUSH_CONSTANTS +#define END_PUSH_CONSTANTS +#define UNIFORM(n) layout(location = n) uniform +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 0 + +#endif + +BEGIN_PUSH_CONSTANTS +UNIFORM(0) uint max_accumulation_base; +UNIFORM(1) uint accumulation_limit; +END_PUSH_CONSTANTS + +layout(local_size_x = 32) in; + +layout(std430, binding = 0) readonly buffer block1 { + uvec2 input_data[gl_WorkGroupSize.x]; +}; + +layout(std430, binding = 1) writeonly coherent buffer block2 { + uvec2 output_data[gl_WorkGroupSize.x]; +}; + +layout(std430, binding = 2) coherent buffer block3 { + uvec2 accumulated_data; +}; + +shared uvec2 shared_data[gl_WorkGroupSize.x * 2]; + +uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { + uint carry = 0; + uvec2 result; + result.x = uaddCarry(value_1.x, value_2.x, carry); + result.y = value_1.y + value_2.y + carry; + return result; +} + +void main(void) { + uint id = gl_LocalInvocationID.x; + uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0); + uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? 
accumulated_data : uvec2(0); + uint work_size = gl_WorkGroupSize.x; + uint rd_id; + uint wr_id; + uint mask; + uvec2 input_1 = input_data[id * 2]; + uvec2 input_2 = input_data[id * 2 + 1]; + // The number of steps is the log base 2 of the + // work group size, which should be a power of 2 + const uint steps = uint(log2(work_size)) + 1; + uint step = 0; + + // Each invocation is responsible for the content of + // two elements of the output array + shared_data[id * 2] = input_1; + shared_data[id * 2 + 1] = input_2; + // Synchronize to make sure that everyone has initialized + // their elements of shared_data[] with data loaded from + // the input arrays + barrier(); + memoryBarrierShared(); + // For each step... + for (step = 0; step < steps; step++) { + // Calculate the read and write index in the + // shared array + mask = (1 << step) - 1; + rd_id = ((id >> step) << (step + 1)) + mask; + wr_id = rd_id + 1 + (id & mask); + // Accumulate the read data into our element + + shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]); + // Synchronize again to make sure that everyone + // has caught up with us + barrier(); + memoryBarrierShared(); + } + // Add the accumulation + shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1); + shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2); + barrier(); + memoryBarrierShared(); + + // Finally write our data back to the output buffer + output_data[id * 2] = shared_data[id * 2]; + output_data[id * 2 + 1] = shared_data[id * 2 + 1]; + if (id == 0) { + if (max_accumulation_base >= accumulation_limit + 1) { + accumulated_data = shared_data[accumulation_limit]; + return; + } + uvec2 value_1 = shared_data[max_accumulation_base]; + uvec2 value_2 = shared_data[accumulation_limit]; + accumulated_data = AddUint64(value_1, -value_2); + } +} \ No newline at end of file diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 
a1af08cda..44ec5a032 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -13,6 +13,7 @@ #include "common/div_ceil.h" #include "video_core/host_shaders/astc_decoder_comp_spv.h" #include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h" +#include "video_core/host_shaders/queries_prefix_scan_sum_nosubgroups_comp_spv.h" #include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" @@ -187,7 +188,8 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, vk::Span bindings, vk::Span templates, const DescriptorBankInfo& bank_info, - vk::Span push_constants, std::span code) + vk::Span push_constants, std::span code, + std::optional optional_subgroup_size) : device{device_} { descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, @@ -228,13 +230,19 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, .pCode = code.data(), }); device.SaveShader(code); + const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, + .pNext = nullptr, + .requiredSubgroupSize = optional_subgroup_size ? *optional_subgroup_size : 32U, + }; + bool use_setup_size = device.IsExtSubgroupSizeControlSupported() && optional_subgroup_size; pipeline = device.GetLogical().CreateComputePipeline({ .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, .pNext = nullptr, .flags = 0, .stage{ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, - .pNext = nullptr, + .pNext = use_setup_size ? 
&subgroup_size_ci : nullptr, .flags = 0, .stage = VK_SHADER_STAGE_COMPUTE_BIT, .module = *module, @@ -399,10 +407,17 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_ QueriesPrefixScanPass::QueriesPrefixScanPass( const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, ComputePassDescriptorQueue& compute_pass_descriptor_queue_) - : ComputePass(device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, - QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, - COMPUTE_PUSH_CONSTANT_RANGE, - QUERIES_PREFIX_SCAN_SUM_COMP_SPV), + : ComputePass( + device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, + QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, + COMPUTE_PUSH_CONSTANT_RANGE, + device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_BASIC_BIT) && + device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) && + device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) && + device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) + ? 
std::span(QUERIES_PREFIX_SCAN_SUM_COMP_SPV) + : std::span(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV), + {32}), scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index e6ff86e9a..68ffb1b82 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -3,6 +3,7 @@ #pragma once +#include #include #include @@ -31,7 +32,8 @@ public: vk::Span bindings, vk::Span templates, const DescriptorBankInfo& bank_info, - vk::Span push_constants, std::span code); + vk::Span push_constants, std::span code, + std::optional optional_subgroup_size = std::nullopt); ~ComputePass(); protected: diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index ded190ae0..825e1a72e 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -1376,10 +1376,10 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku return true; } } - /*if (!is_in_bc[0] && !is_in_bc[1]) { + if (!is_in_bc[0] && !is_in_bc[1]) { // Both queries are in query cache, it's best to just flush. 
return true; - }*/ + } HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); return true; } From bf0d6b8806b7367a17bbeb2bb59f4bcba1fb1375 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 22 Aug 2023 17:44:03 +0200 Subject: [PATCH 09/10] Query Cache: Fix behavior in Normal Accuracy --- src/video_core/query_cache/query_cache.h | 13 +++++++++++++ src/video_core/renderer_vulkan/vk_query_cache.cpp | 10 ++++++++++ 2 files changed, 23 insertions(+) diff --git a/src/video_core/query_cache/query_cache.h b/src/video_core/query_cache/query_cache.h index 4b89b5bf6..78b42b518 100644 --- a/src/video_core/query_cache/query_cache.h +++ b/src/video_core/query_cache/query_cache.h @@ -256,6 +256,7 @@ void QueryCacheBase::CounterReport(GPUVAddr addr, QueryType counter_type u8* pointer = impl->cpu_memory.GetPointer(cpu_addr); u8* pointer_timestamp = impl->cpu_memory.GetPointer(cpu_addr + 8); bool is_synced = !Settings::IsGPULevelHigh() && is_fence; + std::function operation([this, is_synced, streamer, query_base = query, query_location, pointer, pointer_timestamp] { if (True(query_base->flags & QueryFlagBits::IsInvalidated)) { @@ -285,6 +286,18 @@ void QueryCacheBase::CounterReport(GPUVAddr addr, QueryType counter_type if (is_fence) { impl->rasterizer.SignalFence(std::move(operation)); } else { + if (!Settings::IsGPULevelHigh() && counter_type == QueryType::Payload) { + if (has_timestamp) { + u64 timestamp = impl->gpu.GetTicks(); + u64 value = static_cast(payload); + std::memcpy(pointer_timestamp, ×tamp, sizeof(timestamp)); + std::memcpy(pointer, &value, sizeof(value)); + } else { + std::memcpy(pointer, &payload, sizeof(payload)); + } + streamer->Free(new_query_id); + return; + } impl->rasterizer.SyncOperation(std::move(operation)); } if (is_synced) { diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 825e1a72e..2cc007716 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ 
b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -1365,6 +1365,11 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku return false; } + const bool is_gpu_high = Settings::IsGPULevelHigh(); + if (!is_gpu_high && impl->device.GetDriverID() == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) { + return true; + } + for (size_t i = 0; i < 2; i++) { is_null[i] = !is_in_ac[i] && check_value(objects[i]->address); } @@ -1376,6 +1381,11 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku return true; } } + + if (!is_gpu_high) { + return true; + } + if (!is_in_bc[0] && !is_in_bc[1]) { // Both queries are in query cache, it's best to just flush. return true; From 57d8cd6c40bbadeb30e7a4792267061cbad4d446 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Thu, 24 Aug 2023 03:58:59 +0200 Subject: [PATCH 10/10] Query Cache: Fix Prefix Sums --- .../host_shaders/queries_prefix_scan_sum.comp | 131 ++++++++++-------- .../queries_prefix_scan_sum_nosubgroups.comp | 60 +++++--- .../renderer_vulkan/vk_compute_pass.cpp | 99 +++++++------ .../renderer_vulkan/vk_compute_pass.h | 2 +- .../renderer_vulkan/vk_query_cache.cpp | 13 +- 5 files changed, 174 insertions(+), 131 deletions(-) diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp index 8f10e248e..6faa8981f 100644 --- a/src/video_core/host_shaders/queries_prefix_scan_sum.comp +++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp @@ -34,11 +34,16 @@ #endif BEGIN_PUSH_CONSTANTS -UNIFORM(0) uint max_accumulation_base; -UNIFORM(1) uint accumulation_limit; +UNIFORM(0) uint min_accumulation_base; +UNIFORM(1) uint max_accumulation_base; +UNIFORM(2) uint accumulation_limit; +UNIFORM(3) uint buffer_offset; END_PUSH_CONSTANTS -layout(local_size_x = 32) in; +#define LOCAL_RESULTS 8 +#define QUERIES_PER_INVOC 2048 + +layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in; layout(std430, binding = 0) 
readonly buffer block1 { uvec2 input_data[]; @@ -52,7 +57,7 @@ layout(std430, binding = 2) coherent buffer block3 { uvec2 accumulated_data; }; -shared uvec2 shared_data[2]; +shared uvec2 shared_data[128]; // Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64 uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { @@ -67,8 +72,8 @@ uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { uvec2 subgroupInclusiveAddUint64(uvec2 value) { uvec2 result = value; for (uint i = 1; i < gl_SubgroupSize; i *= 2) { + uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i; if (i <= gl_SubgroupInvocationID) { - uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i; result = AddUint64(result, other); } } @@ -76,89 +81,93 @@ uvec2 subgroupInclusiveAddUint64(uvec2 value) { } // Writes down the results to the output buffer and to the accumulation buffer -void WriteResults(uvec2 result) { - uint current_global_id = gl_GlobalInvocationID.x; - uvec2 base_data = current_global_id < max_accumulation_base ? accumulated_data : uvec2(0); - output_data[current_global_id] = result + base_data; - if (max_accumulation_base >= accumulation_limit + 1) { - if (current_global_id == accumulation_limit) { - accumulated_data = result; +void WriteResults(uvec2 results[LOCAL_RESULTS]) { + const uint current_id = gl_LocalInvocationID.x; + const uvec2 accum = accumulated_data; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + uvec2 base_data = current_id * LOCAL_RESULTS + i < min_accumulation_base ? 
accum : uvec2(0, 0); + results[i] = AddUint64(results[i], base_data); + } + for (uint i = 0; i < LOCAL_RESULTS; i++) { + output_data[buffer_offset + current_id * LOCAL_RESULTS + i] = results[i]; + } + uint index = accumulation_limit % LOCAL_RESULTS; + uint base_id = accumulation_limit / LOCAL_RESULTS; + if (min_accumulation_base >= accumulation_limit + 1) { + if (current_id == base_id) { + accumulated_data = results[index]; } return; } // We have that ugly case in which the accumulation data is reset in the middle somewhere. barrier(); groupMemoryBarrier(); - if (current_global_id == accumulation_limit) { - uvec2 value_1 = output_data[max_accumulation_base]; - accumulated_data = AddUint64(result, -value_1); + + if (current_id == base_id) { + uvec2 reset_value = output_data[max_accumulation_base - 1]; + // Calculate two's complement / negate manually + reset_value = AddUint64(uvec2(1,0), ~reset_value); + accumulated_data = AddUint64(results[index], reset_value); } } void main() { - uint subgroup_inv_id = gl_SubgroupInvocationID; - uint subgroup_id = gl_SubgroupID; - uint last_subgroup_id = subgroupMax(subgroup_inv_id); - uint current_global_id = gl_GlobalInvocationID.x; - uint total_work = gl_NumWorkGroups.x * gl_WorkGroupSize.x; - uvec2 data = input_data[current_global_id]; + const uint subgroup_inv_id = gl_SubgroupInvocationID; + const uint subgroup_id = gl_SubgroupID + gl_WorkGroupID.x * gl_NumSubgroups; + const uint last_subgroup_id = subgroupMax(subgroup_inv_id); + const uint current_id = gl_LocalInvocationID.x; + const uint total_work = accumulation_limit; + const uint last_result_id = LOCAL_RESULTS - 1; + uvec2 data[LOCAL_RESULTS]; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + data[i] = input_data[buffer_offset + current_id * LOCAL_RESULTS + i]; + } + uvec2 results[LOCAL_RESULTS]; + results[0] = data[0]; + for (uint i = 1; i < LOCAL_RESULTS; i++) { + results[i] = AddUint64(data[i], results[i - 1]); + } // make sure all input data has been loaded subgroupBarrier();
subgroupMemoryBarrier(); - uvec2 result = subgroupInclusiveAddUint64(data); + // on the last local result, do a subgroup inclusive scan sum + results[last_result_id] = subgroupInclusiveAddUint64(results[last_result_id]); + // get the last local result from the subgroup behind the current + uvec2 result_behind = subgroupShuffleUp(results[last_result_id], 1); + if (subgroup_inv_id != 0) { + for (uint i = 1; i < LOCAL_RESULTS; i++) { + results[i - 1] = AddUint64(results[i - 1], result_behind); + } + } // if we had less queries than our subgroup, just write down the results. - if (total_work <= gl_SubgroupSize) { // This condition is constant per dispatch. - WriteResults(result); + if (total_work <= gl_SubgroupSize * LOCAL_RESULTS) { // This condition is constant per dispatch. + WriteResults(results); return; } // We now have more, so lets write the last result into shared memory. // Only pick the last subgroup. if (subgroup_inv_id == last_subgroup_id) { - shared_data[subgroup_id] = result; + shared_data[subgroup_id] = results[last_result_id]; } // wait until everyone loaded their stuffs barrier(); memoryBarrierShared(); - // Case 1: the total work for the grouped results can be calculated in a single subgroup - // operation (about 1024 queries). - uint total_extra_work = gl_NumSubgroups * gl_NumWorkGroups.x; - if (total_extra_work <= gl_SubgroupSize) { // This condition is constant per dispatch. - if (subgroup_id != 0) { - uvec2 tmp = shared_data[subgroup_inv_id]; - subgroupBarrier(); - subgroupMemoryBarrierShared(); - tmp = subgroupInclusiveAddUint64(tmp); - result = AddUint64(result, subgroupShuffle(tmp, subgroup_id - 1)); - } - - WriteResults(result); - return; - } - - // Case 2: our work amount is huge, so lets do it in O(log n) steps. - const uint extra = (total_extra_work ^ (total_extra_work - 1)) != 0 ? 
1 : 0; - const uint steps = 1 << (findMSB(total_extra_work) + extra); - uint step; - // Hillis and Steele's algorithm - for (step = 1; step < steps; step *= 2) { - if (current_global_id < steps && current_global_id >= step) { - uvec2 current = shared_data[current_global_id]; - uvec2 other = shared_data[current_global_id - step]; - shared_data[current_global_id] = AddUint64(current, other); - } - // steps is constant, so this will always execute in ever workgroup's thread. - barrier(); - memoryBarrierShared(); - } - // Only add results for groups higher than 0 + // only if it's not the first subgroup if (subgroup_id != 0) { - result = AddUint64(result, shared_data[subgroup_id - 1]); + // get the results from some previous invocation + uvec2 tmp = shared_data[subgroup_inv_id]; + subgroupBarrier(); + subgroupMemoryBarrierShared(); + tmp = subgroupInclusiveAddUint64(tmp); + // obtain the result that would be equivalent to the previous result + uvec2 shuffled_result = subgroupShuffle(tmp, subgroup_id - 1); + for (uint i = 0; i < LOCAL_RESULTS; i++) { + results[i] = AddUint64(results[i], shuffled_result); + } } - - // Just write the final results. 
We are done - WriteResults(result); + WriteResults(results); } \ No newline at end of file diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp index 8021476ed..559a213b9 100644 --- a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp +++ b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp @@ -32,25 +32,30 @@ #endif BEGIN_PUSH_CONSTANTS -UNIFORM(0) uint max_accumulation_base; -UNIFORM(1) uint accumulation_limit; +UNIFORM(0) uint min_accumulation_base; +UNIFORM(1) uint max_accumulation_base; +UNIFORM(2) uint accumulation_limit; +UNIFORM(3) uint buffer_offset; END_PUSH_CONSTANTS -layout(local_size_x = 32) in; +#define LOCAL_RESULTS 4 +#define QUERIES_PER_INVOC 2048 + +layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in; layout(std430, binding = 0) readonly buffer block1 { - uvec2 input_data[gl_WorkGroupSize.x]; + uvec2 input_data[gl_WorkGroupSize.x * LOCAL_RESULTS]; }; layout(std430, binding = 1) writeonly coherent buffer block2 { - uvec2 output_data[gl_WorkGroupSize.x]; + uvec2 output_data[gl_WorkGroupSize.x * LOCAL_RESULTS]; }; layout(std430, binding = 2) coherent buffer block3 { uvec2 accumulated_data; }; -shared uvec2 shared_data[gl_WorkGroupSize.x * 2]; +shared uvec2 shared_data[gl_WorkGroupSize.x * LOCAL_RESULTS]; uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { uint carry = 0; @@ -62,23 +67,31 @@ uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { void main(void) { uint id = gl_LocalInvocationID.x; - uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0); - uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0); + uvec2 base_value[LOCAL_RESULTS]; + const uvec2 accum = accumulated_data; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + base_value[i] = (buffer_offset + id * LOCAL_RESULTS + i) < min_accumulation_base + ? 
accumulated_data + : uvec2(0); + } uint work_size = gl_WorkGroupSize.x; uint rd_id; uint wr_id; uint mask; - uvec2 input_1 = input_data[id * 2]; - uvec2 input_2 = input_data[id * 2 + 1]; + uvec2 inputs[LOCAL_RESULTS]; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + inputs[i] = input_data[buffer_offset + id * LOCAL_RESULTS + i]; + } // The number of steps is the log base 2 of the // work group size, which should be a power of 2 - const uint steps = uint(log2(work_size)) + 1; + const uint steps = uint(log2(work_size)) + uint(log2(LOCAL_RESULTS)); uint step = 0; // Each invocation is responsible for the content of // two elements of the output array - shared_data[id * 2] = input_1; - shared_data[id * 2 + 1] = input_2; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + shared_data[id * LOCAL_RESULTS + i] = inputs[i]; + } // Synchronize to make sure that everyone has initialized // their elements of shared_data[] with data loaded from // the input arrays @@ -100,21 +113,26 @@ void main(void) { memoryBarrierShared(); } // Add the accumulation - shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1); - shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2); + for (uint i = 0; i < LOCAL_RESULTS; i++) { + shared_data[id * LOCAL_RESULTS + i] = + AddUint64(shared_data[id * LOCAL_RESULTS + i], base_value[i]); + } barrier(); memoryBarrierShared(); // Finally write our data back to the output buffer - output_data[id * 2] = shared_data[id * 2]; - output_data[id * 2 + 1] = shared_data[id * 2 + 1]; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + output_data[buffer_offset + id * LOCAL_RESULTS + i] = shared_data[id * LOCAL_RESULTS + i]; + } if (id == 0) { - if (max_accumulation_base >= accumulation_limit + 1) { + if (min_accumulation_base >= accumulation_limit + 1) { accumulated_data = shared_data[accumulation_limit]; return; } - uvec2 value_1 = shared_data[max_accumulation_base]; - uvec2 value_2 = shared_data[accumulation_limit]; - accumulated_data = 
AddUint64(value_1, -value_2); + uvec2 reset_value = shared_data[max_accumulation_base - 1]; + uvec2 final_value = shared_data[accumulation_limit]; + // Two complements + reset_value = AddUint64(uvec2(1, 0), ~reset_value); + accumulated_data = AddUint64(final_value, reset_value); } } \ No newline at end of file diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 44ec5a032..289d5b25c 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -179,8 +179,10 @@ struct AstcPushConstants { }; struct QueriesPrefixScanPushConstants { + u32 min_accumulation_base; u32 max_accumulation_base; u32 accumulation_limit; + u32 buffer_offset; }; } // Anonymous namespace @@ -416,56 +418,65 @@ QueriesPrefixScanPass::QueriesPrefixScanPass( device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) && device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) ? 
std::span(QUERIES_PREFIX_SCAN_SUM_COMP_SPV) - : std::span(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV), - {32}), + : std::span(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV)), scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer, size_t number_of_sums, - size_t max_accumulation_limit) { - size_t aligned_runs = Common::AlignUp(number_of_sums, 32); + size_t min_accumulation_limit, size_t max_accumulation_limit) { + size_t current_runs = number_of_sums; + size_t offset = 0; + while (current_runs != 0) { + static constexpr size_t DISPATCH_SIZE = 2048U; + size_t runs_to_do = std::min(current_runs, DISPATCH_SIZE); + current_runs -= runs_to_do; + compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, number_of_sums * sizeof(u64)); + compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, number_of_sums * sizeof(u64)); + compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64)); + const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; + size_t used_offset = offset; + offset += runs_to_do; - compute_pass_descriptor_queue.Acquire(); - compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, aligned_runs * sizeof(u64)); - compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, aligned_runs * sizeof(u64)); - compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64)); - const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([this, descriptor_data, min_accumulation_limit, max_accumulation_limit, + runs_to_do, used_offset](vk::CommandBuffer cmdbuf) { + static constexpr VkMemoryBarrier read_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = 
VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + }; + static constexpr VkMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT | + VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | + VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | + VK_ACCESS_UNIFORM_READ_BIT | + VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, + }; + const QueriesPrefixScanPushConstants uniforms{ + .min_accumulation_base = static_cast(min_accumulation_limit), + .max_accumulation_base = static_cast(max_accumulation_limit), + .accumulation_limit = static_cast(runs_to_do - 1), + .buffer_offset = static_cast(used_offset), + }; + const VkDescriptorSet set = descriptor_allocator.Commit(); + device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([this, descriptor_data, max_accumulation_limit, number_of_sums, - aligned_runs](vk::CommandBuffer cmdbuf) { - static constexpr VkMemoryBarrier read_barrier{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, - }; - static constexpr VkMemoryBarrier write_barrier{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT | - VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | - VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | - VK_ACCESS_UNIFORM_READ_BIT | - VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, - }; - const QueriesPrefixScanPushConstants uniforms{ - .max_accumulation_base = static_cast(max_accumulation_limit), - .accumulation_limit = static_cast(number_of_sums - 1), - }; - const VkDescriptorSet set = 
descriptor_allocator.Commit(); - device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); - - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); - cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); - cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); - cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); - cmdbuf.Dispatch(static_cast(aligned_runs / 32U), 1, 1); - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier); - }); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); + cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); + cmdbuf.Dispatch(1, 1, 1); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, + write_barrier); + }); + } } ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index 68ffb1b82..3ff935639 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -104,7 +104,7 @@ public: ComputePassDescriptorQueue& compute_pass_descriptor_queue_); void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer, - size_t number_of_sums, size_t max_accumulation_limit); + size_t number_of_sums, size_t min_accumulation_limit, size_t max_accumulation_limit); private: Scheduler& scheduler; diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 2cc007716..a32da3ba3 
100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -181,7 +181,8 @@ public: }); rasterizer->SyncOperation(std::move(func)); accumulation_since_last_sync = false; - last_accumulation_checkpoint = std::min(last_accumulation_checkpoint, num_slots_used); + first_accumulation_checkpoint = std::min(first_accumulation_checkpoint, num_slots_used); + last_accumulation_checkpoint = std::max(last_accumulation_checkpoint, num_slots_used); } void CloseCounter() override { @@ -285,7 +286,9 @@ public: resolve_buffers.push_back(intermediary_buffer_index); queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index], *buffers[resolve_buffer_index], num_slots_used, - std::min(last_accumulation_checkpoint, num_slots_used)); + std::min(first_accumulation_checkpoint, num_slots_used), + last_accumulation_checkpoint); + } else { scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { @@ -298,7 +301,8 @@ public: rasterizer->SyncOperation(std::move(func)); AbandonCurrentQuery(); num_slots_used = 0; - last_accumulation_checkpoint = std::numeric_limits::max(); + first_accumulation_checkpoint = std::numeric_limits::max(); + last_accumulation_checkpoint = 0; accumulation_since_last_sync = has_multi_queries; pending_sync.clear(); } @@ -506,7 +510,7 @@ private: template size_t ObtainBuffer(size_t num_needed) { - const size_t log_2 = std::max(6U, Common::Log2Ceil64(num_needed)); + const size_t log_2 = std::max(11U, Common::Log2Ceil64(num_needed)); if constexpr (is_resolve) { if (resolve_table[log_2] != 0) { return resolve_table[log_2] - 1; @@ -563,6 +567,7 @@ private: VkQueryPool current_query_pool; size_t current_query_id; size_t num_slots_used{}; + size_t first_accumulation_checkpoint{}; size_t last_accumulation_checkpoint{}; bool accumulation_since_last_sync{}; VideoCommon::HostQueryBase* current_query;