From 4841dc0b745389fb03edbf900f25511bee4b3d88 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 3 Feb 2024 22:51:04 +0100 Subject: [PATCH 1/5] VideoCore: Move Slot Vector to Common --- src/common/CMakeLists.txt | 1 + .../texture_cache => common}/slot_vector.h | 8 ++++---- src/video_core/CMakeLists.txt | 1 - .../buffer_cache/buffer_cache_base.h | 6 +++--- src/video_core/query_cache.h | 6 +++--- .../renderer_opengl/gl_buffer_cache.h | 2 +- .../renderer_opengl/gl_texture_cache.h | 2 +- .../renderer_vulkan/vk_buffer_cache.cpp | 2 +- .../renderer_vulkan/vk_buffer_cache.h | 2 +- .../renderer_vulkan/vk_texture_cache.h | 2 +- .../texture_cache/texture_cache_base.h | 18 +++++++++--------- src/video_core/texture_cache/types.h | 16 ++++++++-------- 12 files changed, 33 insertions(+), 33 deletions(-) rename src/{video_core/texture_cache => common}/slot_vector.h (97%) diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 85926fc8f..bf3f3b781 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -121,6 +121,7 @@ add_library(common STATIC settings_input.cpp settings_input.h settings_setting.h + slot_vector.h socket_types.h spin_lock.cpp spin_lock.h diff --git a/src/video_core/texture_cache/slot_vector.h b/src/common/slot_vector.h similarity index 97% rename from src/video_core/texture_cache/slot_vector.h rename to src/common/slot_vector.h index 3ffa2a661..34ff7de94 100644 --- a/src/video_core/texture_cache/slot_vector.h +++ b/src/common/slot_vector.h @@ -14,7 +14,7 @@ #include "common/common_types.h" #include "common/polyfill_ranges.h" -namespace VideoCommon { +namespace Common { struct SlotId { static constexpr u32 INVALID_INDEX = std::numeric_limits::max(); @@ -217,11 +217,11 @@ private: std::vector free_list; }; -} // namespace VideoCommon +} // namespace Common template <> -struct std::hash { - size_t operator()(const VideoCommon::SlotId& id) const noexcept { +struct std::hash { + size_t operator()(const Common::SlotId& id) 
const noexcept { return std::hash{}(id.index); } }; diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 16c905db9..55180f4b5 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -274,7 +274,6 @@ add_library(video_core STATIC texture_cache/image_view_info.h texture_cache/render_targets.h texture_cache/samples_helper.h - texture_cache/slot_vector.h texture_cache/texture_cache.cpp texture_cache/texture_cache.h texture_cache/texture_cache_base.h diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index 80dbb81e7..59124458d 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -41,7 +41,7 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/memory_manager.h" #include "video_core/surface.h" -#include "video_core/texture_cache/slot_vector.h" +#include "common/slot_vector.h" #include "video_core/texture_cache/types.h" namespace boost { @@ -55,7 +55,7 @@ MICROPROFILE_DECLARE(GPU_PrepareBuffers); MICROPROFILE_DECLARE(GPU_BindUploadBuffers); MICROPROFILE_DECLARE(GPU_DownloadMemory); -using BufferId = SlotId; +using BufferId = Common::SlotId; using VideoCore::Surface::PixelFormat; using namespace Common::Literals; @@ -559,7 +559,7 @@ private: Tegra::MaxwellDeviceMemoryManager& device_memory; - SlotVector slot_buffers; + Common::SlotVector slot_buffers; DelayedDestructionRing delayed_destruction_ring; const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{}; diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index 4861b123a..e1019f228 100644 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h @@ -18,12 +18,12 @@ #include "common/assert.h" #include "common/settings.h" +#include "common/slot_vector.h" #include "video_core/control/channel_state_cache.h" #include "video_core/engines/maxwell_3d.h" #include 
"video_core/host1x/gpu_device_memory_manager.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" -#include "video_core/texture_cache/slot_vector.h" namespace VideoCore { enum class QueryType { @@ -37,7 +37,7 @@ constexpr std::size_t NumQueryTypes = static_cast(QueryType::Count); namespace VideoCommon { -using AsyncJobId = SlotId; +using AsyncJobId = Common::SlotId; static constexpr AsyncJobId NULL_ASYNC_JOB_ID{0}; @@ -341,7 +341,7 @@ private: static constexpr std::uintptr_t YUZU_PAGESIZE = 4096; static constexpr unsigned YUZU_PAGEBITS = 12; - SlotVector slot_async_jobs; + Common::SlotVector slot_async_jobs; VideoCore::RasterizerInterface& rasterizer; Tegra::MaxwellDeviceMemoryManager& device_memory; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index af34c272b..022275fd6 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -90,7 +90,7 @@ public: void PostCopyBarrier(); void Finish(); - void TickFrame(VideoCommon::SlotVector&) noexcept {} + void TickFrame(Common::SlotVector&) noexcept {} void ClearBuffer(Buffer& dest_buffer, u32 offset, size_t size, u32 value); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 3e54edcc2..d4165d8e4 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -30,13 +30,13 @@ class Image; class ImageView; class Sampler; +using Common::SlotVector; using VideoCommon::ImageId; using VideoCommon::ImageViewId; using VideoCommon::ImageViewType; using VideoCommon::NUM_RT; using VideoCommon::Region2D; using VideoCommon::RenderTargets; -using VideoCommon::SlotVector; struct FormatProperties { GLenum compatibility_class; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 
31001d142..e5e1e3ab6 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -368,7 +368,7 @@ u32 BufferCacheRuntime::GetStorageBufferAlignment() const { return static_cast(device.GetStorageBufferAlignment()); } -void BufferCacheRuntime::TickFrame(VideoCommon::SlotVector& slot_buffers) noexcept { +void BufferCacheRuntime::TickFrame(Common::SlotVector& slot_buffers) noexcept { for (auto it = slot_buffers.begin(); it != slot_buffers.end(); it++) { it->ResetUsageTracking(); } diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index e273f4988..ac14c9f86 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -81,7 +81,7 @@ public: ComputePassDescriptorQueue& compute_pass_descriptor_queue, DescriptorPool& descriptor_pool); - void TickFrame(VideoCommon::SlotVector& slot_buffers) noexcept; + void TickFrame(Common::SlotVector& slot_buffers) noexcept; void Finish(); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 0dbde65d6..aaeb5ef93 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -20,11 +20,11 @@ struct ResolutionScalingInfo; namespace Vulkan { +using Common::SlotVector; using VideoCommon::ImageId; using VideoCommon::NUM_RT; using VideoCommon::Region2D; using VideoCommon::RenderTargets; -using VideoCommon::SlotVector; using VideoCore::Surface::PixelFormat; class BlitImageHelper; diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index e7b910121..da98a634b 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -21,6 +21,7 @@ #include "common/lru_cache.h" #include "common/polyfill_ranges.h" #include 
"common/scratch_buffer.h" +#include "common/slot_vector.h" #include "common/thread_worker.h" #include "video_core/compatible_formats.h" #include "video_core/control/channel_state_cache.h" @@ -32,7 +33,6 @@ #include "video_core/texture_cache/image_info.h" #include "video_core/texture_cache/image_view_base.h" #include "video_core/texture_cache/render_targets.h" -#include "video_core/texture_cache/slot_vector.h" #include "video_core/texture_cache/types.h" #include "video_core/textures/texture.h" @@ -451,16 +451,16 @@ private: struct PendingDownload { bool is_swizzle; size_t async_buffer_id; - SlotId object_id; + Common::SlotId object_id; }; - SlotVector slot_images; - SlotVector slot_map_views; - SlotVector slot_image_views; - SlotVector slot_image_allocs; - SlotVector slot_samplers; - SlotVector slot_framebuffers; - SlotVector slot_buffer_downloads; + Common::SlotVector slot_images; + Common::SlotVector slot_map_views; + Common::SlotVector slot_image_views; + Common::SlotVector slot_image_allocs; + Common::SlotVector slot_samplers; + Common::SlotVector slot_framebuffers; + Common::SlotVector slot_buffer_downloads; // TODO: This data structure is not optimal and it should be reworked diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h index 0453456b4..07c304386 100644 --- a/src/video_core/texture_cache/types.h +++ b/src/video_core/texture_cache/types.h @@ -5,21 +5,21 @@ #include "common/common_funcs.h" #include "common/common_types.h" -#include "video_core/texture_cache/slot_vector.h" +#include "common/slot_vector.h" namespace VideoCommon { constexpr size_t NUM_RT = 8; constexpr size_t MAX_MIP_LEVELS = 14; -constexpr SlotId CORRUPT_ID{0xfffffffe}; +constexpr Common::SlotId CORRUPT_ID{0xfffffffe}; -using ImageId = SlotId; -using ImageMapId = SlotId; -using ImageViewId = SlotId; -using ImageAllocId = SlotId; -using SamplerId = SlotId; -using FramebufferId = SlotId; +using ImageId = Common::SlotId; +using ImageMapId = Common::SlotId; 
+using ImageViewId = Common::SlotId; +using ImageAllocId = Common::SlotId; +using SamplerId = Common::SlotId; +using FramebufferId = Common::SlotId; /// Fake image ID for null image views constexpr ImageId NULL_IMAGE_ID{0}; From 01ba6cf610641f1937092b469843b14ebc2a5962 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 4 Feb 2024 14:44:17 +0100 Subject: [PATCH 2/5] Common: Introduce Range Sets --- src/common/CMakeLists.txt | 2 + src/common/range_sets.h | 73 ++++++++++ src/common/range_sets.inc | 279 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 354 insertions(+) create mode 100644 src/common/range_sets.h create mode 100644 src/common/range_sets.inc diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index bf3f3b781..c19af2ab8 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -107,6 +107,8 @@ add_library(common STATIC quaternion.h range_map.h range_mutex.h + range_sets.h + range_sets.inc reader_writer_queue.h ring_buffer.h ${CMAKE_CURRENT_BINARY_DIR}/scm_rev.cpp diff --git a/src/common/range_sets.h b/src/common/range_sets.h new file mode 100644 index 000000000..f4ee00fec --- /dev/null +++ b/src/common/range_sets.h @@ -0,0 +1,73 @@ +// SPDX-FileCopyrightText: 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include + +#include "common/common_types.h" + +namespace Common { + +template +class RangeSet { +public: + RangeSet(); + ~RangeSet(); + + RangeSet(RangeSet const&) = delete; + RangeSet& operator=(RangeSet const&) = delete; + + RangeSet(RangeSet&& other); + RangeSet& operator=(RangeSet&& other); + + void Add(AddressType base_address, size_t size); + void Subtract(AddressType base_address, size_t size); + void Clear(); + bool Empty() const; + + template + void ForEach(Func&& func) const; + + template + void ForEachInRange(AddressType device_addr, size_t size, Func&& func) const; + +private: + struct RangeSetImpl; + std::unique_ptr m_impl; +}; + +template +class 
SplitRangeSet { +public: + SplitRangeSet(); + ~SplitRangeSet(); + + SplitRangeSet(SplitRangeSet const&) = delete; + SplitRangeSet& operator=(SplitRangeSet const&) = delete; + + SplitRangeSet(SplitRangeSet&& other); + SplitRangeSet& operator=(SplitRangeSet&& other); + + void Add(AddressType base_address, size_t size); + void Subtract(AddressType base_address, size_t size); + + template + void Subtract(AddressType base_address, size_t size, Func&& on_delete); + + void DeleteAll(AddressType base_address, size_t size); + void Clear(); + bool Empty() const; + + template + void ForEach(Func&& func) const; + + template + void ForEachInRange(AddressType device_addr, size_t size, Func&& func) const; + +private: + struct SplitRangeSetImpl; + std::unique_ptr m_impl; +}; + +} // namespace Common \ No newline at end of file diff --git a/src/common/range_sets.inc b/src/common/range_sets.inc new file mode 100644 index 000000000..fa55a68fb --- /dev/null +++ b/src/common/range_sets.inc @@ -0,0 +1,279 @@ +// SPDX-FileCopyrightText: 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include + +#define BOOST_NO_MT +#include +#undef BOOST_NO_MT +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/range_sets.h" + +namespace boost { +template +class fast_pool_allocator; +} + +namespace Common { + +template +struct RangeSet::RangeSetImpl { + using IntervalSet = boost::icl::interval_set< + AddressType, std::less, ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, AddressType, std::less), + boost::fast_pool_allocator>; + using IntervalType = typename IntervalSet::interval_type; + + RangeSetImpl() = default; + ~RangeSetImpl() = default; + + void Add(AddressType base_address, size_t size) { + AddressType end_address = base_address + static_cast(size); + IntervalType interval{base_address, end_address}; + m_ranges_set.add(interval); + } + + void Subtract(AddressType base_address, size_t size) { + 
AddressType end_address = base_address + static_cast(size); + IntervalType interval{base_address, end_address}; + m_ranges_set.subtract(interval); + } + + IntervalSet m_ranges_set; +}; + +template +struct SplitRangeSet::SplitRangeSetImpl { + + using IntervalSet = + boost::icl::split_interval_map; + using IntervalType = typename IntervalSet::interval_type; + + SplitRangeSetImpl() = default; + ~SplitRangeSetImpl() = default; + + void Add(AddressType base_address, size_t size) { + AddressType end_address = base_address + static_cast(size); + IntervalType interval{base_address, end_address}; + m_split_ranges_set += std::make_pair(interval, 1); + } + + template + void Subtract(AddressType base_address, size_t size, s32 amount, + [[maybe_unused]] Func&& on_delete) { + AddressType end_address = base_address + static_cast(size); + IntervalType interval{base_address, end_address}; + bool any_removals = false; + m_split_ranges_set += std::make_pair(interval, -amount); + do { + any_removals = false; + auto it = m_split_ranges_set.lower_bound(interval); + if (it == m_split_ranges_set.end()) { + return; + } + auto end_it = m_split_ranges_set.upper_bound(interval); + for (; it != end_it; it++) { + if (it->second <= 0) { + if constexpr (has_on_delete) { + if (it->second == 0) { + on_delete(it->first.lower(), it->first.upper()); + } + } + any_removals = true; + m_split_ranges_set.erase(it); + break; + } + } + } while (any_removals); + } + + IntervalSet m_split_ranges_set; +}; + +template +RangeSet::RangeSet() { + m_impl = std::make_unique::RangeSetImpl>(); +} + +template +RangeSet::~RangeSet() = default; + +template +RangeSet::RangeSet(RangeSet&& other) { + m_impl = std::make_unique::RangeSetImpl>(); + m_impl->m_ranges_set = std::move(other.m_impl->m_ranges_set); +} + +template +RangeSet& RangeSet::operator=(RangeSet&& other) { + m_impl->m_ranges_set = std::move(other.m_impl->m_ranges_set); +} + +template +void RangeSet::Add(AddressType base_address, size_t size) { + 
m_impl->Add(base_address, size); +} + +template +void RangeSet::Subtract(AddressType base_address, size_t size) { + m_impl->Subtract(base_address, size); +} + +template +void RangeSet::Clear() { + m_impl->m_ranges_set.clear(); +} + +template +bool RangeSet::Empty() const { + return m_impl->m_ranges_set.empty(); +} + +template +template +void RangeSet::ForEach(Func&& func) const { + if (m_impl->m_ranges_set.empty()) { + return; + } + auto it = m_impl->m_ranges_set.begin(); + auto end_it = m_impl->m_ranges_set.end(); + for (; it != end_it; it++) { + const AddressType inter_addr_end = it->upper(); + const AddressType inter_addr = it->lower(); + func(inter_addr, inter_addr_end); + } +} + +template +template +void RangeSet::ForEachInRange(AddressType base_addr, size_t size, Func&& func) const { + auto& range_set = m_impl->m_ranges_set; + const AddressType start_address = base_addr; + const AddressType end_address = start_address + size; + const RangeSetImpl::IntervalType search_interval{start_address, end_address}; + auto it = range_set.lower_bound(search_interval); + if (it == range_set.end()) { + return; + } + auto end_it = range_set.upper_bound(search_interval); + for (; it != end_it; it++) { + AddressType inter_addr_end = it->upper(); + AddressType inter_addr = it->lower(); + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + func(inter_addr, inter_addr_end); + } +} + +template +SplitRangeSet::SplitRangeSet() { + m_impl = std::make_unique::SplitRangeSetImpl>(); +} + +template +SplitRangeSet::~SplitRangeSet() = default; + +template +SplitRangeSet::SplitRangeSet(SplitRangeSet&& other) { + m_impl = std::make_unique::SplitRangeSetImpl>(); + m_impl->m_split_ranges_set = std::move(other.m_impl->m_split_ranges_set); +} + +template +SplitRangeSet& SplitRangeSet::operator=(SplitRangeSet&& other) { + m_impl->m_split_ranges_set = std::move(other.m_impl->m_split_ranges_set); +} + 
+template +void SplitRangeSet::Add(AddressType base_address, size_t size) { + m_impl->Add(base_address, size); +} + +template +void SplitRangeSet::Subtract(AddressType base_address, size_t size) { + m_impl->Subtract(base_address, size, 1, [](AddressType, AddressType) {}); +} + +template +template +void SplitRangeSet::Subtract(AddressType base_address, size_t size, Func&& on_delete) { + m_impl->Subtract(base_address, size, 1, on_delete); +} + +template +void SplitRangeSet::DeleteAll(AddressType base_address, size_t size) { + m_impl->Subtract(base_address, size, std::numeric_limits::max(), + [](AddressType, AddressType) {}); +} + +template +void SplitRangeSet::Clear() { + m_impl->m_split_ranges_set.clear(); +} + +template +bool SplitRangeSet::Empty() const { + return m_impl->m_split_ranges_set.empty(); +} + +template +template +void SplitRangeSet::ForEach(Func&& func) const { + if (m_impl->m_split_ranges_set.empty()) { + return; + } + auto it = m_impl->m_split_ranges_set.begin(); + auto end_it = m_impl->m_split_ranges_set.end(); + for (; it != end_it; it++) { + const AddressType inter_addr_end = it->first.upper(); + const AddressType inter_addr = it->first.lower(); + func(inter_addr, inter_addr_end, it->second); + } +} + +template +template +void SplitRangeSet::ForEachInRange(AddressType base_address, size_t size, + Func&& func) const { + auto& range_set = m_impl->m_split_ranges_set; + const AddressType start_address = base_address; + const AddressType end_address = start_address + size; + const SplitRangeSetImpl::IntervalType search_interval{start_address, end_address}; + auto it = range_set.lower_bound(search_interval); + if (it == range_set.end()) { + return; + } + auto end_it = range_set.upper_bound(search_interval); + for (; it != end_it; it++) { + auto& inter = it->first; + AddressType inter_addr_end = inter.upper(); + AddressType inter_addr = inter.lower(); + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < 
start_address) { + inter_addr = start_address; + } + func(inter_addr, inter_addr_end, it->second); + } +} + +} // namespace Common \ No newline at end of file From accccc0cbf54bb080c1180ad47445aada317454c Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 4 Feb 2024 14:44:38 +0100 Subject: [PATCH 3/5] NVDRV: Refactor HeapMapper to use RangeSets --- .../hle/service/nvdrv/core/heap_mapper.cpp | 187 ++++-------------- 1 file changed, 43 insertions(+), 144 deletions(-) diff --git a/src/core/hle/service/nvdrv/core/heap_mapper.cpp b/src/core/hle/service/nvdrv/core/heap_mapper.cpp index 096dc5deb..542125a1c 100644 --- a/src/core/hle/service/nvdrv/core/heap_mapper.cpp +++ b/src/core/hle/service/nvdrv/core/heap_mapper.cpp @@ -3,110 +3,21 @@ #include -#include -#define BOOST_NO_MT -#include -#undef BOOST_NO_MT -#include -#include -#include -#include -#include -#include -#include - +#include "common/range_sets.h" +#include "common/range_sets.inc" #include "core/hle/service/nvdrv/core/heap_mapper.h" #include "video_core/host1x/host1x.h" -namespace boost { -template -class fast_pool_allocator; -} - namespace Service::Nvidia::NvCore { -using IntervalCompare = std::less; -using IntervalInstance = boost::icl::interval_type_default; -using IntervalAllocator = boost::fast_pool_allocator; -using IntervalSet = boost::icl::interval_set; -using IntervalType = typename IntervalSet::interval_type; - -template -struct counter_add_functor : public boost::icl::identity_based_inplace_combine { - // types - typedef counter_add_functor type; - typedef boost::icl::identity_based_inplace_combine base_type; - - // public member functions - void operator()(Type& current, const Type& added) const { - current += added; - if (current < base_type::identity_element()) { - current = base_type::identity_element(); - } - } - - // public static functions - static void version(Type&){}; -}; - -using OverlapCombine = counter_add_functor; -using OverlapSection = boost::icl::inter_section; -using 
OverlapCounter = boost::icl::split_interval_map; - struct HeapMapper::HeapMapperInternal { - HeapMapperInternal(Tegra::Host1x::Host1x& host1x) : device_memory{host1x.MemoryManager()} {} + HeapMapperInternal(Tegra::Host1x::Host1x& host1x) : m_device_memory{host1x.MemoryManager()} {} ~HeapMapperInternal() = default; - template - void ForEachInOverlapCounter(OverlapCounter& current_range, VAddr cpu_addr, u64 size, - Func&& func) { - const DAddr start_address = cpu_addr; - const DAddr end_address = start_address + size; - const IntervalType search_interval{start_address, end_address}; - auto it = current_range.lower_bound(search_interval); - if (it == current_range.end()) { - return; - } - auto end_it = current_range.upper_bound(search_interval); - for (; it != end_it; it++) { - auto& inter = it->first; - DAddr inter_addr_end = inter.upper(); - DAddr inter_addr = inter.lower(); - if (inter_addr_end > end_address) { - inter_addr_end = end_address; - } - if (inter_addr < start_address) { - inter_addr = start_address; - } - func(inter_addr, inter_addr_end, it->second); - } - } - - void RemoveEachInOverlapCounter(OverlapCounter& current_range, - const IntervalType search_interval, int subtract_value) { - bool any_removals = false; - current_range.add(std::make_pair(search_interval, subtract_value)); - do { - any_removals = false; - auto it = current_range.lower_bound(search_interval); - if (it == current_range.end()) { - return; - } - auto end_it = current_range.upper_bound(search_interval); - for (; it != end_it; it++) { - if (it->second <= 0) { - any_removals = true; - current_range.erase(it); - break; - } - } - } while (any_removals); - } - - IntervalSet base_set; - OverlapCounter mapping_overlaps; - Tegra::MaxwellDeviceMemoryManager& device_memory; - std::mutex guard; + Common::RangeSet m_temporary_set; + Common::SplitRangeSet m_mapped_ranges; + Tegra::MaxwellDeviceMemoryManager& m_device_memory; + std::mutex m_guard; }; HeapMapper::HeapMapper(VAddr start_vaddress, 
DAddr start_daddress, size_t size, Core::Asid asid, @@ -116,60 +27,48 @@ HeapMapper::HeapMapper(VAddr start_vaddress, DAddr start_daddress, size_t size, } HeapMapper::~HeapMapper() { - m_internal->device_memory.Unmap(m_daddress, m_size); + // Unmap whatever has been mapped. + m_internal->m_mapped_ranges.ForEach([this](VAddr start_addr, VAddr end_addr, s32 count) { + const size_t sub_size = end_addr - start_addr; + const size_t offset = start_addr - m_vaddress; + m_internal->m_device_memory.Unmap(m_daddress + offset, sub_size); + }); } DAddr HeapMapper::Map(VAddr start, size_t size) { - std::scoped_lock lk(m_internal->guard); - m_internal->base_set.clear(); - const IntervalType interval{start, start + size}; - m_internal->base_set.insert(interval); - m_internal->ForEachInOverlapCounter(m_internal->mapping_overlaps, start, size, - [this](VAddr start_addr, VAddr end_addr, int) { - const IntervalType other{start_addr, end_addr}; - m_internal->base_set.subtract(other); - }); - if (!m_internal->base_set.empty()) { - auto it = m_internal->base_set.begin(); - auto end_it = m_internal->base_set.end(); - for (; it != end_it; it++) { - const VAddr inter_addr_end = it->upper(); - const VAddr inter_addr = it->lower(); - const size_t offset = inter_addr - m_vaddress; - const size_t sub_size = inter_addr_end - inter_addr; - m_internal->device_memory.Map(m_daddress + offset, m_vaddress + offset, sub_size, - m_asid); - } - } - m_internal->mapping_overlaps += std::make_pair(interval, 1); - m_internal->base_set.clear(); - return m_daddress + (start - m_vaddress); + std::scoped_lock lk(m_internal->m_guard); + // Add the mapping range to a temporary range set. + m_internal->m_temporary_set.Clear(); + m_internal->m_temporary_set.Add(start, size); + + // Remove anything that's already mapped from the temporary range set. 
+ m_internal->m_mapped_ranges.ForEachInRange( + start, size, [this](VAddr start_addr, VAddr end_addr, s32) { + m_internal->m_temporary_set.Subtract(start_addr, end_addr - start_addr); + }); + + // Map anything that has not been mapped yet. + m_internal->m_temporary_set.ForEach([this](VAddr start_addr, VAddr end_addr) { + const size_t sub_size = end_addr - start_addr; + const size_t offset = start_addr - m_vaddress; + m_internal->m_device_memory.Map(m_daddress + offset, m_vaddress + offset, sub_size, m_asid); + }); + + // Add the mapping range to the split map, to register the map and overlaps. + m_internal->m_mapped_ranges.Add(start, size); + m_internal->m_temporary_set.Clear(); + return m_daddress + static_cast(start - m_vaddress); } void HeapMapper::Unmap(VAddr start, size_t size) { - std::scoped_lock lk(m_internal->guard); - m_internal->base_set.clear(); - m_internal->ForEachInOverlapCounter(m_internal->mapping_overlaps, start, size, - [this](VAddr start_addr, VAddr end_addr, int value) { - if (value <= 1) { - const IntervalType other{start_addr, end_addr}; - m_internal->base_set.insert(other); - } - }); - if (!m_internal->base_set.empty()) { - auto it = m_internal->base_set.begin(); - auto end_it = m_internal->base_set.end(); - for (; it != end_it; it++) { - const VAddr inter_addr_end = it->upper(); - const VAddr inter_addr = it->lower(); - const size_t offset = inter_addr - m_vaddress; - const size_t sub_size = inter_addr_end - inter_addr; - m_internal->device_memory.Unmap(m_daddress + offset, sub_size); - } - } - const IntervalType to_remove{start, start + size}; - m_internal->RemoveEachInOverlapCounter(m_internal->mapping_overlaps, to_remove, -1); - m_internal->base_set.clear(); + std::scoped_lock lk(m_internal->m_guard); + + // Just subtract the range and whatever is deleted, unmap it. 
+ m_internal->m_mapped_ranges.Subtract(start, size, [this](VAddr start_addr, VAddr end_addr) { + const size_t sub_size = end_addr - start_addr; + const size_t offset = start_addr - m_vaddress; + m_internal->m_device_memory.Unmap(m_daddress + offset, sub_size); + }); } } // namespace Service::Nvidia::NvCore From 0d5a3abeaefd3a1682c48a59c5a9170cfb0a39d0 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 4 Feb 2024 19:16:07 +0100 Subject: [PATCH 4/5] Buffer Cache: Refactor to use Range sets instead --- src/common/range_sets.inc | 184 +++++++------ src/video_core/buffer_cache/buffer_cache.h | 250 +++++++----------- .../buffer_cache/buffer_cache_base.h | 131 +-------- .../renderer_opengl/gl_buffer_cache.h | 1 - .../renderer_vulkan/vk_buffer_cache.h | 1 - 5 files changed, 206 insertions(+), 361 deletions(-) diff --git a/src/common/range_sets.inc b/src/common/range_sets.inc index fa55a68fb..705ebd4a1 100644 --- a/src/common/range_sets.inc +++ b/src/common/range_sets.inc @@ -6,9 +6,6 @@ #include #include -#define BOOST_NO_MT -#include -#undef BOOST_NO_MT #include #include #include @@ -20,18 +17,16 @@ #include "common/range_sets.h" -namespace boost { -template -class fast_pool_allocator; -} - namespace Common { template struct RangeSet::RangeSetImpl { + template + using MyAllocator = boost::fast_pool_allocator; using IntervalSet = boost::icl::interval_set< AddressType, std::less, ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, AddressType, std::less), - boost::fast_pool_allocator>; + MyAllocator>; using IntervalType = typename IntervalSet::interval_type; RangeSetImpl() = default; @@ -49,18 +44,58 @@ struct RangeSet::RangeSetImpl { m_ranges_set.subtract(interval); } + template + void ForEach(Func&& func) const { + if (m_ranges_set.empty()) { + return; + } + auto it = m_ranges_set.begin(); + auto end_it = m_ranges_set.end(); + for (; it != end_it; it++) { + const AddressType inter_addr_end = it->upper(); + const AddressType inter_addr = it->lower(); + func(inter_addr, 
inter_addr_end); + } + } + + template + void ForEachInRange(AddressType base_addr, size_t size, Func&& func) const { + if (m_ranges_set.empty()) { + return; + } + const AddressType start_address = base_addr; + const AddressType end_address = start_address + size; + const RangeSetImpl::IntervalType search_interval{start_address, end_address}; + auto it = m_ranges_set.lower_bound(search_interval); + if (it == m_ranges_set.end()) { + return; + } + auto end_it = m_ranges_set.upper_bound(search_interval); + for (; it != end_it; it++) { + AddressType inter_addr_end = it->upper(); + AddressType inter_addr = it->lower(); + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + func(inter_addr, inter_addr_end); + } + } + IntervalSet m_ranges_set; }; template struct SplitRangeSet::SplitRangeSetImpl { - - using IntervalSet = - boost::icl::split_interval_map; + template + using MyAllocator = boost::fast_pool_allocator; + using IntervalSet = boost::icl::split_interval_map< + AddressType, s32, boost::icl::partial_enricher, std::less, boost::icl::inplace_plus, + boost::icl::inter_section, + ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, AddressType, std::less), MyAllocator>; using IntervalType = typename IntervalSet::interval_type; SplitRangeSetImpl() = default; @@ -75,6 +110,9 @@ struct SplitRangeSet::SplitRangeSetImpl { template void Subtract(AddressType base_address, size_t size, s32 amount, [[maybe_unused]] Func&& on_delete) { + if (m_split_ranges_set.empty()) { + return; + } AddressType end_address = base_address + static_cast(size); IntervalType interval{base_address, end_address}; bool any_removals = false; @@ -101,6 +139,47 @@ struct SplitRangeSet::SplitRangeSetImpl { } while (any_removals); } + template + void ForEach(Func&& func) const { + if (m_split_ranges_set.empty()) { + return; + } + auto it = m_split_ranges_set.begin(); + auto end_it = m_split_ranges_set.end(); + for (; it != 
end_it; it++) { + const AddressType inter_addr_end = it->first.upper(); + const AddressType inter_addr = it->first.lower(); + func(inter_addr, inter_addr_end, it->second); + } + } + + template + void ForEachInRange(AddressType base_address, size_t size, Func&& func) const { + if (m_split_ranges_set.empty()) { + return; + } + const AddressType start_address = base_address; + const AddressType end_address = start_address + size; + const SplitRangeSetImpl::IntervalType search_interval{start_address, end_address}; + auto it = m_split_ranges_set.lower_bound(search_interval); + if (it == m_split_ranges_set.end()) { + return; + } + auto end_it = m_split_ranges_set.upper_bound(search_interval); + for (; it != end_it; it++) { + auto& inter = it->first; + AddressType inter_addr_end = inter.upper(); + AddressType inter_addr = inter.lower(); + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + func(inter_addr, inter_addr_end, it->second); + } + } + IntervalSet m_split_ranges_set; }; @@ -146,41 +225,13 @@ bool RangeSet::Empty() const { template template void RangeSet::ForEach(Func&& func) const { - if (m_impl->m_ranges_set.empty()) { - return; - } - auto it = m_impl->m_ranges_set.begin(); - auto end_it = m_impl->m_ranges_set.end(); - for (; it != end_it; it++) { - const AddressType inter_addr_end = it->upper(); - const AddressType inter_addr = it->lower(); - func(inter_addr, inter_addr_end); - } + m_impl->ForEach(std::move(func)); } template template -void RangeSet::ForEachInRange(AddressType base_addr, size_t size, Func&& func) const { - auto& range_set = m_impl->m_ranges_set; - const AddressType start_address = base_addr; - const AddressType end_address = start_address + size; - const RangeSetImpl::IntervalType search_interval{start_address, end_address}; - auto it = range_set.lower_bound(search_interval); - if (it == range_set.end()) { - return; - } - auto end_it = 
range_set.upper_bound(search_interval); - for (; it != end_it; it++) { - AddressType inter_addr_end = it->upper(); - AddressType inter_addr = it->lower(); - if (inter_addr_end > end_address) { - inter_addr_end = end_address; - } - if (inter_addr < start_address) { - inter_addr = start_address; - } - func(inter_addr, inter_addr_end); - } +void RangeSet::ForEachInRange(AddressType base_address, size_t size, Func&& func) const { + m_impl->ForEachInRange(base_address, size, std::move(func)); } template @@ -209,18 +260,18 @@ void SplitRangeSet::Add(AddressType base_address, size_t size) { template void SplitRangeSet::Subtract(AddressType base_address, size_t size) { - m_impl->Subtract(base_address, size, 1, [](AddressType, AddressType) {}); + m_impl->template Subtract(base_address, size, 1, [](AddressType, AddressType) {}); } template template void SplitRangeSet::Subtract(AddressType base_address, size_t size, Func&& on_delete) { - m_impl->Subtract(base_address, size, 1, on_delete); + m_impl->template Subtract(base_address, size, 1, std::move(on_delete)); } template void SplitRangeSet::DeleteAll(AddressType base_address, size_t size) { - m_impl->Subtract(base_address, size, std::numeric_limits::max(), + m_impl->template Subtract(base_address, size, std::numeric_limits::max(), [](AddressType, AddressType) {}); } @@ -237,43 +288,14 @@ bool SplitRangeSet::Empty() const { template template void SplitRangeSet::ForEach(Func&& func) const { - if (m_impl->m_split_ranges_set.empty()) { - return; - } - auto it = m_impl->m_split_ranges_set.begin(); - auto end_it = m_impl->m_split_ranges_set.end(); - for (; it != end_it; it++) { - const AddressType inter_addr_end = it->first.upper(); - const AddressType inter_addr = it->first.lower(); - func(inter_addr, inter_addr_end, it->second); - } + m_impl->ForEach(func); } template template void SplitRangeSet::ForEachInRange(AddressType base_address, size_t size, Func&& func) const { - auto& range_set = m_impl->m_split_ranges_set; - const 
AddressType start_address = base_address; - const AddressType end_address = start_address + size; - const SplitRangeSetImpl::IntervalType search_interval{start_address, end_address}; - auto it = range_set.lower_bound(search_interval); - if (it == range_set.end()) { - return; - } - auto end_it = range_set.upper_bound(search_interval); - for (; it != end_it; it++) { - auto& inter = it->first; - AddressType inter_addr_end = inter.upper(); - AddressType inter_addr = inter.lower(); - if (inter_addr_end > end_address) { - inter_addr_end = end_address; - } - if (inter_addr < start_address) { - inter_addr = start_address; - } - func(inter_addr, inter_addr_end, it->second); - } + m_impl->ForEachInRange(base_address, size, std::move(func)); } } // namespace Common \ No newline at end of file diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index b4bf369d1..6d3d933c5 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -7,6 +7,7 @@ #include #include +#include "common/range_sets.inc" #include "video_core/buffer_cache/buffer_cache_base.h" #include "video_core/guest_memory.h" #include "video_core/host1x/gpu_device_memory_manager.h" @@ -20,7 +21,7 @@ BufferCache

::BufferCache(Tegra::MaxwellDeviceMemoryManager& device_memory_, R : runtime{runtime_}, device_memory{device_memory_}, memory_tracker{device_memory} { // Ensure the first slot is used for the null buffer void(slot_buffers.insert(runtime, NullBufferParams{})); - common_ranges.clear(); + gpu_modified_ranges.Clear(); inline_buffer_id = NULL_BUFFER_ID; if (!runtime.CanReportMemoryUsage()) { @@ -43,6 +44,9 @@ BufferCache

::BufferCache(Tegra::MaxwellDeviceMemoryManager& device_memory_, R DEFAULT_CRITICAL_MEMORY)); } +template +BufferCache

::~BufferCache() = default; + template void BufferCache

::RunGarbageCollector() { const bool aggressive_gc = total_used_memory >= critical_memory; @@ -96,20 +100,17 @@ void BufferCache

::TickFrame() { ++frame_tick; delayed_destruction_ring.Tick(); - if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - for (auto& buffer : async_buffers_death_ring) { - runtime.FreeDeferredStagingBuffer(buffer); - } - async_buffers_death_ring.clear(); + for (auto& buffer : async_buffers_death_ring) { + runtime.FreeDeferredStagingBuffer(buffer); } + async_buffers_death_ring.clear(); } template void BufferCache

::WriteMemory(DAddr device_addr, u64 size) { if (memory_tracker.IsRegionGpuModified(device_addr, size)) { - const IntervalType subtract_interval{device_addr, device_addr + size}; - ClearDownload(subtract_interval); - common_ranges.subtract(subtract_interval); + ClearDownload(device_addr, size); + gpu_modified_ranges.Subtract(device_addr, size); } memory_tracker.MarkRegionAsCpuModified(device_addr, size); } @@ -174,11 +175,11 @@ void BufferCache

::DownloadMemory(DAddr device_addr, u64 size) { } template -void BufferCache

::ClearDownload(IntervalType subtract_interval) { - RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1024); - uncommitted_ranges.subtract(subtract_interval); - for (auto& interval_set : committed_ranges) { - interval_set.subtract(subtract_interval); +void BufferCache

::ClearDownload(DAddr device_addr, u64 size) { + async_downloads.DeleteAll(device_addr, size); + uncommitted_gpu_modified_ranges.Subtract(device_addr, size); + for (auto& interval_set : committed_gpu_modified_ranges) { + interval_set.Subtract(device_addr, size); } } @@ -195,8 +196,7 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am return false; } - const IntervalType subtract_interval{*cpu_dest_address, *cpu_dest_address + amount}; - ClearDownload(subtract_interval); + ClearDownload(*cpu_dest_address, amount); BufferId buffer_a; BufferId buffer_b; @@ -215,21 +215,20 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am .size = amount, }}; - boost::container::small_vector tmp_intervals; + boost::container::small_vector, 4> tmp_intervals; auto mirror = [&](DAddr base_address, DAddr base_address_end) { const u64 size = base_address_end - base_address; const DAddr diff = base_address - *cpu_src_address; const DAddr new_base_address = *cpu_dest_address + diff; - const IntervalType add_interval{new_base_address, new_base_address + size}; - tmp_intervals.push_back(add_interval); - uncommitted_ranges.add(add_interval); + tmp_intervals.push_back({new_base_address, size}); + uncommitted_gpu_modified_ranges.Add(new_base_address, size); }; - ForEachInRangeSet(common_ranges, *cpu_src_address, amount, mirror); + gpu_modified_ranges.ForEachInRange(*cpu_src_address, amount, mirror); // This subtraction in this order is important for overlapping copies. - common_ranges.subtract(subtract_interval); + gpu_modified_ranges.Subtract(*cpu_dest_address, amount); const bool has_new_downloads = tmp_intervals.size() != 0; - for (const IntervalType& add_interval : tmp_intervals) { - common_ranges.add(add_interval); + for (const auto& pair : tmp_intervals) { + gpu_modified_ranges.Add(pair.first, pair.second); } const auto& copy = copies[0]; src_buffer.MarkUsage(copy.src_offset, copy.size); @@ -257,9 +256,8 @@ bool BufferCache

::DMAClear(GPUVAddr dst_address, u64 amount, u32 value) { } const size_t size = amount * sizeof(u32); - const IntervalType subtract_interval{*cpu_dst_address, *cpu_dst_address + size}; - ClearDownload(subtract_interval); - common_ranges.subtract(subtract_interval); + ClearDownload(*cpu_dst_address, size); + gpu_modified_ranges.Subtract(*cpu_dst_address, size); const BufferId buffer = FindBuffer(*cpu_dst_address, static_cast(size)); Buffer& dest_buffer = slot_buffers[buffer]; @@ -300,11 +298,11 @@ std::pair BufferCache

::ObtainCPUBuffer( MarkWrittenBuffer(buffer_id, device_addr, size); break; case ObtainBufferOperation::DiscardWrite: { - DAddr device_addr_start = Common::AlignDown(device_addr, 64); - DAddr device_addr_end = Common::AlignUp(device_addr + size, 64); - IntervalType interval{device_addr_start, device_addr_end}; - ClearDownload(interval); - common_ranges.subtract(interval); + const DAddr device_addr_start = Common::AlignDown(device_addr, 64); + const DAddr device_addr_end = Common::AlignUp(device_addr + size, 64); + const size_t new_size = device_addr_end - device_addr_start; + ClearDownload(device_addr_start, new_size); + gpu_modified_ranges.Subtract(device_addr_start, new_size); break; } default: @@ -504,46 +502,40 @@ void BufferCache

::FlushCachedWrites() { template bool BufferCache

::HasUncommittedFlushes() const noexcept { - return !uncommitted_ranges.empty() || !committed_ranges.empty(); + return !uncommitted_gpu_modified_ranges.Empty() || !committed_gpu_modified_ranges.empty(); } template void BufferCache

::AccumulateFlushes() { - if (uncommitted_ranges.empty()) { + if (uncommitted_gpu_modified_ranges.Empty()) { return; } - committed_ranges.emplace_back(std::move(uncommitted_ranges)); + committed_gpu_modified_ranges.emplace_back(std::move(uncommitted_gpu_modified_ranges)); } template bool BufferCache

::ShouldWaitAsyncFlushes() const noexcept { - if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - return (!async_buffers.empty() && async_buffers.front().has_value()); - } else { - return false; - } + return (!async_buffers.empty() && async_buffers.front().has_value()); } template void BufferCache

::CommitAsyncFlushesHigh() { AccumulateFlushes(); - if (committed_ranges.empty()) { - if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - async_buffers.emplace_back(std::optional{}); - } + if (committed_gpu_modified_ranges.empty()) { + async_buffers.emplace_back(std::optional{}); return; } MICROPROFILE_SCOPE(GPU_DownloadMemory); - auto it = committed_ranges.begin(); - while (it != committed_ranges.end()) { + auto it = committed_gpu_modified_ranges.begin(); + while (it != committed_gpu_modified_ranges.end()) { auto& current_intervals = *it; auto next_it = std::next(it); - while (next_it != committed_ranges.end()) { - for (auto& interval : *next_it) { - current_intervals.subtract(interval); - } + while (next_it != committed_gpu_modified_ranges.end()) { + next_it->ForEach([&current_intervals](DAddr start, DAddr end) { + current_intervals.Subtract(start, end - start); + }); next_it++; } it++; @@ -552,10 +544,10 @@ void BufferCache

::CommitAsyncFlushesHigh() { boost::container::small_vector, 16> downloads; u64 total_size_bytes = 0; u64 largest_copy = 0; - for (const IntervalSet& intervals : committed_ranges) { - for (auto& interval : intervals) { - const std::size_t size = interval.upper() - interval.lower(); - const DAddr device_addr = interval.lower(); + for (const Common::RangeSet& range_set : committed_gpu_modified_ranges) { + range_set.ForEach([&](DAddr interval_lower, DAddr interval_upper) { + const std::size_t size = interval_upper - interval_lower; + const DAddr device_addr = interval_lower; ForEachBufferInRange(device_addr, size, [&](BufferId buffer_id, Buffer& buffer) { const DAddr buffer_start = buffer.CpuAddr(); const DAddr buffer_end = buffer_start + buffer.SizeBytes(); @@ -583,77 +575,35 @@ void BufferCache

::CommitAsyncFlushesHigh() { largest_copy = std::max(largest_copy, new_size); }; - ForEachInRangeSet(common_ranges, device_addr_out, range_size, add_download); + gpu_modified_ranges.ForEachInRange(device_addr_out, range_size, + add_download); }); }); - } + }); } - committed_ranges.clear(); + committed_gpu_modified_ranges.clear(); if (downloads.empty()) { - if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - async_buffers.emplace_back(std::optional{}); - } + async_buffers.emplace_back(std::optional{}); return; } - if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true); - boost::container::small_vector normalized_copies; - IntervalSet new_async_range{}; - runtime.PreCopyBarrier(); - for (auto& [copy, buffer_id] : downloads) { - copy.dst_offset += download_staging.offset; - const std::array copies{copy}; - BufferCopy second_copy{copy}; - Buffer& buffer = slot_buffers[buffer_id]; - second_copy.src_offset = static_cast(buffer.CpuAddr()) + copy.src_offset; - DAddr orig_device_addr = static_cast(second_copy.src_offset); - const IntervalType base_interval{orig_device_addr, orig_device_addr + copy.size}; - async_downloads += std::make_pair(base_interval, 1); - buffer.MarkUsage(copy.src_offset, copy.size); - runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); - normalized_copies.push_back(second_copy); - } - runtime.PostCopyBarrier(); - pending_downloads.emplace_back(std::move(normalized_copies)); - async_buffers.emplace_back(download_staging); - } else { - if (!Settings::IsGPULevelHigh()) { - committed_ranges.clear(); - uncommitted_ranges.clear(); - } else { - if constexpr (USE_MEMORY_MAPS) { - auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); - runtime.PreCopyBarrier(); - for (auto& [copy, buffer_id] : downloads) { - // Have in mind the staging buffer offset for the copy - copy.dst_offset += download_staging.offset; - const std::array copies{copy}; - Buffer& buffer = 
slot_buffers[buffer_id]; - buffer.MarkUsage(copy.src_offset, copy.size); - runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); - } - runtime.PostCopyBarrier(); - runtime.Finish(); - for (const auto& [copy, buffer_id] : downloads) { - const Buffer& buffer = slot_buffers[buffer_id]; - const DAddr device_addr = buffer.CpuAddr() + copy.src_offset; - // Undo the modified offset - const u64 dst_offset = copy.dst_offset - download_staging.offset; - const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset; - device_memory.WriteBlockUnsafe(device_addr, read_mapped_memory, copy.size); - } - } else { - const std::span immediate_buffer = ImmediateBuffer(largest_copy); - for (const auto& [copy, buffer_id] : downloads) { - Buffer& buffer = slot_buffers[buffer_id]; - buffer.ImmediateDownload(copy.src_offset, - immediate_buffer.subspan(0, copy.size)); - const DAddr device_addr = buffer.CpuAddr() + copy.src_offset; - device_memory.WriteBlockUnsafe(device_addr, immediate_buffer.data(), copy.size); - } - } - } + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true); + boost::container::small_vector normalized_copies; + runtime.PreCopyBarrier(); + for (auto& [copy, buffer_id] : downloads) { + copy.dst_offset += download_staging.offset; + const std::array copies{copy}; + BufferCopy second_copy{copy}; + Buffer& buffer = slot_buffers[buffer_id]; + second_copy.src_offset = static_cast(buffer.CpuAddr()) + copy.src_offset; + const DAddr orig_device_addr = static_cast(second_copy.src_offset); + async_downloads.Add(orig_device_addr, copy.size); + buffer.MarkUsage(copy.src_offset, copy.size); + runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); + normalized_copies.push_back(second_copy); } + runtime.PostCopyBarrier(); + pending_downloads.emplace_back(std::move(normalized_copies)); + async_buffers.emplace_back(download_staging); } template @@ -676,37 +626,31 @@ void BufferCache

::PopAsyncBuffers() { async_buffers.pop_front(); return; } - if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - auto& downloads = pending_downloads.front(); - auto& async_buffer = async_buffers.front(); - u8* base = async_buffer->mapped_span.data(); - const size_t base_offset = async_buffer->offset; - for (const auto& copy : downloads) { - const DAddr device_addr = static_cast(copy.src_offset); - const u64 dst_offset = copy.dst_offset - base_offset; - const u8* read_mapped_memory = base + dst_offset; - ForEachInOverlapCounter( - async_downloads, device_addr, copy.size, [&](DAddr start, DAddr end, int count) { - device_memory.WriteBlockUnsafe(start, &read_mapped_memory[start - device_addr], - end - start); - if (count == 1) { - const IntervalType base_interval{start, end}; - common_ranges.subtract(base_interval); - } - }); - const IntervalType subtract_interval{device_addr, device_addr + copy.size}; - RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1); - } - async_buffers_death_ring.emplace_back(*async_buffer); - async_buffers.pop_front(); - pending_downloads.pop_front(); + auto& downloads = pending_downloads.front(); + auto& async_buffer = async_buffers.front(); + u8* base = async_buffer->mapped_span.data(); + const size_t base_offset = async_buffer->offset; + for (const auto& copy : downloads) { + const DAddr device_addr = static_cast(copy.src_offset); + const u64 dst_offset = copy.dst_offset - base_offset; + const u8* read_mapped_memory = base + dst_offset; + async_downloads.ForEachInRange(device_addr, copy.size, [&](DAddr start, DAddr end, s32) { + device_memory.WriteBlockUnsafe(start, &read_mapped_memory[start - device_addr], + end - start); + }); + async_downloads.Subtract(device_addr, copy.size, [&](DAddr start, DAddr end) { + gpu_modified_ranges.Subtract(start, end - start); + }); } + async_buffers_death_ring.emplace_back(*async_buffer); + async_buffers.pop_front(); + pending_downloads.pop_front(); } template bool BufferCache

::IsRegionGpuModified(DAddr addr, size_t size) { bool is_dirty = false; - ForEachInRangeSet(common_ranges, addr, size, [&](DAddr, DAddr) { is_dirty = true; }); + gpu_modified_ranges.ForEachInRange(addr, size, [&](DAddr, DAddr) { is_dirty = true; }); return is_dirty; } @@ -1320,10 +1264,8 @@ void BufferCache

::UpdateComputeTextureBuffers() { template void BufferCache

::MarkWrittenBuffer(BufferId buffer_id, DAddr device_addr, u32 size) { memory_tracker.MarkRegionAsGpuModified(device_addr, size); - - const IntervalType base_interval{device_addr, device_addr + size}; - common_ranges.add(base_interval); - uncommitted_ranges.add(base_interval); + gpu_modified_ranges.Add(device_addr, size); + uncommitted_gpu_modified_ranges.Add(device_addr, size); } template @@ -1600,9 +1542,8 @@ bool BufferCache

::InlineMemory(DAddr dest_address, size_t copy_size, template void BufferCache

::InlineMemoryImplementation(DAddr dest_address, size_t copy_size, std::span inlined_buffer) { - const IntervalType subtract_interval{dest_address, dest_address + copy_size}; - ClearDownload(subtract_interval); - common_ranges.subtract(subtract_interval); + ClearDownload(dest_address, copy_size); + gpu_modified_ranges.Subtract(dest_address, copy_size); BufferId buffer_id = FindBuffer(dest_address, static_cast(copy_size)); auto& buffer = slot_buffers[buffer_id]; @@ -1652,12 +1593,9 @@ void BufferCache

::DownloadBufferMemory(Buffer& buffer, DAddr device_addr, u64 largest_copy = std::max(largest_copy, new_size); }; - const DAddr start_address = device_addr_out; - const DAddr end_address = start_address + range_size; - ForEachInRangeSet(common_ranges, start_address, range_size, add_download); - const IntervalType subtract_interval{start_address, end_address}; - ClearDownload(subtract_interval); - common_ranges.subtract(subtract_interval); + gpu_modified_ranges.ForEachInRange(device_addr_out, range_size, add_download); + ClearDownload(device_addr_out, range_size); + gpu_modified_ranges.Subtract(device_addr_out, range_size); }); if (total_size_bytes == 0) { return; diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index 59124458d..448516651 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -13,25 +13,15 @@ #include #include -#include -#define BOOST_NO_MT -#include -#undef BOOST_NO_MT -#include -#include -#include -#include -#include -#include -#include - #include "common/common_types.h" #include "common/div_ceil.h" #include "common/literals.h" #include "common/lru_cache.h" #include "common/microprofile.h" +#include "common/range_sets.h" #include "common/scope_exit.h" #include "common/settings.h" +#include "common/slot_vector.h" #include "video_core/buffer_cache/buffer_base.h" #include "video_core/control/channel_state_cache.h" #include "video_core/delayed_destruction_ring.h" @@ -41,14 +31,8 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/memory_manager.h" #include "video_core/surface.h" -#include "common/slot_vector.h" #include "video_core/texture_cache/types.h" -namespace boost { -template -class fast_pool_allocator; -} - namespace VideoCommon { MICROPROFILE_DECLARE(GPU_PrepareBuffers); @@ -184,7 +168,6 @@ class BufferCache : public VideoCommon::ChannelSetupCaches; - using IntervalInstance = 
boost::icl::interval_type_default; - using IntervalAllocator = boost::fast_pool_allocator; - using IntervalSet = boost::icl::interval_set; - using IntervalType = typename IntervalSet::interval_type; - - template - struct counter_add_functor : public boost::icl::identity_based_inplace_combine { - // types - typedef counter_add_functor type; - typedef boost::icl::identity_based_inplace_combine base_type; - - // public member functions - void operator()(Type& current, const Type& added) const { - current += added; - if (current < base_type::identity_element()) { - current = base_type::identity_element(); - } - } - - // public static functions - static void version(Type&){}; - }; - - using OverlapCombine = counter_add_functor; - using OverlapSection = boost::icl::inter_section; - using OverlapCounter = boost::icl::split_interval_map; - struct OverlapResult { boost::container::small_vector ids; DAddr begin; @@ -240,6 +195,8 @@ class BufferCache : public VideoCommon::ChannelSetupCaches - void ForEachInRangeSet(IntervalSet& current_range, DAddr device_addr, u64 size, Func&& func) { - const DAddr start_address = device_addr; - const DAddr end_address = start_address + size; - const IntervalType search_interval{start_address, end_address}; - auto it = current_range.lower_bound(search_interval); - if (it == current_range.end()) { - return; - } - auto end_it = current_range.upper_bound(search_interval); - for (; it != end_it; it++) { - DAddr inter_addr_end = it->upper(); - DAddr inter_addr = it->lower(); - if (inter_addr_end > end_address) { - inter_addr_end = end_address; - } - if (inter_addr < start_address) { - inter_addr = start_address; - } - func(inter_addr, inter_addr_end); - } - } - - template - void ForEachInOverlapCounter(OverlapCounter& current_range, DAddr device_addr, u64 size, - Func&& func) { - const DAddr start_address = device_addr; - const DAddr end_address = start_address + size; - const IntervalType search_interval{start_address, end_address}; - auto it = 
current_range.lower_bound(search_interval); - if (it == current_range.end()) { - return; - } - auto end_it = current_range.upper_bound(search_interval); - for (; it != end_it; it++) { - auto& inter = it->first; - DAddr inter_addr_end = inter.upper(); - DAddr inter_addr = inter.lower(); - if (inter_addr_end > end_address) { - inter_addr_end = end_address; - } - if (inter_addr < start_address) { - inter_addr = start_address; - } - func(inter_addr, inter_addr_end, it->second); - } - } - - void RemoveEachInOverlapCounter(OverlapCounter& current_range, - const IntervalType search_interval, int subtract_value) { - bool any_removals = false; - current_range.add(std::make_pair(search_interval, subtract_value)); - do { - any_removals = false; - auto it = current_range.lower_bound(search_interval); - if (it == current_range.end()) { - return; - } - auto end_it = current_range.upper_bound(search_interval); - for (; it != end_it; it++) { - if (it->second <= 0) { - any_removals = true; - current_range.erase(it); - break; - } - } - } while (any_removals); - } - static bool IsRangeGranular(DAddr device_addr, size_t size) { return (device_addr & ~Core::DEVICE_PAGEMASK) == ((device_addr + size) & ~Core::DEVICE_PAGEMASK); @@ -552,7 +440,7 @@ private: [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept; - void ClearDownload(IntervalType subtract_interval); + void ClearDownload(DAddr base_addr, u64 size); void InlineMemoryImplementation(DAddr dest_address, size_t copy_size, std::span inlined_buffer); @@ -567,13 +455,12 @@ private: u32 last_index_count = 0; MemoryTracker memory_tracker; - IntervalSet uncommitted_ranges; - IntervalSet common_ranges; - IntervalSet cached_ranges; - std::deque committed_ranges; + Common::RangeSet uncommitted_gpu_modified_ranges; + Common::RangeSet gpu_modified_ranges; + std::deque> committed_gpu_modified_ranges; // Async Buffers - OverlapCounter async_downloads; + Common::SplitRangeSet async_downloads; std::deque> 
async_buffers; std::deque> pending_downloads; std::optional current_buffer; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index 022275fd6..fd471e979 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -251,7 +251,6 @@ struct BufferCacheParams { static constexpr bool NEEDS_BIND_STORAGE_INDEX = true; static constexpr bool USE_MEMORY_MAPS = true; static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = true; - static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = true; // TODO: Investigate why OpenGL seems to perform worse with persistently mapped buffer uploads static constexpr bool USE_MEMORY_MAPS_FOR_UPLOADS = false; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index ac14c9f86..efe960258 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -181,7 +181,6 @@ struct BufferCacheParams { static constexpr bool NEEDS_BIND_STORAGE_INDEX = false; static constexpr bool USE_MEMORY_MAPS = true; static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = false; - static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = true; static constexpr bool USE_MEMORY_MAPS_FOR_UPLOADS = true; }; From fa47ac1c9f8b117d556c7c18ac9dcb062af5cefc Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Mon, 5 Feb 2024 12:46:49 +0100 Subject: [PATCH 5/5] Common: Rename SplitRangeSet to OverlapRangeSet --- src/common/range_sets.h | 20 +++--- src/common/range_sets.inc | 63 ++++++++++--------- .../hle/service/nvdrv/core/heap_mapper.cpp | 2 +- .../buffer_cache/buffer_cache_base.h | 2 +- 4 files changed, 45 insertions(+), 42 deletions(-) diff --git a/src/common/range_sets.h b/src/common/range_sets.h index f4ee00fec..f8fcee483 100644 --- a/src/common/range_sets.h +++ b/src/common/range_sets.h @@ -38,16 +38,16 @@ private: }; template -class SplitRangeSet { 
+class OverlapRangeSet { public: - SplitRangeSet(); - ~SplitRangeSet(); + OverlapRangeSet(); + ~OverlapRangeSet(); - SplitRangeSet(SplitRangeSet const&) = delete; - SplitRangeSet& operator=(SplitRangeSet const&) = delete; + OverlapRangeSet(OverlapRangeSet const&) = delete; + OverlapRangeSet& operator=(OverlapRangeSet const&) = delete; - SplitRangeSet(SplitRangeSet&& other); - SplitRangeSet& operator=(SplitRangeSet&& other); + OverlapRangeSet(OverlapRangeSet&& other); + OverlapRangeSet& operator=(OverlapRangeSet&& other); void Add(AddressType base_address, size_t size); void Subtract(AddressType base_address, size_t size); @@ -66,8 +66,8 @@ public: void ForEachInRange(AddressType device_addr, size_t size, Func&& func) const; private: - struct SplitRangeSetImpl; - std::unique_ptr m_impl; + struct OverlapRangeSetImpl; + std::unique_ptr m_impl; }; -} // namespace Common \ No newline at end of file +} // namespace Common diff --git a/src/common/range_sets.inc b/src/common/range_sets.inc index 705ebd4a1..b83eceb7b 100644 --- a/src/common/range_sets.inc +++ b/src/common/range_sets.inc @@ -19,14 +19,18 @@ namespace Common { +namespace { +template +using RangeSetsAllocator = + boost::fast_pool_allocator; +} + template struct RangeSet::RangeSetImpl { - template - using MyAllocator = boost::fast_pool_allocator; using IntervalSet = boost::icl::interval_set< AddressType, std::less, ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, AddressType, std::less), - MyAllocator>; + RangeSetsAllocator>; using IntervalType = typename IntervalSet::interval_type; RangeSetImpl() = default; @@ -88,18 +92,15 @@ struct RangeSet::RangeSetImpl { }; template -struct SplitRangeSet::SplitRangeSetImpl { - template - using MyAllocator = boost::fast_pool_allocator; +struct OverlapRangeSet::OverlapRangeSetImpl { using IntervalSet = boost::icl::split_interval_map< AddressType, s32, boost::icl::partial_enricher, std::less, boost::icl::inplace_plus, boost::icl::inter_section, - 
ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, AddressType, std::less), MyAllocator>; + ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, AddressType, std::less), RangeSetsAllocator>; using IntervalType = typename IntervalSet::interval_type; - SplitRangeSetImpl() = default; - ~SplitRangeSetImpl() = default; + OverlapRangeSetImpl() = default; + ~OverlapRangeSetImpl() = default; void Add(AddressType base_address, size_t size) { AddressType end_address = base_address + static_cast(size); @@ -160,7 +161,7 @@ struct SplitRangeSet::SplitRangeSetImpl { } const AddressType start_address = base_address; const AddressType end_address = start_address + size; - const SplitRangeSetImpl::IntervalType search_interval{start_address, end_address}; + const OverlapRangeSetImpl::IntervalType search_interval{start_address, end_address}; auto it = m_split_ranges_set.lower_bound(search_interval); if (it == m_split_ranges_set.end()) { return; @@ -230,72 +231,74 @@ void RangeSet::ForEach(Func&& func) const { template template -void RangeSet::ForEachInRange(AddressType base_address, size_t size, Func&& func) const { +void RangeSet::ForEachInRange(AddressType base_address, size_t size, + Func&& func) const { m_impl->ForEachInRange(base_address, size, std::move(func)); } template -SplitRangeSet::SplitRangeSet() { - m_impl = std::make_unique::SplitRangeSetImpl>(); +OverlapRangeSet::OverlapRangeSet() { + m_impl = std::make_unique::OverlapRangeSetImpl>(); } template -SplitRangeSet::~SplitRangeSet() = default; +OverlapRangeSet::~OverlapRangeSet() = default; template -SplitRangeSet::SplitRangeSet(SplitRangeSet&& other) { - m_impl = std::make_unique::SplitRangeSetImpl>(); +OverlapRangeSet::OverlapRangeSet(OverlapRangeSet&& other) { + m_impl = std::make_unique::OverlapRangeSetImpl>(); m_impl->m_split_ranges_set = std::move(other.m_impl->m_split_ranges_set); } template -SplitRangeSet& SplitRangeSet::operator=(SplitRangeSet&& other) { +OverlapRangeSet& OverlapRangeSet::operator=(OverlapRangeSet&& other) { 
m_impl->m_split_ranges_set = std::move(other.m_impl->m_split_ranges_set); } template -void SplitRangeSet::Add(AddressType base_address, size_t size) { +void OverlapRangeSet::Add(AddressType base_address, size_t size) { m_impl->Add(base_address, size); } template -void SplitRangeSet::Subtract(AddressType base_address, size_t size) { +void OverlapRangeSet::Subtract(AddressType base_address, size_t size) { m_impl->template Subtract(base_address, size, 1, [](AddressType, AddressType) {}); } template template -void SplitRangeSet::Subtract(AddressType base_address, size_t size, Func&& on_delete) { +void OverlapRangeSet::Subtract(AddressType base_address, size_t size, + Func&& on_delete) { m_impl->template Subtract(base_address, size, 1, std::move(on_delete)); } template -void SplitRangeSet::DeleteAll(AddressType base_address, size_t size) { +void OverlapRangeSet::DeleteAll(AddressType base_address, size_t size) { m_impl->template Subtract(base_address, size, std::numeric_limits::max(), - [](AddressType, AddressType) {}); + [](AddressType, AddressType) {}); } template -void SplitRangeSet::Clear() { +void OverlapRangeSet::Clear() { m_impl->m_split_ranges_set.clear(); } template -bool SplitRangeSet::Empty() const { +bool OverlapRangeSet::Empty() const { return m_impl->m_split_ranges_set.empty(); } template template -void SplitRangeSet::ForEach(Func&& func) const { +void OverlapRangeSet::ForEach(Func&& func) const { m_impl->ForEach(func); } template template -void SplitRangeSet::ForEachInRange(AddressType base_address, size_t size, - Func&& func) const { +void OverlapRangeSet::ForEachInRange(AddressType base_address, size_t size, + Func&& func) const { m_impl->ForEachInRange(base_address, size, std::move(func)); } -} // namespace Common \ No newline at end of file +} // namespace Common diff --git a/src/core/hle/service/nvdrv/core/heap_mapper.cpp b/src/core/hle/service/nvdrv/core/heap_mapper.cpp index 542125a1c..af17e3e85 100644 --- 
a/src/core/hle/service/nvdrv/core/heap_mapper.cpp +++ b/src/core/hle/service/nvdrv/core/heap_mapper.cpp @@ -15,7 +15,7 @@ struct HeapMapper::HeapMapperInternal { ~HeapMapperInternal() = default; Common::RangeSet m_temporary_set; - Common::SplitRangeSet m_mapped_ranges; + Common::OverlapRangeSet m_mapped_ranges; Tegra::MaxwellDeviceMemoryManager& m_device_memory; std::mutex m_guard; }; diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index 448516651..240e9f015 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -460,7 +460,7 @@ private: std::deque> committed_gpu_modified_ranges; // Async Buffers - Common::SplitRangeSet async_downloads; + Common::OverlapRangeSet async_downloads; std::deque> async_buffers; std::deque> pending_downloads; std::optional current_buffer;