From a5a94f52ffcbf3119d272a9369021a213ea6dad2 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Wed, 9 Feb 2022 15:00:05 +0100 Subject: [PATCH] MacroHLE: Add MultidrawIndirect HLE Macro. --- src/video_core/buffer_cache/buffer_cache.h | 22 +++++++ src/video_core/dma_pusher.cpp | 14 +++-- src/video_core/dma_pusher.h | 1 + src/video_core/engines/draw_manager.cpp | 21 +++++++ src/video_core/engines/draw_manager.h | 20 +++++++ src/video_core/engines/engine_interface.h | 2 + src/video_core/macro/macro_hle.cpp | 53 ++++++++--------- src/video_core/rasterizer_interface.h | 3 + .../renderer_vulkan/vk_rasterizer.cpp | 57 ++++++++++++++----- .../renderer_vulkan/vk_rasterizer.h | 4 ++ .../vulkan_common/vulkan_device.cpp | 2 +- .../vulkan_common/vulkan_wrapper.cpp | 2 + src/video_core/vulkan_common/vulkan_wrapper.h | 15 +++++ 13 files changed, 169 insertions(+), 47 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index f1c60d1f3..99abe0edf 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -170,6 +170,9 @@ public: void BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size, PixelFormat format, bool is_written, bool is_image); + [[nodiscard]] std::pair ObtainBuffer(GPUVAddr gpu_addr, u32 size, + bool synchronize, bool mark_as_written); + void FlushCachedWrites(); /// Return true when there are uncommitted buffers to be downloaded @@ -790,6 +793,25 @@ void BufferCache

::BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_add compute_texture_buffers[tbo_index] = GetTextureBufferBinding(gpu_addr, size, format); } +template +std::pair BufferCache

::ObtainBuffer(GPUVAddr gpu_addr, u32 size, + bool synchronize, + bool mark_as_written) { + const std::optional cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr); + if (!cpu_addr) { + return {&slot_buffers[NULL_BUFFER_ID], 0}; + } + const BufferId buffer_id = FindBuffer(*cpu_addr, size); + Buffer& buffer = slot_buffers[buffer_id]; + if (synchronize) { + SynchronizeBuffer(buffer, *cpu_addr, size); + } + if (mark_as_written) { + MarkWrittenBuffer(buffer_id, *cpu_addr, size); + } + return {&buffer, buffer.Offset(*cpu_addr)}; +} + template void BufferCache

::FlushCachedWrites() { for (const BufferId buffer_id : cached_write_buffer_ids) { diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 322de2606..eb1371612 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -61,7 +61,7 @@ bool DmaPusher::Step() { } else { const CommandListHeader command_list_header{ command_list.command_lists[dma_pushbuffer_subindex++]}; - const GPUVAddr dma_get = command_list_header.addr; + dma_state.dma_get = command_list_header.addr; if (dma_pushbuffer_subindex >= command_list.command_lists.size()) { // We've gone through the current list, remove it from the queue @@ -75,11 +75,11 @@ bool DmaPusher::Step() { // Push buffer non-empty, read a word command_headers.resize_destructive(command_list_header.size); - if (Settings::IsGPULevelHigh()) { - memory_manager.ReadBlock(dma_get, command_headers.data(), + if (Settings::IsGPULevelExtreme()) { + memory_manager.ReadBlock(dma_state.dma_get, command_headers.data(), command_list_header.size * sizeof(u32)); } else { - memory_manager.ReadBlockUnsafe(dma_get, command_headers.data(), + memory_manager.ReadBlockUnsafe(dma_state.dma_get, command_headers.data(), command_list_header.size * sizeof(u32)); } ProcessCommands(command_headers); @@ -174,8 +174,10 @@ void DmaPusher::CallMultiMethod(const u32* base_start, u32 num_methods) const { puller.CallMultiMethod(dma_state.method, dma_state.subchannel, base_start, num_methods, dma_state.method_count); } else { - subchannels[dma_state.subchannel]->CallMultiMethod(dma_state.method, base_start, - num_methods, dma_state.method_count); + auto subchannel = subchannels[dma_state.subchannel]; + subchannel->current_dma_segment = dma_state.dma_get; + subchannel->CallMultiMethod(dma_state.method, base_start, num_methods, + dma_state.method_count); } } diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h index 6f00de937..ca0899ba7 100644 --- a/src/video_core/dma_pusher.h +++ b/src/video_core/dma_pusher.h @@ -156,6 +156,7 @@ private: u32 subchannel; ///< Current subchannel u32 method_count; ///< Current method count u32 length_pending; ///< Large NI command length pending + GPUVAddr dma_get; ///< Currently read segment bool non_incrementing; ///< Current command's NI flag bool is_last_call; }; diff --git a/src/video_core/engines/draw_manager.cpp b/src/video_core/engines/draw_manager.cpp index 3a78421f6..4fa77b684 100644 --- a/src/video_core/engines/draw_manager.cpp +++ b/src/video_core/engines/draw_manager.cpp @@ -91,6 +91,16 @@ void DrawManager::DrawIndex(PrimitiveTopology topology, u32 index_first, u32 ind ProcessDraw(true, num_instances); } +void DrawManager::DrawIndexedIndirect(PrimitiveTopology topology, u32 index_first, u32 index_count) { + const auto& regs{maxwell3d->regs}; + draw_state.topology = topology; + draw_state.index_buffer = regs.index_buffer; + draw_state.index_buffer.first = index_first; + draw_state.index_buffer.count = index_count; + + ProcessDrawIndirect(true); +} + void DrawManager::SetInlineIndexBuffer(u32 index) { draw_state.inline_index_draw_indexes.push_back(static_cast(index & 0x000000ff)); draw_state.inline_index_draw_indexes.push_back(static_cast((index & 0x0000ff00) >> 8)); @@ -198,4 +208,15 @@ void DrawManager::ProcessDraw(bool draw_indexed, u32 instance_count) { maxwell3d->rasterizer->Draw(draw_indexed, instance_count); } } + +void DrawManager::ProcessDrawIndirect(bool draw_indexed) { + LOG_TRACE(HW_GPU, "called, topology={}, count={}", draw_state.topology, + draw_indexed ? draw_state.index_buffer.count : draw_state.vertex_buffer.count); + + UpdateTopology(); + + if (maxwell3d->ShouldExecute()) { + maxwell3d->rasterizer->DrawIndirect(draw_indexed); + } +} } // namespace Tegra::Engines diff --git a/src/video_core/engines/draw_manager.h b/src/video_core/engines/draw_manager.h index 0e6930a9c..0cdb37f83 100644 --- a/src/video_core/engines/draw_manager.h +++ b/src/video_core/engines/draw_manager.h @@ -32,6 +32,13 @@ public: std::vector inline_index_draw_indexes; }; + struct IndirectParams { + GPUVAddr start_address; + size_t buffer_size; + size_t max_draw_counts; + size_t stride; + }; + explicit DrawManager(Maxwell3D* maxwell_3d); void ProcessMethodCall(u32 method, u32 argument); @@ -46,10 +53,20 @@ public: void DrawIndex(PrimitiveTopology topology, u32 index_first, u32 index_count, u32 base_index, u32 base_instance, u32 num_instances); + void DrawIndexedIndirect(PrimitiveTopology topology, u32 index_first, u32 index_count); + const State& GetDrawState() const { return draw_state; } + IndirectParams& GetIndirectParams() { + return indirect_state; + } + + const IndirectParams& GetIndirectParams() const { + return indirect_state; + } + private: void SetInlineIndexBuffer(u32 index); @@ -63,7 +80,10 @@ private: void ProcessDraw(bool draw_indexed, u32 instance_count); + void ProcessDrawIndirect(bool draw_indexed); + Maxwell3D* maxwell3d{}; State draw_state{}; + IndirectParams indirect_state{}; }; } // namespace Tegra::Engines diff --git a/src/video_core/engines/engine_interface.h b/src/video_core/engines/engine_interface.h index 26cde8584..76630272d 100644 --- a/src/video_core/engines/engine_interface.h +++ b/src/video_core/engines/engine_interface.h @@ -17,6 +17,8 @@ public: /// Write multiple values to the register identified by method. virtual void CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32 methods_pending) = 0; + + GPUVAddr current_dma_segment; }; } // namespace Tegra::Engines diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp index 8549db2e4..1cc202cc7 100644 --- a/src/video_core/macro/macro_hle.cpp +++ b/src/video_core/macro/macro_hle.cpp @@ -53,42 +53,43 @@ void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector& // Multidraw Indirect void HLE_3F5E74B9C9A50164(Engines::Maxwell3D& maxwell3d, const std::vector& parameters) { - SCOPE_EXIT({ - // Clean everything. - maxwell3d.regs.vertex_id_base = 0x0; - maxwell3d.CallMethod(0x8e3, 0x640, true); - maxwell3d.CallMethod(0x8e4, 0x0, true); - maxwell3d.CallMethod(0x8e5, 0x0, true); - maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; - }); const u32 start_indirect = parameters[0]; const u32 end_indirect = parameters[1]; if (start_indirect >= end_indirect) { // Nothing to do. return; } - const u32 padding = parameters[3]; - const std::size_t max_draws = parameters[4]; + const auto topology = + static_cast(parameters[2]); + const u32 padding = parameters[3]; // padding is in words + // size of each indirect segment const u32 indirect_words = 5 + padding; - const std::size_t first_draw = start_indirect; - const std::size_t effective_draws = end_indirect - start_indirect; - const std::size_t last_draw = start_indirect + std::min(effective_draws, max_draws); - - for (std::size_t index = first_draw; index < last_draw; index++) { + const u32 stride = indirect_words * sizeof(u32); + const GPUVAddr start_address = maxwell3d.current_dma_segment + 4 * sizeof(u32); + const std::size_t draw_count = end_indirect - start_indirect; + u32 lowest_first = std::numeric_limits::max(); + u32 highest_limit = std::numeric_limits::min(); + for (std::size_t index = 0; index < draw_count; index++) { const std::size_t base = index * indirect_words + 5; - const u32 base_vertex = parameters[base + 3]; - const u32 base_instance = parameters[base + 4]; - maxwell3d.regs.vertex_id_base = base_vertex; - maxwell3d.CallMethod(0x8e3, 0x640, true); - maxwell3d.CallMethod(0x8e4, base_vertex, true); - maxwell3d.CallMethod(0x8e5, base_instance, true); - maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; - maxwell3d.draw_manager->DrawIndex( - static_cast(parameters[2]), - parameters[base + 2], parameters[base], base_vertex, base_instance, - parameters[base + 1]); + const u32 count = parameters[base]; + const u32 first_index = parameters[base + 2]; + lowest_first = std::min(lowest_first, first_index); + highest_limit = std::max(highest_limit, first_index + count); } + + const u32 base_vertex = parameters[8]; + const u32 base_instance = parameters[9]; + maxwell3d.CallMethod(0x8e3, 0x640, true); + maxwell3d.CallMethod(0x8e4, base_vertex, true); + maxwell3d.CallMethod(0x8e5, base_instance, true); + auto& params = maxwell3d.draw_manager->GetIndirectParams(); + params.start_address = start_address; + params.buffer_size = sizeof(u32) + stride * draw_count; + params.max_draw_counts = draw_count; + params.stride = stride; + maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; + maxwell3d.draw_manager->DrawIndexedIndirect(topology, 0, highest_limit); } // Multi-layer Clear diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index b6907463c..a2a651f34 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -42,6 +42,9 @@ public: /// Dispatches a draw invocation virtual void Draw(bool is_indexed, u32 instance_count) = 0; + /// Dispatches an indirect draw invocation + virtual void DrawIndirect(bool is_indexed) {} + /// Clear the current framebuffer virtual void Clear(u32 layer_count) = 0; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index ac1eb9895..9b75f33dd 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -180,7 +180,8 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra RasterizerVulkan::~RasterizerVulkan() = default; -void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) { +template +void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { MICROPROFILE_SCOPE(Vulkan_Drawing); SCOPE_EXIT({ gpu.TickWork(); }); @@ -201,22 +202,50 @@ void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) { UpdateDynamicStates(); - const auto& draw_state = maxwell3d->draw_manager->GetDrawState(); - const u32 num_instances{instance_count}; - const DrawParams draw_params{MakeDrawParams(draw_state, num_instances, is_indexed)}; - scheduler.Record([draw_params](vk::CommandBuffer cmdbuf) { - if (draw_params.is_indexed) { - cmdbuf.DrawIndexed(draw_params.num_vertices, draw_params.num_instances, - draw_params.first_index, draw_params.base_vertex, - draw_params.base_instance); - } else { - cmdbuf.Draw(draw_params.num_vertices, draw_params.num_instances, - draw_params.base_vertex, draw_params.base_instance); - } - }); + draw_func(); + EndTransformFeedback(); } +void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) { + PrepareDraw(is_indexed, [this, is_indexed, instance_count] { + const auto& draw_state = maxwell3d->draw_manager->GetDrawState(); + const u32 num_instances{instance_count}; + const DrawParams draw_params{MakeDrawParams(draw_state, num_instances, is_indexed)}; + scheduler.Record([draw_params](vk::CommandBuffer cmdbuf) { + if (draw_params.is_indexed) { + cmdbuf.DrawIndexed(draw_params.num_vertices, draw_params.num_instances, + draw_params.first_index, draw_params.base_vertex, + draw_params.base_instance); + } else { + cmdbuf.Draw(draw_params.num_vertices, draw_params.num_instances, + draw_params.base_vertex, draw_params.base_instance); + } + }); + }); +} + +void RasterizerVulkan::DrawIndirect(bool is_indexed) { + PrepareDraw(is_indexed, [this, is_indexed] { + const auto params = maxwell3d->draw_manager->GetIndirectParams(); + const auto [buffer, offset] = buffer_cache.ObtainBuffer( + params.start_address, static_cast(params.buffer_size), true, false); + scheduler.Record([buffer_obj = buffer->Handle(), offset, + max_draw_counts = params.max_draw_counts, stride = params.stride, + is_indexed](vk::CommandBuffer cmdbuf) { + if (is_indexed) { + cmdbuf.DrawIndexedIndirectCount(buffer_obj, offset + 4ULL, buffer_obj, offset, + static_cast(max_draw_counts), + static_cast(stride)); + } else { + cmdbuf.DrawIndirectCount(buffer_obj, offset + 4ULL, buffer_obj, offset, + static_cast(max_draw_counts), + static_cast(stride)); + } + }); + }); +} + void RasterizerVulkan::Clear(u32 layer_count) { MICROPROFILE_SCOPE(Vulkan_Clearing); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index ee483cfd9..bc43a8a1f 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -65,6 +65,7 @@ public: ~RasterizerVulkan() override; void Draw(bool is_indexed, u32 instance_count) override; + void DrawIndirect(bool is_indexed) override; void Clear(u32 layer_count) override; void DispatchCompute() override; void ResetCounter(VideoCore::QueryType type) override; @@ -114,6 +115,9 @@ private: static constexpr VkDeviceSize DEFAULT_BUFFER_SIZE = 4 * sizeof(float); + template + void PrepareDraw(bool is_indexed, Func&&); + void FlushWork(); void UpdateDynamicStates(); diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index c4d31681a..477fc428b 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -350,7 +350,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR .sampleRateShading = true, .dualSrcBlend = true, .logicOp = true, - .multiDrawIndirect = false, + .multiDrawIndirect = true, .drawIndirectFirstInstance = false, .depthClamp = true, .depthBiasClamp = true, diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index 7dca7341c..c58c4c1c4 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -94,6 +94,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdDispatch); X(vkCmdDraw); X(vkCmdDrawIndexed); + X(vkCmdDrawIndirectCount); + X(vkCmdDrawIndexedIndirectCount); X(vkCmdEndQuery); X(vkCmdEndRenderPass); X(vkCmdEndTransformFeedbackEXT); diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 8bd4fd4d9..9bd158dce 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -213,6 +213,8 @@ struct DeviceDispatch : InstanceDispatch { PFN_vkCmdDispatch vkCmdDispatch{}; PFN_vkCmdDraw vkCmdDraw{}; PFN_vkCmdDrawIndexed vkCmdDrawIndexed{}; + PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{}; + PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{}; PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{}; PFN_vkCmdEndQuery vkCmdEndQuery{}; PFN_vkCmdEndRenderPass vkCmdEndRenderPass{}; @@ -1019,6 +1021,19 @@ public: first_instance); } + void DrawIndirectCount(VkBuffer src_buffer, VkDeviceSize src_offset, VkBuffer count_buffer, + VkDeviceSize count_offset, u32 draw_count, u32 stride) const noexcept { + dld->vkCmdDrawIndirectCount(handle, src_buffer, src_offset, count_buffer, count_offset, + draw_count, stride); + } + + void DrawIndexedIndirectCount(VkBuffer src_buffer, VkDeviceSize src_offset, + VkBuffer count_buffer, VkDeviceSize count_offset, u32 draw_count, + u32 stride) const noexcept { + dld->vkCmdDrawIndexedIndirectCount(handle, src_buffer, src_offset, count_buffer, + count_offset, draw_count, stride); + } + void ClearAttachments(Span attachments, Span rects) const noexcept { dld->vkCmdClearAttachments(handle, attachments.size(), attachments.data(), rects.size(),