Rasterizer: Implement Inline2Memory Acceleration.
This commit is contained in:
parent
f54280dafd
commit
4258d515e6
14 changed files with 122 additions and 6 deletions
|
@ -131,6 +131,8 @@ public:
|
||||||
|
|
||||||
void DownloadMemory(VAddr cpu_addr, u64 size);
|
void DownloadMemory(VAddr cpu_addr, u64 size);
|
||||||
|
|
||||||
|
bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<u8> inlined_buffer);
|
||||||
|
|
||||||
void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
|
void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
|
||||||
|
|
||||||
void DisableGraphicsUniformBuffer(size_t stage, u32 index);
|
void DisableGraphicsUniformBuffer(size_t stage, u32 index);
|
||||||
|
@ -808,6 +810,8 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
MICROPROFILE_SCOPE(GPU_DownloadMemory);
|
MICROPROFILE_SCOPE(GPU_DownloadMemory);
|
||||||
|
const bool is_accuracy_normal =
|
||||||
|
Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::Normal;
|
||||||
|
|
||||||
boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads;
|
boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads;
|
||||||
u64 total_size_bytes = 0;
|
u64 total_size_bytes = 0;
|
||||||
|
@ -819,6 +823,9 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
|
||||||
ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
|
ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
|
||||||
buffer.ForEachDownloadRangeAndClear(
|
buffer.ForEachDownloadRangeAndClear(
|
||||||
cpu_addr, size, [&](u64 range_offset, u64 range_size) {
|
cpu_addr, size, [&](u64 range_offset, u64 range_size) {
|
||||||
|
if (is_accuracy_normal) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
const VAddr buffer_addr = buffer.CpuAddr();
|
const VAddr buffer_addr = buffer.CpuAddr();
|
||||||
const auto add_download = [&](VAddr start, VAddr end) {
|
const auto add_download = [&](VAddr start, VAddr end) {
|
||||||
const u64 new_offset = start - buffer_addr;
|
const u64 new_offset = start - buffer_addr;
|
||||||
|
@ -1417,10 +1424,8 @@ void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 s
|
||||||
const IntervalType base_interval{cpu_addr, cpu_addr + size};
|
const IntervalType base_interval{cpu_addr, cpu_addr + size};
|
||||||
common_ranges.add(base_interval);
|
common_ranges.add(base_interval);
|
||||||
|
|
||||||
const bool is_accuracy_high =
|
|
||||||
Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High;
|
|
||||||
const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
|
const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
|
||||||
if (!is_async && !is_accuracy_high) {
|
if (!is_async) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
uncommitted_ranges.add(base_interval);
|
uncommitted_ranges.add(base_interval);
|
||||||
|
@ -1643,6 +1648,41 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
|
||||||
runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
|
runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <class P>
|
||||||
|
bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size,
|
||||||
|
std::span<u8> inlined_buffer) {
|
||||||
|
const bool is_dirty = IsRegionRegistered(dest_address, copy_size);
|
||||||
|
if (!is_dirty) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (!IsRegionGpuModified(dest_address, copy_size)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const IntervalType subtract_interval{dest_address, dest_address + copy_size};
|
||||||
|
ClearDownload(subtract_interval);
|
||||||
|
|
||||||
|
BufferId buffer_id = FindBuffer(dest_address, static_cast<u32>(copy_size));
|
||||||
|
auto& buffer = slot_buffers[buffer_id];
|
||||||
|
SynchronizeBuffer(buffer, dest_address, static_cast<u32>(copy_size));
|
||||||
|
|
||||||
|
if constexpr (USE_MEMORY_MAPS) {
|
||||||
|
std::array copies{BufferCopy{
|
||||||
|
.src_offset = 0,
|
||||||
|
.dst_offset = buffer.Offset(dest_address),
|
||||||
|
.size = copy_size,
|
||||||
|
}};
|
||||||
|
auto upload_staging = runtime.UploadStagingBuffer(copy_size);
|
||||||
|
u8* const src_pointer = upload_staging.mapped_span.data();
|
||||||
|
std::memcpy(src_pointer, inlined_buffer.data(), copy_size);
|
||||||
|
runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
|
||||||
|
} else {
|
||||||
|
buffer.ImmediateUpload(buffer.Offset(dest_address), inlined_buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
template <class P>
|
template <class P>
|
||||||
void BufferCache<P>::DownloadBufferMemory(Buffer& buffer) {
|
void BufferCache<P>::DownloadBufferMemory(Buffer& buffer) {
|
||||||
DownloadBufferMemory(buffer, buffer.CpuAddr(), buffer.SizeBytes());
|
DownloadBufferMemory(buffer, buffer.CpuAddr(), buffer.SizeBytes());
|
||||||
|
|
|
@ -7,6 +7,7 @@
|
||||||
#include "common/assert.h"
|
#include "common/assert.h"
|
||||||
#include "video_core/engines/engine_upload.h"
|
#include "video_core/engines/engine_upload.h"
|
||||||
#include "video_core/memory_manager.h"
|
#include "video_core/memory_manager.h"
|
||||||
|
#include "video_core/rasterizer_interface.h"
|
||||||
#include "video_core/textures/decoders.h"
|
#include "video_core/textures/decoders.h"
|
||||||
|
|
||||||
namespace Tegra::Engines::Upload {
|
namespace Tegra::Engines::Upload {
|
||||||
|
@ -16,6 +17,10 @@ State::State(MemoryManager& memory_manager_, Registers& regs_)
|
||||||
|
|
||||||
State::~State() = default;
|
State::~State() = default;
|
||||||
|
|
||||||
|
/// Remembers the rasterizer that this upload state forwards inline writes to.
void State::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
    this->rasterizer = rasterizer_;
}
|
||||||
|
|
||||||
void State::ProcessExec(const bool is_linear_) {
|
void State::ProcessExec(const bool is_linear_) {
|
||||||
write_offset = 0;
|
write_offset = 0;
|
||||||
copy_size = regs.line_length_in * regs.line_count;
|
copy_size = regs.line_length_in * regs.line_count;
|
||||||
|
@ -32,8 +37,7 @@ void State::ProcessData(const u32 data, const bool is_last_call) {
|
||||||
}
|
}
|
||||||
const GPUVAddr address{regs.dest.Address()};
|
const GPUVAddr address{regs.dest.Address()};
|
||||||
if (is_linear) {
|
if (is_linear) {
|
||||||
memory_manager.FlushRegion(address, copy_size);
|
rasterizer->AccelerateInline2Memory(address, copy_size, inner_buffer);
|
||||||
memory_manager.WriteBlock(address, inner_buffer.data(), copy_size);
|
|
||||||
} else {
|
} else {
|
||||||
UNIMPLEMENTED_IF(regs.dest.z != 0);
|
UNIMPLEMENTED_IF(regs.dest.z != 0);
|
||||||
UNIMPLEMENTED_IF(regs.dest.depth != 1);
|
UNIMPLEMENTED_IF(regs.dest.depth != 1);
|
||||||
|
|
|
@ -12,6 +12,10 @@ namespace Tegra {
|
||||||
class MemoryManager;
|
class MemoryManager;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace VideoCore {
|
||||||
|
class RasterizerInterface;
|
||||||
|
}
|
||||||
|
|
||||||
namespace Tegra::Engines::Upload {
|
namespace Tegra::Engines::Upload {
|
||||||
|
|
||||||
struct Registers {
|
struct Registers {
|
||||||
|
@ -60,6 +64,9 @@ public:
|
||||||
void ProcessExec(bool is_linear_);
|
void ProcessExec(bool is_linear_);
|
||||||
void ProcessData(u32 data, bool is_last_call);
|
void ProcessData(u32 data, bool is_last_call);
|
||||||
|
|
||||||
|
/// Binds a rasterizer to this engine.
|
||||||
|
void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
u32 write_offset = 0;
|
u32 write_offset = 0;
|
||||||
u32 copy_size = 0;
|
u32 copy_size = 0;
|
||||||
|
@ -68,6 +75,7 @@ private:
|
||||||
bool is_linear = false;
|
bool is_linear = false;
|
||||||
Registers& regs;
|
Registers& regs;
|
||||||
MemoryManager& memory_manager;
|
MemoryManager& memory_manager;
|
||||||
|
VideoCore::RasterizerInterface* rasterizer = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace Tegra::Engines::Upload
|
} // namespace Tegra::Engines::Upload
|
||||||
|
|
|
@ -22,6 +22,7 @@ KeplerCompute::~KeplerCompute() = default;
|
||||||
|
|
||||||
/// Binds the rasterizer to this engine and to its upload state.
void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
    // Both the engine and its upload state end up with the same pointer.
    upload_state.BindRasterizer(rasterizer_);
    rasterizer = rasterizer_;
}
|
||||||
|
|
||||||
void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
|
void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
|
||||||
|
|
|
@ -19,6 +19,10 @@ KeplerMemory::KeplerMemory(Core::System& system_, MemoryManager& memory_manager)
|
||||||
|
|
||||||
KeplerMemory::~KeplerMemory() = default;
|
KeplerMemory::~KeplerMemory() = default;
|
||||||
|
|
||||||
|
/// Passes the rasterizer on to the upload state owned by this engine.
void KeplerMemory::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
    this->upload_state.BindRasterizer(rasterizer_);
}
|
||||||
|
|
||||||
void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
|
void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
|
||||||
ASSERT_MSG(method < Regs::NUM_REGS,
|
ASSERT_MSG(method < Regs::NUM_REGS,
|
||||||
"Invalid KeplerMemory register, increase the size of the Regs structure");
|
"Invalid KeplerMemory register, increase the size of the Regs structure");
|
||||||
|
|
|
@ -22,6 +22,10 @@ namespace Tegra {
|
||||||
class MemoryManager;
|
class MemoryManager;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace VideoCore {
|
||||||
|
class RasterizerInterface;
|
||||||
|
}
|
||||||
|
|
||||||
namespace Tegra::Engines {
|
namespace Tegra::Engines {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -38,6 +42,9 @@ public:
|
||||||
explicit KeplerMemory(Core::System& system_, MemoryManager& memory_manager);
|
explicit KeplerMemory(Core::System& system_, MemoryManager& memory_manager);
|
||||||
~KeplerMemory() override;
|
~KeplerMemory() override;
|
||||||
|
|
||||||
|
/// Binds a rasterizer to this engine.
|
||||||
|
void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
|
||||||
|
|
||||||
/// Write the value to the register identified by method.
|
/// Write the value to the register identified by method.
|
||||||
void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;
|
void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;
|
||||||
|
|
||||||
|
|
|
@ -31,6 +31,7 @@ Maxwell3D::~Maxwell3D() = default;
|
||||||
|
|
||||||
/// Binds the rasterizer to this engine and to its upload state.
void Maxwell3D::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
    // The two assignments are independent; both receive the caller's pointer.
    upload_state.BindRasterizer(rasterizer_);
    rasterizer = rasterizer_;
}
|
||||||
|
|
||||||
void Maxwell3D::InitializeRegisterDefaults() {
|
void Maxwell3D::InitializeRegisterDefaults() {
|
||||||
|
|
|
@ -1557,7 +1557,8 @@ private:
|
||||||
|
|
||||||
static constexpr u32 null_cb_data = 0xFFFFFFFF;
|
static constexpr u32 null_cb_data = 0xFFFFFFFF;
|
||||||
struct CBDataState {
|
struct CBDataState {
|
||||||
std::array<std::array<u32, 0x4000>, 16> buffer;
|
static constexpr size_t inline_size = 0x8000;
|
||||||
|
std::array<std::array<u32, inline_size>, 16> buffer;
|
||||||
u32 current{null_cb_data};
|
u32 current{null_cb_data};
|
||||||
u32 id{null_cb_data};
|
u32 id{null_cb_data};
|
||||||
u32 start_pos{};
|
u32 start_pos{};
|
||||||
|
|
|
@ -59,6 +59,7 @@ struct GPU::Impl {
|
||||||
maxwell_3d->BindRasterizer(rasterizer);
|
maxwell_3d->BindRasterizer(rasterizer);
|
||||||
fermi_2d->BindRasterizer(rasterizer);
|
fermi_2d->BindRasterizer(rasterizer);
|
||||||
kepler_compute->BindRasterizer(rasterizer);
|
kepler_compute->BindRasterizer(rasterizer);
|
||||||
|
kepler_memory->BindRasterizer(rasterizer);
|
||||||
maxwell_dma->BindRasterizer(rasterizer);
|
maxwell_dma->BindRasterizer(rasterizer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -123,6 +123,9 @@ public:
|
||||||
|
|
||||||
[[nodiscard]] virtual Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() = 0;
|
[[nodiscard]] virtual Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() = 0;
|
||||||
|
|
||||||
|
virtual void AccelerateInline2Memory(GPUVAddr address, size_t copy_size,
|
||||||
|
std::span<u8> memory) = 0;
|
||||||
|
|
||||||
/// Attempt to use a faster method to display the framebuffer to screen
|
/// Attempt to use a faster method to display the framebuffer to screen
|
||||||
[[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config,
|
[[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config,
|
||||||
VAddr framebuffer_addr, u32 pixel_stride) {
|
VAddr framebuffer_addr, u32 pixel_stride) {
|
||||||
|
|
|
@ -484,6 +484,28 @@ Tegra::Engines::AccelerateDMAInterface& RasterizerOpenGL::AccessAccelerateDMA()
|
||||||
return accelerate_dma;
|
return accelerate_dma;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Writes inline upload data to guest memory and brings the buffer, texture, shader and
/// query caches up to date with the written range.
void RasterizerOpenGL::AccelerateInline2Memory(GPUVAddr address, size_t copy_size,
                                               std::span<u8> memory) {
    const auto cpu_addr = gpu_memory.GpuToCpuAddress(address);
    if (!cpu_addr) [[unlikely]] {
        // No CPU address for this GPU address: use the regular block write and stop.
        gpu_memory.WriteBlock(address, memory.data(), copy_size);
        return;
    }
    const auto dest = *cpu_addr;

    gpu_memory.WriteBlockUnsafe(address, memory.data(), copy_size);
    {
        std::scoped_lock buffer_lock{buffer_cache.mutex};
        // Try to place the data directly into the cached buffer; if the cache declines,
        // report the range as written instead.
        const bool was_inlined = buffer_cache.InlineMemory(dest, copy_size, memory);
        if (!was_inlined) {
            buffer_cache.WriteMemory(dest, copy_size);
        }
    }
    {
        std::scoped_lock texture_lock{texture_cache.mutex};
        texture_cache.WriteMemory(dest, copy_size);
    }
    shader_cache.InvalidateRegion(dest, copy_size);
    query_cache.InvalidateRegion(dest, copy_size);
}
|
||||||
|
|
||||||
bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
|
bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
|
||||||
VAddr framebuffer_addr, u32 pixel_stride) {
|
VAddr framebuffer_addr, u32 pixel_stride) {
|
||||||
if (framebuffer_addr == 0) {
|
if (framebuffer_addr == 0) {
|
||||||
|
|
|
@ -106,6 +106,7 @@ public:
|
||||||
const Tegra::Engines::Fermi2D::Surface& dst,
|
const Tegra::Engines::Fermi2D::Surface& dst,
|
||||||
const Tegra::Engines::Fermi2D::Config& copy_config) override;
|
const Tegra::Engines::Fermi2D::Config& copy_config) override;
|
||||||
Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
|
Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
|
||||||
|
void AccelerateInline2Memory(GPUVAddr address, size_t copy_size, std::span<u8> memory) override;
|
||||||
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
|
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
|
||||||
u32 pixel_stride) override;
|
u32 pixel_stride) override;
|
||||||
void LoadDiskResources(u64 title_id, std::stop_token stop_loading,
|
void LoadDiskResources(u64 title_id, std::stop_token stop_loading,
|
||||||
|
|
|
@ -548,6 +548,28 @@ Tegra::Engines::AccelerateDMAInterface& RasterizerVulkan::AccessAccelerateDMA()
|
||||||
return accelerate_dma;
|
return accelerate_dma;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Writes inline upload data to guest memory and brings the buffer, texture, pipeline and
/// query caches up to date with the written range.
void RasterizerVulkan::AccelerateInline2Memory(GPUVAddr address, size_t copy_size,
                                               std::span<u8> memory) {
    const auto cpu_addr = gpu_memory.GpuToCpuAddress(address);
    if (!cpu_addr) [[unlikely]] {
        // No CPU address for this GPU address: use the regular block write and stop.
        gpu_memory.WriteBlock(address, memory.data(), copy_size);
        return;
    }
    const auto dest = *cpu_addr;

    gpu_memory.WriteBlockUnsafe(address, memory.data(), copy_size);
    {
        std::scoped_lock buffer_lock{buffer_cache.mutex};
        // Try to place the data directly into the cached buffer; if the cache declines,
        // report the range as written instead.
        const bool was_inlined = buffer_cache.InlineMemory(dest, copy_size, memory);
        if (!was_inlined) {
            buffer_cache.WriteMemory(dest, copy_size);
        }
    }
    {
        std::scoped_lock texture_lock{texture_cache.mutex};
        texture_cache.WriteMemory(dest, copy_size);
    }
    pipeline_cache.InvalidateRegion(dest, copy_size);
    query_cache.InvalidateRegion(dest, copy_size);
}
|
||||||
|
|
||||||
bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config,
|
bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config,
|
||||||
VAddr framebuffer_addr, u32 pixel_stride) {
|
VAddr framebuffer_addr, u32 pixel_stride) {
|
||||||
if (!framebuffer_addr) {
|
if (!framebuffer_addr) {
|
||||||
|
|
|
@ -99,6 +99,7 @@ public:
|
||||||
const Tegra::Engines::Fermi2D::Surface& dst,
|
const Tegra::Engines::Fermi2D::Surface& dst,
|
||||||
const Tegra::Engines::Fermi2D::Config& copy_config) override;
|
const Tegra::Engines::Fermi2D::Config& copy_config) override;
|
||||||
Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
|
Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
|
||||||
|
void AccelerateInline2Memory(GPUVAddr address, size_t copy_size, std::span<u8> memory) override;
|
||||||
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
|
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
|
||||||
u32 pixel_stride) override;
|
u32 pixel_stride) override;
|
||||||
void LoadDiskResources(u64 title_id, std::stop_token stop_loading,
|
void LoadDiskResources(u64 title_id, std::stop_token stop_loading,
|
||||||
|
|
Loading…
Reference in a new issue