From 4e35177e23069ad7a4cb0fdfa2ad5b34300c44f7 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Fri, 9 Aug 2019 23:50:21 -0300 Subject: [PATCH] shader_ir: Implement VOTE Implement VOTE using Nvidia's intrinsics. Documentation about these can be found here https://developer.nvidia.com/reading-between-threads-shader-intrinsics Instead of using portable ARB instructions I opted to use Nvidia intrinsics because these are the closest we have to how Tegra X1 hardware renders. To stub VOTE on non-Nvidia drivers (including nouveau) this commit simulates a GPU with a warp size of one, returning what is meaningful for the instruction being emulated: * anyThreadNV(value) -> value * allThreadsNV(value) -> value * allThreadsEqualNV(value) -> true ballotARB, also known as "uint64_t(activeThreadsNV())", emits VOTE.ANY Rd, PT, PT; on nouveau's compiler. This doesn't match exactly to Nvidia's code VOTE.ALL Rd, PT, PT; Which is emulated with activeThreadsNV() by this commit. In theory this shouldn't really matter since .ANY, .ALL and .EQ affect the predicates (set to PT on those cases) and not the registers. --- CMakeModules/GenerateSCMRev.cmake | 1 + src/common/CMakeLists.txt | 1 + src/video_core/CMakeLists.txt | 1 + src/video_core/engines/shader_bytecode.h | 16 ++++++ src/video_core/renderer_opengl/gl_device.cpp | 3 + src/video_core/renderer_opengl/gl_device.h | 5 ++ .../renderer_opengl/gl_shader_cache.cpp | 4 +- .../renderer_opengl/gl_shader_decompiler.cpp | 47 ++++++++++++++++ .../renderer_vulkan/vk_shader_decompiler.cpp | 25 +++++++++ src/video_core/shader/decode.cpp | 1 + src/video_core/shader/decode/warp.cpp | 55 +++++++++++++++++++ src/video_core/shader/node.h | 5 ++ src/video_core/shader/shader_ir.h | 1 + 13 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 src/video_core/shader/decode/warp.cpp diff --git a/CMakeModules/GenerateSCMRev.cmake b/CMakeModules/GenerateSCMRev.cmake index abdc74428..a1ace89cb 100644 --- a/CMakeModules/GenerateSCMRev.cmake +++ b/CMakeModules/GenerateSCMRev.cmake @@ -81,6 +81,7 @@ set(HASH_FILES "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp" "${VIDEO_CORE}/shader/decode/shift.cpp" "${VIDEO_CORE}/shader/decode/video.cpp" + "${VIDEO_CORE}/shader/decode/warp.cpp" "${VIDEO_CORE}/shader/decode/xmad.cpp" "${VIDEO_CORE}/shader/control_flow.cpp" "${VIDEO_CORE}/shader/control_flow.h" diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 2b4266f29..01abdb3bb 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -55,6 +55,7 @@ add_custom_command(OUTPUT scm_rev.cpp "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp" "${VIDEO_CORE}/shader/decode/shift.cpp" "${VIDEO_CORE}/shader/decode/video.cpp" + "${VIDEO_CORE}/shader/decode/warp.cpp" "${VIDEO_CORE}/shader/decode/xmad.cpp" "${VIDEO_CORE}/shader/control_flow.cpp" "${VIDEO_CORE}/shader/control_flow.h" diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 7c18c27b3..f315e021d 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -100,6 +100,7 @@ add_library(video_core STATIC shader/decode/integer_set.cpp shader/decode/half_set.cpp shader/decode/video.cpp + shader/decode/warp.cpp shader/decode/xmad.cpp shader/decode/other.cpp shader/control_flow.cpp diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index aaa1acea9..bc8c2a1c5 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -538,6 +538,12 @@ enum class PhysicalAttributeDirection : u64 { Output = 1, }; +enum class VoteOperation : u64 { + All = 0, // allThreadsNV + Any = 1, // anyThreadNV + Eq = 2, // allThreadsEqualNV +}; + union Instruction { Instruction& operator=(const Instruction& instr) { value = instr.value; @@ -564,6 +570,13 @@ union Instruction { BitField<13, 1, u64> trigger; } nop; + union { + BitField<48, 2, VoteOperation> operation; + BitField<45, 3, u64> dest_pred; + BitField<39, 3, u64> value; + BitField<42, 1, u64> negate_value; + } vote; + union { BitField<8, 8, Register> gpr; BitField<20, 24, s64> offset; @@ -1487,6 +1500,7 @@ public: SYNC, BRK, DEPBAR, + VOTE, BFE_C, BFE_R, BFE_IMM, @@ -1649,6 +1663,7 @@ public: Hfma2, Flow, Synch, + Warp, Memory, Texture, Image, @@ -1775,6 +1790,7 @@ private: INST("111000110100---", Id::BRK, Type::Flow, "BRK"), INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"), INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"), + INST("0101000011011---", Id::VOTE, Type::Warp, "VOTE"), INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"), INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"), INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"), diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 85424a4c9..03d434b28 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -27,6 +27,8 @@ Device::Device() { shader_storage_alignment = GetInteger(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); max_vertex_attributes = GetInteger(GL_MAX_VERTEX_ATTRIBS); max_varyings = GetInteger(GL_MAX_VARYING_VECTORS); + has_warp_intrinsics = GLAD_GL_NV_gpu_shader5 && GLAD_GL_NV_shader_thread_group && + GLAD_GL_NV_shader_thread_shuffle; has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array; has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = TestComponentIndexingBug(); @@ -36,6 +38,7 @@ Device::Device(std::nullptr_t) { uniform_buffer_alignment = 0; max_vertex_attributes = 16; max_varyings = 15; + has_warp_intrinsics = true; has_vertex_viewport_layer = true; has_variable_aoffi = true; has_component_indexing_bug = false; diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index dc883722d..3ef7c6dd8 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -30,6 +30,10 @@ public: return max_varyings; } + bool HasWarpIntrinsics() const { + return has_warp_intrinsics; + } + bool HasVertexViewportLayer() const { return has_vertex_viewport_layer; } @@ -50,6 +54,7 @@ private: std::size_t shader_storage_alignment{}; u32 max_vertex_attributes{}; u32 max_varyings{}; + bool has_warp_intrinsics{}; bool has_vertex_viewport_layer{}; bool has_variable_aoffi{}; bool has_component_indexing_bug{}; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 1c90facc3..a32a7e984 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -212,7 +212,9 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn const auto texture_buffer_usage{variant.texture_buffer_usage}; std::string source = "#version 430 core\n" - "#extension GL_ARB_separate_shader_objects : enable\n"; + "#extension GL_ARB_separate_shader_objects : enable\n" + "#extension GL_NV_gpu_shader5 : enable\n" + "#extension GL_NV_shader_thread_group : enable\n"; if (entries.shader_viewport_layer_array) { source += "#extension GL_ARB_shader_viewport_layer_array : enable\n"; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index d8f722c26..1bfdbcd61 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -1735,6 +1735,48 @@ private: return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')'; } + std::string BallotThread(Operation operation) { + const std::string value = VisitOperand(operation, 0, Type::Bool); + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, + "Nvidia warp intrinsics are not available and its required by a shader"); + // Stub on non-Nvidia devices by simulating all threads voting the same as the active + // one. + return fmt::format("utof({} ? 0xFFFFFFFFU : 0U)", value); + } + return fmt::format("utof(ballotThreadNV({}))", value); + } + + std::string Vote(Operation operation, const char* func) { + const std::string value = VisitOperand(operation, 0, Type::Bool); + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, + "Nvidia vote intrinsics are not available and its required by a shader"); + // Stub with a warp size of one. + return value; + } + return fmt::format("{}({})", func, value); + } + + std::string VoteAll(Operation operation) { + return Vote(operation, "allThreadsNV"); + } + + std::string VoteAny(Operation operation) { + return Vote(operation, "anyThreadNV"); + } + + std::string VoteEqual(Operation operation) { + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, + "Nvidia vote intrinsics are not available and its required by a shader"); + // We must return true here since a stub for a theoretical warp size of 1 will always + // return an equal result for all its votes. + return "true"; + } + return Vote(operation, "allThreadsEqualNV"); + } + static constexpr std::array operation_decompilers = { &GLSLDecompiler::Assign, @@ -1885,6 +1927,11 @@ private: &GLSLDecompiler::WorkGroupId<0>, &GLSLDecompiler::WorkGroupId<1>, &GLSLDecompiler::WorkGroupId<2>, + + &GLSLDecompiler::BallotThread, + &GLSLDecompiler::VoteAll, + &GLSLDecompiler::VoteAny, + &GLSLDecompiler::VoteEqual, }; static_assert(operation_decompilers.size() == static_cast(OperationCode::Amount)); diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index 24a591797..a35b45c9c 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -1072,6 +1072,26 @@ private: return {}; } + Id BallotThread(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id VoteAll(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id VoteAny(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id VoteEqual(Operation) { + UNIMPLEMENTED(); + return {}; + } + Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type, const std::string& name) { const Id id = OpVariable(type, storage); @@ -1364,6 +1384,11 @@ private: &SPIRVDecompiler::WorkGroupId<0>, &SPIRVDecompiler::WorkGroupId<1>, &SPIRVDecompiler::WorkGroupId<2>, + + &SPIRVDecompiler::BallotThread, + &SPIRVDecompiler::VoteAll, + &SPIRVDecompiler::VoteAny, + &SPIRVDecompiler::VoteEqual, }; static_assert(operation_decompilers.size() == static_cast(OperationCode::Amount)); diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp index b547d8323..47a9fd961 100644 --- a/src/video_core/shader/decode.cpp +++ b/src/video_core/shader/decode.cpp @@ -176,6 +176,7 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) { {OpCode::Type::Ffma, &ShaderIR::DecodeFfma}, {OpCode::Type::Hfma2, &ShaderIR::DecodeHfma2}, {OpCode::Type::Conversion, &ShaderIR::DecodeConversion}, + {OpCode::Type::Warp, &ShaderIR::DecodeWarp}, {OpCode::Type::Memory, &ShaderIR::DecodeMemory}, {OpCode::Type::Texture, &ShaderIR::DecodeTexture}, {OpCode::Type::Image, &ShaderIR::DecodeImage}, diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp new file mode 100644 index 000000000..04ca74f46 --- /dev/null +++ b/src/video_core/shader/decode/warp.cpp @@ -0,0 +1,55 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/engines/shader_bytecode.h" +#include "video_core/shader/node_helper.h" +#include "video_core/shader/shader_ir.h" + +namespace VideoCommon::Shader { + +using Tegra::Shader::Instruction; +using Tegra::Shader::OpCode; +using Tegra::Shader::Pred; +using Tegra::Shader::VoteOperation; + +namespace { +OperationCode GetOperationCode(VoteOperation vote_op) { + switch (vote_op) { + case VoteOperation::All: + return OperationCode::VoteAll; + case VoteOperation::Any: + return OperationCode::VoteAny; + case VoteOperation::Eq: + return OperationCode::VoteEqual; + default: + UNREACHABLE_MSG("Invalid vote operation={}", static_cast(vote_op)); + return OperationCode::VoteAll; + } +} +} // Anonymous namespace + +u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) { + const Instruction instr = {program_code[pc]}; + const auto opcode = OpCode::Decode(instr); + + switch (opcode->get().GetId()) { + case OpCode::Id::VOTE: { + const Node value = GetPredicate(instr.vote.value, instr.vote.negate_value != 0); + const Node active = Operation(OperationCode::BallotThread, value); + const Node vote = Operation(GetOperationCode(instr.vote.operation), value); + SetRegister(bb, instr.gpr0, active); + SetPredicate(bb, instr.vote.dest_pred, vote); + break; + } + default: + UNIMPLEMENTED_MSG("Unhandled warp instruction: {}", opcode->get().GetName()); + break; + } + + return pc; +} + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index 5f0852364..5db9313c4 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -168,6 +168,11 @@ enum class OperationCode { WorkGroupIdY, /// () -> uint WorkGroupIdZ, /// () -> uint + BallotThread, /// (bool) -> uint + VoteAll, /// (bool) -> bool + VoteAny, /// (bool) -> bool + VoteEqual, /// (bool) -> bool + Amount, }; diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 59a083d90..99d06ff4a 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -167,6 +167,7 @@ private: u32 DecodeFfma(NodeBlock& bb, u32 pc); u32 DecodeHfma2(NodeBlock& bb, u32 pc); u32 DecodeConversion(NodeBlock& bb, u32 pc); + u32 DecodeWarp(NodeBlock& bb, u32 pc); u32 DecodeMemory(NodeBlock& bb, u32 pc); u32 DecodeTexture(NodeBlock& bb, u32 pc); u32 DecodeImage(NodeBlock& bb, u32 pc);