From 3ef006b5abbe78bb2ae423a7cab74d7da2f8bc08 Mon Sep 17 00:00:00 2001
From: Liam <byteslice@airmail.cc>
Date: Wed, 30 Nov 2022 17:16:00 -0500
Subject: [PATCH] shader_recompiler: add gl_Layer translation GS for older
 hardware

---
 src/shader_recompiler/CMakeLists.txt          |  1 +
 .../frontend/maxwell/translate_program.cpp    | 81 +++++++++++++++++++
 .../frontend/maxwell/translate_program.h      |  9 +++
 src/shader_recompiler/host_translate_info.h   |  3 +-
 src/shader_recompiler/ir_opt/layer_pass.cpp   | 68 ++++++++++++++++
 src/shader_recompiler/ir_opt/passes.h         |  1 +
 src/shader_recompiler/shader_info.h           |  3 +
 .../renderer_opengl/gl_shader_cache.cpp       | 37 ++++++++-
 .../renderer_vulkan/vk_pipeline_cache.cpp     | 33 +++++++-
 9 files changed, 230 insertions(+), 6 deletions(-)
 create mode 100644 src/shader_recompiler/ir_opt/layer_pass.cpp

diff --git a/src/shader_recompiler/CMakeLists.txt b/src/shader_recompiler/CMakeLists.txt
index 545d69c7e..8cd584154 100644
--- a/src/shader_recompiler/CMakeLists.txt
+++ b/src/shader_recompiler/CMakeLists.txt
@@ -221,6 +221,7 @@ add_library(shader_recompiler STATIC
     ir_opt/dual_vertex_pass.cpp
     ir_opt/global_memory_to_storage_buffer_pass.cpp
     ir_opt/identity_removal_pass.cpp
+    ir_opt/layer_pass.cpp
     ir_opt/lower_fp16_to_fp32.cpp
     ir_opt/lower_int64_to_int32.cpp
     ir_opt/passes.h
diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.cpp b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
index 376aae0ea..3adbd2b16 100644
--- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp
+++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
@@ -9,6 +9,7 @@
 #include "common/settings.h"
 #include "shader_recompiler/exception.h"
 #include "shader_recompiler/frontend/ir/basic_block.h"
+#include "shader_recompiler/frontend/ir/ir_emitter.h"
 #include "shader_recompiler/frontend/ir/post_order.h"
 #include "shader_recompiler/frontend/maxwell/structured_control_flow.h"
 #include "shader_recompiler/frontend/maxwell/translate/translate.h"
@@ -233,6 +234,8 @@ IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Blo
         Optimization::VerificationPass(program);
     }
     Optimization::CollectShaderInfoPass(env, program);
+    Optimization::LayerPass(program, host_info);
+
     CollectInterpolationInfo(env, program);
     AddNVNStorageBuffers(program);
     return program;
@@ -331,4 +334,82 @@ void ConvertLegacyToGeneric(IR::Program& program, const Shader::RuntimeInfo& run
     }
 }
 
+IR::Program GenerateGeometryPassthrough(ObjectPool<IR::Inst>& inst_pool,
+                                        ObjectPool<IR::Block>& block_pool,
+                                        const HostTranslateInfo& host_info,
+                                        IR::Program& source_program,
+                                        Shader::OutputTopology output_topology) {
+    IR::Program program;
+    program.stage = Stage::Geometry;
+    program.output_topology = output_topology;
+    switch (output_topology) {
+    case OutputTopology::PointList:
+        program.output_vertices = 1;
+        break;
+    case OutputTopology::LineStrip:
+        program.output_vertices = 2;
+        break;
+    default:
+        program.output_vertices = 3;
+        break;
+    }
+
+    program.is_geometry_passthrough = false;
+    program.info.loads.mask = source_program.info.stores.mask;
+    program.info.stores.mask = source_program.info.stores.mask;
+    program.info.stores.Set(IR::Attribute::Layer, true);
+    program.info.stores.Set(source_program.info.emulated_layer, false);
+
+    IR::Block* current_block = block_pool.Create(inst_pool);
+    auto& node{program.syntax_list.emplace_back()};
+    node.type = IR::AbstractSyntaxNode::Type::Block;
+    node.data.block = current_block;
+
+    IR::IREmitter ir{*current_block};
+    for (u32 i = 0; i < program.output_vertices; i++) {
+        // Assign generics from input
+        for (u32 j = 0; j < 32; j++) {
+            if (!program.info.stores.Generic(j)) {
+                continue;
+            }
+
+            const IR::Attribute attr = IR::Attribute::Generic0X + (j * 4);
+            ir.SetAttribute(attr + 0, ir.GetAttribute(attr + 0, ir.Imm32(i)), ir.Imm32(0));
+            ir.SetAttribute(attr + 1, ir.GetAttribute(attr + 1, ir.Imm32(i)), ir.Imm32(0));
+            ir.SetAttribute(attr + 2, ir.GetAttribute(attr + 2, ir.Imm32(i)), ir.Imm32(0));
+            ir.SetAttribute(attr + 3, ir.GetAttribute(attr + 3, ir.Imm32(i)), ir.Imm32(0));
+        }
+
+        // Assign position from input
+        const IR::Attribute attr = IR::Attribute::PositionX;
+        ir.SetAttribute(attr + 0, ir.GetAttribute(attr + 0, ir.Imm32(i)), ir.Imm32(0));
+        ir.SetAttribute(attr + 1, ir.GetAttribute(attr + 1, ir.Imm32(i)), ir.Imm32(0));
+        ir.SetAttribute(attr + 2, ir.GetAttribute(attr + 2, ir.Imm32(i)), ir.Imm32(0));
+        ir.SetAttribute(attr + 3, ir.GetAttribute(attr + 3, ir.Imm32(i)), ir.Imm32(0));
+
+        // Assign layer
+        ir.SetAttribute(IR::Attribute::Layer, ir.GetAttribute(source_program.info.emulated_layer),
+                        ir.Imm32(0));
+
+        // Emit vertex
+        ir.EmitVertex(ir.Imm32(0));
+    }
+    ir.EndPrimitive(ir.Imm32(0));
+
+    IR::Block* return_block{block_pool.Create(inst_pool)};
+    IR::IREmitter{*return_block}.Epilogue();
+    current_block->AddBranch(return_block);
+
+    auto& merge{program.syntax_list.emplace_back()};
+    merge.type = IR::AbstractSyntaxNode::Type::Block;
+    merge.data.block = return_block;
+    program.syntax_list.emplace_back().type = IR::AbstractSyntaxNode::Type::Return;
+
+    program.blocks = GenerateBlocks(program.syntax_list);
+    program.post_order_blocks = PostOrder(program.syntax_list.front());
+    Optimization::SsaRewritePass(program);
+
+    return program;
+}
+
 } // namespace Shader::Maxwell
diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.h b/src/shader_recompiler/frontend/maxwell/translate_program.h
index 02ede8c9c..497afe7cb 100644
--- a/src/shader_recompiler/frontend/maxwell/translate_program.h
+++ b/src/shader_recompiler/frontend/maxwell/translate_program.h
@@ -25,4 +25,13 @@ namespace Shader::Maxwell {
 
 void ConvertLegacyToGeneric(IR::Program& program, const RuntimeInfo& runtime_info);
 
+// Maxwell v1 and older Nvidia cards don't support setting gl_Layer from non-geometry stages.
+// This creates a workaround by setting the layer as a generic output and creating a
+// passthrough geometry shader that reads the generic and sets the layer.
+[[nodiscard]] IR::Program GenerateGeometryPassthrough(ObjectPool<IR::Inst>& inst_pool,
+                                                      ObjectPool<IR::Block>& block_pool,
+                                                      const HostTranslateInfo& host_info,
+                                                      IR::Program& source_program,
+                                                      Shader::OutputTopology output_topology);
+
 } // namespace Shader::Maxwell
diff --git a/src/shader_recompiler/host_translate_info.h b/src/shader_recompiler/host_translate_info.h
index cc1500690..d5d279554 100644
--- a/src/shader_recompiler/host_translate_info.h
+++ b/src/shader_recompiler/host_translate_info.h
@@ -13,7 +13,8 @@ struct HostTranslateInfo {
     bool support_float16{};      ///< True when the device supports 16-bit floats
     bool support_int64{};        ///< True when the device supports 64-bit integers
     bool needs_demote_reorder{}; ///< True when the device needs DemoteToHelperInvocation reordered
-    bool support_snorm_render_buffer{}; ///< True when the device supports SNORM render buffers
+    bool support_snorm_render_buffer{};  ///< True when the device supports SNORM render buffers
+    bool support_viewport_index_layer{}; ///< True when the device supports gl_Layer stores in pre-geometry stages
 };
 
 } // namespace Shader
diff --git a/src/shader_recompiler/ir_opt/layer_pass.cpp b/src/shader_recompiler/ir_opt/layer_pass.cpp
new file mode 100644
index 000000000..4574f7cf2
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/layer_pass.cpp
@@ -0,0 +1,68 @@
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <algorithm>
+#include <bit>
+#include <optional>
+
+#include <boost/container/small_vector.hpp>
+
+#include "shader_recompiler/environment.h"
+#include "shader_recompiler/frontend/ir/basic_block.h"
+#include "shader_recompiler/frontend/ir/breadth_first_search.h"
+#include "shader_recompiler/frontend/ir/ir_emitter.h"
+#include "shader_recompiler/host_translate_info.h"
+#include "shader_recompiler/ir_opt/passes.h"
+#include "shader_recompiler/shader_info.h"
+
+namespace Shader::Optimization {
+
+static IR::Attribute EmulatedLayerAttribute(VaryingState& stores) {
+    for (u32 i = 0; i < 32; i++) {
+        if (!stores.Generic(i)) {
+            return IR::Attribute::Generic0X + (i * 4);
+        }
+    }
+    return IR::Attribute::Layer;
+}
+
+static bool PermittedProgramStage(Stage stage) {
+    switch (stage) {
+    case Stage::VertexA:
+    case Stage::VertexB:
+    case Stage::TessellationControl:
+    case Stage::TessellationEval:
+        return true;
+    default:
+        return false;
+    }
+}
+
+void LayerPass(IR::Program& program, const HostTranslateInfo& host_info) {
+    if (host_info.support_viewport_index_layer || !PermittedProgramStage(program.stage)) {
+        return;
+    }
+
+    const auto end{program.post_order_blocks.end()};
+    const auto layer_attribute = EmulatedLayerAttribute(program.info.stores);
+    bool requires_layer_emulation = false;
+
+    for (auto block = program.post_order_blocks.begin(); block != end; ++block) {
+        for (IR::Inst& inst : (*block)->Instructions()) {
+            if (inst.GetOpcode() == IR::Opcode::SetAttribute &&
+                inst.Arg(0).Attribute() == IR::Attribute::Layer) {
+                requires_layer_emulation = true;
+                inst.SetArg(0, IR::Value{layer_attribute});
+            }
+        }
+    }
+
+    if (requires_layer_emulation) {
+        program.info.requires_layer_emulation = true;
+        program.info.emulated_layer = layer_attribute;
+        program.info.stores.Set(IR::Attribute::Layer, false);
+        program.info.stores.Set(layer_attribute, true);
+    }
+}
+
+} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir_opt/passes.h b/src/shader_recompiler/ir_opt/passes.h
index 586a0668f..11bfe801a 100644
--- a/src/shader_recompiler/ir_opt/passes.h
+++ b/src/shader_recompiler/ir_opt/passes.h
@@ -23,6 +23,7 @@ void RescalingPass(IR::Program& program);
 void SsaRewritePass(IR::Program& program);
 void PositionPass(Environment& env, IR::Program& program);
 void TexturePass(Environment& env, IR::Program& program, const HostTranslateInfo& host_info);
+void LayerPass(IR::Program& program, const HostTranslateInfo& host_info);
 void VerificationPass(const IR::Program& program);
 
 // Dual Vertex
diff --git a/src/shader_recompiler/shader_info.h b/src/shader_recompiler/shader_info.h
index ee6252bb5..d9c6e92db 100644
--- a/src/shader_recompiler/shader_info.h
+++ b/src/shader_recompiler/shader_info.h
@@ -204,6 +204,9 @@ struct Info {
     u32 nvn_buffer_base{};
     std::bitset<16> nvn_buffer_used{};
 
+    bool requires_layer_emulation{};
+    IR::Attribute emulated_layer{};
+
     boost::container::static_vector<ConstantBufferDescriptor, MAX_CBUFS>
         constant_buffer_descriptors;
     boost::container::static_vector<StorageBufferDescriptor, MAX_SSBOS> storage_buffers_descriptors;
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 3fe04a115..a38060100 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -39,6 +39,7 @@ using Shader::Backend::GLASM::EmitGLASM;
 using Shader::Backend::GLSL::EmitGLSL;
 using Shader::Backend::SPIRV::EmitSPIRV;
 using Shader::Maxwell::ConvertLegacyToGeneric;
+using Shader::Maxwell::GenerateGeometryPassthrough;
 using Shader::Maxwell::MergeDualVertexPrograms;
 using Shader::Maxwell::TranslateProgram;
 using VideoCommon::ComputeEnvironment;
@@ -56,6 +57,17 @@ auto MakeSpan(Container& container) {
     return std::span(container.data(), container.size());
 }
 
+Shader::OutputTopology MaxwellToOutputTopology(Maxwell::PrimitiveTopology topology) {
+    switch (topology) {
+    case Maxwell::PrimitiveTopology::Points:
+        return Shader::OutputTopology::PointList;
+    case Maxwell::PrimitiveTopology::LineStrip:
+        return Shader::OutputTopology::LineStrip;
+    default:
+        return Shader::OutputTopology::TriangleStrip;
+    }
+}
+
 Shader::RuntimeInfo MakeRuntimeInfo(const GraphicsPipelineKey& key,
                                     const Shader::IR::Program& program,
                                     const Shader::IR::Program* previous_program,
@@ -220,6 +232,7 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo
           .support_int64 = device.HasShaderInt64(),
           .needs_demote_reorder = device.IsAmd(),
           .support_snorm_render_buffer = false,
+          .support_viewport_index_layer = device.HasVertexViewportLayer(),
       } {
     if (use_asynchronous_shaders) {
         workers = CreateWorkers();
@@ -314,9 +327,7 @@ GraphicsPipeline* ShaderCache::CurrentGraphicsPipeline() {
     const auto& regs{maxwell3d->regs};
     graphics_key.raw = 0;
     graphics_key.early_z.Assign(regs.mandated_early_z != 0 ? 1 : 0);
-    graphics_key.gs_input_topology.Assign(graphics_key.unique_hashes[4] != 0
-                                              ? regs.draw.topology.Value()
-                                              : Maxwell::PrimitiveTopology{});
+    graphics_key.gs_input_topology.Assign(regs.draw.topology.Value());
     graphics_key.tessellation_primitive.Assign(regs.tessellation.params.domain_type.Value());
     graphics_key.tessellation_spacing.Assign(regs.tessellation.params.spacing.Value());
     graphics_key.tessellation_clockwise.Assign(
@@ -415,7 +426,19 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline(
     std::array<Shader::IR::Program, Maxwell::MaxShaderProgram> programs;
     const bool uses_vertex_a{key.unique_hashes[0] != 0};
     const bool uses_vertex_b{key.unique_hashes[1] != 0};
+
+    // Layer passthrough generation for devices without GL_ARB_shader_viewport_layer_array
+    Shader::IR::Program* layer_source_program{};
+
     for (size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
+        const bool is_emulated_stage = layer_source_program != nullptr &&
+                                       index == static_cast<u32>(Maxwell::ShaderType::Geometry);
+        if (key.unique_hashes[index] == 0 && is_emulated_stage) {
+            auto topology = MaxwellToOutputTopology(key.gs_input_topology);
+            programs[index] = GenerateGeometryPassthrough(pools.inst, pools.block, host_info,
+                                                          *layer_source_program, topology);
+            continue;
+        }
         if (key.unique_hashes[index] == 0) {
             continue;
         }
@@ -443,6 +466,10 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline(
                 Shader::NumDescriptors(program_vb.info.storage_buffers_descriptors);
             programs[index] = MergeDualVertexPrograms(program_va, program_vb, env);
         }
+
+        if (programs[index].info.requires_layer_emulation) {
+            layer_source_program = &programs[index];
+        }
     }
     const u32 glasm_storage_buffer_limit{device.GetMaxGLASMStorageBufferBlocks()};
     const bool glasm_use_storage_buffers{total_storage_buffers <= glasm_storage_buffer_limit};
@@ -456,7 +483,9 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline(
     const bool use_glasm{device.UseAssemblyShaders()};
     const size_t first_index = uses_vertex_a && uses_vertex_b ? 1 : 0;
     for (size_t index = first_index; index < Maxwell::MaxShaderProgram; ++index) {
-        if (key.unique_hashes[index] == 0) {
+        const bool is_emulated_stage = layer_source_program != nullptr &&
+                                       index == static_cast<u32>(Maxwell::ShaderType::Geometry);
+        if (key.unique_hashes[index] == 0 && !is_emulated_stage) {
             continue;
         }
         UNIMPLEMENTED_IF(index == 0);
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index d4b0a542a..150413b04 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -46,6 +46,7 @@ MICROPROFILE_DECLARE(Vulkan_PipelineCache);
 namespace {
 using Shader::Backend::SPIRV::EmitSPIRV;
 using Shader::Maxwell::ConvertLegacyToGeneric;
+using Shader::Maxwell::GenerateGeometryPassthrough;
 using Shader::Maxwell::MergeDualVertexPrograms;
 using Shader::Maxwell::TranslateProgram;
 using VideoCommon::ComputeEnvironment;
@@ -60,6 +61,17 @@ auto MakeSpan(Container& container) {
     return std::span(container.data(), container.size());
 }
 
+Shader::OutputTopology MaxwellToOutputTopology(Maxwell::PrimitiveTopology topology) {
+    switch (topology) {
+    case Maxwell::PrimitiveTopology::Points:
+        return Shader::OutputTopology::PointList;
+    case Maxwell::PrimitiveTopology::LineStrip:
+        return Shader::OutputTopology::LineStrip;
+    default:
+        return Shader::OutputTopology::TriangleStrip;
+    }
+}
+
 Shader::CompareFunction MaxwellToCompareFunction(Maxwell::ComparisonOp comparison) {
     switch (comparison) {
     case Maxwell::ComparisonOp::Never_D3D:
@@ -327,6 +339,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device
         .needs_demote_reorder = driver_id == VK_DRIVER_ID_AMD_PROPRIETARY_KHR ||
                                 driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE_KHR,
         .support_snorm_render_buffer = true,
+        .support_viewport_index_layer = device.IsExtShaderViewportIndexLayerSupported(),
     };
 }
 
@@ -509,7 +522,19 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline(
     std::array<Shader::IR::Program, Maxwell::MaxShaderProgram> programs;
     const bool uses_vertex_a{key.unique_hashes[0] != 0};
     const bool uses_vertex_b{key.unique_hashes[1] != 0};
+
+    // Layer passthrough generation for devices without VK_EXT_shader_viewport_index_layer
+    Shader::IR::Program* layer_source_program{};
+
     for (size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
+        const bool is_emulated_stage = layer_source_program != nullptr &&
+                                       index == static_cast<u32>(Maxwell::ShaderType::Geometry);
+        if (key.unique_hashes[index] == 0 && is_emulated_stage) {
+            auto topology = MaxwellToOutputTopology(key.state.topology);
+            programs[index] = GenerateGeometryPassthrough(pools.inst, pools.block, host_info,
+                                                          *layer_source_program, topology);
+            continue;
+        }
         if (key.unique_hashes[index] == 0) {
             continue;
         }
@@ -530,6 +555,10 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline(
             auto program_vb{TranslateProgram(pools.inst, pools.block, env, cfg, host_info)};
             programs[index] = MergeDualVertexPrograms(program_va, program_vb, env);
         }
+
+        if (programs[index].info.requires_layer_emulation) {
+            layer_source_program = &programs[index];
+        }
     }
     std::array<const Shader::Info*, Maxwell::MaxShaderStage> infos{};
     std::array<vk::ShaderModule, Maxwell::MaxShaderStage> modules;
@@ -538,7 +567,9 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline(
     Shader::Backend::Bindings binding;
     for (size_t index = uses_vertex_a && uses_vertex_b ? 1 : 0; index < Maxwell::MaxShaderProgram;
          ++index) {
-        if (key.unique_hashes[index] == 0) {
+        const bool is_emulated_stage = layer_source_program != nullptr &&
+                                       index == static_cast<u32>(Maxwell::ShaderType::Geometry);
+        if (key.unique_hashes[index] == 0 && !is_emulated_stage) {
             continue;
         }
         UNIMPLEMENTED_IF(index == 0);