From f6f0383b49e70eb9cc534fc5b283371f29189108 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Mon, 26 Jul 2021 01:31:05 -0300
Subject: [PATCH 1/3] shader: Add TryInstRecursive utility to values

---
 src/shader_recompiler/frontend/ir/value.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/shader_recompiler/frontend/ir/value.h b/src/shader_recompiler/frontend/ir/value.h
index 0c6bf684d..dbea20115 100644
--- a/src/shader_recompiler/frontend/ir/value.h
+++ b/src/shader_recompiler/frontend/ir/value.h
@@ -57,6 +57,7 @@ public:
 
     [[nodiscard]] IR::Inst* Inst() const;
     [[nodiscard]] IR::Inst* InstRecursive() const;
+    [[nodiscard]] IR::Inst* TryInstRecursive() const;
     [[nodiscard]] IR::Value Resolve() const;
     [[nodiscard]] IR::Reg Reg() const;
     [[nodiscard]] IR::Pred Pred() const;
@@ -308,6 +309,13 @@ inline IR::Inst* Value::InstRecursive() const {
     return inst;
 }
 
+inline IR::Inst* Value::TryInstRecursive() const {
+    if (IsIdentity()) {
+        return inst->Arg(0).TryInstRecursive();
+    }
+    return type == Type::Opaque ? inst : nullptr;
+}
+
 inline IR::Value Value::Resolve() const {
     if (IsIdentity()) {
         return inst->Arg(0).Resolve();

From 09fb41dc63eeda3a82580f119704e691ead9e76a Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Mon, 26 Jul 2021 04:15:23 -0300
Subject: [PATCH 2/3] shader: Use TryInstRecursive on XMAD multiply folding

Simplify a bit the logic.
---
 .../ir_opt/constant_propagation_pass.cpp      | 28 +++++++++----------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
index 8dd6d6c2c..08a06da02 100644
--- a/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
+++ b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
@@ -116,33 +116,31 @@ bool FoldXmadMultiply(IR::Block& block, IR::Inst& inst) {
      *
      * This optimization has been proven safe by LLVM and MSVC.
      */
-    const IR::Value lhs_arg{inst.Arg(0)};
-    const IR::Value rhs_arg{inst.Arg(1)};
-    if (lhs_arg.IsImmediate() || rhs_arg.IsImmediate()) {
+    IR::Inst* const lhs_shl{inst.Arg(0).TryInstRecursive()};
+    IR::Inst* const rhs_mul{inst.Arg(1).TryInstRecursive()};
+    if (!lhs_shl || !rhs_mul) {
         return false;
     }
-    IR::Inst* const lhs_shl{lhs_arg.InstRecursive()};
     if (lhs_shl->GetOpcode() != IR::Opcode::ShiftLeftLogical32 ||
         lhs_shl->Arg(1) != IR::Value{16U}) {
         return false;
     }
-    if (lhs_shl->Arg(0).IsImmediate()) {
+    IR::Inst* const lhs_mul{lhs_shl->Arg(0).TryInstRecursive()};
+    if (!lhs_mul) {
         return false;
     }
-    IR::Inst* const lhs_mul{lhs_shl->Arg(0).InstRecursive()};
-    IR::Inst* const rhs_mul{rhs_arg.InstRecursive()};
     if (lhs_mul->GetOpcode() != IR::Opcode::IMul32 || rhs_mul->GetOpcode() != IR::Opcode::IMul32) {
         return false;
     }
-    if (lhs_mul->Arg(1).Resolve() != rhs_mul->Arg(1).Resolve()) {
-        return false;
-    }
     const IR::U32 factor_b{lhs_mul->Arg(1)};
-    if (lhs_mul->Arg(0).IsImmediate() || rhs_mul->Arg(0).IsImmediate()) {
+    if (factor_b.Resolve() != rhs_mul->Arg(1).Resolve()) {
+        return false;
+    }
+    IR::Inst* const lhs_bfe{lhs_mul->Arg(0).TryInstRecursive()};
+    IR::Inst* const rhs_bfe{rhs_mul->Arg(0).TryInstRecursive()};
+    if (!lhs_bfe || !rhs_bfe) {
         return false;
     }
-    IR::Inst* const lhs_bfe{lhs_mul->Arg(0).InstRecursive()};
-    IR::Inst* const rhs_bfe{rhs_mul->Arg(0).InstRecursive()};
     if (lhs_bfe->GetOpcode() != IR::Opcode::BitFieldUExtract) {
         return false;
     }
@@ -155,10 +153,10 @@ bool FoldXmadMultiply(IR::Block& block, IR::Inst& inst) {
     if (rhs_bfe->Arg(1) != IR::Value{0U} || rhs_bfe->Arg(2) != IR::Value{16U}) {
         return false;
     }
-    if (lhs_bfe->Arg(0).Resolve() != rhs_bfe->Arg(0).Resolve()) {
+    const IR::U32 factor_a{lhs_bfe->Arg(0)};
+    if (factor_a.Resolve() != rhs_bfe->Arg(0).Resolve()) {
         return false;
     }
-    const IR::U32 factor_a{lhs_bfe->Arg(0)};
     IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
     inst.ReplaceUsesWith(ir.IMul(factor_a, factor_b));
     return true;

From 66a0cedba39cabba30c756626d7b58bd0e519d8e Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Mon, 26 Jul 2021 04:24:26 -0300
Subject: [PATCH 3/3] shader: Fold integer FMA from Nvidia's pattern

Fold shaders doing "a * b + c" on integers from the pattern generated by
Nvidia's GL compiler.

On a somewhat complex compute shader it reduces the code size by 16
instructions from 2 matches on Turing GPUs.

On Intel as extracted from KHR_pipeline_executable_properties:
Before the optimization:
```
Instruction Count: 2057
Basic Block Count: 45
Scratch Memory Size: 14752
Spill Count: 232
Fill Count: 261
SEND Count: 610
Cycle Count: 11325
```

After the optimization:
```
Instruction Count: 2046
Basic Block Count: 44
Scratch Memory Size: 13728
Spill Count: 219
Fill Count: 268
SEND Count: 604
Cycle Count: 11367
```
---
 .../ir_opt/constant_propagation_pass.cpp      | 175 ++++++++++++++++++
 1 file changed, 175 insertions(+)

diff --git a/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
index 08a06da02..c403a5fae 100644
--- a/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
+++ b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.
 
 #include <algorithm>
+#include <functional>
 #include <tuple>
 #include <type_traits>
 
@@ -88,6 +89,26 @@ bool FoldWhenAllImmediates(IR::Inst& inst, Func&& func) {
     return true;
 }
 
+/// Return true when all values in a range are equal
+template <typename Range>
+bool AreEqual(const Range& range) {
+    auto resolver{[](const auto& value) { return value.Resolve(); }};
+    auto equal{[](const IR::Value& lhs, const IR::Value& rhs) {
+        if (lhs == rhs) {
+            return true;
+        }
+        // Not equal, but try to match if they read the same constant buffer
+        if (!lhs.IsImmediate() && !rhs.IsImmediate() &&
+            lhs.Inst()->GetOpcode() == IR::Opcode::GetCbufU32 &&
+            rhs.Inst()->GetOpcode() == IR::Opcode::GetCbufU32 &&
+            lhs.Inst()->Arg(0) == rhs.Inst()->Arg(0) && lhs.Inst()->Arg(1) == rhs.Inst()->Arg(1)) {
+            return true;
+        }
+        return false;
+    }};
+    return std::ranges::adjacent_find(range, std::not_fn(equal), resolver) == std::end(range);
+}
+
 void FoldGetRegister(IR::Inst& inst) {
     if (inst.Arg(0).Reg() == IR::Reg::RZ) {
         inst.ReplaceUsesWith(IR::Value{u32{0}});
@@ -100,6 +121,157 @@ void FoldGetPred(IR::Inst& inst) {
     }
 }
 
+/// Replaces the XMAD pattern generated by an integer FMA
+bool FoldXmadMultiplyAdd(IR::Block& block, IR::Inst& inst) {
+    /*
+     * We are looking for this specific pattern:
+     *   %6      = BitFieldUExtract %op_b, #0, #16
+     *   %7      = BitFieldUExtract %op_a', #16, #16
+     *   %8      = IMul32 %6, %7
+     *   %10     = BitFieldUExtract %op_a', #0, #16
+     *   %11     = BitFieldInsert %8, %10, #16, #16
+     *   %15     = BitFieldUExtract %op_b, #0, #16
+     *   %16     = BitFieldUExtract %op_a, #0, #16
+     *   %17     = IMul32 %15, %16
+     *   %18     = IAdd32 %17, %op_c
+     *   %22     = BitFieldUExtract %op_b, #16, #16
+     *   %23     = BitFieldUExtract %11, #16, #16
+     *   %24     = IMul32 %22, %23
+     *   %25     = ShiftLeftLogical32 %24, #16
+     *   %26     = ShiftLeftLogical32 %11, #16
+     *   %27     = IAdd32 %26, %18
+     *   %result = IAdd32 %25, %27
+     *
+     * And replace it with:
+     *  %temp   = IMul32 %op_a, %op_b
+     *  %result = IAdd32 %temp, %op_c
+     *
+     * This optimization has been proven safe by Nvidia's compiler logic being reversed.
+     * (If Nvidia generates this code from 'fma(a, b, c)', we can do the same in the reverse order.)
+     */
+    const IR::Value zero{0u};
+    const IR::Value sixteen{16u};
+    IR::Inst* const _25{inst.Arg(0).TryInstRecursive()};
+    IR::Inst* const _27{inst.Arg(1).TryInstRecursive()};
+    if (!_25 || !_27) {
+        return false;
+    }
+    if (_27->GetOpcode() != IR::Opcode::IAdd32) {
+        return false;
+    }
+    if (_25->GetOpcode() != IR::Opcode::ShiftLeftLogical32 || _25->Arg(1) != sixteen) {
+        return false;
+    }
+    IR::Inst* const _24{_25->Arg(0).TryInstRecursive()};
+    if (!_24 || _24->GetOpcode() != IR::Opcode::IMul32) {
+        return false;
+    }
+    IR::Inst* const _22{_24->Arg(0).TryInstRecursive()};
+    IR::Inst* const _23{_24->Arg(1).TryInstRecursive()};
+    if (!_22 || !_23) {
+        return false;
+    }
+    if (_22->GetOpcode() != IR::Opcode::BitFieldUExtract) {
+        return false;
+    }
+    if (_23->GetOpcode() != IR::Opcode::BitFieldUExtract) {
+        return false;
+    }
+    if (_22->Arg(1) != sixteen || _22->Arg(2) != sixteen) {
+        return false;
+    }
+    if (_23->Arg(1) != sixteen || _23->Arg(2) != sixteen) {
+        return false;
+    }
+    IR::Inst* const _11{_23->Arg(0).TryInstRecursive()};
+    if (!_11 || _11->GetOpcode() != IR::Opcode::BitFieldInsert) {
+        return false;
+    }
+    if (_11->Arg(2) != sixteen || _11->Arg(3) != sixteen) {
+        return false;
+    }
+    IR::Inst* const _8{_11->Arg(0).TryInstRecursive()};
+    IR::Inst* const _10{_11->Arg(1).TryInstRecursive()};
+    if (!_8 || !_10) {
+        return false;
+    }
+    if (_8->GetOpcode() != IR::Opcode::IMul32) {
+        return false;
+    }
+    if (_10->GetOpcode() != IR::Opcode::BitFieldUExtract) {
+        return false;
+    }
+    IR::Inst* const _6{_8->Arg(0).TryInstRecursive()};
+    IR::Inst* const _7{_8->Arg(1).TryInstRecursive()};
+    if (!_6 || !_7) {
+        return false;
+    }
+    if (_6->GetOpcode() != IR::Opcode::BitFieldUExtract) {
+        return false;
+    }
+    if (_7->GetOpcode() != IR::Opcode::BitFieldUExtract) {
+        return false;
+    }
+    if (_6->Arg(1) != zero || _6->Arg(2) != sixteen) {
+        return false;
+    }
+    if (_7->Arg(1) != sixteen || _7->Arg(2) != sixteen) {
+        return false;
+    }
+    IR::Inst* const _26{_27->Arg(0).TryInstRecursive()};
+    IR::Inst* const _18{_27->Arg(1).TryInstRecursive()};
+    if (!_26 || !_18) {
+        return false;
+    }
+    if (_26->GetOpcode() != IR::Opcode::ShiftLeftLogical32 || _26->Arg(1) != sixteen) {
+        return false;
+    }
+    if (_26->Arg(0).InstRecursive() != _11) {
+        return false;
+    }
+    if (_18->GetOpcode() != IR::Opcode::IAdd32) {
+        return false;
+    }
+    IR::Inst* const _17{_18->Arg(0).TryInstRecursive()};
+    if (!_17 || _17->GetOpcode() != IR::Opcode::IMul32) {
+        return false;
+    }
+    IR::Inst* const _15{_17->Arg(0).TryInstRecursive()};
+    IR::Inst* const _16{_17->Arg(1).TryInstRecursive()};
+    if (!_15 || !_16) {
+        return false;
+    }
+    if (_15->GetOpcode() != IR::Opcode::BitFieldUExtract) {
+        return false;
+    }
+    if (_16->GetOpcode() != IR::Opcode::BitFieldUExtract) {
+        return false;
+    }
+    if (_15->Arg(1) != zero || _16->Arg(1) != zero || _10->Arg(1) != zero) {
+        return false;
+    }
+    if (_15->Arg(2) != sixteen || _16->Arg(2) != sixteen || _10->Arg(2) != sixteen) {
+        return false;
+    }
+    const std::array<IR::Value, 3> op_as{
+        _7->Arg(0).Resolve(),
+        _16->Arg(0).Resolve(),
+        _10->Arg(0).Resolve(),
+    };
+    const std::array<IR::Value, 3> op_bs{
+        _22->Arg(0).Resolve(),
+        _6->Arg(0).Resolve(),
+        _15->Arg(0).Resolve(),
+    };
+    const IR::U32 op_c{_18->Arg(1)};
+    if (!AreEqual(op_as) || !AreEqual(op_bs)) {
+        return false;
+    }
+    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+    inst.ReplaceUsesWith(ir.IAdd(ir.IMul(IR::U32{op_as[0]}, IR::U32{op_bs[1]}), op_c));
+    return true;
+}
+
 /// Replaces the pattern generated by two XMAD multiplications
 bool FoldXmadMultiply(IR::Block& block, IR::Inst& inst) {
     /*
@@ -179,6 +351,9 @@ void FoldAdd(IR::Block& block, IR::Inst& inst) {
         if (FoldXmadMultiply(block, inst)) {
             return;
         }
+        if (FoldXmadMultiplyAdd(block, inst)) {
+            return;
+        }
     }
 }