From 73be8c745477042e129e4cb6981a2e4ad643e466 Mon Sep 17 00:00:00 2001
From: Connor Fitzgerald <connorwadefitzgerald@gmail.com>
Date: Sun, 15 Aug 2021 19:32:52 -0400
Subject: [PATCH] [hlsl-out] Implicitly transpose all matrices

---
 src/back/hlsl/mod.rs       | 11 +++++++++++
 src/back/hlsl/storage.rs   |  9 ++++-----
 src/back/hlsl/writer.rs    | 13 ++++++++++---
 tests/out/hlsl/access.hlsl |  6 +++---
 tests/out/hlsl/shadow.hlsl |  6 +++---
 tests/out/hlsl/skybox.hlsl |  8 ++++----
 6 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/src/back/hlsl/mod.rs b/src/back/hlsl/mod.rs
index 11e812a156..4dc3ed6e62 100644
--- a/src/back/hlsl/mod.rs
+++ b/src/back/hlsl/mod.rs
@@ -5,6 +5,17 @@
 //! - 5.1
 //! - 6.0
 //!
+//! All matrix construction/deconstruction is row based in HLSL. This means that when
+//! we construct a matrix from column vectors, our matrix will be implicitly transposed.
+//! The inverse transposition happens when we index with `[0]` to get the zeroth column vector.
+//!
+//! Because all of our matrices are implicitly transposed, we flip arguments to `mul`. `mat * vec`
+//! becomes `vec * mat`, etc. This acts as the inverse transpose, making the results identical.
+//!
+//! The only time we don't get this implicit transposition is when reading matrices from Uniforms/Push Constants.
+//! To deal with this, we add `row_major` to all declarations of matrices in Uniforms/Push Constants.
+//!
+//! Finally, because all of our matrices are transposed, if you use `mat3x4`, it'll become `float4x3` in HLSL.
 
 mod conv;
 mod help;
diff --git a/src/back/hlsl/storage.rs b/src/back/hlsl/storage.rs
index 4cbc3e20c1..60d0b161f9 100644
--- a/src/back/hlsl/storage.rs
+++ b/src/back/hlsl/storage.rs
@@ -1,7 +1,6 @@
 //! Logic related to `ByteAddressBuffer` operations.
 //!
 //! HLSL backend uses byte address buffers for all storage buffers in IR.
-//! Matrices have to be transposed, because HLSL syntax implies row majority.
 
 use super::{
     super::{FunctionCtx, INDENT},
@@ -122,7 +121,7 @@ impl<W: fmt::Write> super::Writer<'_, W> {
             } => {
                 write!(
                     self.out,
-                    "transpose({}{}x{}(",
+                    "{}{}x{}(",
                     crate::ScalarKind::Float.to_hlsl_str(width)?,
                     rows as u8,
                     columns as u8,
@@ -144,7 +143,7 @@ impl<W: fmt::Write> super::Writer<'_, W> {
                     (TypeResolution::Value(ty_inner), i * row_stride)
                 });
                 self.write_storage_load_sequence(module, var_handle, iter, func_ctx)?;
-                write!(self.out, "))")?;
+                write!(self.out, ")")?;
             }
             crate::TypeInner::Array {
                 base,
@@ -267,7 +266,7 @@ impl<W: fmt::Write> super::Writer<'_, W> {
                 let depth = indent + 1;
                 write!(
                     self.out,
-                    "{}{}{}x{} {}{} = transpose(",
+                    "{}{}{}x{} {}{} = ",
                     INDENT.repeat(indent + 1),
                     crate::ScalarKind::Float.to_hlsl_str(width)?,
                     rows as u8,
@@ -276,7 +275,7 @@ impl<W: fmt::Write> super::Writer<'_, W> {
                     depth,
                 )?;
                 self.write_store_value(module, &value, func_ctx)?;
-                writeln!(self.out, ");")?;
+                writeln!(self.out, ";")?;
                 // then iterate the stores
                 let row_stride = width as u32 * columns as u32;
                 for i in 0..rows as u32 {
diff --git a/src/back/hlsl/writer.rs b/src/back/hlsl/writer.rs
index dde9d128eb..fbf41cd44b 100644
--- a/src/back/hlsl/writer.rs
+++ b/src/back/hlsl/writer.rs
@@ -640,6 +640,10 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                         }
                     }
 
+                    if let TypeInner::Matrix { .. } = module.types[ty].inner {
+                        write!(self.out, "row_major ")?;
+                    }
+
                     // Write the member type and name
                     self.write_type(module, ty)?;
                     write!(
@@ -700,12 +704,14 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
             } => {
                 // The IR supports only float matrix
                 // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-matrix
+
+                // Because of the implicit transpose all matrices have in HLSL, we need to transpose the size as well.
                 write!(
                     self.out,
                     "{}{}x{}",
                     crate::ScalarKind::Float.to_hlsl_str(width)?,
-                    back::vector_size_str(columns),
                     back::vector_size_str(rows),
+                    back::vector_size_str(columns),
                 )?;
             }
             TypeInner::Image {
@@ -1302,10 +1308,11 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                     .inner_with(&module.types)
                     .is_matrix() =>
             {
+                // We intentionally flip the order of multiplication as our matrices are implicitly transposed.
                 write!(self.out, "mul(")?;
-                self.write_expr(module, left, func_ctx)?;
-                write!(self.out, ", ")?;
                 self.write_expr(module, right, func_ctx)?;
+                write!(self.out, ", ")?;
+                self.write_expr(module, left, func_ctx)?;
                 write!(self.out, ")")?;
             }
             Expression::Binary { op, left, right } => {
diff --git a/tests/out/hlsl/access.hlsl b/tests/out/hlsl/access.hlsl
index d724bbcaf5..1d9ff1ecf2 100644
--- a/tests/out/hlsl/access.hlsl
+++ b/tests/out/hlsl/access.hlsl
@@ -19,14 +19,14 @@ float4 foo(VertexInput_foo vertexinput_foo) : SV_Position
 
     float baz = foo1;
     foo1 = 1.0;
-    float4x4 matrix1 = transpose(float4x4(asfloat(bar.Load4(0+0)), asfloat(bar.Load4(0+16)), asfloat(bar.Load4(0+32)), asfloat(bar.Load4(0+48))));
+    float4x4 matrix1 = float4x4(asfloat(bar.Load4(0+0)), asfloat(bar.Load4(0+16)), asfloat(bar.Load4(0+32)), asfloat(bar.Load4(0+48)));
     uint2 arr[2] = {asuint(bar.Load2(72+0)), asuint(bar.Load2(72+8))};
     float4 _expr13 = asfloat(bar.Load4(48+0));
     float b = _expr13.x;
     int a = asint(bar.Load((((NagaBufferLengthRW(bar) - 88) / 4) - 2u)*4+88));
     bar.Store(8+16+0, asuint(1.0));
     {
-        float4x4 _value2 = transpose(float4x4(float4(0.0.xxxx), float4(1.0.xxxx), float4(2.0.xxxx), float4(3.0.xxxx)));
+        float4x4 _value2 = float4x4(float4(0.0.xxxx), float4(1.0.xxxx), float4(2.0.xxxx), float4(3.0.xxxx));
         bar.Store4(0+0, asuint(_value2[0]));
         bar.Store4(0+16, asuint(_value2[1]));
         bar.Store4(0+32, asuint(_value2[2]));
@@ -43,7 +43,7 @@ float4 foo(VertexInput_foo vertexinput_foo) : SV_Position
     }
     c[(vertexinput_foo.vi1 + 1u)] = 42;
     int value = c[vertexinput_foo.vi1];
-    return mul(matrix1, float4(int4(value.xxxx)));
+    return mul(float4(int4(value.xxxx)), matrix1);
 }
 
 [numthreads(1, 1, 1)]
diff --git a/tests/out/hlsl/shadow.hlsl b/tests/out/hlsl/shadow.hlsl
index 32d23dd931..742c41cbd5 100644
--- a/tests/out/hlsl/shadow.hlsl
+++ b/tests/out/hlsl/shadow.hlsl
@@ -6,7 +6,7 @@ struct Globals {
 };
 
 struct Light {
-    float4x4 proj;
+    row_major float4x4 proj;
     float4 pos;
     float4 color;
 };
@@ -51,9 +51,9 @@ float4 fs_main(FragmentInput_fs_main fragmentinput_fs_main) : SV_Target0
             break;
         }
         uint _expr19 = i;
-        Light light = {transpose(float4x4(asfloat(s_lights.Load4(_expr19*96+0+0+0)), asfloat(s_lights.Load4(_expr19*96+0+0+16)), asfloat(s_lights.Load4(_expr19*96+0+0+32)), asfloat(s_lights.Load4(_expr19*96+0+0+48)))), asfloat(s_lights.Load4(_expr19*96+0+64)), asfloat(s_lights.Load4(_expr19*96+0+80))};
+        Light light = {float4x4(asfloat(s_lights.Load4(_expr19*96+0+0+0)), asfloat(s_lights.Load4(_expr19*96+0+0+16)), asfloat(s_lights.Load4(_expr19*96+0+0+32)), asfloat(s_lights.Load4(_expr19*96+0+0+48))), asfloat(s_lights.Load4(_expr19*96+0+64)), asfloat(s_lights.Load4(_expr19*96+0+80))};
         uint _expr22 = i;
-        const float _e25 = fetch_shadow(_expr22, mul(light.proj, fragmentinput_fs_main.position1));
+        const float _e25 = fetch_shadow(_expr22, mul(fragmentinput_fs_main.position1, light.proj));
         float3 light_dir = normalize((light.pos.xyz - fragmentinput_fs_main.position1.xyz));
         float diffuse = max(0.0, dot(normal, light_dir));
         float3 _expr34 = color;
diff --git a/tests/out/hlsl/skybox.hlsl b/tests/out/hlsl/skybox.hlsl
index 058b0cbb60..39da82be70 100644
--- a/tests/out/hlsl/skybox.hlsl
+++ b/tests/out/hlsl/skybox.hlsl
@@ -10,8 +10,8 @@ struct VertexOutput {
 };
 
 struct Data {
-    float4x4 proj_inv;
-    float4x4 view;
+    row_major float4x4 proj_inv;
+    row_major float4x4 view;
 };
 
 cbuffer r_data : register(b0) { Data r_data; }
@@ -41,8 +41,8 @@ VertexOutput vs_main(VertexInput_vs_main vertexinput_vs_main)
     float4 _expr35 = r_data.view[2];
     float3x3 inv_model_view = transpose(float3x3(_expr27.xyz, _expr31.xyz, _expr35.xyz));
     float4x4 _expr40 = r_data.proj_inv;
-    float4 unprojected = mul(_expr40, pos);
-    const VertexOutput vertexoutput1 = { pos, mul(inv_model_view, unprojected.xyz) };
+    float4 unprojected = mul(pos, _expr40);
+    const VertexOutput vertexoutput1 = { pos, mul(unprojected.xyz, inv_model_view) };
     return vertexoutput1;
 }