From 03944191db049d80fff4ebf288117fe54409f662 Mon Sep 17 00:00:00 2001
From: Ryan Dick
Date: Tue, 24 Dec 2024 22:29:11 +0000
Subject: [PATCH] Split test_autocast_modules.py into separate test files to
 mirror the source file structure.

---
 .../test_custom_invoke_linear_8_bit_lt.py     | 70 +++++++++++++++++
 .../test_custom_invoke_linear_nf4.py}         | 75 +------------------
 2 files changed, 74 insertions(+), 71 deletions(-)
 create mode 100644 tests/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/test_custom_invoke_linear_8_bit_lt.py
 rename tests/backend/model_manager/load/model_cache/torch_module_autocast/{test_autocast_modules.py => custom_modules/test_custom_invoke_linear_nf4.py} (50%)

diff --git a/tests/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/test_custom_invoke_linear_8_bit_lt.py b/tests/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/test_custom_invoke_linear_8_bit_lt.py
new file mode 100644
index 0000000000..9f07363f24
--- /dev/null
+++ b/tests/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/test_custom_invoke_linear_8_bit_lt.py
@@ -0,0 +1,70 @@
+import pytest
+import torch
+
+if not torch.cuda.is_available():
+    pytest.skip("CUDA is not available", allow_module_level=True)
+else:
+    from invokeai.backend.model_manager.load.model_cache.torch_module_autocast.custom_modules.custom_invoke_linear_8_bit_lt import (
+        CustomInvokeLinear8bitLt,
+    )
+    from invokeai.backend.quantization.bnb_llm_int8 import InvokeLinear8bitLt
+
+
+@pytest.fixture
+def linear_8bit_lt_layer():
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA is not available")
+
+    torch.manual_seed(1)
+
+    orig_layer = torch.nn.Linear(32, 64)
+    orig_layer_state_dict = orig_layer.state_dict()
+
+    # Prepare a quantized InvokeLinear8bitLt layer.
+    quantized_layer = InvokeLinear8bitLt(input_features=32, output_features=64, has_fp16_weights=False)
+    quantized_layer.load_state_dict(orig_layer_state_dict)
+    quantized_layer.to("cuda")
+
+    # Assert that the InvokeLinear8bitLt layer is quantized.
+    assert quantized_layer.weight.CB is not None
+    assert quantized_layer.weight.SCB is not None
+    assert quantized_layer.weight.CB.dtype == torch.int8
+
+    return quantized_layer
+
+
+def test_custom_invoke_linear_8bit_lt_all_weights_on_cuda(linear_8bit_lt_layer: InvokeLinear8bitLt):
+    """Test CustomInvokeLinear8bitLt inference with all weights on the GPU."""
+    # Run inference on the original layer.
+    x = torch.randn(1, 32).to("cuda")
+    y_quantized = linear_8bit_lt_layer(x)
+
+    # Wrap the InvokeLinear8bitLt layer in a CustomInvokeLinear8bitLt layer, and run inference on it.
+    linear_8bit_lt_layer.__class__ = CustomInvokeLinear8bitLt
+    y_custom = linear_8bit_lt_layer(x)
+
+    # Assert that the quantized and custom layers produce the same output.
+    assert torch.allclose(y_quantized, y_custom, atol=1e-5)
+
+
+def test_custom_invoke_linear_8bit_lt_all_weights_on_cpu(linear_8bit_lt_layer: InvokeLinear8bitLt):
+    """Test CustomInvokeLinear8bitLt inference with all weights on the CPU (streaming to the GPU)."""
+    # Run inference on the original layer.
+    x = torch.randn(1, 32).to("cuda")
+    y_quantized = linear_8bit_lt_layer(x)
+
+    # Copy the state dict to the CPU and reload it.
+    state_dict = linear_8bit_lt_layer.state_dict()
+    state_dict = {k: v.to("cpu") for k, v in state_dict.items()}
+    linear_8bit_lt_layer.load_state_dict(state_dict)
+
+    # Inference of the original layer should fail.
+    with pytest.raises(RuntimeError):
+        linear_8bit_lt_layer(x)
+
+    # Wrap the InvokeLinear8bitLt layer in a CustomInvokeLinear8bitLt layer, and run inference on it.
+    linear_8bit_lt_layer.__class__ = CustomInvokeLinear8bitLt
+    y_custom = linear_8bit_lt_layer(x)
+
+    # Assert that the quantized and custom layers produce the same output.
+    assert torch.allclose(y_quantized, y_custom, atol=1e-5)

diff --git a/tests/backend/model_manager/load/model_cache/torch_module_autocast/test_autocast_modules.py b/tests/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/test_custom_invoke_linear_nf4.py
similarity index 50%
rename from tests/backend/model_manager/load/model_cache/torch_module_autocast/test_autocast_modules.py
rename to tests/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/test_custom_invoke_linear_nf4.py
index ba5f27cdaa..1b74a9d656 100644
--- a/tests/backend/model_manager/load/model_cache/torch_module_autocast/test_autocast_modules.py
+++ b/tests/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/test_custom_invoke_linear_nf4.py
@@ -1,77 +1,10 @@
 import pytest
 import torch
 
-if not torch.cuda.is_available():
-    pytest.skip("CUDA is not available", allow_module_level=True)
-else:
-    from invokeai.backend.model_manager.load.model_cache.torch_module_autocast.custom_modules.custom_invoke_linear_8_bit_lt import (
-        CustomInvokeLinear8bitLt,
-    )
-    from invokeai.backend.model_manager.load.model_cache.torch_module_autocast.custom_modules.custom_invoke_linear_nf4 import (
-        CustomInvokeLinearNF4,
-    )
-    from invokeai.backend.quantization.bnb_llm_int8 import InvokeLinear8bitLt
-    from invokeai.backend.quantization.bnb_nf4 import InvokeLinearNF4
-
-
-@pytest.fixture
-def linear_8bit_lt_layer():
-    if not torch.cuda.is_available():
-        pytest.skip("CUDA is not available")
-
-    torch.manual_seed(1)
-
-    orig_layer = torch.nn.Linear(32, 64)
-    orig_layer_state_dict = orig_layer.state_dict()
-
-    # Prepare a quantized InvokeLinear8bitLt layer.
-    quantized_layer = InvokeLinear8bitLt(input_features=32, output_features=64, has_fp16_weights=False)
-    quantized_layer.load_state_dict(orig_layer_state_dict)
-    quantized_layer.to("cuda")
-
-    # Assert that the InvokeLinear8bitLt layer is quantized.
-    assert quantized_layer.weight.CB is not None
-    assert quantized_layer.weight.SCB is not None
-    assert quantized_layer.weight.CB.dtype == torch.int8
-
-    return quantized_layer
-
-
-def test_custom_invoke_linear_8bit_lt_all_weights_on_cuda(linear_8bit_lt_layer: InvokeLinear8bitLt):
-    """Test CustomInvokeLinear8bitLt inference with all weights on the GPU."""
-    # Run inference on the original layer.
-    x = torch.randn(1, 32).to("cuda")
-    y_quantized = linear_8bit_lt_layer(x)
-
-    # Wrap the InvokeLinear8bitLt layer in a CustomInvokeLinear8bitLt layer, and run inference on it.
-    linear_8bit_lt_layer.__class__ = CustomInvokeLinear8bitLt
-    y_custom = linear_8bit_lt_layer(x)
-
-    # Assert that the quantized and custom layers produce the same output.
-    assert torch.allclose(y_quantized, y_custom, atol=1e-5)
-
-
-def test_custom_invoke_linear_8bit_lt_all_weights_on_cpu(linear_8bit_lt_layer: InvokeLinear8bitLt):
-    """Test CustomInvokeLinear8bitLt inference with all weights on the CPU (streaming to the GPU)."""
-    # Run inference on the original layer.
-    x = torch.randn(1, 32).to("cuda")
-    y_quantized = linear_8bit_lt_layer(x)
-
-    # Copy the state dict to the CPU and reload it.
-    state_dict = linear_8bit_lt_layer.state_dict()
-    state_dict = {k: v.to("cpu") for k, v in state_dict.items()}
-    linear_8bit_lt_layer.load_state_dict(state_dict)
-
-    # Inference of the original layer should fail.
-    with pytest.raises(RuntimeError):
-        linear_8bit_lt_layer(x)
-
-    # Wrap the InvokeLinear8bitLt layer in a CustomInvokeLinear8bitLt layer, and run inference on it.
-    linear_8bit_lt_layer.__class__ = CustomInvokeLinear8bitLt
-    y_custom = linear_8bit_lt_layer(x)
-
-    # Assert that the quantized and custom layers produce the same output.
-    assert torch.allclose(y_quantized, y_custom, atol=1e-5)
+from invokeai.backend.model_manager.load.model_cache.torch_module_autocast.custom_modules.custom_invoke_linear_nf4 import (
+    CustomInvokeLinearNF4,
+)
+from invokeai.backend.quantization.bnb_nf4 import InvokeLinearNF4
 
 
 @pytest.fixture
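Note: both tests in this patch rely on the same in-place retyping trick: assigning to
`__class__` swaps an instance's type so the custom `forward()` takes over without
copying any of the (quantized) parameters. Below is a minimal sketch of that pattern,
runnable on CPU; the `PlainLinear` and `DeviceCastingLinear` names are illustrative
only and are not part of the patch or of InvokeAI's API:

    import torch

    class PlainLinear(torch.nn.Linear):
        pass

    class DeviceCastingLinear(PlainLinear):
        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # Move the weights to the input's device at call time. This mirrors
            # the idea behind CustomInvokeLinear8bitLt, which streams its
            # quantized weights to the GPU on each forward pass.
            weight = self.weight.to(x.device)
            bias = self.bias.to(x.device) if self.bias is not None else None
            return torch.nn.functional.linear(x, weight, bias)

    layer = PlainLinear(32, 64)
    layer.__class__ = DeviceCastingLinear  # Retype in place; no weights are copied.
    y = layer(torch.randn(1, 32))
    assert y.shape == (1, 64)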