Add CustomInvokeLinearNF4 to enable CPU -> GPU streaming for InvokeLinearNF4 layers.

2026-04-23 03:00:31 -04:00 · 2024-12-22 20:52:03 +00:00
parent 1b56020876
commit dc54e8763b
2 changed files with 105 additions and 0 deletions
--- a/invokeai/backend/model_manager/load/model_cache/torch_module_autocast/autocast_modules.py
+++ b/invokeai/backend/model_manager/load/model_cache/torch_module_autocast/autocast_modules.py
@@ -1,9 +1,11 @@
+import copy
 from typing import TypeVar

 import bitsandbytes as bnb
 import torch

 from invokeai.backend.quantization.bnb_llm_int8 import InvokeLinear8bitLt
+from invokeai.backend.quantization.bnb_nf4 import InvokeLinearNF4

 T = TypeVar("T", torch.Tensor, None, torch.Tensor | None)

@@ -84,3 +86,37 @@ class CustomInvokeLinear8bitLt(InvokeLinear8bitLt):
        # it's dtype field must be accessible, even though it's not used. We pass in self.weight even though it could be
        # on the wrong device.
        return bnb.matmul(x, self.weight, bias=cast_to_device(self.bias, x.device), state=matmul_state)
+
+
+class CustomInvokeLinearNF4(InvokeLinearNF4):
+    """InvokeLinearNF4 layer that casts its weight and bias to the input's device inside forward().
+
+    This enables CPU -> GPU streaming: the stored parameters can live on one device while the matmul runs on the
+    device of the input tensor. The stored weight's quant_state is restored after each cast so that it is not left
+    behind on the compute device.
+    """
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Run the 4-bit linear layer on x, using copies of the weight and bias on x's device.
+
+        Args:
+            x: Input tensor. Its device is where the weight and bias are cast to, and its dtype is the dtype of
+                the returned tensor.
+
+        Returns:
+            The output of bnb.matmul_4bit, cast back to the dtype x had on entry.
+
+        Side effects:
+            - If the stored bias has a different dtype than x, self.bias.data is replaced by a copy in x.dtype.
+            - On the first call, set_compute_type(x) is invoked and compute_type_is_set is set to True.
+        """
+        bnb.nn.modules.fix_4bit_weight_quant_state_from_module(self)
+
+        # The bias is not cast automatically, so it has to be matched to the input dtype manually.
+        if self.bias is not None and self.bias.dtype != x.dtype:
+            self.bias.data = self.bias.data.to(x.dtype)
+
+        if not self.compute_type_is_set:
+            self.set_compute_type(x)
+            self.compute_type_is_set = True
+
+        # Remember the caller's dtype so that the output can be cast back after computing in compute_dtype.
+        inp_dtype = x.dtype
+        if self.compute_dtype is not None:
+            x = x.to(self.compute_dtype)
+
+        # HACK(ryand): Casting self.weight to the device also casts the self.weight.quant_state in-place (i.e. it
+        # does not follow the tensor semantics of returning a new copy when converting to a different device). This
+        # means that quant_state elements that started on the CPU would be left on the GPU, which we don't want. To
+        # avoid this side effect we make a shallow copy of the original quant_state so that we can restore it. Fixing
+        # this properly would require more invasive changes to the bitsandbytes library.
+        old_quant_state = copy.copy(self.weight.quant_state)
+        try:
+            weight = cast_to_device(self.weight, x.device)
+        finally:
+            # Restore on both success and failure so that a failed cast cannot leave the in-place modified
+            # quant_state attached to the stored weight.
+            self.weight.quant_state = old_quant_state
+
+        # The bias needs both the compute dtype and the input's device. Chain the two casts on the same tensor:
+        # re-reading self.bias for the device cast would silently discard the dtype cast.
+        bias = None if self.bias is None else self.bias.to(self.compute_dtype)
+        bias = cast_to_device(bias, x.device)
+        return bnb.matmul_4bit(x, weight.t(), bias=bias, quant_state=weight.quant_state).to(inp_dtype)