Performance optimizations for LoRAs applied on top of GGML-quantized tensors.

2026-04-23 03:00:31 -04:00 · 2025-01-24 20:52:25 +00:00
parent 6c919e1bca
commit 229834a5e8
1 changed file with 3 additions and 1 deletion
--- a/invokeai/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/custom_module_mixin.py
+++ b/invokeai/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/custom_module_mixin.py
@@ -52,7 +52,9 @@ class CustomModuleMixin:
            if type(param) is torch.nn.Parameter and type(param.data) is torch.Tensor:
                pass
            elif type(param) is GGMLTensor:
-                pass
+                # Move to device and dequantize here. Doing it in the patch layer can result in redundant casts /
+                # dequantizations.
+                orig_params[param_name] = param.to(device=device).get_dequantized_tensor()
            else:
                orig_params[param_name] = torch.empty(get_param_shape(param), device="meta")