Compare commits


14 Commits

Author SHA1 Message Date
Ryan Dick f6045682c0 Fix bug with partial offload of model buffers. 2024-12-10 22:19:17 +00:00
Ryan Dick 84a75ddb72 Fix bug in ModelCache that was causing it to offload more models from VRAM than necessary. 2024-12-10 20:38:37 +00:00
Ryan Dick a9fb1c82a0 Fix handling of torch.nn.Module buffers in CachedModelWithPartialLoad. 2024-12-10 19:38:04 +00:00
Ryan Dick cc7391e630 Enable LoRAPatcher.apply_smart_lora_patches(...) throughout the stack. 2024-12-10 17:27:33 +00:00
Ryan Dick 62d595f695 (minor) Rename num_layers -> num_loras in unit tests. 2024-12-10 16:41:52 +00:00
Ryan Dick 5e2080266e Add test_apply_smart_lora_patches_to_partially_loaded_model(...). 2024-12-10 16:38:48 +00:00
Ryan Dick ed7bb7ea3d Add LoRAPatcher.smart_apply_lora_patches() 2024-12-10 16:26:34 +00:00
Ryan Dick 62407f7c6b Refactor LoRAPatcher slightly in preparation for a 'smart' patcher. 2024-12-10 15:36:36 +00:00
Ryan Dick 80128e1e14 Fix LoRAPatcher.apply_lora_wrapper_patches(...) 2024-12-10 03:10:23 +00:00
Ryan Dick 4c84d39e7d Finish consolidating LoRA sidecar wrapper implementations. 2024-12-10 02:54:32 +00:00
Ryan Dick 0c4a368555 Begin to consolidate the LoRA sidecar and LoRA layer wrapper implementations. 2024-12-10 01:16:01 +00:00
Ryan Dick 55dc762a91 Fix bias handling in LoRAModuleWrapper and add unit test that checks that all LoRA patching methods produce the same outputs. 2024-12-09 16:59:37 +00:00
Ryan Dick d825d3856e Add LoRA wrapper patching to LoRAPatcher. 2024-12-09 16:35:23 +00:00
Ryan Dick d94733f55a Add LoRA wrapper layer. 2024-12-09 15:17:50 +00:00
23 changed files with 650 additions and 308 deletions

View File

@@ -82,10 +82,11 @@ class CompelInvocation(BaseInvocation):
# apply all patches while the model is on the target device
text_encoder_info.model_on_device() as (cached_weights, text_encoder),
tokenizer_info as tokenizer,
LoRAPatcher.apply_lora_patches(
LoRAPatcher.apply_smart_lora_patches(
model=text_encoder,
patches=_lora_loader(),
prefix="lora_te_",
dtype=TorchDevice.choose_torch_dtype(),
cached_weights=cached_weights,
),
# Apply CLIP Skip after LoRA to prevent LoRA application from failing on skipped layers.
@@ -179,10 +180,11 @@ class SDXLPromptInvocationBase:
# apply all patches while the model is on the target device
text_encoder_info.model_on_device() as (cached_weights, text_encoder),
tokenizer_info as tokenizer,
LoRAPatcher.apply_lora_patches(
LoRAPatcher.apply_smart_lora_patches(
text_encoder,
patches=_lora_loader(),
prefix=lora_prefix,
dtype=TorchDevice.choose_torch_dtype(),
cached_weights=cached_weights,
),
# Apply CLIP Skip after LoRA to prevent LoRA application from failing on skipped layers.
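The same call-site migration repeats across the invocations in this compare: LoRAPatcher.apply_lora_patches is replaced by LoRAPatcher.apply_smart_lora_patches, which additionally requires an explicit dtype. A minimal sketch of the resulting pattern, reusing the names from the hunk above (illustrative only, not the full invocation):

with (
    text_encoder_info.model_on_device() as (cached_weights, text_encoder),
    LoRAPatcher.apply_smart_lora_patches(
        model=text_encoder,
        patches=_lora_loader(),
        prefix="lora_te_",
        dtype=TorchDevice.choose_torch_dtype(),
        cached_weights=cached_weights,
    ),
):
    ...  # run the text encoder while the LoRA patches are active; they are reverted on exit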

View File

@@ -1003,10 +1003,11 @@ class DenoiseLatentsInvocation(BaseInvocation):
ModelPatcher.apply_freeu(unet, self.unet.freeu_config),
SeamlessExt.static_patch_model(unet, self.unet.seamless_axes), # FIXME
# Apply the LoRA after unet has been moved to its target device for faster patching.
LoRAPatcher.apply_lora_patches(
LoRAPatcher.apply_smart_lora_patches(
model=unet,
patches=_lora_loader(),
prefix="lora_unet_",
dtype=unet.dtype,
cached_weights=cached_weights,
),
):

View File

@@ -296,10 +296,11 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard):
if config.format in [ModelFormat.Checkpoint]:
# The model is non-quantized, so we can apply the LoRA weights directly into the model.
exit_stack.enter_context(
LoRAPatcher.apply_lora_patches(
LoRAPatcher.apply_smart_lora_patches(
model=transformer,
patches=self._lora_iterator(context),
prefix=FLUX_LORA_TRANSFORMER_PREFIX,
dtype=inference_dtype,
cached_weights=cached_weights,
)
)
@@ -311,7 +312,7 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard):
# The model is quantized, so apply the LoRA weights as sidecar layers. This results in slower inference
# than directly patching the weights, but is agnostic to the quantization format.
exit_stack.enter_context(
LoRAPatcher.apply_lora_sidecar_patches(
LoRAPatcher.apply_lora_wrapper_patches(
model=transformer,
patches=self._lora_iterator(context),
prefix=FLUX_LORA_TRANSFORMER_PREFIX,
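Condensed, the FLUX denoise path now dispatches on the model format: non-quantized checkpoints go through the smart patcher, while quantized models keep using wrapper (sidecar) patching, which never rewrites the base weights. A sketch of the branch, assuming the names from the hunks above (the dtype argument in the second call is an assumption, since that hunk is truncated here):

if config.format in [ModelFormat.Checkpoint]:
    # Non-quantized: patch (or smart-patch) the weights directly.
    exit_stack.enter_context(
        LoRAPatcher.apply_smart_lora_patches(
            model=transformer,
            patches=self._lora_iterator(context),
            prefix=FLUX_LORA_TRANSFORMER_PREFIX,
            dtype=inference_dtype,
            cached_weights=cached_weights,
        )
    )
else:
    # Quantized: apply the LoRA weights as sidecar wrapper layers instead.
    exit_stack.enter_context(
        LoRAPatcher.apply_lora_wrapper_patches(
            model=transformer,
            patches=self._lora_iterator(context),
            prefix=FLUX_LORA_TRANSFORMER_PREFIX,
            dtype=inference_dtype,
        )
    )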

View File

@@ -22,6 +22,7 @@ from invokeai.backend.lora.lora_model_raw import LoRAModelRaw
from invokeai.backend.lora.lora_patcher import LoRAPatcher
from invokeai.backend.model_manager.config import ModelFormat
from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningFieldData, FLUXConditioningInfo
from invokeai.backend.util.devices import TorchDevice
@invocation(
@@ -111,10 +112,11 @@ class FluxTextEncoderInvocation(BaseInvocation):
if clip_text_encoder_config.format in [ModelFormat.Diffusers]:
# The model is non-quantized, so we can apply the LoRA weights directly into the model.
exit_stack.enter_context(
LoRAPatcher.apply_lora_patches(
LoRAPatcher.apply_smart_lora_patches(
model=clip_text_encoder,
patches=self._clip_lora_iterator(context),
prefix=FLUX_LORA_CLIP_PREFIX,
dtype=TorchDevice.choose_torch_dtype(),
cached_weights=cached_weights,
)
)

View File

@@ -21,6 +21,7 @@ from invokeai.backend.lora.lora_model_raw import LoRAModelRaw
from invokeai.backend.lora.lora_patcher import LoRAPatcher
from invokeai.backend.model_manager.config import ModelFormat
from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningFieldData, SD3ConditioningInfo
from invokeai.backend.util.devices import TorchDevice
# The SD3 T5 max sequence length is set based on the default in diffusers.
SD3_T5_MAX_SEQ_LEN = 256
@@ -150,10 +151,11 @@ class Sd3TextEncoderInvocation(BaseInvocation):
if clip_text_encoder_config.format in [ModelFormat.Diffusers]:
# The model is non-quantized, so we can apply the LoRA weights directly into the model.
exit_stack.enter_context(
LoRAPatcher.apply_lora_patches(
LoRAPatcher.apply_smart_lora_patches(
model=clip_text_encoder,
patches=self._clip_lora_iterator(context, clip_model),
prefix=FLUX_LORA_CLIP_PREFIX,
dtype=TorchDevice.choose_torch_dtype(),
cached_weights=cached_weights,
)
)

View File

@@ -207,7 +207,9 @@ class TiledMultiDiffusionDenoiseLatents(BaseInvocation):
with (
ExitStack() as exit_stack,
unet_info as unet,
LoRAPatcher.apply_lora_patches(model=unet, patches=_lora_loader(), prefix="lora_unet_"),
LoRAPatcher.apply_smart_lora_patches(
model=unet, patches=_lora_loader(), prefix="lora_unet_", dtype=unet.dtype
),
):
assert isinstance(unet, UNet2DConditionModel)
latents = latents.to(device=unet.device, dtype=unet.dtype)

View File

@@ -0,0 +1,133 @@
import torch
from invokeai.backend.lora.layers.any_lora_layer import AnyLoRALayer
from invokeai.backend.lora.layers.concatenated_lora_layer import ConcatenatedLoRALayer
from invokeai.backend.lora.layers.lora_layer import LoRALayer
class LoRASidecarWrapper(torch.nn.Module):
def __init__(self, orig_module: torch.nn.Module, lora_layers: list[AnyLoRALayer], lora_weights: list[float]):
super().__init__()
self._orig_module = orig_module
self._lora_layers = lora_layers
self._lora_weights = lora_weights
@property
def orig_module(self) -> torch.nn.Module:
return self._orig_module
def add_lora_layer(self, lora_layer: AnyLoRALayer, lora_weight: float):
self._lora_layers.append(lora_layer)
self._lora_weights.append(lora_weight)
@torch.no_grad()
def _get_lora_patched_parameters(
self, orig_params: dict[str, torch.Tensor], lora_layers: list[AnyLoRALayer], lora_weights: list[float]
) -> dict[str, torch.Tensor]:
params: dict[str, torch.Tensor] = {}
for lora_layer, lora_weight in zip(lora_layers, lora_weights, strict=True):
layer_params = lora_layer.get_parameters(self._orig_module)
for param_name, param_weight in layer_params.items():
if orig_params[param_name].shape != param_weight.shape:
param_weight = param_weight.reshape(orig_params[param_name].shape)
if param_name not in params:
params[param_name] = param_weight * (lora_layer.scale() * lora_weight)
else:
params[param_name] += param_weight * (lora_layer.scale() * lora_weight)
return params
class LoRALinearWrapper(LoRASidecarWrapper):
def _lora_linear_forward(self, input: torch.Tensor, lora_layer: LoRALayer, lora_weight: float) -> torch.Tensor:
"""An optimized implementation of the residual calculation for a Linear LoRALayer."""
x = torch.nn.functional.linear(input, lora_layer.down)
if lora_layer.mid is not None:
x = torch.nn.functional.linear(x, lora_layer.mid)
x = torch.nn.functional.linear(x, lora_layer.up, bias=lora_layer.bias)
x *= lora_weight * lora_layer.scale()
return x
def _concatenated_lora_forward(
self, input: torch.Tensor, concatenated_lora_layer: ConcatenatedLoRALayer, lora_weight: float
) -> torch.Tensor:
"""An optimized implementation of the residual calculation for a Linear ConcatenatedLoRALayer."""
x_chunks: list[torch.Tensor] = []
for lora_layer in concatenated_lora_layer.lora_layers:
x_chunk = torch.nn.functional.linear(input, lora_layer.down)
if lora_layer.mid is not None:
x_chunk = torch.nn.functional.linear(x_chunk, lora_layer.mid)
x_chunk = torch.nn.functional.linear(x_chunk, lora_layer.up, bias=lora_layer.bias)
x_chunk *= lora_weight * lora_layer.scale()
x_chunks.append(x_chunk)
# TODO(ryand): Generalize to support concat_axis != 0.
assert concatenated_lora_layer.concat_axis == 0
x = torch.cat(x_chunks, dim=-1)
return x
def forward(self, input: torch.Tensor) -> torch.Tensor:
# Split the LoRA layers into those that have optimized implementations and those that don't.
optimized_layer_types = (LoRALayer, ConcatenatedLoRALayer)
optimized_layers = [
(layer, weight)
for layer, weight in zip(self._lora_layers, self._lora_weights, strict=True)
if isinstance(layer, optimized_layer_types)
]
non_optimized_layers = [
(layer, weight)
for layer, weight in zip(self._lora_layers, self._lora_weights, strict=True)
if not isinstance(layer, optimized_layer_types)
]
# First, calculate the residual for LoRA layers for which there is an optimized implementation.
residual = None
for lora_layer, lora_weight in optimized_layers:
if isinstance(lora_layer, LoRALayer):
added_residual = self._lora_linear_forward(input, lora_layer, lora_weight)
elif isinstance(lora_layer, ConcatenatedLoRALayer):
added_residual = self._concatenated_lora_forward(input, lora_layer, lora_weight)
else:
raise ValueError(f"Unsupported LoRA layer type: {type(lora_layer)}")
if residual is None:
residual = added_residual
else:
residual += added_residual
# Next, calculate the residuals for the LoRA layers for which there is no optimized implementation.
if non_optimized_layers:
unoptimized_layers, unoptimized_weights = zip(*non_optimized_layers, strict=True)
params = self._get_lora_patched_parameters(
orig_params={"weight": self._orig_module.weight, "bias": self._orig_module.bias},
lora_layers=unoptimized_layers,
lora_weights=unoptimized_weights,
)
added_residual = torch.nn.functional.linear(input, params["weight"], params.get("bias", None))
if residual is None:
residual = added_residual
else:
residual += added_residual
return self.orig_module(input) + residual
class LoRAConv1dWrapper(LoRASidecarWrapper):
def forward(self, input: torch.Tensor) -> torch.Tensor:
params = self._get_lora_patched_parameters(
orig_params={"weight": self._orig_module.weight, "bias": self._orig_module.bias},
lora_layers=self._lora_layers,
lora_weights=self._lora_weights,
)
return self.orig_module(input) + torch.nn.functional.conv1d(input, params["weight"], params.get("bias", None))
class LoRAConv2dWrapper(LoRASidecarWrapper):
def forward(self, input: torch.Tensor) -> torch.Tensor:
params = self._get_lora_patched_parameters(
orig_params={"weight": self._orig_module.weight, "bias": self._orig_module.bias},
lora_layers=self._lora_layers,
lora_weights=self._lora_weights,
)
return self.orig_module(input) + torch.nn.functional.conv2d(input, params["weight"], params.get("bias", None))
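In short, every wrapper leaves the wrapped module's weights untouched and adds a LoRA residual on top of its output. For a layer wrapped with LoRA layers i (down/mid/up matrices D_i, M_i, U_i, optional bias b_i, per-layer scale s_i, and patch weight w_i), the forward pass computes, schematically:

    y = orig_module(x) + sum_i w_i * s_i * (U_i (M_i (D_i x)) + b_i)

with M_i treated as the identity when mid is None. The optimized Linear paths compute this residual factor by factor, while _get_lora_patched_parameters materializes the summed delta weights and applies them in a single linear/conv call, which is what the Conv1d/Conv2d wrappers and any non-optimized layer types fall back to.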

View File

@@ -4,19 +4,126 @@ from typing import Dict, Iterable, Optional, Tuple
import torch
from invokeai.backend.lora.layers.any_lora_layer import AnyLoRALayer
from invokeai.backend.lora.layers.concatenated_lora_layer import ConcatenatedLoRALayer
from invokeai.backend.lora.layers.lora_layer import LoRALayer
from invokeai.backend.lora.lora_model_raw import LoRAModelRaw
from invokeai.backend.lora.sidecar_layers.concatenated_lora.concatenated_lora_linear_sidecar_layer import (
ConcatenatedLoRALinearSidecarLayer,
from invokeai.backend.lora.lora_layer_wrappers import (
LoRAConv1dWrapper,
LoRAConv2dWrapper,
LoRALinearWrapper,
LoRASidecarWrapper,
)
from invokeai.backend.lora.sidecar_layers.lora.lora_linear_sidecar_layer import LoRALinearSidecarLayer
from invokeai.backend.lora.sidecar_layers.lora_sidecar_module import LoRASidecarModule
from invokeai.backend.lora.lora_model_raw import LoRAModelRaw
from invokeai.backend.util.devices import TorchDevice
from invokeai.backend.util.original_weights_storage import OriginalWeightsStorage
class LoRAPatcher:
@staticmethod
@torch.no_grad()
@contextmanager
def apply_smart_lora_patches(
model: torch.nn.Module,
patches: Iterable[Tuple[LoRAModelRaw, float]],
prefix: str,
dtype: torch.dtype,
cached_weights: Optional[Dict[str, torch.Tensor]] = None,
):
"""Apply 'smart' LoRA patching that chooses whether to use direct patching or a sidecar wrapper for each module."""
# original_weights are stored for unpatching layers that are directly patched.
original_weights = OriginalWeightsStorage(cached_weights)
# original_modules are stored for unpatching layers that are wrapped in a LoRASidecarWrapper.
original_modules: dict[str, torch.nn.Module] = {}
try:
for patch, patch_weight in patches:
LoRAPatcher._apply_smart_lora_patch(
model=model,
prefix=prefix,
patch=patch,
patch_weight=patch_weight,
original_weights=original_weights,
original_modules=original_modules,
dtype=dtype,
)
yield
finally:
# Restore directly patched layers.
for param_key, weight in original_weights.get_changed_weights():
model.get_parameter(param_key).copy_(weight)
# Restore LoRASidecarWrapper modules.
# Note: This logic assumes no nested modules in original_modules.
for module_key, orig_module in original_modules.items():
module_parent_key, module_name = LoRAPatcher._split_parent_key(module_key)
parent_module = model.get_submodule(module_parent_key)
LoRAPatcher._set_submodule(parent_module, module_name, orig_module)
@staticmethod
@torch.no_grad()
def _apply_smart_lora_patch(
model: torch.nn.Module,
prefix: str,
patch: LoRAModelRaw,
patch_weight: float,
original_weights: OriginalWeightsStorage,
original_modules: dict[str, torch.nn.Module],
dtype: torch.dtype,
):
"""Apply a single LoRA patch to a model using the 'smart' patching strategy that chooses whether to use direct
patching or a sidecar wrapper for each module.
"""
if patch_weight == 0:
return
# If the layer keys contain a dot, then they are not flattened, and can be directly used to access model
# submodules. If the layer keys do not contain a dot, then they are flattened, meaning that all '.' have been
# replaced with '_'. Non-flattened keys are preferred, because they allow submodules to be accessed directly
# without searching, but some legacy code still uses flattened keys.
layer_keys_are_flattened = "." not in next(iter(patch.layers.keys()))
prefix_len = len(prefix)
for layer_key, layer in patch.layers.items():
if not layer_key.startswith(prefix):
continue
module_key, module = LoRAPatcher._get_submodule(
model, layer_key[prefix_len:], layer_key_is_flattened=layer_keys_are_flattened
)
# Decide whether to use direct patching or a sidecar wrapper.
# Direct patching is preferred, because it results in better runtime speed.
# Reasons to use sidecar patching:
# - The module is already wrapped in a LoRASidecarWrapper.
# - The module is quantized.
# - The module is on the CPU (and we don't want to store a second full copy of the original weights on the
# CPU, since this would double the RAM usage)
# NOTE: For now, we don't check if the layer is quantized here. We assume that this is checked in the caller
# and that the caller will use the 'apply_lora_wrapper_patches' method if the layer is quantized.
# TODO(ryand): Handle the case where we are running without a GPU. Should we set a config flag that allows
# forcing full patching even on the CPU?
if isinstance(module, LoRASidecarWrapper) or LoRAPatcher._is_any_part_of_layer_on_cpu(module):
LoRAPatcher._apply_lora_layer_wrapper_patch(
model=model,
module_to_patch=module,
module_to_patch_key=module_key,
patch=layer,
patch_weight=patch_weight,
original_modules=original_modules,
dtype=dtype,
)
else:
LoRAPatcher._apply_lora_layer_patch(
module_to_patch=module,
module_to_patch_key=module_key,
patch=layer,
patch_weight=patch_weight,
original_weights=original_weights,
)
@staticmethod
def _is_any_part_of_layer_on_cpu(layer: torch.nn.Module) -> bool:
return any(p.device.type == "cpu" for p in layer.parameters())
@staticmethod
@torch.no_grad()
@contextmanager
@@ -40,7 +147,7 @@ class LoRAPatcher:
original_weights = OriginalWeightsStorage(cached_weights)
try:
for patch, patch_weight in patches:
LoRAPatcher.apply_lora_patch(
LoRAPatcher._apply_lora_patch(
model=model,
prefix=prefix,
patch=patch,
@@ -56,7 +163,7 @@ class LoRAPatcher:
@staticmethod
@torch.no_grad()
def apply_lora_patch(
def _apply_lora_patch(
model: torch.nn.Module,
prefix: str,
patch: LoRAModelRaw,
@@ -91,48 +198,67 @@ class LoRAPatcher:
model, layer_key[prefix_len:], layer_key_is_flattened=layer_keys_are_flattened
)
# All of the LoRA weight calculations will be done on the same device as the module weight.
# (Performance will be best if this is a CUDA device.)
device = module.weight.device
dtype = module.weight.dtype
LoRAPatcher._apply_lora_layer_patch(
module_to_patch=module,
module_to_patch_key=module_key,
patch=layer,
patch_weight=patch_weight,
original_weights=original_weights,
)
layer_scale = layer.scale()
@staticmethod
@torch.no_grad()
def _apply_lora_layer_patch(
module_to_patch: torch.nn.Module,
module_to_patch_key: str,
patch: AnyLoRALayer,
patch_weight: float,
original_weights: OriginalWeightsStorage,
):
# All of the LoRA weight calculations will be done on the same device as the module weight.
# (Performance will be best if this is a CUDA device.)
device = module_to_patch.weight.device
dtype = module_to_patch.weight.dtype
# We intentionally move to the target device first, then cast. Experimentally, this was found to
# be significantly faster for 16-bit CPU tensors being moved to a CUDA device than doing the
# same thing in a single call to '.to(...)'.
layer.to(device=device)
layer.to(dtype=torch.float32)
layer_scale = patch.scale()
# TODO(ryand): Using torch.autocast(...) over explicit casting may offer a speed benefit on CUDA
# devices here. Experimentally, it was found to be very slow on CPU. More investigation needed.
for param_name, lora_param_weight in layer.get_parameters(module).items():
param_key = module_key + "." + param_name
module_param = module.get_parameter(param_name)
# We intentionally move to the target device first, then cast. Experimentally, this was found to
# be significantly faster for 16-bit CPU tensors being moved to a CUDA device than doing the
# same thing in a single call to '.to(...)'.
patch.to(device=device)
patch.to(dtype=torch.float32)
# Save original weight
original_weights.save(param_key, module_param)
# TODO(ryand): Using torch.autocast(...) over explicit casting may offer a speed benefit on CUDA
# devices here. Experimentally, it was found to be very slow on CPU. More investigation needed.
for param_name, lora_param_weight in patch.get_parameters(module_to_patch).items():
param_key = module_to_patch_key + "." + param_name
module_param = module_to_patch.get_parameter(param_name)
if module_param.shape != lora_param_weight.shape:
lora_param_weight = lora_param_weight.reshape(module_param.shape)
# Save original weight
original_weights.save(param_key, module_param)
lora_param_weight *= patch_weight * layer_scale
module_param += lora_param_weight.to(dtype=dtype)
if module_param.shape != lora_param_weight.shape:
lora_param_weight = lora_param_weight.reshape(module_param.shape)
layer.to(device=TorchDevice.CPU_DEVICE)
lora_param_weight *= patch_weight * layer_scale
module_param += lora_param_weight.to(dtype=dtype)
patch.to(device=TorchDevice.CPU_DEVICE)
@staticmethod
@torch.no_grad()
@contextmanager
def apply_lora_sidecar_patches(
def apply_lora_wrapper_patches(
model: torch.nn.Module,
patches: Iterable[Tuple[LoRAModelRaw, float]],
prefix: str,
dtype: torch.dtype,
):
"""Apply one or more LoRA sidecar patches to a model within a context manager. Sidecar patches incur some
overhead compared to normal LoRA patching, but they allow for LoRA layers to applied to base layers in any
quantization format.
"""Apply one or more LoRA wrapper patches to a model within a context manager. Wrapper patches incur some
runtime overhead compared to normal LoRA patching, but they enable:
- LoRA layers to be applied to quantized models
- LoRA layers to be applied to CPU layers without needing to store a full copy of the original weights (i.e.
avoid doubling the memory requirements).
Args:
model (torch.nn.Module): The model to patch.
@@ -140,14 +266,11 @@ class LoRAPatcher:
associated weights. An iterator is used so that the LoRA patches do not need to be loaded into memory
all at once.
prefix (str): The keys in the patches will be filtered to only include weights with this prefix.
dtype (torch.dtype): The compute dtype of the sidecar layers. This cannot easily be inferred from the model,
since the sidecar layers are typically applied on top of quantized layers whose weight dtype is
different from their compute dtype.
"""
original_modules: dict[str, torch.nn.Module] = {}
try:
for patch, patch_weight in patches:
LoRAPatcher._apply_lora_sidecar_patch(
LoRAPatcher._apply_lora_wrapper_patch(
model=model,
prefix=prefix,
patch=patch,
@@ -165,7 +288,7 @@ class LoRAPatcher:
LoRAPatcher._set_submodule(parent_module, module_name, orig_module)
@staticmethod
def _apply_lora_sidecar_patch(
def _apply_lora_wrapper_patch(
model: torch.nn.Module,
patch: LoRAModelRaw,
patch_weight: float,
@@ -173,7 +296,7 @@ class LoRAPatcher:
original_modules: dict[str, torch.nn.Module],
dtype: torch.dtype,
):
"""Apply a single LoRA sidecar patch to a model."""
"""Apply a single LoRA wrapper patch to a model."""
if patch_weight == 0:
return
@@ -194,28 +317,47 @@ class LoRAPatcher:
model, layer_key[prefix_len:], layer_key_is_flattened=layer_keys_are_flattened
)
# Initialize the LoRA sidecar layer.
lora_sidecar_layer = LoRAPatcher._initialize_lora_sidecar_layer(module, layer, patch_weight)
LoRAPatcher._apply_lora_layer_wrapper_patch(
model=model,
module_to_patch=module,
module_to_patch_key=module_key,
patch=layer,
patch_weight=patch_weight,
original_modules=original_modules,
dtype=dtype,
)
# Replace the original module with a LoRASidecarModule if it has not already been done.
if module_key in original_modules:
# The module has already been patched with a LoRASidecarModule. Append to it.
assert isinstance(module, LoRASidecarModule)
lora_sidecar_module = module
else:
# The module has not yet been patched with a LoRASidecarModule. Create one.
lora_sidecar_module = LoRASidecarModule(module, [])
original_modules[module_key] = module
module_parent_key, module_name = LoRAPatcher._split_parent_key(module_key)
module_parent = model.get_submodule(module_parent_key)
LoRAPatcher._set_submodule(module_parent, module_name, lora_sidecar_module)
@staticmethod
@torch.no_grad()
def _apply_lora_layer_wrapper_patch(
model: torch.nn.Module,
module_to_patch: torch.nn.Module,
module_to_patch_key: str,
patch: AnyLoRALayer,
patch_weight: float,
original_modules: dict[str, torch.nn.Module],
dtype: torch.dtype,
):
"""Apply a single LoRA wrapper patch to a model."""
# Move the LoRA sidecar layer to the same device/dtype as the orig module.
# TODO(ryand): Experiment with moving to the device first, then casting. This could be faster.
lora_sidecar_layer.to(device=lora_sidecar_module.orig_module.weight.device, dtype=dtype)
# Replace the original module with a LoRASidecarWrapper if it has not already been done.
if not isinstance(module_to_patch, LoRASidecarWrapper):
lora_wrapper_layer = LoRAPatcher._initialize_lora_wrapper_layer(module_to_patch)
original_modules[module_to_patch_key] = module_to_patch
module_parent_key, module_name = LoRAPatcher._split_parent_key(module_to_patch_key)
module_parent = model.get_submodule(module_parent_key)
LoRAPatcher._set_submodule(module_parent, module_name, lora_wrapper_layer)
orig_module = module_to_patch
else:
assert module_to_patch_key in original_modules
lora_wrapper_layer = module_to_patch
orig_module = module_to_patch.orig_module
# Add the LoRA sidecar layer to the LoRASidecarModule.
lora_sidecar_module.add_lora_layer(lora_sidecar_layer)
# Move the LoRA layer to the same device/dtype as the orig module.
patch.to(device=orig_module.weight.device, dtype=dtype)
# Add the LoRA wrapper layer to the LoRASidecarWrapper.
lora_wrapper_layer.add_lora_layer(patch, patch_weight)
@staticmethod
def _split_parent_key(module_key: str) -> tuple[str, str]:
@@ -236,17 +378,13 @@ class LoRAPatcher:
raise ValueError(f"Invalid module key: {module_key}")
@staticmethod
def _initialize_lora_sidecar_layer(orig_layer: torch.nn.Module, lora_layer: AnyLoRALayer, patch_weight: float):
# TODO(ryand): Add support for more original layer types and LoRA layer types.
if isinstance(orig_layer, torch.nn.Linear) or (
isinstance(orig_layer, LoRASidecarModule) and isinstance(orig_layer.orig_module, torch.nn.Linear)
):
if isinstance(lora_layer, LoRALayer):
return LoRALinearSidecarLayer(lora_layer=lora_layer, weight=patch_weight)
elif isinstance(lora_layer, ConcatenatedLoRALayer):
return ConcatenatedLoRALinearSidecarLayer(concatenated_lora_layer=lora_layer, weight=patch_weight)
else:
raise ValueError(f"Unsupported Linear LoRA layer type: {type(lora_layer)}")
def _initialize_lora_wrapper_layer(orig_layer: torch.nn.Module):
if isinstance(orig_layer, torch.nn.Linear):
return LoRALinearWrapper(orig_layer, [], [])
elif isinstance(orig_layer, torch.nn.Conv1d):
return LoRAConv1dWrapper(orig_layer, [], [])
elif isinstance(orig_layer, torch.nn.Conv2d):
return LoRAConv2dWrapper(orig_layer, [], [])
else:
raise ValueError(f"Unsupported layer type: {type(orig_layer)}")

View File

@@ -1,34 +0,0 @@
import torch
from invokeai.backend.lora.layers.concatenated_lora_layer import ConcatenatedLoRALayer
class ConcatenatedLoRALinearSidecarLayer(torch.nn.Module):
def __init__(
self,
concatenated_lora_layer: ConcatenatedLoRALayer,
weight: float,
):
super().__init__()
self._concatenated_lora_layer = concatenated_lora_layer
self._weight = weight
def forward(self, input: torch.Tensor) -> torch.Tensor:
x_chunks: list[torch.Tensor] = []
for lora_layer in self._concatenated_lora_layer.lora_layers:
x_chunk = torch.nn.functional.linear(input, lora_layer.down)
if lora_layer.mid is not None:
x_chunk = torch.nn.functional.linear(x_chunk, lora_layer.mid)
x_chunk = torch.nn.functional.linear(x_chunk, lora_layer.up, bias=lora_layer.bias)
x_chunk *= self._weight * lora_layer.scale()
x_chunks.append(x_chunk)
# TODO(ryand): Generalize to support concat_axis != 0.
assert self._concatenated_lora_layer.concat_axis == 0
x = torch.cat(x_chunks, dim=-1)
return x
def to(self, device: torch.device | None = None, dtype: torch.dtype | None = None):
self._concatenated_lora_layer.to(device=device, dtype=dtype)
return self

View File

@@ -1,27 +0,0 @@
import torch
from invokeai.backend.lora.layers.lora_layer import LoRALayer
class LoRALinearSidecarLayer(torch.nn.Module):
def __init__(
self,
lora_layer: LoRALayer,
weight: float,
):
super().__init__()
self._lora_layer = lora_layer
self._weight = weight
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = torch.nn.functional.linear(x, self._lora_layer.down)
if self._lora_layer.mid is not None:
x = torch.nn.functional.linear(x, self._lora_layer.mid)
x = torch.nn.functional.linear(x, self._lora_layer.up, bias=self._lora_layer.bias)
x *= self._weight * self._lora_layer.scale()
return x
def to(self, device: torch.device | None = None, dtype: torch.dtype | None = None):
self._lora_layer.to(device=device, dtype=dtype)
return self

View File

@@ -1,24 +0,0 @@
import torch
class LoRASidecarModule(torch.nn.Module):
"""A LoRA sidecar module that wraps an original module and adds LoRA layers to it."""
def __init__(self, orig_module: torch.nn.Module, lora_layers: list[torch.nn.Module]):
super().__init__()
self.orig_module = orig_module
self._lora_layers = lora_layers
def add_lora_layer(self, lora_layer: torch.nn.Module):
self._lora_layers.append(lora_layer)
def forward(self, input: torch.Tensor) -> torch.Tensor:
x = self.orig_module(input)
for lora_layer in self._lora_layers:
x += lora_layer(input)
return x
def to(self, device: torch.device | None = None, dtype: torch.dtype | None = None):
self._orig_module.to(device=device, dtype=dtype)
for lora_layer in self._lora_layers:
lora_layer.to(device=device, dtype=dtype)

View File

@@ -1,3 +1,5 @@
import itertools
import torch
from invokeai.backend.model_manager.load.model_cache.torch_function_autocast_context import (
@@ -35,10 +37,9 @@ class CachedModelWithPartialLoad:
# Monkey-patch the model to add autocasting to the model's forward method.
add_autocast_to_module_forward(model, compute_device)
# TODO(ryand): Manage a read-only CPU copy of the model state dict.
# TODO(ryand): Add memoization for total_bytes and cur_vram_bytes?
self._total_bytes = sum(calc_tensor_size(p) for p in self._model.parameters())
self._total_bytes = sum(
calc_tensor_size(p) for p in itertools.chain(self._model.parameters(), self._model.buffers())
)
self._cur_vram_bytes: int | None = None
@property
@@ -58,7 +59,9 @@ class CachedModelWithPartialLoad:
"""Get the size (in bytes) of the weights that are currently in VRAM."""
if self._cur_vram_bytes is None:
self._cur_vram_bytes = sum(
calc_tensor_size(p) for p in self._model.parameters() if p.device.type == self._compute_device.type
calc_tensor_size(p)
for p in itertools.chain(self._model.parameters(), self._model.buffers())
if p.device.type == self._compute_device.type
)
return self._cur_vram_bytes
@@ -79,8 +82,7 @@ class CachedModelWithPartialLoad:
"""
vram_bytes_loaded = 0
# TODO(ryand): Iterate over buffers too?
for key, param in self._model.named_parameters():
for key, param in itertools.chain(self._model.named_parameters(), self._model.named_buffers()):
# Skip parameters that are already on the compute device.
if param.device.type == self._compute_device.type:
continue
@@ -96,13 +98,18 @@ class CachedModelWithPartialLoad:
# We use the 'overwrite' strategy from torch.nn.Module._apply().
# TODO(ryand): For some edge cases (e.g. quantized models?), we may need to support other strategies (e.g.
# swap).
assert isinstance(param, torch.nn.Parameter)
assert param.is_leaf
out_param = torch.nn.Parameter(param.to(self._compute_device, copy=True), requires_grad=param.requires_grad)
set_nested_attr(self._model, key, out_param)
# We did not port the param.grad handling from torch.nn.Module._apply(), because we do not expect to be
# handling gradients. We assert that this assumption is true.
assert param.grad is None
if isinstance(param, torch.nn.Parameter):
assert param.is_leaf
out_param = torch.nn.Parameter(
param.to(self._compute_device, copy=True), requires_grad=param.requires_grad
)
set_nested_attr(self._model, key, out_param)
# We did not port the param.grad handling from torch.nn.Module._apply(), because we do not expect to be
# handling gradients. We assert that this assumption is true.
assert param.grad is None
else:
# Handle buffers.
set_nested_attr(self._model, key, param.to(self._compute_device, copy=True))
vram_bytes_loaded += param_size
@@ -120,17 +127,21 @@ class CachedModelWithPartialLoad:
"""
vram_bytes_freed = 0
# TODO(ryand): Iterate over buffers too?
for key, param in self._model.named_parameters():
for key, param in itertools.chain(self._model.named_parameters(), self._model.named_buffers()):
if vram_bytes_freed >= vram_bytes_to_free:
break
if param.device.type != self._compute_device.type:
continue
# Create a new parameter, but inject the existing CPU tensor into it.
out_param = torch.nn.Parameter(self._cpu_state_dict[key], requires_grad=param.requires_grad)
set_nested_attr(self._model, key, out_param)
if isinstance(param, torch.nn.Parameter):
# Create a new parameter, but inject the existing CPU tensor into it.
out_param = torch.nn.Parameter(self._cpu_state_dict[key], requires_grad=param.requires_grad)
set_nested_attr(self._model, key, out_param)
else:
# Handle buffers.
set_nested_attr(self._model, key, self._cpu_state_dict[key])
vram_bytes_freed += calc_tensor_size(param)
if self._cur_vram_bytes is not None:
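The buffer-handling fix shows up in three places in this file: size accounting, partial load, and partial unload now all iterate over itertools.chain(parameters, buffers), and buffers, which are plain tensors rather than torch.nn.Parameter, are re-assigned directly instead of being rewrapped as Parameters. A small self-contained sketch of the accounting difference (tensor_bytes is a stand-in for the repo's calc_tensor_size):

import itertools
import torch

def tensor_bytes(t: torch.Tensor) -> int:
    # Stand-in for calc_tensor_size: element count times bytes per element.
    return t.numel() * t.element_size()

m = torch.nn.Linear(10, 10)
m.register_buffer("buffer1", torch.ones(10, 10))

params_only = sum(tensor_bytes(p) for p in m.parameters())  # (10*10 + 10) float32 elements
with_buffers = sum(tensor_bytes(t) for t in itertools.chain(m.parameters(), m.buffers()))
assert with_buffers - params_only == 10 * 10 * 4  # the buffer was previously left out of total_bytes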

View File

@@ -233,11 +233,15 @@ class ModelCache:
"""Helper function for self.lock(). Loads a locked model into VRAM."""
vram_available = self._get_vram_available()
# The amount of additional VRAM that will be used if we fully load the model into VRAM.
# Calculate model_vram_needed, the amount of additional VRAM that will be used if we fully load the model into
# VRAM.
model_cur_vram_bytes = cache_entry.cached_model.cur_vram_bytes()
model_total_bytes = cache_entry.cached_model.total_bytes()
model_vram_needed = model_total_bytes - model_cur_vram_bytes
# The amount of VRAM that must be freed to make room for model_vram_needed.
vram_bytes_to_free = max(0, model_vram_needed - vram_available)
self._logger.debug(
f"Before unloading: {self._get_vram_state_str(model_cur_vram_bytes, model_total_bytes, vram_available)}"
)
@@ -246,7 +250,7 @@ class ModelCache:
# 1. If the model can fit entirely in VRAM, then make enough room for it to be loaded fully.
# 2. If the model can't fit fully into VRAM, then unload all other models and load as much of the model as
# possible.
vram_bytes_freed = self._offload_unlocked_models(model_vram_needed)
vram_bytes_freed = self._offload_unlocked_models(vram_bytes_to_free)
self._logger.debug(f"Unloaded models (if necessary): vram_bytes_freed={(vram_bytes_freed/MB):.2f}MB")
# Check the updated vram_available after offloading.
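The change above is what fixes the "offload more models than necessary" bug from the commit list: _offload_unlocked_models is now asked to free only the shortfall, not the model's full remaining VRAM requirement. With illustrative numbers (assumed, not taken from the source):

GB = 2**30
model_vram_needed = 2 * GB          # the locked model still needs 2 GB to be fully loaded
vram_available = int(1.5 * GB)      # 1.5 GB of VRAM is already free

# Old call: _offload_unlocked_models(model_vram_needed) could evict up to 2 GB of other models.
# New call: only the 0.5 GB shortfall is requested.
vram_bytes_to_free = max(0, model_vram_needed - vram_available)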

View File

@@ -1,49 +0,0 @@
import copy
import torch
from invokeai.backend.lora.layers.concatenated_lora_layer import ConcatenatedLoRALayer
from invokeai.backend.lora.layers.lora_layer import LoRALayer
from invokeai.backend.lora.sidecar_layers.concatenated_lora.concatenated_lora_linear_sidecar_layer import (
ConcatenatedLoRALinearSidecarLayer,
)
from invokeai.backend.lora.sidecar_layers.lora_sidecar_module import LoRASidecarModule
def test_concatenated_lora_linear_sidecar_layer():
"""Test that a ConcatenatedLoRALinearSidecarLayer is equivalent to patching a linear layer with the ConcatenatedLoRA
layer.
"""
# Create a linear layer.
in_features = 5
sub_layer_out_features = [5, 10, 15]
linear = torch.nn.Linear(in_features, sum(sub_layer_out_features))
# Create a ConcatenatedLoRA layer.
rank = 4
sub_layers: list[LoRALayer] = []
for out_features in sub_layer_out_features:
down = torch.randn(rank, in_features)
up = torch.randn(out_features, rank)
bias = torch.randn(out_features)
sub_layers.append(LoRALayer(up=up, mid=None, down=down, alpha=1.0, bias=bias))
concatenated_lora_layer = ConcatenatedLoRALayer(sub_layers, concat_axis=0)
# Patch the ConcatenatedLoRA layer into the linear layer.
linear_patched = copy.deepcopy(linear)
linear_patched.weight.data += (
concatenated_lora_layer.get_weight(linear_patched.weight) * concatenated_lora_layer.scale()
)
linear_patched.bias.data += concatenated_lora_layer.get_bias(linear_patched.bias) * concatenated_lora_layer.scale()
# Create a ConcatenatedLoRALinearSidecarLayer.
concatenated_lora_linear_sidecar_layer = ConcatenatedLoRALinearSidecarLayer(concatenated_lora_layer, weight=1.0)
linear_with_sidecar = LoRASidecarModule(linear, [concatenated_lora_linear_sidecar_layer])
# Run the ConcatenatedLoRA-patched linear layer and the ConcatenatedLoRALinearSidecarLayer and assert they are
# equal.
input = torch.randn(1, in_features)
output_patched = linear_patched(input)
output_sidecar = linear_with_sidecar(input)
assert torch.allclose(output_patched, output_sidecar, atol=1e-6)

View File

@@ -1,38 +0,0 @@
import copy
import torch
from invokeai.backend.lora.layers.lora_layer import LoRALayer
from invokeai.backend.lora.sidecar_layers.lora.lora_linear_sidecar_layer import LoRALinearSidecarLayer
from invokeai.backend.lora.sidecar_layers.lora_sidecar_module import LoRASidecarModule
@torch.no_grad()
def test_lora_linear_sidecar_layer():
"""Test that a LoRALinearSidecarLayer is equivalent to patching a linear layer with the LoRA layer."""
# Create a linear layer.
in_features = 10
out_features = 20
linear = torch.nn.Linear(in_features, out_features)
# Create a LoRA layer.
rank = 4
down = torch.randn(rank, in_features)
up = torch.randn(out_features, rank)
bias = torch.randn(out_features)
lora_layer = LoRALayer(up=up, mid=None, down=down, alpha=1.0, bias=bias)
# Patch the LoRA layer into the linear layer.
linear_patched = copy.deepcopy(linear)
linear_patched.weight.data += lora_layer.get_weight(linear_patched.weight) * lora_layer.scale()
linear_patched.bias.data += lora_layer.get_bias(linear_patched.bias) * lora_layer.scale()
# Create a LoRALinearSidecarLayer.
lora_linear_sidecar_layer = LoRALinearSidecarLayer(lora_layer, weight=1.0)
linear_with_sidecar = LoRASidecarModule(linear, [lora_linear_sidecar_layer])
# Run the LoRA-patched linear layer and the LoRALinearSidecarLayer and assert they are equal.
input = torch.randn(1, in_features)
output_patched = linear_patched(input)
output_sidecar = linear_with_sidecar(input)
assert torch.allclose(output_patched, output_sidecar, atol=1e-6)

View File

@@ -0,0 +1,69 @@
import copy
import torch
from invokeai.backend.lora.layers.concatenated_lora_layer import ConcatenatedLoRALayer
from invokeai.backend.lora.layers.lora_layer import LoRALayer
from invokeai.backend.lora.lora_layer_wrappers import LoRALinearWrapper
@torch.no_grad()
def test_lora_linear_wrapper():
# Create a linear layer.
in_features = 10
out_features = 20
linear = torch.nn.Linear(in_features, out_features)
# Create a LoRA layer.
rank = 4
down = torch.randn(rank, in_features)
up = torch.randn(out_features, rank)
bias = torch.randn(out_features)
lora_layer = LoRALayer(up=up, mid=None, down=down, alpha=1.0, bias=bias)
# Patch the LoRA layer into the linear layer.
linear_patched = copy.deepcopy(linear)
linear_patched.weight.data += lora_layer.get_weight(linear_patched.weight) * lora_layer.scale()
linear_patched.bias.data += lora_layer.get_bias(linear_patched.bias) * lora_layer.scale()
# Create a LoRALinearWrapper.
lora_wrapped = LoRALinearWrapper(linear, [lora_layer], [1.0])
# Run the LoRA-patched linear layer and the LoRALinearWrapper and assert they are equal.
input = torch.randn(1, in_features)
output_patched = linear_patched(input)
output_wrapped = lora_wrapped(input)
assert torch.allclose(output_patched, output_wrapped, atol=1e-6)
def test_concatenated_lora_linear_wrapper():
# Create a linear layer.
in_features = 5
sub_layer_out_features = [5, 10, 15]
linear = torch.nn.Linear(in_features, sum(sub_layer_out_features))
# Create a ConcatenatedLoRA layer.
rank = 4
sub_layers: list[LoRALayer] = []
for out_features in sub_layer_out_features:
down = torch.randn(rank, in_features)
up = torch.randn(out_features, rank)
bias = torch.randn(out_features)
sub_layers.append(LoRALayer(up=up, mid=None, down=down, alpha=1.0, bias=bias))
concatenated_lora_layer = ConcatenatedLoRALayer(sub_layers, concat_axis=0)
# Patch the ConcatenatedLoRA layer into the linear layer.
linear_patched = copy.deepcopy(linear)
linear_patched.weight.data += (
concatenated_lora_layer.get_weight(linear_patched.weight) * concatenated_lora_layer.scale()
)
linear_patched.bias.data += concatenated_lora_layer.get_bias(linear_patched.bias) * concatenated_lora_layer.scale()
# Create a LoRALinearWrapper.
lora_wrapped = LoRALinearWrapper(linear, [concatenated_lora_layer], [1.0])
# Run the ConcatenatedLoRA-patched linear layer and the LoRALinearWrapper and assert they are equal.
input = torch.randn(1, in_features)
output_patched = linear_patched(input)
output_wrapped = lora_wrapped(input)
assert torch.allclose(output_patched, output_wrapped, atol=1e-6)

View File

@@ -2,11 +2,15 @@ import pytest
import torch
from invokeai.backend.lora.layers.lora_layer import LoRALayer
from invokeai.backend.lora.lora_layer_wrappers import LoRASidecarWrapper
from invokeai.backend.lora.lora_model_raw import LoRAModelRaw
from invokeai.backend.lora.lora_patcher import LoRAPatcher
from invokeai.backend.model_manager.load.model_cache.cached_model.cached_model_with_partial_load import (
CachedModelWithPartialLoad,
)
class DummyModule(torch.nn.Module):
class DummyModuleWithOneLayer(torch.nn.Module):
def __init__(self, in_features: int, out_features: int, device: str, dtype: torch.dtype):
super().__init__()
self.linear_layer_1 = torch.nn.Linear(in_features, out_features, device=device, dtype=dtype)
@@ -15,8 +19,18 @@ class DummyModule(torch.nn.Module):
return self.linear_layer_1(x)
class DummyModuleWithTwoLayers(torch.nn.Module):
def __init__(self, in_features: int, out_features: int, device: str, dtype: torch.dtype):
super().__init__()
self.linear_layer_1 = torch.nn.Linear(in_features, out_features, device=device, dtype=dtype)
self.linear_layer_2 = torch.nn.Linear(out_features, out_features, device=device, dtype=dtype)
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.linear_layer_2(self.linear_layer_1(x))
@pytest.mark.parametrize(
["device", "num_layers"],
["device", "num_loras"],
[
("cpu", 1),
pytest.param("cuda", 1, marks=pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA device")),
@@ -25,7 +39,7 @@ class DummyModule(torch.nn.Module):
],
)
@torch.no_grad()
def test_apply_lora_patches(device: str, num_layers: int):
def test_apply_lora_patches(device: str, num_loras: int):
"""Test the basic behavior of ModelPatcher.apply_lora_patches(...). Check that patching and unpatching produce the
correct result, and that model/LoRA tensors are moved between devices as expected.
"""
@@ -33,12 +47,12 @@ def test_apply_lora_patches(device: str, num_layers: int):
linear_in_features = 4
linear_out_features = 8
lora_rank = 2
model = DummyModule(linear_in_features, linear_out_features, device=device, dtype=torch.float16)
model = DummyModuleWithOneLayer(linear_in_features, linear_out_features, device=device, dtype=torch.float16)
# Initialize num_layers LoRA models with weights of 0.5.
# Initialize num_loras LoRA models with weights of 0.5.
lora_weight = 0.5
lora_models: list[tuple[LoRAModelRaw, float]] = []
for _ in range(num_layers):
for _ in range(num_loras):
lora_layers = {
"linear_layer_1": LoRALayer.from_state_dict_values(
values={
@@ -51,7 +65,7 @@ def test_apply_lora_patches(device: str, num_layers: int):
lora_models.append((lora, lora_weight))
orig_linear_weight = model.linear_layer_1.weight.data.detach().clone()
expected_patched_linear_weight = orig_linear_weight + (lora_rank * lora_weight * num_layers)
expected_patched_linear_weight = orig_linear_weight + (lora_rank * lora_weight * num_loras)
with LoRAPatcher.apply_lora_patches(model=model, patches=lora_models, prefix=""):
# After patching, all LoRA layer weights should have been moved back to the cpu.
@@ -79,7 +93,7 @@ def test_apply_lora_patches_change_device():
linear_out_features = 8
lora_dim = 2
# Initialize the model on the CPU.
model = DummyModule(linear_in_features, linear_out_features, device="cpu", dtype=torch.float16)
model = DummyModuleWithOneLayer(linear_in_features, linear_out_features, device="cpu", dtype=torch.float16)
lora_layers = {
"linear_layer_1": LoRALayer.from_state_dict_values(
@@ -110,7 +124,7 @@ def test_apply_lora_patches_change_device():
@pytest.mark.parametrize(
["device", "num_layers"],
["device", "num_loras"],
[
("cpu", 1),
pytest.param("cuda", 1, marks=pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA device")),
@@ -118,18 +132,18 @@ def test_apply_lora_patches_change_device():
pytest.param("cuda", 2, marks=pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA device")),
],
)
def test_apply_lora_sidecar_patches(device: str, num_layers: int):
"""Test the basic behavior of ModelPatcher.apply_lora_sidecar_patches(...). Check that unpatching works correctly."""
def test_apply_lora_wrapper_patches(device: str, num_loras: int):
"""Test the basic behavior of ModelPatcher.apply_lora_wrapper_patches(...). Check that unpatching works correctly."""
dtype = torch.float16
linear_in_features = 4
linear_out_features = 8
lora_rank = 2
model = DummyModule(linear_in_features, linear_out_features, device=device, dtype=dtype)
model = DummyModuleWithOneLayer(linear_in_features, linear_out_features, device=device, dtype=dtype)
# Initialize num_layers LoRA models with weights of 0.5.
# Initialize num_loras LoRA models with weights of 0.5.
lora_weight = 0.5
lora_models: list[tuple[LoRAModelRaw, float]] = []
for _ in range(num_layers):
for _ in range(num_loras):
lora_layers = {
"linear_layer_1": LoRALayer.from_state_dict_values(
values={
@@ -146,7 +160,7 @@ def test_apply_lora_sidecar_patches(device: str, num_layers: int):
output_before_patch = model(input)
# Patch the model and run inference during the patch.
with LoRAPatcher.apply_lora_sidecar_patches(model=model, patches=lora_models, prefix="", dtype=dtype):
with LoRAPatcher.apply_lora_wrapper_patches(model=model, patches=lora_models, prefix="", dtype=dtype):
output_during_patch = model(input)
# Run inference after unpatching.
@@ -159,20 +173,140 @@ def test_apply_lora_sidecar_patches(device: str, num_layers: int):
assert torch.allclose(output_before_patch, output_after_patch)
@pytest.mark.parametrize(
["device", "num_loras"],
[
("cpu", 1),
pytest.param("cuda", 1, marks=pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA device")),
("cpu", 2),
pytest.param("cuda", 2, marks=pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA device")),
],
)
@torch.no_grad()
@pytest.mark.parametrize(["num_layers"], [(1,), (2,)])
def test_apply_lora_sidecar_patches_matches_apply_lora_patches(num_layers: int):
"""Test that apply_lora_sidecar_patches(...) produces the same model outputs as apply_lora_patches(...)."""
def test_apply_smart_lora_patches(device: str, num_loras: int):
"""Test the basic behavior of ModelPatcher.apply_smart_lora_patches(...). Check that unpatching works correctly."""
dtype = torch.float16
linear_in_features = 4
linear_out_features = 8
lora_rank = 2
model = DummyModuleWithOneLayer(linear_in_features, linear_out_features, device=device, dtype=dtype)
# Initialize num_loras LoRA models with weights of 0.5.
lora_weight = 0.5
lora_models: list[tuple[LoRAModelRaw, float]] = []
for _ in range(num_loras):
lora_layers = {
"linear_layer_1": LoRALayer.from_state_dict_values(
values={
"lora_down.weight": torch.ones((lora_rank, linear_in_features), device="cpu", dtype=torch.float16),
"lora_up.weight": torch.ones((linear_out_features, lora_rank), device="cpu", dtype=torch.float16),
},
)
}
lora = LoRAModelRaw(lora_layers)
lora_models.append((lora, lora_weight))
# Run inference before patching the model.
input = torch.randn(1, linear_in_features, device=device, dtype=dtype)
output_before_patch = model(input)
# Patch the model and run inference during the patch.
with LoRAPatcher.apply_smart_lora_patches(model=model, patches=lora_models, prefix="", dtype=dtype):
output_during_patch = model(input)
# Run inference after unpatching.
output_after_patch = model(input)
# Check that the output before patching is different from the output during patching.
assert not torch.allclose(output_before_patch, output_during_patch)
# Check that the output before patching is the same as the output after patching.
assert torch.allclose(output_before_patch, output_after_patch)
@pytest.mark.parametrize(["num_loras"], [(1,), (2,)])
@torch.no_grad()
def test_apply_smart_lora_patches_to_partially_loaded_model(num_loras: int):
"""Test the behavior of ModelPatcher.apply_smart_lora_patches(...) when it is applied to a
CachedModelWithPartialLoad that is partially loaded into VRAM.
"""
if not torch.cuda.is_available():
pytest.skip("requires CUDA device")
# Initialize the model on the CPU.
dtype = torch.float16
linear_in_features = 4
linear_out_features = 8
lora_rank = 2
model = DummyModuleWithTwoLayers(linear_in_features, linear_out_features, device="cpu", dtype=dtype)
cached_model = CachedModelWithPartialLoad(model=model, compute_device=torch.device("cuda"))
model_total_bytes = cached_model.total_bytes()
assert cached_model.cur_vram_bytes() == 0
# Partially load the model into VRAM.
target_vram_bytes = int(model_total_bytes * 0.6)
_ = cached_model.partial_load_to_vram(target_vram_bytes)
assert cached_model.model.linear_layer_1.weight.device.type == "cuda"
assert cached_model.model.linear_layer_2.weight.device.type == "cpu"
# Initialize num_loras LoRA models with weights of 0.5.
lora_weight = 0.5
lora_models: list[tuple[LoRAModelRaw, float]] = []
for _ in range(num_loras):
lora_layers = {
"linear_layer_1": LoRALayer.from_state_dict_values(
values={
"lora_down.weight": torch.ones((lora_rank, linear_in_features), device="cpu", dtype=torch.float16),
"lora_up.weight": torch.ones((linear_out_features, lora_rank), device="cpu", dtype=torch.float16),
},
),
"linear_layer_2": LoRALayer.from_state_dict_values(
values={
"lora_down.weight": torch.ones((lora_rank, linear_out_features), device="cpu", dtype=torch.float16),
"lora_up.weight": torch.ones((linear_out_features, lora_rank), device="cpu", dtype=torch.float16),
},
),
}
lora = LoRAModelRaw(lora_layers)
lora_models.append((lora, lora_weight))
# Run inference before patching the model.
input = torch.randn(1, linear_in_features, device="cuda", dtype=dtype)
output_before_patch = cached_model.model(input)
# Patch the model and run inference during the patch.
with LoRAPatcher.apply_smart_lora_patches(model=cached_model.model, patches=lora_models, prefix="", dtype=dtype):
# Check that the second layer is wrapped in a LoRASidecarWrapper, but the first layer is not.
assert not isinstance(cached_model.model.linear_layer_1, LoRASidecarWrapper)
assert isinstance(cached_model.model.linear_layer_2, LoRASidecarWrapper)
output_during_patch = cached_model.model(input)
# Run inference after unpatching.
output_after_patch = cached_model.model(input)
# Check that the output before patching is different from the output during patching.
assert not torch.allclose(output_before_patch, output_during_patch)
# Check that the output before patching is the same as the output after patching.
assert torch.allclose(output_before_patch, output_after_patch)
@torch.no_grad()
@pytest.mark.parametrize(["num_loras"], [(1,), (2,)])
def test_all_patching_methods_produce_same_output(num_loras: int):
"""Test that apply_lora_wrapper_patches(...) produces the same model outputs as apply_lora_patches(...)."""
dtype = torch.float32
linear_in_features = 4
linear_out_features = 8
lora_rank = 2
model = DummyModule(linear_in_features, linear_out_features, device="cpu", dtype=dtype)
model = DummyModuleWithOneLayer(linear_in_features, linear_out_features, device="cpu", dtype=dtype)
# Initialize num_layers LoRA models with weights of 0.5.
# Initialize num_loras LoRA models with weights of 0.5.
lora_weight = 0.5
lora_models: list[tuple[LoRAModelRaw, float]] = []
for _ in range(num_layers):
for _ in range(num_loras):
lora_layers = {
"linear_layer_1": LoRALayer.from_state_dict_values(
values={
@@ -189,9 +323,13 @@ def test_apply_lora_sidecar_patches_matches_apply_lora_patches(num_layers: int):
with LoRAPatcher.apply_lora_patches(model=model, patches=lora_models, prefix=""):
output_lora_patches = model(input)
with LoRAPatcher.apply_lora_sidecar_patches(model=model, patches=lora_models, prefix="", dtype=dtype):
output_lora_sidecar_patches = model(input)
with LoRAPatcher.apply_lora_wrapper_patches(model=model, patches=lora_models, prefix="", dtype=dtype):
output_lora_wrapper_patches = model(input)
with LoRAPatcher.apply_smart_lora_patches(model=model, patches=lora_models, prefix="", dtype=dtype):
output_smart_lora_patches = model(input)
# Note: We set atol=1e-5 because the test failed occasionally with the default atol=1e-8. Slight numerical
# differences are tolerable and expected due to the difference between sidecar and direct patching.
assert torch.allclose(output_lora_patches, output_lora_sidecar_patches, atol=1e-5)
assert torch.allclose(output_lora_patches, output_lora_wrapper_patches, atol=1e-5)
assert torch.allclose(output_lora_patches, output_smart_lora_patches, atol=1e-5)

View File

@@ -1,3 +1,5 @@
import itertools
import pytest
import torch
@@ -20,15 +22,11 @@ parameterize_mps_and_cuda = pytest.mark.parametrize(
@parameterize_mps_and_cuda
def test_cached_model_total_bytes(device: str):
if device == "cuda" and not torch.cuda.is_available():
pytest.skip("CUDA is not available.")
if device == "mps" and not torch.backends.mps.is_available():
pytest.skip("MPS is not available.")
model = DummyModule()
cached_model = CachedModelWithPartialLoad(model=model, compute_device=torch.device(device))
linear_numel = 10 * 10 + 10
assert cached_model.total_bytes() == linear_numel * 4 * 2
buffer_numel = 10 * 10
assert cached_model.total_bytes() == (2 * linear_numel + buffer_numel) * 4
@parameterize_mps_and_cuda
@@ -43,6 +41,7 @@ def test_cached_model_cur_vram_bytes(device: str):
assert cached_model.cur_vram_bytes() > 0
assert cached_model.cur_vram_bytes() == cached_model.total_bytes()
assert all(p.device.type == device for p in model.parameters())
assert all(p.device.type == device for p in model.buffers())
@parameterize_mps_and_cuda
@@ -59,7 +58,9 @@ def test_cached_model_partial_load(device: str):
assert loaded_bytes > 0
assert loaded_bytes < model_total_bytes
assert loaded_bytes == cached_model.cur_vram_bytes()
assert loaded_bytes == sum(calc_tensor_size(p) for p in model.parameters() if p.device.type == device)
assert loaded_bytes == sum(
calc_tensor_size(p) for p in itertools.chain(model.parameters(), model.buffers()) if p.device.type == device
)
@parameterize_mps_and_cuda
@@ -80,11 +81,13 @@ def test_cached_model_partial_unload(device: str):
assert freed_bytes >= bytes_to_free
assert freed_bytes < model_total_bytes
assert freed_bytes == model_total_bytes - cached_model.cur_vram_bytes()
assert freed_bytes == sum(calc_tensor_size(p) for p in model.parameters() if p.device.type == "cpu")
assert freed_bytes == sum(
calc_tensor_size(p) for p in itertools.chain(model.parameters(), model.buffers()) if p.device.type == "cpu"
)
@parameterize_mps_and_cuda
def test_cached_model_full_load(device: str):
def test_cached_model_full_load_and_unload(device: str):
model = DummyModule()
cached_model = CachedModelWithPartialLoad(model=model, compute_device=torch.device(device))
@@ -97,7 +100,14 @@ def test_cached_model_full_load(device: str):
assert loaded_bytes > 0
assert loaded_bytes == model_total_bytes
assert loaded_bytes == cached_model.cur_vram_bytes()
assert all(p.device.type == device for p in model.parameters())
assert all(p.device.type == device for p in itertools.chain(model.parameters(), model.buffers()))
# Full unload the model from VRAM.
unloaded_bytes = cached_model.full_unload_from_vram()
assert unloaded_bytes > 0
assert unloaded_bytes == model_total_bytes
assert cached_model.cur_vram_bytes() == 0
assert all(p.device.type == "cpu" for p in itertools.chain(model.parameters(), model.buffers()))
@parameterize_mps_and_cuda
@@ -122,7 +132,7 @@ def test_cached_model_full_load_from_partial(device: str):
assert loaded_bytes_2 < model_total_bytes
assert loaded_bytes + loaded_bytes_2 == cached_model.cur_vram_bytes()
assert loaded_bytes + loaded_bytes_2 == model_total_bytes
assert all(p.device.type == device for p in model.parameters())
assert all(p.device.type == device for p in itertools.chain(model.parameters(), model.buffers()))
@parameterize_mps_and_cuda
@@ -146,7 +156,7 @@ def test_cached_model_full_unload_from_partial(device: str):
assert unloaded_bytes > 0
assert unloaded_bytes == loaded_bytes
assert cached_model.cur_vram_bytes() == 0
assert all(p.device.type == "cpu" for p in model.parameters())
assert all(p.device.type == "cpu" for p in itertools.chain(model.parameters(), model.buffers()))
@parameterize_mps_and_cuda

View File

@@ -6,6 +6,7 @@ class DummyModule(torch.nn.Module):
super().__init__()
self.linear1 = torch.nn.Linear(10, 10)
self.linear2 = torch.nn.Linear(10, 10)
self.register_buffer("buffer1", torch.ones(10, 10))
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.linear1(x)
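For reference, the buffer added to DummyModule above is what the updated size assertion in test_cached_model_total_bytes accounts for: each Linear(10, 10) contributes 10*10 + 10 = 110 float32 elements, buffer1 contributes another 100, so total_bytes() == (2 * 110 + 100) * 4 = 1280 bytes.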