Mirror of https://github.com/invoke-ai/InvokeAI.git (synced 2026-04-23 03:00:31 -04:00)
Change the definition of "VRAM in use" for the ModelCache from the sum of cached model weights to the total reported by torch.cuda.memory_allocated().
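For orientation, here is a minimal standalone sketch (not part of the diff below) contrasting the two definitions. torch.cuda.memory_allocated() and torch.mps.current_allocated_memory() are real PyTorch calls; the cached_models argument and its cur_vram_bytes() method simply stand in for the cache's own per-model bookkeeping.

import torch

def vram_in_use_from_model_sizes(cached_models) -> int:
    # Old definition: sum the cache's own estimate of each cached model's bytes in VRAM.
    # Anything allocated outside that bookkeeping (activations, scratch buffers) is invisible here.
    return sum(model.cur_vram_bytes() for model in cached_models)

def vram_in_use_from_allocator(device: torch.device) -> int:
    # New definition: ask the torch allocator how many bytes this process currently has
    # allocated on the device, so every torch tensor counts, not just tracked model weights.
    if device.type == "cuda":
        return torch.cuda.memory_allocated(device)
    elif device.type == "mps":
        return torch.mps.current_allocated_memory()
    raise ValueError(f"Unsupported execution device type: {device.type}")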
@@ -247,7 +247,6 @@ class ModelCache:
     def _load_locked_model(self, cache_entry: CacheRecord) -> None:
         """Helper function for self.lock(). Loads a locked model into VRAM."""
         start_time = time.time()
-        vram_available = self._get_vram_available()
 
         # Calculate model_vram_needed, the amount of additional VRAM that will be used if we fully load the model into
         # VRAM.
@@ -255,9 +254,7 @@ class ModelCache:
         model_total_bytes = cache_entry.cached_model.total_bytes()
         model_vram_needed = model_total_bytes - model_cur_vram_bytes
 
-        # The amount of VRAM that must be freed to make room for model_vram_needed.
-        vram_bytes_to_free = max(0, model_vram_needed - vram_available)
-
+        vram_available = self._get_vram_available()
         self._logger.debug(
             f"Before unloading: {self._get_vram_state_str(model_cur_vram_bytes, model_total_bytes, vram_available)}"
         )
@@ -266,7 +263,7 @@ class ModelCache:
         # 1. If the model can fit entirely in VRAM, then make enough room for it to be loaded fully.
         # 2. If the model can't fit fully into VRAM, then unload all other models and load as much of the model as
         # possible.
-        vram_bytes_freed = self._offload_unlocked_models(vram_bytes_to_free)
+        vram_bytes_freed = self._offload_unlocked_models(model_vram_needed)
         self._logger.debug(f"Unloaded models (if necessary): vram_bytes_freed={(vram_bytes_freed/MB):.2f}MB")
 
         # Check the updated vram_available after offloading.
@@ -278,7 +275,9 @@ class ModelCache:
         # Move as much of the model as possible into VRAM.
-        # For testing, only allow 10% of the model to be loaded into VRAM.
-        # vram_available = int(model_vram_needed * 0.1)
-        model_bytes_loaded = self._move_model_to_vram(cache_entry, vram_available)
+        # We add 1 MB to the available VRAM to account for small errors in memory tracking (e.g. off-by-one). A fully
+        # loaded model is much faster than a 95% loaded model.
+        model_bytes_loaded = self._move_model_to_vram(cache_entry, vram_available + MB)
 
         model_cur_vram_bytes = cache_entry.cached_model.cur_vram_bytes()
         vram_available = self._get_vram_available()
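As an aside on the hunk above, the extra megabyte is just headroom. The sketch below restates the idea outside the class; the MB value is an assumption here, standing in for the module's own constant.

MB = 2**20  # assumption: one mebibyte, matching the module's MB constant

def vram_budget_for_load(vram_available: int, slack: int = MB) -> int:
    # Let the loader use slightly more than the measured available VRAM so that small
    # bookkeeping errors do not leave a model stuck at, say, 95% loaded.
    return vram_available + slack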
@@ -330,7 +329,14 @@ class ModelCache:
 
     def _get_vram_in_use(self) -> int:
         """Get the amount of VRAM currently in use by the cache."""
-        return sum(ce.cached_model.cur_vram_bytes() for ce in self._cached_models.values())
+        if self._execution_device.type == "cuda":
+            return torch.cuda.memory_allocated()
+        elif self._execution_device.type == "mps":
+            return torch.mps.current_allocated_memory()
+        else:
+            raise ValueError(f"Unsupported execution device type: {self._execution_device.type}")
+        # Alternative definition of VRAM in use:
+        # return sum(ce.cached_model.cur_vram_bytes() for ce in self._cached_models.values())
 
     def _get_ram_available(self) -> int:
         """Get the amount of RAM available for the cache to use, while keeping memory pressure under control."""
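A quick, hypothetical demonstration of why the allocator-based number can differ from the sum of tracked model sizes: any tensor created on the device shows up in torch.cuda.memory_allocated(), even though no cache entry accounts for it.

import torch

if torch.cuda.is_available():
    before = torch.cuda.memory_allocated()
    scratch = torch.empty(1024, 1024, device="cuda")  # ~4 MiB of float32 that no cache entry tracks
    after = torch.cuda.memory_allocated()
    print(f"Allocator-visible growth: {(after - before) / 2**20:.1f} MiB")
    del scratch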
@@ -357,24 +363,28 @@ class ModelCache:
             + f"vram_available={(vram_available/MB):.0f} MB, "
         )
 
-    def _offload_unlocked_models(self, vram_bytes_to_free: int) -> int:
-        """Offload models from the execution_device until vram_bytes_to_free bytes are freed, or all models are
+    def _offload_unlocked_models(self, vram_bytes_required: int) -> int:
+        """Offload models from the execution_device until vram_bytes_required bytes are available, or all models are
         offloaded. Of course, locked models are not offloaded.
 
         Returns:
-            int: The number of bytes freed.
+            int: The number of bytes freed based on believed model sizes. The actual change in VRAM may be different.
         """
-        self._logger.debug(f"Offloading unlocked models with goal of freeing {vram_bytes_to_free/MB:.2f}MB of VRAM.")
+        self._logger.debug(
+            f"Offloading unlocked models with goal of making room for {vram_bytes_required/MB:.2f}MB of VRAM."
+        )
         vram_bytes_freed = 0
         # TODO(ryand): Give more thought to the offloading policy used here.
         cache_entries_increasing_size = sorted(self._cached_models.values(), key=lambda x: x.cached_model.total_bytes())
         for cache_entry in cache_entries_increasing_size:
-            if vram_bytes_freed >= vram_bytes_to_free:
+            # We do not fully trust the count of bytes freed, so we check again on each iteration.
+            vram_available = self._get_vram_available()
+            vram_bytes_to_free = vram_bytes_required - vram_available
+            if vram_bytes_to_free <= 0:
                 break
             if cache_entry.is_locked:
                 continue
 
-            cache_entry_bytes_freed = self._move_model_to_ram(cache_entry, vram_bytes_to_free - vram_bytes_freed)
+            cache_entry_bytes_freed = self._move_model_to_ram(cache_entry, vram_bytes_to_free)
             if cache_entry_bytes_freed > 0:
                 self._logger.debug(
                     f"Unloaded {cache_entry.key} from VRAM to free {(cache_entry_bytes_freed/MB):.0f} MB."
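Stripped of the cache plumbing, the revised control flow of _offload_unlocked_models looks roughly like the sketch below. get_vram_available, entries, and move_to_ram are placeholders for the class's own helpers; the point is that the remaining deficit is re-measured from the device on every iteration instead of trusting an accumulated bytes-freed counter.

def offload_until_room(vram_bytes_required, entries, get_vram_available, move_to_ram) -> int:
    # Placeholder sketch of the loop above, not the real method.
    vram_bytes_freed = 0
    for entry in sorted(entries, key=lambda e: e.total_bytes()):
        # Re-measure every pass: per-model size estimates are not fully trusted.
        vram_bytes_to_free = vram_bytes_required - get_vram_available()
        if vram_bytes_to_free <= 0:
            break
        if entry.is_locked:
            continue
        vram_bytes_freed += move_to_ram(entry, vram_bytes_to_free)
    return vram_bytes_freed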