cleaned model unlock code; added debugging statements

Update README.md
prep 0.0.1 release
2026-04-23 03:00:31 -04:00 · 2024-06-23 22:55:24 -04:00 · 2024-06-23 22:31:42 -04:00 · 2024-06-23 22:29:23 -04:00 · 2024-06-23 13:16:29 -04:00 · 2024-06-23 12:43:58 -04:00
46 changed files with 768 additions and 635 deletions
--- a/README.md
+++ b/README.md
@@ -1,3 +1,12 @@
+# InvokeAI-MGPU - Experimental version of Invoke supporting rendering on multiple GPUs
+
+This is a fork of the wonderful [InvokeAI Text-to-Image
+Generator](https://github.com/invoke-ai/InvokeAI) that has been
+modified to allow simultaneous rendering of images on systems with
+multiple CUDA GPUs. This support is very experimental and should not be
+used in production. To get the installer, click the **[Releases](https://github.com/lstein/InvokeAI-Multi-GPU/releases)** link in the
+right sidebar.
+
 <div align="center">

 ![project hero](https://github.com/invoke-ai/InvokeAI/assets/31807370/6e3728c7-e90e-4711-905c-3b55844ff5be)
--- a/docs/contributing/MODEL_MANAGER.md
+++ b/docs/contributing/MODEL_MANAGER.md
@@ -1328,7 +1328,7 @@ from invokeai.app.services.model_load import ModelLoadService, ModelLoaderRegist

 config = InvokeAIAppConfig.get_config()
 ram_cache = ModelCache(
- max_cache_size=config.ram_cache_size, max_vram_cache_size=config.vram_cache_size, logger=logger
+ max_cache_size=config.ram_cache_size, logger=logger
 )
 convert_cache = ModelConvertCache(
 cache_path=config.models_convert_cache_path, max_size=config.convert_cache_size
--- a/installer/create_installer.sh
+++ b/installer/create_installer.sh
@@ -96,6 +96,8 @@ for f in templates *.txt *.reg; do
 done
 mkdir InvokeAI-Installer/lib
 cp lib/*.py InvokeAI-Installer/lib
+mkdir InvokeAI-Installer/dist
+cp dist/*.whl InvokeAI-Installer/dist

 # Install scripts
 # Mac/Linux
--- a/installer/lib/installer.py
+++ b/installer/lib/installer.py
@@ -137,7 +137,6 @@ class Installer:
        import messages

        if wheel:
-            messages.installing_from_wheel(wheel.name)
            version = get_version_from_wheel_filename(wheel.name)
        else:
            messages.welcome(self.available_releases)
--- a/installer/lib/main.py
+++ b/installer/lib/main.py
@@ -8,7 +8,15 @@ from pathlib import Path

 from installer import Installer

+def find_wheel() -> Path:
+    """Return the first ``*.whl`` found in ./dist (relative to the CWD); raise FileNotFoundError if there is none."""
+    for wheel in Path('./dist').glob('*.whl'):
+        return wheel  # first match wins, same as taking next() on the glob
+    raise FileNotFoundError("No wheel (*.whl) file found in ./dist")
+
 if __name__ == "__main__":
+    wheel = find_wheel()
+
    parser = argparse.ArgumentParser()

    parser.add_argument(
@@ -42,7 +50,7 @@ if __name__ == "__main__":
        dest="wheel",
        help="Specifies a wheel for the InvokeAI package. Used for troubleshooting or testing prereleases.",
        type=Path,
-        default=None,
+        default=wheel
    )

    args = parser.parse_args()
--- a/invokeai/app/api/routers/model_manager.py
+++ b/invokeai/app/api/routers/model_manager.py
@@ -6,6 +6,8 @@ import pathlib
 import shutil
 import traceback
 from copy import deepcopy
+from enum import Enum
+from tempfile import TemporaryDirectory
 from typing import Any, Dict, List, Optional, Type

 from fastapi import Body, Path, Query, Response, UploadFile
@@ -16,10 +18,10 @@ from pydantic import AnyHttpUrl, BaseModel, ConfigDict, Field
 from starlette.exceptions import HTTPException
 from typing_extensions import Annotated

+from invokeai.app.services.config import get_config
 from invokeai.app.services.model_images.model_images_common import ModelImageFileNotFoundException
 from invokeai.app.services.model_install.model_install_common import ModelInstallJob
 from invokeai.app.services.model_records import (
-    DuplicateModelException,
    InvalidModelException,
    ModelRecordChanges,
    UnknownModelException,
@@ -30,8 +32,8 @@ from invokeai.backend.model_manager.config import (
    MainCheckpointConfig,
    ModelFormat,
    ModelType,
-    SubModelType,
 )
+from invokeai.backend.model_manager.load.model_cache.model_cache_base import CacheStats
 from invokeai.backend.model_manager.metadata.fetch.huggingface import HuggingFaceMetadataFetch
 from invokeai.backend.model_manager.metadata.metadata_base import ModelMetadataWithFiles, UnknownMetadataException
 from invokeai.backend.model_manager.search import ModelSearch
@@ -53,6 +55,13 @@ class ModelsList(BaseModel):
    model_config = ConfigDict(use_enum_values=True)


+class CacheType(str, Enum):
+    """Cache type - one of vram or ram."""
+
+    RAM = "RAM"
+    VRAM = "VRAM"
+
+
 def add_cover_image_to_model_config(config: AnyModelConfig, dependencies: Type[ApiDependencies]) -> AnyModelConfig:
    """Add a cover image URL to a model configuration."""
    cover_image = dependencies.invoker.services.model_images.get_url(config.key)
@@ -174,18 +183,6 @@ async def get_model_record(
        raise HTTPException(status_code=404, detail=str(e))


-# @model_manager_router.get("/summary", operation_id="list_model_summary")
-# async def list_model_summary(
-#     page: int = Query(default=0, description="The page to get"),
-#     per_page: int = Query(default=10, description="The number of models per page"),
-#     order_by: ModelRecordOrderBy = Query(default=ModelRecordOrderBy.Default, description="The attribute to order by"),
-# ) -> PaginatedResults[ModelSummary]:
-#     """Gets a page of model summary data."""
-#     record_store = ApiDependencies.invoker.services.model_manager.store
-#     results: PaginatedResults[ModelSummary] = record_store.list_models(page=page, per_page=per_page, order_by=order_by)
-#     return results
-
-
 class FoundModel(BaseModel):
    path: str = Field(description="Path to the model")
    is_installed: bool = Field(description="Whether or not the model is already installed")
@@ -746,39 +743,36 @@ async def convert_model(
        logger.error(f"The model with key {key} is not a main checkpoint model.")
        raise HTTPException(400, f"The model with key {key} is not a main checkpoint model.")

-    # loading the model will convert it into a cached diffusers file
-    try:
-        cc_size = loader.convert_cache.max_size
-        if cc_size == 0:  # temporary set the convert cache to a positive number so that cached model is written
-            loader._convert_cache.max_size = 1.0
-        loader.load_model(model_config, submodel_type=SubModelType.Scheduler)
-    finally:
-        loader._convert_cache.max_size = cc_size
+    with TemporaryDirectory(dir=ApiDependencies.invoker.services.configuration.models_path) as tmpdir:
+        convert_path = pathlib.Path(tmpdir) / pathlib.Path(model_config.path).stem
+        converted_model = loader.load_model(model_config)
+        # write the converted file to the convert path
+        raw_model = converted_model.model
+        assert hasattr(raw_model, "save_pretrained")
+        raw_model.save_pretrained(convert_path)
+        assert convert_path.exists()

-    # Get the path of the converted model from the loader
-    cache_path = loader.convert_cache.cache_path(key)
-    assert cache_path.exists()
+        # temporarily rename the original safetensors file so that there is no naming conflict
+        original_name = model_config.name
+        model_config.name = f"{original_name}.DELETE"
+        changes = ModelRecordChanges(name=model_config.name)
+        store.update_model(key, changes=changes)

-    # temporarily rename the original safetensors file so that there is no naming conflict
-    original_name = model_config.name
-    model_config.name = f"{original_name}.DELETE"
-    changes = ModelRecordChanges(name=model_config.name)
-    store.update_model(key, changes=changes)
-
-    # install the diffusers
-    try:
-        new_key = installer.install_path(
-            cache_path,
-            config={
-                "name": original_name,
-                "description": model_config.description,
-                "hash": model_config.hash,
-                "source": model_config.source,
-            },
-        )
-    except DuplicateModelException as e:
-        logger.error(str(e))
-        raise HTTPException(status_code=409, detail=str(e))
+        # install the diffusers
+        try:
+            new_key = installer.install_path(
+                convert_path,
+                config={
+                    "name": original_name,
+                    "description": model_config.description,
+                    "hash": model_config.hash,
+                    "source": model_config.source,
+                },
+            )
+        except Exception as e:
+            logger.error(str(e))
+            store.update_model(key, changes=ModelRecordChanges(name=original_name))
+            raise HTTPException(status_code=409, detail=str(e))

    # Update the model image if the model had one
    try:
@@ -791,8 +785,8 @@ async def convert_model(
    # delete the original safetensors file
    installer.delete(key)

-    # delete the cached version
-    shutil.rmtree(cache_path)
+    # delete the temporary directory
+    # shutil.rmtree(cache_path)

    # return the config record for the new diffusers directory
    new_config = store.get_model(new_key)
@@ -816,3 +810,79 @@ async def get_starter_models() -> list[StarterModel]:
        model.dependencies = missing_deps

    return starter_models
+
+
+@model_manager_router.get(
+    "/model_cache",
+    operation_id="get_cache_size",
+    response_model=float,
+    summary="Get maximum size of model manager RAM or VRAM cache.",
+)
+async def get_cache_size(cache_type: CacheType = Query(description="The cache type", default=CacheType.RAM)) -> float:
+    """Return the current RAM or VRAM cache size setting (in GB)."""
+    cache = ApiDependencies.invoker.services.model_manager.load.ram_cache
+    if cache_type == CacheType.RAM:
+        return cache.max_cache_size
+    elif cache_type == CacheType.VRAM:
+        return cache.max_vram_cache_size
+    else:
+        raise ValueError(f"Unexpected {cache_type=}.")
+
+
+@model_manager_router.put(
+    "/model_cache",
+    operation_id="set_cache_size",
+    response_model=float,
+    summary="Set maximum size of model manager RAM or VRAM cache, optionally writing new value out to invokeai.yaml config file.",
+)
+async def set_cache_size(
+    value: float = Query(description="The new value for the maximum cache size"),
+    cache_type: CacheType = Query(description="The cache type", default=CacheType.RAM),
+    persist: bool = Query(description="Write new value out to invokeai.yaml", default=False),
+) -> float:
+    """Set the RAM or VRAM cache size (in GB), optionally persisting it to the config file; return the new value."""
+    cache = ApiDependencies.invoker.services.model_manager.load.ram_cache
+    app_config = get_config()
+    vram_bak, ram_bak = (app_config.vram, app_config.ram)  # saved so a failed persist can be rolled back
+
+    if cache_type == CacheType.RAM:
+        cache.max_cache_size = value
+        app_config.ram = value
+    elif cache_type == CacheType.VRAM:
+        cache.max_vram_cache_size = value
+        app_config.vram = value
+    else:
+        raise ValueError(f"Unexpected {cache_type=}.")
+
+    if persist:
+        config_path = app_config.config_file_path
+        new_config_path = config_path.with_suffix(".yaml.new")
+        backup_config_path = config_path.with_suffix(".yaml.bak")
+        shutil.copy(config_path, backup_config_path)
+        try:
+            app_config.write_file(new_config_path)
+            shutil.move(new_config_path, config_path)
+        except Exception as e:
+            shutil.move(backup_config_path, config_path)
+            app_config.vram = vram_bak  # restore the fields actually modified above (`vram`/`ram`)
+            app_config.ram = ram_bak
+            raise RuntimeError(f"Failed to save configuration to {config_path}: {e}") from e
+
+    if cache_type == CacheType.VRAM:
+        return cache.max_vram_cache_size
+    elif cache_type == CacheType.RAM:
+        return cache.max_cache_size
+    else:
+        raise ValueError(f"Unexpected {cache_type=}.")
+
+
+@model_manager_router.get(
+    "/stats",
+    operation_id="get_stats",
+    response_model=Optional[CacheStats],
+    summary="Get model manager RAM cache performance statistics.",
+)
+async def get_stats() -> Optional[CacheStats]:
+    """Return performance statistics on the model manager's RAM cache. Will return null if no models have been loaded."""
+
+    return ApiDependencies.invoker.services.model_manager.load.ram_cache.stats
--- a/invokeai/app/invocations/compel.py
+++ b/invokeai/app/invocations/compel.py
@@ -103,6 +103,7 @@ class CompelInvocation(BaseInvocation):
                textual_inversion_manager=ti_manager,
                dtype_for_device_getter=TorchDevice.choose_torch_dtype,
                truncate_long_prompts=False,
+                device=TorchDevice.choose_torch_device(),
            )

            conjunction = Compel.parse_prompt_string(self.prompt)
@@ -117,6 +118,7 @@ class CompelInvocation(BaseInvocation):
        conditioning_data = ConditioningFieldData(conditionings=[BasicConditioningInfo(embeds=c)])

        conditioning_name = context.conditioning.save(conditioning_data)
+
        return ConditioningOutput(
            conditioning=ConditioningField(
                conditioning_name=conditioning_name,
--- a/invokeai/app/services/config/config_default.py
+++ b/invokeai/app/services/config/config_default.py
@@ -3,6 +3,7 @@

 from __future__ import annotations

+import copy
 import locale
 import os
 import re
@@ -25,14 +26,13 @@ DB_FILE = Path("invokeai.db")
 LEGACY_INIT_FILE = Path("invokeai.init")
 DEFAULT_RAM_CACHE = 10.0
 DEFAULT_VRAM_CACHE = 0.25
-DEFAULT_CONVERT_CACHE = 20.0
-DEVICE = Literal["auto", "cpu", "cuda", "cuda:1", "mps"]
+DEVICE = Literal["auto", "cpu", "cuda:0", "cuda:1", "cuda:2", "cuda:3", "cuda:4", "cuda:5", "cuda:6", "cuda:7", "mps"]
 PRECISION = Literal["auto", "float16", "bfloat16", "float32"]
 ATTENTION_TYPE = Literal["auto", "normal", "xformers", "sliced", "torch-sdp"]
 ATTENTION_SLICE_SIZE = Literal["auto", "balanced", "max", 1, 2, 3, 4, 5, 6, 7, 8]
 LOG_FORMAT = Literal["plain", "color", "syslog", "legacy"]
 LOG_LEVEL = Literal["debug", "info", "warning", "error", "critical"]
-CONFIG_SCHEMA_VERSION = "4.0.1"
+CONFIG_SCHEMA_VERSION = "4.0.2"


 def get_default_ram_cache_size() -> float:
@@ -85,7 +85,7 @@ class InvokeAIAppConfig(BaseSettings):
        log_tokenization: Enable logging of parsed prompt tokens.
        patchmatch: Enable patchmatch inpaint code.
        models_dir: Path to the models directory.
-        convert_cache_dir: Path to the converted models cache directory. When loading a non-diffusers model, it will be converted and store on disk at this location.
+        convert_cache_dir: Path to the converted models cache directory (DEPRECATED, but do not delete because it is needed for migration from previous versions).
        download_cache_dir: Path to the directory that contains dynamically downloaded models.
        legacy_conf_dir: Path to directory of legacy checkpoint config files.
        db_dir: Path to InvokeAI databases directory.
@@ -102,10 +102,10 @@ class InvokeAIAppConfig(BaseSettings):
        profiles_dir: Path to profiles output directory.
        ram: Maximum memory amount used by memory model cache for rapid switching (GB).
        vram: Amount of VRAM reserved for model storage (GB).
-        convert_cache: Maximum size of on-disk converted models cache (GB).
        lazy_offload: Keep models in VRAM until their space is needed.
        log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.
-        device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.<br>Valid values: `auto`, `cpu`, `cuda`, `cuda:1`, `mps`
+        device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.<br>Valid values: `auto`, `cpu`, `cuda:0`, `cuda:1`, `cuda:2`, `cuda:3`, `cuda:4`, `cuda:5`, `cuda:6`, `cuda:7`, `mps`
+        devices: List of execution devices; will override default device selected.
        precision: Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.<br>Valid values: `auto`, `float16`, `bfloat16`, `float32`
        sequential_guidance: Whether to calculate guidance in serial instead of in parallel, lowering memory requirements.
        attention_type: Attention type.<br>Valid values: `auto`, `normal`, `xformers`, `sliced`, `torch-sdp`
@@ -113,6 +113,7 @@ class InvokeAIAppConfig(BaseSettings):
        force_tiled_decode: Whether to enable tiled VAE decode (reduces memory consumption with some performance penalty).
        pil_compress_level: The compress_level setting of PIL.Image.save(), used for PNG encoding. All settings are lossless. 0 = no compression, 1 = fastest with slightly larger filesize, 9 = slowest with smallest filesize. 1 is typically the best setting.
        max_queue_size: Maximum number of items in the session queue.
+        max_threads: Maximum number of session queue execution threads. Autocalculated from number of GPUs if not set.
        clear_queue_on_startup: Empties session queue on startup.
        allow_nodes: List of nodes to allow. Omit to allow all.
        deny_nodes: List of nodes to deny. Omit to deny none.
@@ -148,7 +149,7 @@ class InvokeAIAppConfig(BaseSettings):

    # PATHS
    models_dir:                    Path = Field(default=Path("models"),     description="Path to the models directory.")
-    convert_cache_dir:             Path = Field(default=Path("models/.convert_cache"), description="Path to the converted models cache directory. When loading a non-diffusers model, it will be converted and store on disk at this location.")
+    convert_cache_dir:             Path = Field(default=Path("models/.convert_cache"), description="Path to the converted models cache directory (DEPRECATED, but do not delete because it is needed for migration from previous versions).")
    download_cache_dir:            Path = Field(default=Path("models/.download_cache"), description="Path to the directory that contains dynamically downloaded models.")
    legacy_conf_dir:               Path = Field(default=Path("configs"), description="Path to directory of legacy checkpoint config files.")
    db_dir:                        Path = Field(default=Path("databases"),  description="Path to InvokeAI databases directory.")
@@ -170,14 +171,14 @@ class InvokeAIAppConfig(BaseSettings):
    profiles_dir:                  Path = Field(default=Path("profiles"),   description="Path to profiles output directory.")

    # CACHE
-    ram:                          float = Field(default_factory=get_default_ram_cache_size, gt=0, description="Maximum memory amount used by memory model cache for rapid switching (GB).")
-    vram:                         float = Field(default=DEFAULT_VRAM_CACHE, ge=0, description="Amount of VRAM reserved for model storage (GB).")
-    convert_cache:                float = Field(default=DEFAULT_CONVERT_CACHE, ge=0, description="Maximum size of on-disk converted models cache (GB).")
+    ram:                           float = Field(default_factory=get_default_ram_cache_size, gt=0, description="Maximum memory amount used by memory model cache for rapid switching (GB).")
+    vram:                          float = Field(default=DEFAULT_VRAM_CACHE, ge=0, description="Amount of VRAM reserved for model storage (GB).")
    lazy_offload:                  bool = Field(default=True,               description="Keep models in VRAM until their space is needed.")
    log_memory_usage:              bool = Field(default=False,              description="If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.")

    # DEVICE
    device:                      DEVICE = Field(default="auto",             description="Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.")
+    devices:      Optional[list[DEVICE]] = Field(default=None,              description="List of execution devices; will override default device selected.")
    precision:                PRECISION = Field(default="auto",             description="Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.")

    # GENERATION
@@ -187,6 +188,7 @@ class InvokeAIAppConfig(BaseSettings):
    force_tiled_decode:            bool = Field(default=False,              description="Whether to enable tiled VAE decode (reduces memory consumption with some performance penalty).")
    pil_compress_level:             int = Field(default=1,                  description="The compress_level setting of PIL.Image.save(), used for PNG encoding. All settings are lossless. 0 = no compression, 1 = fastest with slightly larger filesize, 9 = slowest with smallest filesize. 1 is typically the best setting.")
    max_queue_size:                 int = Field(default=10000, gt=0,        description="Maximum number of items in the session queue.")
+    max_threads:          Optional[int] = Field(default=None,               description="Maximum number of session queue execution threads. Autocalculated from number of GPUs if not set.")
    clear_queue_on_startup:        bool = Field(default=False,              description="Empties session queue on startup.")

    # NODES
@@ -357,14 +359,14 @@ class DefaultInvokeAIAppConfig(InvokeAIAppConfig):
        return (init_settings,)


-def migrate_v3_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig:
-    """Migrate a v3 config dictionary to a current config object.
+def migrate_v3_config_dict(config_dict: dict[str, Any]) -> dict[str, Any]:
+    """Migrate a v3 config dictionary to a v4.0.0.

    Args:
        config_dict: A dictionary of settings from a v3 config file.

    Returns:
-        An instance of `InvokeAIAppConfig` with the migrated settings.
+        An `InvokeAIAppConfig` config dict.

    """
    parsed_config_dict: dict[str, Any] = {}
@@ -376,9 +378,6 @@ def migrate_v3_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig:
            # `max_cache_size` was renamed to `ram` some time in v3, but both names were used
            if k == "max_cache_size" and "ram" not in category_dict:
                parsed_config_dict["ram"] = v
-            # `max_vram_cache_size` was renamed to `vram` some time in v3, but both names were used
-            if k == "max_vram_cache_size" and "vram" not in category_dict:
-                parsed_config_dict["vram"] = v
            # autocast was removed in v4.0.1
            if k == "precision" and v == "autocast":
                parsed_config_dict["precision"] = "auto"
@@ -398,34 +397,44 @@ def migrate_v3_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig:
            elif k in InvokeAIAppConfig.model_fields:
                # skip unknown fields
                parsed_config_dict[k] = v
-    # When migrating the config file, we should not include currently-set environment variables.
-    config = DefaultInvokeAIAppConfig.model_validate(parsed_config_dict)
-
-    return config
+    parsed_config_dict["schema_version"] = "4.0.0"
+    return parsed_config_dict


-def migrate_v4_0_0_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig:
-    """Migrate v4.0.0 config dictionary to a current config object.
+def migrate_v4_0_0_to_4_0_1_config_dict(config_dict: dict[str, Any]) -> dict[str, Any]:
+    """Migrate v4.0.0 config dictionary to a v4.0.1 config dictionary.

    Args:
        config_dict: A dictionary of settings from a v4.0.0 config file.

    Returns:
-        An instance of `InvokeAIAppConfig` with the migrated settings.
+        A config dict with the settings migrated to v4.0.1.
    """
-    parsed_config_dict: dict[str, Any] = {}
-    for k, v in config_dict.items():
-        # autocast was removed from precision in v4.0.1
-        if k == "precision" and v == "autocast":
-            parsed_config_dict["precision"] = "auto"
-        else:
-            parsed_config_dict[k] = v
-        if k == "schema_version":
-            parsed_config_dict[k] = CONFIG_SCHEMA_VERSION
-    config = DefaultInvokeAIAppConfig.model_validate(parsed_config_dict)
-    return config
+    parsed_config_dict: dict[str, Any] = copy.deepcopy(config_dict)
+    # precision "autocast" was replaced by "auto" in v4.0.1
+    if parsed_config_dict.get("precision") == "autocast":
+        parsed_config_dict["precision"] = "auto"
+    parsed_config_dict["schema_version"] = "4.0.1"
+    return parsed_config_dict


+def migrate_v4_0_1_to_4_0_2_config_dict(config_dict: dict[str, Any]) -> dict[str, Any]:
+    """Migrate v4.0.1 config dictionary to a v4.0.2 config dictionary.
+
+    Args:
+        config_dict: A dictionary of settings from a v4.0.1 config file.
+
+    Returns:
+        A config dict with the settings migrated to v4.0.2.
+    """
+    parsed_config_dict: dict[str, Any] = copy.deepcopy(config_dict)
+    # convert_cache was removed in 4.0.2
+    parsed_config_dict.pop("convert_cache", None)
+    parsed_config_dict["schema_version"] = "4.0.2"
+    return parsed_config_dict
+
+
+# TO DO: replace this with a formal registration and migration system
 def load_and_migrate_config(config_path: Path) -> InvokeAIAppConfig:
    """Load and migrate a config file to the latest version.

@@ -437,25 +446,30 @@ def load_and_migrate_config(config_path: Path) -> InvokeAIAppConfig:
    """
    assert config_path.suffix == ".yaml"
    with open(config_path, "rt", encoding=locale.getpreferredencoding()) as file:
-        loaded_config_dict = yaml.safe_load(file)
+        loaded_config_dict: dict[str, Any] = yaml.safe_load(file)

    assert isinstance(loaded_config_dict, dict)

+    migrated = False
    if "InvokeAI" in loaded_config_dict:
-        # This is a v3 config file, attempt to migrate it
+        migrated = True
+        loaded_config_dict = migrate_v3_config_dict(loaded_config_dict)  # pyright: ignore [reportUnknownArgumentType]
+    if loaded_config_dict["schema_version"] == "4.0.0":
+        migrated = True
+        loaded_config_dict = migrate_v4_0_0_to_4_0_1_config_dict(loaded_config_dict)
+    if loaded_config_dict["schema_version"] == "4.0.1":
+        migrated = True
+        loaded_config_dict = migrate_v4_0_1_to_4_0_2_config_dict(loaded_config_dict)
+
+    if migrated:
        shutil.copy(config_path, config_path.with_suffix(".yaml.bak"))
        try:
-            # loaded_config_dict could be the wrong shape, but we will catch all exceptions below
-            migrated_config = migrate_v3_config_dict(loaded_config_dict)  # pyright: ignore [reportUnknownArgumentType]
+            # load and write without environment variables
+            migrated_config = DefaultInvokeAIAppConfig.model_validate(loaded_config_dict)
+            migrated_config.write_file(config_path)
        except Exception as e:
            shutil.copy(config_path.with_suffix(".yaml.bak"), config_path)
            raise RuntimeError(f"Failed to load and migrate v3 config file {config_path}: {e}") from e
-        migrated_config.write_file(config_path)
-        return migrated_config
-
-    if loaded_config_dict["schema_version"] == "4.0.0":
-        loaded_config_dict = migrate_v4_0_0_config_dict(loaded_config_dict)
-        loaded_config_dict.write_file(config_path)

    # Attempt to load as a v4 config file
    try:
--- a/invokeai/app/services/invocation_services.py
+++ b/invokeai/app/services/invocation_services.py
@@ -53,11 +53,11 @@ class InvocationServices:
        model_images: "ModelImageFileStorageBase",
        model_manager: "ModelManagerServiceBase",
        download_queue: "DownloadQueueServiceBase",
-        performance_statistics: "InvocationStatsServiceBase",
        session_queue: "SessionQueueBase",
        session_processor: "SessionProcessorBase",
        invocation_cache: "InvocationCacheBase",
        names: "NameServiceBase",
+        performance_statistics: "InvocationStatsServiceBase",
        urls: "UrlServiceBase",
        workflow_records: "WorkflowRecordsStorageBase",
        tensors: "ObjectSerializerBase[torch.Tensor]",
@@ -77,11 +77,11 @@ class InvocationServices:
        self.model_images = model_images
        self.model_manager = model_manager
        self.download_queue = download_queue
-        self.performance_statistics = performance_statistics
        self.session_queue = session_queue
        self.session_processor = session_processor
        self.invocation_cache = invocation_cache
        self.names = names
+        self.performance_statistics = performance_statistics
        self.urls = urls
        self.workflow_records = workflow_records
        self.tensors = tensors
--- a/invokeai/app/services/invocation_stats/invocation_stats_default.py
+++ b/invokeai/app/services/invocation_stats/invocation_stats_default.py
@@ -74,9 +74,9 @@ class InvocationStatsService(InvocationStatsServiceBase):
            )
            self._stats[graph_execution_state_id].add_node_execution_stats(node_stats)

-    def reset_stats(self):
-        self._stats = {}
-        self._cache_stats = {}
+    def reset_stats(self, graph_execution_state_id: str):  # drop collected stats for one graph execution
+        self._stats.pop(graph_execution_state_id, None)  # default avoids KeyError when nothing was recorded
+        self._cache_stats.pop(graph_execution_state_id, None)

    def get_stats(self, graph_execution_state_id: str) -> InvocationStatsSummary:
        graph_stats_summary = self._get_graph_summary(graph_execution_state_id)
--- a/invokeai/app/services/model_install/model_install_default.py
+++ b/invokeai/app/services/model_install/model_install_default.py
@@ -284,9 +284,14 @@ class ModelInstallService(ModelInstallServiceBase):
        unfinished_jobs = [x for x in self._install_jobs if not x.in_terminal_state]
        self._install_jobs = unfinished_jobs

-    def _migrate_yaml(self) -> None:
+    def _migrate_yaml(self, rename_yaml: Optional[bool] = True, overwrite_db: Optional[bool] = False) -> None:
        db_models = self.record_store.all_models()

+        if overwrite_db:
+            for model in db_models:
+                self.record_store.del_model(model.key)
+            db_models = self.record_store.all_models()
+
        legacy_models_yaml_path = (
            self._app_config.legacy_models_yaml_path or self._app_config.root_path / "configs" / "models.yaml"
        )
@@ -336,7 +341,8 @@ class ModelInstallService(ModelInstallServiceBase):
                        self._logger.warning(f"Model at {model_path} could not be migrated: {e}")

            # Rename `models.yaml` to `models.yaml.bak` to prevent re-migration
-            legacy_models_yaml_path.rename(legacy_models_yaml_path.with_suffix(".yaml.bak"))
+            if rename_yaml:
+                legacy_models_yaml_path.rename(legacy_models_yaml_path.with_suffix(".yaml.bak"))

        # Unset the path - we are done with it either way
        self._app_config.legacy_models_yaml_path = None
--- a/invokeai/app/services/model_load/model_load_base.py
+++ b/invokeai/app/services/model_load/model_load_base.py
@@ -7,7 +7,6 @@ from typing import Callable, Optional

 from invokeai.backend.model_manager import AnyModel, AnyModelConfig, SubModelType
 from invokeai.backend.model_manager.load import LoadedModel, LoadedModelWithoutConfig
-from invokeai.backend.model_manager.load.convert_cache import ModelConvertCacheBase
 from invokeai.backend.model_manager.load.model_cache.model_cache_base import ModelCacheBase


@@ -30,8 +29,8 @@ class ModelLoadServiceBase(ABC):

    @property
    @abstractmethod
-    def convert_cache(self) -> ModelConvertCacheBase:
-        """Return the checkpoint convert cache used by this loader."""
+    def gpu_count(self) -> int:
+        """Return the number of GPUs we are configured to use."""

    @abstractmethod
    def load_model_from_path(
--- a/invokeai/app/services/model_load/model_load_default.py
+++ b/invokeai/app/services/model_load/model_load_default.py
@@ -17,7 +17,6 @@ from invokeai.backend.model_manager.load import (
    ModelLoaderRegistry,
    ModelLoaderRegistryBase,
 )
-from invokeai.backend.model_manager.load.convert_cache import ModelConvertCacheBase
 from invokeai.backend.model_manager.load.model_cache.model_cache_base import ModelCacheBase
 from invokeai.backend.model_manager.load.model_loaders.generic_diffusers import GenericDiffusersLoader
 from invokeai.backend.util.devices import TorchDevice
@@ -33,7 +32,6 @@ class ModelLoadService(ModelLoadServiceBase):
        self,
        app_config: InvokeAIAppConfig,
        ram_cache: ModelCacheBase[AnyModel],
-        convert_cache: ModelConvertCacheBase,
        registry: Optional[Type[ModelLoaderRegistryBase]] = ModelLoaderRegistry,
    ):
        """Initialize the model load service."""
@@ -42,10 +40,10 @@ class ModelLoadService(ModelLoadServiceBase):
        self._logger = logger
        self._app_config = app_config
        self._ram_cache = ram_cache
-        self._convert_cache = convert_cache
        self._registry = registry

    def start(self, invoker: Invoker) -> None:
+        """Start the service."""
        self._invoker = invoker

    @property
@@ -54,9 +52,9 @@ class ModelLoadService(ModelLoadServiceBase):
        return self._ram_cache

    @property
-    def convert_cache(self) -> ModelConvertCacheBase:
-        """Return the checkpoint convert cache used by this loader."""
-        return self._convert_cache
+    def gpu_count(self) -> int:
+        """Return the number of execution devices registered with the RAM cache."""
+        return len(self._ram_cache.execution_devices)

    def load_model(self, model_config: AnyModelConfig, submodel_type: Optional[SubModelType] = None) -> LoadedModel:
        """
@@ -76,7 +74,6 @@ class ModelLoadService(ModelLoadServiceBase):
            app_config=self._app_config,
            logger=self._logger,
            ram_cache=self._ram_cache,
-            convert_cache=self._convert_cache,
        ).load_model(model_config, submodel_type)

        if hasattr(self, "_invoker"):
--- a/invokeai/app/services/model_manager/model_manager_base.py
+++ b/invokeai/app/services/model_manager/model_manager_base.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2023 Lincoln D. Stein and the InvokeAI Team

 from abc import ABC, abstractmethod
+from typing import Optional, Set

 import torch
 from typing_extensions import Self
@@ -31,7 +32,7 @@ class ModelManagerServiceBase(ABC):
        model_record_service: ModelRecordServiceBase,
        download_queue: DownloadQueueServiceBase,
        events: EventServiceBase,
-        execution_device: torch.device,
+        execution_devices: Optional[Set[torch.device]] = None,
    ) -> Self:
        """
        Construct the model manager service instance.
--- a/invokeai/app/services/model_manager/model_manager_default.py
+++ b/invokeai/app/services/model_manager/model_manager_default.py
@@ -1,14 +1,10 @@
 # Copyright (c) 2023 Lincoln D. Stein and the InvokeAI Team
 """Implementation of ModelManagerServiceBase."""

-from typing import Optional
-
-import torch
 from typing_extensions import Self

 from invokeai.app.services.invoker import Invoker
-from invokeai.backend.model_manager.load import ModelCache, ModelConvertCache, ModelLoaderRegistry
-from invokeai.backend.util.devices import TorchDevice
+from invokeai.backend.model_manager.load import ModelCache, ModelLoaderRegistry
 from invokeai.backend.util.logging import InvokeAILogger

 from ..config import InvokeAIAppConfig
@@ -69,7 +65,6 @@ class ModelManagerService(ModelManagerServiceBase):
        model_record_service: ModelRecordServiceBase,
        download_queue: DownloadQueueServiceBase,
        events: EventServiceBase,
-        execution_device: Optional[torch.device] = None,
    ) -> Self:
        """
        Construct the model manager service instance.
@@ -82,15 +77,11 @@ class ModelManagerService(ModelManagerServiceBase):
        ram_cache = ModelCache(
            max_cache_size=app_config.ram,
            max_vram_cache_size=app_config.vram,
-            lazy_offloading=app_config.lazy_offload,
            logger=logger,
-            execution_device=execution_device or TorchDevice.choose_torch_device(),
        )
-        convert_cache = ModelConvertCache(cache_path=app_config.convert_cache_path, max_size=app_config.convert_cache)
        loader = ModelLoadService(
            app_config=app_config,
            ram_cache=ram_cache,
-            convert_cache=convert_cache,
            registry=ModelLoaderRegistry,
        )
        installer = ModelInstallService(
--- a/invokeai/app/services/object_serializer/object_serializer_disk.py
+++ b/invokeai/app/services/object_serializer/object_serializer_disk.py
@@ -1,5 +1,6 @@
 import shutil
 import tempfile
+import threading
 import typing
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional, TypeVar
@@ -70,7 +71,10 @@ class ObjectSerializerDisk(ObjectSerializerBase[T]):
        return self._output_dir / name

    def _new_name(self) -> str:
-        return f"{self._obj_class_name}_{uuid_string()}"
+        """Build an object name from the object class name, the current thread id and `uuid_string()`."""
+        tid = threading.current_thread().ident
+        # Add the thread id to the object name because uuid4 is not thread-safe on Windows.
+        # See https://stackoverflow.com/questions/2759644/python-multiprocessing-doesnt-play-nicely-with-uuid-uuid4
+        return f"{self._obj_class_name}_{tid}-{uuid_string()}"

    def _tempdir_cleanup(self) -> None:
        """Calls `cleanup` on the temporary directory, if it exists."""
--- a/invokeai/app/services/session_processor/session_processor_default.py
+++ b/invokeai/app/services/session_processor/session_processor_default.py
@@ -1,8 +1,9 @@
 import traceback
 from contextlib import suppress
-from threading import BoundedSemaphore, Thread
+from queue import Queue
+from threading import BoundedSemaphore, Lock, Thread
 from threading import Event as ThreadEvent
-from typing import Optional
+from typing import Optional, Set

 from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput
 from invokeai.app.services.events.events_common import (
@@ -26,6 +27,7 @@ from invokeai.app.services.session_queue.session_queue_common import SessionQueu
 from invokeai.app.services.shared.graph import NodeInputError
 from invokeai.app.services.shared.invocation_context import InvocationContextData, build_invocation_context
 from invokeai.app.util.profiler import Profiler
+from invokeai.backend.util.devices import TorchDevice

 from ..invoker import Invoker
 from .session_processor_base import InvocationServices, SessionProcessorBase, SessionRunnerBase
@@ -57,8 +59,11 @@ class DefaultSessionRunner(SessionRunnerBase):
        self._on_after_run_node_callbacks = on_after_run_node_callbacks or []
        self._on_node_error_callbacks = on_node_error_callbacks or []
        self._on_after_run_session_callbacks = on_after_run_session_callbacks or []
+        self._process_lock = Lock()

-    def start(self, services: InvocationServices, cancel_event: ThreadEvent, profiler: Optional[Profiler] = None):
+    def start(
+        self, services: InvocationServices, cancel_event: ThreadEvent, profiler: Optional[Profiler] = None
+    ) -> None:
        self._services = services
        self._cancel_event = cancel_event
        self._profiler = profiler
@@ -76,7 +81,8 @@ class DefaultSessionRunner(SessionRunnerBase):
        # Loop over invocations until the session is complete or canceled
        while True:
            try:
-                invocation = queue_item.session.next()
+                with self._process_lock:
+                    invocation = queue_item.session.next()
            # Anything other than a `NodeInputError` is handled as a processor error
            except NodeInputError as e:
                error_type = e.__class__.__name__
@@ -108,7 +114,7 @@ class DefaultSessionRunner(SessionRunnerBase):

        self._on_after_run_session(queue_item=queue_item)

-    def run_node(self, invocation: BaseInvocation, queue_item: SessionQueueItem):
+    def run_node(self, invocation: BaseInvocation, queue_item: SessionQueueItem) -> None:
        try:
            # Any unhandled exception in this scope is an invocation error & will fail the graph
            with self._services.performance_statistics.collect_stats(invocation, queue_item.session_id):
@@ -210,7 +216,7 @@ class DefaultSessionRunner(SessionRunnerBase):
            # we don't care about that - suppress the error.
            with suppress(GESStatsNotFoundError):
                self._services.performance_statistics.log_stats(queue_item.session.id)
-                self._services.performance_statistics.reset_stats()
+                self._services.performance_statistics.reset_stats(queue_item.session.id)

            for callback in self._on_after_run_session_callbacks:
                callback(queue_item=queue_item)
@@ -324,7 +330,7 @@ class DefaultSessionProcessor(SessionProcessorBase):

    def start(self, invoker: Invoker) -> None:
        self._invoker: Invoker = invoker
-        self._queue_item: Optional[SessionQueueItem] = None
+        self._active_queue_items: Set[SessionQueueItem] = set()
        self._invocation: Optional[BaseInvocation] = None

        self._resume_event = ThreadEvent()
@@ -350,7 +356,14 @@ class DefaultSessionProcessor(SessionProcessorBase):
            else None
        )

+        self._worker_thread_count = self._invoker.services.configuration.max_threads or len(
+            TorchDevice.execution_devices()
+        )
+
+        self._session_worker_queue: Queue[SessionQueueItem] = Queue()
+
        self.session_runner.start(services=invoker.services, cancel_event=self._cancel_event, profiler=self._profiler)
+        # Session processor - singlethreaded
        self._thread = Thread(
            name="session_processor",
            target=self._process,
@@ -363,6 +376,16 @@ class DefaultSessionProcessor(SessionProcessorBase):
        )
        self._thread.start()

+        # Session processor workers - multithreaded
+        self._invoker.services.logger.debug(f"Starting {self._worker_thread_count} session processing threads.")
+        for _i in range(0, self._worker_thread_count):
+            worker = Thread(
+                name="session_worker",
+                target=self._process_next_session,
+                daemon=True,
+            )
+            worker.start()
+
    def stop(self, *args, **kwargs) -> None:
        self._stop_event.set()

@@ -370,7 +393,7 @@ class DefaultSessionProcessor(SessionProcessorBase):
        self._poll_now_event.set()

    async def _on_queue_cleared(self, event: FastAPIEvent[QueueClearedEvent]) -> None:
-        if self._queue_item and self._queue_item.queue_id == event[1].queue_id:
+        if any(item.queue_id == event[1].queue_id for item in self._active_queue_items):
            self._cancel_event.set()
            self._poll_now()

@@ -378,7 +401,7 @@ class DefaultSessionProcessor(SessionProcessorBase):
        self._poll_now()

    async def _on_queue_item_status_changed(self, event: FastAPIEvent[QueueItemStatusChangedEvent]) -> None:
-        if self._queue_item and event[1].status in ["completed", "failed", "canceled"]:
+        if self._active_queue_items and event[1].status in ["completed", "failed", "canceled"]:
            # When the queue item is canceled via HTTP, the queue item status is set to `"canceled"` and this event is
            # emitted. We need to respond to this event and stop graph execution. This is done by setting the cancel
            # event, which the session runner checks between invocations. If set, the session runner loop is broken.
@@ -403,7 +426,7 @@ class DefaultSessionProcessor(SessionProcessorBase):
    def get_status(self) -> SessionProcessorStatus:
        return SessionProcessorStatus(
            is_started=self._resume_event.is_set(),
-            is_processing=self._queue_item is not None,
+            is_processing=len(self._active_queue_items) > 0,
        )

    def _process(
@@ -428,30 +451,22 @@ class DefaultSessionProcessor(SessionProcessorBase):
                    resume_event.wait()

                    # Get the next session to process
-                    self._queue_item = self._invoker.services.session_queue.dequeue()
+                    queue_item = self._invoker.services.session_queue.dequeue()

-                    if self._queue_item is None:
+                    if queue_item is None:
                        # The queue was empty, wait for next polling interval or event to try again
                        self._invoker.services.logger.debug("Waiting for next polling interval or event")
                        poll_now_event.wait(self._polling_interval)
                        continue

-                    self._invoker.services.logger.debug(f"Executing queue item {self._queue_item.item_id}")
+                    self._session_worker_queue.put(queue_item)
+                    self._invoker.services.logger.debug(f"Scheduling queue item {queue_item.item_id} to run")
                    cancel_event.clear()

                    # Run the graph
-                    self.session_runner.run(queue_item=self._queue_item)
+                    # self.session_runner.run(queue_item=self._queue_item)

-                except Exception as e:
-                    error_type = e.__class__.__name__
-                    error_message = str(e)
-                    error_traceback = traceback.format_exc()
-                    self._on_non_fatal_processor_error(
-                        queue_item=self._queue_item,
-                        error_type=error_type,
-                        error_message=error_message,
-                        error_traceback=error_traceback,
-                    )
+                except Exception:
                    # Wait for next polling interval or event to try again
                    poll_now_event.wait(self._polling_interval)
                    continue
@@ -466,9 +481,25 @@ class DefaultSessionProcessor(SessionProcessorBase):
        finally:
            stop_event.clear()
            poll_now_event.clear()
-            self._queue_item = None
            self._thread_semaphore.release()

+    def _process_next_session(self) -> None:
+        """Worker loop: run queued sessions, each on a reserved execution device.
+
+        Waits until the resume event is set and a queue item is available, skips items whose
+        status is already "canceled", and reports any exception raised while reserving the
+        device or running the session through `_on_non_fatal_processor_error`, so that a
+        failing session neither goes unnoticed nor terminates the worker thread.
+        """
+        while True:
+            self._resume_event.wait()
+            queue_item = self._session_worker_queue.get()
+            if queue_item.status == "canceled":
+                continue
+            self._active_queue_items.add(queue_item)
+            try:
+                # reserve a GPU for this session - may block
+                with self._invoker.services.model_manager.load.ram_cache.reserve_execution_device():
+                    # Run the session on the reserved GPU
+                    self.session_runner.run(queue_item=queue_item)
+            except Exception as e:
+                # Surface the failure instead of swallowing it, then keep serving the queue.
+                self._on_non_fatal_processor_error(
+                    queue_item=queue_item,
+                    error_type=e.__class__.__name__,
+                    error_message=str(e),
+                    error_traceback=traceback.format_exc(),
+                )
+            finally:
+                # discard() never raises, so cleanup cannot mask an error from the session run.
+                self._active_queue_items.discard(queue_item)
+
    def _on_non_fatal_processor_error(
        self,
        queue_item: Optional[SessionQueueItem],
--- a/invokeai/app/services/session_queue/session_queue_common.py
+++ b/invokeai/app/services/session_queue/session_queue_common.py
@@ -236,6 +236,9 @@ class SessionQueueItemWithoutGraph(BaseModel):
        }
    )

+    def __hash__(self) -> int:
+        """Use `item_id` as the hash value, so that queue items can be stored in sets."""
+        return self.item_id
+

 class SessionQueueItemDTO(SessionQueueItemWithoutGraph):
    pass
--- a/invokeai/app/services/shared/invocation_context.py
+++ b/invokeai/app/services/shared/invocation_context.py
@@ -2,6 +2,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Callable, Optional, Union

+import torch
 from PIL.Image import Image
 from pydantic.networks import AnyHttpUrl
 from torch import Tensor
@@ -26,11 +27,13 @@ from invokeai.backend.model_manager.config import (
 from invokeai.backend.model_manager.load.load_base import LoadedModel, LoadedModelWithoutConfig
 from invokeai.backend.stable_diffusion.diffusers_pipeline import PipelineIntermediateState
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningFieldData
+from invokeai.backend.util.devices import TorchDevice

 if TYPE_CHECKING:
    from invokeai.app.invocations.baseinvocation import BaseInvocation
    from invokeai.app.invocations.model import ModelIdentifierField
    from invokeai.app.services.session_queue.session_queue_common import SessionQueueItem
+    from invokeai.backend.model_manager.load.model_cache.model_cache_base import ModelCacheBase

 """
 The InvocationContext provides access to various services and data about the current invocation.
@@ -323,7 +326,6 @@ class ConditioningInterface(InvocationContextInterface):
        Returns:
            The loaded conditioning data.
        """
-
        return self._services.conditioning.load(name)


@@ -557,6 +559,28 @@ class UtilInterface(InvocationContextInterface):
            is_canceled=self.is_canceled,
        )

+    def torch_device(self) -> torch.device:
+        """
+        Pick the torch device that the current invocation should run on.
+
+        Returns:
+            The execution device reported by the model manager's RAM cache.
+        """
+        model_cache: "ModelCacheBase[AnyModel]" = self._services.model_manager.load.ram_cache
+        execution_device = model_cache.get_execution_device()
+        return execution_device
+
+    def torch_dtype(self, device: Optional[torch.device] = None) -> torch.dtype:
+        """
+        Choose the precision to use for the current invocation on a torch device.
+
+        Args:
+            device: Device to choose a dtype for; passed unchanged (including None) to
+                `TorchDevice.choose_torch_dtype`.
+
+        Returns:
+            The torch.dtype selected by `TorchDevice.choose_torch_dtype`.
+        """
+        chosen_dtype = TorchDevice.choose_torch_dtype(device)
+        return chosen_dtype
+

 class InvocationContext:
    """Provides access to various services and data for the current invocation.
--- a/invokeai/app/services/shared/sqlite/sqlite_util.py
+++ b/invokeai/app/services/shared/sqlite/sqlite_util.py
@@ -14,6 +14,7 @@ from invokeai.app.services.shared.sqlite_migrator.migrations.migration_8 import
 from invokeai.app.services.shared.sqlite_migrator.migrations.migration_9 import build_migration_9
 from invokeai.app.services.shared.sqlite_migrator.migrations.migration_10 import build_migration_10
 from invokeai.app.services.shared.sqlite_migrator.migrations.migration_11 import build_migration_11
+from invokeai.app.services.shared.sqlite_migrator.migrations.migration_12 import build_migration_12
 from invokeai.app.services.shared.sqlite_migrator.sqlite_migrator_impl import SqliteMigrator


@@ -45,6 +46,7 @@ def init_db(config: InvokeAIAppConfig, logger: Logger, image_files: ImageFileSto
    migrator.register_migration(build_migration_9())
    migrator.register_migration(build_migration_10())
    migrator.register_migration(build_migration_11(app_config=config, logger=logger))
+    migrator.register_migration(build_migration_12(app_config=config))
    migrator.run_migrations()

    return db
--- a/invokeai/app/services/shared/sqlite_migrator/migrations/migration_12.py
+++ b/invokeai/app/services/shared/sqlite_migrator/migrations/migration_12.py
@@ -0,0 +1,35 @@
+import shutil
+import sqlite3
+
+from invokeai.app.services.config import InvokeAIAppConfig
+from invokeai.app.services.shared.sqlite_migrator.sqlite_migrator_common import Migration
+
+
+class Migration12Callback:
+    """Migration callback that deletes the model convert cache directory from disk."""
+
+    def __init__(self, app_config: InvokeAIAppConfig) -> None:
+        # Keep the config; the convert cache location is read from it when the migration runs.
+        self._app_config = app_config
+
+    def __call__(self, cursor: sqlite3.Cursor) -> None:
+        """Run the migration. The cursor is accepted but not used; no SQL is executed."""
+        self._remove_model_convert_cache_dir()
+
+    def _remove_model_convert_cache_dir(self) -> None:
+        """Delete the directory at `convert_cache_path`, ignoring errors raised during removal."""
+        shutil.rmtree(self._app_config.convert_cache_path, ignore_errors=True)
+
+
+def build_migration_12(app_config: InvokeAIAppConfig) -> Migration:
+    """
+    Create the migration that takes the database from version 11 to version 12.
+
+    The migration's callback removes the model convert cache directory.
+    """
+    return Migration(
+        from_version=11,
+        to_version=12,
+        callback=Migration12Callback(app_config),
+    )
--- a/invokeai/backend/model_manager/config.py
+++ b/invokeai/backend/model_manager/config.py
@@ -24,6 +24,7 @@ import time
 from enum import Enum
 from typing import Literal, Optional, Type, TypeAlias, Union

+import diffusers
 import torch
 from diffusers.models.modeling_utils import ModelMixin
 from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag, TypeAdapter
@@ -37,7 +38,7 @@ from ..raw_model import RawModel

 # ModelMixin is the base class for all diffusers and transformers models
 # RawModel is the InvokeAI wrapper class for ip_adapters, loras, textual_inversion and onnx runtime
-AnyModel = Union[ModelMixin, RawModel, torch.nn.Module, Dict[str, torch.Tensor]]
+AnyModel = Union[ModelMixin, RawModel, torch.nn.Module, Dict[str, torch.Tensor], diffusers.DiffusionPipeline]


 class InvalidModelConfigException(Exception):
--- a/invokeai/backend/model_manager/convert_ckpt_to_diffusers.py
+++ b/invokeai/backend/model_manager/convert_ckpt_to_diffusers.py
@@ -1,83 +0,0 @@
-# Adapted for use in InvokeAI by Lincoln Stein, July 2023
-#
-"""Conversion script for the Stable Diffusion checkpoints."""
-
-from pathlib import Path
-from typing import Optional
-
-import torch
-from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL
-from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
-    convert_ldm_vae_checkpoint,
-    create_vae_diffusers_config,
-    download_controlnet_from_original_ckpt,
-    download_from_original_stable_diffusion_ckpt,
-)
-from omegaconf import DictConfig
-
-from . import AnyModel
-
-
-def convert_ldm_vae_to_diffusers(
-    checkpoint: torch.Tensor | dict[str, torch.Tensor],
-    vae_config: DictConfig,
-    image_size: int,
-    dump_path: Optional[Path] = None,
-    precision: torch.dtype = torch.float16,
-) -> AutoencoderKL:
-    """Convert a checkpoint-style VAE into a Diffusers VAE"""
-    vae_config = create_vae_diffusers_config(vae_config, image_size=image_size)
-    converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
-
-    vae = AutoencoderKL(**vae_config)
-    vae.load_state_dict(converted_vae_checkpoint)
-    vae.to(precision)
-
-    if dump_path:
-        vae.save_pretrained(dump_path, safe_serialization=True)
-
-    return vae
-
-
-def convert_ckpt_to_diffusers(
-    checkpoint_path: str | Path,
-    dump_path: Optional[str | Path] = None,
-    precision: torch.dtype = torch.float16,
-    use_safetensors: bool = True,
-    **kwargs,
-) -> AnyModel:
-    """
-    Takes all the arguments of download_from_original_stable_diffusion_ckpt(),
-    and in addition a path-like object indicating the location of the desired diffusers
-    model to be written.
-    """
-    pipe = download_from_original_stable_diffusion_ckpt(Path(checkpoint_path).as_posix(), **kwargs)
-    pipe = pipe.to(precision)
-
-    # TO DO: save correct repo variant
-    if dump_path:
-        pipe.save_pretrained(
-            dump_path,
-            safe_serialization=use_safetensors,
-        )
-    return pipe
-
-
-def convert_controlnet_to_diffusers(
-    checkpoint_path: Path,
-    dump_path: Optional[Path] = None,
-    precision: torch.dtype = torch.float16,
-    **kwargs,
-) -> AnyModel:
-    """
-    Takes all the arguments of download_controlnet_from_original_ckpt(),
-    and in addition a path-like object indicating the location of the desired diffusers
-    model to be written.
-    """
-    pipe = download_controlnet_from_original_ckpt(checkpoint_path.as_posix(), **kwargs)
-    pipe = pipe.to(precision)
-
-    # TO DO: save correct repo variant
-    if dump_path:
-        pipe.save_pretrained(dump_path, safe_serialization=True)
-    return pipe
--- a/invokeai/backend/model_manager/load/init.py
+++ b/invokeai/backend/model_manager/load/init.py
@@ -6,7 +6,6 @@ Init file for the model loader.
 from importlib import import_module
 from pathlib import Path

-from .convert_cache.convert_cache_default import ModelConvertCache
 from .load_base import LoadedModel, LoadedModelWithoutConfig, ModelLoaderBase
 from .load_default import ModelLoader
 from .model_cache.model_cache_default import ModelCache
@@ -21,7 +20,6 @@ __all__ = [
    "LoadedModel",
    "LoadedModelWithoutConfig",
    "ModelCache",
-    "ModelConvertCache",
    "ModelLoaderBase",
    "ModelLoader",
    "ModelLoaderRegistryBase",
--- a/invokeai/backend/model_manager/load/convert_cache/init.py
+++ b/invokeai/backend/model_manager/load/convert_cache/init.py
@@ -1,4 +0,0 @@
-from .convert_cache_base import ModelConvertCacheBase
-from .convert_cache_default import ModelConvertCache
-
-__all__ = ["ModelConvertCacheBase", "ModelConvertCache"]
--- a/invokeai/backend/model_manager/load/convert_cache/convert_cache_base.py
+++ b/invokeai/backend/model_manager/load/convert_cache/convert_cache_base.py
@@ -1,28 +0,0 @@
-"""
-Disk-based converted model cache.
-"""
-
-from abc import ABC, abstractmethod
-from pathlib import Path
-
-
-class ModelConvertCacheBase(ABC):
-    @property
-    @abstractmethod
-    def max_size(self) -> float:
-        """Return the maximum size of this cache directory."""
-        pass
-
-    @abstractmethod
-    def make_room(self, size: float) -> None:
-        """
-        Make sufficient room in the cache directory for a model of max_size.
-
-        :param size: Size required (GB)
-        """
-        pass
-
-    @abstractmethod
-    def cache_path(self, key: str) -> Path:
-        """Return the path for a model with the indicated key."""
-        pass
--- a/invokeai/backend/model_manager/load/convert_cache/convert_cache_default.py
+++ b/invokeai/backend/model_manager/load/convert_cache/convert_cache_default.py
@@ -1,83 +0,0 @@
-"""
-Placeholder for convert cache implementation.
-"""
-
-import shutil
-from pathlib import Path
-
-from invokeai.backend.util import GIG, directory_size
-from invokeai.backend.util.logging import InvokeAILogger
-from invokeai.backend.util.util import safe_filename
-
-from .convert_cache_base import ModelConvertCacheBase
-
-
-class ModelConvertCache(ModelConvertCacheBase):
-    def __init__(self, cache_path: Path, max_size: float = 10.0):
-        """Initialize the convert cache with the base directory and a limit on its maximum size (in GBs)."""
-        if not cache_path.exists():
-            cache_path.mkdir(parents=True)
-        self._cache_path = cache_path
-        self._max_size = max_size
-
-        # adjust cache size at startup in case it has been changed
-        if self._cache_path.exists():
-            self.make_room(0.0)
-
-    @property
-    def max_size(self) -> float:
-        """Return the maximum size of this cache directory (GB)."""
-        return self._max_size
-
-    @max_size.setter
-    def max_size(self, value: float) -> None:
-        """Set the maximum size of this cache directory (GB)."""
-        self._max_size = value
-
-    def cache_path(self, key: str) -> Path:
-        """Return the path for a model with the indicated key."""
-        key = safe_filename(self._cache_path, key)
-        return self._cache_path / key
-
-    def make_room(self, size: float) -> None:
-        """
-        Make sufficient room in the cache directory for a model of max_size.
-
-        :param size: Size required (GB)
-        """
-        size_needed = directory_size(self._cache_path) + size
-        max_size = int(self.max_size) * GIG
-        logger = InvokeAILogger.get_logger()
-
-        if size_needed <= max_size:
-            return
-
-        logger.debug(
-            f"Convert cache has gotten too large {(size_needed / GIG):4.2f} > {(max_size / GIG):4.2f}G.. Trimming."
-        )
-
-        # For this to work, we make the assumption that the directory contains
-        # a 'model_index.json', 'unet/config.json' file, or a 'config.json' file at top level.
-        # This should be true for any diffusers model.
-        def by_atime(path: Path) -> float:
-            for config in ["model_index.json", "unet/config.json", "config.json"]:
-                sentinel = path / config
-                if sentinel.exists():
-                    return sentinel.stat().st_atime
-
-            # no sentinel file found! - pick the most recent file in the directory
-            try:
-                atimes = sorted([x.stat().st_atime for x in path.iterdir() if x.is_file()], reverse=True)
-                return atimes[0]
-            except IndexError:
-                return 0.0
-
-        # sort by last access time - least accessed files will be at the end
-        lru_models = sorted(self._cache_path.iterdir(), key=by_atime, reverse=True)
-        logger.debug(f"cached models in descending atime order: {lru_models}")
-        while size_needed > max_size and len(lru_models) > 0:
-            next_victim = lru_models.pop()
-            victim_size = directory_size(next_victim)
-            logger.debug(f"Removing cached converted model {next_victim} to free {victim_size / GIG} GB")
-            shutil.rmtree(next_victim)
-            size_needed -= victim_size
--- a/invokeai/backend/model_manager/load/load_base.py
+++ b/invokeai/backend/model_manager/load/load_base.py
@@ -18,7 +18,6 @@ from invokeai.backend.model_manager.config import (
    AnyModelConfig,
    SubModelType,
 )
-from invokeai.backend.model_manager.load.convert_cache.convert_cache_base import ModelConvertCacheBase
 from invokeai.backend.model_manager.load.model_cache.model_cache_base import ModelCacheBase, ModelLockerBase


@@ -65,8 +64,7 @@ class LoadedModelWithoutConfig:

    def __enter__(self) -> AnyModel:
        """Context entry."""
-        self._locker.lock()
-        return self.model
+        return self._locker.lock()

    def __exit__(self, *args: Any, **kwargs: Any) -> None:
        """Context exit."""
@@ -112,7 +110,6 @@ class ModelLoaderBase(ABC):
        app_config: InvokeAIAppConfig,
        logger: Logger,
        ram_cache: ModelCacheBase[AnyModel],
-        convert_cache: ModelConvertCacheBase,
    ):
        """Initialize the loader."""
        pass
@@ -138,12 +135,6 @@ class ModelLoaderBase(ABC):
        """Return size in bytes of the model, calculated before loading."""
        pass

-    @property
-    @abstractmethod
-    def convert_cache(self) -> ModelConvertCacheBase:
-        """Return the convert cache associated with this loader."""
-        pass
-
    @property
    @abstractmethod
    def ram_cache(self) -> ModelCacheBase[AnyModel]:
--- a/invokeai/backend/model_manager/load/load_default.py
+++ b/invokeai/backend/model_manager/load/load_default.py
@@ -12,8 +12,7 @@ from invokeai.backend.model_manager import (
    InvalidModelConfigException,
    SubModelType,
 )
-from invokeai.backend.model_manager.config import DiffusersConfigBase, ModelType
-from invokeai.backend.model_manager.load.convert_cache import ModelConvertCacheBase
+from invokeai.backend.model_manager.config import DiffusersConfigBase
 from invokeai.backend.model_manager.load.load_base import LoadedModel, ModelLoaderBase
 from invokeai.backend.model_manager.load.model_cache.model_cache_base import ModelCacheBase, ModelLockerBase
 from invokeai.backend.model_manager.load.model_util import calc_model_size_by_fs
@@ -30,13 +29,11 @@ class ModelLoader(ModelLoaderBase):
        app_config: InvokeAIAppConfig,
        logger: Logger,
        ram_cache: ModelCacheBase[AnyModel],
-        convert_cache: ModelConvertCacheBase,
    ):
        """Initialize the loader."""
        self._app_config = app_config
        self._logger = logger
        self._ram_cache = ram_cache
-        self._convert_cache = convert_cache
        self._torch_dtype = TorchDevice.choose_torch_dtype()

    def load_model(self, model_config: AnyModelConfig, submodel_type: Optional[SubModelType] = None) -> LoadedModel:
@@ -50,23 +47,15 @@ class ModelLoader(ModelLoaderBase):
        :param submodel_type: an ModelType enum indicating the portion of
               the model to retrieve (e.g. ModelType.Vae)
        """
-        if model_config.type is ModelType.Main and not submodel_type:
-            raise InvalidModelConfigException("submodel_type is required when loading a main model")
-
        model_path = self._get_model_path(model_config)

        if not model_path.exists():
            raise InvalidModelConfigException(f"Files for model '{model_config.name}' not found at {model_path}")

        with skip_torch_weight_init():
-            locker = self._convert_and_load(model_config, model_path, submodel_type)
+            locker = self._load_and_cache(model_config, submodel_type)
        return LoadedModel(config=model_config, _locker=locker)

-    @property
-    def convert_cache(self) -> ModelConvertCacheBase:
-        """Return the convert cache associated with this loader."""
-        return self._convert_cache
-
    @property
    def ram_cache(self) -> ModelCacheBase[AnyModel]:
        """Return the ram cache associated with this loader."""
@@ -76,20 +65,15 @@ class ModelLoader(ModelLoaderBase):
        model_base = self._app_config.models_path
        return (model_base / config.path).resolve()

-    def _convert_and_load(
-        self, config: AnyModelConfig, model_path: Path, submodel_type: Optional[SubModelType] = None
-    ) -> ModelLockerBase:
+    def _load_and_cache(self, config: AnyModelConfig, submodel_type: Optional[SubModelType] = None) -> ModelLockerBase:
+        stats_name = ":".join([config.base, config.type, config.name, (submodel_type or "")])
        try:
-            return self._ram_cache.get(config.key, submodel_type)
+            return self._ram_cache.get(config.key, submodel_type, stats_name=stats_name)
        except IndexError:
            pass

-        cache_path: Path = self._convert_cache.cache_path(str(model_path))
-        if self._needs_conversion(config, model_path, cache_path):
-            loaded_model = self._do_convert(config, model_path, cache_path, submodel_type)
-        else:
-            config.path = str(cache_path) if cache_path.exists() else str(self._get_model_path(config))
-            loaded_model = self._load_model(config, submodel_type)
+        config.path = str(self._get_model_path(config))
+        loaded_model = self._load_model(config, submodel_type)

        self._ram_cache.put(
            config.key,
@@ -100,7 +84,7 @@ class ModelLoader(ModelLoaderBase):
        return self._ram_cache.get(
            key=config.key,
            submodel_type=submodel_type,
-            stats_name=":".join([config.base, config.type, config.name, (submodel_type or "")]),
+            stats_name=stats_name,
        )

    def get_size_fs(
@@ -113,28 +97,6 @@ class ModelLoader(ModelLoaderBase):
            variant=config.repo_variant if isinstance(config, DiffusersConfigBase) else None,
        )

-    def _do_convert(
-        self, config: AnyModelConfig, model_path: Path, cache_path: Path, submodel_type: Optional[SubModelType] = None
-    ) -> AnyModel:
-        self.convert_cache.make_room(calc_model_size_by_fs(model_path))
-        pipeline = self._convert_model(config, model_path, cache_path if self.convert_cache.max_size > 0 else None)
-        if submodel_type:
-            # Proactively load the various submodels into the RAM cache so that we don't have to re-convert
-            # the entire pipeline every time a new submodel is needed.
-            for subtype in SubModelType:
-                if subtype == submodel_type:
-                    continue
-                if submodel := getattr(pipeline, subtype.value, None):
-                    self._ram_cache.put(config.key, submodel_type=subtype, model=submodel)
-        return getattr(pipeline, submodel_type.value) if submodel_type else pipeline
-
-    def _needs_conversion(self, config: AnyModelConfig, model_path: Path, dest_path: Path) -> bool:
-        return False
-
-    # This needs to be implemented in subclasses that handle checkpoints
-    def _convert_model(self, config: AnyModelConfig, model_path: Path, output_path: Optional[Path] = None) -> AnyModel:
-        raise NotImplementedError
-
    # This needs to be implemented in the subclass
    def _load_model(
        self,
--- a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py
@@ -8,9 +8,10 @@ model will be cleared and (re)loaded from disk when next needed.
 """

 from abc import ABC, abstractmethod
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from logging import Logger
-from typing import Dict, Generic, Optional, TypeVar
+from typing import Dict, Generator, Generic, Optional, Set, TypeVar

 import torch

@@ -69,6 +70,7 @@ class CacheRecord(Generic[T]):
    """

    key: str
+    size: int
    model: T
    device: torch.device
    state_dict: Optional[Dict[str, torch.Tensor]]
@@ -115,30 +117,50 @@ class ModelCacheBase(ABC, Generic[T]):

    @property
    @abstractmethod
-    def execution_device(self) -> torch.device:
-        """Return the exection device (e.g. "cuda" for VRAM)."""
+    def execution_devices(self) -> Set[torch.device]:
+        """Return the set of available execution devices."""
        pass

-    @property
+    @contextmanager
    @abstractmethod
-    def lazy_offloading(self) -> bool:
-        """Return true if the cache is configured to lazily offload models in VRAM."""
+    def reserve_execution_device(self, timeout: Optional[int] = None) -> Generator[torch.device, None, None]:
+        """Reserve a GPU under the current thread id, waiting at most `timeout` seconds (None = no limit)."""
+        pass
+
+    @abstractmethod
+    def get_execution_device(self) -> torch.device:
+        """
+        Return an execution device that has been reserved for current thread.
+
+        Note that reservations are done using the current thread's TID.
+        It might be better to do this using the session ID, but that involves
+        too many detailed changes to model manager calls.
+
+        May generate a ValueError if no GPU has been reserved.
+        """
        pass

    @property
    @abstractmethod
    def max_cache_size(self) -> float:
-        """Return true if the cache is configured to lazily offload models in VRAM."""
+        """Return the maximum size the RAM cache can grow to."""
        pass

+    @max_cache_size.setter
    @abstractmethod
-    def offload_unlocked_models(self, size_required: int) -> None:
-        """Offload from VRAM any models not actively in use."""
+    def max_cache_size(self, value: float) -> None:
+        """Set the maximum size the RAM cache can grow to."""
+
+    @property
+    @abstractmethod
+    def max_vram_cache_size(self) -> float:
+        """Return the maximum size the VRAM cache can grow to."""
        pass

+    @max_vram_cache_size.setter
    @abstractmethod
-    def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> None:
-        """Move model into the indicated device."""
+    def max_vram_cache_size(self, value: float) -> None:
+        """Set the maximum size the VRAM cache can grow to."""
+        pass

    @property
--- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py
@@ -20,10 +20,13 @@ context. Use like this:

 import gc
 import math
+import sys
+import threading
 import time
-from contextlib import suppress
+from contextlib import contextmanager, suppress
 from logging import Logger
-from typing import Dict, List, Optional
+from threading import BoundedSemaphore
+from typing import Dict, Generator, List, Optional, Set

 import torch

@@ -39,9 +42,7 @@ from .model_locker import ModelLocker
 # Maximum size of the cache, in gigs
 # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously
 DEFAULT_MAX_CACHE_SIZE = 6.0
-
-# amount of GPU memory to hold in reserve for use by generations (GB)
-DEFAULT_MAX_VRAM_CACHE_SIZE = 2.75
+DEFAULT_MAX_VRAM_CACHE_SIZE = 0.25

 # actual size of a gig
 GIG = 1073741824
@@ -57,12 +58,10 @@ class ModelCache(ModelCacheBase[AnyModel]):
        self,
        max_cache_size: float = DEFAULT_MAX_CACHE_SIZE,
        max_vram_cache_size: float = DEFAULT_MAX_VRAM_CACHE_SIZE,
-        execution_device: torch.device = torch.device("cuda"),
        storage_device: torch.device = torch.device("cpu"),
+        execution_devices: Optional[Set[torch.device]] = None,
        precision: torch.dtype = torch.float16,
-        sequential_offload: bool = False,
        lazy_offloading: bool = True,
-        sha_chunksize: int = 16777216,
        log_memory_usage: bool = False,
        logger: Optional[Logger] = None,
    ):
@@ -70,23 +69,18 @@ class ModelCache(ModelCacheBase[AnyModel]):
        Initialize the model RAM cache.

        :param max_cache_size: Maximum size of the RAM cache [6.0 GB]
-        :param execution_device: Torch device to load active model into [torch.device('cuda')]
        :param storage_device: Torch device to save inactive model in [torch.device('cpu')]
        :param precision: Precision for loaded models [torch.float16]
-        :param lazy_offloading: Keep model in VRAM until another model needs to be loaded
-        :param sequential_offload: Conserve VRAM by loading and unloading each stage of the pipeline sequentially
        :param log_memory_usage: If True, a memory snapshot will be captured before and after every model cache
            operation, and the result will be logged (at debug level). There is a time cost to capturing the memory
            snapshots, so it is recommended to disable this feature unless you are actively inspecting the model cache's
            behaviour.
        """
-        # allow lazy offloading only when vram cache enabled
-        self._lazy_offloading = lazy_offloading and max_vram_cache_size > 0
        self._precision: torch.dtype = precision
        self._max_cache_size: float = max_cache_size
        self._max_vram_cache_size: float = max_vram_cache_size
-        self._execution_device: torch.device = execution_device
        self._storage_device: torch.device = storage_device
+        self._ram_lock = threading.Lock()
        self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__)
        self._log_memory_usage = log_memory_usage
        self._stats: Optional[CacheStats] = None
@@ -94,25 +88,88 @@ class ModelCache(ModelCacheBase[AnyModel]):
        self._cached_models: Dict[str, CacheRecord[AnyModel]] = {}
        self._cache_stack: List[str] = []

+        self._device_lock = threading.Lock()
+        devices = execution_devices or TorchDevice.execution_devices()  # honor an explicit device set if given
+        self._execution_devices: Dict[torch.device, int] = {x: 0 for x in devices}  # device -> thread id; 0 = free
+        self._free_execution_device = BoundedSemaphore(len(self._execution_devices))
+
+        self.logger.info(
+            f"Using rendering device(s): {', '.join(sorted([str(x) for x in self._execution_devices.keys()]))}"
+        )
+
    @property
    def logger(self) -> Logger:
        """Return the logger used by the cache."""
        return self._logger

-    @property
-    def lazy_offloading(self) -> bool:
-        """Return true if the cache is configured to lazily offload models in VRAM."""
-        return self._lazy_offloading
-
    @property
    def storage_device(self) -> torch.device:
        """Return the storage device (e.g. "CPU" for RAM)."""
        return self._storage_device

    @property
-    def execution_device(self) -> torch.device:
-        """Return the exection device (e.g. "cuda" for VRAM)."""
-        return self._execution_device
+    def execution_devices(self) -> Set[torch.device]:
+        """Return the set of available execution devices."""
+        devices = self._execution_devices.keys()
+        return set(devices)
+
+    def get_execution_device(self) -> torch.device:
+        """
+        Return an execution device that has been reserved for current thread.
+
+        Note that reservations are done using the current thread's TID.
+        It would be better to do this using the session ID, but that involves
+        too many detailed changes to model manager calls.
+
+        May generate a ValueError if no GPU has been reserved.
+        """
+        current_thread = threading.current_thread().ident
+        assert current_thread is not None
+        assigned = [x for x, tid in self._execution_devices.items() if current_thread == tid]
+        if not assigned:
+            raise ValueError(f"No GPU has been reserved for the use of thread {current_thread}")
+        self.logger.info(f"Returned device {assigned[0]} for execution thread {current_thread}")
+        return assigned[0]
+
+    @contextmanager
+    def reserve_execution_device(self, timeout: Optional[int] = None) -> Generator[torch.device, None, None]:
+        """Reserve an execution device (e.g. GPU) for exclusive use by a generation thread.
+
+        The reservation is keyed on the current thread's TID (a session ID would be better, but that needs
+        too many changes to model manager calls). A nested call from a thread that already holds a device
+        yields it again without re-acquiring. Raises TimeoutError if none frees up within `timeout` seconds.
+        """
+        current_thread = threading.current_thread().ident
+        assert current_thread is not None
+        with self._device_lock:
+            # look for a device that has already been assigned to this thread
+            assigned = [x for x, tid in self._execution_devices.items() if current_thread == tid]
+        device = assigned[0] if assigned else None
+
+        # Only a call that takes a semaphore slot may give it back (BoundedSemaphore rejects extra releases).
+        owner = device is None
+        if owner:
+            if not self._free_execution_device.acquire(timeout=timeout):
+                raise TimeoutError(f"No execution device became free for thread {current_thread}")
+            with self._device_lock:
+                free_device = [x for x, tid in self._execution_devices.items() if tid == 0]
+                self._execution_devices[free_device[0]] = current_thread
+                device = free_device[0]
+
+        # we are outside the lock region now
+        self.logger.info(f"Reserved torch device {device} for execution thread {current_thread}")
+
+        # Tell TorchDevice to use this object to get the torch device.
+        TorchDevice.set_model_cache(self)
+        try:
+            yield device
+        finally:
+            if owner:
+                with self._device_lock:
+                    self.logger.info(f"Released torch device {device}")
+                    self._execution_devices[device] = 0
+                self._free_execution_device.release()
+            TorchDevice.empty_cache()

    @property
    def max_cache_size(self) -> float:
@@ -124,6 +181,16 @@ class ModelCache(ModelCacheBase[AnyModel]):
        """Set the cap on cache size."""
        self._max_cache_size = value

+    @property
+    def max_vram_cache_size(self) -> float:
+        """Return the cap on vram cache size."""
+        return self._max_vram_cache_size
+
+    @max_vram_cache_size.setter
+    def max_vram_cache_size(self, value: float) -> None:
+        """Set the cap on vram cache size."""
+        self._max_vram_cache_size = value
+
    @property
    def stats(self) -> Optional[CacheStats]:
        """Return collected CacheStats object."""
@@ -184,36 +251,37 @@ class ModelCache(ModelCacheBase[AnyModel]):

        This may raise an IndexError if the model is not in the cache.
        """
-        key = self._make_cache_key(key, submodel_type)
-        if key in self._cached_models:
-            if self.stats:
-                self.stats.hits += 1
-        else:
-            if self.stats:
-                self.stats.misses += 1
-            raise IndexError(f"The model with key {key} is not in the cache.")
+        with self._ram_lock:
+            key = self._make_cache_key(key, submodel_type)
+            if key in self._cached_models:
+                if self.stats:
+                    self.stats.hits += 1
+            else:
+                if self.stats:
+                    self.stats.misses += 1
+                raise IndexError(f"The model with key {key} is not in the cache.")

-        cache_entry = self._cached_models[key]
+            cache_entry = self._cached_models[key]

-        # more stats
-        if self.stats:
-            stats_name = stats_name or key
-            self.stats.cache_size = int(self._max_cache_size * GIG)
-            self.stats.high_watermark = max(self.stats.high_watermark, self.cache_size())
-            self.stats.in_cache = len(self._cached_models)
-            self.stats.loaded_model_sizes[stats_name] = max(
-                self.stats.loaded_model_sizes.get(stats_name, 0), cache_entry.size
+            # more stats
+            if self.stats:
+                stats_name = stats_name or key
+                self.stats.cache_size = int(self._max_cache_size * GIG)
+                self.stats.high_watermark = max(self.stats.high_watermark, self.cache_size())
+                self.stats.in_cache = len(self._cached_models)
+                self.stats.loaded_model_sizes[stats_name] = max(
+                    self.stats.loaded_model_sizes.get(stats_name, 0), cache_entry.size
+                )
+
+            # this moves the entry to the top (right end) of the stack
+            with suppress(Exception):
+                self._cache_stack.remove(key)
+            self._cache_stack.append(key)
+            return ModelLocker(
+                cache=self,
+                cache_entry=cache_entry,
            )

-        # this moves the entry to the top (right end) of the stack
-        with suppress(Exception):
-            self._cache_stack.remove(key)
-        self._cache_stack.append(key)
-        return ModelLocker(
-            cache=self,
-            cache_entry=cache_entry,
-        )
-
    def _capture_memory_snapshot(self) -> Optional[MemorySnapshot]:
        if self._log_memory_usage:
            return MemorySnapshot.capture()
@@ -225,23 +293,12 @@ class ModelCache(ModelCacheBase[AnyModel]):
        else:
            return model_key

-    def offload_unlocked_models(self, size_required: int) -> None:
+    def offload_unlocked_models(self) -> None:
        """Move any unused models from VRAM."""
-        reserved = self._max_vram_cache_size * GIG
-        vram_in_use = torch.cuda.memory_allocated() + size_required
-        self.logger.debug(f"{(vram_in_use/GIG):.2f}GB VRAM needed for models; max allowed={(reserved/GIG):.2f}GB")
-        for _, cache_entry in sorted(self._cached_models.items(), key=lambda x: x[1].size):
-            if vram_in_use <= reserved:
-                break
-            if not cache_entry.loaded:
-                continue
-            if not cache_entry.locked:
+        for _, cache_entry in self._cached_models.items():
+            if cache_entry.loaded and not cache_entry.locked:
                self.move_model_to_device(cache_entry, self.storage_device)
                cache_entry.loaded = False
-                vram_in_use = torch.cuda.memory_allocated() + size_required
-                self.logger.debug(
-                    f"Removing {cache_entry.key} from VRAM to free {(cache_entry.size/GIG):.2f}GB; vram free = {(torch.cuda.memory_allocated()/GIG):.2f}GB"
-                )

        TorchDevice.empty_cache()

@@ -256,9 +313,7 @@ class ModelCache(ModelCacheBase[AnyModel]):
        self.logger.debug(f"Called to move {cache_entry.key} to {target_device}")
        source_device = cache_entry.device

-        # Note: We compare device types only so that 'cuda' == 'cuda:0'.
-        # This would need to be revised to support multi-GPU.
-        if torch.device(source_device).type == torch.device(target_device).type:
+        if source_device == target_device:
            return

        # Some models don't have a `to` method, in which case they run in RAM/CPU.
@@ -285,9 +340,9 @@ class ModelCache(ModelCacheBase[AnyModel]):
                else:
                    new_dict: Dict[str, torch.Tensor] = {}
                    for k, v in cache_entry.state_dict.items():
-                        new_dict[k] = v.to(torch.device(target_device), copy=True, non_blocking=True)
+                        new_dict[k] = v.to(target_device, copy=True)
                    cache_entry.model.load_state_dict(new_dict, assign=True)
-            cache_entry.model.to(target_device, non_blocking=True)
+            cache_entry.model.to(target_device)
            cache_entry.device = target_device
        except Exception as e:  # blow away cache entry
            self._delete_cache_entry(cache_entry)
@@ -295,7 +350,7 @@ class ModelCache(ModelCacheBase[AnyModel]):

        snapshot_after = self._capture_memory_snapshot()
        end_model_to_time = time.time()
-        self.logger.debug(
+        self.logger.info(
            f"Moved model '{cache_entry.key}' from {source_device} to"
            f" {target_device} in {(end_model_to_time-start_model_to_time):.2f}s."
            f"Estimated model size: {(cache_entry.size/GIG):.3f} GB."
@@ -349,8 +404,6 @@ class ModelCache(ModelCacheBase[AnyModel]):

    def make_room(self, size: int) -> None:
        """Make enough room in the cache to accommodate a new model of indicated size."""
-        # calculate how much memory this model will require
-        # multiplier = 2 if self.precision==torch.float32 else 1
        bytes_needed = size
        maximum_size = self.max_cache_size * GIG  # stored in GB, convert to bytes
        current_size = self.cache_size()
@@ -404,6 +457,20 @@ class ModelCache(ModelCacheBase[AnyModel]):
        TorchDevice.empty_cache()
        self.logger.debug(f"After making room: cached_models={len(self._cached_models)}")

+    def _check_free_vram(self, target_device: torch.device, needed_size: int) -> None:
+        """Raise torch.cuda.OutOfMemoryError if `target_device` is CUDA and has < `needed_size` bytes free."""
+        if target_device.type != "cuda":
+            return
+        # mem_get_info() needs an indexed device
+        vram_device = target_device if target_device.index is not None else torch.device(str(target_device), index=0)
+        free_mem, _ = torch.cuda.mem_get_info(vram_device)
+        if needed_size > free_mem:
+            raise torch.cuda.OutOfMemoryError(f"Need {needed_size} bytes on {vram_device}, but only {free_mem} are free")
+
    def _delete_cache_entry(self, cache_entry: CacheRecord[AnyModel]) -> None:
        self._cache_stack.remove(cache_entry.key)
        del self._cached_models[cache_entry.key]
+
+    @staticmethod
+    def _device_name(device: torch.device) -> str:
+        return f"{device.type}:{device.index}"
--- a/invokeai/backend/model_manager/load/model_cache/model_locker.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_locker.py
@@ -10,6 +10,8 @@ from invokeai.backend.model_manager import AnyModel

 from .model_cache_base import CacheRecord, ModelCacheBase, ModelLockerBase

+MAX_GPU_WAIT = 600  # wait up to 10 minutes for a GPU to become free
+

 class ModelLocker(ModelLockerBase):
    """Internal class that mediates movement in and out of GPU."""
@@ -37,11 +39,10 @@ class ModelLocker(ModelLockerBase):
        """Move the model into the execution device (GPU) and lock it."""
        self._cache_entry.lock()
        try:
-            if self._cache.lazy_offloading:
-                self._cache.offload_unlocked_models(self._cache_entry.size)
-            self._cache.move_model_to_device(self._cache_entry, self._cache.execution_device)
+            device = self._cache.get_execution_device()
+            self._cache.move_model_to_device(self._cache_entry, device)
            self._cache_entry.loaded = True
-            self._cache.logger.debug(f"Locking {self._cache_entry.key} in {self._cache.execution_device}")
+            self._cache.logger.debug(f"Locking {self._cache_entry.key} in {device}")
            self._cache.print_cuda_stats()
        except torch.cuda.OutOfMemoryError:
            self._cache.logger.warning("Insufficient GPU memory to load model. Aborting")
@@ -56,6 +57,5 @@ class ModelLocker(ModelLockerBase):
    def unlock(self) -> None:
        """Call upon exit from context."""
        self._cache_entry.unlock()
-        if not self._cache.lazy_offloading:
-            self._cache.offload_unlocked_models(0)
-            self._cache.print_cuda_stats()
+        self._cache.offload_unlocked_models()
+        self._cache.print_cuda_stats()
--- a/invokeai/backend/model_manager/load/model_loaders/controlnet.py
+++ b/invokeai/backend/model_manager/load/model_loaders/controlnet.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, Lincoln D. Stein and the InvokeAI Development Team
 """Class for ControlNet model loading in InvokeAI."""

-from pathlib import Path
 from typing import Optional

+from diffusers import ControlNetModel
+
 from invokeai.backend.model_manager import (
    AnyModel,
    AnyModelConfig,
@@ -11,8 +12,7 @@ from invokeai.backend.model_manager import (
    ModelFormat,
    ModelType,
 )
-from invokeai.backend.model_manager.config import CheckpointConfigBase
-from invokeai.backend.model_manager.convert_ckpt_to_diffusers import convert_controlnet_to_diffusers
+from invokeai.backend.model_manager.config import ControlNetCheckpointConfig, SubModelType

 from .. import ModelLoaderRegistry
 from .generic_diffusers import GenericDiffusersLoader
@@ -23,36 +23,15 @@ from .generic_diffusers import GenericDiffusersLoader
 class ControlNetLoader(GenericDiffusersLoader):
    """Class to load ControlNet models."""

-    def _needs_conversion(self, config: AnyModelConfig, model_path: Path, dest_path: Path) -> bool:
-        if not isinstance(config, CheckpointConfigBase):
-            return False
-        elif (
-            dest_path.exists()
-            and (dest_path / "config.json").stat().st_mtime >= (config.converted_at or 0.0)
-            and (dest_path / "config.json").stat().st_mtime >= model_path.stat().st_mtime
-        ):
-            return False
-        else:
-            return True
-
-    def _convert_model(self, config: AnyModelConfig, model_path: Path, output_path: Optional[Path] = None) -> AnyModel:
-        assert isinstance(config, CheckpointConfigBase)
-        image_size = (
-            512
-            if config.base == BaseModelType.StableDiffusion1
-            else 768
-            if config.base == BaseModelType.StableDiffusion2
-            else 1024
-        )
-
-        self._logger.info(f"Converting {model_path} to diffusers format")
-        with open(self._app_config.legacy_conf_path / config.config_path, "r") as config_stream:
-            result = convert_controlnet_to_diffusers(
-                model_path,
-                output_path,
-                original_config_file=config_stream,
-                image_size=image_size,
-                precision=self._torch_dtype,
-                from_safetensors=model_path.suffix == ".safetensors",
+    def _load_model(
+        self,
+        config: AnyModelConfig,
+        submodel_type: Optional[SubModelType] = None,
+    ) -> AnyModel:
+        if isinstance(config, ControlNetCheckpointConfig):
+            return ControlNetModel.from_single_file(
+                config.path,
+                torch_dtype=self._torch_dtype,
            )
-        return result
+        else:
+            return super()._load_model(config, submodel_type)
--- a/invokeai/backend/model_manager/load/model_loaders/lora.py
+++ b/invokeai/backend/model_manager/load/model_loaders/lora.py
@@ -15,7 +15,6 @@ from invokeai.backend.model_manager import (
    ModelType,
    SubModelType,
 )
-from invokeai.backend.model_manager.load.convert_cache import ModelConvertCacheBase
 from invokeai.backend.model_manager.load.model_cache.model_cache_base import ModelCacheBase

 from .. import ModelLoader, ModelLoaderRegistry
@@ -32,10 +31,9 @@ class LoRALoader(ModelLoader):
        app_config: InvokeAIAppConfig,
        logger: Logger,
        ram_cache: ModelCacheBase[AnyModel],
-        convert_cache: ModelConvertCacheBase,
    ):
        """Initialize the loader."""
-        super().__init__(app_config, logger, ram_cache, convert_cache)
+        super().__init__(app_config, logger, ram_cache)
        self._model_base: Optional[BaseModelType] = None

    def _load_model(
--- a/invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py
+++ b/invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py
@@ -4,22 +4,28 @@
 from pathlib import Path
 from typing import Optional

+from diffusers import (
+    StableDiffusionInpaintPipeline,
+    StableDiffusionPipeline,
+    StableDiffusionXLInpaintPipeline,
+    StableDiffusionXLPipeline,
+)
+
 from invokeai.backend.model_manager import (
    AnyModel,
    AnyModelConfig,
    BaseModelType,
    ModelFormat,
    ModelType,
-    SchedulerPredictionType,
+    ModelVariantType,
    SubModelType,
 )
 from invokeai.backend.model_manager.config import (
    CheckpointConfigBase,
    DiffusersConfigBase,
    MainCheckpointConfig,
-    ModelVariantType,
 )
-from invokeai.backend.model_manager.convert_ckpt_to_diffusers import convert_ckpt_to_diffusers
+from invokeai.backend.util.silence_warnings import SilenceWarnings

 from .. import ModelLoaderRegistry
 from .generic_diffusers import GenericDiffusersLoader
@@ -48,8 +54,12 @@ class StableDiffusionDiffusersModel(GenericDiffusersLoader):
        config: AnyModelConfig,
        submodel_type: Optional[SubModelType] = None,
    ) -> AnyModel:
-        if not submodel_type is not None:
+        if isinstance(config, CheckpointConfigBase):
+            return self._load_from_singlefile(config, submodel_type)
+
+        if submodel_type is None:
            raise Exception("A submodel type must be provided when loading main pipelines.")
+
        model_path = Path(config.path)
        load_class = self.get_hf_load_class(model_path, submodel_type)
        repo_variant = config.repo_variant if isinstance(config, DiffusersConfigBase) else None
@@ -71,46 +81,58 @@ class StableDiffusionDiffusersModel(GenericDiffusersLoader):

        return result

-    def _needs_conversion(self, config: AnyModelConfig, model_path: Path, dest_path: Path) -> bool:
-        if not isinstance(config, CheckpointConfigBase):
-            return False
-        elif (
-            dest_path.exists()
-            and (dest_path / "model_index.json").stat().st_mtime >= (config.converted_at or 0.0)
-            and (dest_path / "model_index.json").stat().st_mtime >= model_path.stat().st_mtime
-        ):
-            return False
-        else:
-            return True
-
-    def _convert_model(self, config: AnyModelConfig, model_path: Path, output_path: Optional[Path] = None) -> AnyModel:
+    def _load_from_singlefile(
+        self,
+        config: AnyModelConfig,
+        submodel_type: Optional[SubModelType] = None,
+    ) -> AnyModel:
+        load_classes = {
+            BaseModelType.StableDiffusion1: {
+                ModelVariantType.Normal: StableDiffusionPipeline,
+                ModelVariantType.Inpaint: StableDiffusionInpaintPipeline,
+            },
+            BaseModelType.StableDiffusion2: {
+                ModelVariantType.Normal: StableDiffusionPipeline,
+                ModelVariantType.Inpaint: StableDiffusionInpaintPipeline,
+            },
+            BaseModelType.StableDiffusionXL: {
+                ModelVariantType.Normal: StableDiffusionXLPipeline,
+                ModelVariantType.Inpaint: StableDiffusionXLInpaintPipeline,
+            },
+        }
        assert isinstance(config, MainCheckpointConfig)
-        base = config.base
-
+        try:
+            load_class = load_classes[config.base][config.variant]
+        except KeyError as e:
+            raise Exception(f"No diffusers pipeline known for base={config.base}, variant={config.variant}") from e
        prediction_type = config.prediction_type.value
        upcast_attention = config.upcast_attention
-        image_size = (
-            1024
-            if base == BaseModelType.StableDiffusionXL
-            else 768
-            if config.prediction_type == SchedulerPredictionType.VPrediction and base == BaseModelType.StableDiffusion2
-            else 512
-        )

-        self._logger.info(f"Converting {model_path} to diffusers format")
+        # Without SilenceWarnings we get log messages like this:
+        # site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
+        # warnings.warn(
+        # Some weights of the model checkpoint were not used when initializing CLIPTextModel:
+        # ['text_model.embeddings.position_ids']
+        # Some weights of the model checkpoint were not used when initializing CLIPTextModelWithProjection:
+        # ['text_model.embeddings.position_ids']

-        loaded_model = convert_ckpt_to_diffusers(
-            model_path,
-            output_path,
-            model_type=self.model_base_to_model_type[base],
-            original_config_file=self._app_config.legacy_conf_path / config.config_path,
-            extract_ema=True,
-            from_safetensors=model_path.suffix == ".safetensors",
-            precision=self._torch_dtype,
-            prediction_type=prediction_type,
-            image_size=image_size,
-            upcast_attention=upcast_attention,
-            load_safety_checker=False,
-            num_in_channels=VARIANT_TO_IN_CHANNEL_MAP[config.variant],
-        )
-        return loaded_model
+        with SilenceWarnings():
+            pipeline = load_class.from_single_file(
+                config.path,
+                torch_dtype=self._torch_dtype,
+                prediction_type=prediction_type,
+                upcast_attention=upcast_attention,
+                load_safety_checker=False,
+            )
+
+        if not submodel_type:
+            return pipeline
+
+        # Proactively load the various submodels into the RAM cache so that we don't have to re-load
+        # the entire pipeline every time a new submodel is needed.
+        for subtype in SubModelType:
+            if subtype == submodel_type:
+                continue
+            if submodel := getattr(pipeline, subtype.value, None):
+                self._ram_cache.put(config.key, submodel_type=subtype, model=submodel)
+        return getattr(pipeline, submodel_type.value)
--- a/invokeai/backend/model_manager/load/model_loaders/vae.py
+++ b/invokeai/backend/model_manager/load/model_loaders/vae.py
@@ -1,12 +1,9 @@
 # Copyright (c) 2024, Lincoln D. Stein and the InvokeAI Development Team
 """Class for VAE model loading in InvokeAI."""

-from pathlib import Path
 from typing import Optional

-import torch
-from omegaconf import DictConfig, OmegaConf
-from safetensors.torch import load_file as safetensors_load_file
+from diffusers import AutoencoderKL

 from invokeai.backend.model_manager import (
    AnyModelConfig,
@@ -14,8 +11,7 @@ from invokeai.backend.model_manager import (
    ModelFormat,
    ModelType,
 )
-from invokeai.backend.model_manager.config import AnyModel, CheckpointConfigBase
-from invokeai.backend.model_manager.convert_ckpt_to_diffusers import convert_ldm_vae_to_diffusers
+from invokeai.backend.model_manager.config import AnyModel, SubModelType, VAECheckpointConfig

 from .. import ModelLoaderRegistry
 from .generic_diffusers import GenericDiffusersLoader
@@ -26,39 +22,15 @@ from .generic_diffusers import GenericDiffusersLoader
 class VAELoader(GenericDiffusersLoader):
    """Class to load VAE models."""

-    def _needs_conversion(self, config: AnyModelConfig, model_path: Path, dest_path: Path) -> bool:
-        if not isinstance(config, CheckpointConfigBase):
-            return False
-        elif (
-            dest_path.exists()
-            and (dest_path / "config.json").stat().st_mtime >= (config.converted_at or 0.0)
-            and (dest_path / "config.json").stat().st_mtime >= model_path.stat().st_mtime
-        ):
-            return False
+    def _load_model(
+        self,
+        config: AnyModelConfig,
+        submodel_type: Optional[SubModelType] = None,
+    ) -> AnyModel:
+        if isinstance(config, VAECheckpointConfig):
+            return AutoencoderKL.from_single_file(
+                config.path,
+                torch_dtype=self._torch_dtype,
+            )
        else:
-            return True
-
-    def _convert_model(self, config: AnyModelConfig, model_path: Path, output_path: Optional[Path] = None) -> AnyModel:
-        assert isinstance(config, CheckpointConfigBase)
-        config_file = self._app_config.legacy_conf_path / config.config_path
-
-        if model_path.suffix == ".safetensors":
-            checkpoint = safetensors_load_file(model_path, device="cpu")
-        else:
-            checkpoint = torch.load(model_path, map_location="cpu")
-
-        # sometimes weights are hidden under "state_dict", and sometimes not
-        if "state_dict" in checkpoint:
-            checkpoint = checkpoint["state_dict"]
-
-        ckpt_config = OmegaConf.load(config_file)
-        assert isinstance(ckpt_config, DictConfig)
-        self._logger.info(f"Converting {model_path} to diffusers format")
-        vae_model = convert_ldm_vae_to_diffusers(
-            checkpoint=checkpoint,
-            vae_config=ckpt_config,
-            image_size=512,
-            precision=self._torch_dtype,
-            dump_path=output_path,
-        )
-        return vae_model
+            return super()._load_model(config, submodel_type)
--- a/invokeai/backend/model_manager/probe.py
+++ b/invokeai/backend/model_manager/probe.py
@@ -312,6 +312,8 @@ class ModelProbe(object):
            config_file = (
                "stable-diffusion/v1-inference.yaml"
                if base_type is BaseModelType.StableDiffusion1
+                else "stable-diffusion/sd_xl_base.yaml"
+                if base_type is BaseModelType.StableDiffusionXL
                else "stable-diffusion/v2-inference.yaml"
            )
        else:
--- a/invokeai/backend/util/devices.py
+++ b/invokeai/backend/util/devices.py
@@ -1,10 +1,16 @@
-from typing import Dict, Literal, Optional, Union
+"""Torch Device class provides torch device selection services."""
+
+from typing import TYPE_CHECKING, Dict, Literal, Optional, Set, Union

 import torch
 from deprecated import deprecated

 from invokeai.app.services.config.config_default import get_config

+if TYPE_CHECKING:
+    from invokeai.backend.model_manager.config import AnyModel
+    from invokeai.backend.model_manager.load.model_cache.model_cache_base import ModelCacheBase
+
 # legacy APIs
 TorchPrecisionNames = Literal["float32", "float16", "bfloat16"]
 CPU_DEVICE = torch.device("cpu")
@@ -42,9 +48,23 @@ PRECISION_TO_NAME: Dict[torch.dtype, TorchPrecisionNames] = {v: k for k, v in NA
 class TorchDevice:
    """Abstraction layer for torch devices."""

+    _model_cache: Optional["ModelCacheBase[AnyModel]"] = None
+
+    @classmethod
+    def set_model_cache(cls, cache: "ModelCacheBase[AnyModel]"):
+        """Set the current model cache."""
+        cls._model_cache = cache
+
    @classmethod
    def choose_torch_device(cls) -> torch.device:
        """Return the torch.device to use for accelerated inference."""
+        if cls._model_cache:
+            return cls._model_cache.get_execution_device()
+        else:
+            return cls._choose_device()
+
+    @classmethod
+    def _choose_device(cls) -> torch.device:
        app_config = get_config()
        if app_config.device != "auto":
            device = torch.device(app_config.device)
@@ -56,11 +76,19 @@ class TorchDevice:
            device = CPU_DEVICE
        return cls.normalize(device)

+    @classmethod
+    def execution_devices(cls) -> Set[torch.device]:
+        """Return the set of torch.devices that can be used for accelerated inference."""
+        app_config = get_config()
+        if app_config.devices is None:
+            return cls._lookup_execution_devices()
+        return {torch.device(x) for x in app_config.devices}
+
    @classmethod
    def choose_torch_dtype(cls, device: Optional[torch.device] = None) -> torch.dtype:
        """Return the precision to use for accelerated inference."""
-        device = device or cls.choose_torch_device()
        config = get_config()
+        device = device or cls._choose_device()
        if device.type == "cuda" and torch.cuda.is_available():
            device_name = torch.cuda.get_device_name(device)
            if "GeForce GTX 1660" in device_name or "GeForce GTX 1650" in device_name:
@@ -108,3 +136,13 @@ class TorchDevice:
    @classmethod
    def _to_dtype(cls, precision_name: TorchPrecisionNames) -> torch.dtype:
        return NAME_TO_PRECISION[precision_name]
+
+    @classmethod
+    def _lookup_execution_devices(cls) -> Set[torch.device]:
+        if torch.cuda.is_available():
+            devices = {torch.device(f"cuda:{x}") for x in range(0, torch.cuda.device_count())}
+        elif torch.backends.mps.is_available():
+            devices = {torch.device("mps")}
+        else:
+            devices = {torch.device("cpu")}
+        return devices
--- a/invokeai/version/invokeai_version.py
+++ b/invokeai/version/invokeai_version.py
@@ -1 +1 @@
-__version__ = "4.2.4"
+__version__ = "0.0.2"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools~=65.5", "pip~=22.3", "wheel"]
 build-backend = "setuptools.build_meta"

 [project]
-name = "InvokeAI"
+name = "InvokeAI-MGPU"
 description = "An implementation of Stable Diffusion which provides various new features and options to aid the image generation process"
 requires-python = ">=3.10, <3.12"
 readme = { content-type = "text/markdown", file = "README.md" }
--- a/scripts/populate_model_db_from_yaml.py
+++ b/scripts/populate_model_db_from_yaml.py
@@ -0,0 +1,54 @@
+#!/bin/env python
+
+from argparse import ArgumentParser, Namespace
+from pathlib import Path
+
+from invokeai.app.services.config import InvokeAIAppConfig, get_config
+from invokeai.app.services.download import DownloadQueueService
+from invokeai.app.services.model_install import ModelInstallService
+from invokeai.app.services.model_records import ModelRecordServiceSQL
+from invokeai.app.services.shared.sqlite.sqlite_database import SqliteDatabase
+from invokeai.backend.util.logging import InvokeAILogger
+
+
+def get_args() -> Namespace:
+    parser = ArgumentParser(description="Update models database from yaml file")
+    parser.add_argument("--root", type=Path, required=False, default=None)
+    parser.add_argument("--yaml_file", type=Path, required=False, default=None)
+    return parser.parse_args()
+
+
+def populate_config() -> InvokeAIAppConfig:
+    args = get_args()
+    config = get_config()
+    if args.root:
+        config._root = args.root
+    if args.yaml_file:
+        config.legacy_models_yaml_path = args.yaml_file
+    else:
+        config.legacy_models_yaml_path = config.root_path / "configs/models.yaml"
+    return config
+
+
+def initialize_installer(config: InvokeAIAppConfig) -> ModelInstallService:
+    logger = InvokeAILogger.get_logger(config=config)
+    db = SqliteDatabase(config.db_path, logger)
+    record_store = ModelRecordServiceSQL(db)
+    queue = DownloadQueueService()
+    queue.start()
+    installer = ModelInstallService(app_config=config, record_store=record_store, download_queue=queue)
+    return installer
+
+
+def main() -> None:
+    config = populate_config()
+    installer = initialize_installer(config)
+    installer._migrate_yaml(rename_yaml=False, overwrite_db=True)
+    print("\n<INSTALLED MODELS>")
+    print("\t".join(["key", "name", "type", "path"]))
+    for model in installer.record_store.all_models():
+        print("\t".join([model.key, model.name, model.type, (config.models_path / model.path).as_posix()]))
+
+
+if __name__ == "__main__":
+    main()
--- a/tests/backend/model_manager/model_loading/test_model_load.py
+++ b/tests/backend/model_manager/model_loading/test_model_load.py
@@ -14,13 +14,14 @@ def test_loading(mm2_model_manager: ModelManagerServiceBase, embedding_file: Pat
    matches = store.search_by_attr(model_name="test_embedding")
    assert len(matches) == 0
    key = mm2_model_manager.install.register_path(embedding_file)
-    loaded_model = mm2_model_manager.load.load_model(store.get_model(key))
-    assert loaded_model is not None
-    assert loaded_model.config.key == key
-    with loaded_model as model:
-        assert isinstance(model, TextualInversionModelRaw)
+    with mm2_model_manager.load.ram_cache.reserve_execution_device():
+        loaded_model = mm2_model_manager.load.load_model(store.get_model(key))
+        assert loaded_model is not None
+        assert loaded_model.config.key == key
+        with loaded_model as model:
+            assert isinstance(model, TextualInversionModelRaw)

-    config = mm2_model_manager.store.get_model(key)
-    loaded_model_2 = mm2_model_manager.load.load_model(config)
+        config = mm2_model_manager.store.get_model(key)
+        loaded_model_2 = mm2_model_manager.load.load_model(config)

-    assert loaded_model.config.key == loaded_model_2.config.key
+        assert loaded_model.config.key == loaded_model_2.config.key
--- a/tests/backend/model_manager/model_manager_fixtures.py
+++ b/tests/backend/model_manager/model_manager_fixtures.py
@@ -25,7 +25,7 @@ from invokeai.backend.model_manager.config import (
    ModelVariantType,
    VAEDiffusersConfig,
 )
-from invokeai.backend.model_manager.load import ModelCache, ModelConvertCache
+from invokeai.backend.model_manager.load import ModelCache
 from invokeai.backend.util.logging import InvokeAILogger
 from tests.backend.model_manager.model_metadata.metadata_examples import (
    HFTestLoraMetadata,
@@ -89,17 +89,14 @@ def mm2_download_queue(mm2_session: Session) -> DownloadQueueServiceBase:


@pytest.fixture
-def mm2_loader(mm2_app_config: InvokeAIAppConfig, mm2_record_store: ModelRecordServiceBase) -> ModelLoadServiceBase:
+def mm2_loader(mm2_app_config: InvokeAIAppConfig) -> ModelLoadServiceBase:
    ram_cache = ModelCache(
        logger=InvokeAILogger.get_logger(),
        max_cache_size=mm2_app_config.ram,
-        max_vram_cache_size=mm2_app_config.vram,
    )
-    convert_cache = ModelConvertCache(mm2_app_config.convert_cache_path)
    return ModelLoadService(
        app_config=mm2_app_config,
        ram_cache=ram_cache,
-        convert_cache=convert_cache,
    )


--- a/tests/backend/util/test_devices.py
+++ b/tests/backend/util/test_devices.py
@@ -8,7 +8,9 @@ import pytest
 import torch

 from invokeai.app.services.config import get_config
+from invokeai.backend.model_manager.load import ModelCache
 from invokeai.backend.util.devices import TorchDevice, choose_precision, choose_torch_device, torch_dtype
+from tests.backend.model_manager.model_manager_fixtures import *  # noqa F403

 devices = ["cpu", "cuda:0", "cuda:1", "mps"]
 device_types_cpu = [("cpu", torch.float32), ("cuda:0", torch.float32), ("mps", torch.float32)]
@@ -20,6 +22,7 @@ device_types_mps = [("cpu", torch.float32), ("cuda:0", torch.float32), ("mps", t
 def test_device_choice(device_name):
    config = get_config()
    config.device = device_name
+    TorchDevice.set_model_cache(None)  # disable dynamic selection of GPU device
    torch_device = TorchDevice.choose_torch_device()
    assert torch_device == torch.device(device_name)

@@ -130,3 +133,32 @@ def test_legacy_precision_name():
        assert "float16" == choose_precision(torch.device("cuda"))
        assert "float16" == choose_precision(torch.device("mps"))
        assert "float32" == choose_precision(torch.device("cpu"))
+
+
+def test_multi_device_support_1():
+    config = get_config()
+    config.devices = ["cuda:0", "cuda:1"]
+    assert TorchDevice.execution_devices() == {torch.device("cuda:0"), torch.device("cuda:1")}
+
+
+def test_multi_device_support_2():
+    config = get_config()
+    config.devices = None
+    with (
+        patch("torch.cuda.device_count", return_value=3),
+        patch("torch.cuda.is_available", return_value=True),
+    ):
+        assert TorchDevice.execution_devices() == {
+            torch.device("cuda:0"),
+            torch.device("cuda:1"),
+            torch.device("cuda:2"),
+        }
+
+
+def test_multi_device_support_3():
+    config = get_config()
+    config.devices = ["cuda:0", "cuda:1"]
+    cache = ModelCache()
+    with cache.reserve_execution_device() as gpu:
+        assert gpu in [torch.device(x) for x in config.devices]
+        assert TorchDevice.choose_torch_device() == gpu
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -17,7 +17,6 @@ from invokeai.app.services.config.config_default import InvokeAIAppConfig
 from invokeai.app.services.images.images_default import ImageService
 from invokeai.app.services.invocation_cache.invocation_cache_memory import MemoryInvocationCache
 from invokeai.app.services.invocation_services import InvocationServices
-from invokeai.app.services.invocation_stats.invocation_stats_default import InvocationStatsService
 from invokeai.app.services.invoker import Invoker
 from invokeai.backend.util.logging import InvokeAILogger
 from tests.backend.model_manager.model_manager_fixtures import *  # noqa: F403
@@ -49,13 +48,13 @@ def mock_services() -> InvocationServices:
        model_manager=None,  # type: ignore
        download_queue=None,  # type: ignore
        names=None,  # type: ignore
-        performance_statistics=InvocationStatsService(),
        session_processor=None,  # type: ignore
        session_queue=None,  # type: ignore
        urls=None,  # type: ignore
        workflow_records=None,  # type: ignore
        tensors=None,  # type: ignore
        conditioning=None,  # type: ignore
+        performance_statistics=None,  # type: ignore
    )


--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -92,7 +92,6 @@ def test_migrate_v3_config_from_file(tmp_path: Path, patch_rootdir: None):
    assert config.host == "192.168.1.1"
    assert config.port == 8080
    assert config.ram == 100
-    assert config.vram == 50
    assert config.legacy_models_yaml_path == Path("/custom/models.yaml")
    # This should be stripped out
    assert not hasattr(config, "esrgan")
Author	SHA1	Message	Date
Lincoln Stein	9adafdefd7	cleaned model unlock code; added debugging statements	2024-06-23 22:55:24 -04:00
Lincoln Stein	06d1678641	Update README.md	2024-06-23 22:31:42 -04:00
Lincoln Stein	2999b039a3	prep 0.0.1 release	2024-06-23 22:29:23 -04:00
Lincoln Stein	75dcff92f9	incorporate single-file loading	2024-06-23 13:16:29 -04:00
Lincoln Stein	aff5700cce	merge cache setting api	2024-06-23 12:43:58 -04:00
Lincoln Stein	6932f27b43	fixup code broken by merge with main	2024-06-23 12:17:16 -04:00
Lincoln Stein	0df018bd4e	resolve merge conflicts	2024-06-23 10:31:35 -04:00
Lincoln Stein	ebe373c614	Merge branch 'main' into lstein/feat/set-cache-sizes	2024-06-21 15:36:47 -04:00
Lincoln Stein	27195b1672	code cleanup after @ryand review	2024-06-21 15:36:37 -04:00
Lincoln Stein	787671c2c2	Update invokeai/app/api/routers/model_manager.py Co-authored-by: Ryan Dick <ryanjdick3@gmail.com>	2024-06-21 15:15:31 -04:00
Lincoln Stein	5c8cf991a9	remove use of original_config_file in load_single_file()	2024-06-20 22:28:22 -04:00
Lincoln Stein	b0574f85bc	Merge branch 'lstein/bugfix/sdxl-vae-conversion' into lstein/feat/load-one-file	2024-06-19 23:48:21 -04:00
Lincoln Stein	2a4254c7c3	merge with main	2024-06-19 23:48:19 -04:00
Lincoln Stein	349239e336	associate sdxl config with sdxl VAEs	2024-06-19 23:43:56 -04:00
Lincoln Stein	4c5bad6352	[MM] add API routes for getting & setting MM cache sizes, and retrieving MM stats	2024-06-19 21:35:50 -04:00
Lincoln Stein	74f0c317ce	Merge branch 'main' into lstein/feat/load-one-file	2024-06-19 10:26:37 -04:00
Lincoln Stein	3a622af3b2	Merge branch 'main' into lstein/feat/load-one-file	2024-06-18 13:45:03 -04:00
Lincoln Stein	c87cad3e91	simplified config schema migration code	2024-06-18 13:43:12 -04:00
Lincoln Stein	7088d5610b	add script to sync models db with models.yaml	2024-06-16 19:50:49 -04:00
Lincoln Stein	1109708029	Merge branch 'main' into lstein/feat/load-one-file	2024-06-15 20:36:40 -04:00
Lincoln Stein	e7b7737c76	Merge branch 'lstein/feat/load-one-file' of github.com:invoke-ai/InvokeAI into lstein/feat/load-one-file	2024-06-15 19:57:44 -04:00
Lincoln Stein	17e9d4f7af	implement lightweight version-by-version config migration	2024-06-15 19:57:35 -04:00
Lincoln Stein	1411fbbd1a	Update invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py Co-authored-by: Ryan Dick <ryanjdick3@gmail.com>	2024-06-15 19:08:29 -04:00
Lincoln Stein	6b788bff51	Update invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py Co-authored-by: Ryan Dick <ryanjdick3@gmail.com>	2024-06-15 19:08:15 -04:00
Lincoln Stein	379d02d209	migrate config file to remove convert_cache setting	2024-06-12 17:09:12 -04:00
Lincoln Stein	7634991107	rename migration_11 before conflict merge with main	2024-06-12 16:19:41 -04:00
Lincoln Stein	acce4d393e	working, needs sql migrator update	2024-06-12 16:18:15 -04:00
Lincoln Stein	b268cc2db9	adjust the convert api - not right just yet	2024-06-07 22:00:48 -04:00
Lincoln Stein	067b805044	use model_class.load_singlefile() instead of converting; works, but performance is poor	2024-06-07 15:34:14 -04:00
Lincoln Stein	589a7959c0	fixup unit tests and remove debugging statements	2024-06-02 21:28:54 -04:00
Lincoln Stein	e26360f85b	merged multi-gpu support into new session_processor architecture	2024-06-02 14:10:08 -04:00
Lincoln Stein	debef2476e	Merge branch 'main' into lstein/feat/multi-gpu	2024-05-06 16:48:51 -04:00
Lincoln Stein	e57809e1c6	Merge branch 'main' into lstein/feat/multi-gpu	2024-05-03 00:05:04 -04:00
Lincoln Stein	1c0067f931	Merge branch 'main' into lstein/feat/multi-gpu	2024-04-30 18:14:03 -04:00
Lincoln Stein	c3d1252892	revert to old system for doing RAM <-> VRAM transfers; new way leaks memory	2024-04-17 09:51:57 -04:00
Lincoln Stein	84f5cbdd97	make choose_torch_dtype() usable outside an invocation context	2024-04-16 19:19:19 -04:00
Lincoln Stein	edac01d4fb	reverse stupid hack	2024-04-16 18:13:59 -04:00
Lincoln Stein	d04c880cce	fix ValueError on model manager install	2024-04-16 17:57:40 -04:00
Lincoln Stein	763a2e2632	added more unit tests	2024-04-16 17:18:51 -04:00
Lincoln Stein	eaadc55c7d	make pause/resume work in multithreaded environment	2024-04-16 16:55:56 -04:00
Lincoln Stein	89f8326c0b	Merge branch 'lstein/feat/multi-gpu' of github.com:invoke-ai/InvokeAI into lstein/feat/multi-gpu	2024-04-16 16:27:08 -04:00
Lincoln Stein	99558de178	device selection calls go through TorchDevice	2024-04-16 16:26:58 -04:00
Lincoln Stein	77130f108d	Merge branch 'main' into lstein/feat/multi-gpu	2024-04-16 16:14:27 -04:00
Lincoln Stein	371f5bc782	simplify logic for retrieving execution devices	2024-04-16 15:52:03 -04:00
Lincoln Stein	fb9b7fb63a	make object_serializer._new_name() thread-safe; add max_threads config	2024-04-16 15:23:49 -04:00
Lincoln Stein	bd833900a3	add tid to cache name to avoid non-safe uuid4 on windows	2024-04-16 15:02:06 -04:00
Lincoln Stein	a84f3058e2	revert object_serializer_forward_cache.py	2024-04-15 22:28:48 -04:00
Lincoln Stein	f7436f3bae	fixup config_default; patch TorchDevice to work dynamically	2024-04-15 22:15:50 -04:00
Lincoln Stein	7dd93cb810	fix merge issues; likely nonfunctional	2024-04-15 21:16:21 -04:00
Lincoln Stein	9adb15f86c	working but filled with debug statements	2024-04-01 18:44:24 -04:00
Lincoln Stein	3d69372785	implement session-level reservation of gpus	2024-04-01 16:01:43 -04:00
Lincoln Stein	eca29c41d0	added notes	2024-04-01 13:30:02 -04:00
Lincoln Stein	9df0980c46	parallel processing working on single-GPU, not tested on multi	2024-04-01 00:07:47 -04:00
Lincoln Stein	cef51ad80d	Merge branch 'psyche/fix/nodes/processor-cpu-usage' into lstein/feat/multi-gpu	2024-03-31 17:05:23 -04:00
Lincoln Stein	83356ec74c	fix merge conflicts	2024-03-31 17:04:57 -04:00
Lincoln Stein	9336a076de	add locking around thread-critical sections	2024-03-31 16:58:56 -04:00
psychedelicious	32d3e4dc5c	feat(nodes): simplify processor loop with an early continue Prefer an early return/continue to reduce the indentation of the processor loop. Easier to read. There are other ways to improve its structure but at first glance, they seem to involve changing the logic in scarier ways.	2024-04-01 07:55:42 +11:00
Lincoln Stein	a1dcab9c38	remove references to vram_cache in tests	2024-03-31 16:52:01 -04:00
psychedelicious	bd9b00a6bf	fix(nodes): 100% cpu usage when processor paused Should be waiting on the resume event instead of checking it in a loop	2024-04-01 07:45:36 +11:00
Lincoln Stein	eaa2c68693	remove vram_cache and don't move VRAM models back into CPU	2024-03-31 16:37:13 -04:00
Lincoln Stein	24d73280ee	Merge branch 'main' into lstein/feat/multi-gpu	2024-03-21 20:29:38 -04:00
Lincoln Stein	6b991a5269	add draft multi-gpu support	2024-03-19 23:27:38 -04:00