From 5759a390f937f18f2532970901161bb7b36a32f0 Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Sun, 9 Jul 2023 18:35:04 -0400
Subject: [PATCH] introduce gpu_mem_reserved configuration parameter

---
 invokeai/app/services/config.py                  |  4 +++-
 invokeai/backend/model_management/model_cache.py | 15 ++++++++++-----
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/invokeai/app/services/config.py b/invokeai/app/services/config.py
index 9c67eeca4c..921f01665c 100644
--- a/invokeai/app/services/config.py
+++ b/invokeai/app/services/config.py
@@ -23,9 +23,10 @@ InvokeAI:
     xformers_enabled: false
     sequential_guidance: false
     precision: float16
-    max_loaded_models: 4
+    max_cache_size: 6
     always_use_cpu: false
     free_gpu_mem: false
+    gpu_mem_reserved: 1.75
   Features:
     nsfw_checker: true
     restore: true
@@ -365,6 +366,7 @@ setting environment variables INVOKEAI_.
     free_gpu_mem        : bool = Field(default=False, description="If true, purge model from GPU after each generation.", category='Memory/Performance')
     max_loaded_models   : int = Field(default=3, gt=0, description="(DEPRECATED: use max_cache_size) Maximum number of models to keep in memory for rapid switching", category='Memory/Performance')
     max_cache_size      : float = Field(default=6.0, gt=0, description="Maximum memory amount used by model cache for rapid switching", category='Memory/Performance')
+    gpu_mem_reserved    : float = Field(default=1.75, ge=0, description="Amount of VRAM (in GB) to reserve for use during generation", category='Memory/Performance')
     precision           : Literal[tuple(['auto','float16','float32','autocast'])] = Field(default='float16',description='Floating point precision', category='Memory/Performance')
     sequential_guidance : bool = Field(default=False, description="Whether to calculate guidance in serial instead of in parallel, lowering memory requirements", category='Memory/Performance')
     xformers_enabled    : bool = Field(default=True, description="Enable/disable memory-efficient attention", category='Memory/Performance')
diff --git a/invokeai/backend/model_management/model_cache.py b/invokeai/backend/model_management/model_cache.py
index ecda15beac..f6d3c49bc0 100644
--- a/invokeai/backend/model_management/model_cache.py
+++ b/invokeai/backend/model_management/model_cache.py
@@ -36,6 +36,9 @@ from .models import BaseModelType, ModelType, SubModelType, ModelBase
 # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously
 DEFAULT_MAX_CACHE_SIZE = 6.0
 
+# Amount of GPU memory to hold in reserve for use by generations (GB)
+DEFAULT_GPU_MEM_RESERVED = 1.75
+
 # actual size of a gig
 GIG = 1073741824
 
@@ -88,6 +91,7 @@ class ModelCache(object):
         sequential_offload: bool=False,
         lazy_offloading: bool=True,
         sha_chunksize: int = 16777216,
+        gpu_mem_reserved: float=DEFAULT_GPU_MEM_RESERVED,
         logger: types.ModuleType = logger
     ):
         '''
@@ -99,12 +103,11 @@ class ModelCache(object):
         :param sequential_offload: Conserve VRAM by loading and unloading each stage of the pipeline sequentially
         :param sha_chunksize: Chunksize to use when calculating sha256 model hash
         '''
-        #max_cache_size = 9999
         self.model_infos: Dict[str, ModelBase] = dict()
         self.lazy_offloading = lazy_offloading
-        #self.sequential_offload: bool=sequential_offload
         self.precision: torch.dtype=precision
-        self.max_cache_size: int=max_cache_size
+        self.max_cache_size: float=max_cache_size
+        self.gpu_mem_reserved: float=gpu_mem_reserved
         self.execution_device: torch.device=execution_device
         self.storage_device: torch.device=storage_device
         self.sha_chunksize=sha_chunksize
@@ -346,11 +349,13 @@ class ModelCache(object):
        self.logger.debug(f"After unloading: cached_models={len(self._cached_models)}")
 
     def _offload_unlocked_models(self, size_needed: int=0):
+        reserved = self.gpu_mem_reserved * GIG
         for model_key, cache_entry in sorted(self._cached_models.items(), key=lambda x:x[1].size):
             free_mem, used_mem = torch.cuda.mem_get_info()
-            self.logger.debug(f'Require {(size_needed/GIG):.2f}GB VRAM. Have {(free_mem/GIG):.2f}GB available.')
+            free_mem -= reserved
+            self.logger.debug(f'Require {(size_needed/GIG):.2f}GB VRAM. Have {(free_mem/GIG):.2f}GB available ({(reserved/GIG):.2f} reserved).')
             if free_mem > size_needed:
-                return
+                break
             if not cache_entry.locked and cache_entry.loaded:
                 self.logger.debug(f'Offloading {model_key} from {self.execution_device} into {self.storage_device}')
                 with VRAMUsage() as mem:
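
Reviewer note: below is a minimal, self-contained sketch of the headroom check this patch adds to _offload_unlocked_models. Models are offloaded smallest-first until free VRAM, minus the configured reserve, covers the pending request. OffloadEntry and offload_until_fit are illustrative names, not InvokeAI API; only the loop structure mirrors the diff above.

# Standalone sketch of the reserved-headroom logic added in
# _offload_unlocked_models. OffloadEntry and offload_until_fit are
# illustrative stand-ins, not part of InvokeAI.
from dataclasses import dataclass

import torch

GIG = 1073741824  # bytes per GiB, matching model_cache.py


@dataclass
class OffloadEntry:
    model: torch.nn.Module
    size: int      # bytes the model occupies on the GPU
    locked: bool   # True while the model is in active use
    loaded: bool   # True while the model resides on the GPU


def offload_until_fit(cached: dict, size_needed: int, gpu_mem_reserved: float = 1.75):
    """Offload unlocked models, smallest first, until free VRAM minus
    the reserve exceeds size_needed (all sizes in bytes)."""
    reserved = int(gpu_mem_reserved * GIG)
    for key, entry in sorted(cached.items(), key=lambda x: x[1].size):
        free_mem, _total_mem = torch.cuda.mem_get_info()  # (free, total) bytes
        free_mem -= reserved  # treat the reserve as permanently unavailable
        if free_mem > size_needed:
            break  # enough headroom for the pending generation
        if not entry.locked and entry.loaded:
            entry.model.to('cpu')  # stand-in for the real offload path
            entry.loaded = False

Because the reserve is subtracted from free_mem before the comparison, models keep being offloaded even when raw free VRAM already exceeds size_needed, so roughly gpu_mem_reserved GB stays available for the generation's working memory.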
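
Usage, under the same assumptions: the constructor keywords below are the ones visible in the diff (max_cache_size, precision, execution_device, storage_device, gpu_mem_reserved); every other parameter is left at its default. End users would instead set gpu_mem_reserved in invokeai.yaml, as in the updated sample at the top of config.py.

# Assumes the import path and constructor keywords shown in the diff.
import torch

from invokeai.backend.model_management.model_cache import ModelCache

cache = ModelCache(
    max_cache_size=6.0,                     # GB of RAM for cached models
    execution_device=torch.device('cuda'),
    storage_device=torch.device('cpu'),
    precision=torch.float16,
    gpu_mem_reserved=2.0,                   # hold back 2 GB of VRAM for generation
)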