Mirror of https://github.com/invoke-ai/InvokeAI.git, synced 2026-01-23 02:08:23 -05:00

Compare commits: v4.2.9.dev...ryan/flux (14 commits)
| SHA1 |
|---|
| a8a2fc106d |
| d23ad1818d |
| 4181ab654b |
| 1c97360f9f |
| 74d6fceeb6 |
| 766ddc18dc |
| e6ff7488a1 |
| 89a652cfcd |
| b227b9059d |
| 3599a4a3e4 |
| 5dd619e137 |
| 7d447cbb88 |
| 3bbba7e4b1 |
| b1845019fe |
invokeai/app/invocations/flux_text_to_image.py (new file, 278 lines)

```python
from pathlib import Path
from typing import Literal

import torch
from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler
from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel
from diffusers.pipelines.flux.pipeline_flux import FluxPipeline
from optimum.quanto import qfloat8
from PIL import Image
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
from transformers.models.auto import AutoModelForTextEncoding

from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
from invokeai.app.invocations.fields import InputField, WithBoard, WithMetadata
from invokeai.app.invocations.primitives import ImageOutput
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.quantization.fast_quantized_diffusion_model import FastQuantizedDiffusersModel
from invokeai.backend.quantization.fast_quantized_transformers_model import FastQuantizedTransformersModel
from invokeai.backend.util.devices import TorchDevice

TFluxModelKeys = Literal["flux-schnell"]
FLUX_MODELS: dict[TFluxModelKeys, str] = {"flux-schnell": "black-forest-labs/FLUX.1-schnell"}


class QuantizedFluxTransformer2DModel(FastQuantizedDiffusersModel):
    base_class = FluxTransformer2DModel


class QuantizedModelForTextEncoding(FastQuantizedTransformersModel):
    auto_class = AutoModelForTextEncoding


@invocation(
    "flux_text_to_image",
    title="FLUX Text to Image",
    tags=["image"],
    category="image",
    version="1.0.0",
)
class FluxTextToImageInvocation(BaseInvocation, WithMetadata, WithBoard):
    """Text-to-image generation using a FLUX model."""

    model: TFluxModelKeys = InputField(description="The FLUX model to use for text-to-image generation.")
    use_8bit: bool = InputField(
        default=False, description="Whether to quantize the transformer model to 8-bit precision."
    )
    positive_prompt: str = InputField(description="Positive prompt for text-to-image generation.")
    width: int = InputField(default=1024, multiple_of=16, description="Width of the generated image.")
    height: int = InputField(default=1024, multiple_of=16, description="Height of the generated image.")
    num_steps: int = InputField(default=4, description="Number of diffusion steps.")
    guidance: float = InputField(
        default=4.0,
        description="The guidance strength. Higher values adhere more strictly to the prompt, and will produce less diverse images.",
    )
    seed: int = InputField(default=0, description="Randomness seed for reproducibility.")

    @torch.no_grad()
    def invoke(self, context: InvocationContext) -> ImageOutput:
        model_path = context.models.download_and_cache_model(FLUX_MODELS[self.model])

        t5_embeddings, clip_embeddings = self._encode_prompt(context, model_path)
        latents = self._run_diffusion(context, model_path, clip_embeddings, t5_embeddings)
        image = self._run_vae_decoding(context, model_path, latents)
        image_dto = context.images.save(image=image)
        return ImageOutput.build(image_dto)

    def _encode_prompt(self, context: InvocationContext, flux_model_dir: Path) -> tuple[torch.Tensor, torch.Tensor]:
        # Determine the T5 max sequence length based on the model.
        if self.model == "flux-schnell":
            max_seq_len = 256
        # elif self.model == "flux-dev":
        #     max_seq_len = 512
        else:
            raise ValueError(f"Unknown model: {self.model}")

        # Load the CLIP tokenizer.
        clip_tokenizer_path = flux_model_dir / "tokenizer"
        clip_tokenizer = CLIPTokenizer.from_pretrained(clip_tokenizer_path, local_files_only=True)
        assert isinstance(clip_tokenizer, CLIPTokenizer)

        # Load the T5 tokenizer.
        t5_tokenizer_path = flux_model_dir / "tokenizer_2"
        t5_tokenizer = T5TokenizerFast.from_pretrained(t5_tokenizer_path, local_files_only=True)
        assert isinstance(t5_tokenizer, T5TokenizerFast)

        clip_text_encoder_path = flux_model_dir / "text_encoder"
        t5_text_encoder_path = flux_model_dir / "text_encoder_2"
        with (
            context.models.load_local_model(
                model_path=clip_text_encoder_path, loader=self._load_flux_text_encoder
            ) as clip_text_encoder,
            context.models.load_local_model(
                model_path=t5_text_encoder_path, loader=self._load_flux_text_encoder_2
            ) as t5_text_encoder,
        ):
            assert isinstance(clip_text_encoder, CLIPTextModel)
            assert isinstance(t5_text_encoder, T5EncoderModel)
            pipeline = FluxPipeline(
                scheduler=None,
                vae=None,
                text_encoder=clip_text_encoder,
                tokenizer=clip_tokenizer,
                text_encoder_2=t5_text_encoder,
                tokenizer_2=t5_tokenizer,
                transformer=None,
            )

            # prompt_embeds: T5 embeddings
            # pooled_prompt_embeds: CLIP embeddings
            prompt_embeds, pooled_prompt_embeds, text_ids = pipeline.encode_prompt(
                prompt=self.positive_prompt,
                prompt_2=self.positive_prompt,
                device=TorchDevice.choose_torch_device(),
                max_sequence_length=max_seq_len,
            )

        assert isinstance(prompt_embeds, torch.Tensor)
        assert isinstance(pooled_prompt_embeds, torch.Tensor)
        return prompt_embeds, pooled_prompt_embeds

    def _run_diffusion(
        self,
        context: InvocationContext,
        flux_model_dir: Path,
        clip_embeddings: torch.Tensor,
        t5_embeddings: torch.Tensor,
    ):
        scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(flux_model_dir / "scheduler", local_files_only=True)

        # HACK(ryand): Manually empty the cache. Currently we don't check the size of the model before loading it from
        # disk. Since the transformer model is large (24GB), there's a good chance that it will OOM on 32GB RAM systems
        # if the cache is not empty.
        context.models._services.model_manager.load.ram_cache.make_room(24 * 2**30)

        transformer_path = flux_model_dir / "transformer"
        with context.models.load_local_model(
            model_path=transformer_path, loader=self._load_flux_transformer
        ) as transformer:
            assert isinstance(transformer, FluxTransformer2DModel)

            flux_pipeline_with_transformer = FluxPipeline(
                scheduler=scheduler,
                vae=None,
                text_encoder=None,
                tokenizer=None,
                text_encoder_2=None,
                tokenizer_2=None,
                transformer=transformer,
            )

            t5_embeddings = t5_embeddings.to(dtype=transformer.dtype)
            clip_embeddings = clip_embeddings.to(dtype=transformer.dtype)

            latents = flux_pipeline_with_transformer(
                height=self.height,
                width=self.width,
                num_inference_steps=self.num_steps,
                guidance_scale=self.guidance,
                generator=torch.Generator().manual_seed(self.seed),
                prompt_embeds=t5_embeddings,
                pooled_prompt_embeds=clip_embeddings,
                output_type="latent",
                return_dict=False,
            )[0]

        assert isinstance(latents, torch.Tensor)
        return latents

    def _run_vae_decoding(
        self,
        context: InvocationContext,
        flux_model_dir: Path,
        latents: torch.Tensor,
    ) -> Image.Image:
        vae_path = flux_model_dir / "vae"
        with context.models.load_local_model(model_path=vae_path, loader=self._load_flux_vae) as vae:
            assert isinstance(vae, AutoencoderKL)

            flux_pipeline_with_vae = FluxPipeline(
                scheduler=None,
                vae=vae,
                text_encoder=None,
                tokenizer=None,
                text_encoder_2=None,
                tokenizer_2=None,
                transformer=None,
            )

            latents = flux_pipeline_with_vae._unpack_latents(
                latents, self.height, self.width, flux_pipeline_with_vae.vae_scale_factor
            )
            latents = (
                latents / flux_pipeline_with_vae.vae.config.scaling_factor
            ) + flux_pipeline_with_vae.vae.config.shift_factor
            latents = latents.to(dtype=vae.dtype)
            image = flux_pipeline_with_vae.vae.decode(latents, return_dict=False)[0]
            image = flux_pipeline_with_vae.image_processor.postprocess(image, output_type="pil")[0]

        assert isinstance(image, Image.Image)
        return image

    @staticmethod
    def _load_flux_text_encoder(path: Path) -> CLIPTextModel:
        model = CLIPTextModel.from_pretrained(path, local_files_only=True)
        assert isinstance(model, CLIPTextModel)
        return model

    def _load_flux_text_encoder_2(self, path: Path) -> T5EncoderModel:
        if self.use_8bit:
            model_8bit_path = path / "quantized"
            if model_8bit_path.exists():
                # The quantized model exists, load it.
                # TODO(ryand): The requantize(...) operation in from_pretrained(...) is very slow. This seems like
                # something that we should be able to make much faster.
                q_model = QuantizedModelForTextEncoding.from_pretrained(model_8bit_path)

                # Access the underlying wrapped model.
                # We access the wrapped model, even though it is private, because it simplifies the type checking by
                # always returning a T5EncoderModel from this function.
                model = q_model._wrapped
            else:
                # The quantized model does not exist yet, quantize and save it.
                # TODO(ryand): dtype?
                model = T5EncoderModel.from_pretrained(path, local_files_only=True)
                assert isinstance(model, T5EncoderModel)

                q_model = QuantizedModelForTextEncoding.quantize(model, weights=qfloat8)

                model_8bit_path.mkdir(parents=True, exist_ok=True)
                q_model.save_pretrained(model_8bit_path)

                # (See earlier comment about accessing the wrapped model.)
                model = q_model._wrapped
        else:
            model = T5EncoderModel.from_pretrained(path, local_files_only=True)

        assert isinstance(model, T5EncoderModel)
        return model

    def _load_flux_transformer(self, path: Path) -> FluxTransformer2DModel:
        if self.use_8bit:
            model_8bit_path = path / "quantized"
            if model_8bit_path.exists():
                # The quantized model exists, load it.
                # TODO(ryand): The requantize(...) operation in from_pretrained(...) is very slow. This seems like
                # something that we should be able to make much faster.
                q_model = QuantizedFluxTransformer2DModel.from_pretrained(model_8bit_path)

                # Access the underlying wrapped model.
                # We access the wrapped model, even though it is private, because it simplifies the type checking by
                # always returning a FluxTransformer2DModel from this function.
                model = q_model._wrapped
            else:
                # The quantized model does not exist yet, quantize and save it.
                # TODO(ryand): Loading in float16 and then quantizing seems to result in NaNs. In order to run this on
                # GPUs that don't support bfloat16, we would need to host the quantized model instead of generating it
                # here.
                model = FluxTransformer2DModel.from_pretrained(path, local_files_only=True, torch_dtype=torch.bfloat16)
                assert isinstance(model, FluxTransformer2DModel)

                q_model = QuantizedFluxTransformer2DModel.quantize(model, weights=qfloat8)

                model_8bit_path.mkdir(parents=True, exist_ok=True)
                q_model.save_pretrained(model_8bit_path)

                # (See earlier comment about accessing the wrapped model.)
                model = q_model._wrapped
        else:
            model = FluxTransformer2DModel.from_pretrained(path, local_files_only=True, torch_dtype=torch.bfloat16)

        assert isinstance(model, FluxTransformer2DModel)
        return model

    @staticmethod
    def _load_flux_vae(path: Path) -> AutoencoderKL:
        model = AutoencoderKL.from_pretrained(path, local_files_only=True)
        assert isinstance(model, AutoencoderKL)
        return model
```
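The invocation above never keeps more than one large FLUX sub-model in memory at a time: prompt encoding runs with only the tokenizers and text encoders loaded, denoising with only the transformer, and decoding with only the VAE. The sketch below illustrates the same staging against diffusers' `FluxPipeline` directly; it is not part of the diff, and the repo id, prompt, and dtype are placeholder assumptions.

```python
# Hedged sketch of the staged loading used by FluxTextToImageInvocation, written against
# diffusers' FluxPipeline directly. Repo id, prompt, and dtype are placeholders.
import torch
from diffusers.pipelines.flux.pipeline_flux import FluxPipeline

repo = "black-forest-labs/FLUX.1-schnell"

# Stage 1: encode the prompt with only the tokenizers and text encoders instantiated.
pipe = FluxPipeline.from_pretrained(repo, transformer=None, vae=None, torch_dtype=torch.bfloat16)
prompt_embeds, pooled_prompt_embeds, _ = pipe.encode_prompt(
    prompt="a photo of a cat", prompt_2="a photo of a cat", max_sequence_length=256
)
del pipe  # free the text encoders before the (much larger) transformer is loaded

# Stages 2 and 3 follow the same pattern: rebuild the pipeline with only the transformer
# (denoising to latents), then with only the VAE (decoding), passing the cached
# prompt_embeds / pooled_prompt_embeds instead of raw text.
```

Inside InvokeAI, the node achieves the same effect with `context.models.load_local_model(...)`, which additionally lets the model manager cache and evict each component.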
invokeai/backend/load_flux_model.py (new file, 129 lines)

```python
import json
import os
import time
from pathlib import Path
from typing import Union

import torch
from diffusers.models.model_loading_utils import load_state_dict
from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel
from diffusers.utils import (
    CONFIG_NAME,
    SAFE_WEIGHTS_INDEX_NAME,
    SAFETENSORS_WEIGHTS_NAME,
    _get_checkpoint_shard_files,
    is_accelerate_available,
)
from optimum.quanto import qfloat8
from optimum.quanto.models import QuantizedDiffusersModel
from optimum.quanto.models.shared_dict import ShardedStateDict

from invokeai.backend.requantize import requantize


class QuantizedFluxTransformer2DModel(QuantizedDiffusersModel):
    base_class = FluxTransformer2DModel

    @classmethod
    def from_pretrained(cls, model_name_or_path: Union[str, os.PathLike]):
        if cls.base_class is None:
            raise ValueError("The `base_class` attribute needs to be configured.")

        if not is_accelerate_available():
            raise ValueError("Reloading a quantized diffusers model requires the accelerate library.")
        from accelerate import init_empty_weights

        if os.path.isdir(model_name_or_path):
            # Look for a quantization map
            qmap_path = os.path.join(model_name_or_path, cls._qmap_name())
            if not os.path.exists(qmap_path):
                raise ValueError(f"No quantization map found in {model_name_or_path}: is this a quantized model ?")

            # Look for original model config file.
            model_config_path = os.path.join(model_name_or_path, CONFIG_NAME)
            if not os.path.exists(model_config_path):
                raise ValueError(f"{CONFIG_NAME} not found in {model_name_or_path}.")

            with open(qmap_path, "r", encoding="utf-8") as f:
                qmap = json.load(f)

            with open(model_config_path, "r", encoding="utf-8") as f:
                original_model_cls_name = json.load(f)["_class_name"]
            configured_cls_name = cls.base_class.__name__
            if configured_cls_name != original_model_cls_name:
                raise ValueError(
                    f"Configured base class ({configured_cls_name}) differs from what was derived from the provided configuration ({original_model_cls_name})."
                )

            # Create an empty model
            config = cls.base_class.load_config(model_name_or_path)
            with init_empty_weights():
                model = cls.base_class.from_config(config)

            # Look for the index of a sharded checkpoint
            checkpoint_file = os.path.join(model_name_or_path, SAFE_WEIGHTS_INDEX_NAME)
            if os.path.exists(checkpoint_file):
                # Convert the checkpoint path to a list of shards
                _, sharded_metadata = _get_checkpoint_shard_files(model_name_or_path, checkpoint_file)
                # Create a mapping for the sharded safetensor files
                state_dict = ShardedStateDict(model_name_or_path, sharded_metadata["weight_map"])
            else:
                # Look for a single checkpoint file
                checkpoint_file = os.path.join(model_name_or_path, SAFETENSORS_WEIGHTS_NAME)
                if not os.path.exists(checkpoint_file):
                    raise ValueError(f"No safetensor weights found in {model_name_or_path}.")
                # Get state_dict from model checkpoint
                state_dict = load_state_dict(checkpoint_file)

            # Requantize and load quantized weights from state_dict
            requantize(model, state_dict=state_dict, quantization_map=qmap)
            model.eval()
            return cls(model)
        else:
            raise NotImplementedError("Reloading quantized models directly from the hub is not supported yet.")


def load_flux_transformer(path: Path) -> FluxTransformer2DModel:
    # model = FluxTransformer2DModel.from_pretrained(path, local_files_only=True, torch_dtype=torch.bfloat16)
    model_8bit_path = path / "quantized"
    if model_8bit_path.exists():
        # The quantized model exists, load it.
        # TODO(ryand): The requantize(...) operation in from_pretrained(...) is very slow. This seems like
        # something that we should be able to make much faster.
        q_model = QuantizedFluxTransformer2DModel.from_pretrained(model_8bit_path)

        # Access the underlying wrapped model.
        # We access the wrapped model, even though it is private, because it simplifies the type checking by
        # always returning a FluxTransformer2DModel from this function.
        model = q_model._wrapped
    else:
        # The quantized model does not exist yet, quantize and save it.
        # TODO(ryand): Loading in float16 and then quantizing seems to result in NaNs. In order to run this on
        # GPUs that don't support bfloat16, we would need to host the quantized model instead of generating it
        # here.
        model = FluxTransformer2DModel.from_pretrained(path, local_files_only=True, torch_dtype=torch.bfloat16)
        assert isinstance(model, FluxTransformer2DModel)

        q_model = QuantizedFluxTransformer2DModel.quantize(model, weights=qfloat8)

        model_8bit_path.mkdir(parents=True, exist_ok=True)
        q_model.save_pretrained(model_8bit_path)

        # (See earlier comment about accessing the wrapped model.)
        model = q_model._wrapped

    assert isinstance(model, FluxTransformer2DModel)
    return model


def main():
    start = time.time()
    model = load_flux_transformer(
        Path("/data/invokeai/models/.download_cache/black-forest-labs_flux.1-schnell/FLUX.1-schnell/transformer/")
    )
    print(f"Time to load: {time.time() - start}s")
    print("hi")


if __name__ == "__main__":
    main()
```
Changes to the Hugging Face file-selection helpers `filter_files()` and `_filter_by_variant()`:

```diff
@@ -54,6 +54,7 @@ def filter_files(
                 "lora_weights.safetensors",
                 "weights.pb",
                 "onnx_data",
+                "spiece.model",  # Added for `black-forest-labs/FLUX.1-schnell`.
             )
         ):
             paths.append(file)
@@ -62,7 +63,7 @@ def filter_files(
         # downloading random checkpoints that might also be in the repo. However there is no guarantee
         # that a checkpoint doesn't contain "model" in its name, and no guarantee that future diffusers models
         # will adhere to this naming convention, so this is an area to be careful of.
-        elif re.search(r"model(\.[^.]+)?\.(safetensors|bin|onnx|xml|pth|pt|ckpt|msgpack)$", file.name):
+        elif re.search(r"model.*\.(safetensors|bin|onnx|xml|pth|pt|ckpt|msgpack)$", file.name):
            paths.append(file)

    # limit search to subfolder if requested
@@ -97,7 +98,9 @@ def _filter_by_variant(files: List[Path], variant: ModelRepoVariant) -> Set[Path
            if variant == ModelRepoVariant.Flax:
                result.add(path)

-        elif path.suffix in [".json", ".txt"]:
+        # Note: '.model' was added to support:
+        # https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/768d12a373ed5cc9ef9a9dea7504dc09fcc14842/tokenizer_2/spiece.model
+        elif path.suffix in [".json", ".txt", ".model"]:
            result.add(path)

        elif variant in [
@@ -140,6 +143,23 @@ def _filter_by_variant(files: List[Path], variant: ModelRepoVariant) -> Set[Path
                continue

    for candidate_list in subfolder_weights.values():
+        # Check if at least one of the files has the explicit fp16 variant.
+        at_least_one_fp16 = False
+        for candidate in candidate_list:
+            if len(candidate.path.suffixes) == 2 and candidate.path.suffixes[0] == ".fp16":
+                at_least_one_fp16 = True
+                break
+
+        if not at_least_one_fp16:
+            # If none of the candidates in this candidate_list have the explicit fp16 variant label, then this
+            # candidate_list probably doesn't adhere to the variant naming convention that we expected. In this case,
+            # we'll simply keep all the candidates. An example of a model that hits this case is
+            # `black-forest-labs/FLUX.1-schnell` (as of commit 012d2fd).
+            for candidate in candidate_list:
+                result.add(candidate.path)
+
+        # The candidate_list seems to have the expected variant naming convention. We'll select the highest scoring
+        # candidate.
        highest_score_candidate = max(candidate_list, key=lambda candidate: candidate.score)
        if highest_score_candidate:
            result.add(highest_score_candidate.path)
```
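The loosened regex in the second hunk is what lets the sharded T5 checkpoints in FLUX.1-schnell (for example `model-00001-of-00002.safetensors`) be selected; the old pattern only tolerated a single `.variant` suffix between `model` and the extension. A quick check, as an illustrative sketch (not part of the diff):

```python
import re

# Compare the old and new patterns from the hunk above against a FLUX T5 shard name.
old = r"model(\.[^.]+)?\.(safetensors|bin|onnx|xml|pth|pt|ckpt|msgpack)$"
new = r"model.*\.(safetensors|bin|onnx|xml|pth|pt|ckpt|msgpack)$"

name = "model-00001-of-00002.safetensors"  # text_encoder_2 shard in FLUX.1-schnell
print(bool(re.search(old, name)))  # False: only "model.safetensors" or "model.<variant>.safetensors" matched
print(bool(re.search(new, name)))  # True: anything between "model" and the extension is now accepted
```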
invokeai/backend/quantization/fast_quantized_diffusion_model.py (new file, 77 lines)

```python
import json
import os
from typing import Union

from diffusers.models.model_loading_utils import load_state_dict
from diffusers.utils import (
    CONFIG_NAME,
    SAFE_WEIGHTS_INDEX_NAME,
    SAFETENSORS_WEIGHTS_NAME,
    _get_checkpoint_shard_files,
    is_accelerate_available,
)
from optimum.quanto.models import QuantizedDiffusersModel
from optimum.quanto.models.shared_dict import ShardedStateDict

from invokeai.backend.requantize import requantize


class FastQuantizedDiffusersModel(QuantizedDiffusersModel):
    @classmethod
    def from_pretrained(cls, model_name_or_path: Union[str, os.PathLike]):
        """We override the `from_pretrained()` method in order to use our custom `requantize()` implementation."""
        if cls.base_class is None:
            raise ValueError("The `base_class` attribute needs to be configured.")

        if not is_accelerate_available():
            raise ValueError("Reloading a quantized diffusers model requires the accelerate library.")
        from accelerate import init_empty_weights

        if os.path.isdir(model_name_or_path):
            # Look for a quantization map
            qmap_path = os.path.join(model_name_or_path, cls._qmap_name())
            if not os.path.exists(qmap_path):
                raise ValueError(f"No quantization map found in {model_name_or_path}: is this a quantized model ?")

            # Look for original model config file.
            model_config_path = os.path.join(model_name_or_path, CONFIG_NAME)
            if not os.path.exists(model_config_path):
                raise ValueError(f"{CONFIG_NAME} not found in {model_name_or_path}.")

            with open(qmap_path, "r", encoding="utf-8") as f:
                qmap = json.load(f)

            with open(model_config_path, "r", encoding="utf-8") as f:
                original_model_cls_name = json.load(f)["_class_name"]
            configured_cls_name = cls.base_class.__name__
            if configured_cls_name != original_model_cls_name:
                raise ValueError(
                    f"Configured base class ({configured_cls_name}) differs from what was derived from the provided configuration ({original_model_cls_name})."
                )

            # Create an empty model
            config = cls.base_class.load_config(model_name_or_path)
            with init_empty_weights():
                model = cls.base_class.from_config(config)

            # Look for the index of a sharded checkpoint
            checkpoint_file = os.path.join(model_name_or_path, SAFE_WEIGHTS_INDEX_NAME)
            if os.path.exists(checkpoint_file):
                # Convert the checkpoint path to a list of shards
                _, sharded_metadata = _get_checkpoint_shard_files(model_name_or_path, checkpoint_file)
                # Create a mapping for the sharded safetensor files
                state_dict = ShardedStateDict(model_name_or_path, sharded_metadata["weight_map"])
            else:
                # Look for a single checkpoint file
                checkpoint_file = os.path.join(model_name_or_path, SAFETENSORS_WEIGHTS_NAME)
                if not os.path.exists(checkpoint_file):
                    raise ValueError(f"No safetensor weights found in {model_name_or_path}.")
                # Get state_dict from model checkpoint
                state_dict = load_state_dict(checkpoint_file)

            # Requantize and load quantized weights from state_dict
            requantize(model, state_dict=state_dict, quantization_map=qmap)
            model.eval()
            return cls(model)
        else:
            raise NotImplementedError("Reloading quantized models directly from the hub is not supported yet.")
```
invokeai/backend/quantization/fast_quantized_transformers_model.py (new file, 61 lines)

```python
import json
import os
from typing import Union

from optimum.quanto.models import QuantizedTransformersModel
from optimum.quanto.models.shared_dict import ShardedStateDict
from transformers import AutoConfig
from transformers.modeling_utils import get_checkpoint_shard_files, load_state_dict
from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, is_accelerate_available

from invokeai.backend.requantize import requantize


class FastQuantizedTransformersModel(QuantizedTransformersModel):
    @classmethod
    def from_pretrained(cls, model_name_or_path: Union[str, os.PathLike]):
        """We override the `from_pretrained()` method in order to use our custom `requantize()` implementation."""
        if cls.auto_class is None:
            raise ValueError(
                f"Quantized models cannot be reloaded using {cls}: use a specialized quantized class such as QuantizedModelForCausalLM instead."
            )
        if not is_accelerate_available():
            raise ValueError("Reloading a quantized transformers model requires the accelerate library.")
        from accelerate import init_empty_weights

        if os.path.isdir(model_name_or_path):
            # Look for a quantization map
            qmap_path = os.path.join(model_name_or_path, cls._qmap_name())
            if not os.path.exists(qmap_path):
                raise ValueError(f"No quantization map found in {model_name_or_path}: is this a quantized model ?")
            with open(qmap_path, "r", encoding="utf-8") as f:
                qmap = json.load(f)
            # Create an empty model
            config = AutoConfig.from_pretrained(model_name_or_path)
            with init_empty_weights():
                model = cls.auto_class.from_config(config)
            # Look for the index of a sharded checkpoint
            checkpoint_file = os.path.join(model_name_or_path, SAFE_WEIGHTS_INDEX_NAME)
            if os.path.exists(checkpoint_file):
                # Convert the checkpoint path to a list of shards
                checkpoint_file, sharded_metadata = get_checkpoint_shard_files(model_name_or_path, checkpoint_file)
                # Create a mapping for the sharded safetensor files
                state_dict = ShardedStateDict(model_name_or_path, sharded_metadata["weight_map"])
            else:
                # Look for a single checkpoint file
                checkpoint_file = os.path.join(model_name_or_path, SAFE_WEIGHTS_NAME)
                if not os.path.exists(checkpoint_file):
                    raise ValueError(f"No safetensor weights found in {model_name_or_path}.")
                # Get state_dict from model checkpoint
                state_dict = load_state_dict(checkpoint_file)
            # Requantize and load quantized weights from state_dict
            requantize(model, state_dict=state_dict, quantization_map=qmap)
            if getattr(model.config, "tie_word_embeddings", True):
                # Tie output weight embeddings to input weight embeddings
                # Note that if they were quantized they would NOT be tied
                model.tie_weights()
            # Set model in evaluation mode as it is done in transformers
            model.eval()
            return cls(model)
        else:
            raise NotImplementedError("Reloading quantized models directly from the hub is not supported yet.")
```
invokeai/backend/requantize.py (new file, 53 lines)

```python
from typing import Any, Dict

import torch
from optimum.quanto.quantize import _quantize_submodule

# def custom_freeze(model: torch.nn.Module):
#     for name, m in model.named_modules():
#         if isinstance(m, QModuleMixin):
#             m.weight =
#             m.freeze()


def requantize(
    model: torch.nn.Module,
    state_dict: Dict[str, Any],
    quantization_map: Dict[str, Dict[str, str]],
    device: torch.device = None,
):
    if device is None:
        device = next(model.parameters()).device
        if device.type == "meta":
            device = torch.device("cpu")

    # Quantize the model with parameters from the quantization map
    for name, m in model.named_modules():
        qconfig = quantization_map.get(name, None)
        if qconfig is not None:
            weights = qconfig["weights"]
            if weights == "none":
                weights = None
            activations = qconfig["activations"]
            if activations == "none":
                activations = None
            _quantize_submodule(model, name, m, weights=weights, activations=activations)

    # Move model parameters and buffers to CPU before materializing quantized weights
    for name, m in model.named_modules():

        def move_tensor(t, device):
            if t.device.type == "meta":
                return torch.empty_like(t, device=device)
            return t.to(device)

        for name, param in m.named_parameters(recurse=False):
            setattr(m, name, torch.nn.Parameter(move_tensor(param, "cpu")))
        for name, param in m.named_buffers(recurse=False):
            setattr(m, name, move_tensor(param, "cpu"))

    # Freeze model and move to target device
    # freeze(model)
    # model.to(device)

    # Load the quantized model weights
    model.load_state_dict(state_dict, strict=False)
```
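For orientation, the save/reload cycle that `requantize()` supports looks roughly like the sketch below. It is illustrative only (a toy module rather than a FLUX component); the wrapper classes above perform the reload half of this cycle when they call `requantize()` inside `from_pretrained()`.

```python
# Hedged sketch: quantize a small module with optimum.quanto, capture its state dict and
# quantization map, then rebuild an equivalent quantized module via requantize().
import torch
from optimum.quanto import freeze, qfloat8, quantization_map, quantize

from invokeai.backend.requantize import requantize

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8))
quantize(model, weights=qfloat8)
freeze(model)  # materialize the quantized weights
state_dict = model.state_dict()
qmap = quantization_map(model)  # records which submodules were quantized, and how

# Later (e.g. after loading state_dict and qmap from disk): rebuild on a fresh module.
fresh = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8))
requantize(fresh, state_dict=state_dict, quantization_map=qmap)
```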
`ControlNetModel` import update for the newer diffusers pin (`FromOriginalControlNetMixin` was replaced by `FromOriginalModelMixin`):

```diff
@@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 import diffusers
 import torch
 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.loaders import FromOriginalControlNetMixin
+from diffusers.loaders.single_file_model import FromOriginalModelMixin
 from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor
 from diffusers.models.controlnet import ControlNetConditioningEmbedding, ControlNetOutput, zero_module
 from diffusers.models.embeddings import (
@@ -32,7 +32,7 @@ from invokeai.backend.util.logging import InvokeAILogger
 logger = InvokeAILogger.get_logger(__name__)
 
 
-class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlNetMixin):
+class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
     """
     A ControlNet model.
 
```
|||||||
]
|
]
|
||||||
dependencies = [
|
dependencies = [
|
||||||
# Core generation dependencies, pinned for reproducible builds.
|
# Core generation dependencies, pinned for reproducible builds.
|
||||||
"accelerate==0.30.1",
|
"accelerate==0.33.0",
|
||||||
"clip_anytorch==2.6.0", # replacing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip",
|
"clip_anytorch==2.6.0", # replacing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip",
|
||||||
"compel==2.0.2",
|
"compel==2.0.2",
|
||||||
"controlnet-aux==0.0.7",
|
"controlnet-aux==0.0.7",
|
||||||
"diffusers[torch]==0.27.2",
|
# TODO(ryand): Bump this once the next diffusers release is ready.
|
||||||
|
"diffusers[torch] @ git+https://github.com/huggingface/diffusers.git@4c6152c2fb0ade468aadb417102605a07a8635d3",
|
||||||
"invisible-watermark==0.2.0", # needed to install SDXL base and refiner using their repo_ids
|
"invisible-watermark==0.2.0", # needed to install SDXL base and refiner using their repo_ids
|
||||||
"mediapipe==0.10.7", # needed for "mediapipeface" controlnet model
|
"mediapipe==0.10.7", # needed for "mediapipeface" controlnet model
|
||||||
"numpy==1.26.4", # >1.24.0 is needed to use the 'strict' argument to np.testing.assert_array_equal()
|
"numpy==1.26.4", # >1.24.0 is needed to use the 'strict' argument to np.testing.assert_array_equal()
|
||||||
"onnx==1.15.0",
|
"onnx==1.15.0",
|
||||||
"onnxruntime==1.16.3",
|
"onnxruntime==1.16.3",
|
||||||
"opencv-python==4.9.0.80",
|
"opencv-python==4.9.0.80",
|
||||||
|
"optimum-quanto==0.2.4",
|
||||||
"pytorch-lightning==2.1.3",
|
"pytorch-lightning==2.1.3",
|
||||||
"safetensors==0.4.3",
|
"safetensors==0.4.3",
|
||||||
|
# sentencepiece is required to load T5TokenizerFast (used by FLUX).
|
||||||
|
"sentencepiece==0.2.0",
|
||||||
"spandrel==0.3.4",
|
"spandrel==0.3.4",
|
||||||
"timm==0.6.13", # needed to override timm latest in controlnet_aux, see https://github.com/isl-org/ZoeDepth/issues/26
|
"timm==0.6.13", # needed to override timm latest in controlnet_aux, see https://github.com/isl-org/ZoeDepth/issues/26
|
||||||
"torch==2.2.2",
|
"torch==2.4.0",
|
||||||
"torchmetrics==0.11.4",
|
"torchmetrics==0.11.4",
|
||||||
"torchsde==0.2.6",
|
"torchsde==0.2.6",
|
||||||
"torchvision==0.17.2",
|
"torchvision==0.19.0",
|
||||||
"transformers==4.41.1",
|
"transformers==4.41.1",
|
||||||
|
|
||||||
# Core application dependencies, pinned for reproducible builds.
|
# Core application dependencies, pinned for reproducible builds.
|
||||||
"fastapi-events==0.11.1",
|
"fastapi-events==0.11.1",
|
||||||
"fastapi==0.111.0",
|
"fastapi==0.111.0",
|
||||||
"huggingface-hub==0.23.1",
|
"huggingface-hub==0.24.5",
|
||||||
"pydantic-settings==2.2.1",
|
"pydantic-settings==2.2.1",
|
||||||
"pydantic==2.7.2",
|
"pydantic==2.7.2",
|
||||||
"python-socketio==5.11.1",
|
"python-socketio==5.11.1",
|
||||||
|
|||||||
New test coverage for FLUX.1-schnell file selection:

```diff
@@ -326,3 +326,80 @@ def test_select_multiple_weights(
 ) -> None:
     filtered_files = filter_files(sd15_test_files, variant)
     assert set(filtered_files) == {Path(f) for f in expected_files}
+
+
+@pytest.fixture
+def flux_schnell_test_files() -> list[Path]:
+    return [
+        Path(f)
+        for f in [
+            "FLUX.1-schnell/.gitattributes",
+            "FLUX.1-schnell/README.md",
+            "FLUX.1-schnell/ae.safetensors",
+            "FLUX.1-schnell/flux1-schnell.safetensors",
+            "FLUX.1-schnell/model_index.json",
+            "FLUX.1-schnell/scheduler/scheduler_config.json",
+            "FLUX.1-schnell/schnell_grid.jpeg",
+            "FLUX.1-schnell/text_encoder/config.json",
+            "FLUX.1-schnell/text_encoder/model.safetensors",
+            "FLUX.1-schnell/text_encoder_2/config.json",
+            "FLUX.1-schnell/text_encoder_2/model-00001-of-00002.safetensors",
+            "FLUX.1-schnell/text_encoder_2/model-00002-of-00002.safetensors",
+            "FLUX.1-schnell/text_encoder_2/model.safetensors.index.json",
+            "FLUX.1-schnell/tokenizer/merges.txt",
+            "FLUX.1-schnell/tokenizer/special_tokens_map.json",
+            "FLUX.1-schnell/tokenizer/tokenizer_config.json",
+            "FLUX.1-schnell/tokenizer/vocab.json",
+            "FLUX.1-schnell/tokenizer_2/special_tokens_map.json",
+            "FLUX.1-schnell/tokenizer_2/spiece.model",
+            "FLUX.1-schnell/tokenizer_2/tokenizer.json",
+            "FLUX.1-schnell/tokenizer_2/tokenizer_config.json",
+            "FLUX.1-schnell/transformer/config.json",
+            "FLUX.1-schnell/transformer/diffusion_pytorch_model-00001-of-00003.safetensors",
+            "FLUX.1-schnell/transformer/diffusion_pytorch_model-00002-of-00003.safetensors",
+            "FLUX.1-schnell/transformer/diffusion_pytorch_model-00003-of-00003.safetensors",
+            "FLUX.1-schnell/transformer/diffusion_pytorch_model.safetensors.index.json",
+            "FLUX.1-schnell/vae/config.json",
+            "FLUX.1-schnell/vae/diffusion_pytorch_model.safetensors",
+        ]
+    ]
+
+
+@pytest.mark.parametrize(
+    ["variant", "expected_files"],
+    [
+        (
+            ModelRepoVariant.Default,
+            [
+                "FLUX.1-schnell/model_index.json",
+                "FLUX.1-schnell/scheduler/scheduler_config.json",
+                "FLUX.1-schnell/text_encoder/config.json",
+                "FLUX.1-schnell/text_encoder/model.safetensors",
+                "FLUX.1-schnell/text_encoder_2/config.json",
+                "FLUX.1-schnell/text_encoder_2/model-00001-of-00002.safetensors",
+                "FLUX.1-schnell/text_encoder_2/model-00002-of-00002.safetensors",
+                "FLUX.1-schnell/text_encoder_2/model.safetensors.index.json",
+                "FLUX.1-schnell/tokenizer/merges.txt",
+                "FLUX.1-schnell/tokenizer/special_tokens_map.json",
+                "FLUX.1-schnell/tokenizer/tokenizer_config.json",
+                "FLUX.1-schnell/tokenizer/vocab.json",
+                "FLUX.1-schnell/tokenizer_2/special_tokens_map.json",
+                "FLUX.1-schnell/tokenizer_2/spiece.model",
+                "FLUX.1-schnell/tokenizer_2/tokenizer.json",
+                "FLUX.1-schnell/tokenizer_2/tokenizer_config.json",
+                "FLUX.1-schnell/transformer/config.json",
+                "FLUX.1-schnell/transformer/diffusion_pytorch_model-00001-of-00003.safetensors",
+                "FLUX.1-schnell/transformer/diffusion_pytorch_model-00002-of-00003.safetensors",
+                "FLUX.1-schnell/transformer/diffusion_pytorch_model-00003-of-00003.safetensors",
+                "FLUX.1-schnell/transformer/diffusion_pytorch_model.safetensors.index.json",
+                "FLUX.1-schnell/vae/config.json",
+                "FLUX.1-schnell/vae/diffusion_pytorch_model.safetensors",
+            ],
+        ),
+    ],
+)
+def test_select_flux_schnell_files(
+    flux_schnell_test_files: list[Path], variant: ModelRepoVariant, expected_files: list[str]
+) -> None:
+    filtered_files = filter_files(flux_schnell_test_files, variant)
+    assert set(filtered_files) == {Path(f) for f in expected_files}
```