Add SD inpainting

python apps/stable_diffusion/scripts/inpaint.py --prompt="prompt" --img_path=path/to/img --mask_path=path/to/mask --import_mlir --max_length=77 --hf_model_id="stabilityai/stable-diffusion-2-inpainting"
2026-04-03 03:00:17 -04:00 · 2023-01-29 17:03:29 -08:00
parent 41ee65b377
commit 6582475955
13 changed files with 853 additions and 13 deletions
--- a/apps/stable_diffusion/scripts/inpaint.py
+++ b/apps/stable_diffusion/scripts/inpaint.py
@@ -0,0 +1,350 @@
+import os
+
+if "AMD_ENABLE_LLPC" not in os.environ:
+    os.environ["AMD_ENABLE_LLPC"] = "1"
+
+import sys
+import json
+import torch
+import re
+import time
+from pathlib import Path
+from PIL import Image, PngImagePlugin
+from datetime import datetime as dt
+from dataclasses import dataclass
+from csv import DictWriter
+from apps.stable_diffusion.src import (
+    args,
+    InpaintPipeline,
+    get_schedulers,
+    set_init_device_flags,
+    utils,
+)
+
+
+@dataclass
+class Config:
+    model_id: str
+    ckpt_loc: str
+    precision: str
+    batch_size: int
+    max_length: int
+    height: int
+    width: int
+    device: str
+
+
+# This has to come before importing cache objects
+if args.clear_all:
+    print("CLEARING ALL, EXPECT SEVERAL MINUTES TO RECOMPILE")
+    from glob import glob
+    import shutil
+
+    vmfbs = glob(os.path.join(os.getcwd(), "*.vmfb"))
+    for vmfb in vmfbs:
+        if os.path.exists(vmfb):
+            os.remove(vmfb)
+    # Temporary workaround of deleting yaml files to incorporate diffusers' pipeline.
+    # TODO: Remove this once we have better weight updation logic.
+    inference_yaml = ["v2-inference-v.yaml", "v1-inference.yaml"]
+    for yaml in inference_yaml:
+        if os.path.exists(yaml):
+            os.remove(yaml)
+    home = os.path.expanduser("~")
+    if os.name == "nt":  # Windows
+        appdata = os.getenv("LOCALAPPDATA")
+        shutil.rmtree(os.path.join(appdata, "AMD/VkCache"), ignore_errors=True)
+        shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
+    elif os.name == "unix":
+        shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"))
+        shutil.rmtree(os.path.join(home, ".local/shark_tank"))
+
+
+# save output images and the inputs corresponding to it.
+def save_output_img(output_img, img_seed):
+    output_path = args.output_dir if args.output_dir else Path.cwd()
+    generated_imgs_path = Path(output_path, "generated_imgs")
+    generated_imgs_path.mkdir(parents=True, exist_ok=True)
+    csv_path = Path(generated_imgs_path, "imgs_details.csv")
+
+    prompt_slice = re.sub("[^a-zA-Z0-9]", "_", args.prompts[0][:15])
+    out_img_name = (
+        f"{prompt_slice}_{img_seed}_{dt.now().strftime('%y%m%d_%H%M%S')}"
+    )
+
+    img_model = args.hf_model_id
+    if args.ckpt_loc:
+        img_model = os.path.basename(args.ckpt_loc)
+
+    if args.output_img_format == "jpg":
+        out_img_path = Path(generated_imgs_path, f"{out_img_name}.jpg")
+        output_img.save(out_img_path, quality=95, subsampling=0)
+    else:
+        out_img_path = Path(generated_imgs_path, f"{out_img_name}.png")
+        pngInfo = PngImagePlugin.PngInfo()
+
+        if args.write_metadata_to_png:
+            pngInfo.add_text(
+                "parameters",
+                f"{args.prompts[0]}\nNegative prompt: {args.negative_prompts[0]}\nSteps:{args.steps}, Sampler: {args.scheduler}, CFG scale: {args.guidance_scale}, Seed: {img_seed}, Size: {args.width}x{args.height}, Model: {img_model}",
+            )
+
+        output_img.save(out_img_path, "PNG", pnginfo=pngInfo)
+
+        if args.output_img_format not in ["png", "jpg"]:
+            print(
+                f"[ERROR] Format {args.output_img_format} is not supported yet."
+                "Image saved as png instead. Supported formats: png / jpg"
+            )
+
+    new_entry = {
+        "VARIANT": img_model,
+        "SCHEDULER": args.scheduler,
+        "PROMPT": args.prompts[0],
+        "NEG_PROMPT": args.negative_prompts[0],
+        "IMG_INPUT": args.img_path,
+        "MASK_INPUT": args.mask_path,
+        "SEED": img_seed,
+        "CFG_SCALE": args.guidance_scale,
+        "PRECISION": args.precision,
+        "STEPS": args.steps,
+        "HEIGHT": args.height,
+        "WIDTH": args.width,
+        "MAX_LENGTH": args.max_length,
+        "OUTPUT": out_img_path,
+    }
+
+    with open(csv_path, "a") as csv_obj:
+        dictwriter_obj = DictWriter(csv_obj, fieldnames=list(new_entry.keys()))
+        dictwriter_obj.writerow(new_entry)
+        csv_obj.close()
+
+    if args.save_metadata_to_json:
+        del new_entry["OUTPUT"]
+        json_path = Path(generated_imgs_path, f"{out_img_name}.json")
+        with open(json_path, "w") as f:
+            json.dump(new_entry, f, indent=4)
+
+
+inpaint_obj = None
+config_obj = None
+schedulers = None
+
+
+# Exposed to UI.
+def inpaint_inf(
+    prompt: str,
+    negative_prompt: str,
+    image: Image,
+    mask_image: Image,
+    height: int,
+    width: int,
+    steps: int,
+    guidance_scale: float,
+    seed: int,
+    batch_count: int,
+    batch_size: int,
+    scheduler: str,
+    custom_model: str,
+    hf_model_id: str,
+    precision: str,
+    device: str,
+    max_length: int,
+    save_metadata_to_json: bool,
+    save_metadata_to_png: bool,
+):
+    global inpaint_obj
+    global config_obj
+    global schedulers
+
+    args.prompts = [prompt]
+    args.negative_prompts = [negative_prompt]
+    args.guidance_scale = guidance_scale
+    args.steps = steps
+    args.scheduler = scheduler
+
+    # set ckpt_loc and hf_model_id.
+    types = (
+        ".ckpt",
+        ".safetensors",
+    )  # the tuple of file types
+    args.ckpt_loc = ""
+    args.hf_model_id = ""
+    if custom_model == "None":
+        if not hf_model_id:
+            return (
+                None,
+                "Please provide either custom model or huggingface model ID, both must not be empty",
+            )
+        args.hf_model_id = hf_model_id
+    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
+        args.ckpt_loc = custom_model
+    else:
+        args.hf_model_id = custom_model
+
+    args.save_metadata_to_json = save_metadata_to_json
+    args.write_metadata_to_png = save_metadata_to_png
+
+    dtype = torch.float32 if precision == "fp32" else torch.half
+    cpu_scheduling = not scheduler.startswith("Shark")
+    new_config_obj = Config(
+        args.hf_model_id,
+        args.ckpt_loc,
+        precision,
+        batch_size,
+        max_length,
+        height,
+        width,
+        device,
+    )
+    if config_obj != new_config_obj:
+        config_obj = new_config_obj
+        args.precision = precision
+        args.batch_size = batch_size
+        args.max_length = max_length
+        args.height = height
+        args.width = width
+        args.device = device.split("=>", 1)[1].strip()
+        args.use_tuned = True
+        args.import_mlir = False
+        set_init_device_flags()
+        model_id = (
+            args.hf_model_id
+            if args.hf_model_id
+            else "stabilityai/stable-diffusion-2-inpainting"
+        )
+        schedulers = get_schedulers(model_id)
+        scheduler_obj = schedulers[scheduler]
+        inpaint_obj = InpaintPipeline.from_pretrained(
+            scheduler_obj,
+            args.import_mlir,
+            args.hf_model_id,
+            args.ckpt_loc,
+            args.precision,
+            args.max_length,
+            args.batch_size,
+            args.height,
+            args.width,
+            args.use_base_vae,
+            args.use_tuned,
+        )
+
+    if not inpaint_obj:
+        sys.exit("text to image pipeline must not return a null value")
+
+    inpaint_obj.scheduler = schedulers[scheduler]
+
+    start_time = time.time()
+    inpaint_obj.log = ""
+    generated_imgs = []
+    seeds = []
+    img_seed = utils.sanitize_seed(seed)
+    for i in range(batch_count):
+        if i > 0:
+            img_seed = utils.sanitize_seed(-1)
+        out_imgs = inpaint_obj.generate_images(
+            prompt,
+            negative_prompt,
+            image,
+            mask_image,
+            batch_size,
+            height,
+            width,
+            steps,
+            guidance_scale,
+            img_seed,
+            args.max_length,
+            dtype,
+            args.use_base_vae,
+            cpu_scheduling,
+        )
+        save_output_img(out_imgs[0], img_seed)
+        generated_imgs.extend(out_imgs)
+        seeds.append(img_seed)
+        inpaint_obj.log += "\n"
+
+    total_time = time.time() - start_time
+    text_output = f"prompt={args.prompts}"
+    text_output += f"\nnegative prompt={args.negative_prompts}"
+    text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
+    text_output += f"\nscheduler={args.scheduler}, device={device}"
+    text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={seeds}"
+    text_output += f"\nsize={args.height}x{args.width}, batch-count={batch_count}, batch-size={args.batch_size}, max_length={args.max_length}"
+    text_output += inpaint_obj.log
+    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
+
+    return generated_imgs, text_output
+
+
+if __name__ == "__main__":
+    if args.img_path is None:
+        print("Flag --img_path is required.")
+        exit()
+    if args.mask_path is None:
+        print("Flag --mask_path is required.")
+        exit()
+    if "inpaint" not in args.hf_model_id:
+        print("Please use inpainting model with --hf_model_id.")
+        exit()
+
+    dtype = torch.float32 if args.precision == "fp32" else torch.half
+    cpu_scheduling = not args.scheduler.startswith("Shark")
+    set_init_device_flags()
+    schedulers = get_schedulers(args.hf_model_id)
+    scheduler_obj = schedulers[args.scheduler]
+    seed = args.seed
+    image = Image.open(args.img_path)
+    mask_image = Image.open(args.mask_path)
+
+    inpaint_obj = InpaintPipeline.from_pretrained(
+        scheduler_obj,
+        args.import_mlir,
+        args.hf_model_id,
+        args.ckpt_loc,
+        args.precision,
+        args.max_length,
+        args.batch_size,
+        args.height,
+        args.width,
+        args.use_base_vae,
+        args.use_tuned,
+    )
+
+    for run in range(args.runs):
+        if run > 0:
+            seed = -1
+        seed = utils.sanitize_seed(seed)
+
+        start_time = time.time()
+        generated_imgs = inpaint_obj.generate_images(
+            args.prompts,
+            args.negative_prompts,
+            image,
+            mask_image,
+            args.batch_size,
+            args.height,
+            args.width,
+            args.steps,
+            args.guidance_scale,
+            seed,
+            args.max_length,
+            dtype,
+            args.use_base_vae,
+            cpu_scheduling,
+        )
+        total_time = time.time() - start_time
+        text_output = f"prompt={args.prompts}"
+        text_output += f"\nnegative prompt={args.negative_prompts}"
+        text_output += (
+            f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
+        )
+        text_output += f"\nscheduler={args.scheduler}, device={args.device}"
+        text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={seed}, size={args.height}x{args.width}"
+        text_output += (
+            f", batch size={args.batch_size}, max_length={args.max_length}"
+        )
+        text_output += inpaint_obj.log
+        text_output += f"\nTotal image generation time: {total_time:.4f}sec"
+
+        save_output_img(generated_imgs[0], seed)
+        print(text_output)
--- a/apps/stable_diffusion/src/init.py
+++ b/apps/stable_diffusion/src/init.py
@@ -4,5 +4,8 @@ from apps.stable_diffusion.src.utils import (
    prompt_examples,
    get_available_devices,
 )
-from apps.stable_diffusion.src.pipelines import Text2ImagePipeline
+from apps.stable_diffusion.src.pipelines import (
+    Text2ImagePipeline,
+    InpaintPipeline,
+)
 from apps.stable_diffusion.src.schedulers import get_schedulers
--- a/apps/stable_diffusion/src/models/init.py
+++ b/apps/stable_diffusion/src/models/init.py
@@ -2,6 +2,7 @@ from apps.stable_diffusion.src.models.model_wrappers import (
    SharkifyStableDiffusionModel,
 )
 from apps.stable_diffusion.src.models.opt_params import (
+    get_vae_encode,
    get_vae,
    get_unet,
    get_clip,
--- a/apps/stable_diffusion/src/models/model_wrappers.py
+++ b/apps/stable_diffusion/src/models/model_wrappers.py
@@ -28,15 +28,19 @@ def replace_shape_str(shape, max_len, width, height, batch_size):
        elif shape[i] == "width":
            new_shape.append(width)
        elif isinstance(shape[i], str):
+            mul_val = int(shape[i].split("*")[0])
            if "batch_size" in shape[i]:
-                mul_val = int(shape[i].split("*")[0])
                new_shape.append(batch_size * mul_val)
+            elif "height" in shape[i]:
+                new_shape.append(height * mul_val)
+            elif "width" in shape[i]:
+                new_shape.append(width * mul_val)
        else:
            new_shape.append(shape[i])
    return new_shape


-# Get the input info for various models i.e. "unet", "clip", "vae".
+# Get the input info for various models i.e. "unet", "clip", "vae", "vae_encode".
 def get_input_info(model_info, max_len, width, height, batch_size):
    dtype_config = {"f32": torch.float32, "i64": torch.int64}
    input_map = defaultdict(list)
@@ -122,6 +126,32 @@ class SharkifyStableDiffusionModel:
        if not (height % 8 == 0 and height >= 384):
            sys.exit("height should be greater than 384 and multiple of 8")

+    def get_vae_encode(self):
+        class VaeEncodeModel(torch.nn.Module):
+            def __init__(self, model_id=self.model_id):
+                super().__init__()
+                self.vae = AutoencoderKL.from_pretrained(
+                    model_id,
+                    subfolder="vae",
+                )
+
+            def forward(self, input):
+                latents = self.vae.encode(input).latent_dist.sample()
+                return 0.18215 * latents
+
+        vae_encode = VaeEncodeModel()
+        inputs = tuple(self.inputs["vae_encode"])
+        is_f16 = True if self.precision == "fp16" else False
+        shark_vae_encode = compile_through_fx(
+            vae_encode,
+            inputs,
+            is_f16=is_f16,
+            use_tuned=self.use_tuned,
+            model_name="vae_encode" + self.model_name,
+            extra_args=get_opt_flags("vae", precision=self.precision),
+        )
+        return shark_vae_encode
+
    def get_vae(self):
        class VaeModel(torch.nn.Module):
            def __init__(self, model_id=self.model_id, base_vae=self.base_vae):
@@ -219,7 +249,7 @@ class SharkifyStableDiffusionModel:

    # Compiles Clip, Unet and Vae with `base_model_id` as defining their input
    # configiration.
-    def compile_all(self, base_model_id):
+    def compile_all(self, base_model_id, if_inpaint):
        self.inputs = get_input_info(
            base_models[base_model_id],
            self.max_len,
@@ -230,15 +260,19 @@ class SharkifyStableDiffusionModel:
        compiled_unet = self.get_unet()
        compiled_vae = self.get_vae()
        compiled_clip = self.get_clip()
-        
+        if if_inpaint:
+            compiled_vae_encode = self.get_vae_encode()
+            return compiled_clip, compiled_unet, compiled_vae, compiled_vae_encode
+
        return compiled_clip, compiled_unet, compiled_vae

    def __call__(self):
        # Step 1:
        # --  Fetch all vmfbs for the model, if present, else delete the lot.
+        if_inpaint = "inpaint" in self.model_id
        vmfbs = fetch_or_delete_vmfbs(
-            self.model_name, self.base_vae, self.precision
-        )   
+            self.model_name, self.base_vae, if_inpaint, self.precision
+        )
        if vmfbs[0]:
            # -- If all vmfbs are indeed present, we also try and fetch the base
            #    model configuration for running SD with custom checkpoints.
@@ -274,7 +308,10 @@ class SharkifyStableDiffusionModel:
        print("Inferring base model configuration.")
        for model_id in base_models:
            try:
-                compiled_clip, compiled_unet, compiled_vae = self.compile_all(model_id)
+                if if_inpaint:
+                    compiled_clip, compiled_unet, compiled_vae, compiled_vae_encode = self.compile_all(model_id, if_inpaint)
+                else:
+                    compiled_clip, compiled_unet, compiled_vae = self.compile_all(model_id, if_inpaint)
            except Exception as e:
                if args.enable_stack_trace:
                    traceback.print_exc()
@@ -289,6 +326,13 @@ class SharkifyStableDiffusionModel:
            # the knowledge of base model id accordingly into `args.hf_model_id`.
            if args.ckpt_loc != "":
                args.hf_model_id = model_id
+            if if_inpaint:
+                return (
+                    compiled_clip,
+                    compiled_unet,
+                    compiled_vae,
+                    compiled_vae_encode,
+                )
            return compiled_clip, compiled_unet, compiled_vae
        sys.exit(
            "Cannot compile the model. Please re-run the command with `--enable_stack_trace` flag and create an issue with detailed log at https://github.com/nod-ai/SHARK/issues"
--- a/apps/stable_diffusion/src/models/opt_params.py
+++ b/apps/stable_diffusion/src/models/opt_params.py
@@ -16,6 +16,8 @@ hf_model_variant_map = {
    "stabilityai/stable-diffusion-2-1": ["stablediffusion", "v2_1base"],
    "stabilityai/stable-diffusion-2-1-base": ["stablediffusion", "v2_1base"],
    "CompVis/stable-diffusion-v1-4": ["stablediffusion", "v1_4"],
+    "runwayml/stable-diffusion-inpainting": ["stablediffusion", "inpaint_v1"],
+    "stabilityai/stable-diffusion-2-inpainting": ["stablediffusion", "inpaint_v2"],
 }


@@ -52,6 +54,23 @@ def get_unet():
    return get_shark_model(bucket, model_name, iree_flags)


+def get_vae_encode():
+    variant, version = get_variant_version(args.hf_model_id)
+    # Tuned model is present only for `fp16` precision.
+    is_tuned = "tuned" if args.use_tuned else "untuned"
+    if "vulkan" not in args.device and args.use_tuned:
+        bucket_key = f"{variant}/{is_tuned}/{args.device}"
+        model_key = f"{variant}/{version}/vae_encode/{args.precision}/length_77/{is_tuned}/{args.device}"
+    else:
+        bucket_key = f"{variant}/{is_tuned}"
+        model_key = f"{variant}/{version}/vae_encode/{args.precision}/length_77/{is_tuned}"
+
+    bucket, model_name, iree_flags = get_params(
+        bucket_key, model_key, "vae", is_tuned, args.precision
+    )
+    return get_shark_model(bucket, model_name, iree_flags)
+
+
 def get_vae():
    variant, version = get_variant_version(args.hf_model_id)
    # Tuned model is present only for `fp16` precision.
--- a/apps/stable_diffusion/src/pipelines/init.py
+++ b/apps/stable_diffusion/src/pipelines/init.py
@@ -1,3 +1,6 @@
 from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_txt2img import (
    Text2ImagePipeline,
 )
+from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_inpaint import (
+    InpaintPipeline,
+)
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_inpaint.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_inpaint.py
@@ -0,0 +1,229 @@
+import torch
+from tqdm.auto import tqdm
+import numpy as np
+from random import randint
+from PIL import Image
+from transformers import CLIPTokenizer
+from typing import Union
+from shark.shark_inference import SharkInference
+from diffusers import (
+    DDIMScheduler,
+    PNDMScheduler,
+    LMSDiscreteScheduler,
+    EulerDiscreteScheduler,
+    EulerAncestralDiscreteScheduler,
+    DPMSolverMultistepScheduler,
+)
+from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
+from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
+    StableDiffusionPipeline,
+)
+
+
+class InpaintPipeline(StableDiffusionPipeline):
+    def __init__(
+        self,
+        vae_encode: SharkInference,
+        vae: SharkInference,
+        text_encoder: SharkInference,
+        tokenizer: CLIPTokenizer,
+        unet: SharkInference,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+            SharkEulerDiscreteScheduler,
+        ],
+    ):
+        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
+        self.vae_encode = vae_encode
+
+    def prepare_mask_and_masked_image(self, image, mask):
+        # preprocess image
+        if isinstance(image, (Image.Image, np.ndarray)):
+            image = [image]
+
+        if isinstance(image, list) and isinstance(image[0], Image.Image):
+            image = [np.array(i.convert("RGB"))[None, :] for i in image]
+            image = np.concatenate(image, axis=0)
+        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+            image = np.concatenate([i[None, :] for i in image], axis=0)
+
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+        # preprocess mask
+        if isinstance(mask, (Image.Image, np.ndarray)):
+            mask = [mask]
+
+        if isinstance(mask, list) and isinstance(mask[0], Image.Image):
+            mask = np.concatenate(
+                [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0
+            )
+            mask = mask.astype(np.float32) / 255.0
+        elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
+            mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
+
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+        mask = torch.from_numpy(mask)
+
+        masked_image = image * (mask < 0.5)
+
+        return mask, masked_image
+
+    def prepare_latents(
+        self,
+        batch_size,
+        height,
+        width,
+        generator,
+        num_inference_steps,
+        dtype,
+    ):
+        latents = torch.randn(
+            (
+                batch_size,
+                4,
+                height // 8,
+                width // 8,
+            ),
+            generator=generator,
+            dtype=torch.float32,
+        ).to(dtype)
+
+        self.scheduler.set_timesteps(num_inference_steps)
+        self.scheduler.is_scale_input_called = True
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def prepare_mask_latents(
+        self,
+        mask,
+        masked_image,
+        batch_size,
+        height,
+        width,
+        dtype,
+    ):
+        mask = torch.nn.functional.interpolate(
+            mask, size=(height // 8, width // 8)
+        )
+        mask = mask.to(dtype)
+
+        masked_image = masked_image.to(dtype)
+        masked_image_latents = self.vae_encode("forward", (masked_image,))
+        masked_image_latents = torch.from_numpy(masked_image_latents)
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+        if masked_image_latents.shape[0] < batch_size:
+            if not batch_size % masked_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+                )
+            masked_image_latents = masked_image_latents.repeat(
+                batch_size // masked_image_latents.shape[0], 1, 1, 1
+            )
+        return mask, masked_image_latents
+
+    def generate_images(
+        self,
+        prompts,
+        neg_prompts,
+        image,
+        mask_image,
+        batch_size,
+        height,
+        width,
+        num_inference_steps,
+        guidance_scale,
+        seed,
+        max_length,
+        dtype,
+        use_base_vae,
+        cpu_scheduling,
+    ):
+        # prompts and negative prompts must be a list.
+        if isinstance(prompts, str):
+            prompts = [prompts]
+
+        if isinstance(neg_prompts, str):
+            neg_prompts = [neg_prompts]
+
+        prompts = prompts * batch_size
+        neg_prompts = neg_prompts * batch_size
+
+        # seed generator to create the inital latent noise. Also handle out of range seeds.
+        uint32_info = np.iinfo(np.uint32)
+        uint32_min, uint32_max = uint32_info.min, uint32_info.max
+        if seed < uint32_min or seed >= uint32_max:
+            seed = randint(uint32_min, uint32_max)
+        generator = torch.manual_seed(seed)
+
+        # Get initial latents
+        init_latents = self.prepare_latents(
+            batch_size=batch_size,
+            height=height,
+            width=width,
+            generator=generator,
+            num_inference_steps=num_inference_steps,
+            dtype=dtype,
+        )
+
+        # Get text embeddings from prompts
+        text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
+
+        # guidance scale as a float32 tensor.
+        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
+
+        # Preprocess mask and image
+        mask, masked_image = self.prepare_mask_and_masked_image(
+            image, mask_image
+        )
+
+        # Prepare mask latent variables
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask=mask,
+            masked_image=masked_image,
+            batch_size=batch_size,
+            height=height,
+            width=width,
+            dtype=dtype,
+        )
+
+        # Get Image latents
+        latents = self.produce_img_latents(
+            latents=init_latents,
+            text_embeddings=text_embeddings,
+            guidance_scale=guidance_scale,
+            total_timesteps=self.scheduler.timesteps,
+            dtype=dtype,
+            cpu_scheduling=cpu_scheduling,
+            mask=mask,
+            masked_image_latents=masked_image_latents,
+        )
+
+        # Img latents -> PIL images
+        all_imgs = []
+        for i in tqdm(range(0, latents.shape[0], batch_size)):
+            imgs = self.decode_latents(
+                latents=latents[i : i + batch_size],
+                use_base_vae=use_base_vae,
+                cpu_scheduling=cpu_scheduling,
+            )
+            all_imgs.extend(imgs)
+
+        return all_imgs
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py
@@ -1,4 +1,5 @@
 import torch
+import numpy as np
 from transformers import CLIPTokenizer
 from PIL import Image
 from tqdm.auto import tqdm
@@ -16,6 +17,7 @@ from shark.shark_inference import SharkInference
 from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
 from apps.stable_diffusion.src.models import (
    SharkifyStableDiffusionModel,
+    get_vae_encode,
    get_vae,
    get_clip,
    get_unet,
@@ -112,6 +114,8 @@ class StableDiffusionPipeline:
        total_timesteps,
        dtype,
        cpu_scheduling,
+        mask=None,
+        masked_image_latents=None,
        return_all_latents=False,
    ):
        step_time_sum = 0
@@ -122,6 +126,15 @@ class StableDiffusionPipeline:
            step_start_time = time.time()
            timestep = torch.tensor([t]).to(dtype).detach().numpy()
            latent_model_input = self.scheduler.scale_model_input(latents, t)
+            if mask is not None and masked_image_latents is not None:
+                latent_model_input = torch.cat(
+                    [
+                        torch.from_numpy(np.asarray(latent_model_input)),
+                        mask,
+                        masked_image_latents,
+                    ],
+                    dim=1,
+                ).to(dtype)
            if cpu_scheduling:
                latent_model_input = latent_model_input.detach().numpy()

@@ -199,8 +212,23 @@ class StableDiffusionPipeline:
                use_base_vae=use_base_vae,
                use_tuned=use_tuned,
            )
+            if "inpaint" in model_id:
+                clip, unet, vae, vae_encode = mlir_import()
+                return cls(
+                    vae_encode, vae, clip, get_tokenizer(), unet, scheduler
+                )
            clip, unet, vae = mlir_import()
            return cls(vae, clip, get_tokenizer(), unet, scheduler)
+
+        if "inpaint" in model_id:
+            return cls(
+                get_vae_encode(),
+                get_vae(),
+                get_clip(),
+                get_tokenizer(),
+                get_unet(),
+                scheduler,
+            )
        return cls(
            get_vae(), get_clip(), get_tokenizer(), get_unet(), scheduler
        )
--- a/apps/stable_diffusion/src/utils/resources/base_model.json
+++ b/apps/stable_diffusion/src/utils/resources/base_model.json
@@ -29,6 +29,14 @@
                "dtype": "f32"
            }
        },
+        "vae_encode": {
+            "image" : {
+                "shape" : [
+                    "1*batch_size",3,"8*height","8*width"
+                ],
+                "dtype":"f32"
+            }
+        },
        "vae": {
            "latents" : {
                "shape" : [
@@ -77,6 +85,126 @@
                "dtype": "f32"
            }
        },
+        "vae_encode": {
+            "image" : {
+                "shape" : [
+                    "1*batch_size",3,"8*height","8*width"
+                ],
+                "dtype":"f32"
+            }
+        },
+        "vae": {
+            "latents" : {
+                "shape" : [
+                    "1*batch_size",4,"height","width"
+                ],
+                "dtype":"f32"
+            }
+        },
+        "clip": {
+            "token" : {
+                "shape" : [
+                    "2*batch_size",
+                    "max_len"
+                ],
+                "dtype":"i64"
+            }
+        }
+    },
+    "runwayml/stable-diffusion-inpainting": {
+        "unet": {
+            "latents": {
+                "shape": [
+                    "1*batch_size",
+                    9,
+                    "height",
+                    "width"
+                ],
+                "dtype": "f32"
+            },
+            "timesteps": {
+                "shape": [
+                    1
+                ],
+                "dtype": "f32"
+            },
+            "embedding": {
+                "shape": [
+                    "2*batch_size",
+                    "max_len",
+                    768
+                ],
+                "dtype": "f32"
+            },
+            "guidance_scale": {
+                "shape": 2,
+                "dtype": "f32"
+            }
+        },
+        "vae_encode": {
+            "image" : {
+                "shape" : [
+                    "1*batch_size",3,"8*height","8*width"
+                ],
+                "dtype":"f32"
+            }
+        },
+        "vae": {
+            "latents" : {
+                "shape" : [
+                    "1*batch_size",4,"height","width"
+                ],
+                "dtype":"f32"
+            }
+        },
+        "clip": {
+            "token" : {
+                "shape" : [
+                    "2*batch_size",
+                    "max_len"
+                ],
+                "dtype":"i64"
+            }
+        }
+    },
+    "stabilityai/stable-diffusion-2-inpainting": {
+        "unet": {
+            "latents": {
+                "shape": [
+                    "1*batch_size",
+                    9,
+                    "height",
+                    "width"
+                ],
+                "dtype": "f32"
+            },
+            "timesteps": {
+                "shape": [
+                    1
+                ],
+                "dtype": "f32"
+            },
+            "embedding": {
+                "shape": [
+                    "2*batch_size",
+                    "max_len",
+                    1024
+                ],
+                "dtype": "f32"
+            },
+            "guidance_scale": {
+                "shape": 2,
+                "dtype": "f32"
+            }
+        },
+        "vae_encode": {
+            "image" : {
+                "shape" : [
+                    "1*batch_size",3,"8*height","8*width"
+                ],
+                "dtype":"f32"
+            }
+        },
        "vae": {
            "latents" : {
                "shape" : [
--- a/apps/stable_diffusion/src/utils/resources/model_config.json
+++ b/apps/stable_diffusion/src/utils/resources/model_config.json
@@ -3,6 +3,8 @@
    "stablediffusion/v1_4":"CompVis/stable-diffusion-v1-4",
    "stablediffusion/v2_1base":"stabilityai/stable-diffusion-2-1-base",
    "stablediffusion/v2_1":"stabilityai/stable-diffusion-2-1",
+    "stablediffusion/inpaint_v1":"runwayml/stable-diffusion-inpainting",
+    "stablediffusion/inpaint_v2":"stabilityai/stable-diffusion-2-inpainting",
    "anythingv3/v1_4":"Linaqruf/anything-v3.0",
    "analogdiffusion/v1_4":"wavymulder/Analog-Diffusion",
    "openjourney/v1_4":"prompthero/openjourney",
--- a/apps/stable_diffusion/src/utils/resources/model_db.json
+++ b/apps/stable_diffusion/src/utils/resources/model_db.json
@@ -42,6 +42,17 @@
    "stablediffusion/v2_1/vae/fp16/length_77/untuned":"vae77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
    "stablediffusion/v2_1/vae/fp16/length_77/untuned/base":"vae2_8dec_fp16",
    "stablediffusion/v2_1/clip/fp32/length_77/untuned":"clip77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
+    "stablediffusion/inpaint_v1/unet/fp16/length_77/untuned":"unet_inpaint_fp16",
+    "stablediffusion/inpaint_v1/unet/fp32/length_77/untuned":"unet_inpaint_fp32",
+    "stablediffusion/inpaint_v1/vae_encode/fp16/length_77/untuned":"vae_encode_inpaint_fp16",
+    "stablediffusion/inpaint_v1/vae_encode/fp32/length_77/untuned":"vae_encode_inpaint_fp32",
+    "stablediffusion/inpaint_v1/vae/fp16/length_77/untuned":"vae_inpaint_fp16",
+    "stablediffusion/inpaint_v1/vae/fp32/length_77/untuned":"vae_inpaint_fp32",
+    "stablediffusion/inpaint_v1/clip/fp32/length_77/untuned":"clip_inpaint_fp32",
+    "stablediffusion/inpaint_v2/unet/fp16/length_77/untuned":"unet_inpaint_fp16",
+    "stablediffusion/inpaint_v2/vae_encode/fp16/length_77/untuned":"vae_encode_inpaint_fp16",
+    "stablediffusion/inpaint_v2/vae/fp16/length_77/untuned":"vae_inpaint_fp16",
+    "stablediffusion/inpaint_v2/clip/fp32/length_77/untuned":"clip_inpaint_fp32",
    "anythingv3/v2_1base/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
    "anythingv3/v2_1base/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
    "anythingv3/v2_1base/unet/fp16/length_77/tuned/cuda":"av3_unet_19dec_fp16_cuda_tuned",
--- a/apps/stable_diffusion/src/utils/stable_args.py
+++ b/apps/stable_diffusion/src/utils/stable_args.py
@@ -29,6 +29,18 @@ p.add_argument(
    help="text you don't want to see in the generated image.",
 )

+p.add_argument(
+    "--img_path",
+    type=str,
+    help="Path to the image input for img2img/inpainting",
+)
+
+p.add_argument(
+    "--mask_path",
+    type=str,
+    help="Path to the mask image input for inpainting",
+)
+
 p.add_argument(
    "--steps",
    type=int,
--- a/apps/stable_diffusion/src/utils/utils.py
+++ b/apps/stable_diffusion/src/utils/utils.py
@@ -274,6 +274,8 @@ def set_init_device_flags():
        "stabilityai/stable-diffusion-2-1",
        "stabilityai/stable-diffusion-2-1-base",
        "CompVis/stable-diffusion-v1-4",
+        "runwayml/stable-diffusion-inpainting",
+        "stabilityai/stable-diffusion-2-inpainting",
    ]:
        args.import_mlir = True

@@ -395,7 +397,7 @@ def preprocessCKPT(custom_weights):


 def load_vmfb(vmfb_path, model, precision):
-    model = "vae" if "base_vae" in model else model
+    model = "vae" if "base_vae" in model or "vae_encode" in model else model
    precision = "fp32" if "clip" in model else precision
    extra_args = get_opt_flags(model, precision)
    shark_module = SharkInference(mlir_module=None, device=args.device)
@@ -403,16 +405,24 @@ def load_vmfb(vmfb_path, model, precision):
    return shark_module


-# This utility returns vmfbs of Clip, Unet and Vae, in case all three of them
+# This utility returns vmfbs of Clip, Unet, Vae and Vae_encode, in case all of them
 # are present; deletes them otherwise.
-def fetch_or_delete_vmfbs(basic_model_name, use_base_vae, precision="fp32"):
-    model_name = ["clip", "unet", "base_vae" if use_base_vae else "vae"]
+def fetch_or_delete_vmfbs(
+    basic_model_name, use_base_vae, if_inpaint, precision="fp32"
+):
+    model_name = [
+        "clip",
+        "unet",
+        "base_vae" if use_base_vae else "vae",
+    ]
+    if if_inpaint:
+        model_name.append("vae_encode")
    vmfb_path = [
        get_vmfb_path_name(model + basic_model_name)[0] for model in model_name
    ]
    vmfb_present = [os.path.isfile(vmfb) for vmfb in vmfb_path]
    all_vmfb_present = functools.reduce(operator.__and__, vmfb_present)
-    compiled_models = [None] * 3
+    compiled_models = [None] * 4 if if_inpaint else [None] * 3
    # We need to delete vmfbs only if some of the models were compiled.
    if not all_vmfb_present:
        for i in range(len(vmfb_path)):