[SD] Add Stencil feature to SD pipeline (#1111)

* [WIP] Add ControlNet to SD pipeline

-- This commit adds ControlNet to the SD pipeline.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>

* [SD] Add ControlNet to img2img + fix bug for img2img scheduler

-- This commit adds ControlNet execution to img2img.
-- It restructures the addition of ControlNet variants.
-- It also fixes a scheduler-selection bug in the img2img pipeline.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>

* add shark models for stencilSD

* Add Stencil controlled SD in img2img pipeline (#1106)

* use shark stencil modules

* adjust diffusers change

* modify to use pipeline

* remove control from unet

* pump stencils through unet

* complete integration in img2img

* fix lint and comments

* [SD] Add ControlNet pipeline + integrate with WebUI + add compiled flow execution

-- This commit creates a dedicated SD pipeline for ControlNet.
-- Integrates it with the img2img WebUI.
-- Integrates the compiled execution flow for ControlNet.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>

* [SD] Stencil execution

* Remove integration setup

* [SD] Fix args.use_stencil overriding bug + vmfb caching issue

-- This commit fixes an args.use_stencil overriding issue which caused the
   img2img pipeline to pick the wrong set of modules.
-- It also fixes a vmfb caching issue to speed up loading time
   and pick the right set of modules based on a mask.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>

---------

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
Co-authored-by: Abhishek Varma <abhishek@nod-labs.com>
Co-authored-by: PhaneeshB <b.phaneesh@gmail.com>
Author: Abhishek Varma
Date: 2023-03-02 00:14:40 +05:30 (committed by GitHub)
Parent: f09574538c
Commit: be3cdec290
15 changed files with 840 additions and 66 deletions
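A hypothetical invocation of the new stencil path, assuming the img2img script lives at apps/stable_diffusion/scripts/img2img.py and that the flags other than `--use_stencil` mirror the `args.*` fields referenced in the diff below:

    python apps/stable_diffusion/scripts/img2img.py --img_path=input.png --prompts="a fantasy castle" --use_stencil=canny --precision=fp16

When `--use_stencil` is set, the script forces the scheduler to DDIM and the base model to runwayml/stable-diffusion-v1-5, and builds a StencilPipeline instead of the Image2ImagePipeline.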


@@ -6,6 +6,7 @@ from dataclasses import dataclass
from apps.stable_diffusion.src import (
args,
Image2ImagePipeline,
StencilPipeline,
get_schedulers,
set_init_device_flags,
utils,
@@ -24,6 +25,7 @@ class Config:
height: int
width: int
device: str
use_stencil: str
img2img_obj = None
@@ -50,6 +52,7 @@ def img2img_inf(
precision: str,
device: str,
max_length: int,
use_stencil: str,
save_metadata_to_json: bool,
save_metadata_to_png: bool,
):
@@ -92,8 +95,24 @@ def img2img_inf(
args.save_metadata_to_json = save_metadata_to_json
args.write_metadata_to_png = save_metadata_to_png
use_stencil = None if use_stencil == "None" else use_stencil
args.use_stencil = use_stencil
if use_stencil is not None:
args.scheduler = "DDIM"
args.hf_model_id = "runwayml/stable-diffusion-v1-5"
elif args.scheduler != "PNDM":
if "Shark" in args.scheduler:
print(
f"SharkEulerDiscrete scheduler not supported. Switching to PNDM scheduler"
)
args.scheduler = "PNDM"
else:
sys.exit(
"Img2Img works best with PNDM scheduler. Other schedulers are not supported yet."
)
cpu_scheduling = not args.scheduler.startswith("Shark")
args.precision = precision
dtype = torch.float32 if precision == "fp32" else torch.half
cpu_scheduling = not scheduler.startswith("Shark")
new_config_obj = Config(
args.hf_model_id,
args.ckpt_loc,
@@ -103,10 +122,10 @@ def img2img_inf(
height,
width,
device,
use_stencil,
)
if not img2img_obj or config_obj != new_config_obj:
config_obj = new_config_obj
args.precision = precision
args.batch_size = batch_size
args.max_length = max_length
args.height = height
@@ -123,21 +142,40 @@ def img2img_inf(
)
schedulers = get_schedulers(model_id)
scheduler_obj = schedulers[scheduler]
img2img_obj = Image2ImagePipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
)
if use_stencil is not None:
args.use_tuned = False
img2img_obj = StencilPipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
use_stencil=use_stencil,
)
else:
img2img_obj = Image2ImagePipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
)
img2img_obj.scheduler = schedulers[scheduler]
@@ -165,6 +203,7 @@ def img2img_inf(
dtype,
args.use_base_vae,
cpu_scheduling,
use_stencil=use_stencil,
)
save_output_img(out_imgs[0], img_seed, extra_info)
generated_imgs.extend(out_imgs)
@@ -195,11 +234,11 @@ if __name__ == "__main__":
# When the models get uploaded, it should default to False.
args.import_mlir = True
dtype = torch.float32 if args.precision == "fp32" else torch.half
cpu_scheduling = not args.scheduler.startswith("Shark")
set_init_device_flags()
schedulers = get_schedulers(args.hf_model_id)
if args.scheduler != "PNDM":
use_stencil = args.use_stencil
if use_stencil:
args.scheduler = "DDIM"
args.hf_model_id = "runwayml/stable-diffusion-v1-5"
elif args.scheduler != "PNDM":
if "Shark" in args.scheduler:
print(
f"SharkEulerDiscrete scheduler not supported. Switching to PNDM scheduler"
@@ -209,28 +248,49 @@ if __name__ == "__main__":
sys.exit(
"Img2Img works best with PNDM scheduler. Other schedulers are not supported yet."
)
cpu_scheduling = not args.scheduler.startswith("Shark")
dtype = torch.float32 if args.precision == "fp32" else torch.half
set_init_device_flags()
schedulers = get_schedulers(args.hf_model_id)
scheduler_obj = schedulers[args.scheduler]
image = Image.open(args.img_path).convert("RGB")
seed = utils.sanitize_seed(args.seed)
# Adjust for height and width based on model
img2img_obj = Image2ImagePipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
)
if use_stencil:
img2img_obj = StencilPipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
use_stencil=use_stencil,
)
else:
img2img_obj = Image2ImagePipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
)
start_time = time.time()
generated_imgs = img2img_obj.generate_images(
@@ -248,6 +308,7 @@ if __name__ == "__main__":
dtype,
args.use_base_vae,
cpu_scheduling,
use_stencil=use_stencil,
)
total_time = time.time() - start_time
text_output = f"prompt={args.prompts}"


@@ -11,5 +11,6 @@ from apps.stable_diffusion.src.pipelines import (
Image2ImagePipeline,
InpaintPipeline,
OutpaintPipeline,
StencilPipeline,
)
from apps.stable_diffusion.src.schedulers import get_schedulers


@@ -1,4 +1,4 @@
from diffusers import AutoencoderKL, UNet2DConditionModel
from diffusers import AutoencoderKL, UNet2DConditionModel, ControlNetModel
from transformers import CLIPTextModel
from collections import defaultdict
import torch
@@ -117,10 +117,14 @@ class SharkifyStableDiffusionModel:
self.model_name = self.model_name + "_" + get_path_stem(self.model_id)
self.low_cpu_mem_usage = low_cpu_mem_usage
def get_extended_name_for_all_model(self):
def get_extended_name_for_all_model(self, mask_to_fetch):
model_name = {}
sub_model_list = ["clip", "unet", "vae", "vae_encode"]
sub_model_list = ["clip", "unet", "stencil_unet", "vae", "vae_encode", "stencil_adaptor"]
index = 0
for model in sub_model_list:
if mask_to_fetch[index] == False:
index += 1
continue
sub_model = model
model_config = self.model_name
if "vae" == model:
@@ -129,6 +133,7 @@ class SharkifyStableDiffusionModel:
if self.base_vae:
sub_model = "base_vae"
model_name[model] = get_extended_name(sub_model + model_config)
index += 1
return model_name
def check_params(self, max_len, width, height):
@@ -215,6 +220,112 @@ class SharkifyStableDiffusionModel:
)
return shark_vae
def get_controlled_unet(self):
class ControlledUnetModel(torch.nn.Module):
def __init__(
self, model_id=self.model_id, low_cpu_mem_usage=False
):
super().__init__()
self.unet = UNet2DConditionModel.from_pretrained(
"takuma104/control_sd15_canny", # TODO: ADD with model ID
subfolder="unet",
low_cpu_mem_usage=low_cpu_mem_usage,
)
self.in_channels = self.unet.in_channels
self.train(False)
def forward( self, latent, timestep, text_embedding, guidance_scale, control1,
control2, control3, control4, control5, control6, control7,
control8, control9, control10, control11, control12, control13,
):
# expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
db_res_samples = tuple([ control1, control2, control3, control4, control5, control6, control7, control8, control9, control10, control11, control12,])
mb_res_samples = control13
latents = torch.cat([latent] * 2)
unet_out = self.unet.forward(
latents,
timestep,
encoder_hidden_states=text_embedding,
down_block_additional_residuals=db_res_samples,
mid_block_additional_residual=mb_res_samples,
return_dict=False,
)[0]
noise_pred_uncond, noise_pred_text = unet_out.chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * (
noise_pred_text - noise_pred_uncond
)
return noise_pred
unet = ControlledUnetModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
is_f16 = True if self.precision == "fp16" else False
inputs = tuple(self.inputs["stencil_unet"])
input_mask = [True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True,]
shark_controlled_unet = compile_through_fx(
unet,
inputs,
model_name=self.model_name["stencil_unet"],
is_f16=is_f16,
f16_input_mask=input_mask,
use_tuned=self.use_tuned,
extra_args=get_opt_flags("unet", precision=self.precision),
)
return shark_controlled_unet
def get_control_net(self):
class StencilControlNetModel(torch.nn.Module):
def __init__(
self, model_id=self.model_id, low_cpu_mem_usage=False
):
super().__init__()
self.cnet = ControlNetModel.from_pretrained(
"takuma104/control_sd15_canny", # TODO: ADD with model ID
subfolder="controlnet",
low_cpu_mem_usage=low_cpu_mem_usage,
)
self.in_channels = self.cnet.in_channels
self.train(False)
def forward(
self,
latent,
timestep,
text_embedding,
stencil_image_input,
):
# expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
# TODO: guidance NOT NEEDED change in `get_input_info` later
latents = torch.cat(
[latent] * 2
) # needs to be same as controlledUNET latents
stencil_image = torch.cat(
[stencil_image_input] * 2
) # needs to be same as controlledUNET latents
down_block_res_samples, mid_block_res_sample = self.cnet.forward(
latents,
timestep,
encoder_hidden_states=text_embedding,
controlnet_cond=stencil_image,
return_dict=False,
)
return tuple(list(down_block_res_samples) + [mid_block_res_sample])
scnet = StencilControlNetModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
is_f16 = True if self.precision == "fp16" else False
inputs = tuple(self.inputs["stencil_adaptor"])
input_mask = [True, True, True, True]
shark_cnet = compile_through_fx(
scnet,
inputs,
model_name=self.model_name["stencil_adaptor"],
is_f16=is_f16,
f16_input_mask=input_mask,
use_tuned=self.use_tuned,
extra_args=get_opt_flags("unet", precision=self.precision),
)
return shark_cnet
def get_unet(self):
class UnetModel(torch.nn.Module):
def __init__(self, model_id=self.model_id, low_cpu_mem_usage=False):
@@ -232,8 +343,9 @@ class SharkifyStableDiffusionModel:
else:
self.unet.set_attention_slice(args.attention_slicing)
# TODO: Instead of flattening the `control` try to use the list.
def forward(
self, latent, timestep, text_embedding, guidance_scale
self, latent, timestep, text_embedding, guidance_scale,
):
# expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
latents = torch.cat([latent] * 2)
@@ -306,7 +418,7 @@ class SharkifyStableDiffusionModel:
# Compiles Clip, Unet and Vae with `base_model_id` as defining their input
# configuration.
def compile_all(self, base_model_id, need_vae_encode):
def compile_all(self, base_model_id, need_vae_encode, need_stencil):
self.inputs = get_input_info(
base_models[base_model_id],
self.max_len,
@@ -314,23 +426,45 @@ class SharkifyStableDiffusionModel:
self.height,
self.batch_size,
)
compiled_unet = self.get_unet()
compiled_controlnet = None
compiled_controlled_unet = None
compiled_unet = None
if need_stencil:
compiled_controlnet = self.get_control_net()
compiled_controlled_unet = self.get_controlled_unet()
else:
compiled_unet = self.get_unet()
if self.custom_vae != "":
print("Plugging in custom Vae")
compiled_vae = self.get_vae()
compiled_clip = self.get_clip()
if need_stencil:
return compiled_clip, compiled_controlled_unet, compiled_vae, compiled_controlnet
if need_vae_encode:
compiled_vae_encode = self.get_vae_encode()
return compiled_clip, compiled_unet, compiled_vae, compiled_vae_encode
return compiled_clip, compiled_unet, compiled_vae
return compiled_clip, compiled_unet, compiled_vae, None
def __call__(self):
# Step 1:
# -- Fetch all vmfbs for the model, if present, else delete the lot.
need_vae_encode = args.img_path is not None
self.model_name = self.get_extended_name_for_all_model()
vmfbs = fetch_or_delete_vmfbs(self.model_name, need_vae_encode, self.precision)
need_vae_encode, need_stencil = False, False
if args.img_path is not None:
if args.use_stencil is not None:
need_stencil = True
else:
need_vae_encode = True
# `mask_to_fetch` prepares a mask to pick a combination out of :-
# ["clip", "unet", "stencil_unet", "vae", "vae_encode", "stencil_adaptor"]
mask_to_fetch = [True, True, False, True, False, False]
if need_vae_encode:
mask_to_fetch = [True, True, False, True, True, False]
elif need_stencil:
mask_to_fetch = [True, False, True, True, False, True]
self.model_name = self.get_extended_name_for_all_model(mask_to_fetch)
vmfbs = fetch_or_delete_vmfbs(self.model_name, self.precision)
if vmfbs[0]:
# -- If all vmfbs are indeed present, we also try and fetch the base
# model configuration for running SD with custom checkpoints.
@@ -339,8 +473,6 @@ class SharkifyStableDiffusionModel:
if args.hf_model_id == "":
sys.exit("Base model configuration for the custom model is missing. Use `--clear_all` and re-run.")
print("Loaded vmfbs from cache and successfully fetched base model configuration.")
if not need_vae_encode:
return vmfbs[:3]
return vmfbs
# Step 2:
@@ -363,7 +495,7 @@ class SharkifyStableDiffusionModel:
print("Compiling all the models with the fetched base model configuration.")
if args.ckpt_loc != "":
args.hf_model_id = base_model_fetched
return self.compile_all(base_model_fetched, need_vae_encode)
return self.compile_all(base_model_fetched, need_vae_encode, need_stencil)
# Step 3:
# -- This is the retry mechanism where the base model's configuration is not
@@ -372,9 +504,11 @@ class SharkifyStableDiffusionModel:
for model_id in base_models:
try:
if need_vae_encode:
compiled_clip, compiled_unet, compiled_vae, compiled_vae_encode = self.compile_all(model_id, need_vae_encode)
compiled_clip, compiled_unet, compiled_vae, compiled_vae_encode = self.compile_all(model_id, need_vae_encode, need_stencil)
elif need_stencil:
compiled_clip, compiled_unet, compiled_vae, compiled_controlnet = self.compile_all(model_id, need_vae_encode, need_stencil)
else:
compiled_clip, compiled_unet, compiled_vae = self.compile_all(model_id, need_vae_encode)
compiled_clip, compiled_unet, compiled_vae = self.compile_all(model_id, need_vae_encode, need_stencil)
except Exception as e:
print("Retrying with a different base model configuration")
continue
@@ -394,6 +528,13 @@ class SharkifyStableDiffusionModel:
compiled_vae,
compiled_vae_encode,
)
if need_stencil:
return (
compiled_clip,
compiled_unet,
compiled_vae,
compiled_controlnet,
)
return compiled_clip, compiled_unet, compiled_vae
sys.exit(
"Cannot compile the model. Please create an issue with the detailed log at https://github.com/nod-ai/SHARK/issues"


@@ -10,3 +10,6 @@ from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_inpaint
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_outpaint import (
OutpaintPipeline,
)
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_stencil import (
StencilPipeline,
)


@@ -112,6 +112,7 @@ class Image2ImagePipeline(StableDiffusionPipeline):
dtype,
use_base_vae,
cpu_scheduling,
use_stencil,
):
# prompts and negative prompts must be a list.
if isinstance(prompts, str):


@@ -0,0 +1,150 @@
import torch
import time
import numpy as np
from tqdm.auto import tqdm
from random import randint
from PIL import Image
from transformers import CLIPTokenizer
from typing import Union
from shark.shark_inference import SharkInference
from diffusers import (
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
)
from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
StableDiffusionPipeline,
)
from apps.stable_diffusion.src.utils import controlnet_hint_conversion
class StencilPipeline(StableDiffusionPipeline):
def __init__(
self,
controlnet: SharkInference,
vae: SharkInference,
text_encoder: SharkInference,
tokenizer: CLIPTokenizer,
unet: SharkInference,
scheduler: Union[
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
SharkEulerDiscreteScheduler,
],
):
super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
self.controlnet = controlnet
def prepare_latents(
self,
batch_size,
height,
width,
generator,
num_inference_steps,
dtype,
):
latents = torch.randn(
(
batch_size,
4,
height // 8,
width // 8,
),
generator=generator,
dtype=torch.float32,
).to(dtype)
self.scheduler.set_timesteps(num_inference_steps)
self.scheduler.is_scale_input_called = True
latents = latents * self.scheduler.init_noise_sigma
return latents
def generate_images(
self,
prompts,
neg_prompts,
image,
batch_size,
height,
width,
num_inference_steps,
strength,
guidance_scale,
seed,
max_length,
dtype,
use_base_vae,
cpu_scheduling,
use_stencil,
):
# Control Embedding check & conversion
# TODO: 1. Change `num_images_per_prompt`.
controlnet_hint = controlnet_hint_conversion(
image, use_stencil, height, width, dtype, num_images_per_prompt=1
)
# prompts and negative prompts must be a list.
if isinstance(prompts, str):
prompts = [prompts]
if isinstance(neg_prompts, str):
neg_prompts = [neg_prompts]
prompts = prompts * batch_size
neg_prompts = neg_prompts * batch_size
# seed generator to create the initial latent noise. Also handle out of range seeds.
uint32_info = np.iinfo(np.uint32)
uint32_min, uint32_max = uint32_info.min, uint32_info.max
if seed < uint32_min or seed >= uint32_max:
seed = randint(uint32_min, uint32_max)
generator = torch.manual_seed(seed)
# Get text embeddings from prompts
text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
# guidance scale as a float32 tensor.
guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
# Prepare initial latent.
init_latents = self.prepare_latents(
batch_size=batch_size,
height=height,
width=width,
generator=generator,
num_inference_steps=num_inference_steps,
dtype=dtype,
)
final_timesteps = self.scheduler.timesteps
# Get Image latents
latents = self.produce_stencil_latents(
latents=init_latents,
text_embeddings=text_embeddings,
guidance_scale=guidance_scale,
total_timesteps=final_timesteps,
dtype=dtype,
cpu_scheduling=cpu_scheduling,
controlnet_hint=controlnet_hint,
controlnet=self.controlnet,
)
# Img latents -> PIL images
all_imgs = []
for i in tqdm(range(0, latents.shape[0], batch_size)):
imgs = self.decode_latents(
latents=latents[i : i + batch_size],
use_base_vae=use_base_vae,
cpu_scheduling=cpu_scheduling,
)
all_imgs.extend(imgs)
return all_imgs


@@ -20,6 +20,9 @@ from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils i
StableDiffusionPipeline,
)
import cv2
from PIL import Image
class Text2ImagePipeline(StableDiffusionPipeline):
def __init__(


@@ -110,6 +110,118 @@ class StableDiffusionPipeline:
pil_images = [Image.fromarray(image) for image in images.numpy()]
return pil_images
def produce_stencil_latents(
self,
latents,
text_embeddings,
guidance_scale,
total_timesteps,
dtype,
cpu_scheduling,
controlnet_hint=None,
controlnet=None,
controlnet_conditioning_scale: float = 1.0,
mask=None,
masked_image_latents=None,
return_all_latents=False,
):
step_time_sum = 0
latent_history = [latents]
text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
text_embeddings_numpy = text_embeddings.detach().numpy()
for i, t in tqdm(enumerate(total_timesteps)):
step_start_time = time.time()
timestep = torch.tensor([t]).to(dtype)
latent_model_input = self.scheduler.scale_model_input(latents, t)
if mask is not None and masked_image_latents is not None:
latent_model_input = torch.cat(
[
torch.from_numpy(np.asarray(latent_model_input)),
mask,
masked_image_latents,
],
dim=1,
).to(dtype)
if cpu_scheduling:
latent_model_input = latent_model_input.detach().numpy()
if not torch.is_tensor(latent_model_input):
latent_model_input_1 = torch.from_numpy(
np.asarray(latent_model_input)
).to(dtype)
else:
latent_model_input_1 = latent_model_input
control = controlnet(
"forward",
(
latent_model_input_1,
timestep,
text_embeddings,
controlnet_hint,
),
send_to_host=False,
)
down_block_res_samples = control[0:12]
mid_block_res_sample = control[12:]
down_block_res_samples = [
down_block_res_sample * controlnet_conditioning_scale
for down_block_res_sample in down_block_res_samples
]
mid_block_res_sample = (
mid_block_res_sample[0] * controlnet_conditioning_scale
)
timestep = timestep.detach().numpy()
# Profiling Unet.
profile_device = start_profiling(file_path="unet.rdc")
# TODO: Pass `control` as it is to Unet. Same as TODO mentioned in model_wrappers.py.
noise_pred = self.unet(
"forward",
(
latent_model_input,
timestep,
text_embeddings_numpy,
guidance_scale,
down_block_res_samples[0],
down_block_res_samples[1],
down_block_res_samples[2],
down_block_res_samples[3],
down_block_res_samples[4],
down_block_res_samples[5],
down_block_res_samples[6],
down_block_res_samples[7],
down_block_res_samples[8],
down_block_res_samples[9],
down_block_res_samples[10],
down_block_res_samples[11],
mid_block_res_sample,
),
send_to_host=False,
)
end_profiling(profile_device)
if cpu_scheduling:
noise_pred = torch.from_numpy(noise_pred.to_host())
latents = self.scheduler.step(
noise_pred, t, latents
).prev_sample
else:
latents = self.scheduler.step(noise_pred, t, latents)
latent_history.append(latents)
step_time = (time.time() - step_start_time) * 1000
# self.log += (
# f"\nstep = {i} | timestep = {t} | time = {step_time:.2f}ms"
# )
step_time_sum += step_time
avg_step_time = step_time_sum / len(total_timesteps)
self.log += f"\nAverage step time: {avg_step_time}ms/it"
if not return_all_latents:
return latents
all_latents = torch.cat(latent_history, dim=0)
return all_latents
def produce_img_latents(
self,
latents,
@@ -205,6 +317,7 @@ class StableDiffusionPipeline:
use_base_vae: bool,
use_tuned: bool,
low_cpu_mem_usage: bool = False,
use_stencil: bool = False,
):
if import_mlir:
mlir_import = SharkifyStableDiffusionModel(
@@ -229,6 +342,11 @@ class StableDiffusionPipeline:
return cls(
vae_encode, vae, clip, get_tokenizer(), unet, scheduler
)
if cls.__name__ in ["StencilPipeline"]:
clip, unet, vae, controlnet = mlir_import()
return cls(
controlnet, vae, clip, get_tokenizer(), unet, scheduler
)
clip, unet, vae = mlir_import()
return cls(vae, clip, get_tokenizer(), unet, scheduler)
try:
@@ -245,6 +363,12 @@ class StableDiffusionPipeline:
get_unet(),
scheduler,
)
if cls.__name__ == "StencilPipeline":
import sys
sys.exit(
"StencilPipeline not supported with SharkTank currently."
)
return cls(
get_vae(), get_clip(), get_tokenizer(), get_unet(), scheduler
)
@@ -272,5 +396,10 @@ class StableDiffusionPipeline:
return cls(
vae_encode, vae, clip, get_tokenizer(), unet, scheduler
)
if cls.__name__ == "StencilPipeline":
clip, unet, vae, controlnet = mlir_import()
return cls(
controlnet, vae, clip, get_tokenizer(), unet, scheduler
)
clip, unet, vae = mlir_import()
return cls(vae, clip, get_tokenizer(), unet, scheduler)


@@ -11,6 +11,9 @@ from apps.stable_diffusion.src.utils.resources import (
)
from apps.stable_diffusion.src.utils.sd_annotation import sd_model_annotation
from apps.stable_diffusion.src.utils.stable_args import args
from apps.stable_diffusion.src.utils.stencils.stencil_utils import (
controlnet_hint_conversion,
)
from apps.stable_diffusion.src.utils.utils import (
get_shark_model,
compile_through_fx,


@@ -85,6 +85,116 @@
"dtype": "f32"
}
},
"stencil_adaptor": {
"latents": {
"shape": [
"1*batch_size",
4,
"height",
"width"
],
"dtype": "f32"
},
"timesteps": {
"shape": [
1
],
"dtype": "f32"
},
"embedding": {
"shape": [
"2*batch_size",
"max_len",
768
],
"dtype": "f32"
},
"controlnet_hint": {
"shape": [1, 3, 512, 512],
"dtype": "f32"
}
},
"stencil_unet": {
"latents": {
"shape": [
"1*batch_size",
4,
"height",
"width"
],
"dtype": "f32"
},
"timesteps": {
"shape": [
1
],
"dtype": "f32"
},
"embedding": {
"shape": [
"2*batch_size",
"max_len",
768
],
"dtype": "f32"
},
"guidance_scale": {
"shape": 2,
"dtype": "f32"
},
"control1": {
"shape": [2, 320, 64, 64],
"dtype": "f32"
},
"control2": {
"shape": [2, 320, 64, 64],
"dtype": "f32"
},
"control3": {
"shape": [2, 320, 64, 64],
"dtype": "f32"
},
"control4": {
"shape": [2, 320, 32, 32],
"dtype": "f32"
},
"control5": {
"shape": [2, 640, 32, 32],
"dtype": "f32"
},
"control6": {
"shape": [2, 640, 32, 32],
"dtype": "f32"
},
"control7": {
"shape": [2, 640, 16, 16],
"dtype": "f32"
},
"control8": {
"shape": [2, 1280, 16, 16],
"dtype": "f32"
},
"control9": {
"shape": [2, 1280, 16, 16],
"dtype": "f32"
},
"control10": {
"shape": [2, 1280, 8, 8],
"dtype": "f32"
},
"control11": {
"shape": [2, 1280, 8, 8],
"dtype": "f32"
},
"control12": {
"shape": [2, 1280, 8, 8],
"dtype": "f32"
},
"control13": {
"shape": [2, 1280, 8, 8],
"dtype": "f32"
}
},
"vae_encode": {
"image" : {
"shape" : [
@@ -223,4 +333,4 @@
}
}
}
}
}


@@ -272,6 +272,12 @@ p.add_argument(
help="Amount of attention slicing to use (one of 'max', 'auto', 'none', or an integer)",
)
p.add_argument(
"--use_stencil",
choices=["canny"],
help="Enable the stencil feature.",
)
##############################################################################
### IREE - Vulkan supported flags
##############################################################################


@@ -0,0 +1,6 @@
import cv2
class CannyDetector:
def __call__(self, img, low_threshold, high_threshold):
return cv2.Canny(img, low_threshold, high_threshold)
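A minimal usage sketch for the detector above (the input array and thresholds are made up; cv2.Canny accepts an 8-bit image and returns a single-channel uint8 edge map):

    import numpy as np

    # Hypothetical 512x512 3-channel uint8 image; a real edge map needs real content.
    img = np.zeros((512, 512, 3), dtype=np.uint8)
    detector = CannyDetector()
    edges = detector(img, low_threshold=100, high_threshold=200)  # HxW uint8 edge map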


@@ -0,0 +1,155 @@
import cv2
import numpy as np
from PIL import Image
import torch
from apps.stable_diffusion.src.utils.stencils.canny import CannyDetector
stencil = {}
def HWC3(x):
assert x.dtype == np.uint8
if x.ndim == 2:
x = x[:, :, None]
assert x.ndim == 3
H, W, C = x.shape
assert C == 1 or C == 3 or C == 4
if C == 3:
return x
if C == 1:
return np.concatenate([x, x, x], axis=2)
if C == 4:
color = x[:, :, 0:3].astype(np.float32)
alpha = x[:, :, 3:4].astype(np.float32) / 255.0
y = color * alpha + 255.0 * (1.0 - alpha)
y = y.clip(0, 255).astype(np.uint8)
return y
def resize_image(input_image, resolution):
H, W, C = input_image.shape
H = float(H)
W = float(W)
k = float(resolution) / min(H, W)
H *= k
W *= k
H = int(np.round(H / 64.0)) * 64
W = int(np.round(W / 64.0)) * 64
img = cv2.resize(
input_image,
(W, H),
interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA,
)
return img
def controlnet_hint_shaping(
controlnet_hint, height, width, dtype, num_images_per_prompt=1
):
channels = 3
if isinstance(controlnet_hint, torch.Tensor):
# torch.Tensor: acceptable shapes are any of chw, bchw (b==1) or bchw (b==num_images_per_prompt)
shape_chw = (channels, height, width)
shape_bchw = (1, channels, height, width)
shape_nchw = (num_images_per_prompt, channels, height, width)
if controlnet_hint.shape in [shape_chw, shape_bchw, shape_nchw]:
controlnet_hint = controlnet_hint.to(
dtype=dtype, device=torch.device("cpu")
)
if controlnet_hint.shape != shape_nchw:
controlnet_hint = controlnet_hint.repeat(
num_images_per_prompt, 1, 1, 1
)
return controlnet_hint
else:
raise ValueError(
f"Acceptble shape of `stencil` are any of ({channels}, {height}, {width}),"
+ f" (1, {channels}, {height}, {width}) or ({num_images_per_prompt}, "
+ f"{channels}, {height}, {width}) but is {controlnet_hint.shape}"
)
elif isinstance(controlnet_hint, np.ndarray):
# np.ndarray: acceptable shapes are any of hw, hwc, bhwc (b==1) or bhwc (b==num_images_per_prompt)
# hwc is an OpenCV-compatible image format; color channels must be in BGR format.
if controlnet_hint.shape == (height, width):
controlnet_hint = np.repeat(
controlnet_hint[:, :, np.newaxis], channels, axis=2
) # hw -> hwc(c==3)
shape_hwc = (height, width, channels)
shape_bhwc = (1, height, width, channels)
shape_nhwc = (num_images_per_prompt, height, width, channels)
if controlnet_hint.shape in [shape_hwc, shape_bhwc, shape_nhwc]:
controlnet_hint = torch.from_numpy(controlnet_hint.copy())
controlnet_hint = controlnet_hint.to(
dtype=dtype, device=torch.device("cpu")
)
controlnet_hint /= 255.0
if controlnet_hint.shape != shape_nhwc:
controlnet_hint = controlnet_hint.repeat(
num_images_per_prompt, 1, 1, 1
)
controlnet_hint = controlnet_hint.permute(
0, 3, 1, 2
) # b h w c -> b c h w
return controlnet_hint
else:
raise ValueError(
f"Acceptble shape of `stencil` are any of ({width}, {channels}), "
+ f"({height}, {width}, {channels}), "
+ f"(1, {height}, {width}, {channels}) or "
+ f"({num_images_per_prompt}, {channels}, {height}, {width}) but is {controlnet_hint.shape}"
)
elif isinstance(controlnet_hint, Image.Image):
if controlnet_hint.size == (width, height):
controlnet_hint = controlnet_hint.convert(
"RGB"
) # make sure 3 channel RGB format
controlnet_hint = np.array(controlnet_hint) # to numpy
controlnet_hint = controlnet_hint[:, :, ::-1] # RGB -> BGR
return controlnet_hint_shaping(
controlnet_hint, height, width, dtype, num_images_per_prompt
)
else:
raise ValueError(
f"Acceptable image size of `stencil` is ({width}, {height}) but is {controlnet_hint.size}"
)
else:
raise ValueError(
f"Acceptable type of `stencil` are any of torch.Tensor, np.ndarray, PIL.Image.Image but is {type(controlnet_hint)}"
)
def controlnet_hint_conversion(
image, use_stencil, height, width, dtype, num_images_per_prompt=1
):
controlnet_hint = None
match use_stencil:
case "canny":
print("Detecting edge with canny")
controlnet_hint = hint_canny(image, width)
case _:
return None
controlnet_hint = controlnet_hint_shaping(
controlnet_hint, height, width, dtype, num_images_per_prompt
)
return controlnet_hint
# Stencil 1. Canny
def hint_canny(
image: Image.Image,
width=512,
height=512,
low_threshold=100,
high_threshold=200,
):
with torch.no_grad():
input_image = np.array(image)
image_resolution = width
img = resize_image(HWC3(input_image), image_resolution)
if not "canny" in stencil:
stencil["canny"] = CannyDetector()
detected_map = stencil["canny"](img, low_threshold, high_threshold)
detected_map = HWC3(detected_map)
return detected_map
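A hedged usage sketch for the helpers in this file, assuming a hypothetical 512x512 RGB input so the detected edge map matches the requested height and width:

    from PIL import Image
    import torch

    image = Image.open("pose.png").convert("RGB")  # hypothetical input image
    hint = controlnet_hint_conversion(
        image, "canny", height=512, width=512, dtype=torch.float32
    )
    # hint is a (1, 3, 512, 512) float tensor scaled to [0, 1], matching the
    # "controlnet_hint" shape declared in the resources JSON; it is None for an
    # unrecognized use_stencil value.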


@@ -450,6 +450,7 @@ def preprocessCKPT(custom_weights):
def load_vmfb(vmfb_path, model, precision):
model = "vae" if "base_vae" in model or "vae_encode" in model else model
model = "unet" if "stencil" in model else model
precision = "fp32" if "clip" in model else precision
extra_args = get_opt_flags(model, precision)
shark_module = SharkInference(mlir_module=None, device=args.device)
@@ -459,32 +460,28 @@ def load_vmfb(vmfb_path, model, precision):
# This utility returns vmfbs of Clip, Unet, Vae and Vae_encode, in case all of them
# are present; deletes them otherwise.
def fetch_or_delete_vmfbs(
extended_model_name, need_vae_encode, precision="fp32"
):
def fetch_or_delete_vmfbs(extended_model_name, precision="fp32"):
vmfb_path = [
get_vmfb_path_name(extended_model_name[model])
for model in extended_model_name
]
number_of_vmfbs = len(vmfb_path)
vmfb_present = [os.path.isfile(vmfb) for vmfb in vmfb_path]
all_vmfb_present = True
compiled_models = []
for i in range(3):
compiled_models = [None] * number_of_vmfbs
for i in range(number_of_vmfbs):
all_vmfb_present = all_vmfb_present and vmfb_present[i]
compiled_models.append(None)
if need_vae_encode:
all_vmfb_present = all_vmfb_present and vmfb_present[3]
compiled_models.append(None)
# We need to delete vmfbs only if some of the models were compiled.
if not all_vmfb_present:
for i in range(len(compiled_models)):
for i in range(number_of_vmfbs):
if vmfb_present[i]:
os.remove(vmfb_path[i])
print("Deleted: ", vmfb_path[i])
else:
model_name = [model for model in extended_model_name.keys()]
for i in range(len(compiled_models)):
for i in range(number_of_vmfbs):
compiled_models[i] = load_vmfb(
vmfb_path[i], model_name[i], precision
)


@@ -79,6 +79,13 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
height=300
)
with gr.Accordion(label="Stencil Options", open=False):
with gr.Row():
use_stencil = gr.Dropdown(
label="Stencil model",
value="None",
choices=["None", "canny"],
)
with gr.Accordion(label="Advanced Options", open=False):
with gr.Row():
scheduler = gr.Dropdown(
@@ -116,7 +123,7 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
"fp16",
"fp32",
],
visible=False,
visible=True,
)
max_length = gr.Radio(
label="Max Length",
@@ -221,6 +228,7 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
precision,
device,
max_length,
use_stencil,
save_metadata_to_json,
save_metadata_to_png,
],