Fix Pascal (#1346)

* Add fp32 for upscaler VAE
* Plumb Pascal vulkan support
Add fp32 for upscaler VAE (#1345)
2026-04-20 03:00:34 -04:00 · 2023-04-23 20:28:25 -07:00 · 2023-04-23 15:27:55 -07:00 · 2023-04-23 11:48:06 -07:00 · 2023-04-22 20:13:56 -05:00 · 2023-04-22 15:23:41 -07:00
18 changed files with 1239 additions and 639 deletions
--- a/apps/language_models/scripts/stablelm.py
+++ b/apps/language_models/scripts/stablelm.py
@@ -0,0 +1,207 @@
+import torch
+import shark
+from shark.shark_importer import import_with_fx
+from shark.shark_inference import SharkInference
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    StoppingCriteria,
+    StoppingCriteriaList,
+)
+import torch_mlir
+from apps.stable_diffusion.src.utils import (
+    base_models,
+    get_opt_flags,
+    get_vmfb_path_name,
+)
+from apps.stable_diffusion.src.models.model_wrappers import replace_shape_str
+import os
+from io import BytesIO
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "stabilityai/stablelm-tuned-alpha-7b"
+)
+
+
+class StopOnTokens(StoppingCriteria):
+    # Stopping criterion: returns True as soon as the most recent token of
+    # the first sequence in `input_ids` is one of the hard-coded stop ids.
+    # `scores` and `**kwargs` are accepted but not used by this check.
+    # NOTE(review): this class is not referenced anywhere else in the
+    # script as shown here.
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
+    ) -> bool:
+        # Presumably the tokenizer's special/control token ids (e.g. the
+        # <|SYSTEM|>/<|USER|>/<|ASSISTANT|> markers, EOS, padding) -- TODO
+        # confirm against the stablelm-tuned-alpha-7b tokenizer.
+        stop_ids = [50278, 50279, 50277, 1, 0]
+        for stop_id in stop_ids:
+            # Only row 0 is inspected; any further rows are ignored.
+            if input_ids[0][-1] == stop_id:
+                return True
+        return False
+
+
+system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
+- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
+- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
+- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
+- StableLM will refuse to participate in anything that could harm a human.
+"""
+
+prompt = f"{system_prompt}<|USER|>What's your mood today?<|ASSISTANT|>"
+
+inputs = tokenizer(prompt, return_tensors="pt")
+
+
+class SLM(torch.nn.Module):
+    # Thin torch.nn.Module wrapper around the pretrained causal LM so that
+    # forward() takes two positional tensors and returns a single value,
+    # which is the form traced by make_fx further down in this script.
+    def __init__(self):
+        super().__init__()
+        # Loads the pretrained weights when the module is constructed.
+        self.model = AutoModelForCausalLM.from_pretrained(
+            "stabilityai/stablelm-tuned-alpha-7b"
+        )
+
+    def forward(self, input_ids, attention_mask):
+        # Both tensors are forwarded positionally; only element 0 of the
+        # model output is returned (presumably the logits -- TODO confirm
+        # against the transformers output type for this model).
+        return self.model(input_ids, attention_mask)[0]
+
+
+slm_model = SLM()
+
+res_pytorch = slm_model(inputs["input_ids"], inputs["attention_mask"])
+
+import torch
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch._decomp import get_decompositions
+from typing import List
+
+fx_g = make_fx(
+    slm_model,
+    decomposition_table=get_decompositions(
+        [
+            torch.ops.aten.embedding_dense_backward,
+            torch.ops.aten.native_layer_norm_backward,
+            torch.ops.aten.slice_backward,
+            torch.ops.aten.select_backward,
+            torch.ops.aten.norm.ScalarOpt_dim,
+            torch.ops.aten.native_group_norm,
+            torch.ops.aten.upsample_bilinear2d.vec,
+            torch.ops.aten.split.Tensor,
+            torch.ops.aten.split_with_sizes,
+        ]
+    ),
+)(inputs["input_ids"], inputs["attention_mask"])
+
+
+# Strip `None` entries from the graph's output list/tuple.
+#
+# Args:
+#     fx_g: graph module whose output node is rewritten in place.
+# Returns:
+#     The positions (ascending) that held `None` in the original output
+#     sequence; empty if nothing was removed or the output is not a
+#     list/tuple.
+def _remove_nones(fx_g: torch.fx.GraphModule) -> List[int]:
+    removed_indexes = []
+    for node in fx_g.graph.nodes:
+        if node.op == "output":
+            assert (
+                len(node.args) == 1
+            ), "Output node must have a single argument"
+            node_arg = node.args[0]
+            if isinstance(node_arg, (list, tuple)):
+                # Work on a mutable copy so entries can be popped.
+                node_arg = list(node_arg)
+                node_args_len = len(node_arg)
+                # Walk from the last index to the first so that popping an
+                # entry never shifts the positions still to be visited.
+                for i in range(node_args_len):
+                    curr_index = node_args_len - (i + 1)
+                    if node_arg[curr_index] is None:
+                        removed_indexes.append(curr_index)
+                        node_arg.pop(curr_index)
+                # The output is always written back as a tuple, even when
+                # it was a list and even when nothing was removed.
+                node.args = (tuple(node_arg),)
+                break
+
+    # Only re-validate and regenerate code if the output actually changed.
+    if len(removed_indexes) > 0:
+        fx_g.graph.lint()
+        fx_g.graph.eliminate_dead_code()
+        fx_g.recompile()
+    # Indexes were collected back-to-front; return them in ascending order.
+    removed_indexes.sort()
+    return removed_indexes
+
+
+def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule) -> bool:
+    """
+    Replace tuple with tuple element in functions that return one-element tuples.
+    Returns true if an unwrapping took place, and false otherwise.
+    """
+    unwrapped_tuple = False
+    for node in fx_g.graph.nodes:
+        if node.op == "output":
+            assert (
+                len(node.args) == 1
+            ), "Output node must have a single argument"
+            node_arg = node.args[0]
+            # Only an actual `tuple` is unwrapped here; a one-element list
+            # output is left untouched.
+            if isinstance(node_arg, tuple):
+                if len(node_arg) == 1:
+                    # Return the sole element directly instead of (elem,).
+                    node.args = (node_arg[0],)
+                    unwrapped_tuple = True
+                    break
+
+    # Re-validate and regenerate the module code only when the output node
+    # was rewritten.
+    if unwrapped_tuple:
+        fx_g.graph.lint()
+        fx_g.recompile()
+    return unwrapped_tuple
+
+
+# Rewrite the graph in place so that every `aten.empty` call is followed by
+# an in-place `aten.zero_` on its result, and all former users of the empty
+# tensor consume the zeroed tensor instead. The graph is linted at the end
+# but NOT recompiled here; the caller is expected to call recompile().
+def transform_fx(fx_g):
+    for node in fx_g.graph.nodes:
+        if node.op == "call_function":
+            # NOTE(review): targets are compared against the overload packet
+            # `torch.ops.aten.empty`; if the traced graph stores specific
+            # overloads as targets this test may never match -- TODO confirm.
+            if node.target in [
+                torch.ops.aten.empty,
+            ]:
+                # aten.empty should be filled with zeros.
+                # NOTE(review): this inner test repeats the outer membership
+                # check and is always true at this point.
+                if node.target in [torch.ops.aten.empty]:
+                    with fx_g.graph.inserting_after(node):
+                        new_node = fx_g.graph.call_function(
+                            torch.ops.aten.zero_,
+                            args=(node,),
+                        )
+                        node.append(new_node)
+                        # Redirect every user of the empty tensor to the
+                        # zeroed one ...
+                        node.replace_all_uses_with(new_node)
+                        # ... then restore new_node's own argument, since
+                        # the redirect above also rewrote it (new_node is
+                        # itself a user of `node`).
+                        new_node.args = (node,)
+
+    fx_g.graph.lint()
+
+
+# Post-process the traced graph: zero-fill empties, then normalise the
+# output node (drop None entries, unwrap a one-element tuple).
+transform_fx(fx_g)
+# transform_fx only lints, so regenerate the module code explicitly.
+fx_g.recompile()
+# NOTE(review): neither return value below is read again in the script as
+# shown here.
+removed_none_indexes = _remove_nones(fx_g)
+was_unwrapped = _unwrap_single_tuple_return(fx_g)
+
+# Install a plain CodeGen instance on the graph and regenerate the code
+# (presumably to drop any custom codegen attached during tracing so the
+# module can be scripted -- TODO confirm).
+fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
+fx_g.recompile()
+
+
+def strip_overloads(gm):
+    """
+    Modifies the target of graph nodes in :attr:`gm` to strip overloads.
+    Args:
+        gm(fx.GraphModule): The input Fx graph module to be modified
+    """
+    for node in gm.graph.nodes:
+        # Nodes whose target is a specific OpOverload are retargeted to the
+        # enclosing overload packet; every other node is left unchanged.
+        if isinstance(node.target, torch._ops.OpOverload):
+            node.target = node.target.overloadpacket
+    # Regenerate the module's code so it reflects the new targets.
+    gm.recompile()
+
+
+strip_overloads(fx_g)
+
+# Script the cleaned-up FX graph so it can be handed to torch-mlir.
+ts_g = torch.jit.script(fx_g)
+
+# Lower the scripted module, using the same example inputs that were used
+# for tracing, and request linalg-on-tensors output.
+module = torch_mlir.compile(
+    ts_g,
+    [inputs["input_ids"], inputs["attention_mask"]],
+    torch_mlir.OutputType.LINALG_ON_TENSORS,
+    use_tracing=False,
+    verbose=False,
+)
+
+# Serialise the MLIR module to bytecode in memory (no file is written).
+bytecode_stream = BytesIO()
+module.operation.write_bytecode(bytecode_stream)
+bytecode = bytecode_stream.getvalue()
+
+# NOTE(review): the target device is hard-coded to "cuda" here.
+shark_module = SharkInference(
+    mlir_module=bytecode, device="cuda", mlir_dialect="tm_tensor"
+)
+shark_module.compile()
+
+# Run the compiled "forward" entry point on the same inputs as the eager
+# PyTorch run above.
+result_shark = shark_module(
+    "forward", [inputs["input_ids"], inputs["attention_mask"]]
+)
+
+# Print both results for manual comparison; no numerical check is made.
+print("Result PyTorch")
+print(res_pytorch)
+print("Result SHARK")
+print(result_shark)
--- a/apps/stable_diffusion/scripts/init.py
+++ b/apps/stable_diffusion/scripts/init.py
@@ -1,4 +1 @@
-from apps.stable_diffusion.scripts.inpaint import inpaint_inf
-from apps.stable_diffusion.scripts.outpaint import outpaint_inf
-from apps.stable_diffusion.scripts.upscaler import upscaler_inf
 from apps.stable_diffusion.scripts.train_lora_word import lora_train
--- a/apps/stable_diffusion/scripts/inpaint.py
+++ b/apps/stable_diffusion/scripts/inpaint.py
@@ -14,192 +14,6 @@ from apps.stable_diffusion.src import (
 from apps.stable_diffusion.src.utils import get_generation_text_info


-# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
-init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
-init_use_tuned = args.use_tuned
-init_import_mlir = args.import_mlir
-
-
-# Exposed to UI.
-def inpaint_inf(
-    prompt: str,
-    negative_prompt: str,
-    image_dict,
-    height: int,
-    width: int,
-    inpaint_full_res: bool,
-    inpaint_full_res_padding: int,
-    steps: int,
-    guidance_scale: float,
-    seed: int,
-    batch_count: int,
-    batch_size: int,
-    scheduler: str,
-    custom_model: str,
-    hf_model_id: str,
-    custom_vae: str,
-    precision: str,
-    device: str,
-    max_length: int,
-    save_metadata_to_json: bool,
-    save_metadata_to_png: bool,
-    lora_weights: str,
-    lora_hf_id: str,
-    ondemand: bool,
-):
-    from apps.stable_diffusion.web.ui.utils import (
-        get_custom_model_pathfile,
-        get_custom_vae_or_lora_weights,
-        Config,
-    )
-    import apps.stable_diffusion.web.utils.global_obj as global_obj
-    from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
-        SD_STATE_CANCEL,
-    )
-
-    args.prompts = [prompt]
-    args.negative_prompts = [negative_prompt]
-    args.guidance_scale = guidance_scale
-    args.steps = steps
-    args.scheduler = scheduler
-    args.img_path = "not none"
-    args.mask_path = "not none"
-    args.ondemand = ondemand
-
-    # set ckpt_loc and hf_model_id.
-    args.ckpt_loc = ""
-    args.hf_model_id = ""
-    args.custom_vae = ""
-    if custom_model == "None":
-        if not hf_model_id:
-            return (
-                None,
-                "Please provide either custom model or huggingface model ID, both must not be empty",
-            )
-        args.hf_model_id = hf_model_id
-    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
-        args.ckpt_loc = get_custom_model_pathfile(custom_model)
-    else:
-        args.hf_model_id = custom_model
-    if custom_vae != "None":
-        args.custom_vae = get_custom_model_pathfile(custom_vae, model="vae")
-
-    args.use_lora = get_custom_vae_or_lora_weights(
-        lora_weights, lora_hf_id, "lora"
-    )
-
-    args.save_metadata_to_json = save_metadata_to_json
-    args.write_metadata_to_png = save_metadata_to_png
-
-    dtype = torch.float32 if precision == "fp32" else torch.half
-    cpu_scheduling = not scheduler.startswith("Shark")
-    new_config_obj = Config(
-        "inpaint",
-        args.hf_model_id,
-        args.ckpt_loc,
-        args.custom_vae,
-        precision,
-        batch_size,
-        max_length,
-        height,
-        width,
-        device,
-        use_lora=args.use_lora,
-        use_stencil=None,
-        ondemand=ondemand,
-    )
-    if (
-        not global_obj.get_sd_obj()
-        or global_obj.get_cfg_obj() != new_config_obj
-    ):
-        global_obj.clear_cache()
-        global_obj.set_cfg_obj(new_config_obj)
-        args.precision = precision
-        args.batch_count = batch_count
-        args.batch_size = batch_size
-        args.max_length = max_length
-        args.height = height
-        args.width = width
-        args.device = device.split("=>", 1)[1].strip()
-        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
-        args.use_tuned = init_use_tuned
-        args.import_mlir = init_import_mlir
-        set_init_device_flags()
-        model_id = (
-            args.hf_model_id
-            if args.hf_model_id
-            else "stabilityai/stable-diffusion-2-inpainting"
-        )
-        global_obj.set_schedulers(get_schedulers(model_id))
-        scheduler_obj = global_obj.get_scheduler(scheduler)
-        global_obj.set_sd_obj(
-            InpaintPipeline.from_pretrained(
-                scheduler=scheduler_obj,
-                import_mlir=args.import_mlir,
-                model_id=args.hf_model_id,
-                ckpt_loc=args.ckpt_loc,
-                custom_vae=args.custom_vae,
-                precision=args.precision,
-                max_length=args.max_length,
-                batch_size=args.batch_size,
-                height=args.height,
-                width=args.width,
-                use_base_vae=args.use_base_vae,
-                use_tuned=args.use_tuned,
-                low_cpu_mem_usage=args.low_cpu_mem_usage,
-                debug=args.import_debug if args.import_mlir else False,
-                use_lora=args.use_lora,
-                ondemand=args.ondemand,
-            )
-        )
-
-    global_obj.set_sd_scheduler(scheduler)
-
-    start_time = time.time()
-    global_obj.get_sd_obj().log = ""
-    generated_imgs = []
-    seeds = []
-    img_seed = utils.sanitize_seed(seed)
-    image = image_dict["image"]
-    mask_image = image_dict["mask"]
-    text_output = ""
-    for i in range(batch_count):
-        if i > 0:
-            img_seed = utils.sanitize_seed(-1)
-        out_imgs = global_obj.get_sd_obj().generate_images(
-            prompt,
-            negative_prompt,
-            image,
-            mask_image,
-            batch_size,
-            height,
-            width,
-            inpaint_full_res,
-            inpaint_full_res_padding,
-            steps,
-            guidance_scale,
-            img_seed,
-            args.max_length,
-            dtype,
-            args.use_base_vae,
-            cpu_scheduling,
-        )
-        seeds.append(img_seed)
-        total_time = time.time() - start_time
-        text_output = get_generation_text_info(seeds, device)
-        text_output += "\n" + global_obj.get_sd_obj().log
-        text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
-
-        if global_obj.get_sd_status() == SD_STATE_CANCEL:
-            break
-        else:
-            save_output_img(out_imgs[0], img_seed)
-            generated_imgs.extend(out_imgs)
-            yield generated_imgs, text_output
-
-    return generated_imgs, text_output
-
-
 def main():
    if args.clear_all:
        clear_all()
--- a/apps/stable_diffusion/scripts/outpaint.py
+++ b/apps/stable_diffusion/scripts/outpaint.py
@@ -11,202 +11,6 @@ from apps.stable_diffusion.src import (
    clear_all,
    save_output_img,
 )
-from apps.stable_diffusion.src.utils import get_generation_text_info
-
-
-# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
-init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
-init_use_tuned = args.use_tuned
-init_import_mlir = args.import_mlir
-
-
-# Exposed to UI.
-def outpaint_inf(
-    prompt: str,
-    negative_prompt: str,
-    init_image,
-    pixels: int,
-    mask_blur: int,
-    directions: list,
-    noise_q: float,
-    color_variation: float,
-    height: int,
-    width: int,
-    steps: int,
-    guidance_scale: float,
-    seed: int,
-    batch_count: int,
-    batch_size: int,
-    scheduler: str,
-    custom_model: str,
-    hf_model_id: str,
-    custom_vae: str,
-    precision: str,
-    device: str,
-    max_length: int,
-    save_metadata_to_json: bool,
-    save_metadata_to_png: bool,
-    lora_weights: str,
-    lora_hf_id: str,
-    ondemand: bool,
-):
-    from apps.stable_diffusion.web.ui.utils import (
-        get_custom_model_pathfile,
-        get_custom_vae_or_lora_weights,
-        Config,
-    )
-    import apps.stable_diffusion.web.utils.global_obj as global_obj
-    from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
-        SD_STATE_CANCEL,
-    )
-
-    args.prompts = [prompt]
-    args.negative_prompts = [negative_prompt]
-    args.guidance_scale = guidance_scale
-    args.steps = steps
-    args.scheduler = scheduler
-    args.img_path = "not none"
-    args.ondemand = ondemand
-
-    # set ckpt_loc and hf_model_id.
-    args.ckpt_loc = ""
-    args.hf_model_id = ""
-    args.custom_vae = ""
-    if custom_model == "None":
-        if not hf_model_id:
-            return (
-                None,
-                "Please provide either custom model or huggingface model ID, both must not be empty",
-            )
-        args.hf_model_id = hf_model_id
-    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
-        args.ckpt_loc = get_custom_model_pathfile(custom_model)
-    else:
-        args.hf_model_id = custom_model
-    if custom_vae != "None":
-        args.custom_vae = get_custom_model_pathfile(custom_vae, model="vae")
-
-    args.use_lora = get_custom_vae_or_lora_weights(
-        lora_weights, lora_hf_id, "lora"
-    )
-
-    args.save_metadata_to_json = save_metadata_to_json
-    args.write_metadata_to_png = save_metadata_to_png
-
-    dtype = torch.float32 if precision == "fp32" else torch.half
-    cpu_scheduling = not scheduler.startswith("Shark")
-    new_config_obj = Config(
-        "outpaint",
-        args.hf_model_id,
-        args.ckpt_loc,
-        args.custom_vae,
-        precision,
-        batch_size,
-        max_length,
-        height,
-        width,
-        device,
-        use_lora=args.use_lora,
-        use_stencil=None,
-        ondemand=ondemand,
-    )
-    if (
-        not global_obj.get_sd_obj()
-        or global_obj.get_cfg_obj() != new_config_obj
-    ):
-        global_obj.clear_cache()
-        global_obj.set_cfg_obj(new_config_obj)
-        args.precision = precision
-        args.batch_count = batch_count
-        args.batch_size = batch_size
-        args.max_length = max_length
-        args.height = height
-        args.width = width
-        args.device = device.split("=>", 1)[1].strip()
-        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
-        args.use_tuned = init_use_tuned
-        args.import_mlir = init_import_mlir
-        set_init_device_flags()
-        model_id = (
-            args.hf_model_id
-            if args.hf_model_id
-            else "stabilityai/stable-diffusion-2-inpainting"
-        )
-        global_obj.set_schedulers(get_schedulers(model_id))
-        scheduler_obj = global_obj.get_scheduler(scheduler)
-        global_obj.set_sd_obj(
-            OutpaintPipeline.from_pretrained(
-                scheduler_obj,
-                args.import_mlir,
-                args.hf_model_id,
-                args.ckpt_loc,
-                args.custom_vae,
-                args.precision,
-                args.max_length,
-                args.batch_size,
-                args.height,
-                args.width,
-                args.use_base_vae,
-                args.use_tuned,
-                use_lora=args.use_lora,
-                ondemand=args.ondemand,
-            )
-        )
-
-    global_obj.set_sd_scheduler(scheduler)
-
-    start_time = time.time()
-    global_obj.get_sd_obj().log = ""
-    generated_imgs = []
-    seeds = []
-    img_seed = utils.sanitize_seed(seed)
-
-    left = True if "left" in directions else False
-    right = True if "right" in directions else False
-    top = True if "up" in directions else False
-    bottom = True if "down" in directions else False
-
-    text_output = ""
-    for i in range(batch_count):
-        if i > 0:
-            img_seed = utils.sanitize_seed(-1)
-        out_imgs = global_obj.get_sd_obj().generate_images(
-            prompt,
-            negative_prompt,
-            init_image,
-            pixels,
-            mask_blur,
-            left,
-            right,
-            top,
-            bottom,
-            noise_q,
-            color_variation,
-            batch_size,
-            height,
-            width,
-            steps,
-            guidance_scale,
-            img_seed,
-            args.max_length,
-            dtype,
-            args.use_base_vae,
-            cpu_scheduling,
-        )
-        seeds.append(img_seed)
-        total_time = time.time() - start_time
-        text_output = get_generation_text_info(seeds, device)
-        text_output += "\n" + global_obj.get_sd_obj().log
-        text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
-
-        if global_obj.get_sd_status() == SD_STATE_CANCEL:
-            break
-        else:
-            save_output_img(out_imgs[0], img_seed)
-            generated_imgs.extend(out_imgs)
-            yield generated_imgs, text_output
-
-    return generated_imgs, text_output


 def main():
--- a/apps/stable_diffusion/scripts/tuner.py
+++ b/apps/stable_diffusion/scripts/tuner.py
@@ -54,12 +54,19 @@ def main():
    # Get device and device specific arguments
    device, device_spec_args = get_device_args()
    device_spec = ""
+    vulkan_target_triple = ""
    if device_spec_args:
        device_spec = device_spec_args[-1].split("=")[-1].strip()
        if device == "vulkan":
+            vulkan_target_triple = device_spec
            device_spec = device_spec.split("-")[0]

    # Add winograd annotation for vulkan device
+    use_winograd = (
+        True
+        if device == "vulkan" and args.annotation_model in ["unet", "vae"]
+        else False
+    )
    winograd_config = (
        load_winograd_configs()
        if device == "vulkan" and args.annotation_model in ["unet", "vae"]
@@ -71,19 +78,23 @@ def main():
            input_contents=mlir_module,
            config_path=winograd_config,
            search_op="conv",
-            winograd=True,
+            winograd=use_winograd,
        )

    # Dump model dispatches
-    if device == "vulkan" and device_spec == "rdna3":
-        device = "vulkan/RX 7900"
    generates_dir = Path.home() / "tmp"
    if not os.path.exists(generates_dir):
        os.makedirs(generates_dir)
    dump_mlir = generates_dir / "temp.mlir"
    dispatch_dir = generates_dir / f"{model_name}_{device_spec}_dispatches"
    export_module_to_mlir_file(input_module, dump_mlir)
-    dump_dispatches(dump_mlir, device, dispatch_dir, False)
+    dump_dispatches(
+        dump_mlir,
+        device,
+        dispatch_dir,
+        vulkan_target_triple,
+        use_winograd=use_winograd,
+    )

    # Tune each dispatch
    dtype = "f16" if args.precision == "fp16" else "f32"
@@ -106,6 +117,7 @@ def main():
            batch_size=1,
            config_filename=config_filename,
            use_dispatch=True,
+            vulkan_target_triple=vulkan_target_triple,
        )
        tuner.tune()

--- a/apps/stable_diffusion/scripts/upscaler.py
+++ b/apps/stable_diffusion/scripts/upscaler.py
@@ -13,198 +13,6 @@ from apps.stable_diffusion.src import (
 )


-# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
-init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
-init_use_tuned = args.use_tuned
-init_import_mlir = args.import_mlir
-
-
-# Exposed to UI.
-def upscaler_inf(
-    prompt: str,
-    negative_prompt: str,
-    init_image,
-    height: int,
-    width: int,
-    steps: int,
-    noise_level: int,
-    guidance_scale: float,
-    seed: int,
-    batch_count: int,
-    batch_size: int,
-    scheduler: str,
-    custom_model: str,
-    hf_model_id: str,
-    custom_vae: str,
-    precision: str,
-    device: str,
-    max_length: int,
-    save_metadata_to_json: bool,
-    save_metadata_to_png: bool,
-    lora_weights: str,
-    lora_hf_id: str,
-    ondemand: bool,
-):
-    from apps.stable_diffusion.web.ui.utils import (
-        get_custom_model_pathfile,
-        get_custom_vae_or_lora_weights,
-        Config,
-    )
-    import apps.stable_diffusion.web.utils.global_obj as global_obj
-
-    args.prompts = [prompt]
-    args.negative_prompts = [negative_prompt]
-    args.guidance_scale = guidance_scale
-    args.seed = seed
-    args.steps = steps
-    args.scheduler = scheduler
-    args.ondemand = ondemand
-
-    if init_image is None:
-        return None, "An Initial Image is required"
-    image = init_image.convert("RGB").resize((height, width))
-
-    # set ckpt_loc and hf_model_id.
-    args.ckpt_loc = ""
-    args.hf_model_id = ""
-    args.custom_vae = ""
-    if custom_model == "None":
-        if not hf_model_id:
-            return (
-                None,
-                "Please provide either custom model or huggingface model ID, both must not be empty",
-            )
-        args.hf_model_id = hf_model_id
-    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
-        args.ckpt_loc = get_custom_model_pathfile(custom_model)
-    else:
-        args.hf_model_id = custom_model
-    if custom_vae != "None":
-        args.custom_vae = get_custom_model_pathfile(custom_vae, model="vae")
-
-    args.save_metadata_to_json = save_metadata_to_json
-    args.write_metadata_to_png = save_metadata_to_png
-
-    args.use_lora = get_custom_vae_or_lora_weights(
-        lora_weights, lora_hf_id, "lora"
-    )
-
-    dtype = torch.float32 if precision == "fp32" else torch.half
-    cpu_scheduling = not scheduler.startswith("Shark")
-    args.height = 128
-    args.width = 128
-    new_config_obj = Config(
-        "upscaler",
-        args.hf_model_id,
-        args.ckpt_loc,
-        args.custom_vae,
-        precision,
-        batch_size,
-        max_length,
-        args.height,
-        args.width,
-        device,
-        use_lora=args.use_lora,
-        use_stencil=None,
-        ondemand=ondemand,
-    )
-    if (
-        not global_obj.get_sd_obj()
-        or global_obj.get_cfg_obj() != new_config_obj
-    ):
-        global_obj.clear_cache()
-        global_obj.set_cfg_obj(new_config_obj)
-        args.batch_size = batch_size
-        args.max_length = max_length
-        args.device = device.split("=>", 1)[1].strip()
-        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
-        args.use_tuned = init_use_tuned
-        args.import_mlir = init_import_mlir
-        set_init_device_flags()
-        model_id = (
-            args.hf_model_id
-            if args.hf_model_id
-            else "stabilityai/stable-diffusion-2-1-base"
-        )
-        global_obj.set_schedulers(get_schedulers(model_id))
-        scheduler_obj = global_obj.get_scheduler(scheduler)
-        global_obj.set_sd_obj(
-            UpscalerPipeline.from_pretrained(
-                scheduler_obj,
-                args.import_mlir,
-                args.hf_model_id,
-                args.ckpt_loc,
-                args.custom_vae,
-                args.precision,
-                args.max_length,
-                args.batch_size,
-                args.height,
-                args.width,
-                args.use_base_vae,
-                args.use_tuned,
-                low_cpu_mem_usage=args.low_cpu_mem_usage,
-                use_lora=args.use_lora,
-                ondemand=args.ondemand,
-            )
-        )
-
-    global_obj.set_sd_scheduler(scheduler)
-    global_obj.get_sd_obj().low_res_scheduler = global_obj.get_scheduler(
-        "DDPM"
-    )
-
-    start_time = time.time()
-    global_obj.get_sd_obj().log = ""
-    generated_imgs = []
-    seeds = []
-    img_seed = utils.sanitize_seed(seed)
-    extra_info = {"NOISE LEVEL": noise_level}
-    for current_batch in range(batch_count):
-        if current_batch > 0:
-            img_seed = utils.sanitize_seed(-1)
-        low_res_img = image
-        high_res_img = Image.new("RGB", (height * 4, width * 4))
-
-        for i in range(0, width, 128):
-            for j in range(0, height, 128):
-                box = (j, i, j + 128, i + 128)
-                upscaled_image = global_obj.get_sd_obj().generate_images(
-                    prompt,
-                    negative_prompt,
-                    low_res_img.crop(box),
-                    batch_size,
-                    args.height,
-                    args.width,
-                    steps,
-                    noise_level,
-                    guidance_scale,
-                    img_seed,
-                    args.max_length,
-                    dtype,
-                    args.use_base_vae,
-                    cpu_scheduling,
-                )
-                high_res_img.paste(upscaled_image[0], (j * 4, i * 4))
-
-        save_output_img(high_res_img, img_seed, extra_info)
-        generated_imgs.append(high_res_img)
-        seeds.append(img_seed)
-        global_obj.get_sd_obj().log += "\n"
-        yield generated_imgs, global_obj.get_sd_obj().log
-
-    total_time = time.time() - start_time
-    text_output = f"prompt={args.prompts}"
-    text_output += f"\nnegative prompt={args.negative_prompts}"
-    text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
-    text_output += f"\nscheduler={args.scheduler}, device={device}"
-    text_output += f"\nsteps={steps}, noise_level={noise_level}, guidance_scale={guidance_scale}, seed={seeds}"
-    text_output += f"\nsize={height}x{width}, batch_count={batch_count}, batch_size={batch_size}, max_length={args.max_length}"
-    text_output += global_obj.get_sd_obj().log
-    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
-
-    yield generated_imgs, text_output
-
-
 if __name__ == "__main__":
    if args.clear_all:
        clear_all()
--- a/apps/stable_diffusion/src/models/model_wrappers.py
+++ b/apps/stable_diffusion/src/models/model_wrappers.py
@@ -267,7 +267,7 @@ class SharkifyStableDiffusionModel:

        vae = VaeModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
        inputs = tuple(self.inputs["vae"])
-        is_f16 = True if self.precision == "fp16" else False
+        is_f16 = True if not self.is_upscaler and self.precision == "fp16" else False
        save_dir = os.path.join(self.sharktank_dir, self.model_name["vae"])
        if self.debug:
            os.makedirs(save_dir, exist_ok=True)
--- a/apps/stable_diffusion/src/schedulers/shark_eulerdiscrete.py
+++ b/apps/stable_diffusion/src/schedulers/shark_eulerdiscrete.py
@@ -40,6 +40,7 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
    def compile(self):
        SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"
        BATCH_SIZE = args.batch_size
+        device = args.device.split(":", 1)[0].strip()

        model_input = {
            "euler": {
@@ -92,7 +93,7 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
            self.scaling_model, _ = compile_through_fx(
                model=scaling_model,
                inputs=(example_latent, example_sigma),
-                extended_model_name=f"euler_scale_model_input_{BATCH_SIZE}_{args.height}_{args.width}"
+                extended_model_name=f"euler_scale_model_input_{BATCH_SIZE}_{args.height}_{args.width}_{device}_"
                + args.precision,
                extra_args=iree_flags,
            )
@@ -101,7 +102,7 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
            self.step_model, _ = compile_through_fx(
                step_model,
                (example_output, example_sigma, example_latent, example_dt),
-                extended_model_name=f"euler_step_{BATCH_SIZE}_{args.height}_{args.width}"
+                extended_model_name=f"euler_step_{BATCH_SIZE}_{args.height}_{args.width}_{device}_"
                + args.precision,
                extra_args=iree_flags,
            )
--- a/apps/stable_diffusion/src/utils/resources/opt_flags.json
+++ b/apps/stable_diffusion/src/utils/resources/opt_flags.json
@@ -45,12 +45,12 @@
    "untuned": {
      "fp16": {
        "default_compilation_flags": [
-          "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-preprocessing-pad-linalg-ops{pad-size=32}))"
+          "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32},iree-linalg-ext-convert-conv2d-to-winograd))"
        ]
      },
      "fp32": {
        "default_compilation_flags": [
-          "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-preprocessing-pad-linalg-ops{pad-size=16}))"
+          "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=16},iree-linalg-ext-convert-conv2d-to-winograd))"
        ]
      }
    }
--- a/apps/stable_diffusion/web/index.py
+++ b/apps/stable_diffusion/web/index.py
@@ -12,7 +12,12 @@ if args.clear_all:

 if __name__ == "__main__":
    if args.api:
-        from apps.stable_diffusion.web.ui import txt2img_inf, img2img_api
+        from apps.stable_diffusion.web.ui import (
+            txt2img_api,
+            img2img_api,
+            upscaler_api,
+            inpaint_api,
+        )
        from fastapi import FastAPI, APIRouter
        import uvicorn

@@ -20,8 +25,13 @@ if __name__ == "__main__":
        global_obj._init()

        app = FastAPI()
-        app.add_api_route("/sdapi/v1/txt2img", txt2img_inf, methods=["post"])
+        app.add_api_route("/sdapi/v1/txt2img", txt2img_api, methods=["post"])
        app.add_api_route("/sdapi/v1/img2img", img2img_api, methods=["post"])
+        app.add_api_route("/sdapi/v1/inpaint", inpaint_api, methods=["post"])
+        #  app.add_api_route(
+        #      "/sdapi/v1/outpaint", outpaint_api, methods=["post"]
+        #  )
+        app.add_api_route("/sdapi/v1/upscaler", upscaler_api, methods=["post"])
        app.include_router(APIRouter())
        uvicorn.run(app, host="127.0.0.1", port=args.server_port)
        sys.exit(0)
--- a/apps/stable_diffusion/web/ui/init.py
+++ b/apps/stable_diffusion/web/ui/init.py
@@ -1,5 +1,6 @@
 from apps.stable_diffusion.web.ui.txt2img_ui import (
    txt2img_inf,
+    txt2img_api,
    txt2img_web,
    txt2img_gallery,
    txt2img_sendto_img2img,
@@ -8,8 +9,8 @@ from apps.stable_diffusion.web.ui.txt2img_ui import (
    txt2img_sendto_upscaler,
 )
 from apps.stable_diffusion.web.ui.img2img_ui import (
-    img2img_api,
    img2img_inf,
+    img2img_api,
    img2img_web,
    img2img_gallery,
    img2img_init_image,
@@ -18,6 +19,8 @@ from apps.stable_diffusion.web.ui.img2img_ui import (
    img2img_sendto_upscaler,
 )
 from apps.stable_diffusion.web.ui.inpaint_ui import (
+    inpaint_inf,
+    inpaint_api,
    inpaint_web,
    inpaint_gallery,
    inpaint_init_image,
@@ -26,6 +29,8 @@ from apps.stable_diffusion.web.ui.inpaint_ui import (
    inpaint_sendto_upscaler,
 )
 from apps.stable_diffusion.web.ui.outpaint_ui import (
+    outpaint_inf,
+    outpaint_api,
    outpaint_web,
    outpaint_gallery,
    outpaint_init_image,
@@ -34,6 +39,8 @@ from apps.stable_diffusion.web.ui.outpaint_ui import (
    outpaint_sendto_upscaler,
 )
 from apps.stable_diffusion.web.ui.upscaler_ui import (
+    upscaler_inf,
+    upscaler_api,
    upscaler_web,
    upscaler_gallery,
    upscaler_init_image,
--- a/apps/stable_diffusion/web/ui/css/sd_dark_theme.css
+++ b/apps/stable_diffusion/web/ui/css/sd_dark_theme.css
@@ -173,7 +173,30 @@ footer {
 #gallery .thumbnail-item.thumbnail-lg {
    aspect-ratio: unset;
    max-height: calc(55vh - (2 * var(--spacing-lg)));
-    min-height: 390px
+}
+@media (min-width: 1921px) {
+    /* Force a 768px_height + 4px_margin_height + navbar_height for the gallery */
+    #gallery .grid-wrap, #gallery .preview{
+        min-height: calc(768px + 4px + var(--size-14));
+        max-height: calc(768px + 4px + var(--size-14));
+    }
+    /* Limit height to 768px_height + 2px_margin_height for the thumbnails */
+    #gallery .thumbnail-item.thumbnail-lg {
+        max-height: 770px !important;
+    }
+}
+/* Don't upscale when viewing in solo image mode */
+#gallery .preview img {
+    object-fit: scale-down;
+}
+/* Navbar images in cover mode*/
+#gallery .preview .thumbnail-item img {
+    object-fit: cover;
+}
+
+/* Limit the stable diffusion text output height */
+#std_output textarea {
+    max-height: 215px;
 }

 /* Prevent progress bar to block gallery navigation while building images (Gradio V3.19.0) */
--- a/apps/stable_diffusion/web/ui/img2img_ui.py
+++ b/apps/stable_diffusion/web/ui/img2img_ui.py
@@ -8,7 +8,6 @@ from PIL import Image
 import base64
 from io import BytesIO
 from fastapi.exceptions import HTTPException
-from apps.stable_diffusion.src import args
 from apps.stable_diffusion.web.ui.utils import (
    available_devices,
    nodlogo_loc,
@@ -530,18 +529,16 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                        show_label=False,
                        elem_id="gallery",
                    ).style(columns=[2], object_fit="contain")
+                    output_dir = (
+                        args.output_dir if args.output_dir else Path.cwd()
+                    )
+                    output_dir = Path(output_dir, "generated_imgs")
                    std_output = gr.Textbox(
-                        value="Nothing to show.",
+                        value=f"Images will be saved at {output_dir}",
                        lines=1,
+                        elem_id="std_output",
                        show_label=False,
                    )
-                output_dir = args.output_dir if args.output_dir else Path.cwd()
-                output_dir = Path(output_dir, "generated_imgs")
-                output_loc = gr.Textbox(
-                    label="Saving Images at",
-                    value=output_dir,
-                    interactive=False,
-                )
                with gr.Row():
                    img2img_sendto_inpaint = gr.Button(value="SendTo Inpaint")
                    img2img_sendto_outpaint = gr.Button(
--- a/apps/stable_diffusion/web/ui/inpaint_ui.py
+++ b/apps/stable_diffusion/web/ui/inpaint_ui.py
@@ -1,9 +1,13 @@
 from pathlib import Path
 import os
+import torch
+import time
+import sys
 import gradio as gr
 from PIL import Image
-from apps.stable_diffusion.scripts import inpaint_inf
-from apps.stable_diffusion.src import args
+import base64
+from io import BytesIO
+from fastapi.exceptions import HTTPException
 from apps.stable_diffusion.web.ui.utils import (
    available_devices,
    nodlogo_loc,
@@ -13,6 +17,275 @@ from apps.stable_diffusion.web.ui.utils import (
    predefined_paint_models,
    cancel_sd,
 )
+from apps.stable_diffusion.src import (
+    args,
+    InpaintPipeline,
+    get_schedulers,
+    set_init_device_flags,
+    utils,
+    clear_all,
+    save_output_img,
+)
+from apps.stable_diffusion.src.utils import get_generation_text_info
+
+
+# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
+init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
+init_use_tuned = args.use_tuned
+init_import_mlir = args.import_mlir
+
+
+# Exposed to UI.
+def inpaint_inf(
+    prompt: str,
+    negative_prompt: str,
+    image_dict,
+    height: int,
+    width: int,
+    inpaint_full_res: bool,
+    inpaint_full_res_padding: int,
+    steps: int,
+    guidance_scale: float,
+    seed: int,
+    batch_count: int,
+    batch_size: int,
+    scheduler: str,
+    custom_model: str,
+    hf_model_id: str,
+    custom_vae: str,
+    precision: str,
+    device: str,
+    max_length: int,
+    save_metadata_to_json: bool,
+    save_metadata_to_png: bool,
+    lora_weights: str,
+    lora_hf_id: str,
+    ondemand: bool,
+):
+    """Run inpainting and stream the results batch by batch.
+
+    This is a generator. After every completed batch iteration it yields
+    ``(generated_imgs, text_output)``, where ``generated_imgs`` accumulates
+    all images produced so far and ``text_output`` is the status text built
+    from the seeds, the pipeline log and the elapsed time. The same pair is
+    also the generator's return value (available via
+    ``StopIteration.value``).
+
+    Side effects visible in this function: the module-level ``args`` object
+    is updated from the parameters, the cached pipeline held by
+    ``global_obj`` is rebuilt whenever the requested ``Config`` differs from
+    the cached one, and the first image of every batch is written out with
+    ``save_output_img``.
+
+    ``image_dict`` must provide the keys ``"image"`` and ``"mask"``.
+    ``device`` must contain ``"=>"``; the text after it is used as the
+    device name.
+    """
+    from apps.stable_diffusion.web.ui.utils import (
+        get_custom_model_pathfile,
+        get_custom_vae_or_lora_weights,
+        Config,
+    )
+    import apps.stable_diffusion.web.utils.global_obj as global_obj
+    from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
+        SD_STATE_CANCEL,
+    )
+
+    args.prompts = [prompt]
+    args.negative_prompts = [negative_prompt]
+    args.guidance_scale = guidance_scale
+    args.steps = steps
+    args.scheduler = scheduler
+    args.img_path = "not none"
+    args.mask_path = "not none"
+    args.ondemand = ondemand
+
+    # set ckpt_loc and hf_model_id.
+    args.ckpt_loc = ""
+    args.hf_model_id = ""
+    args.custom_vae = ""
+    if custom_model == "None":
+        if not hf_model_id:
+            # Because this function is a generator, a plain ``return value``
+            # is hidden inside StopIteration and never reaches a consumer
+            # that simply iterates. Yield the message so it is delivered,
+            # and still return it for callers reading StopIteration.value.
+            error_output = (
+                None,
+                "Please provide either custom model or huggingface model ID, both must not be empty",
+            )
+            yield error_output
+            return error_output
+        args.hf_model_id = hf_model_id
+    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
+        args.ckpt_loc = get_custom_model_pathfile(custom_model)
+    else:
+        args.hf_model_id = custom_model
+    if custom_vae != "None":
+        args.custom_vae = get_custom_model_pathfile(custom_vae, model="vae")
+
+    args.use_lora = get_custom_vae_or_lora_weights(
+        lora_weights, lora_hf_id, "lora"
+    )
+
+    args.save_metadata_to_json = save_metadata_to_json
+    args.write_metadata_to_png = save_metadata_to_png
+
+    dtype = torch.float32 if precision == "fp32" else torch.half
+    cpu_scheduling = not scheduler.startswith("Shark")
+    new_config_obj = Config(
+        "inpaint",
+        args.hf_model_id,
+        args.ckpt_loc,
+        args.custom_vae,
+        precision,
+        batch_size,
+        max_length,
+        height,
+        width,
+        device,
+        use_lora=args.use_lora,
+        use_stencil=None,
+        ondemand=ondemand,
+    )
+    # Rebuild the pipeline only when nothing is cached yet or the requested
+    # configuration differs from the cached one.
+    if (
+        not global_obj.get_sd_obj()
+        or global_obj.get_cfg_obj() != new_config_obj
+    ):
+        global_obj.clear_cache()
+        global_obj.set_cfg_obj(new_config_obj)
+        args.precision = precision
+        args.batch_count = batch_count
+        args.batch_size = batch_size
+        args.max_length = max_length
+        args.height = height
+        args.width = width
+        args.device = device.split("=>", 1)[1].strip()
+        # Restore the values captured at import time before the device
+        # flags are initialised again.
+        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
+        args.use_tuned = init_use_tuned
+        args.import_mlir = init_import_mlir
+        set_init_device_flags()
+        model_id = (
+            args.hf_model_id
+            if args.hf_model_id
+            else "stabilityai/stable-diffusion-2-inpainting"
+        )
+        global_obj.set_schedulers(get_schedulers(model_id))
+        scheduler_obj = global_obj.get_scheduler(scheduler)
+        global_obj.set_sd_obj(
+            InpaintPipeline.from_pretrained(
+                scheduler=scheduler_obj,
+                import_mlir=args.import_mlir,
+                model_id=args.hf_model_id,
+                ckpt_loc=args.ckpt_loc,
+                custom_vae=args.custom_vae,
+                precision=args.precision,
+                max_length=args.max_length,
+                batch_size=args.batch_size,
+                height=args.height,
+                width=args.width,
+                use_base_vae=args.use_base_vae,
+                use_tuned=args.use_tuned,
+                low_cpu_mem_usage=args.low_cpu_mem_usage,
+                debug=args.import_debug if args.import_mlir else False,
+                use_lora=args.use_lora,
+                ondemand=args.ondemand,
+            )
+        )
+
+    global_obj.set_sd_scheduler(scheduler)
+
+    start_time = time.time()
+    global_obj.get_sd_obj().log = ""
+    generated_imgs = []
+    seeds = []
+    img_seed = utils.sanitize_seed(seed)
+    image = image_dict["image"]
+    mask_image = image_dict["mask"]
+    text_output = ""
+    for i in range(batch_count):
+        # Only the first batch uses the requested seed; later batches ask
+        # sanitize_seed for a new one by passing -1.
+        if i > 0:
+            img_seed = utils.sanitize_seed(-1)
+        out_imgs = global_obj.get_sd_obj().generate_images(
+            prompt,
+            negative_prompt,
+            image,
+            mask_image,
+            batch_size,
+            height,
+            width,
+            inpaint_full_res,
+            inpaint_full_res_padding,
+            steps,
+            guidance_scale,
+            img_seed,
+            args.max_length,
+            dtype,
+            args.use_base_vae,
+            cpu_scheduling,
+        )
+        seeds.append(img_seed)
+        total_time = time.time() - start_time
+        text_output = get_generation_text_info(seeds, device)
+        text_output += "\n" + global_obj.get_sd_obj().log
+        text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
+
+        # Stop without saving or publishing this batch once a cancel has
+        # been requested.
+        if global_obj.get_sd_status() == SD_STATE_CANCEL:
+            break
+        else:
+            save_output_img(out_imgs[0], img_seed)
+            generated_imgs.extend(out_imgs)
+            yield generated_imgs, text_output
+
+    return generated_imgs, text_output
+
+
+def decode_base64_to_image(encoding):
+    """Decode a base64 string (optionally a ``data:image/`` URI) to an image.
+
+    Returns the object produced by ``Image.open``. Any failure while
+    stripping the data-URI header, decoding the base64 payload or opening
+    the image is printed and reported as an ``HTTPException`` with status
+    500 and detail "Invalid encoded image".
+    """
+    try:
+        # The header split lives inside the try block so that a malformed
+        # data URI (missing ";" or ",") is reported like any other invalid
+        # payload instead of escaping as a bare IndexError.
+        if encoding.startswith("data:image/"):
+            encoding = encoding.split(";", 1)[1].split(",", 1)[1]
+        image = Image.open(BytesIO(base64.b64decode(encoding)))
+        return image
+    except Exception as err:
+        print(err)
+        raise HTTPException(
+            status_code=500, detail="Invalid encoded image"
+        ) from err
+
+
+def encode_pil_to_base64(images):
+    """Serialise each image in ``images`` and return the base64 payloads.
+
+    The container format is chosen from ``args.output_img_format``
+    (case-insensitive): "png" saves as PNG, "jpg"/"jpeg" saves as JPEG and
+    anything else raises an ``HTTPException`` with status 500. The result
+    is a list of ``bytes`` objects as produced by ``base64.b64encode``.
+    """
+    encoded_imgs = []
+    for image in images:
+        requested = args.output_img_format.lower()
+        if requested == "png":
+            pil_format = "PNG"
+        elif requested in ("jpg", "jpeg"):
+            pil_format = "JPEG"
+        else:
+            raise HTTPException(
+                status_code=500, detail="Invalid image format"
+            )
+        with BytesIO() as output_bytes:
+            image.save(output_bytes, format=pil_format)
+            payload = output_bytes.getvalue()
+        encoded_imgs.append(base64.b64encode(payload))
+    return encoded_imgs
+
+
+# Inpaint Rest API.
+def inpaint_api(
+    InputData: dict,
+):
+    """REST handler for inpainting.
+
+    Reads "prompt", "negative_prompt", "seed", "image", "mask", "height",
+    "width", "is_full_res", "full_res_padding", "steps" and "cfg_scale"
+    from ``InputData`` (plus the optional "hf_model_id"), runs
+    ``inpaint_inf`` with a single batch of size one, and returns a dict
+    with the base64-encoded images under "images", an empty "parameters"
+    dict and the status text under "info".
+    """
+    print(
+        f'Prompt: {InputData["prompt"]}, Negative Prompt: {InputData["negative_prompt"]}, Seed: {InputData["seed"]}'
+    )
+    init_image = decode_base64_to_image(InputData["image"])
+    mask = decode_base64_to_image(InputData["mask"])
+    generator = inpaint_inf(
+        InputData["prompt"],
+        InputData["negative_prompt"],
+        {"image": init_image, "mask": mask},
+        InputData["height"],
+        InputData["width"],
+        InputData["is_full_res"],
+        InputData["full_res_padding"],
+        InputData["steps"],
+        InputData["cfg_scale"],
+        InputData["seed"],
+        batch_count=1,
+        batch_size=1,
+        scheduler="EulerDiscrete",
+        custom_model="None",
+        # Default to an inpainting checkpoint; this is the same fallback
+        # model id that inpaint_inf uses when building its schedulers.
+        hf_model_id=InputData.get(
+            "hf_model_id", "stabilityai/stable-diffusion-2-inpainting"
+        ),
+        custom_vae="None",
+        precision="fp16",
+        device=available_devices[0],
+        max_length=64,
+        save_metadata_to_json=False,
+        save_metadata_to_png=False,
+        lora_weights="None",
+        lora_hf_id="",
+        ondemand=False,
+    )
+    # inpaint_inf is a generator, so its result cannot be indexed directly.
+    # Drain it, keeping the last yielded (images, text) pair, and prefer the
+    # value carried by StopIteration when the generator returns one.
+    res = (None, "")
+    try:
+        while True:
+            res = next(generator)
+    except StopIteration as stop:
+        if stop.value is not None:
+            res = stop.value
+    images, info = res
+    return {
+        # images is None when inpaint_inf rejected the request.
+        "images": encode_pil_to_base64(images or []),
+        "parameters": {},
+        "info": info,
+    }


 with gr.Blocks(title="Inpainting") as inpaint_web:
@@ -216,18 +489,16 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
                        show_label=False,
                        elem_id="gallery",
                    ).style(columns=[2], object_fit="contain")
+                    output_dir = (
+                        args.output_dir if args.output_dir else Path.cwd()
+                    )
+                    output_dir = Path(output_dir, "generated_imgs")
                    std_output = gr.Textbox(
-                        value="Nothing to show.",
+                        value=f"Images will be saved at {output_dir}",
                        lines=1,
+                        elem_id="std_output",
                        show_label=False,
                    )
-                output_dir = args.output_dir if args.output_dir else Path.cwd()
-                output_dir = Path(output_dir, "generated_imgs")
-                output_loc = gr.Textbox(
-                    label="Saving Images at",
-                    value=output_dir,
-                    interactive=False,
-                )
                with gr.Row():
                    inpaint_sendto_img2img = gr.Button(value="SendTo Img2Img")
                    inpaint_sendto_outpaint = gr.Button(
--- a/apps/stable_diffusion/web/ui/outpaint_ui.py
+++ b/apps/stable_diffusion/web/ui/outpaint_ui.py
@@ -1,9 +1,13 @@
 from pathlib import Path
 import os
+import torch
+import time
+import sys
 import gradio as gr
 from PIL import Image
-from apps.stable_diffusion.scripts import outpaint_inf
-from apps.stable_diffusion.src import args
+import base64
+from io import BytesIO
+from fastapi.exceptions import HTTPException
 from apps.stable_diffusion.web.ui.utils import (
    available_devices,
    nodlogo_loc,
@@ -13,6 +17,286 @@ from apps.stable_diffusion.web.ui.utils import (
    predefined_paint_models,
    cancel_sd,
 )
+from apps.stable_diffusion.src import (
+    args,
+    OutpaintPipeline,
+    get_schedulers,
+    set_init_device_flags,
+    utils,
+    clear_all,
+    save_output_img,
+)
+from apps.stable_diffusion.src.utils import get_generation_text_info
+
+
+# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
+init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
+init_use_tuned = args.use_tuned
+init_import_mlir = args.import_mlir
+
+
+# Exposed to UI.
+def outpaint_inf(
+    prompt: str,
+    negative_prompt: str,
+    init_image,
+    pixels: int,
+    mask_blur: int,
+    directions: list,
+    noise_q: float,
+    color_variation: float,
+    height: int,
+    width: int,
+    steps: int,
+    guidance_scale: float,
+    seed: int,
+    batch_count: int,
+    batch_size: int,
+    scheduler: str,
+    custom_model: str,
+    hf_model_id: str,
+    custom_vae: str,
+    precision: str,
+    device: str,
+    max_length: int,
+    save_metadata_to_json: bool,
+    save_metadata_to_png: bool,
+    lora_weights: str,
+    lora_hf_id: str,
+    ondemand: bool,
+):
+    """Run outpainting and stream the results batch by batch.
+
+    This is a generator. After every completed batch iteration it yields
+    ``(generated_imgs, text_output)``, where ``generated_imgs`` accumulates
+    all images produced so far and ``text_output`` is the status text built
+    from the seeds, the pipeline log and the elapsed time. The same pair is
+    also the generator's return value (available via
+    ``StopIteration.value``).
+
+    ``directions`` is searched for the entries "left", "right", "up" and
+    "down" to decide which sides are extended. ``device`` must contain
+    ``"=>"``; the text after it is used as the device name.
+
+    Side effects visible in this function: the module-level ``args`` object
+    is updated from the parameters, the cached pipeline held by
+    ``global_obj`` is rebuilt whenever the requested ``Config`` differs from
+    the cached one, and the first image of every batch is written out with
+    ``save_output_img``.
+    """
+    from apps.stable_diffusion.web.ui.utils import (
+        get_custom_model_pathfile,
+        get_custom_vae_or_lora_weights,
+        Config,
+    )
+    import apps.stable_diffusion.web.utils.global_obj as global_obj
+    from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
+        SD_STATE_CANCEL,
+    )
+
+    args.prompts = [prompt]
+    args.negative_prompts = [negative_prompt]
+    args.guidance_scale = guidance_scale
+    args.steps = steps
+    args.scheduler = scheduler
+    args.img_path = "not none"
+    args.ondemand = ondemand
+
+    # set ckpt_loc and hf_model_id.
+    args.ckpt_loc = ""
+    args.hf_model_id = ""
+    args.custom_vae = ""
+    if custom_model == "None":
+        if not hf_model_id:
+            # Because this function is a generator, a plain ``return value``
+            # is hidden inside StopIteration and never reaches a consumer
+            # that simply iterates. Yield the message so it is delivered,
+            # and still return it for callers reading StopIteration.value.
+            error_output = (
+                None,
+                "Please provide either custom model or huggingface model ID, both must not be empty",
+            )
+            yield error_output
+            return error_output
+        args.hf_model_id = hf_model_id
+    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
+        args.ckpt_loc = get_custom_model_pathfile(custom_model)
+    else:
+        args.hf_model_id = custom_model
+    if custom_vae != "None":
+        args.custom_vae = get_custom_model_pathfile(custom_vae, model="vae")
+
+    args.use_lora = get_custom_vae_or_lora_weights(
+        lora_weights, lora_hf_id, "lora"
+    )
+
+    args.save_metadata_to_json = save_metadata_to_json
+    args.write_metadata_to_png = save_metadata_to_png
+
+    dtype = torch.float32 if precision == "fp32" else torch.half
+    cpu_scheduling = not scheduler.startswith("Shark")
+    new_config_obj = Config(
+        "outpaint",
+        args.hf_model_id,
+        args.ckpt_loc,
+        args.custom_vae,
+        precision,
+        batch_size,
+        max_length,
+        height,
+        width,
+        device,
+        use_lora=args.use_lora,
+        use_stencil=None,
+        ondemand=ondemand,
+    )
+    # Rebuild the pipeline only when nothing is cached yet or the requested
+    # configuration differs from the cached one.
+    if (
+        not global_obj.get_sd_obj()
+        or global_obj.get_cfg_obj() != new_config_obj
+    ):
+        global_obj.clear_cache()
+        global_obj.set_cfg_obj(new_config_obj)
+        args.precision = precision
+        args.batch_count = batch_count
+        args.batch_size = batch_size
+        args.max_length = max_length
+        args.height = height
+        args.width = width
+        args.device = device.split("=>", 1)[1].strip()
+        # Restore the values captured at import time before the device
+        # flags are initialised again.
+        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
+        args.use_tuned = init_use_tuned
+        args.import_mlir = init_import_mlir
+        set_init_device_flags()
+        model_id = (
+            args.hf_model_id
+            if args.hf_model_id
+            else "stabilityai/stable-diffusion-2-inpainting"
+        )
+        global_obj.set_schedulers(get_schedulers(model_id))
+        scheduler_obj = global_obj.get_scheduler(scheduler)
+        global_obj.set_sd_obj(
+            OutpaintPipeline.from_pretrained(
+                scheduler_obj,
+                args.import_mlir,
+                args.hf_model_id,
+                args.ckpt_loc,
+                args.custom_vae,
+                args.precision,
+                args.max_length,
+                args.batch_size,
+                args.height,
+                args.width,
+                args.use_base_vae,
+                args.use_tuned,
+                use_lora=args.use_lora,
+                ondemand=args.ondemand,
+            )
+        )
+
+    global_obj.set_sd_scheduler(scheduler)
+
+    start_time = time.time()
+    global_obj.get_sd_obj().log = ""
+    generated_imgs = []
+    seeds = []
+    img_seed = utils.sanitize_seed(seed)
+
+    # Membership tests already produce booleans; note that the UI names
+    # "up"/"down" map onto the pipeline's top/bottom flags.
+    left = "left" in directions
+    right = "right" in directions
+    top = "up" in directions
+    bottom = "down" in directions
+
+    text_output = ""
+    for i in range(batch_count):
+        # Only the first batch uses the requested seed; later batches ask
+        # sanitize_seed for a new one by passing -1.
+        if i > 0:
+            img_seed = utils.sanitize_seed(-1)
+        out_imgs = global_obj.get_sd_obj().generate_images(
+            prompt,
+            negative_prompt,
+            init_image,
+            pixels,
+            mask_blur,
+            left,
+            right,
+            top,
+            bottom,
+            noise_q,
+            color_variation,
+            batch_size,
+            height,
+            width,
+            steps,
+            guidance_scale,
+            img_seed,
+            args.max_length,
+            dtype,
+            args.use_base_vae,
+            cpu_scheduling,
+        )
+        seeds.append(img_seed)
+        total_time = time.time() - start_time
+        text_output = get_generation_text_info(seeds, device)
+        text_output += "\n" + global_obj.get_sd_obj().log
+        text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
+
+        # Stop without saving or publishing this batch once a cancel has
+        # been requested.
+        if global_obj.get_sd_status() == SD_STATE_CANCEL:
+            break
+        else:
+            save_output_img(out_imgs[0], img_seed)
+            generated_imgs.extend(out_imgs)
+            yield generated_imgs, text_output
+
+    return generated_imgs, text_output
+
+
+def decode_base64_to_image(encoding):
+    """Decode a base64 string (optionally a ``data:image/`` URI) to an image.
+
+    Returns the object produced by ``Image.open``. Any failure while
+    stripping the data-URI header, decoding the base64 payload or opening
+    the image is printed and reported as an ``HTTPException`` with status
+    500 and detail "Invalid encoded image".
+    """
+    try:
+        # The header split lives inside the try block so that a malformed
+        # data URI (missing ";" or ",") is reported like any other invalid
+        # payload instead of escaping as a bare IndexError.
+        if encoding.startswith("data:image/"):
+            encoding = encoding.split(";", 1)[1].split(",", 1)[1]
+        image = Image.open(BytesIO(base64.b64decode(encoding)))
+        return image
+    except Exception as err:
+        print(err)
+        raise HTTPException(
+            status_code=500, detail="Invalid encoded image"
+        ) from err
+
+
+def encode_pil_to_base64(images):
+    """Serialise each image in ``images`` and return the base64 payloads.
+
+    The container format is chosen from ``args.output_img_format``
+    (case-insensitive): "png" saves as PNG, "jpg"/"jpeg" saves as JPEG and
+    anything else raises an ``HTTPException`` with status 500. The result
+    is a list of ``bytes`` objects as produced by ``base64.b64encode``.
+    """
+    encoded_imgs = []
+    for image in images:
+        requested = args.output_img_format.lower()
+        if requested == "png":
+            pil_format = "PNG"
+        elif requested in ("jpg", "jpeg"):
+            pil_format = "JPEG"
+        else:
+            raise HTTPException(
+                status_code=500, detail="Invalid image format"
+            )
+        with BytesIO() as output_bytes:
+            image.save(output_bytes, format=pil_format)
+            payload = output_bytes.getvalue()
+        encoded_imgs.append(base64.b64encode(payload))
+    return encoded_imgs
+
+
+# Outpaint Rest API.
+def outpaint_api(
+    InputData: dict,
+):
+    """REST handler for outpainting.
+
+    Reads "prompt", "negative_prompt", "seed", "init_images" (first entry
+    only), "pixels", "mask_blur", "directions", "noise_q",
+    "color_variation", "height", "width", "steps" and "cfg_scale" from
+    ``InputData`` (plus the optional "hf_model_id"), runs ``outpaint_inf``
+    with a single batch of size one, and returns a dict with the
+    base64-encoded images under "images", an empty "parameters" dict and
+    the status text under "info".
+    """
+    print(
+        f'Prompt: {InputData["prompt"]}, Negative Prompt: {InputData["negative_prompt"]}, Seed: {InputData["seed"]}'
+    )
+    init_image = decode_base64_to_image(InputData["init_images"][0])
+    generator = outpaint_inf(
+        InputData["prompt"],
+        InputData["negative_prompt"],
+        init_image,
+        InputData["pixels"],
+        InputData["mask_blur"],
+        InputData["directions"],
+        InputData["noise_q"],
+        InputData["color_variation"],
+        InputData["height"],
+        InputData["width"],
+        InputData["steps"],
+        InputData["cfg_scale"],
+        InputData["seed"],
+        batch_count=1,
+        batch_size=1,
+        scheduler="EulerDiscrete",
+        custom_model="None",
+        # Default to an inpainting checkpoint; this is the same fallback
+        # model id that outpaint_inf uses when building its schedulers.
+        hf_model_id=InputData.get(
+            "hf_model_id", "stabilityai/stable-diffusion-2-inpainting"
+        ),
+        custom_vae="None",
+        precision="fp16",
+        device=available_devices[0],
+        max_length=64,
+        save_metadata_to_json=False,
+        save_metadata_to_png=False,
+        lora_weights="None",
+        lora_hf_id="",
+        ondemand=False,
+    )
+    # outpaint_inf is a generator, so its result cannot be indexed directly.
+    # Drain it, keeping the last yielded (images, text) pair, and prefer the
+    # value carried by StopIteration when the generator returns one.
+    res = (None, "")
+    try:
+        while True:
+            res = next(generator)
+    except StopIteration as stop:
+        if stop.value is not None:
+            res = stop.value
+    images, info = res
+    return {
+        # images is None when outpaint_inf rejected the request.
+        "images": encode_pil_to_base64(images or []),
+        "parameters": {},
+        "info": info,
+    }


 with gr.Blocks(title="Outpainting") as outpaint_web:
@@ -235,18 +519,16 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
                        show_label=False,
                        elem_id="gallery",
                    ).style(columns=[2], object_fit="contain")
+                    output_dir = (
+                        args.output_dir if args.output_dir else Path.cwd()
+                    )
+                    output_dir = Path(output_dir, "generated_imgs")
                    std_output = gr.Textbox(
-                        value="Nothing to show.",
+                        value=f"Images will be saved at {output_dir}",
                        lines=1,
+                        elem_id="std_output",
                        show_label=False,
                    )
-                output_dir = args.output_dir if args.output_dir else Path.cwd()
-                output_dir = Path(output_dir, "generated_imgs")
-                output_loc = gr.Textbox(
-                    label="Saving Images at",
-                    value=output_dir,
-                    interactive=False,
-                )
                with gr.Row():
                    outpaint_sendto_img2img = gr.Button(value="SendTo Img2Img")
                    outpaint_sendto_inpaint = gr.Button(value="SendTo Inpaint")
--- a/apps/stable_diffusion/web/ui/txt2img_ui.py
+++ b/apps/stable_diffusion/web/ui/txt2img_ui.py
@@ -2,8 +2,12 @@ from pathlib import Path
 import os
 import torch
 import time
+import sys
 import gradio as gr
 from PIL import Image
+import base64
+from io import BytesIO
+from fastapi.exceptions import HTTPException
 from apps.stable_diffusion.web.ui.utils import (
    available_devices,
    nodlogo_loc,
@@ -200,6 +204,63 @@ def txt2img_inf(
    return generated_imgs, text_output


+def encode_pil_to_base64(images):
+    """Serialise each image in ``images`` and return the base64 payloads.
+
+    The container format is chosen from ``args.output_img_format``
+    (case-insensitive): "png" saves as PNG, "jpg"/"jpeg" saves as JPEG and
+    anything else raises an ``HTTPException`` with status 500. The result
+    is a list of ``bytes`` objects as produced by ``base64.b64encode``.
+    """
+    encoded_imgs = []
+    for image in images:
+        requested = args.output_img_format.lower()
+        if requested == "png":
+            pil_format = "PNG"
+        elif requested in ("jpg", "jpeg"):
+            pil_format = "JPEG"
+        else:
+            raise HTTPException(
+                status_code=500, detail="Invalid image format"
+            )
+        with BytesIO() as output_bytes:
+            image.save(output_bytes, format=pil_format)
+            payload = output_bytes.getvalue()
+        encoded_imgs.append(base64.b64encode(payload))
+    return encoded_imgs
+
+
+# Text2Img Rest API.
+def txt2img_api(
+    InputData: dict,
+):
+    """REST handler for text-to-image.
+
+    Reads "prompt", "negative_prompt", "height", "width", "steps",
+    "cfg_scale" and "seed" from ``InputData`` (plus the optional
+    "hf_model_id"), runs ``txt2img_inf`` with a single batch of size one,
+    and returns a dict with the base64-encoded images under "images", an
+    empty "parameters" dict and the status text under "info".
+    """
+    import inspect
+
+    print(
+        f'Prompt: {InputData["prompt"]}, Negative Prompt: {InputData["negative_prompt"]}, Seed: {InputData["seed"]}'
+    )
+    res = txt2img_inf(
+        InputData["prompt"],
+        InputData["negative_prompt"],
+        InputData["height"],
+        InputData["width"],
+        InputData["steps"],
+        InputData["cfg_scale"],
+        InputData["seed"],
+        batch_count=1,
+        batch_size=1,
+        scheduler="EulerDiscrete",
+        custom_model="None",
+        hf_model_id=InputData.get(
+            "hf_model_id", "stabilityai/stable-diffusion-2-1-base"
+        ),
+        custom_vae="None",
+        precision="fp16",
+        device=available_devices[0],
+        max_length=64,
+        save_metadata_to_json=False,
+        save_metadata_to_png=False,
+        lora_weights="None",
+        lora_hf_id="",
+        ondemand=False,
+    )
+    # NOTE(review): the sibling *_inf functions are generators that yield
+    # (images, text) pairs; presumably txt2img_inf behaves the same, in
+    # which case indexing its result directly would fail. Drain it when it
+    # is a generator, and use the value as-is when it is a plain tuple.
+    if inspect.isgenerator(res):
+        generator, res = res, (None, "")
+        try:
+            while True:
+                res = next(generator)
+        except StopIteration as stop:
+            if stop.value is not None:
+                res = stop.value
+    images, info = res
+    return {
+        # Guard against a None image list so the encoder is not asked to
+        # iterate over None.
+        "images": encode_pil_to_base64(images or []),
+        "parameters": {},
+        "info": info,
+    }
+
+
 with gr.Blocks(title="Text-to-Image") as txt2img_web:
    with gr.Row(elem_id="ui_title"):
        nod_logo = Image.open(nodlogo_loc)
@@ -405,18 +466,16 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                        show_label=False,
                        elem_id="gallery",
                    ).style(columns=[2], object_fit="contain")
+                    output_dir = (
+                        args.output_dir if args.output_dir else Path.cwd()
+                    )
+                    output_dir = Path(output_dir, "generated_imgs")
                    std_output = gr.Textbox(
-                        value="Nothing to show.",
+                        value=f"Images will be saved at {output_dir}",
                        lines=1,
+                        elem_id="std_output",
                        show_label=False,
                    )
-                output_dir = args.output_dir if args.output_dir else Path.cwd()
-                output_dir = Path(output_dir, "generated_imgs")
-                output_loc = gr.Textbox(
-                    label="Saving Images at",
-                    value=output_dir,
-                    interactive=False,
-                )
                with gr.Row():
                    txt2img_sendto_img2img = gr.Button(value="SendTo Img2Img")
                    txt2img_sendto_inpaint = gr.Button(value="SendTo Inpaint")
--- a/apps/stable_diffusion/web/ui/upscaler_ui.py
+++ b/apps/stable_diffusion/web/ui/upscaler_ui.py
@@ -1,9 +1,13 @@
 from pathlib import Path
 import os
+import torch
+import time
+import sys
 import gradio as gr
 from PIL import Image
-from apps.stable_diffusion.scripts import upscaler_inf
-from apps.stable_diffusion.src import args
+import base64
+from io import BytesIO
+from fastapi.exceptions import HTTPException
 from apps.stable_diffusion.web.ui.utils import (
    available_devices,
    nodlogo_loc,
@@ -11,7 +15,280 @@ from apps.stable_diffusion.web.ui.utils import (
    get_custom_model_files,
    scheduler_list_cpu_only,
    predefined_upscaler_models,
+    cancel_sd,
 )
+from apps.stable_diffusion.src import (
+    args,
+    UpscalerPipeline,
+    get_schedulers,
+    set_init_device_flags,
+    utils,
+    clear_all,
+    save_output_img,
+)
+
+
+# Startup values of these flags; upscaler_inf restores them on each rebuild.
+init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
+init_use_tuned = args.use_tuned
+init_import_mlir = args.import_mlir
+
+
+# Exposed to UI.
+def upscaler_inf(
+    prompt: str,
+    negative_prompt: str,
+    init_image,
+    height: int,
+    width: int,
+    steps: int,
+    noise_level: int,
+    guidance_scale: float,
+    seed: int,
+    batch_count: int,
+    batch_size: int,
+    scheduler: str,
+    custom_model: str,
+    hf_model_id: str,
+    custom_vae: str,
+    precision: str,
+    device: str,
+    max_length: int,
+    save_metadata_to_json: bool,
+    save_metadata_to_png: bool,
+    lora_weights: str,
+    lora_hf_id: str,
+    ondemand: bool,
+):
+    from apps.stable_diffusion.web.ui.utils import (
+        get_custom_model_pathfile,
+        get_custom_vae_or_lora_weights,
+        Config,
+    )
+    import apps.stable_diffusion.web.utils.global_obj as global_obj
+
+    args.prompts = [prompt]
+    args.negative_prompts = [negative_prompt]
+    args.guidance_scale = guidance_scale
+    args.seed = seed
+    args.steps = steps
+    args.scheduler = scheduler
+    args.ondemand = ondemand
+
+    if init_image is None:
+        return None, "An Initial Image is required"
+    image = init_image.convert("RGB").resize((height, width))
+
+    # set ckpt_loc and hf_model_id.
+    args.ckpt_loc = ""
+    args.hf_model_id = ""
+    args.custom_vae = ""
+    if custom_model == "None":
+        if not hf_model_id:
+            return (
+                None,
+                "Please provide either custom model or huggingface model ID, both must not be empty",
+            )
+        args.hf_model_id = hf_model_id
+    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
+        args.ckpt_loc = get_custom_model_pathfile(custom_model)
+    else:
+        args.hf_model_id = custom_model
+    if custom_vae != "None":
+        args.custom_vae = get_custom_model_pathfile(custom_vae, model="vae")
+
+    args.save_metadata_to_json = save_metadata_to_json
+    args.write_metadata_to_png = save_metadata_to_png
+
+    args.use_lora = get_custom_vae_or_lora_weights(
+        lora_weights, lora_hf_id, "lora"
+    )
+
+    dtype = torch.float32 if precision == "fp32" else torch.half
+    cpu_scheduling = not scheduler.startswith("Shark")
+    args.height = 128
+    args.width = 128
+    new_config_obj = Config(
+        "upscaler",
+        args.hf_model_id,
+        args.ckpt_loc,
+        args.custom_vae,
+        precision,
+        batch_size,
+        max_length,
+        args.height,
+        args.width,
+        device,
+        use_lora=args.use_lora,
+        use_stencil=None,
+        ondemand=ondemand,
+    )
+    if (
+        not global_obj.get_sd_obj()
+        or global_obj.get_cfg_obj() != new_config_obj
+    ):
+        global_obj.clear_cache()
+        global_obj.set_cfg_obj(new_config_obj)
+        args.batch_size = batch_size
+        args.max_length = max_length
+        args.device = device.split("=>", 1)[1].strip()
+        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
+        args.use_tuned = init_use_tuned
+        args.import_mlir = init_import_mlir
+        set_init_device_flags()
+        model_id = (
+            args.hf_model_id
+            if args.hf_model_id
+            else "stabilityai/stable-diffusion-2-1-base"
+        )
+        global_obj.set_schedulers(get_schedulers(model_id))
+        scheduler_obj = global_obj.get_scheduler(scheduler)
+        global_obj.set_sd_obj(
+            UpscalerPipeline.from_pretrained(
+                scheduler_obj,
+                args.import_mlir,
+                args.hf_model_id,
+                args.ckpt_loc,
+                args.custom_vae,
+                args.precision,
+                args.max_length,
+                args.batch_size,
+                args.height,
+                args.width,
+                args.use_base_vae,
+                args.use_tuned,
+                low_cpu_mem_usage=args.low_cpu_mem_usage,
+                use_lora=args.use_lora,
+                ondemand=args.ondemand,
+            )
+        )
+
+    global_obj.set_sd_scheduler(scheduler)
+    global_obj.get_sd_obj().low_res_scheduler = global_obj.get_scheduler(
+        "DDPM"
+    )
+
+    start_time = time.time()
+    global_obj.get_sd_obj().log = ""
+    generated_imgs = []
+    seeds = []
+    img_seed = utils.sanitize_seed(seed)
+    extra_info = {"NOISE LEVEL": noise_level}
+    for current_batch in range(batch_count):
+        if current_batch > 0:
+            img_seed = utils.sanitize_seed(-1)
+        low_res_img = image
+        high_res_img = Image.new("RGB", (height * 4, width * 4))
+
+        for i in range(0, width, 128):
+            for j in range(0, height, 128):
+                box = (j, i, j + 128, i + 128)
+                upscaled_image = global_obj.get_sd_obj().generate_images(
+                    prompt,
+                    negative_prompt,
+                    low_res_img.crop(box),
+                    batch_size,
+                    args.height,
+                    args.width,
+                    steps,
+                    noise_level,
+                    guidance_scale,
+                    img_seed,
+                    args.max_length,
+                    dtype,
+                    args.use_base_vae,
+                    cpu_scheduling,
+                )
+                high_res_img.paste(upscaled_image[0], (j * 4, i * 4))
+
+        save_output_img(high_res_img, img_seed, extra_info)
+        generated_imgs.append(high_res_img)
+        seeds.append(img_seed)
+        global_obj.get_sd_obj().log += "\n"
+        yield generated_imgs, global_obj.get_sd_obj().log
+
+    total_time = time.time() - start_time
+    text_output = f"prompt={args.prompts}"
+    text_output += f"\nnegative prompt={args.negative_prompts}"
+    text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
+    text_output += f"\nscheduler={args.scheduler}, device={device}"
+    text_output += f"\nsteps={steps}, noise_level={noise_level}, guidance_scale={guidance_scale}, seed={seeds}"
+    text_output += f"\nsize={height}x{width}, batch_count={batch_count}, batch_size={batch_size}, max_length={args.max_length}"
+    text_output += global_obj.get_sd_obj().log
+    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
+
+    yield generated_imgs, text_output
+
+
+def decode_base64_to_image(encoding):
+    if encoding.startswith("data:image/"):
+        encoding = encoding.split(";", 1)[1].split(",", 1)[1]
+    try:
+        image = Image.open(BytesIO(base64.b64decode(encoding)))
+        return image
+    except Exception as err:
+        print(err)
+        raise HTTPException(status_code=500, detail="Invalid encoded image")
+
+
+def encode_pil_to_base64(images):
+    encoded_imgs = []
+    for image in images:
+        with BytesIO() as output_bytes:
+            if args.output_img_format.lower() == "png":
+                image.save(output_bytes, format="PNG")
+
+            elif args.output_img_format.lower() in ("jpg", "jpeg"):
+                image.save(output_bytes, format="JPEG")
+            else:
+                raise HTTPException(
+                    status_code=500, detail="Invalid image format"
+                )
+            bytes_data = output_bytes.getvalue()
+            encoded_imgs.append(base64.b64encode(bytes_data))
+    return encoded_imgs
+
+
+# Upscaler Rest API.
+def upscaler_api(
+    InputData: dict,
+):
+    print(
+        f'Prompt: {InputData["prompt"]}, Negative Prompt: {InputData["negative_prompt"]}, Seed: {InputData["seed"]}'
+    )
+    init_image = decode_base64_to_image(InputData["init_images"][0])
+    res = upscaler_inf(
+        InputData["prompt"],
+        InputData["negative_prompt"],
+        init_image,
+        InputData["height"],
+        InputData["width"],
+        InputData["steps"],
+        InputData["noise_level"],
+        InputData["cfg_scale"],
+        InputData["seed"],
+        batch_count=1,
+        batch_size=1,
+        scheduler="EulerDiscrete",
+        custom_model="None",
+        hf_model_id=InputData["hf_model_id"]
+        if "hf_model_id" in InputData.keys()
+        else "stabilityai/stable-diffusion-2-1-base",
+        custom_vae="None",
+        precision="fp16",
+        device=available_devices[0],
+        max_length=64,
+        save_metadata_to_json=False,
+        save_metadata_to_png=False,
+        lora_weights="None",
+        lora_hf_id="",
+        ondemand=False,
+    )
+    return {
+        "images": encode_pil_to_base64(res[0]),
+        "parameters": {},
+        "info": res[1],
+    }


 with gr.Blocks(title="Upscaler") as upscaler_web:
@@ -213,18 +490,16 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
                        show_label=False,
                        elem_id="gallery",
                    ).style(columns=[2], object_fit="contain")
+                    output_dir = (
+                        args.output_dir if args.output_dir else Path.cwd()
+                    )
+                    output_dir = Path(output_dir, "generated_imgs")
                    std_output = gr.Textbox(
-                        value="Nothing to show.",
+                        value=f"Images will be saved at {output_dir}",
                        lines=1,
+                        elem_id="std_output",
                        show_label=False,
                    )
-                output_dir = args.output_dir if args.output_dir else Path.cwd()
-                output_dir = Path(output_dir, "generated_imgs")
-                output_loc = gr.Textbox(
-                    label="Saving Images at",
-                    value=output_dir,
-                    interactive=False,
-                )
                with gr.Row():
                    upscaler_sendto_img2img = gr.Button(value="SendTo Img2Img")
                    upscaler_sendto_inpaint = gr.Button(value="SendTo Inpaint")
--- a/shark/iree_utils/vulkan_target_env_utils.py
+++ b/shark/iree_utils/vulkan_target_env_utils.py
@@ -133,7 +133,7 @@ def get_vendor(triple):
        return "Apple"
    if arch in ["arc", "UHD"]:
        return "Intel"
-    if arch in ["turing", "ampere"]:
+    if arch in ["turing", "ampere", "pascal"]:
        return "NVIDIA"
    if arch == "ardeno":
        return "Qualcomm"
@@ -151,7 +151,7 @@ def get_device_type(triple):
        return "Unknown"
    if arch == "cpu":
        return "CPU"
-    if arch in ["turing", "ampere", "arc"]:
+    if arch in ["turing", "ampere", "arc", "pascal"]:
        return "DiscreteGPU"
    if arch in ["rdna1", "rdna2", "rdna3", "rgcn3", "rgcn5"]:
        if product == "ivega10":
@@ -389,6 +389,39 @@ def get_vulkan_target_capabilities(triple):
                "ShuffleRelative",
            ]

+    elif arch in ["pascal"]:
+        cap["maxComputeSharedMemorySize"] = 49152
+        cap["maxComputeWorkGroupInvocations"] = 1536
+        cap["maxComputeWorkGroupSize"] = [1536, 1024, 64]
+
+        cap["subgroupSize"] = 32
+        cap["minSubgroupSize"] = 32
+        cap["maxSubgroupSize"] = 32
+        cap["subgroupFeatures"] = [
+            "Basic",
+            "Vote",
+            "Arithmetic",
+            "Ballot",
+            "Shuffle",
+            "ShuffleRelative",
+            "Clustered",
+            "Quad",
+        ]
+
+        cap["shaderFloat16"] = True
+        cap["shaderFloat64"] = True
+        cap["shaderInt8"] = True
+        cap["shaderInt16"] = True
+        cap["shaderInt64"] = True
+        cap["storageBuffer16BitAccess"] = True
+        cap["storagePushConstant16"] = True
+        cap["uniformAndStorageBuffer16BitAccess"] = True
+        cap["storageBuffer8BitAccess"] = True
+        cap["storagePushConstant8"] = True
+        cap["uniformAndStorageBuffer8BitAccess"] = True
+        cap["variablePointers"] = True
+        cap["variablePointersStorageBuffer"] = True
+
    elif arch in ["ampere", "turing"]:
        cap["maxComputeSharedMemorySize"] = 49152
        cap["maxComputeWorkGroupInvocations"] = 1024
Author	SHA1	Message	Date
powderluv	09e3f63d5b	Fix pascal (#1346 ) * Add fp32 for upscaler VAE * Plumb Pascal vulkan support	2023-04-23 20:28:25 -07:00
powderluv	d60a5a9396	Add fp32 for upscaler VAE (#1345 )	2023-04-23 15:27:55 -07:00
m68k-fr	90df0ee365	[Web] Gallery set to a 768px reference for high-end desktop users (#1344 )	2023-04-23 11:48:06 -07:00
nirvedhmeshram	133c1bcadd	add device to scheduler model names (#1338 )	2023-04-22 20:13:56 -05:00
powderluv	caadbe14e9	Revert VAE to use im2col (#1339 )	2023-04-22 15:23:41 -07:00
Ean Garvey	5f5823ccd9	Fix inference object imports for SD apps. (#1334 )	2023-04-21 13:40:48 -05:00
Vivek Khandelwal	d2f7e03b7e	Add StableLM model (#1331 )	2023-04-21 09:51:02 -07:00
Gaurav Shukla	0b01bbe479	[SD] Add txt2img/upscaler/inpaint/outpaint Rest API (#1325 ) Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-04-21 09:06:06 -07:00
yzhang93	25c5fc44ae	Modify tuner.py to take vulkan target triple flag (#1328 )	2023-04-20 14:31:32 -07:00