Remove non-SD args from generate_sharktank.py

Fix restart SD session error + override args.use_tuned temporarily
-- This commit fixes the session restart error for SD. -- It also overrides `args.use_tuned` for `import_mlir`, and sets `use_tuned` as `False`. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2026-04-20 03:00:34 -04:00 · 2023-02-07 17:43:21 +00:00 · 2023-02-07 19:50:48 +05:30 · 2023-02-07 01:48:36 -08:00 · 2023-02-07 14:06:07 +05:30 · 2023-02-07 14:06:07 +05:30
16 changed files with 146 additions and 93 deletions
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -143,7 +143,7 @@ jobs:
          then 
            export SHA=$(git log -1 --format='%h')
            gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/${DATE}_$SHA
-            gsutil -m cp -r gs://shark_tank/${DATE}_$SHA/* gs://shark_tank/latest/
+            gsutil -m cp -r gs://shark_tank/${DATE}_$SHA/* gs://shark_tank/nightly/
        fi
        rm -rf ./wheelhouse/nodai*

--- a/.github/workflows/test-models.yml
+++ b/.github/workflows/test-models.yml
@@ -111,7 +111,7 @@ jobs:
        cd $GITHUB_WORKSPACE
        PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./shark_tmp/shark_cache" -k cpu
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank -k cpu
        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv

@@ -121,7 +121,7 @@ jobs:
        cd $GITHUB_WORKSPACE
        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./shark_tmp/shark_cache" -k cuda
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank -k cuda
        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
        # Disabled due to black image bug
@@ -136,7 +136,7 @@ jobs:
        export DYLD_LIBRARY_PATH=/usr/local/lib/
        echo $PATH
        pip list | grep -E "torch|iree"
-        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./shark_tmp/shark_cache" -k vulkan
+        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" -k vulkan

    - name: Validate Vulkan Models (a100)
      if: matrix.suite == 'vulkan' && matrix.os == 'a100'
@@ -144,7 +144,7 @@ jobs:
        cd $GITHUB_WORKSPACE
        PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./shark_tmp/shark_cache" -k vulkan
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank -k vulkan
        python build_tools/stable_diffusion_testing.py --device=vulkan

    - name: Validate Vulkan Models (Windows)
--- a/.gitignore
+++ b/.gitignore
@@ -159,6 +159,9 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/

+# vscode related
+.vscode
+
 # Shark related artefacts
 *venv/
 shark_tmp/
@@ -172,3 +175,10 @@ onnx_models/

 # Generated images
 generated_imgs/
+
+# Custom model related artefacts
+apps/stable_diffusion/src/utils/resources/variants.json
+models/
+
+# models folder
+apps/stable_diffusion/web/models/
--- a/apps/stable_diffusion/shark_sd.spec
+++ b/apps/stable_diffusion/shark_sd.spec
@@ -30,6 +30,7 @@ datas += [
         ( 'src/utils/resources/model_db.json', 'resources' ),
         ( 'src/utils/resources/opt_flags.json', 'resources' ),
         ( 'src/utils/resources/base_model.json', 'resources' ),
+         ( 'web/css/*', 'css' ),
         ( 'web/logos/*', 'logos' )
         ]

--- a/apps/stable_diffusion/src/models/model_wrappers.py
+++ b/apps/stable_diffusion/src/models/model_wrappers.py
@@ -13,6 +13,7 @@ from apps.stable_diffusion.src.utils import (
    fetch_or_delete_vmfbs,
    preprocessCKPT,
    get_path_to_diffusers_checkpoint,
+    fetch_and_update_base_model_id,
 )


@@ -79,8 +80,8 @@ class SharkifyStableDiffusionModel:
        self.width = width // 8
        self.batch_size = batch_size
        self.custom_weights = custom_weights
-        if self.custom_weights != "":
-            assert self.custom_weights.lower().endswith(
+        if custom_weights != "":
+            assert custom_weights.lower().endswith(
                (".ckpt", ".safetensors")
            ), "checkpoint files supported can be any of [.ckpt, .safetensors] type"
            custom_weights = get_path_to_diffusers_checkpoint(custom_weights)
@@ -216,35 +217,72 @@ class SharkifyStableDiffusionModel:
        )
        return shark_clip

+    # Compiles Clip, Unet and Vae with `base_model_id` as defining their input
+    # configuration.
+    def compile_all(self, base_model_id):
+        self.inputs = get_input_info(
+            base_models[base_model_id],
+            self.max_len,
+            self.width,
+            self.height,
+            self.batch_size,
+        )
+        compiled_unet = self.get_unet()
+        compiled_vae = self.get_vae()
+        compiled_clip = self.get_clip()
+        
+        return compiled_clip, compiled_unet, compiled_vae
+
    def __call__(self):
+        # Step 1:
+        # --  Fetch all vmfbs for the model, if present, else delete the lot.
        vmfbs = fetch_or_delete_vmfbs(
            self.model_name, self.base_vae, self.precision
-        )
+        )   
        if vmfbs[0]:
-            print("Loading vmfbs from cache")
+            # -- If all vmfbs are indeed present, we also try and fetch the base
+            #    model configuration for running SD with custom checkpoints.
+            if self.custom_weights != "":
+                args.hf_model_id = fetch_and_update_base_model_id(self.custom_weights)
+            if args.hf_model_id == "":
+                sys.exit("Base model configuration for the custom model is missing. Use `--clear_all` and re-run.")
+            print("Loaded vmfbs from cache and successfully fetched base model configuration.")
            return vmfbs
+
+        # Step 2:
+        # -- If vmfbs weren't found, we try to see if the base model configuration
+        #    for the required SD run is known to us and bypass the retry mechanism.
+        model_to_run = ""
        if self.custom_weights != "":
+            model_to_run = self.custom_weights
            assert self.custom_weights.lower().endswith(
                (".ckpt", ".safetensors")
            ), "checkpoint files supported can be any of [.ckpt, .safetensors] type"
            preprocessCKPT(self.custom_weights)
+        else:
+            model_to_run = args.hf_model_id
+        base_model_fetched = fetch_and_update_base_model_id(model_to_run)
+        if base_model_fetched != "":
+            print("Compiling all the models with the fetched base model configuration.")
+            if args.ckpt_loc != "":
+                args.hf_model_id = base_model_fetched
+            return self.compile_all(base_model_fetched)
+
+        # Step 3:
+        # -- This is the retry mechanism: the base model's configuration is not
+        #    known to us, so we figure it out by trial and error.
+        print("Inferring base model configuration.")
        for model_id in base_models:
-            self.inputs = get_input_info(
-                base_models[model_id],
-                self.max_len,
-                self.width,
-                self.height,
-                self.batch_size,
-            )
            try:
-                compiled_unet = self.get_unet()
-                compiled_vae = self.get_vae()
-                compiled_clip = self.get_clip()
+                compiled_clip, compiled_unet, compiled_vae = self.compile_all(model_id)
            except Exception as e:
                if args.enable_stack_trace:
                    traceback.print_exc()
                print("Retrying with a different base model configuration")
                continue
+            # -- Once a successful compilation has taken place we'd want to store
+            #    the base model's configuration inferred.
+            fetch_and_update_base_model_id(model_to_run, model_id)
            # This is done just because in main.py we are basing the choice of tokenizer and scheduler
            # on `args.hf_model_id`. Since now, we don't maintain 1:1 mapping of variants and the base
            # model and rely on retrying method to find the input configuration, we should also update
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py
@@ -186,6 +186,8 @@ class StableDiffusionPipeline:
        use_tuned: bool,
    ):
        if import_mlir:
+            # TODO: Delete this when on-the-fly tuning of models works.
+            use_tuned = False
            mlir_import = SharkifyStableDiffusionModel(
                model_id,
                ckpt_loc,
--- a/apps/stable_diffusion/src/utils/init.py
+++ b/apps/stable_diffusion/src/utils/init.py
@@ -8,6 +8,7 @@ from apps.stable_diffusion.src.utils.resources import (
    base_models,
    opt_flags,
    resource_path,
+    fetch_and_update_base_model_id,
 )
 from apps.stable_diffusion.src.utils.sd_annotation import sd_model_annotation
 from apps.stable_diffusion.src.utils.stable_args import args
--- a/apps/stable_diffusion/src/utils/resources.py
+++ b/apps/stable_diffusion/src/utils/resources.py
@@ -35,3 +35,28 @@ base_models = get_json_file("resources/base_model.json")

 # Contains optimization flags for different models.
 opt_flags = get_json_file("resources/opt_flags.json")
+
+
+# `fetch_and_update_base_model_id` is a resource utility function which
+# helps maintain the mapping of the model to run to its base model.
+# If `base_model` is "", then this function tries to fetch the base model
+# info for the `model_to_run`.
+def fetch_and_update_base_model_id(model_to_run, base_model=""):
+    path = "resources/variants.json"
+    loc_json = resource_path(path)
+    data = {model_to_run: base_model}
+    json_data = {}
+    if os.path.exists(loc_json):
+        with open(loc_json, "r", encoding="utf-8") as jsonFile:
+            json_data = json.load(jsonFile)
+            # Return with base_model's info if base_model is "".
+            if base_model == "":
+                if model_to_run in json_data:
+                    base_model = json_data[model_to_run]
+                return base_model
+    elif base_model == "":
+        return base_model
+    # Update JSON data to contain an entry mapping model_to_run with base_model.
+    json_data.update(data)
+    with open(loc_json, "w", encoding="utf-8") as jsonFile:
+        json.dump(json_data, jsonFile)
--- a/build_tools/populate_sharktank_ci.sh
+++ b/build_tools/populate_sharktank_ci.sh
@@ -2,4 +2,4 @@

 IMPORTER=1 BENCHMARK=1 ./setup_venv.sh
 source $GITHUB_WORKSPACE/shark.venv/bin/activate
-python generate_sharktank.py --upload=False --ci_tank_dir=True
+python generate_sharktank.py
--- a/generate_sharktank.py
+++ b/generate_sharktank.py
@@ -13,7 +13,6 @@ import os
 import csv
 import argparse
 from shark.shark_importer import SharkImporter
-from shark.parser import shark_args
 import subprocess as sp
 import hashlib
 import numpy as np
@@ -267,16 +266,17 @@ if __name__ == "__main__":
    # old_args = parser.parse_args()

    home = str(Path.home())
-    if args.ci_tank_dir == True:
-        WORKDIR = os.path.join(os.path.dirname(__file__), "gen_shark_tank")
-    else:
-        WORKDIR = os.path.join(home, ".local/shark_tank/")
+    WORKDIR = os.path.join(os.path.dirname(__file__), "gen_shark_tank")
+    torch_model_csv = os.path.join(
+        os.path.dirname(__file__), "tank", "torch_model_list.csv"
+    )
+    tf_model_csv = os.path.join(
+        os.path.dirname(__file__), "tank", "tf_model_list.csv"
+    )
+    tflite_model_csv = os.path.join(
+        os.path.dirname(__file__), "tank", "tflite", "tflite_model_list.csv"
+    )

-    if args.torch_model_csv:
-        save_torch_model(args.torch_model_csv)
-
-    if args.tf_model_csv:
-        save_tf_model(args.tf_model_csv)
-
-    if args.tflite_model_csv:
-        save_tflite_model(args.tflite_model_csv)
+    save_torch_model(torch_model_csv)
+    save_tf_model(tf_model_csv)
+    save_tflite_model(tflite_model_csv)
--- a/shark/parser.py
+++ b/shark/parser.py
@@ -15,24 +15,6 @@
 import argparse
 import os

-
-def dir_path(path):
-    if os.path.isdir(path):
-        return path
-    else:
-        os.mkdir(path)
-        return path
-
-
-def dir_file(path):
-    if os.path.isfile(path):
-        return path
-    else:
-        raise argparse.ArgumentTypeError(
-            f"readable_file:{path} is not a valid file"
-        )
-
-
 parser = argparse.ArgumentParser(description="SHARK runner.")
 parser.add_argument(
    "--device",
@@ -40,12 +22,6 @@ parser.add_argument(
    default="cpu",
    help="Device on which shark_runner runs. options are cpu, cuda, and vulkan",
 )
-parser.add_argument(
-    "--repro_dir",
-    help="Directory to which module files will be saved for reproduction or debugging.",
-    type=dir_path,
-    default="shark_tmp",
-)
 parser.add_argument(
    "--enable_tf32",
    type=bool,
@@ -83,10 +59,16 @@ parser.add_argument(
 )
 parser.add_argument(
    "--update_tank",
-    default=False,
+    default=True,
    action="store_true",
    help="When enabled, SHARK downloader will update local shark_tank if local hash is different from latest upstream hash.",
 )
+parser.add_argument(
+    "--force_update_tank",
+    default=False,
+    action="store_true",
+    help="When enabled, SHARK downloader will force an update of local shark_tank artifacts for each request.",
+)
 parser.add_argument(
    "--local_tank_cache",
    default=None,
--- a/shark/shark_benchmark_runner.py
+++ b/shark/shark_benchmark_runner.py
@@ -82,7 +82,7 @@ class SharkBenchmarkRunner(SharkRunner):
            self.vmfb_file = export_iree_module_to_vmfb(
                mlir_module,
                device,
-                shark_args.repro_dir,
+                ".",
                self.mlir_dialect,
                extra_args=self.extra_args,
            )
--- a/shark/shark_downloader.py
+++ b/shark/shark_downloader.py
@@ -79,23 +79,21 @@ input_type_to_np_dtype = {
 # Save the model in the home local so it needn't be fetched everytime in the CI.
 home = str(Path.home())
 alt_path = os.path.join(os.path.dirname(__file__), "../gen_shark_tank/")
-custom_path_list = None
-if shark_args.local_tank_cache is not None:
-    custom_path_list = shark_args.local_tank_cache.split("/")
+custom_path = shark_args.local_tank_cache

-if os.path.exists(alt_path):
-    WORKDIR = alt_path
-    print(
-        f"Using {WORKDIR} as shark_tank directory. Delete this directory if you aren't working from locally generated shark_tank."
-    )
-if custom_path_list:
-    custom_path = os.path.join(*custom_path_list)
+if custom_path is not None:
    if not os.path.exists(custom_path):
        os.mkdir(custom_path)

    WORKDIR = custom_path

    print(f"Using {WORKDIR} as local shark_tank cache directory.")
+
+if os.path.exists(alt_path):
+    WORKDIR = alt_path
+    print(
+        f"Using {WORKDIR} as shark_tank directory. Delete this directory if you aren't working from locally generated shark_tank."
+    )
 else:
    WORKDIR = os.path.join(home, ".local/shark_tank/")
    print(
@@ -148,15 +146,14 @@ def download_model(
    model_dir = os.path.join(WORKDIR, model_dir_name)
    full_gs_url = tank_url.rstrip("/") + "/" + model_dir_name

-    if shark_args.update_tank == True:
-        print(f"Updating artifacts for model {model_name}...")
-        download_public_file(full_gs_url, model_dir)
-
-    elif not check_dir_exists(
+    if not check_dir_exists(
        model_dir_name, frontend=frontend, dynamic=dyn_str
    ):
        print(f"Downloading artifacts for model {model_name}...")
        download_public_file(full_gs_url, model_dir)
+    elif shark_args.force_update_tank == True:
+        print(f"Force-updating artifacts for model {model_name}...")
+        download_public_file(full_gs_url, model_dir)
    else:
        if not _internet_connected():
            print(
@@ -178,7 +175,11 @@ def download_model(
                )
            except FileNotFoundError:
                upstream_hash = None
-            if local_hash != upstream_hash:
+            if local_hash != upstream_hash and shark_args.update_tank == True:
+                print(f"Updating artifacts for model {model_name}...")
+                download_public_file(full_gs_url, model_dir)
+
+            elif local_hash != upstream_hash:
                print(
                    "Hash does not match upstream in gs://shark_tank/latest. If you want to use locally generated artifacts, this is working as intended. Otherwise, run with --update_tank."
                )
--- a/shark/shark_importer.py
+++ b/shark/shark_importer.py
@@ -81,7 +81,7 @@ class SharkImporter:
            self.return_str,
        )

-    def _tf_mlir(self, func_name, save_dir="./shark_tmp/"):
+    def _tf_mlir(self, func_name, save_dir="."):
        from iree.compiler import tf as tfc

        return tfc.compile_module(
@@ -91,7 +91,7 @@ class SharkImporter:
            output_file=save_dir,
        )

-    def _tflite_mlir(self, func_name, save_dir="./shark_tmp/"):
+    def _tflite_mlir(self, func_name, save_dir="."):
        from iree.compiler import tflite as tflitec

        self.mlir_model = tflitec.compile_file(
--- a/shark/torch_mlir_utils.py
+++ b/shark/torch_mlir_utils.py
@@ -65,7 +65,7 @@ def get_torch_mlir_module(
    if jit_trace:
        ignore_traced_shapes = True

-    tempfile.tempdir = shark_args.repro_dir
+    tempfile.tempdir = "."

    mlir_module = torch_mlir.compile(
        module,
--- a/tank/test_models.py
+++ b/tank/test_models.py
@@ -136,7 +136,7 @@ class SharkModuleTester:

    def create_and_check_module(self, dynamic, device):
        shark_args.local_tank_cache = self.local_tank_cache
-        shark_args.update_tank = self.update_tank
+        shark_args.force_update_tank = self.update_tank
        if "nhcw-nhwc" in self.config["flags"] and not os.path.isfile(
            ".use-iree"
        ):
@@ -212,12 +212,11 @@ class SharkModuleTester:
        )

    def save_reproducers(self):
-        # Saves contents of IREE TempFileSaver temporary directory to ./shark_tmp/saved/<test_case>.
-        src = os.path.join(*self.temp_dir.split("/"))
-        saves = os.path.join(".", "shark_tmp", "saved")
-        trg = os.path.join(saves, self.tmp_prefix)
-        if not os.path.isdir(saves):
-            os.mkdir(saves)
+        # Saves contents of IREE TempFileSaver temporary directory to ./reproducers/<test_case>.
+        src = self.temp_dir
+        trg = os.path.join("reproducers", self.tmp_prefix)
+        if not os.path.isdir("reproducers"):
+            os.mkdir("reproducers")
        if not os.path.isdir(trg):
            os.mkdir(trg)
        files = os.listdir(src)
@@ -227,10 +226,7 @@ class SharkModuleTester:
    def upload_repro(self):
        import subprocess

-        src = os.path.join(*self.temp_dir.split("/"))
-        repro_path = os.path.join(
-            ".", "shark_tmp", "saved", self.tmp_prefix, "*"
-        )
+        repro_path = os.path.join("reproducers", self.tmp_prefix, "*")

        bashCommand = f"gsutil cp -r {repro_path} gs://shark-public/builder/repro_artifacts/{self.ci_sha}/{self.tmp_prefix}/"
        process = subprocess.run(bashCommand.split())
@@ -329,11 +325,8 @@ class SharkModuleTest(unittest.TestCase):
        )
        self.module_tester.tmp_prefix = safe_name.replace("/", "_")

-        if not os.path.isdir("shark_tmp"):
-            os.mkdir("shark_tmp")
-
        tempdir = tempfile.TemporaryDirectory(
-            prefix=self.module_tester.tmp_prefix, dir="shark_tmp"
+            prefix=self.module_tester.tmp_prefix, dir="."
        )
        self.module_tester.temp_dir = tempdir.name
Author	SHA1	Message	Date
Ean Garvey	4f5b8e7a7b	Remove non-SD args from generate_sharktank.py	2023-02-07 17:43:21 +00:00
Abhishek Varma	eeb20b531a	Fix restart SD session error + override args.use_tuned temporarily -- This commit fixes the session restart error for SD. -- It also overrides `args.use_tuned` for `import_mlir`, and sets `use_tuned` as `False`. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-02-07 19:50:48 +05:30
cstueckrath	9dca842c22	Update .gitignore to exclude models (#967 ) the models folder will be stashed along with other changes and most likely kill git doing so.	2023-02-07 01:48:36 -08:00
Ean Garvey	1eb9436836	Fix generate_sharktank args.	2023-02-07 14:06:07 +05:30
Ean Garvey	9604d9ce81	make --update_tank update only if hash mismatch	2023-02-07 14:06:07 +05:30
Ean Garvey	481d0553d8	Remove unnecessary repro_dir / shark_tmp usage	2023-02-07 14:06:07 +05:30
powderluv	60035cd63a	Add css in exe (#963 ) exe should now default to dark theme too	2023-02-06 15:26:08 -08:00