Compare commits

...

15 Commits

Author SHA1 Message Date
saienduri
4529fd0461 Update requirements.txt 2024-08-06 19:29:40 -07:00
saienduri
4c2bb4b7b4 Update requirements.txt 2024-08-06 17:15:42 -07:00
saienduri
d5013fd13e Update requirements.txt (#2157) 2024-06-18 13:41:35 -07:00
Ean Garvey
26f80ccbbb Fixes to UI config defaults, config loading, and warnings. (#2153) 2024-05-31 18:14:27 -04:00
Ean Garvey
d2c3752dc7 Fix batch count and tweaks to chatbot. (#2151)
* Fix batch count

* Add button to unload models manually.

* Add compiled pipeline option

* Add brevitas to requirements

* Tweaks to chatbot

* Change script loading trigger
2024-05-31 18:48:28 +05:30
Ean Garvey
4505c4549f Force inlined weights on igpu for now, small fixes to chatbot (#2149)
* Add igpu and custom triple support.

* Small fixes to igpu, SDXL-turbo

* custom pipe loading

* formatting

* Remove old nodlogo import.
2024-05-30 11:40:42 -05:00
Gaurav Shukla
793495c9c6 [ui] Add AMD logo in shark studio
Signed-Off-by: Gaurav Shukla <gaurav.shukla@amd.com>
2024-05-30 21:43:15 +05:30
Ean Garvey
13e1d8d98a Add igpu and custom triple support. (#2148) 2024-05-29 17:39:36 -05:00
Ean Garvey
2074df40ad Point to nod fork of diffusers. (#2146) 2024-05-29 00:56:21 -05:00
Ean Garvey
7b30582408 Point to SRT links for windows. (#2145) 2024-05-29 01:20:30 -04:00
Ean Garvey
151195ab74 Add a few requirements for ensured parity with turbine-models requirements. (#2142)
* Add scipy to requirements.

Adds diffusers req and a note for torchsde.
2024-05-28 15:37:31 -05:00
Ean Garvey
8146f0bd2f Remove leftover merge conflict line from setup script. (#2141) 2024-05-28 11:04:45 -07:00
Ean Garvey
68e9281778 (Studio2) Refactors SD pipeline to rely on turbine-models pipeline, fixes to LLM, gitignore (#2129)
* Shark Studio SDXL support, HIP driver support, simpler device info, small fixes

* Fixups to llm API/UI and ignore user config files.

* Small fixes for unifying pipelines.

* Update requirements.txt for iree-turbine (#2130)

* Fix Llama2 on CPU (#2133)

* Filesystem cleanup and custom model fixes (#2127)

* Fix some formatting issues

* Remove IREE pin (fixes exe issue) (#2126)

* Update find links for IREE packages (#2136)

* Shark Studio SDXL support, HIP driver support, simpler device info, small fixes

* Abstract out SD pipelines from Studio Webui (WIP)

* Switch from pin to minimum torch version and fix index url

* Fix device parsing.

* Fix linux setup

* Fix custom weights.

---------

Co-authored-by: saienduri <77521230+saienduri@users.noreply.github.com>
Co-authored-by: gpetters-amd <159576198+gpetters-amd@users.noreply.github.com>
Co-authored-by: gpetters94 <gpetters@protonmail.com>
2024-05-28 13:18:31 -04:00
Ean Garvey
fd07cae991 Update find links for IREE packages (#2136) 2024-05-13 11:43:17 -05:00
gpetters94
6cb86a843e Remove IREE pin (fixes exe issue) (#2126)
* Diagnose a build issue

* Remove IREE pin

* Revert the build on pull request change
2024-04-30 12:27:30 -05:00
31 changed files with 685 additions and 620 deletions

View File

@@ -53,6 +53,7 @@ jobs:
python process_skipfiles.py
$env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
pip install -e .
pip freeze -l
pyinstaller .\apps\shark_studio\shark_studio.spec
mv ./dist/nodai_shark_studio.exe ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe
signtool sign /f c:\g\shark_02152023.cer /fd certHash /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe

View File

@@ -81,6 +81,5 @@ jobs:
source shark.venv/bin/activate
pip install -r requirements.txt --no-cache-dir
pip install -e .
pip uninstall -y torch
pip install torch==2.1.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
python apps/shark_studio/tests/api_test.py
# Disabled due to hang when exporting test llama2
# python apps/shark_studio/tests/api_test.py

8
.gitignore vendored
View File

@@ -164,7 +164,7 @@ cython_debug/
# vscode related
.vscode
# Shark related artefacts
# Shark related artifacts
*venv/
shark_tmp/
*.vmfb
@@ -172,6 +172,7 @@ shark_tmp/
tank/dict_configs.py
*.csv
reproducers/
apps/shark_studio/web/configs
# ORT related artefacts
cache_models/
@@ -188,6 +189,11 @@ variants.json
# models folder
apps/stable_diffusion/web/models/
# model artifacts (SHARK)
*.tempfile
*.mlir
*.vmfb
# Stencil annotators.
stencil_annotator/

View File

@@ -3,8 +3,13 @@ from turbine_models.model_runner import vmfbRunner
from turbine_models.gen_external_params.gen_external_params import gen_external_params
import time
from shark.iree_utils.compile_utils import compile_module_to_flatbuffer
from apps.shark_studio.web.utils.file_utils import get_resource_path
from apps.shark_studio.web.utils.file_utils import (
get_resource_path,
get_checkpoints_path,
)
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
from apps.shark_studio.api.utils import parse_device
from urllib.request import urlopen
import iree.runtime as ireert
from itertools import chain
import gc
@@ -65,6 +70,7 @@ class LanguageModel:
use_system_prompt=True,
streaming_llm=False,
):
_, _, self.triple = parse_device(device)
self.hf_model_name = llm_model_map[model_name]["hf_model_name"]
self.device = device.split("=>")[-1].strip()
self.backend = self.device.split("://")[0]
@@ -155,7 +161,9 @@ class LanguageModel:
use_auth_token=hf_auth_token,
)
elif not os.path.exists(self.tempfile_name):
self.torch_ir, self.tokenizer = llm_model_map[model_name]["initializer"](
self.torch_ir, self.tokenizer = llm_model_map[self.hf_model_name][
"initializer"
](
self.hf_model_name,
hf_auth_token,
compile_to="torch",
@@ -163,6 +171,7 @@ class LanguageModel:
precision=self.precision,
quantization=self.quantization,
streaming_llm=self.streaming_llm,
decomp_attn=True,
)
with open(self.tempfile_name, "w+") as f:
f.write(self.torch_ir)
@@ -192,11 +201,27 @@ class LanguageModel:
)
elif self.backend == "vulkan":
flags.extend(["--iree-stream-resource-max-allocation-size=4294967296"])
elif self.backend == "rocm":
flags.extend(
[
"--iree-codegen-llvmgpu-enable-transform-dialect-jit=false",
"--iree-llvmgpu-enable-prefetch=true",
"--iree-opt-outer-dim-concat=true",
"--iree-flow-enable-aggressive-fusion",
]
)
if "gfx9" in self.triple:
flags.extend(
[
f"--iree-codegen-transform-dialect-library={get_mfma_spec_path(self.triple, get_checkpoints_path())}",
"--iree-codegen-llvmgpu-use-vector-distribution=true",
]
)
flags.extend(llm_model_map[self.hf_model_name]["compile_flags"])
flatbuffer_blob = compile_module_to_flatbuffer(
self.tempfile_name,
device=self.device,
frontend="torch",
frontend="auto",
model_config_path=None,
extra_args=flags,
write_to=self.vmfb_name,
@@ -258,8 +283,7 @@ class LanguageModel:
history.append(format_out(token))
while (
format_out(token)
!= llm_model_map["meta-llama/Llama-2-7b-chat-hf"]["stop_token"]
format_out(token) != llm_model_map[self.hf_model_name]["stop_token"]
and len(history) < self.max_tokens
):
dec_time = time.time()
@@ -273,10 +297,7 @@ class LanguageModel:
self.prev_token_len = token_len + len(history)
if (
format_out(token)
== llm_model_map["meta-llama/Llama-2-7b-chat-hf"]["stop_token"]
):
if format_out(token) == llm_model_map[self.hf_model_name]["stop_token"]:
break
for i in range(len(history)):
@@ -310,7 +331,7 @@ class LanguageModel:
self.first_input = False
history.append(int(token))
while token != llm_model_map["meta-llama/Llama-2-7b-chat-hf"]["stop_token"]:
while token != llm_model_map[self.hf_model_name]["stop_token"]:
dec_time = time.time()
result = self.hf_mod(token.reshape([1, 1]), past_key_values=pkv)
history.append(int(token))
@@ -321,7 +342,7 @@ class LanguageModel:
self.prev_token_len = token_len + len(history)
if token == llm_model_map["meta-llama/Llama-2-7b-chat-hf"]["stop_token"]:
if token == llm_model_map[self.hf_model_name]["stop_token"]:
break
for i in range(len(history)):
if type(history[i]) != int:
@@ -331,6 +352,17 @@ class LanguageModel:
return result_output, total_time
def get_mfma_spec_path(target_chip, save_dir):
    """Return the local path of the attention/matmul transform-dialect spec.

    The spec is cached as ``attention_and_matmul_spec_mfma.mlir`` inside
    ``save_dir``. It is downloaded only when no cached copy exists, so a
    previously fetched spec keeps working without network access.

    Args:
        target_chip: Target chip name. Currently unused: the same spec file
            is returned for every chip.
        save_dir: Directory that holds the cached spec. Created if missing.

    Returns:
        Path of the spec file on disk.

    Raises:
        urllib.error.URLError: If the spec is not cached and the download
            fails.
    """
    spec_path = os.path.join(save_dir, "attention_and_matmul_spec_mfma.mlir")
    # Check the cache first; the download is only needed on a cache miss.
    if os.path.exists(spec_path):
        return spec_path

    url = "https://raw.githubusercontent.com/iree-org/iree/main/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir"
    with urlopen(url) as response:
        attn_spec = response.read().decode("utf-8")

    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    # The payload was decoded as UTF-8, so write it back with the same
    # encoding instead of the platform default.
    with open(spec_path, "w", encoding="utf-8") as f:
        f.write(attn_spec)
    return spec_path
def llm_chat_api(InputData: dict):
from datetime import datetime as dt

View File

@@ -1,54 +1,82 @@
import gc
import torch
import gradio as gr
import time
import os
import json
import numpy as np
import copy
import importlib.util
import sys
from tqdm.auto import tqdm
from pathlib import Path
from random import randint
from turbine_models.custom_models.sd_inference import clip, unet, vae
from turbine_models.custom_models.sd_inference.sd_pipeline import SharkSDPipeline
from turbine_models.custom_models.sdxl_inference.sdxl_compiled_pipeline import (
SharkSDXLPipeline,
)
from apps.shark_studio.api.controlnet import control_adapter_map
from apps.shark_studio.api.utils import parse_device
from apps.shark_studio.web.utils.state import status_label
from apps.shark_studio.web.utils.file_utils import (
safe_name,
get_resource_path,
get_checkpoints_path,
)
from apps.shark_studio.modules.pipeline import SharkPipelineBase
from apps.shark_studio.modules.schedulers import get_schedulers
from apps.shark_studio.modules.prompt_encoding import (
get_weighted_text_embeddings,
)
from apps.shark_studio.modules.img_processing import (
resize_stencil,
save_output_img,
resamplers,
resampler_list,
)
from apps.shark_studio.modules.ckpt_processing import (
preprocessCKPT,
process_custom_pipe_weights,
save_irpa,
)
from transformers import CLIPTokenizer
from diffusers.image_processor import VaeImageProcessor
sd_model_map = {
"clip": {
"initializer": clip.export_clip_model,
},
"unet": {
"initializer": unet.export_unet_model,
},
"vae_decode": {
"initializer": vae.export_vae_model,
},
EMPTY_SD_MAP = {
"clip": None,
"scheduler": None,
"unet": None,
"vae_decode": None,
}
EMPTY_SDXL_MAP = {
"prompt_encoder": None,
"scheduled_unet": None,
"vae_decode": None,
"pipeline": None,
"full_pipeline": None,
}
EMPTY_FLAGS = {
"clip": None,
"unet": None,
"vae": None,
"pipeline": None,
}
class StableDiffusion(SharkPipelineBase):
def load_script(source, module_name):
    """
    reads file source and loads it as a module

    The module is registered in sys.modules before it is executed. If
    execution raises, the registration is rolled back (restoring whatever was
    registered under that name before) so a half-initialised module is never
    left behind.

    :param source: file to load
    :param module_name: name of module to register in sys.modules
    :return: loaded module
    :raises ImportError: if no import spec/loader can be built for source
    """
    spec = importlib.util.spec_from_file_location(module_name, source)
    # spec_from_file_location returns None when it cannot pick a loader for
    # the file; fail with a clear message instead of an AttributeError.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"Cannot load {source!r} as module {module_name!r}: no loader available."
        )
    module = importlib.util.module_from_spec(spec)

    previous = sys.modules.get(module_name)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Undo the registration so later imports do not see a broken module.
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return module
class StableDiffusion:
# This class is responsible for executing image generation and creating
# /managing a set of compiled modules to run Stable Diffusion. The init
# aims to be as general as possible, and the class will infer and compile
@@ -61,66 +89,45 @@ class StableDiffusion(SharkPipelineBase):
height: int,
width: int,
batch_size: int,
steps: int,
scheduler: str,
precision: str,
device: str,
target_triple: str = None,
custom_vae: str = None,
num_loras: int = 0,
import_ir: bool = True,
is_controlled: bool = False,
hf_auth_token=None,
external_weights: str = "safetensors",
):
self.model_max_length = 77
self.batch_size = batch_size
self.precision = precision
self.dtype = torch.float16 if precision == "fp16" else torch.float32
self.height = height
self.width = width
self.scheduler_obj = {}
static_kwargs = {
"pipe": {
"external_weights": "safetensors",
},
"clip": {"hf_model_name": base_model_id},
"unet": {
"hf_model_name": base_model_id,
"unet_model": unet.UnetModel(hf_model_name=base_model_id),
"batch_size": batch_size,
# "is_controlled": is_controlled,
# "num_loras": num_loras,
"height": height,
"width": width,
"precision": precision,
"max_length": self.model_max_length,
},
"vae_encode": {
"hf_model_name": base_model_id,
"vae_model": vae.VaeModel(
hf_model_name=custom_vae if custom_vae else base_model_id,
),
"batch_size": batch_size,
"height": height,
"width": width,
"precision": precision,
},
"vae_decode": {
"hf_model_name": base_model_id,
"vae_model": vae.VaeModel(
hf_model_name=custom_vae if custom_vae else base_model_id,
),
"batch_size": batch_size,
"height": height,
"width": width,
"precision": precision,
},
}
super().__init__(sd_model_map, base_model_id, static_kwargs, device, import_ir)
self.compiled_pipeline = False
self.base_model_id = base_model_id
self.custom_vae = custom_vae
self.is_sdxl = "xl" in self.base_model_id.lower()
self.is_custom = ".py" in self.base_model_id.lower()
if self.is_custom:
custom_module = load_script(
os.path.join(get_checkpoints_path("scripts"), self.base_model_id),
"custom_pipeline",
)
self.turbine_pipe = custom_module.StudioPipeline
self.model_map = custom_module.MODEL_MAP
elif self.is_sdxl:
self.turbine_pipe = SharkSDXLPipeline
self.model_map = EMPTY_SDXL_MAP
else:
self.turbine_pipe = SharkSDPipeline
self.model_map = EMPTY_SD_MAP
max_length = 64
target_backend, self.rt_device, triple = parse_device(device, target_triple)
pipe_id_list = [
safe_name(base_model_id),
str(batch_size),
str(self.model_max_length),
str(max_length),
f"{str(height)}x{str(width)}",
precision,
self.device,
triple,
]
if num_loras > 0:
pipe_id_list.append(str(num_loras) + "lora")
@@ -129,305 +136,147 @@ class StableDiffusion(SharkPipelineBase):
if custom_vae:
pipe_id_list.append(custom_vae)
self.pipe_id = "_".join(pipe_id_list)
print(f"\n[LOG] Pipeline initialized with pipe_id: {self.pipe_id}.")
del static_kwargs
gc.collect()
def prepare_pipe(self, custom_weights, adapters, embeddings, is_img2img):
print(f"\n[LOG] Preparing pipeline...")
self.is_img2img = is_img2img
self.schedulers = get_schedulers(self.base_model_id)
self.weights_path = os.path.join(
get_checkpoints_path(), self.safe_name(self.base_model_id)
self.pipeline_dir = Path(os.path.join(get_checkpoints_path(), self.pipe_id))
self.weights_path = Path(
os.path.join(
get_checkpoints_path(), safe_name(self.base_model_id + "_" + precision)
)
)
if not os.path.exists(self.weights_path):
os.mkdir(self.weights_path)
for model in adapters:
self.model_map[model] = adapters[model]
decomp_attn = True
attn_spec = None
if triple in ["gfx940", "gfx942", "gfx90a"]:
decomp_attn = False
attn_spec = "mfma"
elif triple in ["gfx1100", "gfx1103", "gfx1150"]:
decomp_attn = False
attn_spec = "wmma"
if triple in ["gfx1103", "gfx1150"]:
# external weights have issues on igpu
external_weights = None
elif target_backend == "llvm-cpu":
decomp_attn = False
for submodel in self.static_kwargs:
if custom_weights:
custom_weights_params, _ = process_custom_pipe_weights(custom_weights)
if submodel not in ["clip", "clip2"]:
self.static_kwargs[submodel][
"external_weights"
] = custom_weights_params
else:
self.static_kwargs[submodel]["external_weight_path"] = os.path.join(
self.weights_path, submodel + ".safetensors"
self.sd_pipe = self.turbine_pipe(
hf_model_name=base_model_id,
scheduler_id=scheduler,
height=height,
width=width,
precision=precision,
max_length=max_length,
batch_size=batch_size,
num_inference_steps=steps,
device=target_backend,
iree_target_triple=triple,
ireec_flags=EMPTY_FLAGS,
attn_spec=attn_spec,
decomp_attn=decomp_attn,
pipeline_dir=self.pipeline_dir,
external_weights_dir=self.weights_path,
external_weights=external_weights,
custom_vae=custom_vae,
)
print(f"\n[LOG] Pipeline initialized with pipe_id: {self.pipe_id}.")
gc.collect()
def prepare_pipe(
self, custom_weights, adapters, embeddings, is_img2img, compiled_pipeline
):
print(f"\n[LOG] Preparing pipeline...")
self.is_img2img = False
mlirs = copy.deepcopy(self.model_map)
vmfbs = copy.deepcopy(self.model_map)
weights = copy.deepcopy(self.model_map)
if not self.is_sdxl:
compiled_pipeline = False
self.compiled_pipeline = compiled_pipeline
if custom_weights:
custom_weights = os.path.join(
get_checkpoints_path("checkpoints"),
safe_name(self.base_model_id.split("/")[-1]),
custom_weights,
)
diffusers_weights_path = preprocessCKPT(custom_weights, self.precision)
for key in weights:
if key in ["scheduled_unet", "unet"]:
unet_weights_path = os.path.join(
diffusers_weights_path,
"unet",
"diffusion_pytorch_model.safetensors",
)
else:
self.static_kwargs[submodel]["external_weight_path"] = os.path.join(
self.weights_path, submodel + ".safetensors"
)
weights[key] = save_irpa(unet_weights_path, "unet.")
self.get_compiled_map(pipe_id=self.pipe_id)
print("\n[LOG] Pipeline successfully prepared for runtime.")
elif key in ["clip", "prompt_encoder"]:
if not self.is_sdxl:
sd1_path = os.path.join(
diffusers_weights_path, "text_encoder", "model.safetensors"
)
weights[key] = save_irpa(sd1_path, "text_encoder_model.")
else:
clip_1_path = os.path.join(
diffusers_weights_path, "text_encoder", "model.safetensors"
)
clip_2_path = os.path.join(
diffusers_weights_path,
"text_encoder_2",
"model.safetensors",
)
weights[key] = [
save_irpa(clip_1_path, "text_encoder_model_1."),
save_irpa(clip_2_path, "text_encoder_model_2."),
]
elif key in ["vae_decode"] and weights[key] is None:
vae_weights_path = os.path.join(
diffusers_weights_path,
"vae",
"diffusion_pytorch_model.safetensors",
)
weights[key] = save_irpa(vae_weights_path, "vae.")
vmfbs, weights = self.sd_pipe.check_prepared(
mlirs, vmfbs, weights, interactive=False
)
print(f"\n[LOG] Loading pipeline to device {self.rt_device}.")
self.sd_pipe.load_pipeline(
vmfbs, weights, self.rt_device, self.compiled_pipeline
)
print(
"\n[LOG] Pipeline successfully prepared for runtime. Generating images..."
)
return
def encode_prompts_weight(
self,
prompt,
negative_prompt,
do_classifier_free_guidance=True,
):
# Encodes the prompt into text encoder hidden states.
self.load_submodels(["clip"])
self.tokenizer = CLIPTokenizer.from_pretrained(
self.base_model_id,
subfolder="tokenizer",
)
clip_inf_start = time.time()
text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
pipe=self,
prompt=prompt,
uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
)
if do_classifier_free_guidance:
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
pad = (0, 0) * (len(text_embeddings.shape) - 2)
pad = pad + (
0,
self.static_kwargs["unet"]["max_length"] - text_embeddings.shape[1],
)
text_embeddings = torch.nn.functional.pad(text_embeddings, pad)
# SHARK: Report clip inference time
clip_inf_time = (time.time() - clip_inf_start) * 1000
if self.ondemand:
self.unload_submodels(["clip"])
gc.collect()
print(f"\n[LOG] Clip Inference time (ms) = {clip_inf_time:.3f}")
return text_embeddings.numpy().astype(np.float16)
def prepare_latents(
self,
generator,
num_inference_steps,
image,
strength,
):
noise = torch.randn(
(
self.batch_size,
4,
self.height // 8,
self.width // 8,
),
generator=generator,
dtype=self.dtype,
).to("cpu")
self.scheduler.set_timesteps(num_inference_steps)
if self.is_img2img:
init_timestep = min(
int(num_inference_steps * strength), num_inference_steps
)
t_start = max(num_inference_steps - init_timestep, 0)
timesteps = self.scheduler.timesteps[t_start:]
latents = self.encode_image(image)
latents = self.scheduler.add_noise(latents, noise, timesteps[0].repeat(1))
return latents, [timesteps]
else:
self.scheduler.is_scale_input_called = True
latents = noise * self.scheduler.init_noise_sigma
return latents, self.scheduler.timesteps
def encode_image(self, input_image):
self.load_submodels(["vae_encode"])
vae_encode_start = time.time()
latents = self.run("vae_encode", input_image)
vae_inf_time = (time.time() - vae_encode_start) * 1000
if self.ondemand:
self.unload_submodels(["vae_encode"])
print(f"\n[LOG] VAE Encode Inference time (ms): {vae_inf_time:.3f}")
return latents
def produce_img_latents(
self,
latents,
text_embeddings,
guidance_scale,
total_timesteps,
cpu_scheduling,
mask=None,
masked_image_latents=None,
return_all_latents=False,
):
# self.status = SD_STATE_IDLE
step_time_sum = 0
latent_history = [latents]
text_embeddings = torch.from_numpy(text_embeddings).to(self.dtype)
text_embeddings_numpy = text_embeddings.detach().numpy()
guidance_scale = torch.Tensor([guidance_scale]).to(self.dtype)
self.load_submodels(["unet"])
for i, t in tqdm(enumerate(total_timesteps)):
step_start_time = time.time()
timestep = torch.tensor([t]).to(self.dtype).detach().numpy()
latent_model_input = self.scheduler.scale_model_input(latents, t).to(
self.dtype
)
if mask is not None and masked_image_latents is not None:
latent_model_input = torch.cat(
[
torch.from_numpy(np.asarray(latent_model_input)).to(self.dtype),
mask,
masked_image_latents,
],
dim=1,
).to(self.dtype)
if cpu_scheduling:
latent_model_input = latent_model_input.detach().numpy()
# Profiling Unet.
# profile_device = start_profiling(file_path="unet.rdc")
noise_pred = self.run(
"unet",
[
latent_model_input,
timestep,
text_embeddings_numpy,
guidance_scale,
],
)
# end_profiling(profile_device)
if cpu_scheduling:
noise_pred = torch.from_numpy(noise_pred.to_host())
latents = self.scheduler.step(noise_pred, t, latents).prev_sample
else:
latents = self.run("scheduler_step", (noise_pred, t, latents))
latent_history.append(latents)
step_time = (time.time() - step_start_time) * 1000
# print(
# f"\n [LOG] step = {i} | timestep = {t} | time = {step_time:.2f}ms"
# )
step_time_sum += step_time
# if self.status == SD_STATE_CANCEL:
# break
if self.ondemand:
self.unload_submodels(["unet"])
gc.collect()
avg_step_time = step_time_sum / len(total_timesteps)
print(f"\n[LOG] Average step time: {avg_step_time}ms/it")
if not return_all_latents:
return latents
all_latents = torch.cat(latent_history, dim=0)
return all_latents
def decode_latents(self, latents, cpu_scheduling=True):
latents_numpy = latents.to(self.dtype)
if cpu_scheduling:
latents_numpy = latents.detach().numpy()
# profile_device = start_profiling(file_path="vae.rdc")
vae_start = time.time()
images = self.run("vae_decode", latents_numpy).to_host()
vae_inf_time = (time.time() - vae_start) * 1000
# end_profiling(profile_device)
print(f"\n[LOG] VAE Inference time (ms): {vae_inf_time:.3f}")
images = torch.from_numpy(images).permute(0, 2, 3, 1).float().numpy()
pil_images = self.image_processor.numpy_to_pil(images)
return pil_images
def generate_images(
self,
prompt,
negative_prompt,
image,
scheduler,
steps,
strength,
guidance_scale,
seed,
ondemand,
repeatable_seeds,
resample_type,
control_mode,
hints,
):
# TODO: Batched args
self.image_processor = VaeImageProcessor(do_convert_rgb=True)
self.scheduler = self.schedulers[scheduler]
self.ondemand = ondemand
if self.is_img2img:
image, _ = self.image_processor.preprocess(image, resample_type)
else:
image = None
print("\n[LOG] Generating images...")
batched_args = [
prompt,
negative_prompt,
image,
]
for arg in batched_args:
if not isinstance(arg, list):
arg = [arg] * self.batch_size
if len(arg) < self.batch_size:
arg = arg * self.batch_size
else:
arg = [arg[i] for i in range(self.batch_size)]
text_embeddings = self.encode_prompts_weight(
img = self.sd_pipe.generate_images(
prompt,
negative_prompt,
1,
guidance_scale,
seed,
return_imgs=True,
)
uint32_info = np.iinfo(np.uint32)
uint32_min, uint32_max = uint32_info.min, uint32_info.max
if seed < uint32_min or seed >= uint32_max:
seed = randint(uint32_min, uint32_max)
generator = torch.manual_seed(seed)
init_latents, final_timesteps = self.prepare_latents(
generator=generator,
num_inference_steps=steps,
image=image,
strength=strength,
)
latents = self.produce_img_latents(
latents=init_latents,
text_embeddings=text_embeddings,
guidance_scale=guidance_scale,
total_timesteps=final_timesteps,
cpu_scheduling=True, # until we have schedulers through Turbine
)
# Img latents -> PIL images
all_imgs = []
self.load_submodels(["vae_decode"])
for i in tqdm(range(0, latents.shape[0], self.batch_size)):
imgs = self.decode_latents(
latents=latents[i : i + self.batch_size],
cpu_scheduling=True,
)
all_imgs.extend(imgs)
if self.ondemand:
self.unload_submodels(["vae_decode"])
return all_imgs
return img
def shark_sd_fn_dict_input(
sd_kwargs: dict,
):
print("[LOG] Submitting Request...")
print("\n[LOG] Submitting Request...")
for key in sd_kwargs:
if sd_kwargs[key] in [None, []]:
@@ -437,9 +286,34 @@ def shark_sd_fn_dict_input(
if key == "seed":
sd_kwargs[key] = int(sd_kwargs[key])
for i in range(1):
generated_imgs = yield from shark_sd_fn(**sd_kwargs)
yield generated_imgs
# TODO: move these checks into the UI code so we don't have gradio warnings in a generalized dict input function.
if not sd_kwargs["device"]:
gr.Warning("No device specified. Please specify a device.")
return None, ""
if sd_kwargs["height"] not in [512, 1024]:
gr.Warning("Height must be 512 or 1024. This is a temporary limitation.")
return None, ""
if sd_kwargs["height"] != sd_kwargs["width"]:
gr.Warning("Height and width must be the same. This is a temporary limitation.")
return None, ""
if sd_kwargs["base_model_id"] == "stabilityai/sdxl-turbo":
if sd_kwargs["steps"] > 10:
gr.Warning("Max steps for sdxl-turbo is 10. 1 to 4 steps are recommended.")
return None, ""
if sd_kwargs["guidance_scale"] > 3:
gr.Warning(
"sdxl-turbo CFG scale should be less than 2.0 if using negative prompt, 0 otherwise."
)
return None, ""
if sd_kwargs["target_triple"] == "":
if parse_device(sd_kwargs["device"], sd_kwargs["target_triple"])[2] == "":
gr.Warning(
"Target device architecture could not be inferred. Please specify a target triple, e.g. 'gfx1100' for a Radeon 7900xtx."
)
return None, ""
generated_imgs = yield from shark_sd_fn(**sd_kwargs)
return generated_imgs
def shark_sd_fn(
@@ -460,8 +334,9 @@ def shark_sd_fn(
custom_vae: str,
precision: str,
device: str,
target_triple: str,
ondemand: bool,
repeatable_seeds: bool,
compiled_pipeline: bool,
resample_type: str,
controlnets: dict,
embeddings: dict,
@@ -471,8 +346,6 @@ def shark_sd_fn(
sd_init_image = [sd_init_image]
is_img2img = True if sd_init_image[0] is not None else False
print("\n[LOG] Performing Stable Diffusion Pipeline setup...")
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
import apps.shark_studio.web.utils.globals as global_obj
@@ -481,6 +354,7 @@ def shark_sd_fn(
control_mode = None
hints = []
num_loras = 0
import_ir = True
for i in embeddings:
num_loras += 1 if embeddings[i] else 0
if "model" in controlnets:
@@ -512,28 +386,29 @@ def shark_sd_fn(
"batch_size": batch_size,
"precision": precision,
"device": device,
"target_triple": target_triple,
"custom_vae": custom_vae,
"num_loras": num_loras,
"import_ir": cmd_opts.import_mlir,
"import_ir": import_ir,
"is_controlled": is_controlled,
"steps": steps,
"scheduler": scheduler,
}
submit_prep_kwargs = {
"custom_weights": custom_weights,
"adapters": adapters,
"embeddings": embeddings,
"is_img2img": is_img2img,
"compiled_pipeline": compiled_pipeline,
}
submit_run_kwargs = {
"prompt": prompt,
"negative_prompt": negative_prompt,
"image": sd_init_image,
"steps": steps,
"scheduler": scheduler,
"strength": strength,
"guidance_scale": guidance_scale,
"seed": seed,
"ondemand": ondemand,
"repeatable_seeds": repeatable_seeds,
"resample_type": resample_type,
"control_mode": control_mode,
"hints": hints,
@@ -566,22 +441,35 @@ def shark_sd_fn(
for current_batch in range(batch_count):
start_time = time.time()
out_imgs = global_obj.get_sd_obj().generate_images(**submit_run_kwargs)
total_time = time.time() - start_time
text_output = f"Total image(s) generation time: {total_time:.4f}sec"
print(f"\n[LOG] {text_output}")
if not isinstance(out_imgs, list):
out_imgs = [out_imgs]
# total_time = time.time() - start_time
# text_output = f"Total image(s) generation time: {total_time:.4f}sec"
# print(f"\n[LOG] {text_output}")
# if global_obj.get_sd_status() == SD_STATE_CANCEL:
# break
# else:
save_output_img(
out_imgs[current_batch],
seed,
sd_kwargs,
)
for batch in range(batch_size):
save_output_img(
out_imgs[batch],
seed,
sd_kwargs,
)
generated_imgs.extend(out_imgs)
# TODO: make seed changes over batch counts more configurable.
submit_run_kwargs["seed"] = submit_run_kwargs["seed"] + 1
yield generated_imgs, status_label(
"Stable Diffusion", current_batch + 1, batch_count, batch_size
)
return generated_imgs, ""
return (generated_imgs, "")
def unload_sd():
    """Drop the cached Stable Diffusion objects and run a garbage collection."""
    print("Unloading models.")
    # Imported at call time, as in the rest of this module's entry points.
    from apps.shark_studio.web.utils import globals as global_obj

    global_obj.clear_cache()
    gc.collect()
def cancel_sd():
@@ -596,6 +484,10 @@ def view_json_file(file_path):
return content
def safe_name(name):
    """Return name with forward slashes, backslashes and dots turned into underscores."""
    for separator in ("/", "\\", "."):
        name = name.replace(separator, "_")
    return name
if __name__ == "__main__":
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
import apps.shark_studio.web.utils.globals as global_obj

View File

@@ -52,6 +52,13 @@ def get_available_devices():
set_iree_runtime_flags()
available_devices = []
rocm_devices = get_devices_by_name("rocm")
available_devices.extend(rocm_devices)
cpu_device = get_devices_by_name("cpu-sync")
available_devices.extend(cpu_device)
cpu_device = get_devices_by_name("cpu-task")
available_devices.extend(cpu_device)
from shark.iree_utils.vulkan_utils import (
get_all_vulkan_devices,
)
@@ -64,17 +71,28 @@ def get_available_devices():
id += 1
if id != 0:
print(f"vulkan devices are available.")
available_devices.extend(vulkan_devices)
metal_devices = get_devices_by_name("metal")
available_devices.extend(metal_devices)
cuda_devices = get_devices_by_name("cuda")
available_devices.extend(cuda_devices)
rocm_devices = get_devices_by_name("rocm")
available_devices.extend(rocm_devices)
cpu_device = get_devices_by_name("cpu-sync")
available_devices.extend(cpu_device)
cpu_device = get_devices_by_name("cpu-task")
available_devices.extend(cpu_device)
hip_devices = get_devices_by_name("hip")
available_devices.extend(hip_devices)
for idx, device_str in enumerate(available_devices):
if "AMD Radeon(TM) Graphics =>" in device_str:
igpu_id_candidates = [
x.split("w/")[-1].split("=>")[0]
for x in available_devices
if "M Graphics" in x
]
for igpu_name in igpu_id_candidates:
if igpu_name:
available_devices[idx] = device_str.replace(
"AMD Radeon(TM) Graphics", igpu_name
)
break
return available_devices
@@ -127,6 +145,57 @@ def set_iree_runtime_flags():
set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)
def parse_device(device_str, target_override=""):
from shark.iree_utils.compile_utils import (
clean_device_info,
get_iree_target_triple,
iree_target_map,
)
rt_driver, device_id = clean_device_info(device_str)
target_backend = iree_target_map(rt_driver)
if device_id:
rt_device = f"{rt_driver}://{device_id}"
else:
rt_device = rt_driver
if target_override:
return target_backend, rt_device, target_override
match target_backend:
case "vulkan-spirv":
triple = get_iree_target_triple(device_str)
return target_backend, rt_device, triple
case "rocm":
triple = get_rocm_target_chip(device_str)
return target_backend, rt_device, triple
case "llvm-cpu":
return "llvm-cpu", "local-task", "x86_64-linux-gnu"
def get_rocm_target_chip(device_str):
    """Map a ROCm device description to its gfx target chip name.

    The first known product marker found as a substring of ``device_str``
    decides the result; markers are tried in the order listed below.

    Raises:
        AssertionError: If no known marker occurs in ``device_str``.
    """
    # TODO: Use a data file to map device_str to target chip.
    known_chips = (
        ("6700", "gfx1031"),
        ("6800", "gfx1030"),
        ("6900", "gfx1030"),
        ("7900", "gfx1100"),
        ("MI300X", "gfx942"),
        ("MI300A", "gfx940"),
        ("MI210", "gfx90a"),
        ("MI250", "gfx90a"),
        ("MI100", "gfx908"),
        ("MI50", "gfx906"),
        ("MI60", "gfx906"),
        ("780M", "gfx1103"),
    )
    chip = next(
        (target for marker, target in known_chips if marker in device_str),
        None,
    )
    if chip is None:
        raise AssertionError(
            f"Device {device_str} not recognized. Please file an issue at https://github.com/nod-ai/SHARK/issues."
        )
    return chip
def get_all_devices(driver_name):
"""
Inputs: driver_name

View File

@@ -2,6 +2,11 @@ import os
import json
import re
import requests
import torch
import safetensors
from shark_turbine.aot.params import (
ParameterArchiveBuilder,
)
from io import BytesIO
from pathlib import Path
from tqdm import tqdm
@@ -15,21 +20,21 @@ from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
)
def get_path_to_diffusers_checkpoint(custom_weights):
def get_path_to_diffusers_checkpoint(custom_weights, precision="fp16"):
path = Path(custom_weights)
diffusers_path = path.parent.absolute()
diffusers_directory_name = os.path.join("diffusers", path.stem)
diffusers_directory_name = os.path.join("diffusers", path.stem + f"_{precision}")
complete_path_to_diffusers = diffusers_path / diffusers_directory_name
complete_path_to_diffusers.mkdir(parents=True, exist_ok=True)
path_to_diffusers = complete_path_to_diffusers.as_posix()
return path_to_diffusers
def preprocessCKPT(custom_weights, is_inpaint=False):
path_to_diffusers = get_path_to_diffusers_checkpoint(custom_weights)
def preprocessCKPT(custom_weights, precision="fp16", is_inpaint=False):
path_to_diffusers = get_path_to_diffusers_checkpoint(custom_weights, precision)
if next(Path(path_to_diffusers).iterdir(), None):
print("Checkpoint already loaded at : ", path_to_diffusers)
return
return path_to_diffusers
else:
print(
"Diffusers' checkpoint will be identified here : ",
@@ -51,8 +56,24 @@ def preprocessCKPT(custom_weights, is_inpaint=False):
from_safetensors=from_safetensors,
num_in_channels=num_in_channels,
)
if precision == "fp16":
pipe.to(dtype=torch.float16)
pipe.save_pretrained(path_to_diffusers)
del pipe
print("Loading complete")
return path_to_diffusers
def save_irpa(weights_path, prepend_str):
    """Repack a safetensors weights file into a ``.irpa`` parameter archive.

    Every tensor loaded from ``weights_path`` is added to a
    ``ParameterArchiveBuilder`` under the name ``prepend_str + key``, and the
    archive is saved next to the input file with the extension replaced by
    ``.irpa``.

    Args:
        weights_path: Path of the weights file to load with
            ``safetensors.torch.load_file``.
        prepend_str: Prefix prepended to every tensor name in the archive.

    Returns:
        The path of the written ``.irpa`` file, as a string.
    """
    weights = safetensors.torch.load_file(weights_path)
    archive = ParameterArchiveBuilder()
    for key, tensor in weights.items():
        archive.add_tensor(prepend_str + key, tensor)
    # Swap only the final extension. A plain str.replace() would also rewrite
    # ".safetensors" inside directory names, and would leave the path
    # unchanged (so the archive overwrites the input) when the suffix differs.
    stem, _ = os.path.splitext(os.fspath(weights_path))
    irpa_file = stem + ".irpa"
    archive.save(irpa_file)
    return irpa_file
def convert_original_vae(vae_checkpoint):

View File

@@ -24,47 +24,47 @@ def get_schedulers(model_id):
model_id,
subfolder="scheduler",
)
schedulers["DDPM"] = DDPMScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
schedulers["KDPM2Discrete"] = KDPM2DiscreteScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
schedulers["LMSDiscrete"] = LMSDiscreteScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
schedulers["DDIM"] = DDIMScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
schedulers["LCMScheduler"] = LCMScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
schedulers["DPMSolverMultistep"] = DPMSolverMultistepScheduler.from_pretrained(
model_id, subfolder="scheduler", algorithm_type="dpmsolver"
)
schedulers["DPMSolverMultistep++"] = DPMSolverMultistepScheduler.from_pretrained(
model_id, subfolder="scheduler", algorithm_type="dpmsolver++"
)
schedulers["DPMSolverMultistepKarras"] = (
DPMSolverMultistepScheduler.from_pretrained(
model_id,
subfolder="scheduler",
use_karras_sigmas=True,
)
)
schedulers["DPMSolverMultistepKarras++"] = (
DPMSolverMultistepScheduler.from_pretrained(
model_id,
subfolder="scheduler",
algorithm_type="dpmsolver++",
use_karras_sigmas=True,
)
)
# schedulers["DDPM"] = DDPMScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
# schedulers["KDPM2Discrete"] = KDPM2DiscreteScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
# schedulers["LMSDiscrete"] = LMSDiscreteScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
# schedulers["DDIM"] = DDIMScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
# schedulers["LCMScheduler"] = LCMScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
# schedulers["DPMSolverMultistep"] = DPMSolverMultistepScheduler.from_pretrained(
# model_id, subfolder="scheduler", algorithm_type="dpmsolver"
# )
# schedulers["DPMSolverMultistep++"] = DPMSolverMultistepScheduler.from_pretrained(
# model_id, subfolder="scheduler", algorithm_type="dpmsolver++"
# )
# schedulers["DPMSolverMultistepKarras"] = (
# DPMSolverMultistepScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# use_karras_sigmas=True,
# )
# )
# schedulers["DPMSolverMultistepKarras++"] = (
# DPMSolverMultistepScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# algorithm_type="dpmsolver++",
# use_karras_sigmas=True,
# )
# )
schedulers["EulerDiscrete"] = EulerDiscreteScheduler.from_pretrained(
model_id,
subfolder="scheduler",
@@ -75,24 +75,24 @@ def get_schedulers(model_id):
subfolder="scheduler",
)
)
schedulers["DEISMultistep"] = DEISMultistepScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
schedulers["DPMSolverSinglestep"] = DPMSolverSinglestepScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
schedulers["KDPM2AncestralDiscrete"] = (
KDPM2AncestralDiscreteScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
)
schedulers["HeunDiscrete"] = HeunDiscreteScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
# schedulers["DEISMultistep"] = DEISMultistepScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
# schedulers["DPMSolverSinglestep"] = DPMSolverSinglestepScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
# schedulers["KDPM2AncestralDiscrete"] = (
# KDPM2AncestralDiscreteScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
# )
# schedulers["HeunDiscrete"] = HeunDiscreteScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
return schedulers
@@ -101,17 +101,18 @@ def export_scheduler_model(model):
scheduler_model_map = {
"PNDM": export_scheduler_model("PNDMScheduler"),
# "DPMSolverSDE": export_scheduler_model("DpmSolverSDEScheduler"),
"EulerDiscrete": export_scheduler_model("EulerDiscreteScheduler"),
"EulerAncestralDiscrete": export_scheduler_model("EulerAncestralDiscreteScheduler"),
"LCM": export_scheduler_model("LCMScheduler"),
"LMSDiscrete": export_scheduler_model("LMSDiscreteScheduler"),
"PNDM": export_scheduler_model("PNDMScheduler"),
"DDPM": export_scheduler_model("DDPMScheduler"),
"DDIM": export_scheduler_model("DDIMScheduler"),
"DPMSolverMultistep": export_scheduler_model("DPMSolverMultistepScheduler"),
"KDPM2Discrete": export_scheduler_model("KDPM2DiscreteScheduler"),
"DEISMultistep": export_scheduler_model("DEISMultistepScheduler"),
"DPMSolverSinglestep": export_scheduler_model("DPMSolverSingleStepScheduler"),
"KDPM2AncestralDiscrete": export_scheduler_model("KDPM2AncestralDiscreteScheduler"),
"HeunDiscrete": export_scheduler_model("HeunDiscreteScheduler"),
# "LCM": export_scheduler_model("LCMScheduler"),
# "LMSDiscrete": export_scheduler_model("LMSDiscreteScheduler"),
# "DDPM": export_scheduler_model("DDPMScheduler"),
# "DDIM": export_scheduler_model("DDIMScheduler"),
# "DPMSolverMultistep": export_scheduler_model("DPMSolverMultistepScheduler"),
# "KDPM2Discrete": export_scheduler_model("KDPM2DiscreteScheduler"),
# "DEISMultistep": export_scheduler_model("DEISMultistepScheduler"),
# "DPMSolverSinglestep": export_scheduler_model("DPMSolverSingleStepScheduler"),
# "KDPM2AncestralDiscrete": export_scheduler_model("KDPM2AncestralDiscreteScheduler"),
# "HeunDiscrete": export_scheduler_model("HeunDiscreteScheduler"),
}

View File

@@ -36,6 +36,7 @@ class LLMAPITest(unittest.TestCase):
device="cpu",
precision="fp32",
quantization="None",
streaming_llm=True,
)
count = 0
label = "Turkishoure Turkish"

View File

@@ -1,28 +0,0 @@
{
"prompt": [
"a photo taken of the front of a super-car drifting on a road near mountains at high speeds with smoke coming off the tires, front angle, front point of view, trees in the mountains of the background, ((sharp focus))"
],
"negative_prompt": [
"watermark, signature, logo, text, lowres, ((monochrome, grayscale)), blurry, ugly, blur, oversaturated, cropped"
],
"sd_init_image": [null],
"height": 512,
"width": 512,
"steps": 50,
"strength": 0.8,
"guidance_scale": 7.5,
"seed": "-1",
"batch_count": 1,
"batch_size": 1,
"scheduler": "EulerDiscrete",
"base_model_id": "stabilityai/stable-diffusion-2-1-base",
"custom_weights": null,
"custom_vae": null,
"precision": "fp16",
"device": "AMD Radeon RX 7900 XTX => vulkan://0",
"ondemand": false,
"repeatable_seeds": false,
"resample_type": "Nearest Neighbor",
"controlnets": {},
"embeddings": {}
}

View File

@@ -76,8 +76,8 @@ def launch_webui(address):
def webui():
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
from apps.shark_studio.web.ui.utils import (
nodicon_loc,
nodlogo_loc,
amdicon_loc,
amdlogo_loc,
)
launch_api = cmd_opts.api
@@ -172,9 +172,9 @@ def webui():
analytics_enabled=False,
title="Shark Studio 2.0 Beta",
) as studio_web:
nod_logo = Image.open(nodlogo_loc)
amd_logo = Image.open(amdlogo_loc)
gr.Image(
value=nod_logo,
value=amd_logo,
show_label=False,
interactive=False,
elem_id="tab_bar_logo",
@@ -209,7 +209,7 @@ def webui():
inbrowser=True,
server_name="0.0.0.0",
server_port=cmd_opts.server_port,
favicon_path=nodicon_loc,
favicon_path=amdicon_loc,
)

View File

@@ -137,7 +137,8 @@ with gr.Blocks(title="Chat") as chat_element:
streaming_llm = gr.Checkbox(
label="Run in streaming mode (requires recompilation)",
value=True,
interactive=True,
interactive=False,
visible=False,
)
prompt_prefix = gr.Checkbox(
label="Add System Prompt",

View File

@@ -367,7 +367,7 @@ footer {
#tab_bar_logo .image-container {
object-fit: scale-down;
position: absolute !important;
top: 14px;
top: 10px;
right: 0px;
height: 36px;
}
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 10 KiB

View File

@@ -10,7 +10,7 @@ from apps.shark_studio.web.utils.file_utils import (
get_generated_imgs_path,
get_generated_imgs_todays_subdir,
)
from apps.shark_studio.web.ui.utils import nodlogo_loc
from apps.shark_studio.web.ui.utils import amdlogo_loc
from apps.shark_studio.web.utils.metadata import displayable_metadata
# -- Functions for file, directory and image info querying
@@ -60,7 +60,7 @@ def output_subdirs() -> list[str]:
# --- Define UI layout for Gradio
with gr.Blocks() as outputgallery_element:
nod_logo = Image.open(nodlogo_loc)
amd_logo = Image.open(amdlogo_loc)
with gr.Row(elem_id="outputgallery_gallery"):
# needed to workaround gradio issue:
@@ -73,7 +73,7 @@ with gr.Blocks() as outputgallery_element:
with gr.Column(scale=6):
logo = gr.Image(
label="Getting subdirectories...",
value=nod_logo,
value=amd_logo,
interactive=False,
visible=True,
show_label=True,

View File

@@ -14,12 +14,12 @@ from apps.shark_studio.web.utils.file_utils import (
get_checkpoints_path,
get_checkpoints,
get_configs_path,
write_default_sd_config,
write_default_sd_configs,
)
from apps.shark_studio.api.sd import (
sd_model_map,
shark_sd_fn_dict_input,
cancel_sd,
unload_sd,
)
from apps.shark_studio.api.controlnet import (
cnet_preview,
@@ -33,7 +33,7 @@ from apps.shark_studio.modules.img_processing import (
)
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
from apps.shark_studio.web.ui.utils import (
nodlogo_loc,
amdlogo_loc,
none_to_str_none,
str_none_to_none,
)
@@ -45,11 +45,10 @@ from apps.shark_studio.modules import logger
import apps.shark_studio.web.utils.globals as global_obj
sd_default_models = [
"CompVis/stable-diffusion-v1-4",
"runwayml/stable-diffusion-v1-5",
"stabilityai/stable-diffusion-2-1-base",
"stabilityai/stable-diffusion-2-1",
"stabilityai/stable-diffusion-xl-1.0",
"stabilityai/stable-diffusion-xl-base-1.0",
"stabilityai/sdxl-turbo",
]
@@ -119,8 +118,9 @@ def pull_sd_configs(
custom_vae,
precision,
device,
target_triple,
ondemand,
repeatable_seeds,
compiled_pipeline,
resample_type,
controlnets,
embeddings,
@@ -177,8 +177,9 @@ def load_sd_cfg(sd_json: dict, load_sd_config: str):
sd_json["custom_vae"],
sd_json["precision"],
sd_json["device"],
sd_json["target_triple"],
sd_json["ondemand"],
sd_json["repeatable_seeds"],
sd_json["compiled_pipeline"],
sd_json["resample_type"],
sd_json["controlnets"],
sd_json["embeddings"],
@@ -231,14 +232,9 @@ def import_original(original_img, width, height):
def base_model_changed(base_model_id):
ckpt_path = Path(
os.path.join(
cmd_opts.model_dir, "checkpoints", os.path.basename(str(base_model_id))
)
)
ckpt_path.mkdir(parents=True, exist_ok=True)
new_choices = get_checkpoints(ckpt_path) + get_checkpoints(model_type="checkpoints")
new_choices = get_checkpoints(
os.path.join("checkpoints", os.path.basename(str(base_model_id)))
) + get_checkpoints(model_type="checkpoints")
return gr.Dropdown(
value=new_choices[0] if len(new_choices) > 0 else "None",
@@ -260,6 +256,11 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
choices=global_obj.get_device_list(),
allow_custom_value=False,
)
target_triple = gr.Textbox(
elem_id="target_triple",
label="Architecture",
value="",
)
with gr.Row():
ondemand = gr.Checkbox(
value=cmd_opts.lowvram,
@@ -282,18 +283,19 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
elem_id="custom_model",
value="stabilityai/stable-diffusion-2-1-base",
choices=sd_default_models,
allow_custom_value=True,
) # base_model_id
with gr.Row():
height = gr.Slider(
384,
768,
1024,
value=cmd_opts.height,
step=8,
label="\U00002195\U0000FE0F Height",
)
width = gr.Slider(
384,
768,
1024,
value=cmd_opts.width,
step=8,
label="\U00002194\U0000FE0F Width",
@@ -604,35 +606,34 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
interactive=True,
visible=True,
)
repeatable_seeds = gr.Checkbox(
cmd_opts.repeatable_seeds,
label="Use Repeatable Seeds for Batches",
compiled_pipeline = gr.Checkbox(
False,
label="Faster txt2img (SDXL only)",
)
with gr.Row():
stable_diffusion = gr.Button("Start")
random_seed = gr.Button("Randomize Seed")
random_seed.click(
lambda: -1,
inputs=[],
outputs=[seed],
unload = gr.Button("Unload Models")
unload.click(
fn=unload_sd,
queue=False,
show_progress=False,
)
stop_batch = gr.Button("Stop")
with gr.Tab(label="Config", id=102) as sd_tab_config:
with gr.Column(elem_classes=["sd-right-panel"]):
Path(get_configs_path()).mkdir(parents=True, exist_ok=True)
default_config_file = os.path.join(
get_configs_path(),
"default_sd_config.json",
)
write_default_sd_config(default_config_file)
sd_json = gr.JSON(
label="SD Config",
elem_classes=["fill"],
value=view_json_file(default_config_file),
render=False,
)
with gr.Row(elem_classes=["fill"]):
Path(get_configs_path()).mkdir(
parents=True, exist_ok=True
)
default_config_file = os.path.join(
get_configs_path(),
"default_sd_config.json",
)
write_default_sd_configs(get_configs_path())
sd_json = gr.JSON(
elem_classes=["fill"],
value=view_json_file(default_config_file),
)
with gr.Row():
with gr.Column(scale=3):
load_sd_config = gr.FileExplorer(
@@ -682,8 +683,9 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
custom_vae,
precision,
device,
target_triple,
ondemand,
repeatable_seeds,
compiled_pipeline,
resample_type,
cnet_config,
embeddings_config,
@@ -695,8 +697,6 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
inputs=[sd_json, sd_config_name],
outputs=[sd_config_name],
)
with gr.Row(elem_classes=["fill"]):
sd_json.render()
save_sd_config.click(
fn=save_sd_cfg,
inputs=[sd_json, sd_config_name],
@@ -708,6 +708,7 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
value=f"{sd_model_info}\n"
f"Images will be saved at "
f"{get_generated_imgs_path()}",
lines=2,
elem_id="std_output",
show_label=True,
label="Log",
@@ -717,8 +718,6 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
logger.read_sd_logs, None, std_output, every=1
)
sd_status = gr.Textbox(visible=False)
with gr.Tab(label="Automation", id=104) as sd_tab_automation:
pass
pull_kwargs = dict(
fn=pull_sd_configs,
@@ -740,8 +739,9 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
custom_vae,
precision,
device,
target_triple,
ondemand,
repeatable_seeds,
compiled_pipeline,
resample_type,
cnet_config,
embeddings_config,

View File

@@ -10,8 +10,8 @@ def resource_path(relative_path):
return os.path.join(base_path, relative_path)
nodlogo_loc = resource_path("logos/nod-logo.png")
nodicon_loc = resource_path("logos/nod-icon.png")
amdlogo_loc = resource_path("logos/amd-logo.jpg")
amdicon_loc = resource_path("logos/amd-icon.jpg")
class HSLHue(IntEnum):

View File

@@ -0,0 +1,95 @@
# Baseline Stable Diffusion preset, kept as raw JSON text: the
# stabilityai/stable-diffusion-2-1-base model at 512x512, 50 steps, with the
# EulerDiscrete scheduler. "device" and "target_triple" are left empty and
# "compiled_pipeline" is off.
default_sd_config = r"""{
"prompt": [
"a photo taken of the front of a super-car drifting on a road near mountains at high speeds with smoke coming off the tires, front angle, front point of view, trees in the mountains of the background, ((sharp focus))"
],
"negative_prompt": [
"watermark, signature, logo, text, lowres, ((monochrome, grayscale)), blurry, ugly, blur, oversaturated, cropped"
],
"sd_init_image": [null],
"height": 512,
"width": 512,
"steps": 50,
"strength": 0.8,
"guidance_scale": 7.5,
"seed": "-1",
"batch_count": 1,
"batch_size": 1,
"scheduler": "EulerDiscrete",
"base_model_id": "stabilityai/stable-diffusion-2-1-base",
"custom_weights": null,
"custom_vae": null,
"precision": "fp16",
"device": "",
"target_triple": "",
"ondemand": false,
"compiled_pipeline": false,
"resample_type": "Nearest Neighbor",
"controlnets": {},
"embeddings": {}
}"""
# SDXL preset, kept as raw JSON text: the
# stabilityai/stable-diffusion-xl-base-1.0 model at 1024x1024, 30 steps, with
# the EulerDiscrete scheduler and "compiled_pipeline" enabled. "device" and
# "target_triple" are left empty.
sdxl_30steps = r"""{
"prompt": [
"a cat under the snow with blue eyes, covered by snow, cinematic style, medium shot, professional photo, animal"
],
"negative_prompt": [
"watermark, signature, logo, text, lowres, ((monochrome, grayscale)), blurry, ugly, blur, oversaturated, cropped"
],
"sd_init_image": [null],
"height": 1024,
"width": 1024,
"steps": 30,
"strength": 0.8,
"guidance_scale": 7.5,
"seed": "-1",
"batch_count": 1,
"batch_size": 1,
"scheduler": "EulerDiscrete",
"base_model_id": "stabilityai/stable-diffusion-xl-base-1.0",
"custom_weights": null,
"custom_vae": null,
"precision": "fp16",
"device": "",
"target_triple": "",
"ondemand": false,
"compiled_pipeline": true,
"resample_type": "Nearest Neighbor",
"controlnets": {},
"embeddings": {}
}"""
# SDXL-Turbo preset, kept as raw JSON text: the stabilityai/sdxl-turbo model
# at 512x512 with only 2 steps, guidance_scale 0, an empty negative prompt,
# the EulerAncestralDiscrete scheduler and "compiled_pipeline" enabled.
# "device" and "target_triple" are left empty.
sdxl_turbo = r"""{
"prompt": [
"A cat wearing a hat that says 'TURBO' on it. The cat is sitting on a skateboard."
],
"negative_prompt": [
""
],
"sd_init_image": [null],
"height": 512,
"width": 512,
"steps": 2,
"strength": 0.8,
"guidance_scale": 0,
"seed": "-1",
"batch_count": 1,
"batch_size": 1,
"scheduler": "EulerAncestralDiscrete",
"base_model_id": "stabilityai/sdxl-turbo",
"custom_weights": null,
"custom_vae": null,
"precision": "fp16",
"device": "",
"target_triple": "",
"ondemand": false,
"compiled_pipeline": true,
"resample_type": "Nearest Neighbor",
"controlnets": {},
"embeddings": {}
}"""
# Registry of the bundled presets: each key is the file name a preset is
# stored under, each value is the raw JSON text for that file.
default_sd_configs = {
"default_sd_config.json": default_sd_config,
"sdxl-30steps.json": sdxl_30steps,
"sdxl-turbo.json": sdxl_turbo,
}

View File

@@ -11,43 +11,18 @@ checkpoints_filetypes = (
"*.safetensors",
)
default_sd_config = r"""{
"prompt": [
"a photo taken of the front of a super-car drifting on a road near mountains at high speeds with smoke coming off the tires, front angle, front point of view, trees in the mountains of the background, ((sharp focus))"
],
"negative_prompt": [
"watermark, signature, logo, text, lowres, ((monochrome, grayscale)), blurry, ugly, blur, oversaturated, cropped"
],
"sd_init_image": [null],
"height": 512,
"width": 512,
"steps": 50,
"strength": 0.8,
"guidance_scale": 7.5,
"seed": "-1",
"batch_count": 1,
"batch_size": 1,
"scheduler": "EulerDiscrete",
"base_model_id": "stabilityai/stable-diffusion-2-1-base",
"custom_weights": null,
"custom_vae": null,
"precision": "fp16",
"device": "AMD Radeon RX 7900 XTX => vulkan://0",
"ondemand": false,
"repeatable_seeds": false,
"resample_type": "Nearest Neighbor",
"controlnets": {},
"embeddings": {}
}"""
from apps.shark_studio.web.utils.default_configs import default_sd_configs
def write_default_sd_config(path):
with open(path, "w") as f:
f.write(default_sd_config)
def write_default_sd_configs(path):
    """Write every bundled default SD config file into the directory ``path``.

    Each entry of ``default_sd_configs`` maps a file name to the JSON text
    for that file. A file that already exists under the same name is
    overwritten.

    Args:
        path: Directory that receives the config files. It is not created
            here, so it must already exist.
    """
    for filename, config_text in default_sd_configs.items():
        config_fpath = os.path.join(path, filename)
        # Pin the encoding so the written JSON does not depend on the
        # platform's locale default.
        with open(config_fpath, "w", encoding="utf-8") as f:
            f.write(config_text)
def safe_name(name):
return name.replace("/", "_").replace("-", "_")
return name.split("/")[-1].replace("-", "_")
def get_path_stem(path):

View File

@@ -3,9 +3,8 @@ from pathlib import Path
from apps.shark_studio.web.utils.file_utils import (
get_checkpoint_pathfile,
)
from apps.shark_studio.api.sd import (
sd_model_map,
)
from apps.shark_studio.api.sd import EMPTY_SD_MAP as sd_model_map
from apps.shark_studio.modules.schedulers import (
scheduler_model_map,
)

View File

@@ -17,7 +17,7 @@ def clear_tmp_mlir():
and filename.endswith(".mlir")
]
for filename in mlir_files:
os.remove(shark_tmp + filename)
os.remove(os.path.join(shark_tmp, filename))
print(f"Clearing .mlir temporary files took {time() - cleanup_start:.4f} seconds.")

View File

@@ -10,7 +10,7 @@ from utils import get_datasets
shark_root = Path(__file__).parent.parent
demo_css = shark_root.joinpath("web/demo.css").resolve()
nodlogo_loc = shark_root.joinpath("web/models/stable_diffusion/logos/nod-logo.png")
nodlogo_loc = shark_root.joinpath("web/models/stable_diffusion/logos/amd-logo.jpg")
with gr.Blocks(title="Dataset Annotation Tool", css=demo_css) as shark_web:

View File

@@ -1,13 +1,16 @@
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
-f https://openxla.github.io/iree/pip-release-links.html
-f https://download.pytorch.org/whl/nightly/cpu
-f https://iree.dev/pip-release-links.html
--pre
setuptools
wheel
torch==2.3.0
shark-turbine @ git+https://github.com/iree-org/iree-turbine.git@main
turbine-models @ git+https://github.com/nod-ai/SHARK-Turbine.git@main#subdirectory=models
turbine-models @ git+https://github.com/nod-ai/SHARK-Turbine.git@deprecated-constraints#subdirectory=models
diffusers @ git+https://github.com/nod-ai/diffusers@0.29.0.dev0-shark
brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b
# SHARK Runner
tqdm
@@ -17,8 +20,6 @@ google-cloud-storage
# Testing
pytest
pytest-xdist
pytest-forked
Pillow
parameterized
@@ -26,8 +27,10 @@ parameterized
#accelerate is now required for diffusers import from ckpt.
accelerate
scipy
transformers==4.37.1
torchsde # Required for Stable Diffusion SDE schedulers.
ftfy
gradio==4.19.2
gradio==4.29.0
altair
omegaconf
# 0.3.2 doesn't have binaries for arm64

View File

@@ -88,8 +88,8 @@ else {python -m venv .\shark.venv\}
.\shark.venv\Scripts\activate
python -m pip install --upgrade pip
pip install wheel
pip install -r requirements.txt
# remove this when windows DLL issues are fixed from LLVM changes
pip install --force-reinstall https://github.com/openxla/iree/releases/download/candidate-20240326.843/iree_compiler-20240326.843-cp311-cp311-win_amd64.whl https://github.com/openxla/iree/releases/download/candidate-20240326.843/iree_runtime-20240326.843-cp311-cp311-win_amd64.whl
pip install --pre -r requirements.txt
pip install --force-reinstall https://github.com/nod-ai/SRT/releases/download/candidate-20240528.279/iree_compiler-20240528.279-cp311-cp311-win_amd64.whl https://github.com/nod-ai/SRT/releases/download/candidate-20240528.279/iree_runtime-20240528.279-cp311-cp311-win_amd64.whl
pip install -e .
Write-Host "Source your venv with ./shark.venv/Scripts/activate"

View File

@@ -84,21 +84,7 @@ else
PYTORCH_URL=https://download.pytorch.org/whl/nightly/cpu/
fi
$PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f ${PYTORCH_URL}
if [[ $(uname -s) = 'Linux' && ! -z "${IMPORTER}" ]]; then
T_VER=$($PYTHON -m pip show torch | grep Version)
T_VER_MIN=${T_VER:14:12}
TV_VER=$($PYTHON -m pip show torchvision | grep Version)
TV_VER_MAJ=${TV_VER:9:6}
$PYTHON -m pip uninstall -y torchvision
$PYTHON -m pip install torchvision==${TV_VER_MAJ}${T_VER_MIN} --no-deps -f https://download.pytorch.org/whl/nightly/cpu/torchvision/
if [ $? -eq 0 ];then
echo "Successfully Installed torch + cu118."
else
echo "Could not install torch + cu118." >&2
fi
fi
$PYTHON -m pip install --no-warn-conflicts -e . -f ${RUNTIME} -f ${PYTORCH_URL}
if [[ -z "${NO_BREVITAS}" ]]; then
$PYTHON -m pip install git+https://github.com/Xilinx/brevitas.git@dev

View File

@@ -76,6 +76,7 @@ _IREE_DEVICE_MAP = {
"vulkan": "vulkan",
"metal": "metal",
"rocm": "rocm",
"hip": "hip",
"intel-gpu": "level_zero",
}
@@ -94,6 +95,7 @@ _IREE_TARGET_MAP = {
"vulkan": "vulkan-spirv",
"metal": "metal",
"rocm": "rocm",
"hip": "rocm",
"intel-gpu": "opencl-spirv",
}

View File

@@ -62,13 +62,16 @@ def get_iree_device_args(device, extra_args=[]):
from shark.iree_utils.gpu_utils import get_iree_rocm_args
return get_iree_rocm_args(device_num=device_num, extra_args=extra_args)
if device == "hip":
from shark.iree_utils.gpu_utils import get_iree_rocm_args
return get_iree_rocm_args(device_num=device_num, extra_args=extra_args, hip_driver=True)
return []
def get_iree_target_triple(device):
args = get_iree_device_args(device)
for flag in args:
if "triple" in flag.split("-"):
triple = flag.split("=")
if "triple" in flag:
triple = flag.split("=")[-1]
return triple
return ""
@@ -89,9 +92,9 @@ def clean_device_info(raw_device):
if len(device_id) <= 2:
device_id = int(device_id)
if device not in ["rocm", "vulkan"]:
if device not in ["hip", "rocm", "vulkan"]:
device_id = None
if device in ["rocm", "vulkan"] and device_id == None:
if device in ["hip", "rocm", "vulkan"] and device_id == None:
device_id = 0
return device, device_id

View File

@@ -52,7 +52,7 @@ def check_rocm_device_arch_in_args(extra_args):
return None
def get_rocm_device_arch(device_num=0, extra_args=[]):
def get_rocm_device_arch(device_num=0, extra_args=[], hip_driver=False):
# ROCM Device Arch selection:
# 1 : User given device arch using `--iree-rocm-target-chip` flag
# 2 : Device arch from `iree-run-module --dump_devices=rocm` for device on index <device_num>
@@ -68,15 +68,23 @@ def get_rocm_device_arch(device_num=0, extra_args=[]):
arch_in_device_dump = None
# get rocm arch from iree dump devices
def get_devices_info_from_dump(dump):
def get_devices_info_from_dump(dump, driver):
from os import linesep
dump_clean = list(
filter(
lambda s: "--device=rocm" in s or "gpu-arch-name:" in s,
dump.split(linesep),
if driver == "hip":
dump_clean = list(
filter(
lambda s: "AMD" in s,
dump.split(linesep),
)
)
else:
dump_clean = list(
filter(
lambda s: f"--device={driver}" in s or "gpu-arch-name:" in s,
dump.split(linesep),
)
)
)
arch_pairs = [
(
dump_clean[i].split("=")[1].strip(),
@@ -87,16 +95,17 @@ def get_rocm_device_arch(device_num=0, extra_args=[]):
return arch_pairs
dump_device_info = None
driver = "hip" if hip_driver else "rocm"
try:
dump_device_info = run_cmd(
"iree-run-module --dump_devices=rocm", raise_err=True
"iree-run-module --dump_devices=" + driver, raise_err=True
)
except Exception as e:
print("could not execute `iree-run-module --dump_devices=rocm`")
print("could not execute `iree-run-module --dump_devices=" + driver + "`")
if dump_device_info is not None:
device_num = 0 if device_num is None else device_num
device_arch_pairs = get_devices_info_from_dump(dump_device_info[0])
device_arch_pairs = get_devices_info_from_dump(dump_device_info[0], driver)
if len(device_arch_pairs) > device_num: # can find arch in the list
arch_in_device_dump = device_arch_pairs[device_num][1]
@@ -107,24 +116,22 @@ def get_rocm_device_arch(device_num=0, extra_args=[]):
default_rocm_arch = "gfx1100"
print(
"Did not find ROCm architecture from `--iree-rocm-target-chip` flag"
"\n or from `iree-run-module --dump_devices=rocm` command."
"\n or from `iree-run-module --dump_devices` command."
f"\nUsing {default_rocm_arch} as ROCm arch for compilation."
)
return default_rocm_arch
# Get the default gpu args given the architecture.
def get_iree_rocm_args(device_num=0, extra_args=[]):
def get_iree_rocm_args(device_num=0, extra_args=[], hip_driver=False):
ireert.flags.FUNCTION_INPUT_VALIDATION = False
rocm_flags = ["--iree-rocm-link-bc=true"]
rocm_flags = []
if check_rocm_device_arch_in_args(extra_args) is None:
rocm_arch = get_rocm_device_arch(device_num, extra_args)
rocm_arch = get_rocm_device_arch(device_num, extra_args, hip_driver=hip_driver)
rocm_flags.append(f"--iree-rocm-target-chip={rocm_arch}")
return rocm_flags
# Some constants taken from cuda.h
CUDA_SUCCESS = 0
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16