Compare commits

..

1 Commits

Author SHA1 Message Date
Prashant Kumar
326827198b Update vulkan_utils.py 2022-10-11 20:53:41 +05:30
161 changed files with 1693 additions and 12836 deletions

View File

@@ -1,37 +0,0 @@
# See: https://github.com/llvm/torch-mlir/issues/1374
#
# Regenerates the static, pip-install-able releases page and pushes it to the
# `github-pages` branch. Only runs when triggered manually.
name: Publish releases page

on:
  workflow_dispatch:

jobs:
  scrape_and_publish_releases:
    name: Scrape and publish releases
    # Don't run this in everyone's forks.
    if: ${{ github.repository == 'nod-ai/SHARK' }}
    runs-on: ubuntu-latest
    steps:
      - name: Checking out repository
        uses: actions/checkout@v2
        with:
          token: ${{ secrets.NODAI_INVOCATION_TOKEN }}

      # The page is written outside the work tree so that it survives the
      # branch switch below.
      - name: Run scrape releases script
        shell: bash
        run: python ./build_tools/scrape_releases.py nod-ai SHARK > /tmp/index.html

      - run: git fetch --all
      - run: git switch github-pages
      - run: git config --global user.email "none@none.com"
      - run: git config --global user.name "nod-ai"
      - run: mv /tmp/index.html package-index/index.html
      - run: git add package-index/index.html

      # Only try to make a commit if the file has changed.
      - run: git diff --cached --exit-code || git commit -m "Update releases."

      - name: GitHub Push
        uses: ad-m/github-push-action@v0.6.0
        with:
          branch: github-pages
          github_token: ${{ secrets.NODAI_INVOCATION_TOKEN }}

View File

@@ -9,84 +9,7 @@ on:
workflow_dispatch:
jobs:
windows-build:
  # Builds the Windows Stable Diffusion executables with PyInstaller, signs
  # them, and attaches them to a freshly created GitHub release.
  # NOTE(review): `7950X` is presumably a self-hosted runner label - confirm.
  runs-on: 7950X
  strategy:
    fail-fast: false
    matrix:
      python-version: ["3.10"]
  steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    # Exports three variables to later steps via GITHUB_ENV:
    #   package_version  -> <yyyymmdd>.<run_number>
    #   package_version_ -> <yyyymmdd>_<run_number> (used in the .exe file names)
    #   tag_name         -> same value as package_version
    - name: Compute version
      shell: powershell
      run: |
        $package_version = $(Get-Date -UFormat "%Y%m%d")+"."+${{ github.run_number }}
        $package_version_ = $(Get-Date -UFormat "%Y%m%d")+"_"+${{ github.run_number }}
        $tag_name=$package_version
        echo "package_version=$package_version" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
        echo "package_version_=$package_version_" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
        echo "tag_name=$tag_name" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
    # The release is created as a draft; it only becomes public in the final
    # "Publish Release" step, after the assets have been uploaded.
    - name: Create Release
      id: create_release
      uses: actions/create-release@v1
      env:
        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
      with:
        tag_name: ${{ env.tag_name }}
        release_name: nod.ai SHARK ${{ env.tag_name }}
        body: |
          Automatic snapshot release of nod.ai SHARK.
        draft: true
        prerelease: false
    - name: Build Package
      shell: powershell
      env:
        # Hand the signing secret to the script through the environment rather
        # than templating it into the script text: a `$`, backtick or `"` in
        # the secret would otherwise be re-interpreted by PowerShell inside
        # the double-quoted string.
        CI_CERT: ${{ secrets.CI_CERT }}
      run: |
        ./setup_venv.ps1
        pyinstaller web/shark_sd.spec
        mv ./dist/shark_sd.exe ./dist/shark_sd_${{ env.package_version_ }}.exe
        signtool sign /f C:\shark_2023.cer /csp "eToken Base Cryptographic Provider" /k "$Env:CI_CERT" ./dist/shark_sd_${{ env.package_version_ }}.exe
        pyinstaller .\shark\examples\shark_inference\stable_diffusion\shark_sd_cli.spec
        mv ./dist/shark_sd_cli.exe ./dist/shark_sd_cli_${{ env.package_version_ }}.exe
        signtool sign /f C:\shark_2023.cer /csp "eToken Base Cryptographic Provider" /k "$Env:CI_CERT" ./dist/shark_sd_cli_${{ env.package_version_ }}.exe
    # GHA windows VM OOMs so disable for now
    #- name: Build and validate the SHARK Runtime package
    #  shell: powershell
    #  run: |
    #    $env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
    #    pip wheel -v -w dist . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
    - uses: actions/upload-artifact@v2
      with:
        path: dist/*
    - name: Upload Release Assets
      id: upload-release-assets
      uses: dwenegar/upload-release-assets@v1
      env:
        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
      with:
        release_id: ${{ steps.create_release.outputs.id }}
        assets_path: ./dist/*
    - name: Publish Release
      id: publish_release
      uses: eregon/publish-release@v1
      env:
        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
      with:
        release_id: ${{ steps.create_release.outputs.id }}
linux-build:
build:
runs-on: a100
strategy:
@@ -109,13 +32,36 @@ jobs:
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
- name: Compute version
run: |
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
tag_name="${package_version}"
echo "package_version=${package_version}" >> $GITHUB_ENV
echo "tag_name=${tag_name}" >> $GITHUB_ENV
- name: Create Release
id: create_release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:
tag_name: ${{ env.tag_name }}
release_name: nod.ai SHARK ${{ env.tag_name }}
body: |
Automatic snapshot release of nod.ai SHARK.
draft: true
prerelease: false
- name: Find Torch-MLIR Release
run: |
TM_HTML_URL="$(python3 -c "import urllib.request, json, sys; u=json.loads(urllib.request.urlopen('https://api.github.com/repos/llvm/torch-mlir/releases/latest').read().decode()).get('html_url', False); print(u) if u else sys.exit(1);")"
TM_RELEASE_DIR=${TM_HTML_URL/"tag"/"expanded_assets"}
echo "TM_RELEASE_DIR=${TM_RELEASE_DIR}" >> $GITHUB_ENV
- name: Install dependencies
run: |
echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
echo "Torch-MLIR Release DIR is ${{ env.TM_RELEASE_DIR }}"
python -m pip install --upgrade pip
python -m pip install flake8 pytest toml
if [ -f requirements.txt ]; then pip install -r requirements.txt -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html; fi
if [ -f requirements.txt ]; then pip install -r requirements.txt -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/nod-ai/SHARK-Runtime/releases; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
@@ -124,26 +70,25 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude shark.venv,lit.cfg.py
- name: Build and validate the IREE package
if: ${{ matrix.backend == 'IREE' }}
continue-on-error: true
run: |
cd $GITHUB_WORKSPACE
USE_IREE=1 VENV_DIR=iree.venv ./setup_venv.sh
source iree.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://iree-org.github.io/iree/pip-release-links.html
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/iree-org/iree/releases
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models
/bin/bash "$GITHUB_WORKSPACE/build_tools/populate_sharktank_ci.sh"
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" -k "not metal" |
pytest tank/test_models.py |
tail -n 1 |
tee -a pytest_results.txt
if !(grep -Fxq " failed" pytest_results.txt)
then
export SHA=$(git log -1 --format='%h')
gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/${DATE}_$SHA
gsutil -m cp -r gs://shark_tank/${DATE}_$SHA/* gs://shark_tank/latest/
gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/$SHA
gsutil -m cp -r gs://shark_tank/$SHA/* gs://shark_tank/latest/
fi
rm -rf ./wheelhouse/nodai*
@@ -155,10 +100,32 @@ jobs:
source shark.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/nod-ai/SHARK-Runtime/releases
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models
pytest --ci --ci_sha=${SHORT_SHA} -k "not metal" |
pytest tank/test_models.py |
tail -n 1 |
tee -a pytest_results.txt
# Uploads the built wheel(s) to the draft release and then publishes it.
# Runs after the `build` job has finished (`needs: build`).
# NOTE(review): this job declares no `strategy.matrix`, so `matrix.backend`
# in the `if:` conditions below is not defined here - confirm whether these
# steps can ever run.
# NOTE(review): the `steps` context only covers steps of the current job; the
# `create_release` step appears to live in the `build` job, so its id would
# have to be exposed as a job output and read via `needs.build.outputs` - confirm.
publish:
runs-on: a100
needs: build
steps:
- name: Upload Release Assets
if: ${{ matrix.backend == 'SHARK' }}
id: upload-release-assets
uses: dwenegar/upload-release-assets@v1
env:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:
release_id: ${{ steps.create_release.outputs.id }}
# NOTE(review): `${GITHUB_WORKSPACE}` is shell syntax and is passed to the
# action as a literal string unless the action expands it itself - verify.
assets_path: ${GITHUB_WORKSPACE}/wheelhouse/nodai_*.whl
- name: Publish Release
if: ${{ matrix.backend == 'SHARK' }}
id: publish_release
uses: eregon/publish-release@v1
env:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:
release_id: ${{ steps.create_release.outputs.id }}

View File

@@ -6,24 +6,10 @@ name: Validate Models on Shark Runtime
on:
push:
branches: [ main ]
paths-ignore:
- '**.md'
- 'shark/examples/**'
pull_request:
branches: [ main ]
paths-ignore:
- '**.md'
- 'shark/examples/**'
workflow_dispatch:
# Ensure that only a single job or workflow using the same
# concurrency group will run at a time. This would cancel
# any in-progress jobs in the same github workflow and github
# ref (e.g. refs/heads/main or refs/pull/<pr_number>/merge).
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build-validate:
strategy:
@@ -46,6 +32,8 @@ jobs:
suite: cuda
- os: MacStudio
suite: cpu
- os: MacStudio
suite: vulkan
- os: icelake
suite: vulkan
- os: icelake
@@ -100,9 +88,9 @@ jobs:
if: matrix.suite == 'cpu'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="${GITHUB_WORKSPACE}/shark_tmp/shark_cache" -k cpu
pytest --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k cpu
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv
@@ -112,26 +100,14 @@ jobs:
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="${GITHUB_WORKSPACE}/shark_tmp/shark_cache" -k cuda
pytest --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k cuda
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
sh build_tools/stable_diff_main_test.sh
- name: Validate Vulkan Models (MacOS)
if: matrix.suite == 'vulkan' && matrix.os == 'MacStudio'
- name: Validate Vulkan Models
if: matrix.suite == 'vulkan'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
export DYLD_LIBRARY_PATH=/usr/local/lib/
echo $PATH
pip list | grep -E "torch|iree"
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" -k vulkan --update_tank
- name: Validate Vulkan Models (a100)
if: matrix.suite == 'vulkan' && matrix.os != 'MacStudio'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
source shark.venv/bin/activate
pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="${GITHUB_WORKSPACE}/shark_tmp/shark_cache" -k vulkan
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k vulkan

8
.gitignore vendored
View File

@@ -31,6 +31,7 @@ MANIFEST
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
@@ -162,14 +163,7 @@ cython_debug/
# Shark related artefacts
*venv/
shark_tmp/
*.vmfb
.use-iree
tank/dict_configs.py
# ORT related artefacts
cache_models/
onnx_models/
#web logging
web/logs/
web/stored_results/stable_diffusion/

408
README.md
View File

@@ -5,119 +5,25 @@ High Performance Machine Learning and Data Analytics for CPUs, GPUs, Accelerator
[![Nightly Release](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml)
[![Validate torch-models on Shark Runtime](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml)
## Communication Channels
## Installation (Windows, Linux and macOS)
## Check out the code
```shell
git clone https://github.com/nod-ai/SHARK.git
cd SHARK
```
## Setup your Python VirtualEnvironment and Dependencies
### Windows 10/11 Users
* Install the latest Python 3.10.x version from [here](https://www.python.org/downloads/windows/)
* Install Git for Windows from [here](https://git-scm.com/download/win)
#### Allow the install script to run in Powershell
```powershell
set-executionpolicy remotesigned
```
#### Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...)
```powershell
./setup_venv.ps1 #You can re-run this script to get the latest version
```
### Linux / macOS Users
```shell
./setup_venv.sh
source shark.venv/bin/activate
```
* [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real time discussions with the SHARK team and other users
* [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bugs etc
### Run Stable Diffusion on your device - WebUI
#### Windows 10/11 Users
```powershell
(shark.venv) PS C:\Users\nod\SHARK> cd web
(shark.venv) PS C:\Users\nod\SHARK\web> python index.py
```
#### Linux Users
```shell
(shark.venv) > cd web
(shark.venv) > python index.py
```
#### Access Stable Diffusion on http://localhost:8080/?__theme=dark
<img width="1607" alt="webui" src="https://user-images.githubusercontent.com/74956/204939260-b8308bc2-8dc4-47f6-9ac0-f60b66edab99.png">
### Run Stable Diffusion on your device - Commandline
#### Install your hardware drivers
* [AMD RDNA Users] Download the latest driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mril-iree)
* [macOS Users] Download and install the latest Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home)
* [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)
Other users: please ensure you have the latest drivers from your vendor and the Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home). If you are using Vulkan, check that `vulkaninfo` works in a terminal window.
#### Windows 10/11 Users
```powershell
(shark.venv) PS C:\g\shark> python .\shark\examples\shark_inference\stable_diffusion\main.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
```
#### Linux / macOS Users
```shell
python3.10 shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
```
You can replace `vulkan` with `cpu` to run on your CPU or with `cuda` to run on CUDA devices. If you have multiple Vulkan devices, you can address them with `--device=vulkan://1`, etc.
The output on a 7900XTX would look like:
```shell
Stats for run 0:
Average step time: 47.19188690185547ms/it
Clip Inference time (ms) = 109.531
VAE Inference time (ms): 78.590
Total image generation time: 2.5788655281066895sec
```
Here are some samples generated:
![tajmahal, snow, sunflowers, oil on canvas_0](https://user-images.githubusercontent.com/74956/204934186-141f7e43-6eb2-4e89-a99c-4704d20444b3.jpg)
![a photo of a crab playing a trumpet](https://user-images.githubusercontent.com/74956/204933258-252e7240-8548-45f7-8253-97647d38313d.jpg)
For more options to the Stable Diffusion model read [this](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md)
Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware.
## Installation
<details>
<summary>Binary Installation</summary>
<summary>Installation (Linux and macOS)</summary>
### Setup a new pip Virtual Environment
This step sets up a new VirtualEnv for Python
```shell
python --version #Check you have 3.10 on Linux, macOS or Windows Powershell
python --version #Check you have 3.7->3.10 on Linux or 3.10 on macOS
python -m venv shark_venv
source shark_venv/bin/activate # Use shark_venv/Scripts/activate on Windows
source shark_venv/bin/activate
# If you are using conda create and activate a new conda env
@@ -132,14 +38,9 @@ python -m pip install --upgrade pip
This step pip installs SHARK and related packages on Linux Python 3.7, 3.8, 3.9, 3.10 and macOS Python 3.10
```shell
pip install nodai-shark -f https://nod-ai.github.io/SHARK/package-index/ -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
pip install nodai-shark -f https://github.com/nod-ai/SHARK/releases -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/shark-runtime/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
```
### Run shark tank model tests.
```shell
pytest tank/test_models.py
```
See tank/README.md for a more detailed walkthrough of our pytest suite and CLI.
If you are on an Intel macOS machine you need this [workaround](https://github.com/nod-ai/SHARK/issues/102) for an upstream issue.
### Download and run Resnet50 sample
@@ -160,27 +61,29 @@ python ./minilm_jit.py --device="cpu" #use cuda or vulkan or metal
</details>
<details>
<summary>Development, Testing and Benchmarks</summary>
<summary>Source Installation</summary>
If you want to use Python 3.10 together with the TF import tools, you can set environment variables like this:
Set `USE_IREE=1` to use upstream IREE
```
# PYTHON=python3.10 VENV_DIR=0617_venv IMPORTER=1 ./setup_venv.sh
```
## Check out the code
### Run any of the hundreds of SHARK tank models via the test framework
```shell
python -m shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
# Or a pytest
pytest tank/test_models.py -k "MiniLM"
git clone https://github.com/nod-ai/SHARK.git
```
## Setup your Python VirtualEnvironment and Dependencies
```shell
# Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...).
./setup_venv.sh
source shark.venv/bin/activate
```
For example if you want to use Python3.10 and upstream IREE with TF Import tools you can use the environment variables like:
```
# PYTHON=python3.10 VENV_DIR=0617_venv IMPORTER=1 USE_IREE=1 ./setup_venv.sh
```
If you are a *Torch-mlir developer or an IREE developer* and want to test local changes you can uninstall
the provided packages with `pip uninstall torch-mlir` and / or `pip uninstall iree-compiler iree-runtime` and build locally
with Python bindings and set your PYTHONPATH as mentioned [here](https://github.com/iree-org/iree/tree/main/docs/api_docs/python#install-iree-binaries)
with Python bindings and set your PYTHONPATH as mentioned [here](https://google.github.io/iree/bindings/python/)
for IREE and [here](https://github.com/llvm/torch-mlir/blob/main/development.md#setup-python-environment-to-export-the-built-python-packages)
for Torch-MLIR.
@@ -199,39 +102,82 @@ for Torch-MLIR.
```
Now SHARK will use your locally built Torch-MLIR repo.
## Benchmarking Dispatches
To produce benchmarks of individual dispatches, you can add `--dispatch_benchmarks=All --dispatch_benchmarks_dir=<output_dir>` to your command line argument.
If you only want to compile specific dispatches, you can specify them with a space-separated string instead of `"All"`, e.g. `--dispatch_benchmarks="0 1 2 10"`.
If you instead want to incorporate this into a Python script, you can pass the `dispatch_benchmarks` and `dispatch_benchmarks_dir` arguments when initializing `SharkInference`, and the benchmarks will be generated at compile time. For example:
### Run a demo script
```shell
python -m shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
# Or a pytest
pytest tank/test_models.py -k "MiniLM"
```
shark_module = SharkInference(
mlir_model,
func_name,
device=args.device,
mlir_dialect="tm_tensor",
dispatch_benchmarks="all",
dispatch_benchmarks_dir="results"
)
```
Output will include:
- An ordered list, `ordered-dispatches.txt`, of all the dispatches with their runtimes
- Inside the specified directory, there will be a directory for each dispatch (there will be mlir files for all dispatches, but only compiled binaries and benchmark data for the specified dispatches)
- An .mlir file containing the dispatch benchmark
- A compiled .vmfb file containing the dispatch benchmark
- An .mlir file containing just the hal executable
- A compiled .vmfb file of the hal executable
- A .txt file containing benchmark output
See tank/README.md for instructions on how to run model tests and benchmarks from the SHARK tank.
</details>
<details>
<summary>Testing and Benchmarks</summary>
### Run all model tests on CPU/GPU/VULKAN/Metal
```shell
pytest tank/test_models.py
# If on Linux for multithreading on CPU (faster results):
pytest tank/test_models.py -n auto
```
### Running specific tests
```shell
# Search for test cases by including a keyword that matches all or part of the test case's name;
pytest tank/test_models.py -k "keyword"
# Test cases are named uniformly by format test_module_<model_name_underscores_only>_<torch/tf>_<static/dynamic>_<device>.
# Example: Test all models on nvidia gpu:
pytest tank/test_models.py -k "cuda"
# Example: Test all tensorflow resnet models on Vulkan backend:
pytest tank/test_models.py -k "resnet and tf and vulkan"
# Exclude a test case:
pytest tank/test_models.py -k "not ..."
### Run benchmarks on SHARK tank pytests and generate bench_results.csv with results.
(the following requires source installation with `IMPORTER=1 ./setup_venv.sh`)
```shell
pytest --benchmark tank/test_models.py
# Just do static GPU benchmarks for PyTorch tests:
pytest --benchmark tank/test_models.py -k "pytorch and static and cuda"
```
### Benchmark Resnet50, MiniLM on CPU
(requires source installation with `IMPORTER=1 ./setup_venv.sh`)
```shell
# We suggest running the following commands as root before running benchmarks on CPU:
cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | awk -F, '{print $2}' | sort -n | uniq | ( while read X ; do echo $X ; echo 0 > /sys/devices/system/cpu/cpu$X/online ; done )
echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
# Benchmark canonical Resnet50 on CPU via pytest
pytest --benchmark tank/test_models.py -k "resnet50 and tf_static_cpu"
# Benchmark canonical MiniLM on CPU via pytest
pytest --benchmark tank/test_models.py -k "MiniLM and cpu"
# Benchmark MiniLM on CPU via transformer-benchmarks:
git clone --recursive https://github.com/nod-ai/transformer-benchmarks.git
cd transformer-benchmarks
./perf-ci.sh -n
# Check detail.csv for MLIR/IREE results.
```
</details>
<details>
<summary>API Reference</summary>
@@ -282,26 +228,160 @@ result = shark_module.forward((arg0, arg1))
```
</details>
## Supported and Validated Models
SHARK is maintained to support the latest innovations in ML Models:
<details>
<summary>PyTorch Models</summary>
| TF HuggingFace Models | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------|----------|-------------|
| BERT | :green_heart: | :green_heart: | :green_heart: |
| DistilBERT | :green_heart: | :green_heart: | :green_heart: |
| GPT2 | :green_heart: | :green_heart: | :green_heart: |
| BLOOM | :green_heart: | :green_heart: | :green_heart: |
| Stable Diffusion | :green_heart: | :green_heart: | :green_heart: |
| Vision Transformer | :green_heart: | :green_heart: | :green_heart: |
| ResNet50 | :green_heart: | :green_heart: | :green_heart: |
### Huggingface PyTorch Models
For a complete list of the models supported in SHARK, please refer to [tank/README.md](https://github.com/nod-ai/SHARK/blob/main/tank/README.md).
| Hugging Face Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
| Albert | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
| BigBird | :green_heart: (AOT) | | | |
| DistilBERT | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
| GPT2 | :broken_heart: (AOT) | | | |
| MobileBert | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
## Communication Channels
### Torchvision Models
* [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real time discussions with the SHARK team and other users
* [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bugs etc
| TORCHVISION Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|--------------------|----------------------|----------|----------|-------------|
| AlexNet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| DenseNet121 | :green_heart: (Script) | | | |
| MNasNet1_0 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| MobileNetV2 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| MobileNetV3 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Unet | :broken_heart: (Script) | | | |
| Resnet18 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnet50 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnet101 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnext50_32x4d | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| ShuffleNet_v2 | :broken_heart: (Script) | | | |
| SqueezeNet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| EfficientNet | :green_heart: (Script) | | | |
| Regnet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnest | :broken_heart: (Script) | | | |
| Vision Transformer | :green_heart: (Script) | | | |
| VGG 16 | :green_heart: (Script) | :green_heart: | :green_heart: | |
| Wide Resnet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| RAFT | :broken_heart: (JIT) | | | |
For more information refer to [MODEL TRACKING SHEET](https://docs.google.com/spreadsheets/d/15PcjKeHZIrB5LfDyuw7DGEEE8XnQEX2aX8lm8qbxV8A/edit#gid=0)
### PyTorch Training Models
| Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :broken_heart: | :broken_heart: | | |
| FullyConnected | :green_heart: | :green_heart: | | |
</details>
<details>
<summary>JAX Models</summary>
### JAX Models
| Models | JAX-MHLO lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| DALL-E | :broken_heart: | :broken_heart: | | |
| FullyConnected | :green_heart: | :green_heart: | | |
</details>
<details>
<summary>TFLite Models</summary>
### TFLite Models
| Models | TOSA/LinAlg | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :broken_heart: | :broken_heart: | | |
| FullyConnected | :green_heart: | :green_heart: | | |
| albert | :green_heart: | :green_heart: | | |
| asr_conformer | :green_heart: | :green_heart: | | |
| bird_classifier | :green_heart: | :green_heart: | | |
| cartoon_gan | :green_heart: | :green_heart: | | |
| craft_text | :green_heart: | :green_heart: | | |
| deeplab_v3 | :green_heart: | :green_heart: | | |
| densenet | :green_heart: | :green_heart: | | |
| east_text_detector | :green_heart: | :green_heart: | | |
| efficientnet_lite0_int8 | :green_heart: | :green_heart: | | |
| efficientnet | :green_heart: | :green_heart: | | |
| gpt2 | :green_heart: | :green_heart: | | |
| image_stylization | :green_heart: | :green_heart: | | |
| inception_v4 | :green_heart: | :green_heart: | | |
| inception_v4_uint8 | :green_heart: | :green_heart: | | |
| lightning_fp16 | :green_heart: | :green_heart: | | |
| lightning_i8 | :green_heart: | :green_heart: | | |
| lightning | :green_heart: | :green_heart: | | |
| magenta | :green_heart: | :green_heart: | | |
| midas | :green_heart: | :green_heart: | | |
| mirnet | :green_heart: | :green_heart: | | |
| mnasnet | :green_heart: | :green_heart: | | |
| mobilebert_edgetpu_s_float | :green_heart: | :green_heart: | | |
| mobilebert_edgetpu_s_quant | :green_heart: | :green_heart: | | |
| mobilebert | :green_heart: | :green_heart: | | |
| mobilebert_tf2_float | :green_heart: | :green_heart: | | |
| mobilebert_tf2_quant | :green_heart: | :green_heart: | | |
| mobilenet_ssd_quant | :green_heart: | :green_heart: | | |
| mobilenet_v1 | :green_heart: | :green_heart: | | |
| mobilenet_v1_uint8 | :green_heart: | :green_heart: | | |
| mobilenet_v2_int8 | :green_heart: | :green_heart: | | |
| mobilenet_v2 | :green_heart: | :green_heart: | | |
| mobilenet_v2_uint8 | :green_heart: | :green_heart: | | |
| mobilenet_v3-large | :green_heart: | :green_heart: | | |
| mobilenet_v3-large_uint8 | :green_heart: | :green_heart: | | |
| mobilenet_v35-int8 | :green_heart: | :green_heart: | | |
| nasnet | :green_heart: | :green_heart: | | |
| person_detect | :green_heart: | :green_heart: | | |
| posenet | :green_heart: | :green_heart: | | |
| resnet_50_int8 | :green_heart: | :green_heart: | | |
| rosetta | :green_heart: | :green_heart: | | |
| spice | :green_heart: | :green_heart: | | |
| squeezenet | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v1 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v1_uint8 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2_fpnlite | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2_fpnlite_uint8 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2_int8 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2 | :green_heart: | :green_heart: | | |
| ssd_spaghettinet_large | :green_heart: | :green_heart: | | |
| ssd_spaghettinet_large_uint8 | :green_heart: | :green_heart: | | |
| visual_wake_words_i8 | :green_heart: | :green_heart: | | |
</details>
<details>
<summary>TF Models</summary>
### Tensorflow Models (Inference)
| Hugging Face Models | tf-mhlo lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| albert-base-v2 | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| DistilBERT | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| CamemBert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| ConvBert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| Deberta | | | | |
| electra | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| funnel | | | | |
| layoutlm | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| longformer | | | | |
| mobile-bert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| remembert | | | | |
| tapas | | | | |
| flaubert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| roberta | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| xlm-roberta | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| mpnet | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
</details>
## Related Projects

View File

@@ -42,7 +42,7 @@ class TFHuggingFaceLanguage(tf.Module):
input_ids=x, attention_mask=y, token_type_ids=z, training=False
)
@tf.function(input_signature=tf_bert_input, jit_compile=True)
@tf.function(input_signature=tf_bert_input)
def forward(self, input_ids, attention_mask, token_type_ids):
return self.m.predict(input_ids, attention_mask, token_type_ids)

View File

@@ -1,41 +0,0 @@
import argparse
import torchvision
import numpy as np
import requests
import shutil
import os
import subprocess
# Command-line interface for the image comparison check.
parser = argparse.ArgumentParser()
# -n/--newfile: path of the freshly generated image to compare.
parser.add_argument("-n", "--newfile")
# -g/--golden_url: URL of the reference ("golden") image; defaults to the
# copy stored in the shark_tank bucket.
parser.add_argument(
"-g",
"--golden_url",
default="https://storage.googleapis.com/shark_tank/testdata/cyberpunk_fores_42_0_230119_021148.png",
)
def get_image(url, local_filename):
    """Download the image at `url` to `local_filename` and return its pixels.

    Args:
        url: HTTP(S) location of the image.
        local_filename: path the downloaded bytes are written to.

    Returns:
        The decoded image as a numpy array, as produced by
        `torchvision.io.read_image(...).numpy()`.

    Raises:
        RuntimeError: if the server does not answer with HTTP 200.
    """
    res = requests.get(url, stream=True)
    # Fail here with a clear message. Previously a non-200 reply skipped the
    # write and read_image then ran on a missing (or stale) local file.
    if res.status_code != 200:
        raise RuntimeError(
            f"failed to download {url}: HTTP {res.status_code}"
        )
    # The raw stream is not content-decoded by default; ask urllib3 to undo
    # any gzip/deflate encoding so the file on disk is the actual image.
    res.raw.decode_content = True
    with open(local_filename, "wb") as f:
        shutil.copyfileobj(res.raw, f)
    return torchvision.io.read_image(local_filename).numpy()
if __name__ == "__main__":
    args = parser.parse_args()

    # Bring both images from 0-255 pixel values into the 0.0-1.0 range.
    candidate = torchvision.io.read_image(args.newfile).numpy() / 255.0
    golden_path = os.path.join(os.getcwd(), "golden.png")
    reference = get_image(args.golden_url, golden_path) / 255.0

    # Mean absolute per-element difference between the two images.
    mean_abs_diff = np.mean(np.abs(candidate - reference))

    if mean_abs_diff < 0.2:
        print("SUCCESS")
    else:
        # Keep the mismatching image around for inspection before failing.
        upload_cmd = [
            "gsutil",
            "cp",
            args.newfile,
            "gs://shark_tank/testdata/builder/",
        ]
        subprocess.run(upload_cmd)
        raise SystemExit("new and golden not close")

View File

@@ -1,5 +1,5 @@
#!/bin/bash
IMPORTER=1 BENCHMARK=1 ./setup_venv.sh
IMPORTER=1 ./setup_venv.sh
source $GITHUB_WORKSPACE/shark.venv/bin/activate
python generate_sharktank.py --upload=False --ci_tank_dir=True

View File

@@ -1,37 +0,0 @@
"""Scrapes the github releases API to generate a static pip-install-able releases page.
See https://github.com/llvm/torch-mlir/issues/1374
"""
import argparse
import json
import requests
# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("owner", type=str)
parser.add_argument("repo", type=str)
args = parser.parse_args()

# Get releases
response = requests.get(
    f"https://api.github.com/repos/{args.owner}/{args.repo}/releases"
)
# An error reply (rate limit, unknown repo, ...) is a JSON object rather than
# a list of releases. Stop here instead of failing later with an obscure
# TypeError while iterating over it.
response.raise_for_status()
body = json.loads(response.content)
# NOTE(review): only a single response page is read; if the endpoint
# paginates, older releases are missing from the index - confirm.

# Parse releases: one (file name, download URL) pair per release asset.
releases = []
for row in body:
    for asset in row["assets"]:
        releases.append((asset["name"], asset["browser_download_url"]))

# Output HTML: a flat list of links, printed to stdout.
html = """<!DOCTYPE html>
<html>
<body>
"""
for name, url in releases:
    html += f" <a href='{url}'>{name}</a><br />\n"
html += """ </body>
</html>"""
print(html)

View File

@@ -1,6 +0,0 @@
# Generate a Stable Diffusion image on Vulkan and compare it with the golden
# reference image. The script's exit status is that of the comparison.

# Start from an empty output directory so only fresh images are compared.
rm -rf ./test_images
mkdir test_images

# Stop right away if generation fails; previously the failure was ignored and
# the comparison ran against a directory without images.
python shark/examples/shark_inference/stable_diffusion/main.py --device=vulkan --output_dir=./test_images --no-load_vmfb --no-use_tuned || exit $?

python build_tools/image_comparison.py -n ./test_images/*.png
exit $?

View File

@@ -36,12 +36,6 @@ def pytest_addoption(parser):
default="False",
help="Enables uploading of reproduction artifacts upon test case failure during iree-compile or validation. Must be passed with --ci_sha option ",
)
parser.addoption(
"--update_tank",
action="store_true",
default="False",
help="Update local shark tank with latest artifacts.",
)
parser.addoption(
"--ci_sha",
action="store",

3
cpp/.gitignore vendored
View File

@@ -1,3 +0,0 @@
*.mlir
*.vmfb
*.ini

View File

@@ -54,29 +54,5 @@ python -m pip install tensorflow
*Run the vulkan_gui*
```bash
./build/vulkan_gui/iree-samples-resnet-vulkan-gui
```
## Other models
A tool for benchmarking other models is built and can be invoked with a command like the following
```bash
./build/vulkan_gui/iree-vulkan-gui --module-file=path/to/.vmfb --function_input=...
```
See `./build/vulkan_gui/iree-vulkan-gui --help` for an explanation of the `--function_input` format. For example, the Stable Diffusion UNet can be tested with the following commands:
```bash
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/stable_diff_tf.mlir
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 stable_diff_tf.mlir -o stable_diff_tf.vmfb
./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=2x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32
```
VAE and Autoencoder are also available
```bash
# VAE
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/vae_tf/vae.mlir
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 vae.mlir -o vae.vmfb
./build/vulkan_gui/iree-vulkan-gui --module-file=vae.vmfb --function_input=1x4x64x64xf32
# CLIP Autoencoder
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/clip_tf/clip_autoencoder.mlir
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 clip_autoencoder.mlir -o clip_autoencoder.vmfb
./build/vulkan_gui/iree-vulkan-gui --module-file=clip_autoencoder.vmfb --function_input=1x77xi32 --function_input=1x77xi32
./build/vulkan_gui/iree-samples-vulkan-gui
```

View File

@@ -1,6 +1,7 @@
import numpy as np
import tensorflow as tf
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
def load_and_preprocess_image(fname: str):

View File

@@ -40,77 +40,45 @@ set(IMGUI_DIR ${CMAKE_BINARY_DIR}/_deps/imgui-src)
message("Looking for Imgui in ${IMGUI_DIR}")
include_directories(${IMGUI_DIR} ${IMGUI_DIR}/backends ..)
function(iree_vulkan_sample)
cmake_parse_arguments(
_RULE
""
"NAME"
"SRCS"
${ARGN}
)
# Define the sample executable.
set(_NAME "${_RULE_NAME}")
set(SRCS "${_RULE_SRCS}")
add_executable(${_NAME} "")
target_sources(${_NAME}
PRIVATE
${SRCS}
"${IMGUI_DIR}/backends/imgui_impl_sdl.cpp"
"${IMGUI_DIR}/backends/imgui_impl_vulkan.cpp"
"${IMGUI_DIR}/imgui.cpp"
"${IMGUI_DIR}/imgui_draw.cpp"
"${IMGUI_DIR}/imgui_demo.cpp"
"${IMGUI_DIR}/imgui_tables.cpp"
"${IMGUI_DIR}/imgui_widgets.cpp"
)
set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "${_NAME}")
target_include_directories(${_NAME} PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
)
target_link_libraries(${_NAME}
SDL2::SDL2
Vulkan::Vulkan
iree_runtime_runtime
iree_base_internal_main
iree_hal_drivers_vulkan_registration_registration
iree_modules_hal_hal
iree_vm_vm
iree_vm_bytecode_module
iree_vm_cc
iree_tooling_vm_util_cc
iree_tooling_context_util
)
if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(_GUI_LINKOPTS "-SUBSYSTEM:CONSOLE")
else()
set(_GUI_LINKOPTS "")
endif()
target_link_options(${_NAME}
PRIVATE
${_GUI_LINKOPTS}
)
endfunction()
iree_vulkan_sample(
NAME
iree-samples-resnet-vulkan-gui
SRCS
vulkan_resnet_inference_gui.cc
# Define the sample executable.
set(_NAME "iree-samples-vulkan-gui")
add_executable(${_NAME} "")
target_sources(${_NAME}
PRIVATE
vulkan_inference_gui.cc
"${IMGUI_DIR}/backends/imgui_impl_sdl.cpp"
"${IMGUI_DIR}/backends/imgui_impl_vulkan.cpp"
"${IMGUI_DIR}/imgui.cpp"
"${IMGUI_DIR}/imgui_draw.cpp"
"${IMGUI_DIR}/imgui_demo.cpp"
"${IMGUI_DIR}/imgui_tables.cpp"
"${IMGUI_DIR}/imgui_widgets.cpp"
)
set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "iree-samples-vulkan-gui")
target_include_directories(${_NAME} PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
)
target_link_libraries(${_NAME}
SDL2::SDL2
Vulkan::Vulkan
iree_runtime_runtime
iree_base_internal_main
iree_hal_drivers_vulkan_registration_registration
iree_modules_hal_hal
iree_vm_vm
iree_vm_bytecode_module
iree_vm_cc
)
iree_vulkan_sample(
NAME
iree-vulkan-gui
if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(_GUI_LINKOPTS "-SUBSYSTEM:CONSOLE")
else()
set(_GUI_LINKOPTS "")
endif()
SRCS
vulkan_inference_gui.cc
target_link_options(${_NAME}
PRIVATE
${_GUI_LINKOPTS}
)
message(STATUS "Configured vulkan_gui sample successfully")

View File

@@ -18,12 +18,6 @@
#include <set>
#include <vector>
#include <fstream>
#include <array>
#include <cstdio>
#include <cstdlib>
#include <iterator>
#include <string>
#include <utility>
#include "iree/hal/drivers/vulkan/api.h"
@@ -36,15 +30,6 @@
#include "iree/vm/bytecode_module.h"
#include "iree/vm/ref_cc.h"
// iree-run-module
#include "iree/base/internal/flags.h"
#include "iree/base/status_cc.h"
#include "iree/base/tracing.h"
#include "iree/modules/hal/types.h"
#include "iree/tooling/comparison.h"
#include "iree/tooling/context_util.h"
#include "iree/tooling/vm_util_cc.h"
// Other dependencies (helpers, etc.)
#include "iree/base/internal/main.h"
@@ -53,49 +38,6 @@
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
IREE_FLAG(string, entry_function, "",
"Name of a function contained in the module specified by module_file "
"to run.");
// TODO(benvanik): move --function_input= flag into a util.
static iree_status_t parse_function_io(iree_string_view_t flag_name,
void* storage,
iree_string_view_t value) {
auto* list = (std::vector<std::string>*)storage;
list->push_back(std::string(value.data, value.size));
return iree_ok_status();
}
static void print_function_io(iree_string_view_t flag_name, void* storage,
FILE* file) {
auto* list = (std::vector<std::string>*)storage;
if (list->empty()) {
fprintf(file, "# --%.*s=\n", (int)flag_name.size, flag_name.data);
} else {
for (size_t i = 0; i < list->size(); ++i) {
fprintf(file, "--%.*s=\"%s\"\n", (int)flag_name.size, flag_name.data,
list->at(i).c_str());
}
}
}
static std::vector<std::string> FLAG_function_inputs;
IREE_FLAG_CALLBACK(
parse_function_io, print_function_io, &FLAG_function_inputs, function_input,
"An input (a) value or (b) buffer of the format:\n"
" (a) scalar value\n"
" value\n"
" e.g.: --function_input=\"3.14\"\n"
" (b) buffer:\n"
" [shape]xtype=[value]\n"
" e.g.: --function_input=\"2x2xi32=1 2 3 4\"\n"
"Optionally, brackets may be used to separate the element values:\n"
" 2x2xi32=[[1 2][3 4]]\n"
"Raw binary files can be read to provide buffer contents:\n"
" 2x2xi32=@some/file.bin\n"
"numpy npy files (from numpy.save) can be read to provide 1+ values:\n"
" @some.npy\n"
"Each occurrence of the flag indicates an input in the order they were\n"
"specified on the command line.");
typedef struct iree_file_toc_t {
const char* name; // the file's original name
char* data; // beginning of the file
@@ -145,6 +87,225 @@ static void check_vk_result(VkResult err) {
abort();
}
// Helper function to find Vulkan memory type bits. See ImGui_ImplVulkan_MemoryType() in imgui_impl_vulkan.cpp
//
// Returns the index of the first memory type of g_PhysicalDevice whose bit is
// set in |type_filter| and whose property flags contain all of |properties|,
// or 0xFFFFFFFF when no memory type matches.
uint32_t findMemoryType(uint32_t type_filter, VkMemoryPropertyFlags properties)
{
  VkPhysicalDeviceMemoryProperties mem_properties;
  vkGetPhysicalDeviceMemoryProperties(g_PhysicalDevice, &mem_properties);

  for (uint32_t i = 0; i < mem_properties.memoryTypeCount; i++)
  {
    // Unsigned literal: with a signed `1`, i == 31 would shift into the sign bit.
    const bool allowed = (type_filter & (1u << i)) != 0;
    const bool has_properties =
        (mem_properties.memoryTypes[i].propertyFlags & properties) == properties;
    if (allowed && has_properties)
    {
      return i;
    }
  }

  return 0xFFFFFFFF; // Unable to find memoryType
}
// Helper function to load an image with common settings and return a VkDescriptorSet as a sort of Vulkan pointer
//
// Decodes |filename| with stb_image as 4-channel RGBA, uploads the pixels
// through a temporary staging buffer into a new VkImage and registers it with
// ImGui. On success returns true and writes the descriptor set to |img_ds| and
// the image dimensions to |image_width| / |image_height|. Returns false if the
// file cannot be decoded. Vulkan errors are passed to check_vk_result().
//
// NOTE: the texture image, its memory, the image view and the sampler are not
// handed back to the caller and are therefore never destroyed by this helper.
bool LoadTextureFromFile(const char* filename, VkDescriptorSet* img_ds, int* image_width, int* image_height)
{
  // Specifying 4 channels forces stb to load the image in RGBA which is an easy format for Vulkan
  int image_channels = 4;
  unsigned char* image_data = stbi_load(filename, image_width, image_height, NULL, image_channels);

  if (image_data == NULL)
  {
    return false;
  }

  // Calculate allocation size (in number of bytes). Done in size_t so large
  // images cannot overflow an int multiplication.
  size_t image_size = static_cast<size_t>(*image_width) *
                      static_cast<size_t>(*image_height) *
                      static_cast<size_t>(image_channels);

  VkResult err;

  // Create the Vulkan image.
  VkImage texture_image;
  VkDeviceMemory texture_image_memory;
  {
    VkImageCreateInfo info = {};
    info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
    info.imageType = VK_IMAGE_TYPE_2D;
    info.format = VK_FORMAT_R8G8B8A8_UNORM;
    info.extent.width = *image_width;
    info.extent.height = *image_height;
    info.extent.depth = 1;
    info.mipLevels = 1;
    info.arrayLayers = 1;
    info.samples = VK_SAMPLE_COUNT_1_BIT;
    info.tiling = VK_IMAGE_TILING_OPTIMAL;
    info.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT;
    info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
    info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
    err = vkCreateImage(g_Device, &info, g_Allocator, &texture_image);
    check_vk_result(err);

    VkMemoryRequirements req;
    vkGetImageMemoryRequirements(g_Device, texture_image, &req);
    VkMemoryAllocateInfo alloc_info = {};
    alloc_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    alloc_info.allocationSize = req.size;
    alloc_info.memoryTypeIndex = findMemoryType(req.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
    err = vkAllocateMemory(g_Device, &alloc_info, g_Allocator, &texture_image_memory);
    check_vk_result(err);
    err = vkBindImageMemory(g_Device, texture_image, texture_image_memory, 0);
    check_vk_result(err);
  }

  // Create the Image View
  VkImageView image_view;
  {
    VkImageViewCreateInfo info = {};
    info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
    info.image = texture_image;
    info.viewType = VK_IMAGE_VIEW_TYPE_2D;
    info.format = VK_FORMAT_R8G8B8A8_UNORM;
    info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
    info.subresourceRange.levelCount = 1;
    info.subresourceRange.layerCount = 1;
    err = vkCreateImageView(g_Device, &info, g_Allocator, &image_view);
    check_vk_result(err);
  }

  // Create Sampler
  VkSampler sampler;
  {
    VkSamplerCreateInfo sampler_info{};
    sampler_info.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
    sampler_info.magFilter = VK_FILTER_LINEAR;
    sampler_info.minFilter = VK_FILTER_LINEAR;
    sampler_info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR;
    // REPEAT: coordinates outside the image wrap around (tile the texture).
    sampler_info.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT;
    sampler_info.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT;
    sampler_info.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT;
    sampler_info.minLod = -1000;
    sampler_info.maxLod = 1000;
    sampler_info.maxAnisotropy = 1.0f;
    err = vkCreateSampler(g_Device, &sampler_info, g_Allocator, &sampler);
    check_vk_result(err);
  }

  // Create Descriptor Set using ImGUI's implementation
  *img_ds = ImGui_ImplVulkan_AddTexture(sampler, image_view, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);

  // Create Upload Buffer (host-visible staging memory for the pixel data)
  VkBuffer upload_buffer;
  VkDeviceMemory upload_buffer_memory;
  {
    VkBufferCreateInfo buffer_info = {};
    buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
    buffer_info.size = image_size;
    buffer_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
    buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
    err = vkCreateBuffer(g_Device, &buffer_info, g_Allocator, &upload_buffer);
    check_vk_result(err);

    VkMemoryRequirements req;
    vkGetBufferMemoryRequirements(g_Device, upload_buffer, &req);
    VkMemoryAllocateInfo alloc_info = {};
    alloc_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    alloc_info.allocationSize = req.size;
    alloc_info.memoryTypeIndex = findMemoryType(req.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
    err = vkAllocateMemory(g_Device, &alloc_info, g_Allocator, &upload_buffer_memory);
    check_vk_result(err);
    err = vkBindBufferMemory(g_Device, upload_buffer, upload_buffer_memory, 0);
    check_vk_result(err);
  }

  // Upload to Buffer:
  {
    void* map = NULL;
    err = vkMapMemory(g_Device, upload_buffer_memory, 0, image_size, 0, &map);
    check_vk_result(err);
    memcpy(map, image_data, image_size);
    // The memory was only requested as HOST_VISIBLE (not necessarily
    // coherent), so flush the write explicitly before unmapping.
    VkMappedMemoryRange range[1] = {};
    range[0].sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
    range[0].memory = upload_buffer_memory;
    range[0].size = image_size;
    err = vkFlushMappedMemoryRanges(g_Device, 1, range);
    check_vk_result(err);
    vkUnmapMemory(g_Device, upload_buffer_memory);
  }

  // Release image memory using stb
  stbi_image_free(image_data);

  // Create a command buffer that will perform following steps when hit in the command queue.
  // TODO: this works in the example, but may need input if this is an acceptable way to access the pool/create the command buffer.
  VkCommandPool command_pool = g_MainWindowData.Frames[g_MainWindowData.FrameIndex].CommandPool;
  VkCommandBuffer command_buffer;
  {
    VkCommandBufferAllocateInfo alloc_info{};
    alloc_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
    alloc_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
    alloc_info.commandPool = command_pool;
    alloc_info.commandBufferCount = 1;

    err = vkAllocateCommandBuffers(g_Device, &alloc_info, &command_buffer);
    check_vk_result(err);

    VkCommandBufferBeginInfo begin_info = {};
    begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
    begin_info.flags |= VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
    err = vkBeginCommandBuffer(command_buffer, &begin_info);
    check_vk_result(err);
  }

  // Copy to Image
  {
    // UNDEFINED -> TRANSFER_DST so the image can receive the copy.
    VkImageMemoryBarrier copy_barrier[1] = {};
    copy_barrier[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
    copy_barrier[0].dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
    copy_barrier[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
    copy_barrier[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
    copy_barrier[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    copy_barrier[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    copy_barrier[0].image = texture_image;
    copy_barrier[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
    copy_barrier[0].subresourceRange.levelCount = 1;
    copy_barrier[0].subresourceRange.layerCount = 1;
    vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, NULL, 0, NULL, 1, copy_barrier);

    VkBufferImageCopy region = {};
    region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
    region.imageSubresource.layerCount = 1;
    region.imageExtent.width = *image_width;
    region.imageExtent.height = *image_height;
    region.imageExtent.depth = 1;
    vkCmdCopyBufferToImage(command_buffer, upload_buffer, texture_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region);

    // TRANSFER_DST -> SHADER_READ_ONLY so the fragment shader can sample it.
    VkImageMemoryBarrier use_barrier[1] = {};
    use_barrier[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
    use_barrier[0].srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
    use_barrier[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
    use_barrier[0].oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
    use_barrier[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
    use_barrier[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    use_barrier[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    use_barrier[0].image = texture_image;
    use_barrier[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
    use_barrier[0].subresourceRange.levelCount = 1;
    use_barrier[0].subresourceRange.layerCount = 1;
    vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, 0, NULL, 0, NULL, 1, use_barrier);
  }

  // End command buffer, submit it and wait for the upload to finish.
  {
    VkSubmitInfo end_info = {};
    end_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    end_info.commandBufferCount = 1;
    end_info.pCommandBuffers = &command_buffer;
    err = vkEndCommandBuffer(command_buffer);
    check_vk_result(err);
    err = vkQueueSubmit(g_Queue, 1, &end_info, VK_NULL_HANDLE);
    check_vk_result(err);
    err = vkDeviceWaitIdle(g_Device);
    check_vk_result(err);
  }

  // The device is idle, so the one-shot command buffer and the staging
  // resources are no longer in use. Release them here; nothing else holds
  // these handles, so they would otherwise leak on every call.
  vkFreeCommandBuffers(g_Device, command_pool, 1, &command_buffer);
  vkDestroyBuffer(g_Device, upload_buffer, g_Allocator);
  vkFreeMemory(g_Device, upload_buffer_memory, g_Allocator);

  return true;
}
// Returns the names of the Vulkan layers used for the given IREE
// |extensibility_set| and |features|.
std::vector<const char*> GetIreeLayers(
@@ -562,16 +723,7 @@ namespace iree {
extern "C" int iree_main(int argc, char** argv) {
iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv);
if (argc > 1) {
// Avoid iree-run-module spinning endlessly on stdin if the user uses single
// dashes for flags.
printf(
"[ERROR] unexpected positional argument (expected none)."
" Did you use pass a flag with a single dash ('-')?"
" Use '--' instead.\n");
return 1;
}
fprintf(stdout, "starting yo\n");
// --------------------------------------------------------------------------
// Create a window.
@@ -683,6 +835,8 @@ extern "C" int iree_main(int argc, char** argv) {
// Demo state.
bool show_iree_window = true;
// --------------------------------------------------------------------------
// --------------------------------------------------------------------------
// Setup IREE.
@@ -746,44 +900,69 @@ extern "C" int iree_main(int argc, char** argv) {
// Load bytecode module
//iree_file_toc_t module_file_toc;
//const char network_model[] = "resnet50_tf.vmfb";
//fprintf(stdout, "Loading: %s\n", network_model);
//if (load_file(network_model, &module_file_toc.data, &module_file_toc.size) == false)
//{
// abort();
// return 1;
//}
//fprintf(stdout, "module size: %zu\n", module_file_toc.size);
iree_file_toc_t module_file_toc;
const char network_model[] = "resnet50_tf.vmfb";
fprintf(stdout, "Loading: %s\n", network_model);
if (load_file(network_model, &module_file_toc.data, &module_file_toc.size) == false)
{
abort();
return 1;
}
fprintf(stdout, "module size: %zu\n", module_file_toc.size);
static float input_res50[224*224*3];
static float output_res50[1000];
char filename[] = "dog_imagenet.jpg";
fprintf(stdout, "loading: %s\n", filename);
int x,y,n;
//unsigned char *image_raw = stbi_load(filename, &x, &y, &n, 3);
stbi_load(filename, &x, &y, &n, 3);
fprintf(stdout, "res: %i x %i x %i\n", x, y, n);
/* Preprocessing needs to go here. For now use a buffer preprocessed in python.
//convert image into floating point format
for(int i=0;i<224*224*3;i++)
{
input_res50[i]= ((float)image_raw[i])/255.0f;
}*/
std::ifstream fin("dog.bin", std::ifstream::in | std::ifstream::binary);
fin.read((char*)input_res50, 224*224*3*sizeof(float));
// load image again so imgui can display it
int my_image_width = 0;
int my_image_height = 0;
VkDescriptorSet my_image_texture = 0;
bool ret = LoadTextureFromFile(filename, &my_image_texture, &my_image_width, &my_image_height);
fprintf(stdout, "creating vulkan image: %s\n", ret ?"OK":"FAIL");
IM_ASSERT(ret);
iree_vm_module_t* bytecode_module = nullptr;
iree_status_t module_status = iree_tooling_load_module_from_flags(
iree_instance, iree_allocator_system(), &bytecode_module);
if (!iree_status_is_ok(module_status))
return -1;
//IREE_CHECK_OK(iree_vm_bytecode_module_create(
// iree_instance,
// iree_const_byte_span_t{
// reinterpret_cast<const uint8_t*>(module_file_toc.data),
// module_file_toc.size},
// iree_allocator_null(), iree_allocator_system(), &bytecode_module));
//// Query for details about what is in the loaded module.
//iree_vm_module_signature_t bytecode_module_signature =
// iree_vm_module_signature(bytecode_module);
//fprintf(stdout, "Module loaded, have <%" PRIhsz "> exported functions:\n",
// bytecode_module_signature.export_function_count);
//for (int i = 0; i < bytecode_module_signature.export_function_count; ++i) {
// iree_vm_function_t function;
// IREE_CHECK_OK(iree_vm_module_lookup_function_by_ordinal(
// bytecode_module, IREE_VM_FUNCTION_LINKAGE_EXPORT, i, &function));
// auto function_name = iree_vm_function_name(&function);
// auto function_signature = iree_vm_function_signature(&function);
IREE_CHECK_OK(iree_vm_bytecode_module_create(
iree_instance,
iree_const_byte_span_t{
reinterpret_cast<const uint8_t*>(module_file_toc.data),
module_file_toc.size},
iree_allocator_null(), iree_allocator_system(), &bytecode_module));
// Query for details about what is in the loaded module.
iree_vm_module_signature_t bytecode_module_signature =
iree_vm_module_signature(bytecode_module);
fprintf(stdout, "Module loaded, have <%" PRIhsz "> exported functions:\n",
bytecode_module_signature.export_function_count);
for (int i = 0; i < bytecode_module_signature.export_function_count; ++i) {
iree_vm_function_t function;
IREE_CHECK_OK(iree_vm_module_lookup_function_by_ordinal(
bytecode_module, IREE_VM_FUNCTION_LINKAGE_EXPORT, i, &function));
auto function_name = iree_vm_function_name(&function);
auto function_signature = iree_vm_function_signature(&function);
// fprintf(stdout, " %d: '%.*s' with calling convention '%.*s'\n", i,
// (int)function_name.size, function_name.data,
// (int)function_signature.calling_convention.size,
// function_signature.calling_convention.data);
//}
fprintf(stdout, " %d: '%.*s' with calling convention '%.*s'\n", i,
(int)function_name.size, function_name.data,
(int)function_signature.calling_convention.size,
function_signature.calling_convention.data);
}
// Allocate a context that will hold the module state across invocations.
iree_vm_context_t* iree_context = nullptr;
@@ -809,42 +988,33 @@ extern "C" int iree_main(int argc, char** argv) {
// Write inputs into mappable buffers.
iree_hal_allocator_t* allocator =
iree_hal_device_allocator(iree_vk_device);
//iree_hal_memory_type_t input_memory_type =
// static_cast<iree_hal_memory_type_t>(
// IREE_HAL_MEMORY_TYPE_HOST_LOCAL |
// IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE);
//iree_hal_buffer_usage_t input_buffer_usage =
// static_cast<iree_hal_buffer_usage_t>(IREE_HAL_BUFFER_USAGE_DEFAULT);
//iree_hal_buffer_params_t buffer_params;
//buffer_params.type = input_memory_type;
//buffer_params.usage = input_buffer_usage;
//buffer_params.access = IREE_HAL_MEMORY_ACCESS_READ | IREE_HAL_MEMORY_ACCESS_WRITE;
iree_hal_memory_type_t input_memory_type =
static_cast<iree_hal_memory_type_t>(
IREE_HAL_MEMORY_TYPE_HOST_LOCAL |
IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE);
iree_hal_buffer_usage_t input_buffer_usage =
static_cast<iree_hal_buffer_usage_t>(IREE_HAL_BUFFER_USAGE_DEFAULT);
iree_hal_buffer_params_t buffer_params;
buffer_params.type = input_memory_type;
buffer_params.usage = input_buffer_usage;
buffer_params.access = IREE_HAL_MEMORY_ACCESS_READ | IREE_HAL_MEMORY_ACCESS_WRITE;
// Wrap input buffers in buffer views.
vm::ref<iree_vm_list_t> inputs;
iree_status_t input_status = ParseToVariantList(
iree_hal_buffer_view_t* input0_buffer_view = nullptr;
constexpr iree_hal_dim_t input_buffer_shape[] = {1, 224, 224, 3};
IREE_CHECK_OK(iree_hal_buffer_view_allocate_buffer(
allocator,
iree::span<const std::string>{FLAG_function_inputs.data(),
FLAG_function_inputs.size()},
iree_allocator_system(), &inputs);
if (!iree_status_is_ok(input_status))
return -1;
//vm::ref<iree_vm_list_t> inputs;
//IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, 6, iree_allocator_system(), &inputs));
/*shape_rank=*/4, /*shape=*/input_buffer_shape,
IREE_HAL_ELEMENT_TYPE_FLOAT_32,
IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params,
iree_make_const_byte_span(&input_res50, sizeof(input_res50)),
&input0_buffer_view));
//iree_hal_buffer_view_t* input0_buffer_view = nullptr;
//constexpr iree_hal_dim_t input_buffer_shape[] = {1, 224, 224, 3};
//IREE_CHECK_OK(iree_hal_buffer_view_allocate_buffer(
// allocator,
// /*shape_rank=*/4, /*shape=*/input_buffer_shape,
// IREE_HAL_ELEMENT_TYPE_FLOAT_32,
// IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params,
// iree_make_const_byte_span(&input_res50, sizeof(input_res50)),
// &input0_buffer_view));
//auto input0_buffer_view_ref = iree_hal_buffer_view_move_ref(input0_buffer_view);
//IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs.get(), &input0_buffer_view_ref));
vm::ref<iree_vm_list_t> inputs;
IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, 6, iree_allocator_system(), &inputs));
auto input0_buffer_view_ref = iree_hal_buffer_view_move_ref(input0_buffer_view);
IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs.get(), &input0_buffer_view_ref));
// Prepare outputs list to accept results from the invocation.
@@ -853,7 +1023,6 @@ extern "C" int iree_main(int argc, char** argv) {
IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, kOutputCount * sizeof(float), iree_allocator_system(), &outputs));
// --------------------------------------------------------------------------
// Main loop.
bool done = false;
while (!done) {
@@ -907,11 +1076,46 @@ extern "C" int iree_main(int argc, char** argv) {
/*policy=*/nullptr, inputs.get(),
outputs.get(), iree_allocator_system()));
// Read back the results.
auto* output_buffer_view = reinterpret_cast<iree_hal_buffer_view_t*>(
iree_vm_list_get_ref_deref(outputs.get(),
0,
iree_hal_buffer_view_get_descriptor()));
IREE_CHECK_OK(iree_hal_device_transfer_d2h(
iree_vk_device,
iree_hal_buffer_view_buffer(output_buffer_view),
0,
output_res50, sizeof(output_res50),
IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()));
// we want to run continuously so we can use tools like RenderDoc, RGP, etc...
dirty = true;
}
// find maxarg from results
float max = 0.0f;
int max_idx = -1;
for(int i=0;i<1000;i++)
{
if (output_res50[i] > max)
{
max = output_res50[i];
max_idx = i;
}
}
ImGui::Text("pointer = %p", my_image_texture);
ImGui::Text("size = %d x %d", my_image_width, my_image_height);
ImGui::Image((ImTextureID)my_image_texture, ImVec2(my_image_width, my_image_height));
// Display the latest computation output.
ImGui::Text("Max idx = [%i]", max_idx);
ImGui::Text("Max value = [%f]", max);
ImGui::Text("Resnet50 categories:");
ImGui::PlotHistogram("Histogram", output_res50, IM_ARRAYSIZE(output_res50), 0, NULL, 0.0f, 1.0f, ImVec2(0,80));
ImGui::Separator();
// Framerate counter.
ImGui::Text("Application average %.3f ms/frame (%.1f FPS)",
1000.0f / ImGui::GetIO().Framerate, ImGui::GetIO().Framerate);
@@ -933,7 +1137,6 @@ extern "C" int iree_main(int argc, char** argv) {
iree_vm_module_release(bytecode_module);
iree_vm_context_release(iree_context);
iree_hal_device_release(iree_vk_device);
iree_hal_allocator_release(allocator);
iree_hal_driver_release(iree_vk_driver);
iree_hal_vulkan_syms_release(iree_vk_syms);
iree_vm_instance_release(iree_instance);

File diff suppressed because it is too large Load Diff

View File

@@ -1,27 +0,0 @@
# Dataset annotation tool
SHARK annotator for adding or modifying prompts of dataset images
## Set up
Activate SHARK Python virtual environment and install additional packages
```shell
source ../shark.venv/bin/activate
pip install -r requirements.txt
```
## Run annotator
```shell
python annotation_tool.py
```
<img width="1280" alt="annotator" src="https://user-images.githubusercontent.com/49575973/214521137-7ef6ae10-7cd8-46e6-b270-b6c0445157f1.png">
* Select a dataset from `Dataset` dropdown list
* Select an image from `Image` dropdown list
* Image and the existing prompt will be loaded
* Select a prompt from `Prompt` dropdown list to modify or "Add new" to add a prompt
* Click `Save` to save changes, click `Delete` to delete prompt
* Click `Back` or `Next` to switch image, you could also select other images from `Image`
* Click `Finish` when finishing annotation or before switching dataset

View File

@@ -1,248 +0,0 @@
import gradio as gr
import json
import jsonlines
import os
from args import args
from pathlib import Path
from PIL import Image
from utils import get_datasets
shark_root = Path(__file__).parent.parent
demo_css = shark_root.joinpath("web/demo.css").resolve()
nodlogo_loc = shark_root.joinpath(
"web/models/stable_diffusion/logos/nod-logo.png"
)
with gr.Blocks(title="Dataset Annotation Tool", css=demo_css) as shark_web:
with gr.Row(elem_id="ui_title"):
nod_logo = Image.open(nodlogo_loc)
with gr.Column(scale=1, elem_id="demo_title_outer"):
gr.Image(
value=nod_logo,
show_label=False,
interactive=False,
elem_id="top_logo",
).style(width=150, height=100)
datasets, images, ds_w_prompts = get_datasets(args.gs_url)
prompt_data = dict()
with gr.Row(elem_id="ui_body"):
# TODO: add multiselect dataset, there is a gradio version conflict
dataset = gr.Dropdown(label="Dataset", choices=datasets)
image_name = gr.Dropdown(label="Image", choices=[])
with gr.Row(elem_id="ui_body"):
# TODO: add ability to search image by typing
with gr.Column(scale=1, min_width=600):
image = gr.Image(type="filepath").style(height=512)
with gr.Column(scale=1, min_width=600):
prompts = gr.Dropdown(
label="Prompts",
choices=[],
)
prompt = gr.Textbox(
label="Editor",
lines=3,
)
with gr.Row():
save = gr.Button("Save")
delete = gr.Button("Delete")
with gr.Row():
back_image = gr.Button("Back")
next_image = gr.Button("Next")
finish = gr.Button("Finish")
def filter_datasets(dataset):
    """Prepare the local working directory for ``dataset`` and list its images.

    Creates ``<shark_root>/dataset/<dataset>``, resets ``prompt_data`` and,
    when the dataset is listed in ``ds_w_prompts``, downloads its
    ``metadata.jsonl`` with ``gsutil`` and loads the prompts from it.

    Args:
        dataset: Name selected in the "Dataset" dropdown, or None.

    Returns:
        A ``gr.Dropdown`` update for the "Image" dropdown: the image list of
        the dataset, or an empty, cleared dropdown when no dataset is selected.
    """
    if dataset is None:
        return gr.Dropdown.update(value=None, choices=[])

    # makedirs also creates the parent "dataset" directory on a first run,
    # where a plain os.mkdir would raise FileNotFoundError.
    dataset_path = str(shark_root) + "/dataset/" + dataset
    os.makedirs(dataset_path, exist_ok=True)

    # read prompt jsonlines file
    prompt_data.clear()
    if dataset in ds_w_prompts:
        prompt_gs_path = args.gs_url + "/" + dataset + "/metadata.jsonl"
        prompt_path = dataset_path + "/metadata.jsonl"
        os.system(f'gsutil cp "{prompt_gs_path}" "{dataset_path}"/')
        # The exit status of gsutil is not checked, so only parse the file
        # when the download actually produced it.
        if os.path.exists(prompt_path):
            with jsonlines.open(prompt_path) as reader:
                for line in reader.iter(type=dict, skip_invalid=True):
                    text = line["text"]
                    # A single prompt is stored as a plain string; normalise
                    # every entry to a list of prompts.
                    prompt_data[line["file_name"]] = (
                        [text] if type(text) is str else text
                    )

    return gr.Dropdown.update(choices=images[dataset])


dataset.change(fn=filter_datasets, inputs=dataset, outputs=image_name)
def display_image(dataset, image_name):
    """Download the selected image and list its prompts.

    Args:
        dataset: Name selected in the "Dataset" dropdown, or None.
        image_name: Bucket-relative image path selected in the "Image"
            dropdown, or None.

    Returns:
        A pair of updates: the loaded image for the preview and the prompt
        choices ("Add new" followed by the prompts stored for the image).
        Both are cleared when either selection is missing.
    """
    if dataset is None or image_name is None:
        return gr.Image.update(value=None), gr.Dropdown.update(value=None)

    # download and load the image
    img_gs_path = args.gs_url + "/" + dataset + "/" + image_name
    img_sub_path = "/".join(image_name.split("/")[:-1])
    img_dst_path = (
        str(shark_root) + "/dataset/" + dataset + "/" + img_sub_path + "/"
    )
    # The sub-path may be several directories deep and its parents may not
    # exist yet, which os.mkdir cannot handle.
    os.makedirs(img_dst_path, exist_ok=True)
    os.system(f'gsutil cp "{img_gs_path}" "{img_dst_path}"')
    img = Image.open(img_dst_path + image_name.split("/")[-1])

    # Make sure the image has a (possibly empty) prompt list so the
    # save/delete callbacks can index it.
    stored_prompts = prompt_data.setdefault(image_name, [])
    prompt_choices = ["Add new"] + stored_prompts
    return gr.Image.update(value=img), gr.Dropdown.update(
        choices=prompt_choices
    )


image_name.change(
    fn=display_image,
    inputs=[dataset, image_name],
    outputs=[image, prompts],
)
def edit_prompt(prompts):
    """Load the selected prompt into the editor; "Add new" clears the editor."""
    editor_text = None if prompts == "Add new" else prompts
    return gr.Textbox.update(value=editor_text)


prompts.change(fn=edit_prompt, inputs=prompts, outputs=prompt)
def _write_prompt_file(dataset):
    """Rewrite the local metadata.jsonl of ``dataset`` from ``prompt_data``."""
    prompt_path = str(shark_root) + "/dataset/" + dataset + "/metadata.jsonl"
    with open(prompt_path, "w") as f:
        for key, value in prompt_data.items():
            # Images without any prompt are left out of the file.
            if not value:
                continue
            # A single prompt is written as a plain string, several as a list.
            v = value if len(value) > 1 else value[0]
            f.write(json.dumps({"file_name": key, "text": v}))
            f.write("\n")


def _prompt_choices_update(image_name):
    """Build the "Prompts" dropdown update for ``image_name`` with no selection."""
    prompt_choices = ["Add new"] + prompt_data[image_name]
    return gr.Dropdown.update(choices=prompt_choices, value=None)


def save_prompt(dataset, image_name, prompts, prompt):
    """Store the editor text as a new or modified prompt and persist it.

    Args:
        dataset: Selected dataset name, or None.
        image_name: Selected image path, or None.
        prompts: Entry selected in the "Prompts" dropdown ("Add new" or an
            existing prompt), or None.
        prompt: Text currently in the editor, or None.

    Returns:
        A "Prompts" dropdown update with the refreshed choices, or None when
        any input is missing.
    """
    if (
        dataset is None
        or image_name is None
        or prompts is None
        or prompt is None
    ):
        return

    # The image may not have an entry yet; create it instead of raising KeyError.
    entries = prompt_data.setdefault(image_name, [])
    if prompts == "Add new":
        entries.append(prompt)
    elif prompts in entries:
        entries[entries.index(prompts)] = prompt
    else:
        # Stale selection (the prompt no longer exists): nothing to modify,
        # just refresh the dropdown instead of raising ValueError.
        return _prompt_choices_update(image_name)

    _write_prompt_file(dataset)
    return _prompt_choices_update(image_name)


save.click(
    fn=save_prompt,
    inputs=[dataset, image_name, prompts, prompt],
    outputs=prompts,
)


def delete_prompt(dataset, image_name, prompts):
    """Remove the selected prompt from the image and persist the change.

    Args:
        dataset: Selected dataset name, or None.
        image_name: Selected image path, or None.
        prompts: Entry selected in the "Prompts" dropdown, or None.

    Returns:
        A "Prompts" dropdown update with the refreshed choices, or None when
        an input is missing or "Add new" is selected.
    """
    if dataset is None or image_name is None or prompts is None:
        return
    if prompts == "Add new":
        return

    entries = prompt_data.setdefault(image_name, [])
    # Only rewrite the file when there is actually something to remove; a
    # stale selection just refreshes the dropdown.
    if prompts in entries:
        entries.remove(prompts)
        _write_prompt_file(dataset)
    return _prompt_choices_update(image_name)


delete.click(
    fn=delete_prompt,
    inputs=[dataset, image_name, prompts],
    outputs=prompts,
)
def get_back_image(dataset, image_name):
    """Drop the local copy of the current image and step to the previous one.

    Args:
        dataset: Selected dataset name, or None.
        image_name: Selected image path, or None.

    Returns:
        An "Image" dropdown update selecting the previous image of the
        dataset, or clearing the selection when the current image is the
        first one. None when either input is missing.
    """
    if dataset is None or image_name is None:
        return

    # Remove the local image with os.remove instead of shelling out to `rm`,
    # which is not available on every platform. Cleanup stays best-effort,
    # as the exit status of `rm` was never checked.
    img_path = str(shark_root) + "/dataset/" + dataset + "/" + image_name
    try:
        os.remove(img_path)
    except OSError:
        pass

    # get the index for the back image
    idx = images[dataset].index(image_name)
    if idx == 0:
        return gr.Dropdown.update(value=None)
    return gr.Dropdown.update(value=images[dataset][idx - 1])


back_image.click(
    fn=get_back_image, inputs=[dataset, image_name], outputs=image_name
)
def get_next_image(dataset, image_name):
    """Drop the local copy of the current image and step to the next one.

    Args:
        dataset: Selected dataset name, or None.
        image_name: Selected image path, or None.

    Returns:
        An "Image" dropdown update selecting the next image of the dataset,
        or clearing the selection when the current image is the last one.
        None when either input is missing.
    """
    if dataset is None or image_name is None:
        return

    # Remove the local image with os.remove instead of shelling out to `rm`,
    # which is not available on every platform. Cleanup stays best-effort,
    # as the exit status of `rm` was never checked.
    img_path = str(shark_root) + "/dataset/" + dataset + "/" + image_name
    try:
        os.remove(img_path)
    except OSError:
        pass

    # get the index for the next image
    idx = images[dataset].index(image_name)
    if idx == len(images[dataset]) - 1:
        return gr.Dropdown.update(value=None)
    return gr.Dropdown.update(value=images[dataset][idx + 1])


next_image.click(
    fn=get_next_image, inputs=[dataset, image_name], outputs=image_name
)
def finish_annotation(dataset):
    """Upload the prompts of ``dataset`` and remove its local working data.

    Args:
        dataset: Selected dataset name, or None.

    Returns:
        A "Dataset" dropdown update clearing the selection, or None when no
        dataset is selected.

    Raises:
        gr.Error: When uploading metadata.jsonl fails. The local data is kept
            in that case so the annotations are not lost.
    """
    import shutil

    if dataset is None:
        return

    # upload prompt and remove local data
    dataset_path = str(shark_root) + "/dataset/" + dataset
    dataset_gs_path = args.gs_url + "/" + dataset + "/"
    prompt_path = dataset_path + "/metadata.jsonl"

    # Nothing to upload when no prompt was ever saved for this dataset.
    if os.path.exists(prompt_path):
        status = os.system(
            f'gsutil cp "{dataset_path}/metadata.jsonl" "{dataset_gs_path}"'
        )
        # Deleting the local copy after a failed upload would silently
        # discard every annotation made in this session.
        if status != 0:
            raise gr.Error(
                "Uploading metadata.jsonl failed; local annotations were kept."
            )

    # Portable replacement for `rm -rf`.
    shutil.rmtree(dataset_path, ignore_errors=True)
    return gr.Dropdown.update(value=None)


finish.click(fn=finish_annotation, inputs=dataset, outputs=dataset)
if __name__ == "__main__":
    # Host name is fixed to "0.0.0.0"; port and public sharing come from
    # the command-line flags.
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=args.server_port,
        share=args.share,
        inbrowser=True,
    )
    shark_web.launch(**launch_options)

View File

@@ -1,34 +0,0 @@
import argparse
# Command-line interface of the dataset annotator. Parsing happens at import
# time, so importing this module reads the process arguments.
p = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    description=__doc__,
)

##############################################################################
### Dataset Annotator flags
##############################################################################

for _flag, _options in (
    (
        "--gs_url",
        dict(type=str, required=True, help="URL to datasets in GS bucket"),
    ),
    (
        "--share",
        dict(
            default=False,
            action=argparse.BooleanOptionalAction,
            help="flag for generating a public URL",
        ),
    ),
    (
        "--server_port",
        dict(type=int, default=8080, help="flag for setting server port"),
    ),
):
    p.add_argument(_flag, **_options)

##############################################################################

args = p.parse_args()

View File

@@ -1,3 +0,0 @@
# SHARK Annotator
gradio==3.15.0
jsonlines

View File

@@ -1,29 +0,0 @@
from google.cloud import storage
def get_datasets(gs_url):
    """List the datasets stored under a Google Cloud Storage URL.

    The URL is split on "/": element 2 is used as the bucket name and the
    remainder as the blob prefix. Below the prefix, the first path component
    of a blob is its dataset name; blobs nested at least one directory deeper
    are recorded as images, and a ``metadata.jsonl`` placed directly in the
    dataset directory marks the dataset as having prompts.

    Args:
        gs_url: URL of the form ``gs://<bucket>/<prefix>``.

    Returns:
        A tuple ``(datasets, images, ds_w_prompts)``: the sorted dataset
        names, a dict mapping each dataset name to its image paths relative
        to the dataset directory, and the list of datasets that have a
        ``metadata.jsonl``.
    """
    url_parts = gs_url.split("/")
    bucket_name = url_parts[2]
    source_blob_name = "/".join(url_parts[3:])

    storage_client = storage.Client()
    blobs = storage_client.list_blobs(bucket_name, prefix=source_blob_name)

    # Work relative to the prefix instead of a hard-coded path index, so
    # prefixes with zero or several components resolve to the right dataset.
    prefix = source_blob_name.strip("/")
    prefix_dir = prefix + "/" if prefix else ""

    images = dict()
    ds_w_prompts = []
    for blob in blobs:
        # A prefix match also returns siblings such as "<prefix>2/..."; skip
        # anything that is not inside the prefix directory.
        if not blob.name.startswith(prefix_dir):
            continue
        relative = blob.name[len(prefix_dir):]
        dataset_name, separator, file_sub_path = relative.partition("/")
        # Blobs directly under the prefix are not inside any dataset.
        if dataset_name == "" or separator == "":
            continue

        images.setdefault(dataset_name, [])
        # check if image or jsonl
        if "/" in file_sub_path:
            images[dataset_name].append(file_sub_path)
        elif (
            file_sub_path == "metadata.jsonl"
            and dataset_name not in ds_w_prompts
        ):
            ds_w_prompts.append(dataset_name)

    # Sorted so the dataset order is stable between runs.
    return sorted(images), images, ds_w_prompts

View File

@@ -14,11 +14,22 @@ import csv
import argparse
from shark.shark_importer import SharkImporter
from shark.parser import shark_args
import tensorflow as tf
import subprocess as sp
import hashlib
import numpy as np
from pathlib import Path
visible_default = tf.config.list_physical_devices("GPU")
try:
tf.config.set_visible_devices([], "GPU")
visible_devices = tf.config.get_visible_devices()
for device in visible_devices:
assert device.device_type != "GPU"
except:
# Invalid device or cannot modify virtual devices once initialized.
pass
def create_hash(file_name):
with open(file_name, "rb") as f:
@@ -30,12 +41,9 @@ def create_hash(file_name):
def save_torch_model(torch_model_list):
from tank.model_utils import (
get_hf_model,
get_vision_model,
get_hf_img_cls_model,
get_fp16_model,
)
from tank.model_utils import get_hf_model
from tank.model_utils import get_vision_model
from tank.model_utils import get_hf_img_cls_model
with open(torch_model_list) as csvfile:
torch_reader = csv.reader(csvfile, delimiter=",")
@@ -57,8 +65,7 @@ def save_torch_model(torch_model_list):
model, input, _ = get_hf_model(torch_model_name)
elif model_type == "hf_img_cls":
model, input, _ = get_hf_img_cls_model(torch_model_name)
elif model_type == "fp16":
model, input, _ = get_fp16_model(torch_model_name)
torch_model_name = torch_model_name.replace("/", "_")
torch_model_dir = os.path.join(
WORKDIR, str(torch_model_name) + "_torch"
@@ -99,17 +106,6 @@ def save_tf_model(tf_model_list):
get_keras_model,
get_TFhf_model,
)
import tensorflow as tf
visible_default = tf.config.list_physical_devices("GPU")
try:
tf.config.set_visible_devices([], "GPU")
visible_devices = tf.config.get_visible_devices()
for device in visible_devices:
assert device.device_type != "GPU"
except:
# Invalid device or cannot modify virtual devices once initialized.
pass
with open(tf_model_list) as csvfile:
tf_reader = csv.reader(csvfile, delimiter=",")
@@ -209,14 +205,14 @@ if __name__ == "__main__":
parser.add_argument(
"--torch_model_csv",
type=lambda x: is_valid_file(x),
default="./tank/torch_model_list.csv",
default="./tank/pytorch/torch_model_list.csv",
help="""Contains the file with torch_model name and args.
Please see: https://github.com/nod-ai/SHARK/blob/main/tank/torch_model_list.csv""",
Please see: https://github.com/nod-ai/SHARK/blob/main/tank/pytorch/torch_model_list.csv""",
)
parser.add_argument(
"--tf_model_csv",
type=lambda x: is_valid_file(x),
default="./tank/tf_model_list.csv",
default="./tank/tf/tf_model_list.csv",
help="Contains the file with tf model name and args.",
)
parser.add_argument(

View File

@@ -4,9 +4,9 @@ requires = [
"wheel",
"packaging",
"numpy>=1.22.4",
"torch-mlir>=20221021.633",
"iree-compiler>=20221022.190",
"iree-runtime>=20221022.190",
"numpy==1.22.4",
"torch-mlir>=20220428.420",
"iree-compiler>=20220427.13",
"iree-runtime>=20220427.13",
]
build-backend = "setuptools.build_meta"

View File

@@ -1,3 +1,3 @@
[pytest]
addopts = --verbose -p no:warnings
norecursedirs = inference tank/tflite examples benchmarks shark
norecursedirs = inference tank/tflite

View File

@@ -1,4 +1,4 @@
-f https://download.pytorch.org/whl/nightly/cpu/
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
--pre
numpy
@@ -28,7 +28,6 @@ Pillow
# web dependencies.
gradio
altair
# Testing and support.
#lit

View File

@@ -2,9 +2,8 @@
--pre
numpy==1.22.4
torch
torchvision
pytorch-triton
tabulate
tqdm
@@ -15,8 +14,7 @@ iree-tools-tf
# TensorFlow and JAX.
gin-config
tensorflow==2.10.1
keras==2.10
tensorflow
#tf-models-nightly
#tensorflow-text-nightly
transformers
@@ -36,7 +34,6 @@ sacremoses
# web dependencies.
gradio
altair
scipy
#ONNX and ORT for benchmarking

View File

@@ -5,23 +5,10 @@ wheel
tqdm
# SHARK Downloader
google-cloud-storage
gsutil
# Testing
pytest
pytest-xdist
pytest-forked
Pillow
parameterized
# Add transformers, diffusers and scipy since they are most commonly used
transformers
diffusers
scipy
ftfy
gradio
altair
# Keep PyInstaller at the end. Sometimes Windows Defender flags it but most folks can continue even if it errors
pefile
pyinstaller

View File

@@ -10,8 +10,8 @@ PACKAGE_VERSION = os.environ.get("SHARK_PACKAGE_VERSION") or "0.0.4"
backend_deps = []
if "NO_BACKEND" in os.environ.keys():
backend_deps = [
"iree-compiler>=20221022.190",
"iree-runtime>=20221022.190",
"iree-compiler>=20220427.13",
"iree-runtime>=20220427.13",
]
setup(
@@ -33,11 +33,11 @@ setup(
"Operating System :: OS Independent",
],
packages=find_packages(exclude=("examples")),
python_requires=">=3.9",
python_requires=">=3.7",
install_requires=[
"numpy",
"PyYAML",
"torch-mlir>=20221021.633",
"torch-mlir>=20220428.420",
]
+ backend_deps,
)

View File

@@ -1,45 +0,0 @@
# Creates the shark.venv virtual environment and installs SHARK with its
# build dependencies. Pass "--update-src" to run `git pull` first.
param([string]$arguments)

if ($arguments -eq "--update-src"){
    git pull
}

#Write-Host "Installing python"

#Start-Process winget install Python.Python.3.10 '/quiet InstallAllUsers=1 PrependPath=1' -wait -NoNewWindow

#Write-Host "python installation completed successfully"

#Write-Host "Reload environment variables"
#$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
#Write-Host "Reloaded environment variables"


# redirect stderr into stdout
$p = &{python -V} 2>&1
# check if an ErrorRecord was returned
$version = if($p -is [System.Management.Automation.ErrorRecord])
{
    # grab the version string from the error message
    $p.Exception.Message
}
else
{
    # otherwise return as is
    $p
}

Write-Host "Python version found is"
# Print the extracted version string rather than the raw command result,
# which was previously computed and then never used.
Write-Host $version

Write-Host "Installing Build Dependencies"
python -m venv .\shark.venv\
.\shark.venv\Scripts\activate
pip install -r requirements.txt
pip install --pre torch-mlir torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/
pip install --upgrade -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html iree-compiler iree-runtime
Write-Host "Building SHARK..."
pip install -e . -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
Write-Host "Build and installation completed successfully"
Write-Host "Source your venv with ./shark.venv/Scripts/activate"

View File

@@ -76,16 +76,11 @@ fi
$PYTHON -m pip install --upgrade pip || die "Could not upgrade pip"
$PYTHON -m pip install --upgrade -r "$TD/requirements.txt"
if [ "$torch_mlir_bin" = true ]; then
if [[ $(uname -s) = 'Darwin' ]]; then
echo "MacOS detected. Installing torch-mlir from .whl, to avoid dependency problems with torch."
$PYTHON -m pip install --pre --no-cache-dir torch-mlir -f https://llvm.github.io/torch-mlir/package-index/ -f https://download.pytorch.org/whl/nightly/torch/
$PYTHON -m pip install --pre torch-mlir -f https://llvm.github.io/torch-mlir/package-index/
if [ $? -eq 0 ];then
echo "Successfully Installed torch-mlir"
else
$PYTHON -m pip install --pre torch-mlir -f https://llvm.github.io/torch-mlir/package-index/
if [ $? -eq 0 ];then
echo "Successfully Installed torch-mlir"
else
echo "Could not install torch-mlir" >&2
fi
echo "Could not install torch-mlir" >&2
fi
else
echo "${Red}No binaries found for Python $PYTHON_VERSION_X_Y on $(uname -s)"
@@ -94,46 +89,37 @@ else
exit 1
fi
if [[ -z "${USE_IREE}" ]]; then
rm .use-iree
RUNTIME="https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html"
RUNTIME="nod-ai/SHARK-Runtime"
else
touch ./.use-iree
RUNTIME="https://iree-org.github.io/iree/pip-release-links.html"
RUNTIME="google/iree"
fi
if [[ -z "${NO_BACKEND}" ]]; then
echo "Installing ${RUNTIME}..."
$PYTHON -m pip install --upgrade --find-links ${RUNTIME} iree-compiler iree-runtime
$PYTHON -m pip install --find-links https://github.com/${RUNTIME}/releases iree-compiler iree-runtime
else
echo "Not installing a backend, please make sure to add your backend to PYTHONPATH"
fi
if [[ ! -z "${IMPORTER}" ]]; then
echo "${Yellow}Installing importer tools.."
if [[ $(uname -s) = 'Linux' ]]; then
echo "${Yellow}Linux detected.. installing Linux importer tools"
#Always get the importer tools from upstream IREE
$PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer.txt" -f https://iree-org.github.io/iree/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
$PYTHON -m pip install --upgrade -r "$TD/requirements-importer.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
elif [[ $(uname -s) = 'Darwin' ]]; then
echo "${Yellow}macOS detected.. installing macOS importer tools"
#Conda seems to have some problems installing these packages and hope they get resolved upstream.
$PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer-macos.txt" -f ${RUNTIME} --extra-index-url https://download.pytorch.org/whl/nightly/cpu
$PYTHON -m pip install --upgrade -r "$TD/requirements-importer-macos.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
fi
fi
$PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f https://download.pytorch.org/whl/nightly/torch/
$PYTHON -m pip install -e . -f https://llvm.github.io/torch-mlir/package-index/ -f https://github.com/${RUNTIME}/releases
if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
T_VER=$($PYTHON -m pip show torch | grep Version)
TORCH_VERSION=${T_VER:9:17}
TV_VER=$($PYTHON -m pip show torchvision | grep Version)
TV_VERSION=${TV_VER:9:18}
$PYTHON -m pip uninstall -y torch torchvision
$PYTHON -m pip install -U --pre --no-warn-conflicts triton
$PYTHON -m pip install --no-deps https://download.pytorch.org/whl/nightly/cu117/torch-${TORCH_VERSION}%2Bcu117-cp310-cp310-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu117/torchvision-${TV_VERSION}%2Bcu117-cp310-cp310-linux_x86_64.whl
$PYTHON -m pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116
if [ $? -eq 0 ];then
echo "Successfully Installed torch + cu117."
echo "Successfully Installed torch + cu116."
else
echo "Could not install torch + cu117." >&2
echo "Could not install torch + cu116." >&2
fi
fi

View File

@@ -36,9 +36,7 @@
" from torchdynamo.optimizations.backends import create_backend\n",
" from torchdynamo.optimizations.subgraph import SubGraph\n",
"except ModuleNotFoundError:\n",
" print(\n",
" \"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo\"\n",
" )\n",
" print(\"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo\")\n",
" exit()\n",
"\n",
"# torch-mlir imports for compiling\n",
@@ -99,9 +97,7 @@
"\n",
" for node in fx_g.graph.nodes:\n",
" if node.op == \"output\":\n",
" assert (\n",
" len(node.args) == 1\n",
" ), \"Output node must have a single argument\"\n",
" assert len(node.args) == 1, \"Output node must have a single argument\"\n",
" node_arg = node.args[0]\n",
" if isinstance(node_arg, tuple) and len(node_arg) == 1:\n",
" node.args = (node_arg[0],)\n",
@@ -120,12 +116,8 @@
" if len(args) == 1 and isinstance(args[0], list):\n",
" args = args[0]\n",
"\n",
" linalg_module = compile(\n",
" ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS\n",
" )\n",
" callable, _ = get_iree_compiled_module(\n",
" linalg_module, \"cuda\", func_name=\"forward\"\n",
" )\n",
" linalg_module = compile(ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS)\n",
" callable, _ = get_iree_compiled_module(linalg_module, \"cuda\", func_name=\"forward\")\n",
"\n",
" def forward(*inputs):\n",
" return callable(*inputs)\n",
@@ -220,7 +212,6 @@
" assert isinstance(subgraph, SubGraph), \"Model must be a dynamo SubGraph.\"\n",
" return __torch_mlir(subgraph.model, *list(subgraph.example_inputs))\n",
"\n",
"\n",
"@torchdynamo.optimize(\"torch_mlir\")\n",
"def toy_example2(*args):\n",
" a, b = args\n",

View File

@@ -22,7 +22,7 @@ class CLIPModule(tf.Module):
input_ids=x, attention_mask=y, pixel_values=z
)
@tf.function(input_signature=clip_vit_inputs, jit_compile=True)
@tf.function(input_signature=clip_vit_inputs)
def forward(self, input_ids, attention_mask, pixel_values):
return self.m.predict(
input_ids, attention_mask, pixel_values

View File

@@ -1,15 +0,0 @@
## Running ESRGAN
```
1. pip install numpy opencv-python
2. mkdir InputImages
(this is where all the input images will reside)
3. mkdir OutputImages
(this is where the model will generate all the images)
4. mkdir models
(save the .pth checkpoint file here)
5. python esrgan.py
```
- Download [RRDB_ESRGAN_x4.pth](https://drive.google.com/drive/u/0/folders/17VYV_SoZZesU6mbxz2dMAIccSSlqLecY) and place it in the `models` directory as mentioned above in step 4.
- Credits : [ESRGAN](https://github.com/xinntao/ESRGAN)

View File

@@ -1,240 +0,0 @@
from ast import arg
import os.path as osp
import glob
import cv2
import numpy as np
import torch
from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions
from shark.shark_inference import SharkInference
import torch_mlir
import tempfile
import functools
import torch
import torch.nn as nn
import torch.nn.functional as F
def make_layer(block, n_layers):
    """Chain ``n_layers`` freshly built ``block()`` modules in an ``nn.Sequential``."""
    return nn.Sequential(*[block() for _ in range(n_layers)])
class ResidualDenseBlock_5C(nn.Module):
    """Five-convolution dense block with a scaled residual connection.

    Each of the first four convolutions sees the block input concatenated
    with all earlier intermediate outputs and produces ``gc`` channels; the
    fifth maps back to ``nf`` channels. The result is scaled by 0.2 and
    added to the input.
    """

    def __init__(self, nf=64, gc=32, bias=True):
        super().__init__()
        # gc: growth channel, i.e. intermediate channels
        conv_args = dict(kernel_size=3, stride=1, padding=1, bias=bias)
        self.conv1 = nn.Conv2d(nf, gc, **conv_args)
        self.conv2 = nn.Conv2d(nf + gc, gc, **conv_args)
        self.conv3 = nn.Conv2d(nf + 2 * gc, gc, **conv_args)
        self.conv4 = nn.Conv2d(nf + 3 * gc, gc, **conv_args)
        self.conv5 = nn.Conv2d(nf + 4 * gc, nf, **conv_args)
        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)

        # initialization
        # mutil.initialize_weights([self.conv1, self.conv2, self.conv3, self.conv4, self.conv5], 0.1)

    def forward(self, x):
        act = self.lrelu
        grown1 = act(self.conv1(x))
        grown2 = act(self.conv2(torch.cat((x, grown1), 1)))
        grown3 = act(self.conv3(torch.cat((x, grown1, grown2), 1)))
        grown4 = act(self.conv4(torch.cat((x, grown1, grown2, grown3), 1)))
        fused = self.conv5(torch.cat((x, grown1, grown2, grown3, grown4), 1))
        return fused * 0.2 + x
class RRDB(nn.Module):
    """Residual in Residual Dense Block"""

    def __init__(self, nf, gc=32):
        super().__init__()
        self.RDB1 = ResidualDenseBlock_5C(nf, gc)
        self.RDB2 = ResidualDenseBlock_5C(nf, gc)
        self.RDB3 = ResidualDenseBlock_5C(nf, gc)

    def forward(self, x):
        # Run the three dense blocks in sequence, then add the scaled result
        # back onto the block input.
        hidden = x
        for dense_block in (self.RDB1, self.RDB2, self.RDB3):
            hidden = dense_block(hidden)
        return hidden * 0.2 + x
class RRDBNet(nn.Module):
    """Network built from a trunk of ``nb`` RRDB blocks and two 2x upsampling stages.

    Args:
        in_nc: Number of input channels.
        out_nc: Number of output channels.
        nf: Number of feature channels used throughout the network.
        nb: Number of RRDB blocks in the trunk.
        gc: Growth channels passed to every RRDB block.
    """

    def __init__(self, in_nc, out_nc, nf, nb, gc=32):
        super().__init__()
        conv_args = dict(kernel_size=3, stride=1, padding=1, bias=True)

        self.conv_first = nn.Conv2d(in_nc, nf, **conv_args)
        self.RRDB_trunk = make_layer(functools.partial(RRDB, nf=nf, gc=gc), nb)
        self.trunk_conv = nn.Conv2d(nf, nf, **conv_args)
        #### upsampling
        self.upconv1 = nn.Conv2d(nf, nf, **conv_args)
        self.upconv2 = nn.Conv2d(nf, nf, **conv_args)
        self.HRconv = nn.Conv2d(nf, nf, **conv_args)
        self.conv_last = nn.Conv2d(nf, out_nc, **conv_args)

        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)

    def forward(self, x):
        shallow = self.conv_first(x)
        # Long skip connection around the RRDB trunk.
        fea = shallow + self.trunk_conv(self.RRDB_trunk(shallow))

        # Two nearest-neighbour 2x upsampling steps, each followed by a
        # convolution and the activation.
        for upconv in (self.upconv1, self.upconv2):
            enlarged = F.interpolate(fea, scale_factor=2, mode="nearest")
            fea = self.lrelu(upconv(enlarged))

        return self.conv_last(self.lrelu(self.HRconv(fea)))
############### Parsing args #####################

import argparse

# Parsed at import time; `args.device` is read when the SHARK module is built.
p = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    description=__doc__,
)
for _flag, _options in (
    ("--device", dict(type=str, default="cpu", help="the device to use")),
    (
        "--mlir_loc",
        dict(
            type=str,
            default=None,
            help="location of the model's mlir file",
        ),
    ),
):
    p.add_argument(_flag, **_options)

args = p.parse_args()

###################################################
def inference(input_m):
    """Run the module-level ``model`` on ``input_m`` and return its output."""
    prediction = model(input_m)
    return prediction
def load_mlir(mlir_loc):
    """Read an MLIR module from disk.

    Args:
        mlir_loc: Path of the MLIR file, or None.

    Returns:
        The file contents as a string, or None when ``mlir_loc`` is None.
    """
    # Identity check is the idiomatic (and override-proof) test for None.
    if mlir_loc is None:
        return None
    print(f"Trying to load the model from {mlir_loc}.")
    # os.path.join with a single argument returned the path unchanged, so
    # the path is opened directly.
    with open(mlir_loc) as f:
        return f.read()
def compile_through_fx(model, inputs, mlir_loc=None):
    """Produce a compiled ``SharkInference`` module for ``model``.

    When ``load_mlir(mlir_loc)`` yields a module it is used directly.
    Otherwise ``model`` is traced with ``make_fx``, scripted with
    TorchScript and lowered by ``torch_mlir.compile``.

    Args:
        model: Callable to trace.
        inputs: Example input passed to the traced callable and to torch-mlir.
        mlir_loc: Optional path of an existing MLIR file.

    Returns:
        The ``SharkInference`` object after ``compile()`` has been called,
        targeting ``args.device``.
    """

    def strip_overloads(gm):
        """
        Modifies the target of graph nodes in :attr:`gm` to strip overloads.
        Args:
            gm(fx.GraphModule): The input Fx graph module to be modified
        """
        for node in gm.graph.nodes:
            if isinstance(node.target, torch._ops.OpOverload):
                node.target = node.target.overloadpacket
        gm.recompile()

    module = load_mlir(mlir_loc)
    if module is None:
        decompositions = get_decompositions(
            [
                torch.ops.aten.embedding_dense_backward,
                torch.ops.aten.native_layer_norm_backward,
                torch.ops.aten.slice_backward,
                torch.ops.aten.select_backward,
                torch.ops.aten.norm.ScalarOpt_dim,
                torch.ops.aten.native_group_norm,
                torch.ops.aten.upsample_bilinear2d.vec,
                torch.ops.aten.split.Tensor,
                torch.ops.aten.split_with_sizes,
            ]
        )
        tracer = make_fx(model, decomposition_table=decompositions)
        fx_g = tracer(inputs)

        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
        fx_g.recompile()
        strip_overloads(fx_g)

        ts_g = torch.jit.script(fx_g)
        print("Torchscript graph generated successfully")

        module = torch_mlir.compile(
            ts_g,
            inputs,
            torch_mlir.OutputType.LINALG_ON_TENSORS,
            use_tracing=False,
            verbose=False,
        )

    shark_module = SharkInference(
        str(module), "forward", device=args.device, mlir_dialect="linalg"
    )
    shark_module.compile()
    return shark_module
model_path = "models/RRDB_ESRGAN_x4.pth"  # models/RRDB_ESRGAN_x4.pth OR models/RRDB_PSNR_x4.pth
# device = torch.device('cuda')  # if you want to run on CPU, change 'cuda' -> cpu
device = torch.device("cpu")

test_img_folder = "InputImages/*"

# The model is built at module level because `inference` reads the global
# `model`.
model = RRDBNet(3, 3, 64, 23, gc=32)
# map_location places the checkpoint tensors on `device` regardless of the
# device they were saved from.
model.load_state_dict(torch.load(model_path, map_location=device), strict=True)
model.eval()
model = model.to(device)

print("Model path {:s}. \nTesting...".format(model_path))

if __name__ == "__main__":
    import os

    # cv2.imwrite only returns False when the target directory is missing,
    # so make sure the output directory exists before writing.
    os.makedirs("OutputImages", exist_ok=True)

    for idx, path in enumerate(glob.glob(test_img_folder), start=1):
        base = osp.splitext(osp.basename(path))[0]
        print(idx, base)

        # read images
        img = cv2.imread(path, cv2.IMREAD_COLOR)
        # cv2.imread returns None for files it cannot decode; skip those
        # instead of crashing on the arithmetic below.
        if img is None:
            print("Skipping {:s}: not a readable image".format(path))
            continue
        img = img * 1.0 / 255
        # Reverse the channel order and move channels first before batching.
        img = torch.from_numpy(
            np.transpose(img[:, :, [2, 1, 0]], (2, 0, 1))
        ).float()
        img_LR = img.unsqueeze(0)
        img_LR = img_LR.to(device)

        with torch.no_grad():
            shark_module = compile_through_fx(inference, img_LR)
            shark_output = shark_module.forward((img_LR,))
            shark_output = torch.from_numpy(shark_output)
            shark_output = (
                shark_output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
            )
            esrgan_output = (
                model(img_LR).data.squeeze().float().cpu().clamp_(0, 1).numpy()
            )

        # SHARK OUTPUT
        shark_output = np.transpose(shark_output[[2, 1, 0], :, :], (1, 2, 0))
        shark_output = (shark_output * 255.0).round()
        cv2.imwrite(
            "OutputImages/{:s}_rlt_shark_output.png".format(base), shark_output
        )
        print("Generated SHARK's output")

        # ESRGAN OUTPUT
        esrgan_output = np.transpose(esrgan_output[[2, 1, 0], :, :], (1, 2, 0))
        esrgan_output = (esrgan_output * 255.0).round()
        cv2.imwrite(
            "OutputImages/{:s}_rlt_esrgan_output.png".format(base),
            esrgan_output,
        )
        print("Generated ESRGAN's output")

View File

@@ -28,7 +28,7 @@ class AlbertModule(tf.Module):
self.m = TFAutoModelForMaskedLM.from_pretrained("albert-base-v2")
self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)
@tf.function(input_signature=t5_inputs, jit_compile=True)
@tf.function(input_signature=t5_inputs)
def forward(self, input_ids, attention_mask):
return self.m.predict(input_ids, attention_mask)

View File

@@ -1,9 +1,7 @@
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_model
from shark.shark_downloader import download_torch_model
mlir_model, func_name, inputs, golden_out = download_model(
"bloom", frontend="torch"
)
mlir_model, func_name, inputs, golden_out = download_torch_model("bloom")
shark_module = SharkInference(
mlir_model, func_name, device="cpu", mlir_dialect="tm_tensor"

View File

@@ -19,7 +19,7 @@ class GPT2Module(tf.Module):
self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)
@tf.function(input_signature=gpt2_inputs, jit_compile=True)
@tf.function(input_signature=gpt2_inputs)
def forward(self, input_ids, attention_mask):
return self.m.predict(input_ids, attention_mask)

View File

@@ -26,7 +26,7 @@ class BertModule(tf.Module):
input_ids=x, attention_mask=y, token_type_ids=z, training=False
)
@tf.function(input_signature=bert_input, jit_compile=True)
@tf.function(input_signature=bert_input)
def forward(self, input_ids, attention_mask, token_type_ids):
return self.m.predict(input_ids, attention_mask, token_type_ids)

View File

@@ -1,10 +1,9 @@
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_model
from shark.shark_downloader import download_torch_model
mlir_model, func_name, inputs, golden_out = download_model(
"microsoft/MiniLM-L12-H384-uncased",
frontend="torch",
mlir_model, func_name, inputs, golden_out = download_torch_model(
"microsoft/MiniLM-L12-H384-uncased"
)

View File

@@ -26,7 +26,7 @@ class BertModule(tf.Module):
input_ids=x, attention_mask=y, token_type_ids=z, training=False
)
@tf.function(input_signature=bert_input, jit_compile=True)
@tf.function(input_signature=bert_input)
def forward(self, input_ids, attention_mask, token_type_ids):
return self.m.predict(input_ids, attention_mask, token_type_ids)

View File

@@ -5,7 +5,7 @@ import torchvision.models as models
from torchvision import transforms
import sys
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_model
from shark.shark_downloader import download_torch_model
################################## Preprocessing inputs and model ############
@@ -66,12 +66,10 @@ labels = load_labels()
## Can pass any img or input to the forward module.
mlir_model, func_name, inputs, golden_out = download_model(
"resnet50", frontend="torch"
)
mlir_model, func_name, inputs, golden_out = download_torch_model("resnet50")
shark_module = SharkInference(mlir_model, func_name, mlir_dialect="linalg")
shark_module.compile()
# shark_module.compile()
path = shark_module.save_module()
shark_module.load_module(path)
result = shark_module.forward((img.detach().numpy(),))

View File

@@ -47,7 +47,7 @@ def load_mlir(mlir_loc):
return mlir_module
def compile_through_fx(model, inputs, mlir_loc=None, extra_args=[]):
def compile_through_fx(model, inputs, mlir_loc=None):
module = load_mlir(mlir_loc)
if mlir_loc == None:
@@ -98,12 +98,9 @@ def compile_through_fx(model, inputs, mlir_loc=None, extra_args=[]):
func_name = "forward"
shark_module = SharkInference(
mlir_model,
func_name,
device=args.device,
mlir_dialect="tm_tensor",
mlir_model, func_name, device=args.device, mlir_dialect="tm_tensor"
)
shark_module.compile(extra_args)
shark_module.compile()
return shark_module
@@ -164,7 +161,6 @@ if __name__ == "__main__":
unet,
(latent_model_input, torch.tensor([1.0]), text_embeddings),
args.mlir_loc,
["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
)
# torch.jit.script(unet)

View File

@@ -17,7 +17,7 @@ from keras_cv.models.generative.stable_diffusion.text_encoder import (
)
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_model
from shark.shark_downloader import download_tf_model
from PIL import Image
# pip install "git+https://github.com/keras-team/keras-cv.git"
@@ -75,8 +75,8 @@ class SharkStableDiffusion:
# Create models
self.text_encoder = TextEncoder(MAX_PROMPT_LENGTH)
mlir_model, func_name, inputs, golden_out = download_model(
"stable_diff", tank_url="gs://shark_tank/quinn", frontend="tf"
mlir_model, func_name, inputs, golden_out = download_tf_model(
"stable_diff", tank_url="gs://shark_tank/quinn"
)
shark_module = SharkInference(
mlir_model, func_name, device=device, mlir_dialect="mhlo"

View File

@@ -1,2 +0,0 @@
*.vmfb
*.jpg

View File

@@ -1,100 +0,0 @@
# STABLE DIFFUSION
## Installation
Follow setup instructions in the main [README.md](https://github.com/nod-ai/SHARK#readme) for regular usage.
## Using other supported Stable Diffusion variants with SHARK:
Currently we support fine-tuned versions of Stable Diffusion such as:
- [AnythingV3](https://huggingface.co/Linaqruf/anything-v3.0)
- [Analog Diffusion](https://huggingface.co/wavymulder/Analog-Diffusion)
Use the `--hf_model_id=` flag to specify the repo-id of the model to run.
```shell
python .\shark\examples\shark_inference\stable_diffusion\main.py --hf_model_id="Linaqruf/anything-v3.0" --max_length=77 --prompt="1girl, brown hair, green eyes, colorful, autumn, cumulonimbus clouds, lighting, blue sky, falling leaves, garden"
```
## Run a custom model using a HuggingFace `.ckpt` file:
* Install the required packages by running:
```shell
pip install omegaconf safetensors pytorch_lightning
```
* Download a [.ckpt](https://huggingface.co/andite/anything-v4.0/resolve/main/anything-v4.0-pruned-fp32.ckpt) file in case you don't have a locally generated `.ckpt` file for StableDiffusion.
* Now pass the `.ckpt` file above to the `ckpt_loc` command-line argument as follows:
```shell
python3.10 main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd" --max_length=64 --import_mlir --ckpt_loc="/path/to/.ckpt/file"
```
* This feature relies on a combination of two flags: `import_mlir` and `ckpt_loc`.
* If `ckpt_loc` is NOT specified, the [default](https://huggingface.co/stabilityai/stable-diffusion-2-1-base) HuggingFace repo-id given by `hf_model_id` is used instead. You can therefore combine `import_mlir` and `hf_model_id` to run HuggingFace's StableDiffusion variants.
* Use custom model `.ckpt` files from [HuggingFace-StableDiffusion](https://huggingface.co/models?other=stable-diffusion) to generate images.
## Running the model for a `batch_size` and for a set of `runs`:
We currently support batch size in the range `[1, 3]`.
You can specify batch size using `batch_size` flag (defaults to `1`) and the number of times you want to run the model using `runs` flag (defaults to `1`).
In total, you'll be able to generate `batch_size * runs` number of images.
- Usage 1: Using the same prompt -
```shell
python3.10 main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd" --max_length=64 --import_mlir --hf_model_id="runwayml/stable-diffusion-v1-5" --batch_size=3
```
The example above generates `3` different images in total with the same prompt `tajmahal, oil on canvas, sunflowers, 4k, uhd`.
- Usage 2: Using different prompts -
```shell
python3.10 main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd" --max_length=64 --import_mlir --hf_model_id="runwayml/stable-diffusion-v1-5" --batch_size=3 -p="batman riding a horse, oil on canvas, 4k, uhd" -p="superman riding a horse, oil on canvas, 4k, uhd"
```
The example above generates `1` image for each different prompt, thus generating `3` images in total.
- Usage 3: Using `runs` -
```shell
python3.10 main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd" --max_length=64 --import_mlir --hf_model_id="runwayml/stable-diffusion-v1-5" --batch_size=2 --runs=3
```
The example above generates `6` different images in total: `2` images for each of the `3` runs.
</details>
<details>
<summary>Debug Commands</summary>
## Debug commands and other advanced usage follow.
```shell
python main.py --precision="fp32"|"fp16" --device="cpu"|"cuda"|"vulkan" --import_mlir|--no-import_mlir --prompt "enter the text"
```
## Dump all dispatch .spv and ISA using amdllpc
```shell
python main.py --precision="fp16" --device="vulkan" --iree-vulkan-target-triple=rdna3-unknown-linux --no-load_vmfb --dispatch_benchmarks="all" --dispatch_benchmarks_dir="SD_dispatches" --dump_isa
```
## Compile and save the .vmfb (using vulkan fp16 as an example):
```shell
python shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb
```
## Capture an RGP trace
```shell
python shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb --enable_rgp
```
## Run the vae module with iree-benchmark-module (NCHW, fp16, vulkan, for example):
```shell
iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --device=vulkan --function_input=1x4x64x64xf16
```
## Run the unet module with iree-benchmark-module (same config as above):
```shell
##if you want to use .npz inputs:
unzip ~/.local/shark_tank/<your unet>/inputs.npz
iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --function_input=@arr_0.npy --function_input=1xf16 --function_input=@arr_2.npy --function_input=@arr_3.npy --function_input=@arr_4.npy
```
</details>

View File

@@ -1,25 +0,0 @@
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel

# Load the pretrained CLIP model together with its matching pre-processor.
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Download the sample image. A timeout keeps the script from hanging forever on
# a stalled connection, and the status check stops an HTTP error page from
# being handed to PIL as if it were image data.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Tokenize the candidate captions and pre-process the image in one call.
inputs = processor(
    text=["a photo of a cat", "a photo of a dog"],
    images=image,
    return_tensors="pt",
    padding=True,
)

outputs = model(**inputs)
logits_per_image = (
    outputs.logits_per_image
)  # this is the image-text similarity score
probs = logits_per_image.softmax(
    dim=1
)  # we can take the softmax to get the label probabilities

View File

@@ -1,330 +0,0 @@
import os
import sys
if "AMD_ENABLE_LLPC" not in os.environ:
os.environ["AMD_ENABLE_LLPC"] = "1"
if sys.platform == "darwin":
os.environ["DYLD_LIBRARY_PATH"] = "/usr/local/lib"
from transformers import CLIPTextModel, CLIPTokenizer
import torch
from PIL import Image
from diffusers import (
LMSDiscreteScheduler,
PNDMScheduler,
DDIMScheduler,
DPMSolverMultistepScheduler,
EulerDiscreteScheduler,
)
from tqdm.auto import tqdm
import numpy as np
from random import randint
from stable_args import args
from datetime import datetime as dt
import json
import re
from pathlib import Path
from model_wrappers import SharkifyStableDiffusionModel
# This has to come before importing cache objects
if args.clear_all:
    print("CLEARING ALL, EXPECT SEVERAL MINUTES TO RECOMPILE")
    from glob import glob
    import shutil

    # Remove every compiled module (*.vmfb) found in the current working
    # directory.
    vmfbs = glob(os.path.join(os.getcwd(), "*.vmfb"))
    for vmfb in vmfbs:
        if os.path.exists(vmfb):
            os.remove(vmfb)

    # Remove the per-user caches. A cache directory that does not exist is not
    # an error for a "clear everything" request, hence ignore_errors=True.
    home = os.path.expanduser("~")
    if os.name == "nt":  # Windows
        appdata = os.getenv("LOCALAPPDATA")
        # LOCALAPPDATA can be unset; os.path.join(None, ...) would raise.
        if appdata:
            shutil.rmtree(
                os.path.join(appdata, "AMD/VkCache"), ignore_errors=True
            )
        shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
    elif os.name == "posix":
        # os.name is "posix" on Linux and macOS; it is never "unix", so the
        # previous comparison could not match and nothing was cleared.
        shutil.rmtree(
            os.path.join(home, ".cache/AMD/VkCache"), ignore_errors=True
        )
        shutil.rmtree(
            os.path.join(home, ".local/shark_tank"), ignore_errors=True
        )
from utils import set_init_device_flags, disk_space_check, preprocessCKPT
from schedulers import (
SharkEulerDiscreteScheduler,
)
import time
from shark.iree_utils.compile_utils import dump_isas
# Helper function to profile the vulkan device.
def start_profiling(file_path="foo.rdc", profiling_mode="queue"):
    """Begin profiling on the configured device when it has been requested.

    Profiling starts only if `args.vulkan_debug_utils` is truthy and the string
    "vulkan" occurs in `args.device`.

    Args:
        file_path: forwarded to `begin_profiling` as `file_path`.
        profiling_mode: forwarded to `begin_profiling` as `mode`.

    Returns:
        The device object on which profiling was started, or None when
        profiling was not requested.
    """
    profiling_requested = args.vulkan_debug_utils and "vulkan" in args.device
    if not profiling_requested:
        return None

    import iree

    print(f"Profiling and saving to {file_path}.")
    device = iree.runtime.get_device(args.device)
    device.begin_profiling(mode=profiling_mode, file_path=file_path)
    return device
def end_profiling(device):
    """Stop profiling on `device`; do nothing for a falsy `device`.

    Returns:
        Whatever `device.end_profiling()` returns, or None when `device` is
        falsy (for example the None returned by `start_profiling`).
    """
    return device.end_profiling() if device else None
# Script entry point. Obtains the clip/unet/vae callables, picks a tokenizer and
# scheduler, then performs `args.runs` generations, saving every image together
# with a JSON file describing how it was produced.
# NOTE(review): indentation is not visible in this view of the file, so any
# nesting implied by the comments below is inferred from statement order;
# confirm against the real source before relying on it.
if __name__ == "__main__":
# Any precision other than "fp32" selects half precision.
dtype = torch.float32 if args.precision == "fp32" else torch.half
# Fall back to a default prompt when none was supplied.
if len(args.prompts) == 0:
args.prompts = ["cyberpunk forest by Salvador Dali"]
prompt = args.prompts
# Same list object as args.negative_prompts: the padding appended further down
# is therefore also visible through args.negative_prompts when metadata is
# written.
neg_prompt = args.negative_prompts
height = args.height
width = args.width
num_inference_steps = args.steps # Number of denoising steps
# Scale for classifier-free guidance
guidance_scale = torch.tensor(args.guidance_scale).to(torch.float32)
batch_size = args.batch_size
# A single prompt is repeated batch_size times; several prompts are used as
# given and must number exactly batch_size (asserted below).
prompt = prompt * batch_size if len(prompt) == 1 else prompt
len_of_prompt = len(prompt)
assert (
len_of_prompt == batch_size
), f"no. of prompts ({len_of_prompt}) is not equal to batch_size ({batch_size})"
print("Running StableDiffusion with the following config :-")
print(f"Batch size : {batch_size}")
print(f"Prompts : {prompt}")
print(f"Runs : {args.runs}")
# Try to make neg_prompt equal to batch_size by appending blank strings.
for i in range(batch_size - len(neg_prompt)):
neg_prompt.append("")
set_init_device_flags()
disk_space_check(Path.cwd())
# Two ways of obtaining the three model callables: the getters in `opt_params`
# when --import_mlir is off, otherwise SharkifyStableDiffusionModel.
if not args.import_mlir:
from opt_params import get_unet, get_vae, get_clip
clip = get_clip()
unet = get_unet()
vae = get_vae()
else:
# A checkpoint path containing ".ckpt" is pre-processed first.
if ".ckpt" in args.ckpt_loc:
preprocessCKPT()
mlir_import = SharkifyStableDiffusionModel(
args.hf_model_id,
args.ckpt_loc,
args.precision,
max_len=args.max_length,
batch_size=batch_size,
height=height,
width=width,
use_base_vae=args.use_base_vae,
use_tuned=args.use_tuned,
)
clip, unet, vae = mlir_import()
if args.dump_isa:
dump_isas(args.dispatch_benchmarks_dir)
# Default tokenizer and scheduler; both may be replaced below depending on
# args.hf_model_id.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
scheduler = DPMSolverMultistepScheduler.from_pretrained(
"CompVis/stable-diffusion-v1-4",
subfolder="scheduler",
)
# True means the scheduler consumes/produces torch tensors, so values are
# converted to and from numpy around each model call. It is only set to False
# for the compiled SharkEulerDiscreteScheduler below.
cpu_scheduling = True
if args.hf_model_id == "stabilityai/stable-diffusion-2-1":
tokenizer = CLIPTokenizer.from_pretrained(
"stabilityai/stable-diffusion-2-1", subfolder="tokenizer"
)
scheduler = DPMSolverMultistepScheduler.from_pretrained(
"stabilityai/stable-diffusion-2-1",
subfolder="scheduler",
)
if args.hf_model_id == "stabilityai/stable-diffusion-2-1-base":
tokenizer = CLIPTokenizer.from_pretrained(
"stabilityai/stable-diffusion-2-1-base", subfolder="tokenizer"
)
if args.use_compiled_scheduler:
scheduler = SharkEulerDiscreteScheduler.from_pretrained(
"stabilityai/stable-diffusion-2-1-base",
subfolder="scheduler",
)
scheduler.compile()
cpu_scheduling = False
else:
scheduler = EulerDiscreteScheduler.from_pretrained(
"stabilityai/stable-diffusion-2-1-base",
subfolder="scheduler",
)
# One full generation per iteration.
for run in range(args.runs):
# Handle out of range seeds.
# args.seed is only honoured on the first run and only when it lies inside
# [uint32_min, uint32_max); every later run draws a fresh random seed.
uint32_info = np.iinfo(np.uint32)
uint32_min, uint32_max = uint32_info.min, uint32_info.max
seed = args.seed
if run >= 1 or seed < uint32_min or seed >= uint32_max:
seed = randint(uint32_min, uint32_max)
generator = torch.manual_seed(
seed
) # Seed generator to create the initial latent noise
# create a random initial latent.
# Sampled in float32 and then cast to the working dtype.
latents = torch.randn(
(batch_size, 4, height // 8, width // 8),
generator=generator,
dtype=torch.float32,
).to(dtype)
if run == 0:
# Warmup phase to improve performance.
# vae and clip are invoked args.warmup_count times on throwaway inputs; the
# results are discarded.
if args.warmup_count >= 1:
vae_warmup_input = torch.clone(latents).detach().numpy()
clip_warmup_input = torch.randint(1, 2, (2, args.max_length))
for i in range(args.warmup_count):
vae("forward", (vae_warmup_input,))
clip("forward", (clip_warmup_input,))
# Start of the wall-clock measurement reported as total generation time.
start = time.time()
# Tokenization happens on the first run only; `text_input` is reused by later
# runs.
if run == 0:
text_input = tokenizer(
prompt,
padding="max_length",
max_length=args.max_length,
truncation=True,
return_tensors="pt",
)
max_length = text_input.input_ids.shape[-1]
uncond_input = tokenizer(
neg_prompt,
padding="max_length",
max_length=max_length,
truncation=True,
return_tensors="pt",
)
# Negative-prompt ids come first, prompt ids second.
text_input = torch.cat(
[uncond_input.input_ids, text_input.input_ids]
)
clip_inf_start = time.time()
text_embeddings = clip("forward", (text_input,))
clip_inf_end = time.time()
text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
text_embeddings_numpy = text_embeddings.detach().numpy()
scheduler.set_timesteps(num_inference_steps)
# NOTE(review): presumably silences the scheduler's "scale_model_input was not
# called" check; confirm against the diffusers scheduler implementation.
scheduler.is_scale_input_called = True
latents = latents * scheduler.init_noise_sigma
# Accumulates per-step time in seconds; converted to an average in ms below.
avg_ms = 0
# Denoising loop: one unet call and one scheduler step per timestep.
for i, t in tqdm(
enumerate(scheduler.timesteps), disable=args.hide_steps
):
step_start = time.time()
if not args.hide_steps:
print(f"i = {i} t = {t}", end="")
timestep = torch.tensor([t]).to(dtype).detach().numpy()
latent_model_input = scheduler.scale_model_input(latents, t)
if cpu_scheduling:
latent_model_input = latent_model_input.detach().numpy()
profile_device = start_profiling(file_path="unet.rdc")
# The result is not copied back automatically (send_to_host=False); it is
# moved to the host explicitly below when cpu_scheduling is set.
noise_pred = unet(
"forward",
(
latent_model_input,
timestep,
text_embeddings_numpy,
guidance_scale,
),
send_to_host=False,
)
end_profiling(profile_device)
if cpu_scheduling:
noise_pred = torch.from_numpy(noise_pred.to_host())
latents = scheduler.step(noise_pred, t, latents).prev_sample
else:
# The compiled scheduler's step() result is used directly as the new latents.
latents = scheduler.step(noise_pred, t, latents)
step_time = time.time() - step_start
avg_ms += step_time
step_ms = int((step_time) * 1000)
if not args.hide_steps:
print(f" ({step_ms}ms)")
# scale and decode the image latents with vae
# With --use_base_vae the latent scaling is done here on the host.
if args.use_base_vae:
latents = 1 / 0.18215 * latents
latents_numpy = latents
if cpu_scheduling:
latents_numpy = latents.detach().numpy()
profile_device = start_profiling(file_path="vae.rdc")
vae_start = time.time()
images = vae("forward", (latents_numpy,))
vae_end = time.time()
end_profiling(profile_device)
# With --use_base_vae the vae output is multiplied by 255 and rounded here.
if args.use_base_vae:
image = torch.from_numpy(images)
image = (image.detach().cpu() * 255.0).numpy()
images = image.round()
end_time = time.time()
# Average is taken over args.steps, not over the number of timesteps iterated.
avg_ms = 1000 * avg_ms / args.steps
clip_inf_time = (clip_inf_end - clip_inf_start) * 1000
vae_inf_time = (vae_end - vae_start) * 1000
total_time = end_time - start
print(f"\nStats for run {run}:")
print(f"Average step time: {avg_ms}ms/it")
print(f"Clip Inference time (ms) = {clip_inf_time:.3f}")
print(f"VAE Inference time (ms): {vae_inf_time:.3f}")
print(f"\nTotal image generation time: {total_time}sec")
# assumes `images` is laid out (batch, channels, height, width) so the permute
# yields channels-last arrays for PIL; TODO confirm against the vae output.
images = torch.from_numpy(images).to(torch.uint8).permute(0, 2, 3, 1)
pil_images = [Image.fromarray(image) for image in images.numpy()]
# Images go to args.output_dir (created if missing) or the working directory.
if args.output_dir is not None:
output_path = Path(args.output_dir)
output_path.mkdir(parents=True, exist_ok=True)
else:
output_path = Path.cwd()
disk_space_check(output_path, lim=5)
# Save each image of the batch plus a JSON sidecar with its settings.
for i in range(batch_size):
json_store = {
"prompt": prompt[i],
"negative prompt": args.negative_prompts[i],
"seed": seed,
"hf_model_id": args.hf_model_id,
"precision": args.precision,
"steps": args.steps,
"guidance_scale": args.guidance_scale,
"scheduler": args.scheduler,
}
# File name: first 15 prompt characters with every non-alphanumeric replaced
# by "_", followed by seed, run index, batch index and a timestamp.
prompt_slice = re.sub("[^a-zA-Z0-9]", "_", prompt[i][:15])
img_name = f"{prompt_slice}_{seed}_{run}_{i}_{dt.now().strftime('%y%m%d_%H%M%S')}"
if args.output_img_format == "jpg":
pil_images[i].save(
output_path / f"{img_name}.jpg",
quality=95,
subsampling=0,
optimize=True,
progressive=True,
)
else:
# Every format other than "jpg" is written as PNG; unsupported names are
# reported below.
pil_images[i].save(output_path / f"{img_name}.png", "PNG")
if args.output_img_format not in ["png", "jpg"]:
print(
f"[ERROR] Format {args.output_img_format} is not supported yet."
"saving image as png. Supported formats png / jpg"
)
with open(output_path / f"{img_name}.json", "w") as f:
f.write(json.dumps(json_store, indent=4))

View File

@@ -1,251 +0,0 @@
from diffusers import AutoencoderKL, UNet2DConditionModel
from transformers import CLIPTextModel
from utils import compile_through_fx, get_opt_flags
from resources import base_models
from collections import defaultdict
import torch
import sys
# These shapes are parameter dependent.
def replace_shape_str(shape, max_len, width, height, batch_size):
    """Resolve the symbolic entries of `shape` into concrete dimensions.

    Recognised string placeholders:
      * "max_len", "height", "width" -> the corresponding argument;
      * a "*"-separated product containing "batch_size" exactly once, with
        integer factors otherwise (e.g. "2*batch_size", or plain
        "batch_size") -> batch_size times the product of the integers.

    Non-string entries are copied through unchanged.

    Args:
        shape: iterable of ints and/or placeholder strings.
        max_len, width, height, batch_size: values substituted for the
            placeholders.

    Returns:
        A new list with one entry per entry of `shape`.

    Raises:
        ValueError: for a string that is not a recognised placeholder. Such
            entries used to be dropped silently, which produced a shape of
            the wrong rank.
    """
    named_dims = {"max_len": max_len, "height": height, "width": width}
    new_shape = []
    for dim in shape:
        if not isinstance(dim, str):
            new_shape.append(dim)
        elif dim in named_dims:
            new_shape.append(named_dims[dim])
        elif "batch_size" in dim:
            factors = [part.strip() for part in dim.split("*")]
            if factors.count("batch_size") != 1:
                raise ValueError(f"unsupported batch_size expression: {dim!r}")
            mul_val = 1
            for part in factors:
                if part != "batch_size":
                    # int() raises ValueError for a non-integer factor.
                    mul_val *= int(part)
            new_shape.append(batch_size * mul_val)
        else:
            raise ValueError(f"unknown shape placeholder: {dim!r}")
    return new_shape
# Get the input info for various models i.e. "unet", "clip", "vae".
def get_input_info(model_info, max_len, width, height, batch_size):
    """Create one sample tensor for every input described in `model_info`.

    `model_info` maps a model key to a mapping of input name ->
    {"shape": ..., "dtype": "f32" | "i64"}. A list shape is resolved with
    `replace_shape_str` and filled with random values (`torch.randint(1, 3)`
    for i64, `torch.randn` otherwise); an int "shape" becomes a scalar tensor
    holding that value. Any other shape type terminates the process.

    Returns:
        A defaultdict(list) mapping each model key to its list of tensors, in
        the order the inputs are listed.
    """
    dtype_config = {"f32": torch.float32, "i64": torch.int64}
    input_map = defaultdict(list)
    for model_key, model_inputs in model_info.items():
        for spec in model_inputs.values():
            shape = spec["shape"]
            dtype = dtype_config[spec["dtype"]]
            if isinstance(shape, list):
                dims = replace_shape_str(
                    shape, max_len, width, height, batch_size
                )
                tensor = (
                    torch.randint(1, 3, tuple(dims))
                    if dtype == torch.int64
                    else torch.randn(*dims).to(dtype)
                )
            elif isinstance(shape, int):
                tensor = torch.tensor(shape).to(dtype)
            else:
                sys.exit("shape isn't specified correctly.")
            input_map[model_key].append(tensor)
    return input_map
class SharkifyStableDiffusionModel:
    """Compile the clip, unet and vae parts of a Stable Diffusion model.

    Calling an instance walks over the configurations in `base_models`, builds
    sample inputs for each and tries to compile all three sub-models with
    `compile_through_fx`; the first configuration that compiles wins.
    """

    def __init__(
        self,
        model_id: str,
        custom_weights: str,
        precision: str,
        max_len: int = 64,
        width: int = 512,
        height: int = 512,
        batch_size: int = 1,
        use_base_vae: bool = False,
        use_tuned: bool = False,
    ):
        """Validate the parameters and derive the name used for the artifacts.

        Args:
            model_id: model identifier, used unless `custom_weights` is set.
            custom_weights: when non-empty, replaces `model_id`.
            precision: precision string; "fp16" enables the f16 paths.
            max_len: must lie in [32, 77].
            width, height: must be multiples of 8 and at least 384; they are
                stored divided by 8.
            batch_size: stored and forwarded when building sample inputs.
            use_base_vae: selects the vae variant (see `get_vae`).
            use_tuned: forwarded to `compile_through_fx` for unet and vae.
        """
        self.check_params(max_len, width, height)
        self.max_len = max_len
        self.height = height // 8
        self.width = width // 8
        self.batch_size = batch_size
        self.model_id = model_id if custom_weights == "" else custom_weights
        self.precision = precision
        self.base_vae = use_base_vae
        # Note: the name carries the full-size height/width, not the //8 ones.
        self.model_name = "_".join(
            [str(batch_size), str(max_len), str(height), str(width), precision]
        )
        self.use_tuned = use_tuned
        # We need a better naming convention for the .vmfbs because despite
        # using the custom model variant the .vmfb names remain the same and
        # it'll always pick up the compiled .vmfb instead of compiling the
        # custom model.
        # So, currently, we add `self.model_id` in the `self.model_name` of
        # .vmfb file.
        # TODO: Have a better way of naming the vmfbs using self.model_name.
        import re

        model_name = re.sub(r"\W+", "_", self.model_id)
        # startswith() rather than model_name[0]: an empty id must not raise
        # IndexError here.
        if model_name.startswith("_"):
            model_name = model_name[1:]
        self.model_name = self.model_name + "_" + model_name

    def check_params(self, max_len, width, height):
        """Exit the process (SystemExit) when a parameter is out of range."""
        if not (32 <= max_len <= 77):
            sys.exit("please specify max_len in the range [32, 77].")
        # 384 itself is accepted, so the messages say "at least".
        if not (width % 8 == 0 and width >= 384):
            sys.exit("width should be at least 384 and a multiple of 8")
        if not (height % 8 == 0 and height >= 384):
            sys.exit("height should be at least 384 and a multiple of 8")

    def get_vae(self):
        """Wrap the vae decoder in a module and compile it.

        Requires `self.inputs` to have been set (done by `__call__`).
        """

        class VaeModel(torch.nn.Module):
            def __init__(self, model_id=self.model_id, base_vae=self.base_vae):
                super().__init__()
                self.vae = AutoencoderKL.from_pretrained(
                    model_id,
                    subfolder="vae",
                )
                self.base_vae = base_vae

            def forward(self, input):
                # The non-base variant scales the latents before decoding and
                # converts the result to rounded 0..255 values afterwards; the
                # base variant returns the clamped 0..1 output as is.
                if not self.base_vae:
                    input = 1 / 0.18215 * input
                x = self.vae.decode(input, return_dict=False)[0]
                x = (x / 2 + 0.5).clamp(0, 1)
                if self.base_vae:
                    return x
                x = x * 255.0
                return x.round()

        vae = VaeModel()
        inputs = tuple(self.inputs["vae"])
        is_f16 = self.precision == "fp16"
        vae_name = "base_vae" if self.base_vae else "vae"
        shark_vae = compile_through_fx(
            vae,
            inputs,
            is_f16=is_f16,
            model_name=vae_name + self.model_name,
            use_tuned=self.use_tuned,
            extra_args=get_opt_flags("vae", precision=self.precision),
        )
        return shark_vae

    def get_unet(self):
        """Wrap the unet (with guidance folded in) in a module and compile it.

        Requires `self.inputs` to have been set (done by `__call__`).
        """

        class UnetModel(torch.nn.Module):
            def __init__(self, model_id=self.model_id):
                super().__init__()
                self.unet = UNet2DConditionModel.from_pretrained(
                    model_id,
                    subfolder="unet",
                )
                self.in_channels = self.unet.in_channels
                self.train(False)

            def forward(
                self, latent, timestep, text_embedding, guidance_scale
            ):
                # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
                latents = torch.cat([latent] * 2)
                unet_out = self.unet.forward(
                    latents, timestep, text_embedding, return_dict=False
                )[0]
                noise_pred_uncond, noise_pred_text = unet_out.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (
                    noise_pred_text - noise_pred_uncond
                )
                return noise_pred

        unet = UnetModel()
        is_f16 = self.precision == "fp16"
        inputs = tuple(self.inputs["unet"])
        # One flag per forward() argument, passed as f16_input_mask; the last
        # argument (guidance_scale) is flagged False.
        input_mask = [True, True, True, False]
        shark_unet = compile_through_fx(
            unet,
            inputs,
            model_name="unet" + self.model_name,
            is_f16=is_f16,
            f16_input_mask=input_mask,
            use_tuned=self.use_tuned,
            extra_args=get_opt_flags("unet", precision=self.precision),
        )
        return shark_unet

    def get_clip(self):
        """Wrap the text encoder in a module and compile it (fp32 flags).

        Requires `self.inputs` to have been set (done by `__call__`).
        """

        class CLIPText(torch.nn.Module):
            def __init__(self, model_id=self.model_id):
                super().__init__()
                self.text_encoder = CLIPTextModel.from_pretrained(
                    model_id,
                    subfolder="text_encoder",
                )

            def forward(self, input):
                return self.text_encoder(input)[0]

        clip_model = CLIPText()
        shark_clip = compile_through_fx(
            clip_model,
            tuple(self.inputs["clip"]),
            model_name="clip" + self.model_name,
            extra_args=get_opt_flags("clip", precision="fp32"),
        )
        return shark_clip

    def __call__(self):
        """Compile all three sub-models, trying each base configuration.

        Returns:
            (compiled_clip, compiled_unet, compiled_vae) for the first entry of
            `base_models` whose inputs allow all three to compile.

        Exits the process when no configuration works.
        """
        from utils import get_vmfb_path_name
        from stable_args import args
        import traceback, os

        model_name = ["clip", "base_vae" if self.base_vae else "vae", "unet"]
        vmfb_path = [
            get_vmfb_path_name(model + self.model_name)[0]
            for model in model_name
        ]
        for model_id in base_models:
            self.inputs = get_input_info(
                base_models[model_id],
                self.max_len,
                self.width,
                self.height,
                self.batch_size,
            )
            try:
                compiled_unet = self.get_unet()
                compiled_vae = self.get_vae()
                compiled_clip = self.get_clip()
            except Exception:
                if args.enable_stack_trace:
                    traceback.print_exc()
                vmfb_present = [os.path.isfile(vmfb) for vmfb in vmfb_path]
                # We need to delete vmfbs only if some of the models were compiled.
                if not all(vmfb_present):
                    for path, present in zip(vmfb_path, vmfb_present):
                        if present:
                            os.remove(path)
                            print("Deleted: ", path)
                # NOTE(review): the nesting of this message relative to the
                # cleanup branch above was inferred; confirm.
                print("Retrying with a different base model configuration")
                continue
            # This is done just because in main.py we are basing the choice of tokenizer and scheduler
            # on `args.hf_model_id`. Since now, we don't maintain 1:1 mapping of variants and the base
            # model and rely on retrying method to find the input configuration, we should also update
            # the knowledge of base model id accordingly into `args.hf_model_id`.
            if args.ckpt_loc != "":
                args.hf_model_id = model_id
            return compiled_clip, compiled_unet, compiled_vae
        sys.exit(
            "Cannot compile the model. Please use `enable_stack_trace` and create an issue at https://github.com/nod-ai/SHARK/issues"
        )

View File

@@ -1,109 +0,0 @@
import sys
from resources import models_db
from stable_args import args
from utils import get_shark_model
# This module only handles a single prompt; anything else aborts at import.
BATCH_SIZE = len(args.prompts)
if BATCH_SIZE != 1:
    sys.exit("Only batch size 1 is supported.")

# HuggingFace repo-id -> [variant, version]; both parts are used to build the
# lookup keys into `models_db` in the getters below.
hf_model_variant_map = {
    "Linaqruf/anything-v3.0": ["anythingv3", "v2_1base"],
    "dreamlike-art/dreamlike-diffusion-1.0": ["dreamlike", "v2_1base"],
    "prompthero/openjourney": ["openjourney", "v2_1base"],
    "wavymulder/Analog-Diffusion": ["analogdiffusion", "v2_1base"],
    "stabilityai/stable-diffusion-2-1": ["stablediffusion", "v2_1"],
    "stabilityai/stable-diffusion-2-1-base": ["stablediffusion", "v2_1base"],
    "CompVis/stable-diffusion-v1-4": ["stablediffusion", "v1_4"],
}

# Raises KeyError for a repo-id that is not listed above.
variant, version = hf_model_variant_map[args.hf_model_id]
def get_params(bucket_key, model_key, model, is_tuned, precision):
    """Look up bucket, model name and compile flags in `models_db`.

    Args:
        bucket_key: key into `models_db[0]`.
        model_key: key into `models_db[1]`.
        model, is_tuned, precision: successive keys into `models_db[2]`
            selecting the flag configuration.

    Returns:
        (bucket, model_name, iree_flags). The flags start with the
        command-line/platform ones, followed by the entry's default flags and,
        when present, its device-specific flags.

    Raises:
        Exception: when any of the lookups inside the try block hits a
            missing key.
    """
    iree_flags = []
    if len(args.iree_vulkan_target_triple) > 0:
        iree_flags.append(
            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
        )

    # Disable bindings fusion to work with moltenVK.
    if sys.platform == "darwin":
        iree_flags.append("-iree-stream-fuse-binding=false")

    try:
        bucket = models_db[0][bucket_key]
        model_name = models_db[1][model_key]
        flag_cfg = models_db[2][model][is_tuned][precision]
        iree_flags += flag_cfg["default_compilation_flags"]
    except KeyError:
        raise Exception(
            f"{bucket_key}/{model_key} is not present in the models database"
        )

    if "specified_compilation_flags" in flag_cfg:
        per_device_flags = flag_cfg["specified_compilation_flags"]
        # Keep only the part before "://"; a device string without "://" is
        # returned whole by split().
        device = args.device.split("://")[0]
        if device not in per_device_flags:
            device = "default_device"
        iree_flags += per_device_flags[device]

    return bucket, model_name, iree_flags
def get_unet():
    """Resolve the unet entry in the models database and load it.

    The lookup keys carry an extra `/{args.device}` suffix when the tuned
    model is requested for a device whose name does not contain "vulkan".
    """
    # Tuned model is present only for `fp16` precision.
    is_tuned = "tuned" if args.use_tuned else "untuned"
    device_specific = "vulkan" not in args.device and args.use_tuned
    if not device_specific:
        bucket_key = f"{variant}/{is_tuned}"
        model_key = f"{variant}/{version}/unet/{args.precision}/length_{args.max_length}/{is_tuned}"
    else:
        bucket_key = f"{variant}/{is_tuned}/{args.device}"
        model_key = f"{variant}/{version}/unet/{args.precision}/length_{args.max_length}/{is_tuned}/{args.device}"
    # get_params returns (bucket, model_name, iree_flags), in that order.
    return get_shark_model(
        *get_params(bucket_key, model_key, "unet", is_tuned, args.precision)
    )
def get_vae():
    """Resolve the vae entry in the models database and load it.

    The model key always uses `length_77` and gains a `/base` segment when
    `args.use_base_vae` is set. Both keys carry an extra `/{args.device}`
    suffix when the tuned model is requested for a device whose name does not
    contain "vulkan".
    """
    # Tuned model is present only for `fp16` precision.
    is_tuned = "tuned" if args.use_tuned else "untuned"
    is_base = "/base" if args.use_base_vae else ""
    device_specific = "vulkan" not in args.device and args.use_tuned
    if not device_specific:
        bucket_key = f"{variant}/{is_tuned}"
        model_key = f"{variant}/{version}/vae/{args.precision}/length_77/{is_tuned}{is_base}"
    else:
        bucket_key = f"{variant}/{is_tuned}/{args.device}"
        model_key = f"{variant}/{version}/vae/{args.precision}/length_77/{is_tuned}{is_base}/{args.device}"
    # get_params returns (bucket, model_name, iree_flags), in that order.
    return get_shark_model(
        *get_params(bucket_key, model_key, "vae", is_tuned, args.precision)
    )
def get_clip():
    """Resolve the clip entry in the models database and load it.

    The lookup is always for the untuned fp32 entry, whatever
    `args.use_tuned` and `args.precision` say.
    """
    bucket_key = f"{variant}/untuned"
    model_key = (
        f"{variant}/{version}/clip/fp32/length_{args.max_length}/untuned"
    )
    # get_params returns (bucket, model_name, iree_flags), in that order.
    return get_shark_model(
        *get_params(bucket_key, model_key, "clip", "untuned", "fp32")
    )

View File

@@ -1,44 +0,0 @@
Compile / Run Instructions:
To compile .vmfb for SD (vae, unet, CLIP), run the following commands with the .mlir in your local shark_tank cache (default location for Linux users is `~/.local/shark_tank`). These will be available once the script from [this README](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md) is run once.
Running the script mentioned above with the `--save_vmfb` flag will also save the .vmfb in your SHARK base directory if you want to skip straight to benchmarks.
Compile Commands FP32/FP16:
```shell
Vulkan AMD:
iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
# add --mlir-print-debuginfo --mlir-print-op-on-diagnostic=true for debug
# use iree-input-type=mhlo for tf models
CUDA NVIDIA:
iree-compile --iree-input-type=none --iree-hal-target-backends=cuda --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
CPU:
iree-compile --iree-input-type=none --iree-hal-target-backends=llvm-cpu --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
```
Run / Benchmark Command (FP32 - NCHW):
(NEED to use BS=2 since we do two forward passes to unet as a result of classifier free guidance.)
```shell
## Vulkan AMD:
iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --device=vulkan --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
## CUDA:
iree-benchmark-module --module_file=/path/to/vmfb --entry_function=forward --device=cuda --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
## CPU:
iree-benchmark-module --module_file=/path/to/vmfb --entry_function=forward --device=local-task --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
```
Run via vulkan_gui for RGP Profiling:
To build the vulkan app for profiling UNet follow the instructions [here](https://github.com/nod-ai/SHARK/tree/main/cpp) and then run the following command from the cpp directory with your compiled stable_diff.vmfb
```shell
./build/vulkan_gui/iree-vulkan-gui --module_file=/path/to/unet.vmfb --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
```

View File

@@ -1,37 +0,0 @@
import os
import json
import sys
def resource_path(relative_path):
    """Get absolute path to resource, works for dev and for PyInstaller"""
    # Directory containing this file; used unless `sys._MEIPASS` is set.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    root = getattr(sys, "_MEIPASS", module_dir)
    return os.path.join(root, relative_path)
def get_json_file(path):
    """Load the JSON resource at `path` (resolved through `resource_path`).

    Returns:
        The parsed JSON content, or an empty list when the file does not
        exist. A message is printed whenever the result is empty/falsy.
    """
    json_location = resource_path(path)
    contents = []
    if os.path.exists(json_location):
        with open(json_location, encoding="utf-8") as handle:
            contents = json.load(handle)

    if not contents:
        print(f"Unable to fetch {path}")
    return contents
# TODO: These loads shouldn't happen at module level: they run as a side
# effect of importing this file.
# Each name below is [] when its file is missing (see get_json_file).
prompts_examples = get_json_file("resources/prompts.json")
models_db = get_json_file("resources/model_db.json")
# The base_model contains the input configuration for the different
# models and also helps in providing information for the variants.
base_models = get_json_file("resources/base_model.json")
# Contains optimization flags for different models.
opt_flags = get_json_file("resources/opt_flags.json")

View File

@@ -1,98 +0,0 @@
{
"stabilityai/stable-diffusion-2-1": {
"unet": {
"latents": {
"shape": [
"1*batch_size",
4,
"height",
"width"
],
"dtype": "f32"
},
"timesteps": {
"shape": [
1
],
"dtype": "f32"
},
"embedding": {
"shape": [
"2*batch_size",
"max_len",
1024
],
"dtype": "f32"
},
"guidance_scale": {
"shape": 2,
"dtype": "f32"
}
},
"vae": {
"latents" : {
"shape" : [
"1*batch_size",4,"height","width"
],
"dtype":"f32"
}
},
"clip": {
"token" : {
"shape" : [
"2*batch_size",
"max_len"
],
"dtype":"i64"
}
}
},
"CompVis/stable-diffusion-v1-4": {
"unet": {
"latents": {
"shape": [
"1*batch_size",
4,
"height",
"width"
],
"dtype": "f32"
},
"timesteps": {
"shape": [
1
],
"dtype": "f32"
},
"embedding": {
"shape": [
"2*batch_size",
"max_len",
768
],
"dtype": "f32"
},
"guidance_scale": {
"shape": 2,
"dtype": "f32"
}
},
"vae": {
"latents" : {
"shape" : [
"1*batch_size",4,"height","width"
],
"dtype":"f32"
}
},
"clip": {
"token" : {
"shape" : [
"2*batch_size",
"max_len"
],
"dtype":"i64"
}
}
}
}

View File

@@ -1,21 +0,0 @@
[
{
"stablediffusion/v1_4":"CompVis/stable-diffusion-v1-4",
"stablediffusion/v2_1base":"stabilityai/stable-diffusion-2-1-base",
"stablediffusion/v2_1":"stabilityai/stable-diffusion-2-1",
"anythingv3/v1_4":"Linaqruf/anything-v3.0",
"analogdiffusion/v1_4":"wavymulder/Analog-Diffusion",
"openjourney/v1_4":"prompthero/openjourney",
"dreamlike/v1_4":"dreamlike-art/dreamlike-diffusion-1.0"
},
{
"stablediffusion/fp16":"fp16",
"stablediffusion/fp32":"main",
"anythingv3/fp16":"diffusers",
"anythingv3/fp32":"diffusers",
"analogdiffusion/fp16":"main",
"analogdiffusion/fp32":"main",
"openjourney/fp16":"main",
"openjourney/fp32":"main"
}
]

View File

@@ -1,177 +0,0 @@
[
{
"stablediffusion/untuned":"gs://shark_tank/stable_diffusion",
"stablediffusion/tuned":"gs://shark_tank/sd_tuned",
"stablediffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
"anythingv3/untuned":"gs://shark_tank/sd_anythingv3",
"anythingv3/tuned":"gs://shark_tank/sd_tuned",
"anythingv3/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
"analogdiffusion/untuned":"gs://shark_tank/sd_analog_diffusion",
"analogdiffusion/tuned":"gs://shark_tank/sd_tuned",
"analogdiffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
"openjourney/untuned":"gs://shark_tank/sd_openjourney",
"openjourney/tuned":"gs://shark_tank/sd_tuned",
"dreamlike/untuned":"gs://shark_tank/sd_dreamlike_diffusion"
},
{
"stablediffusion/v1_4/unet/fp16/length_77/untuned":"unet_8dec_fp16",
"stablediffusion/v1_4/unet/fp16/length_77/tuned":"unet_8dec_fp16_tuned",
"stablediffusion/v1_4/unet/fp16/length_77/tuned/cuda":"unet_8dec_fp16_cuda_tuned",
"stablediffusion/v1_4/unet/fp32/length_77/untuned":"unet_1dec_fp32",
"stablediffusion/v1_4/vae/fp16/length_77/untuned":"vae_19dec_fp16",
"stablediffusion/v1_4/vae/fp16/length_77/tuned":"vae_19dec_fp16_tuned",
"stablediffusion/v1_4/vae/fp16/length_77/tuned/cuda":"vae_19dec_fp16_cuda_tuned",
"stablediffusion/v1_4/vae/fp16/length_77/untuned/base":"vae_8dec_fp16",
"stablediffusion/v1_4/vae/fp32/length_77/untuned":"vae_1dec_fp32",
"stablediffusion/v1_4/clip/fp32/length_77/untuned":"clip_18dec_fp32",
"stablediffusion/v2_1base/unet/fp16/length_77/untuned":"unet2base_8dec_fp16",
"stablediffusion/v2_1base/unet/fp16/length_77/tuned":"unet2base_8dec_fp16_tuned_v2",
"stablediffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"unet2base_8dec_fp16_cuda_tuned",
"stablediffusion/v2_1base/unet/fp16/length_64/untuned":"unet_19dec_v2p1base_fp16_64",
"stablediffusion/v2_1base/unet/fp16/length_64/tuned":"unet_19dec_v2p1base_fp16_64_tuned",
"stablediffusion/v2_1base/unet/fp16/length_64/tuned/cuda":"unet_19dec_v2p1base_fp16_64_cuda_tuned",
"stablediffusion/v2_1base/vae/fp16/length_77/untuned":"vae2base_19dec_fp16",
"stablediffusion/v2_1base/vae/fp16/length_77/tuned":"vae2base_19dec_fp16_tuned",
"stablediffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"vae2base_19dec_fp16_cuda_tuned",
"stablediffusion/v2_1base/vae/fp16/length_77/untuned/base":"vae2base_8dec_fp16",
"stablediffusion/v2_1base/vae/fp16/length_77/tuned/base":"vae2base_8dec_fp16_tuned",
"stablediffusion/v2_1base/vae/fp16/length_77/tuned/base/cuda":"vae2base_8dec_fp16_cuda_tuned",
"stablediffusion/v2_1base/clip/fp32/length_77/untuned":"clip2base_18dec_fp32",
"stablediffusion/v2_1base/clip/fp32/length_64/untuned":"clip_19dec_v2p1base_fp32_64",
"stablediffusion/v2_1/unet/fp16/length_77/untuned":"unet2_14dec_fp16",
"stablediffusion/v2_1/vae/fp16/length_77/untuned":"vae2_19dec_fp16",
"stablediffusion/v2_1/vae/fp16/length_77/untuned/base":"vae2_8dec_fp16",
"stablediffusion/v2_1/clip/fp32/length_77/untuned":"clip2_18dec_fp32",
"anythingv3/v2_1base/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
"anythingv3/v2_1base/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
"anythingv3/v2_1base/unet/fp16/length_77/tuned/cuda":"av3_unet_19dec_fp16_cuda_tuned",
"anythingv3/v2_1base/unet/fp32/length_77/untuned":"av3_unet_19dec_fp32",
"anythingv3/v2_1base/vae/fp16/length_77/untuned":"av3_vae_19dec_fp16",
"anythingv3/v2_1base/vae/fp16/length_77/tuned":"av3_vae_19dec_fp16_tuned",
"anythingv3/v2_1base/vae/fp16/length_77/tuned/cuda":"av3_vae_19dec_fp16_cuda_tuned",
"anythingv3/v2_1base/vae/fp16/length_77/untuned/base":"av3_vaebase_22dec_fp16",
"anythingv3/v2_1base/vae/fp32/length_77/untuned":"av3_vae_19dec_fp32",
"anythingv3/v2_1base/vae/fp32/length_77/untuned/base":"av3_vaebase_22dec_fp32",
"anythingv3/v2_1base/clip/fp32/length_77/untuned":"av3_clip_19dec_fp32",
"analogdiffusion/v2_1base/unet/fp16/length_77/untuned":"ad_unet_19dec_fp16",
"analogdiffusion/v2_1base/unet/fp16/length_77/tuned":"ad_unet_19dec_fp16_tuned",
"analogdiffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"ad_unet_19dec_fp16_cuda_tuned",
"analogdiffusion/v2_1base/unet/fp32/length_77/untuned":"ad_unet_19dec_fp32",
"analogdiffusion/v2_1base/vae/fp16/length_77/untuned":"ad_vae_19dec_fp16",
"analogdiffusion/v2_1base/vae/fp16/length_77/tuned":"ad_vae_19dec_fp16_tuned",
"analogdiffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"ad_vae_19dec_fp16_cuda_tuned",
"analogdiffusion/v2_1base/vae/fp16/length_77/untuned/base":"ad_vaebase_22dec_fp16",
"analogdiffusion/v2_1base/vae/fp32/length_77/untuned":"ad_vae_19dec_fp32",
"analogdiffusion/v2_1base/vae/fp32/length_77/untuned/base":"ad_vaebase_22dec_fp32",
"analogdiffusion/v2_1base/clip/fp32/length_77/untuned":"ad_clip_19dec_fp32",
"openjourney/v2_1base/unet/fp16/length_64/untuned":"oj_unet_22dec_fp16_64",
"openjourney/v2_1base/unet/fp32/length_64/untuned":"oj_unet_22dec_fp32_64",
"openjourney/v2_1base/vae/fp16/length_77/untuned":"oj_vae_22dec_fp16",
"openjourney/v2_1base/vae/fp16/length_77/untuned/base":"oj_vaebase_22dec_fp16",
"openjourney/v2_1base/vae/fp32/length_77/untuned":"oj_vae_22dec_fp32",
"openjourney/v2_1base/vae/fp32/length_77/untuned/base":"oj_vaebase_22dec_fp32",
"openjourney/v2_1base/clip/fp32/length_64/untuned":"oj_clip_22dec_fp32_64",
"dreamlike/v2_1base/unet/fp16/length_77/untuned":"dl_unet_23dec_fp16_77",
"dreamlike/v2_1base/unet/fp32/length_77/untuned":"dl_unet_23dec_fp32_77",
"dreamlike/v2_1base/vae/fp16/length_77/untuned":"dl_vae_23dec_fp16",
"dreamlike/v2_1base/vae/fp16/length_77/untuned/base":"dl_vaebase_23dec_fp16",
"dreamlike/v2_1base/vae/fp32/length_77/untuned":"dl_vae_23dec_fp32",
"dreamlike/v2_1base/vae/fp32/length_77/untuned/base":"dl_vaebase_23dec_fp32",
"dreamlike/v2_1base/clip/fp32/length_77/untuned":"dl_clip_23dec_fp32_77"
},
{
"unet": {
"tuned": {
"fp16": {
"default_compilation_flags": []
},
"fp32": {
"default_compilation_flags": []
}
},
"untuned": {
"fp16": {
"default_compilation_flags": [
"--iree-flow-enable-padding-linalg-ops",
"--iree-flow-linalg-ops-padding-size=32"
],
"specified_compilation_flags": {
"cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
"default_device": ["--iree-flow-enable-conv-img2col-transform"]
}
},
"fp32": {
"default_compilation_flags": [
"--iree-flow-enable-conv-nchw-to-nhwc-transform",
"--iree-flow-enable-padding-linalg-ops",
"--iree-flow-linalg-ops-padding-size=16"
]
}
}
},
"vae": {
"tuned": {
"fp16": {
"default_compilation_flags": [
"--iree-flow-enable-padding-linalg-ops",
"--iree-flow-linalg-ops-padding-size=32",
"--iree-flow-enable-conv-img2col-transform"
]
},
"fp32": {
"default_compilation_flags": [
"--iree-flow-enable-padding-linalg-ops",
"--iree-flow-linalg-ops-padding-size=32",
"--iree-flow-enable-conv-img2col-transform"
]
}
},
"untuned": {
"fp16": {
"default_compilation_flags": [
"--iree-flow-enable-padding-linalg-ops",
"--iree-flow-linalg-ops-padding-size=32",
"--iree-flow-enable-conv-img2col-transform"
]
},
"fp32": {
"default_compilation_flags": [
"--iree-flow-enable-conv-nchw-to-nhwc-transform",
"--iree-flow-enable-padding-linalg-ops",
"--iree-flow-linalg-ops-padding-size=16"
]
}
}
},
"clip": {
"tuned": {
"fp16": {
"default_compilation_flags": [
"--iree-flow-linalg-ops-padding-size=16",
"--iree-flow-enable-padding-linalg-ops"
]
},
"fp32": {
"default_compilation_flags": [
"--iree-flow-linalg-ops-padding-size=16",
"--iree-flow-enable-padding-linalg-ops"
]
}
},
"untuned": {
"fp16": {
"default_compilation_flags": [
"--iree-flow-linalg-ops-padding-size=16",
"--iree-flow-enable-padding-linalg-ops"
]
},
"fp32": {
"default_compilation_flags": [
"--iree-flow-linalg-ops-padding-size=16",
"--iree-flow-enable-padding-linalg-ops"
]
}
}
}
}
]

View File

@@ -1,101 +0,0 @@
{
"unet": {
"tuned": {
"fp16": {
"default_compilation_flags": []
},
"fp32": {
"default_compilation_flags": []
}
},
"untuned": {
"fp16": {
"default_compilation_flags": [
"--iree-flow-enable-padding-linalg-ops",
"--iree-flow-linalg-ops-padding-size=32"
],
"specified_compilation_flags": {
"cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
"default_device": ["--iree-flow-enable-conv-img2col-transform"]
}
},
"fp32": {
"default_compilation_flags": [
"--iree-flow-enable-conv-nchw-to-nhwc-transform",
"--iree-flow-enable-padding-linalg-ops",
"--iree-flow-linalg-ops-padding-size=16"
]
}
}
},
"vae": {
"tuned": {
"fp16": {
"default_compilation_flags": [],
"specified_compilation_flags": {
"cuda": [],
"default_device": ["--iree-flow-enable-padding-linalg-ops",
"--iree-flow-linalg-ops-padding-size=32",
"--iree-flow-enable-conv-img2col-transform"]
}
},
"fp32": {
"default_compilation_flags": [],
"specified_compilation_flags": {
"cuda": [],
"default_device": [
"--iree-flow-enable-padding-linalg-ops",
"--iree-flow-linalg-ops-padding-size=32",
"--iree-flow-enable-conv-img2col-transform"
]
}
}
},
"untuned": {
"fp16": {
"default_compilation_flags": [
"--iree-flow-enable-padding-linalg-ops",
"--iree-flow-linalg-ops-padding-size=32",
"--iree-flow-enable-conv-img2col-transform"
]
},
"fp32": {
"default_compilation_flags": [
"--iree-flow-enable-conv-nchw-to-nhwc-transform",
"--iree-flow-enable-padding-linalg-ops",
"--iree-flow-linalg-ops-padding-size=16"
]
}
}
},
"clip": {
"tuned": {
"fp16": {
"default_compilation_flags": [
"--iree-flow-linalg-ops-padding-size=16",
"--iree-flow-enable-padding-linalg-ops"
]
},
"fp32": {
"default_compilation_flags": [
"--iree-flow-linalg-ops-padding-size=16",
"--iree-flow-enable-padding-linalg-ops"
]
}
},
"untuned": {
"fp16": {
"default_compilation_flags": [
"--iree-flow-linalg-ops-padding-size=16",
"--iree-flow-enable-padding-linalg-ops"
]
},
"fp32": {
"default_compilation_flags": [
"--iree-flow-linalg-ops-padding-size=16",
"--iree-flow-enable-padding-linalg-ops"
]
}
}
}
}

View File

@@ -1,8 +0,0 @@
[["A high tech solarpunk utopia in the Amazon rainforest"],
["A pikachu fine dining with a view to the Eiffel Tower"],
["A mecha robot in a favela in expressionist style"],
["an insect robot preparing a delicious meal"],
["A digital Illustration of the Babel tower, 4k, detailed, trending in artstation, fantasy vivid colors"],
["Cluttered house in the woods, anime, oil painting, high resolution, cottagecore, ghibli inspired, 4k"],
["A beautiful mansion beside a waterfall in the woods, by josef thoma, matte painting, trending on artstation HQ"],
["portrait photo of a asia old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes"]]

View File

@@ -1,144 +0,0 @@
import sys
import numpy as np
from typing import List, Optional, Tuple, Union
from diffusers import (
LMSDiscreteScheduler,
PNDMScheduler,
DDIMScheduler,
DPMSolverMultistepScheduler,
EulerDiscreteScheduler,
)
from diffusers.configuration_utils import register_to_config
from utils import compile_through_fx, get_shark_model
from stable_args import args
import torch
# Bucket handed to get_shark_model when the scheduler modules are fetched
# instead of being compiled locally.
SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"

# One batch entry per prompt; an empty prompt list still yields a batch of one.
BATCH_SIZE = len(args.prompts) or 1

# Example inputs passed to compile_through_fx for the Euler scheduler modules.
model_input = {
    "euler": dict(
        latent=torch.randn(BATCH_SIZE, 4, args.height // 8, args.width // 8),
        output=torch.randn(BATCH_SIZE, 4, args.height // 8, args.width // 8),
        sigma=torch.tensor(1).to(torch.float32),
        dt=torch.tensor(1).to(torch.float32),
    ),
}
class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
    """EulerDiscreteScheduler variant that runs its arithmetic through SHARK modules.

    ``compile`` has to run before ``scale_model_input`` or ``step`` is used,
    because it is the only place ``self.scaling_model`` and ``self.step_model``
    are assigned.
    """

    @register_to_config
    def __init__(
        self,
        num_train_timesteps: int = 1000,
        beta_start: float = 0.0001,
        beta_end: float = 0.02,
        beta_schedule: str = "linear",
        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
        prediction_type: str = "epsilon",
    ):
        # Every argument is forwarded positionally, unchanged, to the parent.
        super().__init__(
            num_train_timesteps,
            beta_start,
            beta_end,
            beta_schedule,
            trained_betas,
            prediction_type,
        )

    def compile(self):
        """Create ``self.scaling_model`` and ``self.step_model``.

        With ``args.import_mlir`` set, the two small torch modules defined
        below are compiled through ``compile_through_fx``; otherwise both
        modules are obtained with ``get_shark_model`` from SCHEDULER_BUCKET.
        """
        euler_inputs = model_input["euler"]
        sample_latent = euler_inputs["latent"]
        sample_output = euler_inputs["output"]
        if args.precision == "fp16":
            # Only the latent/output examples are cast; sigma and dt stay f32.
            sample_latent = sample_latent.half()
            sample_output = sample_output.half()
        sample_sigma = euler_inputs["sigma"]
        sample_dt = euler_inputs["dt"]

        class ScalingModel(torch.nn.Module):
            # Divides the latent by sqrt(sigma^2 + 1).
            def forward(self, latent, sigma):
                return latent / ((sigma**2 + 1) ** 0.5)

        class SchedulerStepModel(torch.nn.Module):
            # One Euler update: latent + derivative * dt.
            def forward(self, noise_pred, sigma, latent, dt):
                pred_original_sample = latent - sigma * noise_pred
                derivative = (latent - pred_original_sample) / sigma
                return latent + derivative * dt

        iree_flags = []
        if len(args.iree_vulkan_target_triple) > 0:
            iree_flags.append(
                f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
            )
        # Disable bindings fusion to work with moltenVK.
        if sys.platform == "darwin":
            iree_flags.append("-iree-stream-fuse-binding=false")

        if not args.import_mlir:
            # Fetch the prebuilt modules rather than compiling them here.
            self.scaling_model = get_shark_model(
                SCHEDULER_BUCKET,
                "euler_scale_model_input_" + args.precision,
                iree_flags,
            )
            self.step_model = get_shark_model(
                SCHEDULER_BUCKET, "euler_step_" + args.precision, iree_flags
            )
            return

        self.scaling_model = compile_through_fx(
            ScalingModel(),
            (sample_latent, sample_sigma),
            model_name=f"euler_scale_model_input_{BATCH_SIZE}_{args.height}_{args.width}"
            + args.precision,
            extra_args=iree_flags,
        )
        self.step_model = compile_through_fx(
            SchedulerStepModel(),
            (sample_output, sample_sigma, sample_latent, sample_dt),
            model_name=f"euler_step_{BATCH_SIZE}_{args.height}_{args.width}"
            + args.precision,
            extra_args=iree_flags,
        )

    def scale_model_input(self, sample, timestep):
        """Run the scaling module on ``sample`` with the sigma matching ``timestep``."""
        idx = (self.timesteps == timestep).nonzero().item()
        return self.scaling_model(
            "forward",
            (sample, self.sigmas[idx]),
            send_to_host=False,
        )

    def step(self, noise_pred, timestep, latent):
        """Run the step module for ``timestep``; dt is the gap to the next sigma."""
        idx = (self.timesteps == timestep).nonzero().item()
        sigma_now = self.sigmas[idx]
        sigma_gap = self.sigmas[idx + 1] - sigma_now
        return self.step_model(
            "forward",
            (noise_pred, sigma_now, latent, sigma_gap),
            send_to_host=False,
        )

View File

@@ -1,191 +0,0 @@
import os
from shark.model_annotation import model_annotation, create_context
from shark.iree_utils._common import iree_target_map, run_cmd
from shark.shark_downloader import (
download_model,
download_public_file,
WORKDIR,
)
from shark.parser import shark_args
from stable_args import args
# Device name without any "://..." suffix given on the command line.
device = args.device.split("://", 1)[0]
def load_model_from_tank():
    """Download the untuned model chosen by ``--annotation_model`` from shark_tank.

    The tank key is built from the variant/version exported by ``opt_params``
    plus the precision (and, for unet, the max length) taken from ``args``.
    The VAE key always uses ``length_77`` and gets a ``/base`` suffix when
    ``args.use_base_vae`` is set.

    Returns:
        ``(mlir_model, model_name)``: the first value returned by
        ``download_model`` and the artifact name resolved by ``get_params``.

    Raises:
        ValueError: if ``args.annotation_model`` is neither "unet" nor "vae".
            Previously this case failed later with an UnboundLocalError
            because no model key had been built.
    """
    from opt_params import get_params, version, variant

    shark_args.local_tank_cache = args.local_tank_cache
    bucket_key = f"{variant}/untuned"
    if args.annotation_model == "unet":
        model_key = f"{variant}/{version}/unet/{args.precision}/length_{args.max_length}/untuned"
    elif args.annotation_model == "vae":
        is_base = "/base" if args.use_base_vae else ""
        model_key = f"{variant}/{version}/vae/{args.precision}/length_77/untuned{is_base}"
    else:
        raise ValueError(
            f"Unsupported annotation model '{args.annotation_model}'; "
            "expected 'unet' or 'vae'."
        )

    # Only the bucket and model name are needed here; the flags are unused.
    bucket, model_name, _iree_flags = get_params(
        bucket_key, model_key, args.annotation_model, "untuned", args.precision
    )
    mlir_model, _func_name, _inputs, _golden_out = download_model(
        model_name,
        tank_url=bucket,
        frontend="torch",
    )
    return mlir_model, model_name
def load_winograd_configs():
    """Download the Winograd config for the current model/device; return its local path."""
    file_name = f"{args.annotation_model}_winograd_{device}.json"
    remote_url = "gs://shark_tank/sd_tuned/configs/" + file_name
    local_path = f"{WORKDIR}configs/" + file_name
    print("Loading Winograd config file from ", local_path)
    download_public_file(remote_url, local_path, True)
    return local_path
def load_lower_configs():
    """Download the lowering config for the current model; return its local path.

    Side effect: ``args.max_length`` is overwritten with 77 for the
    "anythingv3" / "analogdiffusion" variants and whenever the annotated
    model is the VAE. Those two variants also use the "v1_4" config version.
    """
    from opt_params import version, variant

    uses_v1_4_config = variant in ["anythingv3", "analogdiffusion"]
    config_version = "v1_4" if uses_v1_4_config else version
    if uses_v1_4_config or args.annotation_model == "vae":
        args.max_length = 77

    file_name = f"{args.annotation_model}_{config_version}_{args.precision}_len{args.max_length}_{device}.json"
    remote_url = "gs://shark_tank/sd_tuned/configs/" + file_name
    local_path = f"{WORKDIR}configs/" + file_name
    print("Loading lowering config file from ", local_path)
    download_public_file(remote_url, local_path, True)
    return local_path
def annotate_with_winograd(input_mlir, winograd_config_dir, model_name):
    """Annotate conv ops with the Winograd attribute and write the result to disk.

    The output file lands in ``args.annotation_output``. "_tuned" is inserted
    into the file name unless ``model_name`` already ends in a "tuned"
    component.

    Returns:
        ``(winograd_model, out_file_path)``.
    """
    already_tuned = model_name.split("_")[-1] == "tuned"
    file_suffix = "_torch.mlir" if already_tuned else "_tuned_torch.mlir"
    out_file_path = f"{args.annotation_output}/{model_name}{file_suffix}"

    with create_context() as ctx:
        winograd_model = model_annotation(
            ctx,
            input_contents=input_mlir,
            config_path=winograd_config_dir,
            search_op="conv",
            winograd=True,
        )
        # Written while the context is still open; the with-block closes the file.
        with open(out_file_path, "w") as out_file:
            out_file.write(str(winograd_model))
    return winograd_model, out_file_path
def annotate_with_lower_configs(
    input_mlir, lowering_config_dir, model_name, use_winograd
):
    """Apply tuned lowering configs to ``input_mlir`` and save the annotated model.

    Steps:
      1. Run ``iree-compile`` up to the flow stage, printing the IR after the
         winograd (or padding) pass; stderr is redirected into a temporary
         ``dump_after_winograd.mlir`` file under ``args.annotation_output``.
      2. Annotate that dump with the configs at ``lowering_config_dir``.
      3. Delete the temporary dump and write the annotated model.

    Returns:
        ``(tuned_model, out_file_path)``.
    """
    # Dump IR after padding/img2col/winograd passes
    dump_after = (
        "iree-linalg-ext-convert-conv2d-to-winograd"
        if use_winograd
        else "iree-flow-pad-linalg-ops"
    )

    # Device-specific flags; each flag is followed by a single space.
    device_spec_args = ""
    if device == "cuda":
        from shark.iree_utils.gpu_utils import get_iree_gpu_args

        device_spec_args = "".join(
            flag + " " for flag in get_iree_gpu_args()
        )
    elif device == "vulkan":
        device_spec_args = (
            f"--iree-vulkan-target-triple={args.iree_vulkan_target_triple} "
        )

    print("Applying tuned configs on", model_name)
    compile_cmd = (
        f"iree-compile {input_mlir} "
        "--iree-input-type=tm_tensor "
        f"--iree-hal-target-backends={iree_target_map(device)} "
        f"{device_spec_args}"
        "--iree-stream-resource-index-bits=64 "
        "--iree-vm-target-index-bits=64 "
        "--iree-flow-enable-padding-linalg-ops "
        "--iree-flow-linalg-ops-padding-size=32 "
        "--iree-flow-enable-conv-img2col-transform "
        f"--mlir-print-ir-after={dump_after} "
        "--compile-to=flow "
        f"2>{args.annotation_output}/dump_after_winograd.mlir "
    )
    run_cmd(compile_cmd)

    # Annotate the model with lowering configs in the config file
    with create_context() as ctx:
        tuned_model = model_annotation(
            ctx,
            input_contents=f"{args.annotation_output}/dump_after_winograd.mlir",
            config_path=lowering_config_dir,
            search_op="all",
        )

        # Remove the intermediate mlir and save the final annotated model
        os.remove(f"{args.annotation_output}/dump_after_winograd.mlir")
        already_tuned = model_name.split("_")[-1] == "tuned"
        file_suffix = "_torch.mlir" if already_tuned else "_tuned_torch.mlir"
        out_file_path = f"{args.annotation_output}/{model_name}{file_suffix}"
        # Written while the context is still open; the with-block closes the file.
        with open(out_file_path, "w") as out_file:
            out_file.write(str(tuned_model))
    return tuned_model, out_file_path
def sd_model_annotation(mlir_model, model_name, model_from_tank=False):
    """Annotate a Stable Diffusion model according to the target model and device.

    * unet on vulkan: Winograd annotation followed by lowering configs.
    * vae on vulkan: Winograd annotation only.
    * anything else: lowering configs only. A model downloaded from the tank
      is read from its path under WORKDIR; otherwise the in-memory model is
      first written out via ``annotate_with_winograd`` with an empty config.

    Returns:
        ``(tuned_model, output_path)``.
    """
    target = args.annotation_model
    on_vulkan = device == "vulkan"

    if on_vulkan and target == "unet":
        winograd_cfg = load_winograd_configs()
        _, winograd_path = annotate_with_winograd(
            mlir_model, winograd_cfg, model_name
        )
        lowering_cfg = load_lower_configs()
        tuned_model, output_path = annotate_with_lower_configs(
            winograd_path, lowering_cfg, model_name, True
        )
    elif on_vulkan and target == "vae":
        winograd_cfg = load_winograd_configs()
        tuned_model, output_path = annotate_with_winograd(
            mlir_model, winograd_cfg, model_name
        )
    else:
        if model_from_tank:
            mlir_path = f"{WORKDIR}{model_name}_torch/{model_name}_torch.mlir"
        else:
            # Just use this function to convert bytecode to string
            _, mlir_path = annotate_with_winograd(mlir_model, "", model_name)
        lowering_cfg = load_lower_configs()
        tuned_model, output_path = annotate_with_lower_configs(
            mlir_path, lowering_cfg, model_name, False
        )

    print(f"Saved the annotated mlir in {output_path}.")
    return tuned_model, output_path
if __name__ == "__main__":
    # Script entry point: fetch the untuned model from the tank, then annotate it.
    tank_model, tank_model_name = load_model_from_tank()
    sd_model_annotation(tank_model, tank_model_name, model_from_tank=True)

View File

@@ -1,74 +0,0 @@
# -*- mode: python ; coding: utf-8 -*-
# PyInstaller spec that bundles main.py into the `shark_sd_cli` executable.
import sys

from PyInstaller.utils.hooks import collect_data_files, copy_metadata

# Raise the recursion limit to five times its current value.
sys.setrecursionlimit(5 * sys.getrecursionlimit())

# Data files and package metadata shipped with the executable, in bundle order.
datas = []
datas.extend(collect_data_files('torch'))
for dist_name in (
    'torch',
    'tqdm',
    'regex',
    'requests',
    'packaging',
    'filelock',
    'numpy',
    'tokenizers',
    'importlib_metadata',
    'torchvision',
    'torch-mlir',
    'diffusers',
    'transformers',
):
    datas.extend(copy_metadata(dist_name))
for package_name in ('iree', 'google-cloud-storage', 'shark'):
    datas.extend(collect_data_files(package_name))
# JSON resources, placed in a `resources` directory inside the bundle.
datas.extend(
    [
        ('resources/prompts.json', 'resources'),
        ('resources/model_db.json', 'resources'),
        ('resources/base_model.json', 'resources'),
        ('resources/opt_flags.json', 'resources'),
    ]
)

binaries = []

block_cipher = None

a = Analysis(
    ['main.py'],
    pathex=['.'],
    binaries=binaries,
    datas=datas,
    hiddenimports=[
        'shark',
        'shark.*',
        'shark.shark_inference',
        'shark_inference',
        'iree.tools.core',
    ],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    win_no_prefer_redirects=False,
    win_private_assemblies=False,
    cipher=block_cipher,
    noarchive=False,
)

pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)

# Scripts, binaries, zipfiles and datas are all passed directly to EXE.
exe = EXE(
    pyz,
    a.scripts,
    a.binaries,
    a.zipfiles,
    a.datas,
    [],
    name='shark_sd_cli',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    upx_exclude=[],
    runtime_tmpdir=None,
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)

View File

@@ -1,302 +0,0 @@
import argparse
from pathlib import Path
def path_expand(s):
    """Return ``s`` as a resolved ``Path`` with a leading ``~`` expanded."""
    expanded = Path(s).expanduser()
    return expanded.resolve()
# Command-line options for the Stable Diffusion scripts. The parser is built
# and evaluated at import time; the parsed namespace is exposed as `args`.
p = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    description=__doc__,
)


def _add_switch(flag, default, help_text):
    """Register `flag` on `p` as a boolean option using BooleanOptionalAction."""
    p.add_argument(
        flag,
        action=argparse.BooleanOptionalAction,
        default=default,
        help=help_text,
    )


##############################################################################
### Stable Diffusion Params
##############################################################################

p.add_argument(
    "-p",
    "--prompts",
    default=[],
    action="append",
    help="text of which images to be generated.",
)
p.add_argument(
    "--negative-prompts",
    default=[""],
    nargs="+",
    help="text you don't want to see in the generated image.",
)
p.add_argument(
    "--steps",
    default=50,
    type=int,
    help="the no. of steps to do the sampling.",
)
p.add_argument(
    "--seed",
    default=42,
    type=int,
    help="the seed to use.",
)
p.add_argument(
    "--batch_size",
    default=1,
    type=int,
    choices=range(1, 4),
    help="the number of inferences to be made in a single `run`.",
)
p.add_argument(
    "--height",
    default=512,
    type=int,
    help="the height of the output image.",
)
p.add_argument(
    "--width",
    default=512,
    type=int,
    help="the width of the output image.",
)
p.add_argument(
    "--guidance_scale",
    default=7.5,
    type=float,
    help="the value to be used for guidance scaling.",
)
p.add_argument(
    "--max_length",
    default=64,
    type=int,
    help="max length of the tokenizer output, options are 64 and 77.",
)

##############################################################################
### Model Config and Usage Params
##############################################################################

p.add_argument(
    "--device", default="vulkan", type=str, help="device to run the model."
)
p.add_argument(
    "--precision", default="fp16", type=str, help="precision to run the model."
)
_add_switch(
    "--import_mlir",
    False,
    "imports the model from torch module to shark_module otherwise downloads the model from shark_tank.",
)
_add_switch(
    "--load_vmfb",
    True,
    "attempts to load the model from a precompiled flatbuffer and compiles + saves it if not found.",
)
_add_switch(
    "--save_vmfb",
    False,
    "saves the compiled flatbuffer to the local directory",
)
_add_switch(
    "--use_tuned",
    True,
    "Download and use the tuned version of the model if available",
)
_add_switch(
    "--use_base_vae",
    False,
    "Do conversion from the VAE output to pixel space on cpu.",
)
p.add_argument(
    "--scheduler",
    default="SharkEulerDiscrete",
    type=str,
    help="other supported schedulers are [PNDM, DDIM, LMSDiscrete, EulerDiscrete, DPMSolverMultistep]",
)
p.add_argument(
    "--output_img_format",
    default="png",
    type=str,
    help="specify the format in which output image is save. Supported options: jpg / png",
)
p.add_argument(
    "--output_dir",
    default=None,
    type=str,
    help="Directory path to save the output images and json",
)
p.add_argument(
    "--runs",
    default=1,
    type=int,
    help="number of images to be generated with random seeds in single execution",
)
p.add_argument(
    "--ckpt_loc",
    default="",
    type=str,
    help="Path to SD's .ckpt file.",
)
p.add_argument(
    "--hf_model_id",
    default="stabilityai/stable-diffusion-2-1-base",
    type=str,
    help="The repo-id of hugging face.",
)
_add_switch(
    "--enable_stack_trace",
    False,
    "Enable showing the stack trace when retrying the base model configuration",
)

##############################################################################
### IREE - Vulkan supported flags
##############################################################################

p.add_argument(
    "--iree-vulkan-target-triple",
    default="",
    type=str,
    help="Specify target triple for vulkan",
)
_add_switch(
    "--vulkan_debug_utils",
    False,
    "Profiles vulkan device and collects the .rdc info",
)
p.add_argument(
    "--vulkan_large_heap_block_size",
    default="4147483648",
    help="flag for setting VMA preferredLargeHeapBlockSize for vulkan device, default is 4G",
)
_add_switch(
    "--vulkan_validation_layers",
    False,
    "flag for disabling vulkan validation layers when benchmarking",
)

##############################################################################
### Misc. Debug and Optimization flags
##############################################################################

_add_switch(
    "--use_compiled_scheduler",
    True,
    "use the default scheduler precompiled into the model if available",
)
p.add_argument(
    "--local_tank_cache",
    default="",
    help="Specify where to save downloaded shark_tank artifacts. If this is not set, the default is ~/.local/shark_tank/.",
)
p.add_argument(
    "--dump_isa",
    action="store_true",
    default=False,
    help="When enabled call amdllpc to get ISA dumps. use with dispatch benchmarks.",
)
p.add_argument(
    "--dispatch_benchmarks",
    default=None,
    help='dispatches to return benchamrk data on. use "All" for all, and None for none.',
)
p.add_argument(
    "--dispatch_benchmarks_dir",
    default="temp_dispatch_benchmarks",
    help='directory where you want to store dispatch data generated with "--dispatch_benchmarks"',
)
_add_switch(
    "--enable_rgp",
    False,
    "flag for inserting debug frames between iterations for use with rgp.",
)
_add_switch(
    "--hide_steps",
    True,
    "flag for hiding the details of iteration/sec for each step.",
)
p.add_argument(
    "--warmup_count",
    default=0,
    type=int,
    help="flag setting warmup count for clip and vae [>= 0].",
)
_add_switch(
    "--clear_all",
    False,
    "flag to clear all mlir and vmfb from common locations. Recompiling will take several minutes",
)

##############################################################################
### Web UI flags
##############################################################################

_add_switch(
    "--progress_bar",
    True,
    "flag for removing the pregress bar animation during image generation",
)

##############################################################################
### SD model auto-annotation flags
##############################################################################

p.add_argument(
    "--annotation_output",
    default="./",
    type=path_expand,
    help="Directory to save the annotated mlir file",
)
p.add_argument(
    "--annotation_model",
    default="unet",
    type=str,
    help="Options are unet and vae.",
)

# Parsed once, at import time, from the process command line.
args = p.parse_args()

View File

@@ -1,154 +0,0 @@
# Stable Diffusion optimized for AMD RDNA2/RDNA3 GPUs
Before you start, please be aware that this is beta software that relies on a special AMD driver. Like all StableDiffusion GUIs published so far, you need some technical expertise to set it up. We apologize in advance if you bump into issues. If that happens, please don't hesitate to ask our Discord community for help! If you still can't get it to work, we're sorry, and please be assured that we (Nod and AMD) are working hard to improve the user experience in coming months.
If it works well for you, please "star" the following GitHub projects... this is one of the best ways to help and spread the word!
* https://github.com/nod-ai/SHARK
* https://github.com/iree-org/iree
## Install these specific AMD drivers (the latest AMD release may not have all the fixes).
### AMD KB Drivers for RDNA2 and RDNA3:
*AMD Software: Adrenalin Edition 22.11.1 for MLIR/IREE Driver Version 22.20.29.09 for Windows® 10 and Windows® 11 (Windows Driver Store Version 31.0.12029.9003)*
First, for RDNA2 users, download this special driver in a folder of your choice. We recommend you keep the installation files around, since you may need to re-install it later, if Windows Update decides to overwrite it:
https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mlir-iree
For RDNA3, the latest driver 23.1.2 supports MLIR/IREE as well: https://www.amd.com/en/support/kb/release-notes/rn-rad-win-23-1-2-kb
KNOWN ISSUES with this special AMD driver:
* `Windows Update` may (depending how it's configured) automatically install a new official AMD driver that overwrites this IREE-specific driver. If Stable Diffusion used to work, then a few days later, it slows down a lot or produces incorrect results (e.g. black images), this may be the cause. To fix this problem, please check the installed driver version, and re-install the special driver if needed. (TODO: document how to prevent this `Windows Update` behavior!)
* Some people using this special driver experience mouse pointer accuracy issues, especially if using a larger-than-default mouse pointer. The clicked point isn't centered properly. One possible work-around is to reset the pointer size to "1" in "Change pointer size and color".
## Installation
Download the latest Windows SHARK SD binary [469 here](https://github.com/nod-ai/SHARK/releases/download/20230124.469/shark_sd_20230124_469.exe) in a folder of your choice. If you want nightly builds, you can look for them on the GitHub releases page.
Notes:
* We recommend that you download this EXE in a new folder, whenever you download a new EXE version. If you download it in the same folder as a previous install, you must delete the old `*.vmfb` files. Those contain Vulkan dispatches compiled from MLIR which can be outdated if you run a new EXE from the same folder. You can use the `--clear_all` flag once to clean all the old files.
* If you recently updated the driver or this binary (EXE file), we recommend you:
* clear all the local artifacts with `--clear_all` OR
* clear the Vulkan shader cache: For Windows users this can be done by clearing the contents of `C:\Users\%username%\AppData\Local\AMD\VkCache\`. On Linux the same cache is typically located at `~/.cache/AMD/VkCache/`.
* clear the `huggingface` cache. In Windows, this is `C:\Users\%username%\.cache\huggingface`.
## Running
* Open a Command Prompt or Powershell terminal, change folder (`cd`) to the .exe folder. Then run the EXE from the command prompt. That way, if an error occurs, you'll be able to cut-and-paste it to ask for help. (if it always works for you without error, you may simply double-click the EXE to start the web browser)
* The first run may take about 10-15 minutes when the models are downloaded and compiled. Your patience is appreciated. The download could be about 5GB.
* If successful, you will likely see a Windows Defender message asking you to give permission to open a web server port. Accept it.
* Open a browser to access the Stable Diffusion web server. By default, the port is 8080, so you can go to http://localhost:8080/?__theme=dark.
## Stopping
* Select the command prompt that's running the EXE. Press CTRL-C and wait a moment. The application should stop.
* Please make sure to do the above step before you attempt to update the EXE to a new version.
# Results
<img width="1607" alt="webui" src="https://user-images.githubusercontent.com/74956/204939260-b8308bc2-8dc4-47f6-9ac0-f60b66edab99.png">
Here are some samples generated:
![tajmahal, snow, sunflowers, oil on canvas_0](https://user-images.githubusercontent.com/74956/204934186-141f7e43-6eb2-4e89-a99c-4704d20444b3.jpg)
![a photo of a crab playing a trumpet](https://user-images.githubusercontent.com/74956/204933258-252e7240-8548-45f7-8253-97647d38313d.jpg)
<details>
<summary>Advanced Installation </summary>
## Setup your Python Virtual Environment and Dependencies
<details>
<summary> Windows 10/11 Users </summary>
* Install the latest Python 3.10.x version from [here](https://www.python.org/downloads/windows/)
* Install Git for Windows from [here](https://git-scm.com/download/win)
#### Allow the install script to run in Powershell
```powershell
set-executionpolicy remotesigned
```
#### Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...)
```powershell
git clone https://github.com/nod-ai/SHARK.git
cd SHARK
./setup_venv.ps1 #You can re-run this script to get the latest version
```
</details>
<details>
<summary>Linux</summary>
```shell
git clone https://github.com/nod-ai/SHARK.git
cd SHARK
./setup_venv.sh
source shark.venv/bin/activate
```
</details>
### Run Stable Diffusion on your device - WebUI
<details>
<summary>Windows 10/11 Users</summary>
```powershell
(shark.venv) PS C:\Users\nod\SHARK> cd web
(shark.venv) PS C:\Users\nod\SHARK\web> python index.py
```
</details>
<details>
<summary>Linux Users</summary>
```shell
(shark.venv) > cd web
(shark.venv) > python index.py
```
</details>
### Run Stable Diffusion on your device - Commandline
<details>
<summary>Windows 10/11 Users</summary>
```powershell
(shark.venv) PS C:\g\shark> python .\shark\examples\shark_inference\stable_diffusion\main.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
```
</details>
<details>
<summary>Linux</summary>
```shell
python3.10 shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
```
</details>
The output on a 7900XTX would look like:
```shell
Stats for run 0:
Average step time: 47.19188690185547ms/it
Clip Inference time (ms) = 109.531
VAE Inference time (ms): 78.590
Total image generation time: 2.5788655281066895sec
```
For more options to the Stable Diffusion model read [this](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md)
</details>
<details>
<summary>Discord link</summary>
Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware.
</details>

View File

@@ -1,15 +0,0 @@
You need to pre-create your bot (https://core.telegram.org/bots#how-do-i-create-a-bot)
Then create a file named .env in the web directory.
Put the following line in it:
TG_TOKEN="your_token"
specifying your bot's token from previous step.
Then run telegram_bot.py with the same parameters that you use when running index.py, for example:
python telegram_bot.py --max_length=77 --vulkan_large_heap_block_size=0 --use_base_vae --local_tank_cache h:\shark\TEMP
Bot commands:
/select_model
/select_scheduler
/set_steps "integer number of steps"
/set_guidance_scale "integer number"
/set_negative_prompt "negative text"
Any other text triggers the creation of an image based on it.

View File

@@ -1,383 +0,0 @@
import os
import gc
import torch
from shark.shark_inference import SharkInference
from stable_args import args
from shark.shark_importer import import_with_fx
from shark.iree_utils.vulkan_utils import (
set_iree_vulkan_runtime_flags,
get_vulkan_target_triple,
)
from shark.iree_utils.gpu_utils import get_cuda_sm_cc
from resources import opt_flags
from sd_annotation import sd_model_annotation
import sys
def get_vmfb_path_name(model_name):
    """Derive the vmfb file path and base name for a model on the active device.

    The base name is ``<model_name>_<device>``; a ``driver://path`` device
    string has each ``://`` replaced by ``-``. The file path places
    ``<base name>.vmfb`` in the current working directory.

    Args:
        model_name (str): name of the model being compiled.

    Returns:
        list: ``[vmfb_path, extended_name]``.
    """
    device_tag = args.device
    if "://" in device_tag:
        device_tag = "-".join(device_tag.split("://"))
    extended_name = "{}_{}".format(model_name, device_tag)
    vmfb_path = os.path.join(os.getcwd(), extended_name + ".vmfb")
    return [vmfb_path, extended_name]
def _compile_module(shark_module, model_name, extra_args=[]):
    """Compile `shark_module`, optionally going through a vmfb file on disk.

    * Neither ``args.load_vmfb`` nor ``args.save_vmfb`` set: compile in memory.
    * ``load_vmfb`` set, the vmfb exists and ``save_vmfb`` is not set: load
      the existing file.
    * Otherwise: save a vmfb into the current directory and load it back.

    Args:
        shark_module: module object exposing ``compile``, ``save_module``
            and ``load_module``.
        model_name (str): used to derive the vmfb file name.
        extra_args (list): extra flags forwarded to the module calls.

    Returns:
        The same `shark_module`, ready to run.
    """
    if not (args.load_vmfb or args.save_vmfb):
        shark_module.compile(extra_args)
        return shark_module

    vmfb_path, extended_name = get_vmfb_path_name(model_name)

    reuse_existing = (
        args.load_vmfb and os.path.isfile(vmfb_path) and not args.save_vmfb
    )
    if reuse_existing:
        print(f"loading existing vmfb from: {vmfb_path}")
        shark_module.load_module(vmfb_path, extra_args=extra_args)
        return shark_module

    if args.save_vmfb:
        print("Saving to {}".format(vmfb_path))
    else:
        print("No vmfb found. Compiling and saving to {}".format(vmfb_path))
    saved_path = shark_module.save_module(
        os.getcwd(), extended_name, extra_args
    )
    shark_module.load_module(saved_path, extra_args=extra_args)
    return shark_module
# Downloads the model from shark_tank and returns the shark_module.
def get_shark_model(tank_url, model_name, extra_args=[]):
    """Download a model from shark_tank and return it as a compiled module.

    Args:
        tank_url (str): location passed to ``download_model`` as ``tank_url``.
        model_name (str): name of the model to download.
        extra_args (list): extra flags forwarded to compilation.

    Returns:
        The compiled ``SharkInference`` module.
    """
    from shark.parser import shark_args
    from shark.shark_downloader import download_model

    # Point the downloader at the user-selected local cache directory.
    shark_args.local_tank_cache = args.local_tank_cache
    if "cuda" in args.device:
        shark_args.enable_tf32 = True

    # Only the MLIR payload is needed here; the other three items returned
    # by the downloader are not used.
    mlir_model, _func_name, _inputs, _golden_out = download_model(
        model_name,
        tank_url=tank_url,
        frontend="torch",
    )
    module = SharkInference(
        mlir_model, device=args.device, mlir_dialect="linalg"
    )
    return _compile_module(module, model_name, extra_args)
# Converts the torch-module into a shark_module.
def compile_through_fx(
    model,
    inputs,
    model_name,
    is_f16=False,
    f16_input_mask=None,
    use_tuned=False,
    extra_args=[],
):
    """Import a torch module via FX and compile it into a shark module.

    Args:
        model: the torch module to import.
        inputs: example inputs handed to ``import_with_fx``.
        model_name (str): base name for the compiled artifact; ``_tuned`` is
            appended when `use_tuned` is set.
        is_f16 (bool): forwarded to ``import_with_fx``.
        f16_input_mask: forwarded to ``import_with_fx``.
        use_tuned (bool): when set, compile the annotated MLIR found at (or
            generated into) ``args.annotation_output``.
        extra_args (list): extra flags forwarded to compilation.

    Returns:
        The compiled ``SharkInference`` module.
    """
    from shark.parser import shark_args

    if "cuda" in args.device:
        shark_args.enable_tf32 = True

    mlir_module, func_name = import_with_fx(
        model, inputs, is_f16, f16_input_mask
    )

    if use_tuned:
        model_name = model_name + "_tuned"
        tuned_model_path = f"{args.annotation_output}/{model_name}_torch.mlir"
        if not os.path.exists(tuned_model_path):
            # The annotation step reads its target from args.annotation_model.
            if "vae" in model_name.split("_")[0]:
                args.annotation_model = "vae"
            tuned_model, tuned_model_path = sd_model_annotation(
                mlir_module, model_name
            )
            # Drop both in-memory copies before reading the file back.
            del mlir_module, tuned_model
            gc.collect()

        with open(tuned_model_path, "rb") as tuned_file:
            mlir_module = tuned_file.read()

    module = SharkInference(
        mlir_module,
        device=args.device,
        mlir_dialect="linalg",
    )
    return _compile_module(module, model_name, extra_args)
def set_iree_runtime_flags():
    """Build the Vulkan runtime flag list from `args` and hand it to IREE.

    The heap block size and validation-layer flags are always passed; the
    two RGP/debug-utils flags are added only when ``args.enable_rgp`` is set.
    """
    validation = "true" if args.vulkan_validation_layers else "false"
    flags = [
        f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}",
        f"--vulkan_validation_layers={validation}",
    ]
    if args.enable_rgp:
        flags.extend(["--enable_rgp=true", "--vulkan_debug_utils=true"])
    set_iree_vulkan_runtime_flags(flags=flags)
def get_all_devices(driver_name):
    """List the devices a driver reports, ordered by device path.

    Args:
        driver_name (str): IREE driver name passed to ``get_driver``.

    Returns:
        list: the device dicts from ``query_available_devices()``, sorted
        in place by their ``"path"`` entry.
    """
    from iree.runtime import get_driver

    devices = get_driver(driver_name).query_available_devices()
    devices.sort(key=lambda dev: dev["path"])
    return devices
def get_device_mapping(driver, key_combination=3):
    """Map every accepted spelling of a device to its name and/or path.

    This gives a consistent device ordering when choosing a specific device
    for execution.

    Args:
        driver (str): execution driver (vulkan, cuda, rocm, etc).
        key_combination (int, optional): which value each key maps to.
            1 : path
            2 : name
            3 : (name, path)
            Defaults to 3. Any other value maps every key to ``None``.

    Returns:
        dict: keys are ``<driver>`` (the first device), ``<driver>://<index>``
        and ``<driver>://<path>``; values follow `key_combination`. The dict
        is empty when the driver reports no devices.
    """
    from shark.iree_utils._common import iree_device_map

    driver = iree_device_map(driver)
    device_list = get_all_devices(driver)
    device_map = dict()

    def get_output_value(dev_dict):
        if key_combination == 1:
            return f"{driver}://{dev_dict['path']}"
        if key_combination == 2:
            return dev_dict["name"]
        if key_combination == 3:
            return (dev_dict["name"], f"{driver}://{dev_dict['path']}")

    # Bare driver name -> default (first) device. Skipped when no device is
    # present so that callers see a missing key instead of an IndexError.
    if device_list:
        device_map[f"{driver}"] = get_output_value(device_list[0])

    for i, device in enumerate(device_list):
        # mapping with index
        device_map[f"{driver}://{i}"] = get_output_value(device)
        # mapping with full path
        device_map[f"{driver}://{device['path']}"] = get_output_value(device)
    return device_map
def map_device_to_name_path(device, key_combination=3):
    """Resolve a user-selected device string to its name and/or path.

    Args:
        device (str): device as typed by the user, e.g. ``vulkan`` or
            ``vulkan://0``; the part before ``://`` selects the driver.
        key_combination (int, optional): which value to return.
            1 : path
            2 : name
            3 : (name, path)
            Defaults to 3.

    Raises:
        ValueError: `device` is not a key of the driver's device mapping.

    Returns:
        str / tuple: the mapped value for `device`, shaped according to
        `key_combination`.
    """
    driver_name = device.split("://")[0]
    known_devices = get_device_mapping(driver_name, key_combination)
    try:
        return known_devices[device]
    except KeyError:
        raise ValueError(f"Device '{device}' is not a valid device.")
def set_init_device_flags():
    """Normalise device-related entries of `args` before models are built.

    Sets, in order: the device string (plus the Vulkan target triple and
    runtime flags for Vulkan devices), ``args.max_length`` for a few known
    model ids, and ``args.use_tuned``, which is switched off for every
    configuration listed below.
    """
    if "vulkan" in args.device:
        # set runtime flags for vulkan.
        set_iree_runtime_flags()

        # Resolve the triple once here to avoid repeated lookups later.
        device_name, args.device = map_device_to_name_path(args.device)
        if not args.iree_vulkan_target_triple:
            detected_triple = get_vulkan_target_triple(device_name)
            if detected_triple is not None:
                args.iree_vulkan_target_triple = detected_triple
        # NOTE(review): nesting of this print assumed to be the vulkan
        # branch level — confirm against the original file.
        print(
            f"Found device {device_name}. Using target triple {args.iree_vulkan_target_triple}."
        )
    elif "cuda" in args.device:
        args.device = "cuda"
    elif "cpu" in args.device:
        args.device = "cpu"

    # set max_length based on availability.
    if args.hf_model_id in [
        "Linaqruf/anything-v3.0",
        "wavymulder/Analog-Diffusion",
        "dreamlike-art/dreamlike-diffusion-1.0",
    ]:
        args.max_length = 77
    elif args.hf_model_id == "prompthero/openjourney":
        args.max_length = 64

    # Use tuned models in the case of fp16, vulkan rdna3 or cuda sm devices.
    # The clauses are evaluated lazily, left to right; any hit disables
    # tuned models.
    tuned_unsupported = (
        args.hf_model_id
        in ["prompthero/openjourney", "dreamlike-art/dreamlike-diffusion-1.0"]
        or args.precision != "fp16"
        or args.height != 512
        or args.width != 512
        or args.batch_size != 1
        or ("vulkan" not in args.device and "cuda" not in args.device)
        or (
            "vulkan" in args.device
            and "rdna3" not in args.iree_vulkan_target_triple
        )
        or (
            "cuda" in args.device
            and get_cuda_sm_cc()
            not in [
                "sm_80",
                "sm_84",
                "sm_86",
                "sm_89",
            ]
        )
        or (
            args.use_base_vae
            and args.hf_model_id
            not in [
                "stabilityai/stable-diffusion-2-1-base",
                "CompVis/stable-diffusion-v1-4",
            ]
        )
    )
    if tuned_unsupported:
        args.use_tuned = False

    if args.use_tuned:
        print(f"Using {args.device} tuned models for stablediffusion/fp16.")
    else:
        print("Tuned models are currently not supported for this setting.")
# Utility to get list of devices available.
def get_available_devices():
    """Return display strings for every usable execution device.

    Vulkan devices are listed first, then CUDA devices, each formatted as
    ``<driver>://<index> => <device name>``; ``"cpu"`` is always appended
    last. A driver that cannot be queried contributes no entries.

    Returns:
        list[str]: the device choices.
    """

    def get_devices_by_name(driver_name):
        from shark.iree_utils._common import iree_device_map

        device_list = []
        try:
            driver_name = iree_device_map(driver_name)
            device_list_dict = get_all_devices(driver_name)
            print(f"{driver_name} devices are available.")
        except Exception:
            # Best effort: a missing or broken driver just yields no
            # devices. Catching Exception (not a bare except) keeps
            # Ctrl-C and SystemExit working.
            print(f"{driver_name} devices are not available.")
        else:
            for i, device in enumerate(device_list_dict):
                device_list.append(f"{driver_name}://{i} => {device['name']}")
        return device_list

    set_iree_runtime_flags()

    available_devices = []
    vulkan_devices = get_devices_by_name("vulkan")
    available_devices.extend(vulkan_devices)
    cuda_devices = get_devices_by_name("cuda")
    available_devices.extend(cuda_devices)
    available_devices.append("cpu")
    return available_devices
def disk_space_check(path, lim=20):
    """Print a warning when the free space at `path` is at or below `lim`.

    Args:
        path: location handed to ``shutil.disk_usage``.
        lim: threshold, in units of 1024**3 bytes. Defaults to 20.
    """
    from shutil import disk_usage

    bytes_per_gb = 1024 * 1024 * 1024
    free = disk_usage(path).free / bytes_per_gb
    if free <= lim:
        print(f"[WARNING] Only {free:.2f}GB space available in {path}.")
def get_opt_flags(model, precision="fp16"):
    """Collect the IREE compilation flags for one model.

    Flags come from, in order: the Vulkan target triple (when set), a
    macOS-specific binding-fusion switch, the model's
    ``default_compilation_flags`` and its per-device
    ``specified_compilation_flags`` (falling back to ``default_device``).

    Args:
        model (str): key into the ``opt_flags`` table.
        precision (str): precision key into the table. Defaults to "fp16".

    Returns:
        list[str]: the flags to pass to the compiler.
    """
    iree_flags = []
    tuning_key = "tuned" if args.use_tuned else "untuned"

    if len(args.iree_vulkan_target_triple) > 0:
        iree_flags.append(
            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
        )

    # Disable bindings fusion to work with moltenVK.
    if sys.platform == "darwin":
        iree_flags.append("-iree-stream-fuse-binding=false")

    model_flags = opt_flags[model][tuning_key][precision]

    if "default_compilation_flags" in model_flags:
        iree_flags += model_flags["default_compilation_flags"]

    if "specified_compilation_flags" in model_flags:
        per_device = model_flags["specified_compilation_flags"]
        # Everything before the first "://" (the whole string when absent).
        device = args.device.split("://")[0]
        if device not in per_device:
            device = "default_device"
        iree_flags += per_device[device]

    return iree_flags
def preprocessCKPT():
    """Convert the checkpoint at ``args.ckpt_loc`` into a diffusers directory.

    A directory named after the checkpoint file (without its extension) is
    created next to it, the Hugging Face conversion script is downloaded
    into the current working directory if it is not already there, and the
    script is run on the checkpoint. On return ``args.ckpt_loc`` points at
    the diffusers directory.

    Raises:
        requests.HTTPError: the converter script could not be downloaded.
    """
    import subprocess
    from pathlib import Path

    path = Path(args.ckpt_loc)
    diffusers_path = path.parent.absolute()
    diffusers_directory_name = path.stem
    complete_path_to_diffusers = diffusers_path / diffusers_directory_name
    complete_path_to_diffusers.mkdir(parents=True, exist_ok=True)
    print(
        "Created directory : ",
        diffusers_directory_name,
        " at -> ",
        diffusers_path,
    )
    path_to_diffusers = complete_path_to_diffusers.as_posix()

    # TODO: Use the SD to Diffusers CKPT pipeline once it's included in the release.
    sd_to_diffusers = os.path.join(os.getcwd(), "sd_to_diffusers.py")
    if not os.path.isfile(sd_to_diffusers):
        url = "https://raw.githubusercontent.com/huggingface/diffusers/8a3f0c1f7178f4a3d5a5b21ae8c2906f473e240d/scripts/convert_original_stable_diffusion_to_diffusers.py"
        import requests

        req = requests.get(url)
        # Do not save an HTTP error page as if it were the script.
        req.raise_for_status()
        with open(sd_to_diffusers, "wb") as script_file:
            script_file.write(req.content)
        print("Downloaded SD to Diffusers converter")
    else:
        print("SD to Diffusers converter already exists")

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed through intact. As before, a failing
    # converter does not raise here.
    subprocess.run(
        [
            "python",
            sd_to_diffusers,
            "--checkpoint_path=" + args.ckpt_loc,
            "--dump_path=" + path_to_diffusers,
        ]
    )

    args.ckpt_loc = path_to_diffusers
    print("Custom model path is : ", args.ckpt_loc)

View File

@@ -18,7 +18,7 @@ class T5Module(tf.Module):
self.m = TFT5Model.from_pretrained("t5-small")
self.m.predict = lambda x, y: self.m(input_ids=x, decoder_input_ids=y)
@tf.function(input_signature=t5_inputs, jit_compile=True)
@tf.function(input_signature=t5_inputs)
def forward(self, input_ids, decoder_input_ids):
return self.m.predict(input_ids, decoder_input_ids)

View File

@@ -1,21 +0,0 @@
import requests
from PIL import Image
from io import BytesIO
from pipeline_shark_stable_diffusion_upscale import (
SharkStableDiffusionUpscalePipeline,
)
import torch
# Example: upscale one downloaded image with the SHARK upscale pipeline.
model_id = "stabilityai/stable-diffusion-x4-upscaler"
pipeline = SharkStableDiffusionUpscalePipeline(model_id)

# let's download an image and bring it to 128x128 RGB
url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/low_res_cat.png"
response = requests.get(url)
low_res_img = (
    Image.open(BytesIO(response.content)).convert("RGB").resize((128, 128))
)

prompt = "a white cat"

# The pipeline result exposes its outputs as `.images`; keep the first one.
upscaled_image = pipeline(prompt=prompt, image=low_res_img).images[0]
upscaled_image.save("upsampled_cat.png")

View File

@@ -1,99 +0,0 @@
from diffusers import AutoencoderKL, UNet2DConditionModel
from transformers import CLIPTextModel
from utils import compile_through_fx
import torch
# Hugging Face model id from which each sub-model in this file is loaded.
model_id = "stabilityai/stable-diffusion-x4-upscaler"

# Example inputs for each sub-model; the get_*_mlir helpers pass the matching
# tuple to compile_through_fx as its `inputs` argument.
model_input = {
    "clip": (torch.randint(1, 2, (1, 77)),),
    "vae": (torch.randn(1, 4, 128, 128),),
    "unet": (
        torch.randn(2, 7, 128, 128),  # latents
        torch.tensor([1]).to(torch.float32),  # timestep
        torch.randn(2, 77, 1024),  # embedding
        torch.randn(2).to(torch.int64),  # noise_level
    ),
}
def get_clip_mlir(model_name="clip_text", extra_args=[]):
    """Load the upscaler's CLIP text encoder and compile it through FX.

    Args:
        model_name (str): name given to the compiled module.
        extra_args (list): extra flags forwarded to ``compile_through_fx``.

    Returns:
        The compiled shark module for the text encoder.
    """

    class CLIPText(torch.nn.Module):
        """Wrapper returning only the first output of the text encoder."""

        def __init__(self, encoder):
            super().__init__()
            self.text_encoder = encoder

        def forward(self, input):
            return self.text_encoder(input)[0]

    pretrained_encoder = CLIPTextModel.from_pretrained(
        model_id,
        subfolder="text_encoder",
    )
    return compile_through_fx(
        CLIPText(pretrained_encoder),
        model_input["clip"],
        model_name=model_name,
        extra_args=extra_args,
    )
def get_vae_mlir(model_name="vae", extra_args=[]):
    """Load the upscaler's VAE and compile its decode path through FX.

    Args:
        model_name (str): name given to the compiled module.
        extra_args (list): extra flags forwarded to ``compile_through_fx``.

    Returns:
        The compiled shark module for the VAE decoder.
    """

    class VaeModel(torch.nn.Module):
        """Wrapper whose forward runs ``vae.decode`` and returns item 0."""

        def __init__(self):
            super().__init__()
            self.vae = AutoencoderKL.from_pretrained(
                model_id,
                subfolder="vae",
            )

        def forward(self, input):
            return self.vae.decode(input, return_dict=False)[0]

    return compile_through_fx(
        VaeModel(),
        model_input["vae"],
        model_name=model_name,
        extra_args=extra_args,
    )
def get_unet_mlir(model_name="unet", extra_args=[]):
    """Load the upscaler's UNet and compile it through FX in f16.

    Args:
        model_name (str): name given to the compiled module.
        extra_args (list): extra flags forwarded to ``compile_through_fx``.

    Returns:
        The compiled shark module for the UNet.
    """

    class UnetModel(torch.nn.Module):
        """Wrapper around the pretrained UNet, switched to inference mode."""

        def __init__(self):
            super().__init__()
            self.unet = UNet2DConditionModel.from_pretrained(
                model_id,
                subfolder="unet",
            )
            self.in_channels = self.unet.in_channels
            self.eval()

        def forward(self, latent, timestep, text_embedding, noise_level):
            outputs = self.unet.forward(
                latent,
                timestep,
                text_embedding,
                noise_level,
                return_dict=False,
            )
            return outputs[0]

    # One entry per forward() input, in order; the last one (noise_level)
    # is the only one marked False.
    return compile_through_fx(
        UnetModel(),
        model_input["unet"],
        model_name=model_name,
        is_f16=True,
        f16_input_mask=(True, True, True, False),
        extra_args=extra_args,
    )

View File

@@ -1,53 +0,0 @@
import sys
from model_wrappers import (
get_vae_mlir,
get_unet_mlir,
get_clip_mlir,
)
from upscaler_args import args
from utils import get_shark_model
# One prompt per batch element; anything other than a single prompt aborts
# at import time.
BATCH_SIZE = len(args.prompts)
if not BATCH_SIZE == 1:
    sys.exit("Only batch size 1 is supported.")

# Extra compiler flags passed along with each sub-model.
unet_flag = [
    "--iree-flow-enable-padding-linalg-ops",
    "--iree-flow-linalg-ops-padding-size=32",
    "--iree-flow-enable-conv-img2col-transform",
]

vae_flag = [
    "--iree-flow-enable-conv-nchw-to-nhwc-transform",
    "--iree-flow-enable-padding-linalg-ops",
    "--iree-flow-linalg-ops-padding-size=16",
]

clip_flag = [
    "--iree-flow-linalg-ops-padding-size=16",
    "--iree-flow-enable-padding-linalg-ops",
]

# Location handed to get_shark_model as the tank URL.
bucket = "gs://shark_tank/stable_diffusion/"
def get_unet():
    """Return the compiled upscaler UNet.

    Imported from torch when ``args.import_mlir`` is set, otherwise fetched
    from the shark_tank bucket.
    """
    name = "upscaler_unet"
    return (
        get_unet_mlir(name, unet_flag)
        if args.import_mlir
        else get_shark_model(bucket, name, unet_flag)
    )
def get_vae():
    """Return the compiled upscaler VAE.

    Imported from torch when ``args.import_mlir`` is set, otherwise fetched
    from the shark_tank bucket.
    """
    name = "upscaler_vae"
    return (
        get_vae_mlir(name, vae_flag)
        if args.import_mlir
        else get_shark_model(bucket, name, vae_flag)
    )
def get_clip():
    """Return the compiled upscaler CLIP text encoder.

    Imported from torch when ``args.import_mlir`` is set, otherwise fetched
    from the shark_tank bucket.
    """
    name = "upscaler_clip"
    return (
        get_clip_mlir(name, clip_flag)
        if args.import_mlir
        else get_shark_model(bucket, name, clip_flag)
    )

View File

@@ -1,490 +0,0 @@
import inspect
from typing import Callable, List, Optional, Union
import numpy as np
import torch
import PIL
from PIL import Image
from diffusers.utils import is_accelerate_available
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel
from diffusers import (
DDIMScheduler,
DDPMScheduler,
LMSDiscreteScheduler,
PNDMScheduler,
)
from diffusers import logging
from diffusers.pipeline_utils import ImagePipelineOutput
from opt_params import get_unet, get_vae, get_clip
from tqdm.auto import tqdm
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
def preprocess(image):
    """Turn the pipeline's image argument into a single torch tensor.

    * A tensor is returned unchanged.
    * A PIL image, or a list of them, is resized down to multiples of 64
      (size taken from the first image), stacked, scaled to [-1, 1] and
      moved to channels-first layout.
    * A list of tensors is concatenated along dim 0.

    Any other input is returned as is.
    """
    if isinstance(image, torch.Tensor):
        return image
    if isinstance(image, PIL.Image.Image):
        image = [image]

    first = image[0]
    if isinstance(first, PIL.Image.Image):
        # resize to integer multiple of 64
        width, height = (side - side % 64 for side in first.size)
        frames = [
            np.array(img.resize((width, height)))[None, :] for img in image
        ]
        batch = np.concatenate(frames, axis=0)
        batch = np.array(batch).astype(np.float32) / 255.0
        batch = batch.transpose(0, 3, 1, 2)
        batch = 2.0 * batch - 1.0
        return torch.from_numpy(batch)
    if isinstance(first, torch.Tensor):
        return torch.cat(image, dim=0)
    return image
def shark_run_wrapper(model, *args):
    """Run a shark module's ``forward`` on torch tensors.

    Each positional tensor is detached and converted to numpy, the tuple is
    passed to ``model("forward", ...)`` and the numpy result is wrapped
    back into a torch tensor.
    """
    numpy_inputs = tuple(tensor.detach().numpy() for tensor in args)
    result = model("forward", numpy_inputs)
    return torch.from_numpy(result)
class SharkStableDiffusionUpscalePipeline:
def __init__(
    self,
    model_id,
):
    """Load tokenizer and schedulers, and build the compiled sub-models.

    Args:
        model_id (str): Hugging Face model id the tokenizer and scheduler
            configurations are loaded from.
    """
    self.tokenizer = CLIPTokenizer.from_pretrained(
        model_id, subfolder="tokenizer"
    )
    # NOTE(review): both schedulers are loaded from the "scheduler"
    # subfolder; the upstream model may ship a separate low-res scheduler
    # config — confirm.
    self.low_res_scheduler = DDPMScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
    )
    self.scheduler = DDIMScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
    )
    self.vae = get_vae()
    self.unet = get_unet()
    self.text_encoder = get_clip()
    # Plain int: the previous trailing comma made this the tuple (350,),
    # which cannot be compared against an integer noise level.
    self.max_noise_level = 350
    self._execution_device = "cpu"
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
    self,
    prompt,
    device,
    num_images_per_prompt,
    do_classifier_free_guidance,
    negative_prompt,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `list(int)`):
            prompt to be encoded
        device: (`torch.device`):
            torch device
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`):
            The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
            if `guidance_scale` is less than `1`).

    Returns:
        The text embeddings; with classifier free guidance the
        unconditional embeddings are concatenated in front of them.
    """

    def tile_per_image(embeddings, rows, seq_len):
        # duplicate embeddings for each generation per prompt, using mps friendly method
        tiled = embeddings.repeat(1, num_images_per_prompt, 1)
        return tiled.view(rows * num_images_per_prompt, seq_len, -1)

    batch_size = len(prompt) if isinstance(prompt, list) else 1

    text_input_ids = self.tokenizer(
        prompt,
        padding="max_length",
        max_length=self.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    ).input_ids
    untruncated_ids = self.tokenizer(
        prompt, padding="longest", return_tensors="pt"
    ).input_ids

    # Warn when truncation dropped part of the prompt.
    was_truncated = untruncated_ids.shape[-1] >= text_input_ids.shape[
        -1
    ] and not torch.equal(text_input_ids, untruncated_ids)
    if was_truncated:
        removed_text = self.tokenizer.batch_decode(
            untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
        )
        logger.warning(
            "The following part of your input was truncated because CLIP can only handle sequences up to"
            f" {self.tokenizer.model_max_length} tokens: {removed_text}"
        )

    # NOTE: the attention-mask handling of the upstream pipeline is disabled
    # here; only the input ids are passed to the compiled text encoder.
    text_embeddings = shark_run_wrapper(
        self.text_encoder, text_input_ids.to(device)
    )
    bs_embed, seq_len, _ = text_embeddings.shape
    text_embeddings = tile_per_image(text_embeddings, bs_embed, seq_len)

    if not do_classifier_free_guidance:
        return text_embeddings

    # get unconditional embeddings for classifier free guidance
    uncond_tokens: List[str]
    if negative_prompt is None:
        uncond_tokens = [""] * batch_size
    elif type(prompt) is not type(negative_prompt):
        raise TypeError(
            f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
            f" {type(prompt)}."
        )
    elif isinstance(negative_prompt, str):
        uncond_tokens = [negative_prompt]
    elif batch_size != len(negative_prompt):
        raise ValueError(
            f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
            f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
            " the batch size of `prompt`."
        )
    else:
        uncond_tokens = negative_prompt

    uncond_input = self.tokenizer(
        uncond_tokens,
        padding="max_length",
        max_length=text_input_ids.shape[-1],
        truncation=True,
        return_tensors="pt",
    )
    uncond_embeddings = shark_run_wrapper(
        self.text_encoder,
        uncond_input.input_ids.to(device),
    )
    uncond_embeddings = tile_per_image(
        uncond_embeddings, batch_size, uncond_embeddings.shape[1]
    )

    # For classifier free guidance, we need to do two forward passes.
    # Here we concatenate the unconditional and text embeddings into a single batch
    # to avoid doing two forward passes
    return torch.cat([uncond_embeddings, text_embeddings])
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
    """Build the optional keyword arguments for ``self.scheduler.step``.

    Not all schedulers share the same ``step`` signature, so ``eta`` and
    ``generator`` are included only when ``step`` declares a parameter of
    that name. eta corresponds to η in the DDIM paper
    (https://arxiv.org/abs/2010.02502) and should be between [0, 1].

    Returns:
        dict: a subset of ``{"eta": eta, "generator": generator}``.
    """
    step_parameters = set(
        inspect.signature(self.scheduler.step).parameters.keys()
    )
    candidates = (("eta", eta), ("generator", generator))
    return {
        name: value for name, value in candidates if name in step_parameters
    }
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents with 0.18215->0.08333
def decode_latents(self, latents):
    """Decode latents with the compiled VAE into a channels-last numpy batch.

    The latents are multiplied by ``1 / 0.08333`` before decoding; the
    decoded image is mapped with ``x / 2 + 0.5`` and clamped to [0, 1].
    """
    scaled_latents = 1 / 0.08333 * latents
    decoded = shark_run_wrapper(self.vae, scaled_latents)
    decoded = (decoded / 2 + 0.5).clamp(0, 1)
    # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
    return decoded.cpu().permute(0, 2, 3, 1).float().numpy()
def check_inputs(self, prompt, image, noise_level, callback_steps):
    """Validate the types of `prompt` and `image` and their batch sizes.

    `noise_level` and `callback_steps` are accepted but not inspected.

    Raises:
        ValueError: `prompt` is neither `str` nor `list`; `image` is not a
            tensor, PIL image or list; or a list/tensor `image` does not
            have the same batch size as `prompt`.
    """
    prompt_ok = isinstance(prompt, str) or isinstance(prompt, list)
    if not prompt_ok:
        raise ValueError(
            f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
        )

    image_ok = (
        isinstance(image, torch.Tensor)
        or isinstance(image, PIL.Image.Image)
        or isinstance(image, list)
    )
    if not image_ok:
        raise ValueError(
            f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}"
        )

    # A single PIL image carries no batch dimension to compare against.
    if not (isinstance(image, list) or isinstance(image, torch.Tensor)):
        return

    # verify batch size of prompt and image are same if image is a list or tensor
    batch_size = 1 if isinstance(prompt, str) else len(prompt)
    image_batch_size = (
        len(image) if isinstance(image, list) else image.shape[0]
    )
    if batch_size != image_batch_size:
        raise ValueError(
            f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}."
            " Please make sure that passed `prompt` matches the batch size of `image`."
        )
@staticmethod
def numpy_to_pil(images):
    """
    Convert a numpy image or a batch of images to a list of PIL images.

    A 3-dimensional input is treated as a single image. Values are scaled
    by 255 and rounded to ``uint8``; a trailing channel axis of size 1 is
    rendered in mode "L".
    """
    batch = images[None, ...] if images.ndim == 3 else images
    batch = (batch * 255).round().astype("uint8")

    # special case for grayscale (single channel) images
    if batch.shape[-1] == 1:
        return [Image.fromarray(frame.squeeze(), mode="L") for frame in batch]
    return [Image.fromarray(frame) for frame in batch]
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    device,
    generator,
    latents=None,
):
    """Create (or validate) the initial latents and scale them.

    Args:
        batch_size, num_channels_latents, height, width: the four
            dimensions of the expected latents shape, in that order.
        dtype: dtype of freshly sampled latents.
        device: device the latents end up on.
        generator: random generator passed to ``torch.randn``.
        latents: optional pre-made latents; must already have the expected
            shape.

    Raises:
        ValueError: `latents` was given with a different shape.

    Returns:
        The latents multiplied by ``self.scheduler.init_noise_sigma``.
    """
    expected_shape = (batch_size, num_channels_latents, height, width)

    if latents is not None:
        if latents.shape != expected_shape:
            raise ValueError(
                f"Unexpected latents shape, got {latents.shape}, expected {expected_shape}"
            )
        latents = latents.to(device)
    elif device == "mps":
        # randn does not work reproducibly on mps
        latents = torch.randn(
            expected_shape, generator=generator, device="cpu", dtype=dtype
        ).to(device)
    else:
        latents = torch.randn(
            expected_shape, generator=generator, device=device, dtype=dtype
        )

    # scale the initial noise by the standard deviation required by the scheduler
    return latents * self.scheduler.init_noise_sigma
@torch.no_grad()
def __call__(
    self,
    prompt: Union[str, List[str]],
    image: Union[
        torch.FloatTensor, PIL.Image.Image, List[PIL.Image.Image]
    ],
    num_inference_steps: int = 75,
    guidance_scale: float = 9.0,
    noise_level: int = 20,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[
        Union[torch.Generator, List[torch.Generator]]
    ] = None,
    latents: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[
        Callable[[int, int, torch.FloatTensor], None]
    ] = None,
    callback_steps: Optional[int] = 1,
):
    """Run the upscaling pipeline for ``prompt`` and ``image``.

    Args:
        prompt: One prompt or a list of prompts; a list sets the batch size.
        image: Input image(s), handed to ``preprocess`` before use.
        num_inference_steps: Passed to ``self.scheduler.set_timesteps``.
        guidance_scale: Classifier-free guidance weight. Guidance is only
            applied when this is greater than 1.0.
        noise_level: Timestep given to ``self.low_res_scheduler.add_noise``
            and forwarded to the UNet call.
        negative_prompt: Forwarded to ``self._encode_prompt``.
        num_images_per_prompt: Number of outputs generated per prompt.
        eta: Forwarded to ``self.prepare_extra_step_kwargs``.
        generator: Random generator for the image noise and initial latents.
        latents: Optional initial latents, validated by ``prepare_latents``.
        output_type: ``"pil"`` converts the decoded result with
            ``numpy_to_pil``; any other value returns it unconverted.
        return_dict: When false, a one-element tuple is returned.
        callback: Accepted for interface compatibility only; the callback
            invocation below is commented out, so it is never called.
        callback_steps: Only forwarded to ``check_inputs``.

    Returns:
        ``ImagePipelineOutput(images=...)`` or, when ``return_dict`` is
        false, the tuple ``(images,)``.
    """
    # 1. Check inputs
    self.check_inputs(prompt, image, noise_level, callback_steps)

    # 2. Define call parameters
    batch_size = 1 if isinstance(prompt, str) else len(prompt)
    device = self._execution_device
    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    text_embeddings = self._encode_prompt(
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt,
    )

    # 4. Preprocess image
    image = preprocess(image)
    image = image.to(dtype=text_embeddings.dtype, device=device)

    # 5. Set timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps = self.scheduler.timesteps

    # 6. Add noise to image
    noise_level = torch.tensor(
        [noise_level], dtype=torch.long, device=device
    )
    # `device` may be a torch.device, which never compares equal to the
    # string "mps"; compare the device type instead.
    if torch.device(device).type == "mps":
        # randn does not work reproducibly on mps
        noise = torch.randn(
            image.shape,
            generator=generator,
            device="cpu",
            dtype=text_embeddings.dtype,
        ).to(device)
    else:
        noise = torch.randn(
            image.shape,
            generator=generator,
            device=device,
            dtype=text_embeddings.dtype,
        )
    image = self.low_res_scheduler.add_noise(image, noise, noise_level)

    # Repeat the noised image (and its noise level) once per generated
    # sample, doubled when classifier-free guidance is active.
    batch_multiplier = 2 if do_classifier_free_guidance else 1
    image = torch.cat([image] * batch_multiplier * num_images_per_prompt)
    noise_level = torch.cat([noise_level] * image.shape[0])

    # 7. Prepare latent variables
    height, width = image.shape[2:]
    # num_channels_latents = self.vae.config.latent_channels
    num_channels_latents = 4
    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        text_embeddings.dtype,
        device,
        generator,
        latents,
    )

    # 8. Check that sizes of image and latents match (check currently disabled;
    # `num_channels_image` is only referenced by the commented-out code).
    num_channels_image = image.shape[1]
    # if (
    #     num_channels_latents + num_channels_image
    #     != self.unet.config.in_channels
    # ):
    #     raise ValueError(
    #         f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
    #         f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
    #         f" `num_channels_image`: {num_channels_image} "
    #         f" = {num_channels_latents+num_channels_image}. Please verify the config of"
    #         " `pipeline.unet` or your `image` input."
    #     )

    # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 10. Denoising loop (`num_warmup_steps` is only referenced by the
    # commented-out progress/callback code at the end of the loop).
    num_warmup_steps = (
        len(timesteps) - num_inference_steps * self.scheduler.order
    )
    for i, t in tqdm(enumerate(timesteps)):
        # expand the latents if we are doing classifier free guidance
        latent_model_input = (
            torch.cat([latents] * 2)
            if do_classifier_free_guidance
            else latents
        )
        latent_model_input = self.scheduler.scale_model_input(
            latent_model_input, t
        )
        # concat the scaled latents and the noised image in the channel dimension
        latent_model_input = torch.cat([latent_model_input, image], dim=1)
        timestep = torch.tensor([t]).to(torch.float32)

        # predict the noise residual
        noise_pred = shark_run_wrapper(
            self.unet,
            latent_model_input.half(),
            timestep,
            text_embeddings.half(),
            noise_level,
        )

        # perform guidance
        if do_classifier_free_guidance:
            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (
                noise_pred_text - noise_pred_uncond
            )

        # compute the previous noisy sample x_t -> x_t-1
        latents = self.scheduler.step(
            noise_pred, t, latents, **extra_step_kwargs
        ).prev_sample

        # # call the callback, if provided
        # if i == len(timesteps) - 1 or (
        #     (i + 1) > num_warmup_steps
        #     and (i + 1) % self.scheduler.order == 0
        # ):
        #     progress_bar.update()
        #     if callback is not None and i % callback_steps == 0:
        #         callback(i, t, latents)

    # 11. Post-processing
    # make sure the VAE is in float32 mode, as it overflows in float16
    # self.vae.to(dtype=torch.float32)
    image = self.decode_latents(latents.float())

    # 12. Convert to PIL
    if output_type == "pil":
        image = self.numpy_to_pil(image)

    if not return_dict:
        return (image,)
    return ImagePipelineOutput(images=image)

View File

@@ -1,111 +0,0 @@
import argparse
p = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
##############################################################################
### Stable Diffusion Params
##############################################################################
p.add_argument(
"--prompts",
nargs="+",
default=["cyberpunk forest by Salvador Dali"],
help="text of which images to be generated.",
)
p.add_argument(
"--negative-prompts",
nargs="+",
default=[""],
help="text you don't want to see in the generated image.",
)
p.add_argument(
"--steps",
type=int,
default=50,
help="the no. of steps to do the sampling.",
)
p.add_argument(
"--seed",
type=int,
default=42,
help="the seed to use.",
)
p.add_argument(
"--guidance_scale",
type=float,
default=7.5,
help="the value to be used for guidance scaling.",
)
##############################################################################
### Model Config and Usage Params
##############################################################################
p.add_argument(
"--device", type=str, default="vulkan", help="device to run the model."
)
p.add_argument(
"--precision", type=str, default="fp16", help="precision to run the model."
)
p.add_argument(
"--import_mlir",
default=False,
action=argparse.BooleanOptionalAction,
help="imports the model from torch module to shark_module otherwise downloads the model from shark_tank.",
)
p.add_argument(
"--load_vmfb",
default=True,
action=argparse.BooleanOptionalAction,
help="attempts to load the model from a precompiled flatbuffer and compiles + saves it if not found.",
)
p.add_argument(
"--save_vmfb",
default=False,
action=argparse.BooleanOptionalAction,
help="saves the compiled flatbuffer to the local directory",
)
##############################################################################
### IREE - Vulkan supported flags
##############################################################################
p.add_argument(
"--iree-vulkan-target-triple",
type=str,
default="",
help="Specify target triple for vulkan",
)
p.add_argument(
"--vulkan_debug_utils",
default=False,
action=argparse.BooleanOptionalAction,
help="Profiles vulkan device and collects the .rdc info",
)
p.add_argument(
"--vulkan_large_heap_block_size",
default="4147483648",
help="flag for setting VMA preferredLargeHeapBlockSize for vulkan device, default is 4G",
)
p.add_argument(
"--vulkan_validation_layers",
default=False,
action=argparse.BooleanOptionalAction,
help="flag for disabling vulkan validation layers when benchmarking",
)
args = p.parse_args()

View File

@@ -1,234 +0,0 @@
import os
import torch
from shark.shark_inference import SharkInference
from upscaler_args import args
from shark.shark_importer import import_with_fx
from shark.iree_utils.vulkan_utils import (
set_iree_vulkan_runtime_flags,
get_vulkan_target_triple,
)
def _compile_module(shark_module, model_name, extra_args=[]):
    """Compile ``shark_module`` or load it from a cached ``.vmfb`` file.

    The flatbuffer is looked up and saved in the current working directory
    under ``<model_name>_<device>.vmfb``, where ``://`` in the device string
    is replaced by ``-``. An existing file is loaded only when
    ``args.load_vmfb`` is set and ``args.save_vmfb`` is not; otherwise the
    module is saved and the saved file is loaded. When neither flag is set
    the module is compiled without touching the file system.

    Returns:
        The same ``shark_module`` object, ready for use.
    """
    uses_vmfb_cache = args.load_vmfb or args.save_vmfb
    if not uses_vmfb_cache:
        shark_module.compile(extra_args)
        return shark_module

    # "driver://path" cannot be used in a file name, so flatten it.
    device_tag = args.device.replace("://", "-")
    extended_name = f"{model_name}_{device_tag}"
    working_dir = os.getcwd()
    vmfb_path = os.path.join(working_dir, extended_name + ".vmfb")

    reuse_existing = (
        args.load_vmfb and os.path.isfile(vmfb_path) and not args.save_vmfb
    )
    if reuse_existing:
        print(f"loading existing vmfb from: {vmfb_path}")
        shark_module.load_module(vmfb_path, extra_args=extra_args)
        return shark_module

    if args.save_vmfb:
        print(f"Saving to {vmfb_path}")
    else:
        print(f"No vmfb found. Compiling and saving to {vmfb_path}")
    saved_path = shark_module.save_module(
        working_dir, extended_name, extra_args
    )
    shark_module.load_module(saved_path, extra_args=extra_args)
    return shark_module
# Fetches a model from shark_tank and hands back a ready shark_module.
def get_shark_model(tank_url, model_name, extra_args=[]):
    """Download ``model_name`` from ``tank_url`` and compile or load it.

    The download is requested with the "torch" frontend; only the MLIR model
    from the returned 4-tuple is used. The module is built for
    ``args.device`` with the "linalg" dialect and finished by
    ``_compile_module``.
    """
    from shark.shark_downloader import download_model
    from shark.parser import shark_args

    # Set local shark_tank cache directory.
    # shark_args.local_tank_cache = args.local_tank_cache

    mlir_model, _func_name, _inputs, _golden_out = download_model(
        model_name, tank_url=tank_url, frontend="torch"
    )
    module = SharkInference(
        mlir_model, device=args.device, mlir_dialect="linalg"
    )
    return _compile_module(module, model_name, extra_args)
# Imports a torch module through FX and turns it into a shark_module.
def compile_through_fx(
    model, inputs, model_name, is_f16=False, f16_input_mask=None, extra_args=[]
):
    """Import ``model`` with ``import_with_fx`` and compile or load the result.

    ``is_f16`` and ``f16_input_mask`` are passed straight to
    ``import_with_fx``. The imported module is wrapped in a ``SharkInference``
    for ``args.device`` ("linalg" dialect) and finished by
    ``_compile_module``.
    """
    imported = import_with_fx(model, inputs, is_f16, f16_input_mask)
    mlir_module, _func_name = imported
    module = SharkInference(
        mlir_module, device=args.device, mlir_dialect="linalg"
    )
    return _compile_module(module, model_name, extra_args)
def set_iree_runtime_flags():
    """Pass the Vulkan runtime flags derived from ``args`` on to IREE.

    The heap block size and validation-layer flags are always set. The RGP
    flags are added only when ``args.enable_rgp`` is truthy.
    """
    validation_layers = "true" if args.vulkan_validation_layers else "false"
    vulkan_runtime_flags = [
        f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}",
        f"--vulkan_validation_layers={validation_layers}",
    ]
    # `args` is imported from upscaler_args, whose parser may not define an
    # `enable_rgp` option; read it defensively so the call does not raise
    # AttributeError and RGP stays off by default.
    if getattr(args, "enable_rgp", False):
        vulkan_runtime_flags += [
            "--enable_rgp=true",
            "--vulkan_debug_utils=true",
        ]
    set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)
def get_all_devices(driver_name):
    """List the devices an IREE driver reports, ordered by device path.

    Args:
        driver_name: Name passed to ``iree.runtime.get_driver``.

    Returns:
        The list returned by ``query_available_devices()``, sorted in place
        by each entry's ``"path"`` value.
    """
    from iree.runtime import get_driver

    devices = get_driver(driver_name).query_available_devices()
    devices.sort(key=lambda entry: entry["path"])
    return devices
def get_device_mapping(driver, key_combination=3):
    """Build a lookup from user-facing device strings to device details.

    Keeps device ordering consistent when a specific device is chosen.

    Args:
        driver (str): execution driver (vulkan, cuda, rocm, etc); it is
            translated with ``iree_device_map`` before use.
        key_combination (int, optional): what each entry maps to.
            1 : "<driver>://<path>"
            2 : device name
            3 : (device name, "<driver>://<path>")
            Defaults to 3. Any other value maps every key to ``None``.

    Returns:
        dict: keys are the bare driver name (pointing at the first device),
        "<driver>://<index>" and "<driver>://<path>" for every device.
    """
    from shark.iree_utils._common import iree_device_map

    driver = iree_device_map(driver)
    device_list = get_all_devices(driver)

    def describe(dev):
        if key_combination == 2:
            return dev["name"]
        if key_combination in (1, 3):
            uri = f"{driver}://{dev['path']}"
            return uri if key_combination == 1 else (dev["name"], uri)
        return None

    # The bare driver name selects the default device (driver://0).
    device_map = {f"{driver}": describe(device_list[0])}
    for index, dev in enumerate(device_list):
        entry = describe(dev)
        # Reachable both by position and by full device path.
        device_map[f"{driver}://{index}"] = entry
        device_map[f"{driver}://{dev['path']}"] = entry
    return device_map
def map_device_to_name_path(device, key_combination=3):
    """Resolve a user-selected device string to its supported name and/or path.

    Args:
        device (str): device as typed by the user, e.g. a driver name or a
            "<driver>://<index-or-path>" string. The part before "://" is
            used as the driver.
        key_combination (int, optional): forwarded to ``get_device_mapping``.
            1 : path
            2 : name
            3 : (name, path)
            Defaults to 3.

    Raises:
        ValueError: if ``device`` is not a key of the device mapping.

    Returns:
        str / tuple: the mapped value for ``device``; its form depends on
        ``key_combination``.
    """
    driver, _, _ = device.partition("://")
    known_devices = get_device_mapping(driver, key_combination)
    try:
        return known_devices[device]
    except KeyError:
        raise ValueError(f"Device '{device}' is not a valid device.")
# Normalises `args.device` and derives dependent settings on the shared `args`
# namespace (target triple, max_length, use_tuned). Works by mutating `args`.
# NOTE(review): this reads args.variant, args.use_tuned and args.use_base_vae;
# confirm that the argument parser imported by this module defines them.
def set_init_device_flags():
if "vulkan" in args.device:
# set runtime flags for vulkan.
set_iree_runtime_flags()
# set triple flag to avoid multiple calls to get_vulkan_triple_flag
# map_device_to_name_path returns (name, path); the path replaces args.device.
device_name, args.device = map_device_to_name_path(args.device)
# A target triple supplied by the user is never overridden.
if not args.iree_vulkan_target_triple:
triple = get_vulkan_target_triple(device_name)
if triple is not None:
args.iree_vulkan_target_triple = triple
print(
f"Found device {device_name}. Using target triple {args.iree_vulkan_target_triple}."
)
# Any device string containing "cuda" or "cpu" collapses to the bare name.
elif "cuda" in args.device:
args.device = "cuda"
elif "cpu" in args.device:
args.device = "cpu"
# set max_length based on availability.
# Variants not listed here leave args.max_length untouched.
if args.variant in ["anythingv3", "analogdiffusion", "dreamlike"]:
args.max_length = 77
elif args.variant == "openjourney":
args.max_length = 64
# use tuned models only in the case of stablediffusion/fp16 and rdna3 cards.
if (
args.variant in ["openjourney", "dreamlike"]
or args.precision != "fp16"
or "vulkan" not in args.device
or "rdna3" not in args.iree_vulkan_target_triple
):
args.use_tuned = False
print("Tuned models are currently not supported for this setting.")
# The base VAE is only paired with tuned models for the stablediffusion variant.
elif args.use_base_vae and args.variant != "stablediffusion":
args.use_tuned = False
print("Tuned models are currently not supported for this setting.")
if args.use_tuned:
print("Using tuned models for stablediffusion/fp16 and rdna3 card.")
# Utility to get list of devices available.
def get_available_devices():
    """Return display strings for every usable execution device.

    Vulkan devices are listed first, then CUDA devices, each formatted as
    ``"<driver>://<index> => <device name>"``. ``"cpu"`` is always appended
    last. A driver that cannot be queried contributes no entries.
    """

    def get_devices_by_name(driver_name):
        from shark.iree_utils._common import iree_device_map

        try:
            driver_name = iree_device_map(driver_name)
            device_list_dict = get_all_devices(driver_name)
            print(f"{driver_name} devices are available.")
        except Exception:
            # Probing is best effort, but a bare `except:` would also swallow
            # KeyboardInterrupt and SystemExit; only ordinary errors mean
            # "driver not available".
            print(f"{driver_name} devices are not available.")
            return []
        return [
            f"{driver_name}://{i} => {device['name']}"
            for i, device in enumerate(device_list_dict)
        ]

    set_iree_runtime_flags()

    available_devices = []
    for driver in ("vulkan", "cuda"):
        available_devices.extend(get_devices_by_name(driver))
    available_devices.append("cpu")
    return available_devices

View File

@@ -1,10 +1,8 @@
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_model
from shark.shark_downloader import download_torch_model
mlir_model, func_name, inputs, golden_out = download_model(
"v_diffusion", frontend="torch"
)
mlir_model, func_name, inputs, golden_out = download_torch_model("v_diffusion")
shark_module = SharkInference(
mlir_model, func_name, device="vulkan", mlir_dialect="linalg"

View File

@@ -1,7 +1,7 @@
import torch
from torch.nn.utils import _stateless
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from shark.shark_trainer import SharkTrainer
from shark.shark_runner import SharkTrainer
class MiniLMSequenceClassification(torch.nn.Module):
@@ -42,7 +42,6 @@ def forward(params, buffers, args):
return params, buffers
shark_module = SharkTrainer(mod, inp)
shark_module.compile(forward)
shark_module = SharkTrainer(mod, inp, custom_inference_fn=forward)
print(shark_module.train())
print(shark_module.forward())

View File

@@ -52,8 +52,7 @@ class BertModule(tf.Module):
input_signature=[
bert_input, # inputs
tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32), # labels
],
jit_compile=True,
]
)
def forward(self, inputs, labels):
with tf.GradientTape() as tape:

View File

@@ -1,41 +0,0 @@
# Stable Diffusion Img2Img model
## Installation
<details>
<summary>Installation (Linux)</summary>
### Activate shark.venv Virtual Environment
```shell
source shark.venv/bin/activate
# Some older pip installs may not be able to handle the recent PyTorch deps
python -m pip install --upgrade pip
```
### Install dependencies
# Run the setup.sh script
```shell
./setup.sh
```
### Run the Stable diffusion Img2Img model
To run the model with the default set of images and params, run:
```shell
python stable_diffusion_img2img.py
```
To run the model with your own set of images and parameters, you need to specify the following params:
1.) Input images directory with the arg `--input_dir` containing 3-5 images.
2.) What to teach the model? Using the arg `--what_to_teach`, allowed values are `object` or `style`.
3.) Placeholder token using the arg `--placeholder_token`, that represents your new concept. It should be passed with the opening and closing angle brackets. For ex: token is `cat-toy`, it should be passed as `<cat-toy>`.
4.) Initializer token using the arg `--initializer_token`: a single word that summarises your new concept.
For the result, you need to pass the text prompt with the arg: `--prompt`. The prompt string should contain a "*s" in it, which will be replaced by the placeholder token during the inference.
By default the result images will go into the `sd_result` dir. To specify your output dir use the arg: `--output_dir`.
The default value of max_training_steps is `3000`, which takes some hours to complete. You can pass a smaller value with the arg `--training_steps`. Specify the number of images to be sampled for the result with the `--num_inference_samples` arg.

View File

@@ -1,25 +0,0 @@
#!/bin/bash
# Downloads the sample training images into ./input_images and installs the
# Python packages needed by the textual-inversion example.
# Set PYTHON to choose the interpreter; it defaults to the python3 on PATH.

# Directory containing this script (quoted so paths with spaces work).
TD="$(cd "$(dirname "$0")" && pwd)"
if [ -z "$PYTHON" ]; then
  PYTHON="$(command -v python3)"
fi

function die() {
  echo "Error executing command: $*"
  exit 1
}

PYTHON_VERSION_X_Y=$("${PYTHON}" -c 'import sys; version=sys.version_info[:2]; print("{0}.{1}".format(*version))')

echo "Python: $PYTHON"
echo "Python version: $PYTHON_VERSION_X_Y"

# -p keeps the script re-runnable when the directory already exists.
mkdir -p input_images || die "mkdir -p input_images"
for image_id in 2 3 5 6; do
  wget "https://huggingface.co/datasets/valhalla/images/resolve/main/${image_id}.jpeg" -P input_images/
done

# The requirement is quoted so the shell cannot treat [training] as a glob
# pattern; installing through $PYTHON targets the interpreter reported above.
"${PYTHON}" -m pip install 'diffusers[training]==0.4.1' transformers ftfy opencv-python \
  || die "pip install"

View File

@@ -1,597 +0,0 @@
# Textual-inversion fine-tuning for Stable Diffusion using diffusers
# This script shows how to "teach" Stable Diffusion a new concept via
# textual-inversion using 🤗 Hugging Face [🧨 Diffusers library](https://github.com/huggingface/diffusers).
# By using just 3-5 images you can teach new concepts to Stable Diffusion
# and personalize the model on your own images.
import argparse
import itertools
import math
import os
import random
import cv2
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch.utils.data import Dataset
import PIL
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from diffusers import (
AutoencoderKL,
DDPMScheduler,
PNDMScheduler,
StableDiffusionPipeline,
UNet2DConditionModel,
)
from diffusers.hub_utils import init_git_repo, push_to_hub
from diffusers.optimization import get_scheduler
from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
from PIL import Image
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
# Hugging Face access token used for the `use_auth_token=` arguments below.
# It is read from the environment so that no credential lives in the source
# tree; when neither variable is set the value is None and no token is sent.
YOUR_TOKEN = os.environ.get("HUGGING_FACE_HUB_TOKEN") or os.environ.get(
    "HF_TOKEN"
)

p = argparse.ArgumentParser(
    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
p.add_argument(
    "--input_dir",
    type=str,
    default="input_images/",
    help="the directory contains the images used for fine tuning",
)
p.add_argument(
    "--output_dir",
    type=str,
    default="sd_result",
    help="the directory where the generated images are saved",
)
p.add_argument(
    "--training_steps",
    type=int,
    default=3000,
    help="the maximum number of training steps",
)
p.add_argument("--seed", type=int, default=42, help="the random seed")
p.add_argument(
    "--what_to_teach",
    type=str,
    choices=["object", "style"],
    default="object",
    help="what is it that you are teaching?",
)
p.add_argument(
    "--placeholder_token",
    type=str,
    default="<cat-toy>",
    help="It is the token you are going to use to represent your new concept",
)
p.add_argument(
    "--initializer_token",
    type=str,
    default="toy",
    help="It is a word that can summarise what is your new concept",
)
p.add_argument(
    "--inference_steps",
    type=int,
    default=50,
    help="the number of steps for inference",
)
p.add_argument(
    "--num_inference_samples",
    type=int,
    default=4,
    help="the number of samples for inference",
)
p.add_argument(
    "--prompt",
    type=str,
    default="a grafitti in a wall with a *s on it",
    help="the text prompt to use",
)
args = p.parse_args()

if "*s" not in args.prompt:
    raise ValueError(
        'The prompt should have a "*s" which will be replaced by a placeholder token.'
    )
# str.replace handles any number of "*s" markers; unpacking the result of
# split("*s") into two names raised ValueError when the marker appeared twice.
args.prompt = args.prompt.replace("*s", args.placeholder_token)

pretrained_model_name_or_path = "CompVis/stable-diffusion-v1-4"

# Load input images. Files that OpenCV cannot decode (imread returns None)
# are skipped.
images = []
for filename in os.listdir(args.input_dir):
    img = cv2.imread(os.path.join(args.input_dir, filename))
    if img is not None:
        images.append(img)
# Setup the prompt templates for training.
# Each template has one "{}" slot that is filled with the placeholder token.
# This list is used when the learnable property is anything other than "style".
imagenet_templates_small = [
"a photo of a {}",
"a rendering of a {}",
"a cropped photo of the {}",
"the photo of a {}",
"a photo of a clean {}",
"a photo of a dirty {}",
"a dark photo of the {}",
"a photo of my {}",
"a photo of the cool {}",
"a close-up photo of a {}",
"a bright photo of the {}",
"a cropped photo of a {}",
"a photo of the {}",
"a good photo of the {}",
"a photo of one {}",
"a close-up photo of the {}",
"a rendition of the {}",
"a photo of the clean {}",
"a rendition of a {}",
"a photo of a nice {}",
"a good photo of a {}",
"a photo of the nice {}",
"a photo of the small {}",
"a photo of the weird {}",
"a photo of the large {}",
"a photo of a cool {}",
"a photo of a small {}",
]
# Templates used when the learnable property is "style".
# NOTE(review): two entries appear twice ("a cropped painting ..." and
# "a close-up painting ..."), which makes them twice as likely to be drawn by
# random.choice -- confirm whether that is intended.
imagenet_style_templates_small = [
"a painting in the style of {}",
"a rendering in the style of {}",
"a cropped painting in the style of {}",
"the painting in the style of {}",
"a clean painting in the style of {}",
"a dirty painting in the style of {}",
"a dark painting in the style of {}",
"a picture in the style of {}",
"a cool painting in the style of {}",
"a close-up painting in the style of {}",
"a bright painting in the style of {}",
"a cropped painting in the style of {}",
"a good painting in the style of {}",
"a close-up painting in the style of {}",
"a rendition in the style of {}",
"a nice painting in the style of {}",
"a small painting in the style of {}",
"a weird painting in the style of {}",
"a large painting in the style of {}",
]
# Setup the dataset
class TextualInversionDataset(Dataset):
    """Image/prompt pairs for textual-inversion training.

    Every file in ``data_root`` is treated as a training image. Each item
    pairs one image (resized, randomly flipped, scaled to [-1, 1]) with the
    token ids of a randomly chosen prompt template that mentions
    ``placeholder_token``.
    """

    def __init__(
        self,
        data_root,
        tokenizer,
        learnable_property="object",  # [object, style]
        size=512,
        repeats=100,
        interpolation="bicubic",
        flip_p=0.5,
        set="train",
        placeholder_token="*",
        center_crop=False,
    ):
        """
        Args:
            data_root: Directory whose files are used as training images.
            tokenizer: Tokenizer used to encode the prompt text.
            learnable_property: "style" selects the style templates, any
                other value the object templates.
            size: Edge length of the square output image.
            repeats: How many times the image list is repeated when ``set``
                is "train".
            interpolation: One of "linear", "bilinear", "bicubic", "lanczos".
            flip_p: Probability of a horizontal flip.
            set: "train" multiplies the dataset length by ``repeats``.
            placeholder_token: Token inserted into the prompt templates.
            center_crop: Crop to a centred square before resizing.
        """
        self.data_root = data_root
        self.tokenizer = tokenizer
        self.learnable_property = learnable_property
        self.size = size
        self.placeholder_token = placeholder_token
        self.center_crop = center_crop
        self.flip_p = flip_p

        self.image_paths = [
            os.path.join(self.data_root, file_path)
            for file_path in os.listdir(self.data_root)
        ]

        self.num_images = len(self.image_paths)
        self._length = self.num_images
        if set == "train":
            self._length = self.num_images * repeats

        self.interpolation = {
            # PIL.Image.LINEAR was only an alias of BILINEAR and was removed
            # in Pillow 10; referencing it made this eager dict fail for
            # every interpolation choice.
            "linear": PIL.Image.BILINEAR,
            "bilinear": PIL.Image.BILINEAR,
            "bicubic": PIL.Image.BICUBIC,
            "lanczos": PIL.Image.LANCZOS,
        }[interpolation]

        self.templates = (
            imagenet_style_templates_small
            if learnable_property == "style"
            else imagenet_templates_small
        )
        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)

    def __len__(self):
        return self._length

    def __getitem__(self, i):
        """Return ``{"input_ids": ..., "pixel_values": ...}`` for item ``i``."""
        example = {}

        # The index wraps around because the length may be a multiple of the
        # image count. The context manager releases the file handle; the
        # converted image is an independent in-memory copy.
        with Image.open(self.image_paths[i % self.num_images]) as opened:
            image = opened.convert("RGB")

        placeholder_string = self.placeholder_token
        text = random.choice(self.templates).format(placeholder_string)

        example["input_ids"] = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.tokenizer.model_max_length,
            return_tensors="pt",
        ).input_ids[0]

        # default to score-sde preprocessing
        img = np.array(image).astype(np.uint8)

        if self.center_crop:
            h, w = img.shape[0], img.shape[1]
            crop = min(h, w)
            img = img[
                (h - crop) // 2 : (h + crop) // 2,
                (w - crop) // 2 : (w + crop) // 2,
            ]

        image = Image.fromarray(img)
        image = image.resize(
            (self.size, self.size), resample=self.interpolation
        )

        image = self.flip_transform(image)
        image = np.array(image).astype(np.uint8)
        # Map [0, 255] to [-1, 1] and reorder HWC -> CHW.
        image = (image / 127.5 - 1.0).astype(np.float32)

        example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
        return example
# Setting up the model
# Load the tokenizer and add the placeholder token as an additional special token.
# Please read and if you agree accept the LICENSE
# [here](https://huggingface.co/CompVis/stable-diffusion-v1-4) if you see an error
tokenizer = CLIPTokenizer.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="tokenizer",
    use_auth_token=YOUR_TOKEN,
)

# Add the placeholder token in tokenizer
num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
if num_added_tokens == 0:
    raise ValueError(
        f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
        " `placeholder_token` that is not already in the tokenizer."
    )

# Get token ids for our placeholder and initializer token.
# Convert the initializer_token, placeholder_token to ids
token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
# The initializer must encode to exactly one token. Checking `!= 1` also
# rejects an initializer that encodes to nothing, which would otherwise
# surface as an IndexError on the next line.
if len(token_ids) != 1:
    raise ValueError("The initializer token must be a single token.")

initializer_token_id = token_ids[0]
placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)

# Load the Stable Diffusion model
# Load models and create wrapper for stable diffusion
text_encoder = CLIPTextModel.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="text_encoder",
    use_auth_token=YOUR_TOKEN,
)
vae = AutoencoderKL.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="vae",
    use_auth_token=YOUR_TOKEN,
)
unet = UNet2DConditionModel.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="unet",
    use_auth_token=YOUR_TOKEN,
)

# We have added the `placeholder_token` to the `tokenizer`, so we resize the token embeddings here;
# this adds a new embedding vector to the token embeddings for our `placeholder_token`.
text_encoder.resize_token_embeddings(len(tokenizer))

# Initialise the newly added placeholder token with the embeddings of the initializer token
token_embeds = text_encoder.get_input_embeddings().weight.data
token_embeds[placeholder_token_id] = token_embeds[initializer_token_id]
# Textual inversion trains only the newly added embedding vector, so every
# other model parameter gets frozen through this helper.
def freeze_params(params):
    """Turn off gradient tracking for each parameter in ``params``."""
    for weight in params:
        weight.requires_grad = False
# The VAE and the UNet are frozen completely.
for _frozen_model in (vae, unet):
    freeze_params(_frozen_model.parameters())

# In the text encoder everything is frozen except the token embeddings.
_text_model = text_encoder.text_model
params_to_freeze = itertools.chain(
    _text_model.encoder.parameters(),
    _text_model.final_layer_norm.parameters(),
    _text_model.embeddings.position_embedding.parameters(),
)
freeze_params(params_to_freeze)

# Training data: the images in --input_dir, each repeated 100 times.
train_dataset = TextualInversionDataset(
    data_root=args.input_dir,
    tokenizer=tokenizer,
    learnable_property=args.what_to_teach,  # "object" or "style"
    placeholder_token=args.placeholder_token,
    size=512,
    repeats=100,
    center_crop=False,
    set="train",
)
def create_dataloader(train_batch_size=1):
    """Wrap the module-level ``train_dataset`` in a shuffling DataLoader."""
    loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=train_batch_size,
        shuffle=True,
    )
    return loader
# Noise scheduler used to add noise to the latents during training.
noise_scheduler = DDPMScheduler(
    num_train_timesteps=1000,
    beta_schedule="scaled_linear",
    beta_start=0.00085,
    beta_end=0.012,
    tensor_format="pt",
)

# Training hyperparameters. "output_dir" is where the trained pipeline is
# saved; it is separate from the --output_dir used for generated images.
hyperparameters = dict(
    learning_rate=5e-04,
    scale_lr=True,
    max_train_steps=args.training_steps,
    train_batch_size=1,
    gradient_accumulation_steps=4,
    seed=args.seed,
    output_dir="sd-concept-output",
)
# Trains the placeholder-token embedding of `text_encoder` while `vae` and
# `unet` stay in eval mode, then (on the main process) saves a
# StableDiffusionPipeline plus the learned embedding to
# hyperparameters["output_dir"].
# Reads the module-level `hyperparameters`, `train_dataset`, `tokenizer`,
# `noise_scheduler`, `placeholder_token_id` and `args`.
# NOTE(review): hyperparameters["seed"] is not read anywhere in this function
# -- confirm whether seeding was intended.
def training_function(text_encoder, vae, unet):
logger = get_logger(__name__)
train_batch_size = hyperparameters["train_batch_size"]
gradient_accumulation_steps = hyperparameters[
"gradient_accumulation_steps"
]
learning_rate = hyperparameters["learning_rate"]
max_train_steps = hyperparameters["max_train_steps"]
output_dir = hyperparameters["output_dir"]
accelerator = Accelerator(
gradient_accumulation_steps=gradient_accumulation_steps,
)
train_dataloader = create_dataloader(train_batch_size)
# Scale the learning rate by the effective batch size (accumulation steps *
# per-device batch size * number of processes).
if hyperparameters["scale_lr"]:
learning_rate = (
learning_rate
* gradient_accumulation_steps
* train_batch_size
* accelerator.num_processes
)
# Initialize the optimizer
optimizer = torch.optim.AdamW(
text_encoder.get_input_embeddings().parameters(), # only optimize the embeddings
lr=learning_rate,
)
text_encoder, optimizer, train_dataloader = accelerator.prepare(
text_encoder, optimizer, train_dataloader
)
# Move vae and unet to device
vae.to(accelerator.device)
unet.to(accelerator.device)
# Keep vae and unet in eval mode as we don't train these
vae.eval()
unet.eval()
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
num_update_steps_per_epoch = math.ceil(
len(train_dataloader) / gradient_accumulation_steps
)
num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
# Train!
total_batch_size = (
train_batch_size
* accelerator.num_processes
* gradient_accumulation_steps
)
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Instantaneous batch size per device = {train_batch_size}")
logger.info(
f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
)
logger.info(
f" Gradient Accumulation steps = {gradient_accumulation_steps}"
)
logger.info(f" Total optimization steps = {max_train_steps}")
# Only show the progress bar once on each machine.
progress_bar = tqdm(
range(max_train_steps), disable=not accelerator.is_local_main_process
)
progress_bar.set_description("Steps")
global_step = 0
for epoch in range(num_train_epochs):
text_encoder.train()
for step, batch in enumerate(train_dataloader):
with accelerator.accumulate(text_encoder):
# Convert images to latent space
latents = (
vae.encode(batch["pixel_values"])
.latent_dist.sample()
.detach()
)
# Constant latent scaling factor applied to the sampled VAE latents.
latents = latents * 0.18215
# Sample noise that we'll add to the latents
noise = torch.randn(latents.shape).to(latents.device)
bsz = latents.shape[0]
# Sample a random timestep for each image
timesteps = torch.randint(
0,
noise_scheduler.num_train_timesteps,
(bsz,),
device=latents.device,
).long()
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_latents = noise_scheduler.add_noise(
latents, noise, timesteps
)
# Get the text embedding for conditioning
encoder_hidden_states = text_encoder(batch["input_ids"])[0]
# Predict the noise residual
noise_pred = unet(
noisy_latents, timesteps, encoder_hidden_states
).sample
# Mean squared error against the sampled noise: averaged per sample over
# dims 1-3, then over the batch.
loss = (
F.mse_loss(noise_pred, noise, reduction="none")
.mean([1, 2, 3])
.mean()
)
accelerator.backward(loss)
# Zero out the gradients for all token embeddings except the newly added
# embeddings for the concept, as we only want to optimize the concept embeddings
# With more than one process the encoder is accessed through `.module`.
if accelerator.num_processes > 1:
grads = (
text_encoder.module.get_input_embeddings().weight.grad
)
else:
grads = text_encoder.get_input_embeddings().weight.grad
# Get the index for tokens that we want to zero the grads for
index_grads_to_zero = (
torch.arange(len(tokenizer)) != placeholder_token_id
)
grads.data[index_grads_to_zero, :] = grads.data[
index_grads_to_zero, :
].fill_(0)
optimizer.step()
optimizer.zero_grad()
# Checks if the accelerator has performed an optimization step behind the scenes
if accelerator.sync_gradients:
progress_bar.update(1)
global_step += 1
logs = {"loss": loss.detach().item()}
progress_bar.set_postfix(**logs)
# NOTE(review): this `break` leaves only the inner batch loop; the epoch loop
# above has no matching check -- confirm that is acceptable.
if global_step >= max_train_steps:
break
accelerator.wait_for_everyone()
# Create the pipeline using the trained modules and save it.
if accelerator.is_main_process:
pipeline = StableDiffusionPipeline(
text_encoder=accelerator.unwrap_model(text_encoder),
vae=vae,
unet=unet,
tokenizer=tokenizer,
scheduler=PNDMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
skip_prk_steps=True,
),
safety_checker=StableDiffusionSafetyChecker.from_pretrained(
"CompVis/stable-diffusion-safety-checker"
),
feature_extractor=CLIPFeatureExtractor.from_pretrained(
"openai/clip-vit-base-patch32"
),
)
pipeline.save_pretrained(output_dir)
# Also save the newly trained embeddings
learned_embeds = (
accelerator.unwrap_model(text_encoder)
.get_input_embeddings()
.weight[placeholder_token_id]
)
# Stored as {placeholder token: embedding tensor on the CPU}.
learned_embeds_dict = {
args.placeholder_token: learned_embeds.detach().cpu()
}
torch.save(
learned_embeds_dict, os.path.join(output_dir, "learned_embeds.bin")
)
import accelerate

# Run the training function in-process with a single worker process.
accelerate.notebook_launcher(
    training_function, args=(text_encoder, vae, unet), num_processes=1
)

# Set up the pipeline from the directory named in the hyperparameters.
pipe = StableDiffusionPipeline.from_pretrained(
    hyperparameters["output_dir"],
    # torch_dtype=torch.float16,
)

# Run the pipeline `args.num_inference_samples` times on the same prompt and
# pool every returned image into one list.
all_images = []
for _ in range(args.num_inference_samples):
    images = pipe(
        [args.prompt],
        num_inference_steps=args.inference_steps,
        guidance_scale=7.5,
    ).images
    all_images.extend(images)

# output_path = os.path.abspath(os.path.join(os.getcwd(), args.output_dir))
# makedirs(exist_ok=True) also creates missing parent directories and avoids
# the check-then-create race of isdir() followed by mkdir().
os.makedirs(args.output_dir, exist_ok=True)

# Save each image as <index>.jpeg; a plain loop is used because the calls are
# made purely for their side effect.
for i, image in enumerate(all_images):
    image.save(f"{args.output_dir}/{i}.jpeg")

View File

@@ -21,6 +21,7 @@ import torch
from iree.runtime import DeviceArray
from torch_mlir._mlir_libs._mlir.ir import Module
from torch_mlir.compiler_utils import (
get_module_name_for_debug_dump,
run_pipeline_with_repro_report,
)
from torch_mlir.eager_mode.torch_mlir_eager_backend import (
@@ -63,13 +64,14 @@ class EagerModeIREELinalgOnTensorsBackend(TorchMLIREagerBackend):
)
def compile(self, imported_module: Module):
fn_name = get_module_name_for_debug_dump(imported_module)
run_pipeline_with_repro_report(
imported_module,
"torch-function-to-torch-backend-pipeline,torch-backend-to-linalg-on-tensors-backend-pipeline",
"EagerMode",
)
callable, _ = get_iree_compiled_module(
imported_module, self.raw_device_str
imported_module, self.raw_device_str, func_name=fn_name
)
return callable

View File

@@ -37,19 +37,7 @@ def run_cmd(cmd):
sys.exit("Exiting program due to error running:", cmd)
def iree_device_map(device):
    """Translate a device string into its IREE driver string.

    Args:
        device: Either a bare device key (e.g. ``"vulkan"``) or a URI of the
            form ``"<key>://<path>"``.

    Returns:
        The value of ``_IREE_DEVICE_MAP`` for the key; when a ``"://"`` is
        present, the text after the first ``"://"`` is re-attached unchanged.

    Raises:
        KeyError: If the key is not present in ``_IREE_DEVICE_MAP``.
    """
    # Split on the first "://" only, so a path that itself contains "://"
    # is carried through intact instead of being truncated.
    scheme, separator, path = device.partition("://")
    driver = _IREE_DEVICE_MAP[scheme]
    if not separator:
        return driver
    return f"{driver}://{path}"
def get_supported_device_list():
    """Return the device keys of ``_IREE_DEVICE_MAP`` as a new list."""
    return [device_key for device_key in _IREE_DEVICE_MAP.keys()]
_IREE_DEVICE_MAP = {
IREE_DEVICE_MAP = {
"cpu": "local-task",
"cuda": "cuda",
"vulkan": "vulkan",
@@ -58,14 +46,7 @@ _IREE_DEVICE_MAP = {
"intel-gpu": "level_zero",
}
def iree_target_map(device):
    """Look up the IREE compiler target backend for a device string.

    Anything from the first ``"://"`` onwards is ignored, so ``"vulkan://0"``
    and ``"vulkan"`` resolve to the same entry of ``_IREE_TARGET_MAP``.
    Raises ``KeyError`` for a key that is not in the map.
    """
    target_key = device.split("://")[0] if "://" in device else device
    return _IREE_TARGET_MAP[target_key]
_IREE_TARGET_MAP = {
IREE_TARGET_MAP = {
"cpu": "llvm-cpu",
"cuda": "cuda",
"vulkan": "vulkan",
@@ -74,13 +55,9 @@ _IREE_TARGET_MAP = {
"intel-gpu": "opencl-spirv",
}
# Finds whether the required drivers are installed for the given device.
def check_device_drivers(device):
"""Checks necessary drivers present for gpu and vulkan devices"""
if "://" in device:
device = device.split("://")[0]
if device == "cuda":
try:
subprocess.check_output("nvidia-smi")

View File

@@ -13,13 +13,12 @@
# limitations under the License.
import iree.runtime.scripts.iree_benchmark_module as benchmark_module
from shark.iree_utils._common import run_cmd, iree_device_map
from shark.iree_utils.cpu_utils import get_cpu_count
from shark.iree_utils._common import run_cmd, IREE_DEVICE_MAP
import numpy as np
import os
import re
UNIT_TO_SECOND_MAP = {"us": 1e-6, "ms": 0.001, "s": 1}
UNIT_TO_SECOND_MAP = {"ms": 0.001, "s": 1}
def tensor_to_type_str(input_tensors: tuple, mlir_dialect: str):
@@ -70,40 +69,10 @@ def build_benchmark_args(
# TODO: Replace name of train with actual train fn name.
fn_name = "train"
benchmark_cl.append(f"--entry_function={fn_name}")
benchmark_cl.append(f"--device={iree_device_map(device)}")
benchmark_cl.append(f"--device={IREE_DEVICE_MAP[device]}")
mlir_input_types = tensor_to_type_str(input_tensors, mlir_dialect)
for mlir_input in mlir_input_types:
benchmark_cl.append(f"--function_input={mlir_input}")
if device == "cpu":
num_cpus = get_cpu_count()
if num_cpus is not None:
benchmark_cl.append(f"--task_topology_max_group_count={num_cpus}")
time_extractor = "| awk 'END{{print $2 $3}}'"
benchmark_cl.append(time_extractor)
return benchmark_cl
def build_benchmark_args_non_tensor_input(
    input_file: str,
    device: str,
    inputs: tuple,
    mlir_dialect: str,
    function_name: str,
):
    """
    Build the iree-benchmark-module command line for already-formatted inputs.

    Inputs: input_file is the path passed as --module_file, device is mapped
    through iree_device_map, and each element of inputs is emitted verbatim
    as a --function_input value. function_name, when non-empty, is passed as
    --entry_function. mlir_dialect is accepted but not used here.
    Outputs: list of command-line fragments, ending with an awk filter
    fragment.
    """
    tool_root = benchmark_module.__path__[0]
    tool_path = os.path.join(tool_root, "..", "..", "iree-benchmark-module")
    command = [tool_path, f"--module_file={input_file}"]
    # TODO: The function named can be passed as one of the args.
    if function_name:
        command.append(f"--entry_function={function_name}")
    command.append(f"--device={iree_device_map(device)}")
    command.extend(f"--function_input={value}" for value in inputs)
    command.append("| awk 'END{{print $2 $3}}'")
    return command

View File

@@ -13,37 +13,25 @@
# limitations under the License.
import iree.runtime as ireert
import iree.compiler as ireec
from shark.iree_utils._common import iree_device_map, iree_target_map
from shark.iree_utils.benchmark_utils import *
from shark.parser import shark_args
from shark.iree_utils._common import IREE_DEVICE_MAP, IREE_TARGET_MAP
import numpy as np
import os
import re
# Get the iree-compile arguments given device.
def get_iree_device_args(device, extra_args=[]):
device_uri = device.split("://")
if len(device_uri) > 1:
if device_uri[0] not in ["vulkan"]:
print(
f"Specific device selection only supported for vulkan now."
f"Proceeding with {device} as device."
)
if device_uri[0] == "cpu":
def get_iree_device_args(device):
if device == "cpu":
from shark.iree_utils.cpu_utils import get_iree_cpu_args
return get_iree_cpu_args()
if device_uri[0] == "cuda":
if device == "cuda":
from shark.iree_utils.gpu_utils import get_iree_gpu_args
return get_iree_gpu_args()
if device_uri[0] in ["metal", "vulkan"]:
if device in ["metal", "vulkan"]:
from shark.iree_utils.vulkan_utils import get_iree_vulkan_args
return get_iree_vulkan_args(extra_args=extra_args)
if device_uri[0] == "rocm":
return get_iree_vulkan_args()
if device == "rocm":
from shark.iree_utils.gpu_utils import get_iree_rocm_args
return get_iree_rocm_args()
@@ -70,181 +58,17 @@ def get_iree_common_args():
return [
"--iree-stream-resource-index-bits=64",
"--iree-vm-target-index-bits=64",
"--iree-util-zero-fill-elided-attrs",
]
# Args that are suitable only for certain models or groups of models.
# shark_args are passed down from pytests to control which models compile with these flags,
# but they can also be set in shark/parser.py
def get_model_specific_args():
    """Return compiler flags that are enabled per model via ``shark_args``.

    Currently only the NCHW-to-NHWC convolution transform flag is emitted,
    and only when ``shark_args.enable_conv_transform`` compares equal to True.
    """
    if shark_args.enable_conv_transform == True:
        return ["--iree-flow-enable-conv-nchw-to-nhwc-transform"]
    return []
def create_dispatch_dirs(bench_dir, device):
    """Sort loose dispatch files into one sub-directory per dispatch.

    Every regular file directly under ``bench_dir`` (except the protected
    summary file) is moved into ``bench_dir/<name-without-extension>/``; an
    existing directory of that name is removed first. Then every regular file
    in the sibling directory ``temp_<basename of bench_dir>`` whose name
    contains one of those directory names followed by a non-digit is moved to
    ``<dir>/<dir>_benchmark.mlir``.

    Args:
        bench_dir: "/"-separated path of the dispatch directory.
        device: Unused by this function.

    The moves are done through shell commands (``rm``, ``mkdir``, ``mv``).
    """
    protected_files = ["ordered-dispatches.txt"]
    bench_dir_path = bench_dir.split("/")
    bench_dir_path[-1] = "temp_" + bench_dir_path[-1]
    tmp_bench_dir = "/".join(bench_dir_path)
    for f_ in os.listdir(bench_dir):
        if os.path.isfile(f"{bench_dir}/{f_}") and f_ not in protected_files:
            # Raw string: "\." and "\S" are regex escapes, not string escapes.
            dir_name = re.sub(r"\.\S*$", "", f_)
            if os.path.exists(f"{bench_dir}/{dir_name}"):
                os.system(f"rm -rf {bench_dir}/{dir_name}")
            os.system(f"mkdir {bench_dir}/{dir_name}")
            os.system(f"mv {bench_dir}/{f_} {bench_dir}/{dir_name}/{f_}")
    for f_ in os.listdir(tmp_bench_dir):
        if os.path.isfile(f"{tmp_bench_dir}/{f_}"):
            dir_name = ""
            for d_ in os.listdir(bench_dir):
                # The directory name is matched literally (escaped); the
                # lookahead requires a non-digit after it so that e.g.
                # "dispatch_1" does not match inside "dispatch_10".
                if re.search(re.escape(d_) + r"(?=\D)", f_):
                    dir_name = d_
            if dir_name != "":
                os.system(
                    f"mv {tmp_bench_dir}/{f_} {bench_dir}/{dir_name}/{dir_name}_benchmark.mlir"
                )
def dump_isas(bench_dir):
    """Run ``amdllpc`` on every ``.spv`` file found one level below ``bench_dir``.

    For each sub-directory of ``bench_dir``, each file ending in ``.spv`` is
    passed to ``amdllpc`` through the shell and the tool's output is
    redirected to ``isa.txt`` in that same sub-directory. The redirect
    truncates, so with several ``.spv`` files in one directory only the last
    one processed remains in ``isa.txt``.

    Args:
        bench_dir: Directory containing one sub-directory per dispatch.
    """
    for d_ in os.listdir(bench_dir):
        # Only sub-directories are inspected; loose files are skipped.
        if os.path.isdir(f"{bench_dir}/{d_}"):
            for f_ in os.listdir(f"{bench_dir}/{d_}"):
                if f_.endswith(".spv"):
                    # NOTE(review): "-gfxip 11.0" is hard-coded; presumably it
                    # selects one specific GPU generation - confirm.
                    os.system(
                        f"amdllpc -gfxip 11.0 {bench_dir}/{d_}/{f_} -v > \
{bench_dir}/{d_}/isa.txt"
                    )
def compile_benchmark_dirs(bench_dir, device, dispatch_benchmarks):
    """Compile and benchmark the selected dispatch directories.

    Args:
        bench_dir: Directory holding one sub-directory per dispatch.
        device: Device string, forwarded to ``iree_target_map``,
            ``get_iree_runtime_config`` and the benchmark argument builder.
        dispatch_benchmarks: Either ``"all"`` (case-insensitive, surrounding
            whitespace ignored) or space-separated integers. A sub-directory
            is selected when the decimal text of any of those integers occurs
            in its name.

    Returns:
        None. On an unparsable ``dispatch_benchmarks`` an error is printed and
        the function returns early without touching any files.

    Side effects, per selected sub-directory ``d``:
        * for files whose name contains ``benchmark.mlir``: writes
          ``d_benchmark.vmfb``, ``d_benchmark.sh`` and ``d_data.txt``;
        * for other ``.mlir`` files: writes ``d_spirv.vmfb`` compiled in
          hal-executable mode.
    Finally ``ordered-dispatches.txt`` is written in ``bench_dir`` with the
    recorded values in descending order.
    """
    benchmark_runtimes = {}
    dispatch_list = []
    all_dispatches = False

    if dispatch_benchmarks.lower().strip() == "all":
        all_dispatches = True
    else:
        try:
            dispatch_list = [
                int(dispatch_index)
                for dispatch_index in dispatch_benchmarks.split(" ")
            ]
        except ValueError:
            # int() on a non-numeric token; anything else should propagate.
            print("ERROR: Invalid dispatch benchmarks")
            return None

    for d_ in os.listdir(bench_dir):
        if os.path.isdir(f"{bench_dir}/{d_}"):
            in_dispatches = False
            for dispatch in dispatch_list:
                # Substring match: index 1 also selects e.g. "..._10".
                if str(dispatch) in d_:
                    in_dispatches = True
            if all_dispatches or in_dispatches:
                for f_ in os.listdir(f"{bench_dir}/{d_}"):
                    if "benchmark.mlir" in f_:
                        with open(f"{bench_dir}/{d_}/{f_}", "r") as dispatch_file:
                            module = dispatch_file.read()

                        flatbuffer_blob = ireec.compile_str(
                            module, target_backends=[iree_target_map(device)]
                        )

                        with open(
                            f"{bench_dir}/{d_}/{d_}_benchmark.vmfb", "wb"
                        ) as vmfb_file:
                            vmfb_file.write(flatbuffer_blob)

                        config = get_iree_runtime_config(device)
                        # The loaded module is not used afterwards; the call
                        # is kept so loading the flatbuffer is still exercised.
                        vm_module = ireert.VmModule.from_flatbuffer(
                            config.vm_instance, flatbuffer_blob
                        )

                        benchmark_cl = build_benchmark_args_non_tensor_input(
                            input_file=f"{bench_dir}/{d_}/{d_}_benchmark.vmfb",
                            device=device,
                            inputs=(0,),
                            mlir_dialect="linalg",
                            function_name="",
                        )

                        with open(
                            f"{bench_dir}/{d_}/{d_}_benchmark.sh", "w+"
                        ) as benchmark_bash:
                            benchmark_bash.write("#!/bin/bash\n")
                            benchmark_bash.write(" ".join(benchmark_cl))

                        benchmark_data = run_benchmark_module(benchmark_cl)

                        with open(
                            f"{bench_dir}/{d_}/{d_}_data.txt", "w+"
                        ) as benchmark_file:
                            benchmark_file.write(f"DISPATCH: {d_}\n")
                            benchmark_file.write(str(benchmark_data) + "\n")
                            benchmark_file.write(
                                "SHARK BENCHMARK RESULT: "
                                + str(1 / (benchmark_data * 0.001))
                                + "\n"
                            )

                        benchmark_runtimes[d_] = 1 / (benchmark_data * 0.001)

                    elif ".mlir" in f_ and "benchmark" not in f_:
                        with open(f"{bench_dir}/{d_}/{f_}", "r") as dispatch_file:
                            module = dispatch_file.read()

                        # Make the executable public before compiling it on
                        # its own in hal-executable mode.
                        module = re.sub(
                            "hal.executable private",
                            "hal.executable public",
                            module,
                        )

                        flatbuffer_blob = ireec.compile_str(
                            module,
                            target_backends=[iree_target_map(device)],
                            extra_args=["--compile-mode=hal-executable"],
                        )

                        with open(
                            f"{bench_dir}/{d_}/{d_}_spirv.vmfb", "wb"
                        ) as spirv_file:
                            spirv_file.write(flatbuffer_blob)

    # Ascending sort followed by a reversal (kept as-is so the relative order
    # of equal values is unchanged).
    ordered_dispatches = sorted(
        benchmark_runtimes.items(), key=lambda item: item[1]
    )[::-1]
    # NOTE(review): the stored value is 1 / (benchmark_data * 0.001) but is
    # labelled "ms" below - confirm the intended unit.
    with open(f"{bench_dir}/ordered-dispatches.txt", "w+") as summary_file:
        for dispatch in ordered_dispatches:
            summary_file.write(f"{dispatch[0]}: {dispatch[1]}ms\n")
def compile_module_to_flatbuffer(
module,
device,
frontend,
model_config_path,
extra_args,
model_name="None",
module, device, frontend, func_name, model_config_path
):
# Setup Compile arguments wrt to frontends.
input_type = ""
args = get_iree_frontend_args(frontend)
args += get_iree_device_args(device, extra_args)
args += get_iree_device_args(device)
args += get_iree_common_args()
args += get_model_specific_args()
args += extra_args
if frontend in ["tensorflow", "tf"]:
input_type = "mhlo"
@@ -253,38 +77,40 @@ def compile_module_to_flatbuffer(
elif frontend in ["tflite", "tflite-tosa"]:
input_type = "tosa"
elif frontend in ["tm_tensor"]:
input_type = ireec.InputType.TM_TENSOR
input_type = frontend
# TODO: make it simpler.
# Compile according to the input type, else just try compiling.
if input_type not in ["mhlo", "tosa"]:
module = str(module)
if input_type != "":
# Currently for MHLO/TOSA.
flatbuffer_blob = ireec.compile_str(
module,
target_backends=[iree_target_map(device)],
target_backends=[IREE_TARGET_MAP[device]],
extra_args=args,
input_type=input_type,
)
else:
# Currently for Torch.
flatbuffer_blob = ireec.compile_str(
module,
target_backends=[iree_target_map(device)],
str(module),
target_backends=[IREE_TARGET_MAP[device]],
extra_args=args,
)
return flatbuffer_blob
def get_iree_module(flatbuffer_blob, device):
def get_iree_module(flatbuffer_blob, device, func_name):
# Returns the compiled module and the configs.
config = get_iree_runtime_config(device)
config = ireert.Config(IREE_DEVICE_MAP[device])
vm_module = ireert.VmModule.from_flatbuffer(
config.vm_instance, flatbuffer_blob
)
ctx = ireert.SystemContext(config=config)
ctx.add_vm_module(vm_module)
ModuleCompiled = ctx.modules.module
ModuleCompiled = ctx.modules.module[func_name]
return ModuleCompiled, config
@@ -292,22 +118,24 @@ def get_iree_compiled_module(
module,
device: str,
frontend: str = "torch",
func_name: str = "forward",
model_config_path: str = None,
extra_args: list = [],
):
"""Given a module returns the compiled .vmfb and configs"""
flatbuffer_blob = compile_module_to_flatbuffer(
module, device, frontend, model_config_path, extra_args
module, device, frontend, func_name, model_config_path
)
return get_iree_module(flatbuffer_blob, device)
return get_iree_module(flatbuffer_blob, device, func_name)
def load_flatbuffer(flatbuffer_path: str, device: str):
def load_flatbuffer(
flatbuffer_path: str, device: str, func_name: str = "forward"
):
with open(os.path.join(flatbuffer_path), "rb") as f:
flatbuffer_blob = f.read()
return get_iree_module(flatbuffer_blob, device)
return get_iree_module(flatbuffer_blob, device, func_name)
def export_iree_module_to_vmfb(
@@ -315,19 +143,14 @@ def export_iree_module_to_vmfb(
device: str,
directory: str,
mlir_dialect: str = "linalg",
func_name: str = "forward",
model_config_path: str = None,
module_name: str = None,
extra_args: list = [],
):
# Compiles the module given specs and saves it as .vmfb file.
flatbuffer_blob = compile_module_to_flatbuffer(
module, device, mlir_dialect, model_config_path, extra_args
module, device, mlir_dialect, func_name, model_config_path
)
if module_name is None:
device_name = (
device if "://" not in device else "-".join(device.split("://"))
)
module_name = f"{mlir_dialect}_{device_name}"
module_name = f"{mlir_dialect}_{func_name}_{device}"
filename = os.path.join(directory, module_name + ".vmfb")
print(f"Saved vmfb in {filename}.")
with open(filename, "wb") as f:
@@ -349,39 +172,18 @@ def export_module_to_mlir_file(module, frontend, directory: str):
return filename
def get_results(
compiled_vm,
function_name,
input,
config,
frontend="torch",
send_to_host=True,
):
def get_results(compiled_vm, input, config, frontend="torch"):
"""Runs a .vmfb file given inputs and config and returns output."""
device_inputs = [ireert.asdevicearray(config.device, a) for a in input]
result = compiled_vm[function_name](*device_inputs)
result = compiled_vm(*device_inputs)
result_tensors = []
if isinstance(result, tuple):
if send_to_host:
for val in result:
result_tensors.append(np.asarray(val, val.dtype))
else:
for val in result:
result_tensors.append(val)
for val in result:
result_tensors.append(np.copy(np.asarray(val, val.dtype)))
return result_tensors
elif isinstance(result, dict):
data = list(result.items())
if send_to_host:
res = np.array(data, dtype=object)
return np.copy(res)
return data
res = np.array(data, dtype=object)
return np.copy(res)
else:
if send_to_host and result is not None:
return result.to_host()
return result
def get_iree_runtime_config(device):
    """Build an IREE runtime ``Config`` for the given device string.

    The string is first translated with ``iree_device_map`` and the result is
    resolved to a device object through ``ireert.get_device``.
    """
    driver_uri = iree_device_map(device)
    hal_device = ireert.get_device(driver_uri)
    return ireert.Config(device=hal_device)
return np.copy(np.asarray(result, dtype=result.dtype))

View File

@@ -15,31 +15,28 @@
# All the iree_cpu related functionalities go here.
import subprocess
import platform
def get_cpu_count():
    """Return the CPU count reported by ``multiprocessing``, or None.

    None is returned when ``multiprocessing.cpu_count`` raises
    ``NotImplementedError``.
    """
    from multiprocessing import cpu_count

    try:
        return cpu_count()
    except NotImplementedError:
        return None
# Get the default cpu args.
def get_iree_cpu_args():
uname = platform.uname()
os_name, proc_name = uname.system, uname.machine
find_triple_cmd = "uname -s -m"
os_name, proc_name = (
subprocess.run(
find_triple_cmd, shell=True, stdout=subprocess.PIPE, check=True
)
.stdout.decode("utf-8")
.split()
)
if os_name == "Darwin":
kernel_version = uname.release
find_kernel_version_cmd = "uname -r"
kernel_version = subprocess.run(
find_kernel_version_cmd,
shell=True,
stdout=subprocess.PIPE,
check=True,
).stdout.decode("utf-8")
target_triple = f"{proc_name}-apple-darwin{kernel_version}"
elif os_name == "Linux":
target_triple = f"{proc_name}-linux-gnu"
elif os_name == "Windows":
target_triple = "x86_64-pc-windows-msvc"
else:
error_message = f"OS Type f{os_name} not supported and triple can't be determined, open issue to dSHARK team please :)"
raise Exception(error_message)

View File

@@ -25,8 +25,7 @@ def get_iree_gpu_args():
# TODO: Give the user_interface to pass the sm_arch.
sm_arch = get_cuda_sm_cc()
if (
sm_arch
in ["sm_70", "sm_72", "sm_75", "sm_80", "sm_84", "sm_86", "sm_89"]
sm_arch in ["sm_70", "sm_72", "sm_75", "sm_80", "sm_84", "sm_86"]
) and (shark_args.enable_tf32 == True):
return [
"--iree-hal-cuda-disable-loop-nounroll-wa",
@@ -57,7 +56,7 @@ CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
def get_cuda_sm_cc():
libnames = ("libcuda.so", "libcuda.dylib", "nvcuda.dll")
libnames = ("libcuda.so", "libcuda.dylib", "cuda.dll")
for libname in libnames:
try:
cuda = ctypes.CDLL(libname)

View File

@@ -1,470 +0,0 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
def get_vulkan_target_env(vulkan_target_triple):
    """Render a ``#vk.target_env<...>`` attribute string for a target triple.

    Args:
        vulkan_target_triple: String of the form
            ``"<anything>=<arch>-<product>-<os>"``; the text between the first
            and second ``"="`` is split on ``"-"`` and must yield exactly
            three fields.

    Returns:
        The attribute text combining version, revision, extensions, vendor,
        device type and capabilities for that triple.
    """
    triple_text = vulkan_target_triple.split("=")[1]
    arch, product, target_os = triple_text.split("-")
    triple = (arch, product, target_os)

    version = get_version(triple=triple)
    # TODO get revision
    revision = 120
    extensions = get_extensions(triple)
    vendor = get_vendor(triple)
    device_type = get_device_type(triple)
    capabilities = get_vulkan_target_capabilities(triple)

    return f"#vk.target_env<{version}, r({revision}), {extensions}, {vendor}:{device_type}, #vk.caps< {capabilities} >>"
def get_vulkan_target_env_flag(vulkan_target_triple):
    """Return the ``--iree-vulkan-target-env=`` flag for a target triple string."""
    env_attr = get_vulkan_target_env(vulkan_target_triple)
    return "--iree-vulkan-target-env=" + f"{env_attr}"
def get_version(triple):
    """Return the Vulkan version string for an ``(arch, product, os)`` triple.

    ``"v1.1"`` is returned when the os or the product field is ``android30``
    or ``android31``, or when the arch is ``unknown``; otherwise ``"v1.3"``.
    """
    arch, product, target_os = triple
    android_tags = ("android30", "android31")
    is_android = target_os in android_tags or product in android_tags
    if is_android or arch == "unknown":
        return "v1.1"
    return "v1.3"
def get_extensions(triple):
    """Return the bracketed, comma-separated extension list for a triple.

    The triple is ``(arch, product, os)``. Apple M1, ARM Valhall and Adreno
    have fixed lists (Adreno gains 8-bit storage on ``android31``); a
    SwiftShader vendor and an ``unknown`` arch get reduced lists; everything
    else gets the full list, plus the cooperative-matrix extension for an
    NVIDIA vendor or the ``rdna3`` arch.
    """

    def make_ext_list(ext_list):
        return "[" + ", ".join(ext_list) + "]"

    arch, product, target_os = triple

    if arch == "m1":
        return make_ext_list(
            [
                "VK_KHR_16bit_storage",
                "VK_KHR_8bit_storage",
                "VK_KHR_shader_float16_int8",
                "VK_KHR_storage_buffer_storage_class",
                "VK_KHR_variable_pointers",
            ]
        )

    if arch == "valhall":
        return make_ext_list(
            [
                "VK_KHR_16bit_storage",
                "VK_KHR_8bit_storage",
                "VK_KHR_shader_float16_int8",
                "VK_KHR_spirv_1_4",
                "VK_KHR_storage_buffer_storage_class",
                "VK_KHR_variable_pointers",
            ]
        )

    if arch == "adreno":
        adreno_exts = [
            "VK_KHR_16bit_storage",
            "VK_KHR_shader_float16_int8",
            "VK_KHR_spirv_1_4",
            "VK_KHR_storage_buffer_storage_class",
            "VK_KHR_variable_pointers",
        ]
        if target_os == "android31":
            adreno_exts += ["VK_KHR_8bit_storage"]
        return make_ext_list(adreno_exts)

    # The vendor lookup happens only after the arch-specific cases above.
    if get_vendor(triple) == "SwiftShader":
        return make_ext_list(["VK_KHR_storage_buffer_storage_class"])

    if arch == "unknown":
        return make_ext_list(
            [
                "VK_KHR_storage_buffer_storage_class",
                "VK_KHR_variable_pointers",
            ]
        )

    default_exts = [
        "VK_KHR_16bit_storage",
        "VK_KHR_8bit_storage",
        "VK_KHR_shader_float16_int8",
        "VK_KHR_spirv_1_4",
        "VK_KHR_storage_buffer_storage_class",
        "VK_KHR_variable_pointers",
        "VK_EXT_subgroup_size_control",
    ]
    if get_vendor(triple) == "NVIDIA" or arch == "rdna3":
        default_exts.append("VK_NV_cooperative_matrix")
    return make_ext_list(default_exts)
def get_vendor(triple):
    """Return the vendor name for an ``(arch, product, os)`` triple.

    Returns one of ``"AMD"``, ``"ARM"``, ``"Apple"``, ``"NVIDIA"``,
    ``"Qualcomm"``, ``"SwiftShader"`` or ``"Unknown"``. For an arch that is
    not recognised at all, a message is printed before ``"Unknown"`` is
    returned.
    """
    arch, product, os = triple
    if arch == "unknown":
        return "Unknown"
    if arch in ["rdna1", "rdna2", "rdna3", "rgcn3", "rgcn4", "rgcn5"]:
        return "AMD"
    if arch == "valhall":
        return "ARM"
    if arch == "m1":
        return "Apple"
    if arch in ["turing", "ampere"]:
        return "NVIDIA"
    # Spelled "adreno" to match the arch name used by the other helpers in
    # this file.
    if arch == "adreno":
        return "Qualcomm"
    if arch == "cpu":
        if product == "swiftshader":
            return "SwiftShader"
        return "Unknown"
    print(f"Vendor for target triple - {triple} not found. Using unknown")
    return "Unknown"
def get_device_type(triple):
    """Return the device-type name for an ``(arch, product, os)`` triple.

    Returns ``"CPU"``, ``"DiscreteGPU"``, ``"IntegratedGPU"`` or
    ``"Unknown"``. For an arch that is not recognised at all, a message is
    printed before ``"Unknown"`` is returned.
    """
    arch, product, _ = triple
    if arch == "unknown":
        return "Unknown"
    if arch == "cpu":
        return "CPU"
    if arch in ["turing", "ampere"]:
        return "DiscreteGPU"
    # "rgcn4" is included so this list agrees with the AMD arch list used by
    # get_vendor.
    if arch in ["rdna1", "rdna2", "rdna3", "rgcn3", "rgcn4", "rgcn5"]:
        if product == "ivega10":
            return "IntegratedGPU"
        return "DiscreteGPU"
    if arch in ["m1", "valhall", "adreno"]:
        return "IntegratedGPU"
    print(f"Device type for target triple - {triple} not found. Using unknown")
    return "Unknown"
# get all the capabilities for the device
# TODO: make a dataclass for capabilites and init using vulkaninfo
def get_vulkan_target_capabilities(triple):
    """Render the capability list for an ``(arch, product, os)`` triple.

    A table of default capabilities is filled in first and then overridden
    per architecture. The table is serialised in insertion order as
    ``"key = value"`` items joined by ``", "``:

    * entries that are ``None`` or ``False`` are omitted;
    * ``True`` is written as ``unit``;
    * ``subgroupFeatures`` is written as the sum of the feature bit values,
      suffixed with ``: i32``;
    * ``maxComputeWorkGroupSize`` is written as a ``dense<[...]>`` vector;
    * ``coopmatCases`` is written as ``cooperativeMatrixPropertiesNV``.

    For an arch with no dedicated branch a message is printed and the
    defaults are used.

    NOTE(review): this block was reconstructed from a view without
    indentation; the extent of the nested ``if`` bodies marked below should
    be confirmed against the original file.
    """

    def get_subgroup_val(l):
        return int(sum([subgroup_feature[sgf] for sgf in l]))

    cap = OrderedDict()
    arch, product, os = triple
    # Bit value of each subgroup feature; a feature list is encoded as the
    # sum of its entries.
    subgroup_feature = {
        "Basic": 1,
        "Vote": 2,
        "Arithmetic": 4,
        "Ballot": 8,
        "Shuffle": 16,
        "ShuffleRelative": 32,
        "Clustered": 64,
        "Quad": 128,
        "PartitionedNV": 256,
    }
    # Defaults; the insertion order here fixes the order of the output.
    cap["maxComputeSharedMemorySize"] = 16384
    cap["maxComputeWorkGroupInvocations"] = 128
    cap["maxComputeWorkGroupSize"] = [128, 128, 64]
    cap["subgroupSize"] = 32
    cap["subgroupFeatures"] = ["Basic"]
    cap["minSubgroupSize"] = None
    cap["maxSubgroupSize"] = None
    cap["shaderFloat16"] = False
    cap["shaderFloat64"] = False
    cap["shaderInt8"] = False
    cap["shaderInt16"] = False
    cap["shaderInt64"] = False
    cap["storageBuffer16BitAccess"] = False
    cap["storagePushConstant16"] = False
    cap["uniformAndStorageBuffer16BitAccess"] = False
    cap["storageBuffer8BitAccess"] = False
    cap["storagePushConstant8"] = False
    cap["uniformAndStorageBuffer8BitAccess"] = False
    cap["variablePointers"] = False
    cap["variablePointersStorageBuffer"] = False
    cap["coopmatCases"] = None

    if arch in ["rdna1", "rdna2", "rdna3"]:
        cap["maxComputeSharedMemorySize"] = 65536
        cap["maxComputeWorkGroupInvocations"] = 1024
        cap["maxComputeWorkGroupSize"] = [1024, 1024, 1024]
        cap["subgroupSize"] = 64
        cap["minSubgroupSize"] = 32
        cap["maxSubgroupSize"] = 64
        cap["subgroupFeatures"] = [
            "Basic",
            "Vote",
            "Arithmetic",
            "Ballot",
            "Shuffle",
            "ShuffleRelative",
            "Clustered",
            "Quad",
        ]
        cap["shaderFloat16"] = True
        cap["shaderFloat64"] = True
        cap["shaderInt8"] = True
        cap["shaderInt16"] = True
        cap["shaderInt64"] = True
        cap["storageBuffer16BitAccess"] = True
        cap["storagePushConstant16"] = True
        cap["uniformAndStorageBuffer16BitAccess"] = True
        cap["storageBuffer8BitAccess"] = True
        cap["storagePushConstant8"] = True
        cap["uniformAndStorageBuffer8BitAccess"] = True
        cap["variablePointers"] = True
        cap["variablePointersStorageBuffer"] = True
        if arch == "rdna3":
            # TODO: Get scope value
            cap["coopmatCases"] = [
                "mSize = 16, nSize = 16, kSize = 16, aType = f16, bType = f16, cType = f16, resultType = f16, scope = #vk.scope<Subgroup>"
            ]
        # NOTE(review): both assignments are assumed to belong to this
        # product check - confirm.
        if product == "rx5700xt":
            cap["storagePushConstant16"] = False
            cap["storagePushConstant8"] = False
    elif arch in ["rgcn5", "rgcn4", "rgcn3"]:
        cap["maxComputeSharedMemorySize"] = 65536
        cap["maxComputeWorkGroupInvocations"] = 1024
        cap["maxComputeWorkGroupSize"] = [1024, 1024, 1024]
        cap["subgroupSize"] = 64
        cap["subgroupFeatures"] = [
            "Basic",
            "Vote",
            "Arithmetic",
            "Ballot",
            "Shuffle",
            "ShuffleRelative",
            "Clustered",
            "Quad",
        ]
        cap["minSubgroupSize"] = 64
        cap["maxSubgroupSize"] = 64
        # NOTE(review): only the two float capabilities are assumed to be
        # rgcn5-specific; the remaining assignments apply to all rgcn
        # arches - confirm.
        if arch == "rgcn5":
            cap["shaderFloat16"] = True
            cap["shaderFloat64"] = True
        cap["storageBuffer16BitAccess"] = True
        cap["shaderInt8"] = True
        cap["shaderInt16"] = True
        cap["shaderInt64"] = True
        cap["storagePushConstant16"] = False
        cap["uniformAndStorageBuffer16BitAccess"] = True
        cap["storageBuffer8BitAccess"] = True
        cap["storagePushConstant8"] = False
        cap["uniformAndStorageBuffer8BitAccess"] = True
        cap["variablePointers"] = True
        cap["variablePointersStorageBuffer"] = True
    elif arch == "m1":
        cap["maxComputeSharedMemorySize"] = 32768
        cap["maxComputeWorkGroupInvocations"] = 1024
        cap["maxComputeWorkGroupSize"] = [1024, 1024, 1024]
        cap["subgroupSize"] = 32
        cap["subgroupFeatures"] = [
            "Basic",
            "Vote",
            "Arithmetic",
            "Ballot",
            "Shuffle",
            "ShuffleRelative",
            "Quad",
        ]
        cap["shaderFloat16"] = True
        cap["shaderFloat64"] = True
        cap["shaderInt8"] = True
        cap["shaderInt16"] = True
        cap["shaderInt64"] = True
        cap["storageBuffer16BitAccess"] = True
        cap["storagePushConstant16"] = True
        cap["uniformAndStorageBuffer16BitAccess"] = True
        cap["storageBuffer8BitAccess"] = True
        cap["storagePushConstant8"] = True
        cap["uniformAndStorageBuffer8BitAccess"] = True
        cap["variablePointers"] = True
        cap["variablePointersStorageBuffer"] = True
    elif arch == "valhall":
        cap["maxComputeSharedMemorySize"] = 32768
        cap["maxComputeWorkGroupInvocations"] = 512
        cap["maxComputeWorkGroupSize"] = [512, 512, 512]
        cap["subgroupSize"] = 16
        cap["subgroupFeatures"] = [
            "Basic",
            "Vote",
            "Arithmetic",
            "Ballot",
            "Clustered",
            "Quad",
        ]
        # NOTE(review): only the two shuffle features are assumed to be
        # conditional on android31 - confirm.
        if os == "android31":
            cap["subgroupFeatures"].append("Shuffle")
            cap["subgroupFeatures"].append("ShuffleRelative")
        cap["shaderFloat16"] = True
        cap["shaderInt8"] = True
        cap["shaderInt16"] = True
        cap["storageBuffer16BitAccess"] = True
        cap["storagePushConstant16"] = True
        cap["uniformAndStorageBuffer16BitAccess"] = True
        cap["storageBuffer8BitAccess"] = True
        cap["storagePushConstant8"] = True
        cap["uniformAndStorageBuffer8BitAccess"] = True
        cap["variablePointers"] = True
        cap["variablePointersStorageBuffer"] = True
    elif arch == "cpu":
        if product == "swiftshader":
            cap["maxComputeSharedMemorySize"] = 16384
            cap["subgroupSize"] = 4
            cap["subgroupFeatures"] = [
                "Basic",
                "Vote",
                "Arithmetic",
                "Ballot",
                "Shuffle",
                "ShuffleRelative",
            ]
    elif arch in ["ampere", "turing"]:
        cap["maxComputeSharedMemorySize"] = 49152
        cap["maxComputeWorkGroupInvocations"] = 1024
        cap["maxComputeWorkGroupSize"] = [1024, 1024, 1024]
        cap["subgroupSize"] = 32
        cap["minSubgroupSize"] = 32
        cap["maxSubgroupSize"] = 32
        cap["subgroupFeatures"] = [
            "Basic",
            "Vote",
            "Arithmetic",
            "Ballot",
            "Shuffle",
            "ShuffleRelative",
            "Clustered",
            "Quad",
        ]
        cap["shaderFloat16"] = True
        cap["shaderFloat64"] = True
        cap["shaderInt8"] = True
        cap["shaderInt16"] = True
        cap["shaderInt64"] = True
        cap["storageBuffer16BitAccess"] = True
        cap["storagePushConstant16"] = True
        cap["uniformAndStorageBuffer16BitAccess"] = True
        cap["storageBuffer8BitAccess"] = True
        cap["storagePushConstant8"] = True
        cap["uniformAndStorageBuffer8BitAccess"] = True
        cap["variablePointers"] = True
        cap["variablePointersStorageBuffer"] = True
        cap["coopmatCases"] = [
            "mSize = 8, nSize = 8, kSize = 32, aType = i8, bType = i8, cType = i32, resultType = i32, scope = #vk.scope<Subgroup>",
            "mSize = 16, nSize = 16, kSize = 16, aType = f16, bType = f16, cType = f16, resultType = f16, scope = #vk.scope<Subgroup>",
            "mSize = 16, nSize = 16, kSize = 16, aType = f16, bType = f16, cType = f32, resultType = f32, scope = #vk.scope<Subgroup>",
        ]
    elif arch == "adreno":
        cap["maxComputeSharedMemorySize"] = 32768
        cap["maxComputeWorkGroupInvocations"] = 1024
        cap["maxComputeWorkGroupSize"] = [1024, 1024, 64]
        cap["subgroupSize"] = 64
        cap["subgroupFeatures"] = [
            "Basic",
            "Vote",
            "Arithmetic",
            "Ballot",
            "Shuffle",
            "ShuffleRelative",
            "Quad",
        ]
        cap["shaderFloat16"] = True
        cap["shaderInt8"] = True
        cap["shaderInt16"] = True
        cap["storageBuffer16BitAccess"] = True
        # Compared against "android31", the OS tag used by the other
        # helpers in this file.
        # NOTE(review): only the 8-bit access capability is assumed to be
        # conditional on android31 - confirm.
        if os == "android31":
            cap["uniformAndStorageBuffer8BitAccess"] = True
        cap["variablePointers"] = True
        cap["variablePointersStorageBuffer"] = True
    elif arch == "unknown":
        cap["subgroupSize"] = 64
        cap["variablePointers"] = False
        cap["variablePointersStorageBuffer"] = False
    else:
        print(
            f"Architecture {arch} not matched. Using default vulkan target device capability"
        )

    def get_comma_sep_str(ele_list):
        return "[" + ", ".join(f"{ele}" for ele in ele_list) + "]"

    # Serialise the table; disabled (None/False) entries are left out.
    items = []
    for k, v in cap.items():
        if v is None or v == False:
            continue
        if isinstance(v, bool):
            # Only True reaches this point.
            items.append(f"{k} = unit")
        elif isinstance(v, list):
            if k == "subgroupFeatures":
                items.append(f"subgroupFeatures = {get_subgroup_val(v)}: i32")
            elif k == "maxComputeWorkGroupSize":
                items.append(
                    f"maxComputeWorkGroupSize = dense<{get_comma_sep_str(v)}>: vector<{len(v)}xi32>"
                )
            elif k == "coopmatCases":
                cases = ", ".join(
                    f"#vk.coop_matrix_props<{case}>" for case in v
                )
                items.append(f"cooperativeMatrixPropertiesNV = [{cases}]")
            else:
                items.append(f"{k} = {get_comma_sep_str(v)}")
        else:
            items.append(f"{k} = {v}")
    return ", ".join(items)

View File

@@ -14,151 +14,48 @@
# All the iree_vulkan related functionalities go here.
from os import linesep
from shark.iree_utils._common import run_cmd
import iree.runtime as ireert
from sys import platform
from shark.iree_utils.vulkan_target_env_utils import get_vulkan_target_env_flag
def get_vulkan_device_name():
    """Return the first ``deviceName`` line reported by ``vulkaninfo``.

    The output of ``run_cmd("vulkaninfo")`` is split on the platform line
    separator and the stripped lines containing ``deviceName`` are collected.
    When several are found they are listed on stdout and the first is used.

    Raises:
        ValueError: If no line contains ``deviceName``.
    """
    device_lines = []
    for line in run_cmd("vulkaninfo").split(linesep):
        if "deviceName" in line:
            device_lines.append(line.strip())

    if not device_lines:
        raise ValueError("No device name found in VulkanInfo!")

    chosen = device_lines[0]
    if len(device_lines) > 1:
        print("Following devices found:")
        for index, device_line in enumerate(device_lines):
            print(f"{index}. {device_line}")
        print(f"Choosing first one: {chosen}")
    return chosen
def get_os_name():
    """Map ``sys.platform`` to ``"linux"``, ``"macos"`` or ``"windows"``.

    Any platform value that is not recognised prints a notice and is treated
    as ``"linux"``.
    """
    if platform.startswith("linux"):
        return "linux"
    exact_names = {"darwin": "macos", "win32": "windows"}
    if platform in exact_names:
        return exact_names[platform]
    print("Cannot detect OS type, defaulting to linux.")
    return "linux"
def get_vulkan_target_triple(device_name):
    """Pick the vulkan target triple matching a hardware device name.

    Rules are tried in order and the first match wins. A rule matches when
    every one of its tokens occurs as a substring of ``device_name``.

    Args:
        device_name (str): name of the hardware device to be used with vulkan

    Returns:
        str or None: target triple or None if no match found for given name
    """
    system_os = get_os_name()

    # Apple targets: M1 and M2 share one fixed triple.
    for chip in ("M1", "M2"):
        if "Apple" in device_name and chip in device_name:
            return "m1-moltenvk-macos"

    # (required tokens, "<arch>-<product>") in priority order.
    rules = (
        # Nvidia targets
        (("RTX", "2080"), "turing-rtx2080"),
        (("A100", "SXM4"), "ampere-a100"),
        (("RTX", "3090"), "ampere-rtx3090"),
        (("RTX", "3080"), "ampere-rtx3080"),
        (("RTX", "3070"), "ampere-rtx3070"),
        (("RTX", "3060"), "ampere-rtx3060"),
        (("RTX", "3050"), "ampere-rtx3050"),
        # We use ampere until lovelace target triples are plumbed in.
        (("RTX", "4090"), "ampere-rtx4090"),
        (("RTX", "4080"), "ampere-rtx4080"),
        (("RTX", "4070"), "ampere-rtx4070"),
        (("RTX", "4000"), "turing-rtx4000"),
        (("RTX", "5000"), "turing-rtx5000"),
        (("RTX", "6000"), "turing-rtx6000"),
        (("RTX", "8000"), "turing-rtx8000"),
        (("TITAN", "RTX"), "turing-titanrtx"),
        (("GTX", "1060"), "pascal-gtx1060"),
        (("GTX", "1070"), "pascal-gtx1070"),
        (("GTX", "1080"), "pascal-gtx1080"),
        # Amd targets
        # Linux: Radeon RX 7900 XTX
        # Windows: AMD Radeon RX 7900 XTX
        (("RX", "7900"), "rdna3-7900"),
    )
    for tokens, arch_product in rules:
        if all(token in device_name for token in tokens):
            return f"{arch_product}-{system_os}"

    # Any other AMD / Radeon device falls back to the generic rdna2 triple.
    if any(token in device_name for token in ("AMD", "Radeon")):
        return f"rdna2-unknown-{system_os}"
    return None
def get_vulkan_triple_flag(device_name="", extra_args=[]):
for flag in extra_args:
if "-iree-vulkan-target-triple=" in flag:
print(f"Using target triple {flag.split('=')[1]}")
return None
if device_name == "" or device_name == [] or device_name is None:
vulkan_device = get_vulkan_device_name()
else:
vulkan_device = device_name
triple = get_vulkan_target_triple(vulkan_device)
if triple is not None:
def get_vulkan_triple_flag():
vulkan_device_cmd = "vulkaninfo | grep deviceName"
vulkan_device = run_cmd(vulkan_device_cmd).strip()
if all(x in vulkan_device for x in ("Apple", "M1")):
print(f"Found {vulkan_device} Device. Using m1-moltenvk-macos")
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
elif all(x in vulkan_device for x in ("Apple", "M2")):
print("Found Apple M2 Device. Using m1-moltenvk-macos")
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
elif all(x in vulkan_device for x in ("A100", "SXM4")):
print(f"Found {vulkan_device} Device. Using ampere-rtx3080-linux")
return "-iree-vulkan-target-triple=ampere-rtx3080-linux"
elif all(x in vulkan_device for x in ("RTX", "3090")):
print(f"Found {vulkan_device} Device. Using ampere-rtx3090-linux")
return "-iree-vulkan-target-triple=ampere-rtx3090-linux"
elif all(x in vulkan_device for x in ("Radeon", "RX 5")):
print(
f"Found vulkan device {vulkan_device}. Using target triple {triple}"
"Found AMD Radeon RX 5000 series device. Using rdna1-5700xt-linux"
)
return f"-iree-vulkan-target-triple={triple}"
print(
"""Optimized kernel for your target device is not added yet.
Contact SHARK Admin on discord[https://discord.com/invite/RUqY2h2s9u]
or pull up an issue."""
)
print(f"Target : {vulkan_device}")
return None
return "-iree-vulkan-target-triple=rdna1-5700xt-linux"
elif all(x in vulkan_device for x in ("Radeon", "RX 6")):
print(
"Found AMD Radeon RX 6000 series device. Using rdna2-unknown-linux"
)
return "-iree-vulkan-target-triple=rdna2-unknown-linux"
else:
print(
"""Optimized kernel for your target device is not added yet.
Contact SHARK Admin on discord[https://discord.com/invite/RUqY2h2s9u]
or pull up an issue."""
)
print(f"Target : {vulkan_device}")
return None
def get_iree_vulkan_args(extra_args=[]):
def get_iree_vulkan_args():
# vulkan_flag = ["--iree-flow-demote-i64-to-i32"]
res_vulkan_flag = []
vulkan_triple_flag = None
for arg in extra_args:
if "-iree-vulkan-target-triple=" in arg:
print(f"Using target triple {arg} from command line args")
vulkan_triple_flag = arg
break
if vulkan_triple_flag is None:
vulkan_triple_flag = get_vulkan_triple_flag(extra_args=extra_args)
vulkan_flag = []
vulkan_triple_flag = get_vulkan_triple_flag()
if vulkan_triple_flag is not None:
vulkan_target_env = get_vulkan_target_env_flag(vulkan_triple_flag)
res_vulkan_flag.append(vulkan_target_env)
return res_vulkan_flag
def set_iree_vulkan_runtime_flags(flags):
    """Hand every entry of ``flags`` to ``ireert.flags.parse_flags``, in order.

    Args:
        flags: Iterable of flag values; each one is passed on unchanged.

    Returns:
        None.
    """
    for runtime_flag in flags:
        ireert.flags.parse_flags(runtime_flag)
vulkan_flag.append(vulkan_triple_flag)
return vulkan_flag

View File

@@ -12,19 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
This function takes the model mlir file and the tuned config file as input,
and outputs a new mlir file with lowering configs annotated on certain ops.
There are two ways to utilize the function:
1. Call model_annotation function within another python script
from shark.model_annotation import model_annotation
with create_context() as ctx:
module = model_annotation(ctx, input_contents=..., config_path=..., search_op=...)
2. Run model_annotation.py directly
python model_annotation.py -model path_to_original_mlir -config_path path_to_config_file
"""
import json
import os
import sys
@@ -39,27 +26,21 @@ def model_annotation(
*,
input_contents: str,
config_path: str,
search_op: str,
winograd: bool = False,
search_op: str = "matmul",
):
if os.path.isfile(input_contents):
with open(input_contents, "rb") as f:
input_contents = f.read()
module = ir.Module.parse(input_contents)
if config_path == "":
return module
if winograd:
with open(config_path, "r") as f:
data = json.load(f)
configs = data["c,f"]
else:
configs = load_model_configs(config_path)
with open(config_path, "r") as f:
data = json.load(f)
configs = data["options"]
# The Python API does not expose a general walk() function, so we just
# do it ourselves.
walk_children(module.operation, configs, search_op, winograd)
walk_children(module.operation, configs, 0, search_op)
if not module.operation.verify():
raise RuntimeError("Modified program does not verify!")
@@ -67,42 +48,8 @@ def model_annotation(
return module
def load_model_configs(config_path: str):
    """Load per-op tuning records from a JSON-lines file.

    Every non-blank line of ``config_path`` is parsed as one JSON object.
    Records are keyed by the "x"-joined shape built from the fields that
    belong to the record's ``identifier``:

    * ``matmul``:  m, n, k
    * ``bmm``:     b, m, n, k
    * ``generic``: 1, b, m, n, k
    * ``conv``:    n, ih, iw, c, kh, kw, f, oh, ow, d, s, p

    Records without an ``identifier``, or with one that is not listed above,
    are skipped.

    Args:
        config_path: Path to the JSON-lines config file.

    Returns:
        Dict mapping the shape string to the full parsed record. When two
        records produce the same shape string the later one wins.

    Raises:
        KeyError: If a record lacks a field required by its identifier.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    config = {}
    with open(config_path, "r") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            if "identifier" not in data.keys():
                continue
            identifier = data["identifier"]
            if identifier == "matmul":
                matrix_size = [data["m"], data["n"], data["k"]]
            elif identifier == "bmm":
                matrix_size = [data["b"], data["m"], data["n"], data["k"]]
            elif identifier == "generic":
                matrix_size = [1, data["b"], data["m"], data["n"], data["k"]]
            elif identifier == "conv":
                matrix_size = [
                    data["n"],
                    data["ih"],
                    data["iw"],
                    data["c"],
                    data["kh"],
                    data["kw"],
                    data["f"],
                    data["oh"],
                    data["ow"],
                    data["d"],
                    data["s"],
                    data["p"],
                ]
            else:
                # Unknown identifier: there is no shape to key on. Previously
                # this path reused the shape of the preceding record (or
                # raised NameError on the first line).
                continue
            config[shape_list_to_string(matrix_size)] = data
    # The ``with`` block has already closed the file; no explicit close needed.
    return config
def walk_children(
op: ir.Operation, configs: List[Dict], search_op: str, winograd: bool
op: ir.Operation, configs: List[Dict], idx: int, search_op: str
):
if search_op == "matmul":
op_names = ["linalg.matmul", "mhlo.dot"]
@@ -110,8 +57,6 @@ def walk_children(
op_names = ["linalg.batch_matmul", "mhlo.dot_general"]
elif search_op == "conv":
op_names = ["mhlo.convolution", "linalg.conv_2d_nhwc_hwcf"]
elif search_op == "generic":
op_names = ["linalg.generic"]
elif search_op == "all":
op_names = [
"mhlo.dot",
@@ -120,7 +65,6 @@ def walk_children(
"linalg.matmul",
"linalg.batch_matmul",
"linalg.conv_2d_nhwc_hwcf",
"linalg.generic",
]
else:
raise ValueError(f"{search_op} op is not tunable.")
@@ -132,172 +76,36 @@ def walk_children(
# 'operation' and 'name' attributes.
if isinstance(child_op, ir.OpView):
child_op = child_op.operation
if winograd and child_op.name in [
"linalg.conv_2d_nchw_fchw",
"linalg.conv_2d_nhwc_hwcf",
]:
add_winograd_attribute(child_op, configs)
if child_op.name in op_names:
if child_op.name == "linalg.generic":
# This is for generic op that has contractionOpInterface
# which is basically einsum("mk,bkn->bmn")
op_result = str(child_op.results[0])
op_iterator = str(
child_op.attributes["iterator_types"]
)
if len(child_op.operands) != 3:
continue
if "reduction" not in op_iterator:
continue
if (
"arith.addf" not in op_result
or "arith.mulf" not in op_result
):
continue
if "arith.subf" in op_result:
continue
child_op_shape = get_op_shape(child_op, search_op)
if (
child_op_shape in configs.keys()
and configs[child_op_shape]["options"][0] != None
):
add_attributes(
child_op, configs[child_op_shape]["options"][0]
)
walk_children(child_op, configs, search_op, winograd)
if child_op.name in op_names and idx < len(configs):
add_attributes(child_op, configs[idx])
idx = idx + 1
print(f"Updated op {child_op}", file=sys.stderr)
walk_children(child_op, configs, idx, search_op)
def get_op_shape(op: ir.Operation, search_op: str):
shape_list = []
if search_op in ["generic", "all"]:
if op.name in ["linalg.generic"]:
input1 = str(op.operands[0].type)
input2 = str(op.operands[1].type)
m = input1.split("tensor<")[1].split("x")[0]
b = input2.split("tensor<")[1].split("x")[0]
k = input2.split("tensor<")[1].split("x")[1]
n = input2.split("tensor<")[1].split("x")[2]
shape_list = [1, int(b), int(m), int(n), int(k)]
def add_attributes(op: ir.Operation, config: Dict):
(
tile_sizes,
pipeline,
workgroup_size,
split_k,
pipeline_depth,
) = parse_config(config)
if search_op in ["matmul", "all"]:
if op.name in ["mhlo.dot"]:
op_result = str(op.results[0])
m = op_result.split("tensor<")[1].split("x")[0]
k = op_result.split("tensor<")[1].split("x")[1]
n = op_result.split("tensor<")[2].split("x")[1]
shape_list = [int(m), int(n), int(k)]
elif op.name in ["linalg.matmul"]:
op_result = str(op.results[0]).split("ins(")[1]
m = op_result.split("tensor<")[1].split("x")[0]
k = op_result.split("tensor<")[1].split("x")[1]
n = op_result.split("tensor<")[2].split("x")[1]
shape_list = [int(m), int(n), int(k)]
add_compilation_info(
op,
tile_sizes=tile_sizes,
pipeline=pipeline,
workgroup_size=workgroup_size,
pipeline_depth=pipeline_depth,
)
if search_op in ["bmm", "all"]:
if op.name in ["mhlo.dot_general"]:
op_result = str(op.results[0])
b = op_result.split("tensor<")[1].split("x")[1]
m = op_result.split("tensor<")[1].split("x")[2]
k = op_result.split("tensor<")[1].split("x")[3]
n = op_result.split("tensor<")[3].split("x")[3]
shape_list = [int(b), int(m), int(n), int(k)]
elif op.name in ["linalg.batch_matmul"]:
op_result = str(op.results[0]).split("ins(")[1]
b = op_result.split("tensor<")[1].split("x")[0]
m = op_result.split("tensor<")[1].split("x")[1]
k = op_result.split("tensor<")[1].split("x")[2]
n = op_result.split("tensor<")[3].split("x")[2]
shape_list = [int(b), int(m), int(n), int(k)]
if search_op in ["conv", "all"]:
if op.name in ["mhlo.convolution"]:
op_result = str(op.results[0])
dilation = (
str(op.attributes["rhs_dilation"])
.split("dense<")[1]
.split(">")[0]
)
stride = (
str(op.attributes["window_strides"])
.split("dense<")[1]
.split(">")[0]
)
pad = (
str(op.attributes["padding"]).split("dense<")[1].split(">")[0]
)
n = op_result.split("tensor<")[1].split("x")[0]
ih = op_result.split("tensor<")[1].split("x")[1]
iw = op_result.split("tensor<")[1].split("x")[2]
c = op_result.split("tensor<")[1].split("x")[3]
kh = op_result.split("tensor<")[2].split("x")[0]
kw = op_result.split("tensor<")[2].split("x")[1]
f = op_result.split("tensor<")[2].split("x")[3]
oh = op_result.split("tensor<")[3].split("x")[1]
ow = op_result.split("tensor<")[3].split("x")[2]
shape_list = [
int(n),
int(ih),
int(iw),
int(c),
int(kh),
int(kw),
int(f),
int(oh),
int(ow),
int(dilation),
int(stride),
int(pad),
]
elif op.name in ["linalg.conv_2d_nhwc_hwcf"]:
op_result = str(op.results[0]).split("ins(")[1]
dilation = (
str(op.attributes["dilations"])
.split("dense<")[1]
.split(">")[0]
)
stride = (
str(op.attributes["strides"]).split("dense<")[1].split(">")[0]
)
pad = 0
n = op_result.split("tensor<")[1].split("x")[0]
ih = op_result.split("tensor<")[1].split("x")[1]
iw = op_result.split("tensor<")[1].split("x")[2]
c = op_result.split("tensor<")[1].split("x")[3]
kh = op_result.split("tensor<")[2].split("x")[0]
kw = op_result.split("tensor<")[2].split("x")[1]
f = op_result.split("tensor<")[2].split("x")[3]
oh = op_result.split("tensor<")[3].split("x")[1]
ow = op_result.split("tensor<")[3].split("x")[2]
shape_list = [
int(n),
int(ih),
int(iw),
int(c),
int(kh),
int(kw),
int(f),
int(oh),
int(ow),
int(dilation),
int(stride),
int(pad),
]
shape_str = shape_list_to_string(shape_list)
return shape_str
if split_k:
add_attribute_by_name(op, "iree_flow_split_k", split_k)
def add_attributes(op: ir.Operation, config: List[Dict]):
# Parse the config file
split_k = None
pipeline_depth = None
store_stage = None
subgroup_size = None
if "GPU" in config["pipeline"]:
def parse_config(config: Dict):
if config["pipeline"] == "GPU" or config["pipeline"] == "GPU_TENSORCORE":
pipeline = (
"LLVMGPUMatmulSimt"
if config["pipeline"] == "GPU"
@@ -305,97 +113,51 @@ def add_attributes(op: ir.Operation, config: List[Dict]):
)
tile_sizes = [config["work_group_tile_sizes"]]
workgroup_size = config["work_group_sizes"]
if "pipeline_depth" in config.keys():
try:
pipeline_depth = config["pipeline_depth"]
if "split_k" in config.keys():
except:
pipeline_depth = None
try:
split_k = config["split_k"]
elif "SPIRV" in config["pipeline"]:
pipeline = config["pipeline"]
tile_sizes = [
config["work_group_tile_sizes"],
config["parallel_tile_sizes"],
config["reduction_tile_sizes"],
]
workgroup_size = config["work_group_sizes"]
if "vector_tile_sizes" in config.keys():
tile_sizes += [config["vector_tile_sizes"]]
if "window_tile_sizes" in config.keys():
tile_sizes += [config["window_tile_sizes"]]
if "subgroup_size" in config.keys():
subgroup_size = config["subgroup_size"]
if "pipeline_depth" in config.keys():
pipeline_depth = config["pipeline_depth"]
if "store_stage" in config.keys():
store_stage = config["store_stage"]
except:
split_k = None
else:
# For IREE CPU pipelines
pipeline = config["pipeline"]
tile_sizes = [
config["work_group_tile_sizes"],
config["parallel_tile_sizes"],
config["reduction_tile_sizes"],
config["l1_tile_sizes"],
config["vector_tile_sizes"],
]
workgroup_size = []
# Add compilation info as an attribute. We don't have a Python binding for CompilationInfo,
# so we just parse its string form.
if pipeline_depth != None:
translation_info = f"{pipeline} pipeline_depth = {pipeline_depth}"
if store_stage != None:
translation_info += f" store_stage = {store_stage}"
else:
translation_info = f"{pipeline}"
compilation_info = (
f"#iree_codegen.compilation_info<"
f"lowering_config = <tile_sizes = {repr(tile_sizes)}>, "
f"translation_info = <{translation_info}>, "
f"workgroup_size = {repr(workgroup_size)} "
)
if subgroup_size != None:
compilation_info += f", subgroup_size = {subgroup_size}>"
else:
compilation_info += ">"
attr = ir.Attribute.parse(compilation_info)
op.attributes["compilation_info"] = attr
# Add other attributes if required.
if split_k:
add_attribute_by_name(op, "iree_flow_split_k", split_k)
split_k = None
pipeline_depth = None
return tile_sizes, pipeline, workgroup_size, split_k, pipeline_depth
def add_winograd_attribute(op: ir.Operation, config: List):
op_result = str(op.results[0]).split("ins(")[1]
dilation = int(
str(op.attributes["dilations"]).split("dense<")[1].split(">")[0]
)
stride = int(
str(op.attributes["strides"]).split("dense<")[1].split(">")[0]
)
if op.name == "linalg.conv_2d_nchw_fchw":
f = int(op_result.split("tensor<")[2].split("x")[0])
c = int(op_result.split("tensor<")[2].split("x")[1])
kh = int(op_result.split("tensor<")[2].split("x")[2])
kw = int(op_result.split("tensor<")[2].split("x")[3])
else:
kh = int(op_result.split("tensor<")[2].split("x")[0])
kw = int(op_result.split("tensor<")[2].split("x")[1])
c = int(op_result.split("tensor<")[2].split("x")[2])
f = int(op_result.split("tensor<")[2].split("x")[3])
if (
dilation == 1
and stride == 1
and kh == 3
and kw == 3
and [c, f] in config
):
op.attributes["iree_winograd_conv"] = ir.IntegerAttr.get(
ir.IntegerType.get_signless(64), 1
def add_compilation_info(
op: ir.Operation,
tile_sizes: List[List[int]],
pipeline: str,
workgroup_size: List[int],
pipeline_depth: int,
):
# We don't have a Python binding for CompilationInfo, so we just parse
# its string form.
if pipeline_depth:
attr = ir.Attribute.parse(
f"#iree_codegen.compilation_info<"
f"lowering_config = <tile_sizes = {repr(tile_sizes)}>, "
f"translation_info = <{pipeline} pipeline_depth = {pipeline_depth}>, "
f"workgroup_size = {repr(workgroup_size)}>"
)
else:
attr = ir.Attribute.parse(
f"#iree_codegen.compilation_info<"
f"lowering_config = <tile_sizes = {repr(tile_sizes)}>, "
f"translation_info = <{pipeline}>, "
f"workgroup_size = {repr(workgroup_size)}>"
)
op.attributes["compilation_info"] = attr
def add_attribute_by_name(op: ir.Operation, name: str, val: int):
@@ -403,10 +165,6 @@ def add_attribute_by_name(op: ir.Operation, name: str, val: int):
op.attributes[name] = attr
def shape_list_to_string(input):
    """Render the items of ``input`` as one "x"-separated string.

    Each item is converted with ``str``; an empty iterable gives ``""``.
    """
    return "x".join(map(str, input))
def create_context() -> ir.Context:
context = ir.Context()
ireec_trans.register_all_dialects(context)
@@ -415,48 +173,15 @@ def create_context() -> ir.Context:
if __name__ == "__main__":
import argparse
from pathlib import Path
def path_expand(s):
return Path(s).expanduser().resolve()
parser = argparse.ArgumentParser()
parser.add_argument(
"-model",
type=path_expand,
default="model.mlir",
help="Path to the input mlir file",
)
parser.add_argument(
"-config_path",
type=path_expand,
default="best_configs.json",
help="Path where stores the op config file",
)
parser.add_argument(
"-output_path",
type=path_expand,
default="tuned_model.mlir",
help="Path to save the annotated mlir file",
)
parser.add_argument(
"-search_op",
type=str,
default="all",
help="Op to be optimized. options are matmul, bmm, conv.",
)
args = parser.parse_args()
with create_context() as ctx:
module = model_annotation(
ctx,
input_contents=args.model,
config_path=args.config_path,
search_op=args.search_op,
input_contents=sys.argv[1],
config_path=sys.argv[2],
search_op="all",
)
mlir_str = str(module)
with open(args.output_path, "w") as f:
filename = "tuned_model.mlir"
with open(filename, "w") as f:
f.write(mlir_str)
print(f"Saved mlir in {args.output_path}.")
print(f"Saved mlir in {filename}.")

View File

@@ -93,23 +93,4 @@ parser.add_argument(
help="Specify where to save downloaded shark_tank artifacts. If this is not set, the default is ~/.local/shark_tank/.",
)
# Which dispatches to benchmark: "All" selects every dispatch, the default
# (None) disables dispatch benchmarking.
parser.add_argument(
    "--dispatch_benchmarks",
    default=None,
    help='dispatches to return benchmark data on. use "All" for all, and None for none.',
)
# Output directory for the data produced by --dispatch_benchmarks.
parser.add_argument(
    "--dispatch_benchmarks_dir",
    default="temp_dispatch_benchmarks",
    help='directory where you want to store dispatch data generated with "--dispatch_benchmarks"',
)
# Boolean switch, off unless passed on the command line.
parser.add_argument(
    "--enable_conv_transform",
    default=False,
    action="store_true",
    help="Enables the --iree-flow-enable-conv-nchw-to-nhwc-transform flag.",
)
shark_args, unknown = parser.parse_known_args()

View File

@@ -23,8 +23,6 @@ from datetime import datetime
import time
import csv
import os
import torch
import torch._dynamo as dynamo
class OnnxFusionOptions(object):
@@ -41,52 +39,29 @@ class OnnxFusionOptions(object):
self.no_attention_mask = False
def check_requirements(frontend):
    """Report whether the packages needed to benchmark ``frontend`` are installed.

    Args:
        frontend: Frontend name. ``"torch"`` requires ``torchvision``;
            ``"tensorflow"`` or ``"tf"`` requires both ``keras`` and
            ``tensorflow``.

    Returns:
        True when every required package can be located, False otherwise.
        Any other frontend name yields False.
    """
    # ``importlib.util`` is a submodule: a bare ``import importlib`` does not
    # guarantee it is loaded, so import the function explicitly.
    from importlib.util import find_spec

    if frontend == "torch":
        required = ("torchvision",)
    elif frontend in ["tensorflow", "tf"]:
        required = ("keras", "tensorflow")
    else:
        return False
    # find_spec only locates the package; it does not import it.
    return all(find_spec(pkg) is not None for pkg in required)
class SharkBenchmarkRunner(SharkRunner):
# SharkRunner derived class with Benchmarking capabilities.
def __init__(
self,
mlir_module: bytes,
mlir_module: str,
function_name: str = "forward",
device: str = "none",
mlir_dialect: str = "linalg",
extra_args: list = [],
):
self.device = shark_args.device if device == "none" else device
self.enable_tf32 = shark_args.enable_tf32
self.frontend_model = None
self.vmfb_file = None
self.mlir_dialect = mlir_dialect
self.extra_args = extra_args
SharkRunner.__init__(
self,
mlir_module,
function_name,
device,
self.mlir_dialect,
self.extra_args,
compile_vmfb=True,
)
if self.vmfb_file == None:
self.vmfb_file = export_iree_module_to_vmfb(
mlir_module,
device,
shark_args.repro_dir,
self.mlir_dialect,
extra_args=self.extra_args,
mlir_module, device, shark_args.repro_dir, self.mlir_dialect
)
def setup_cl(self, input_tensors):
@@ -96,11 +71,11 @@ class SharkBenchmarkRunner(SharkRunner):
input_tensors,
mlir_dialect=self.mlir_dialect,
)
print(self.benchmark_cl)
def benchmark_frontend(self, modelname):
if self.mlir_dialect in ["linalg", "torch"]:
return self.benchmark_torch(modelname)
elif self.mlir_dialect in ["mhlo", "tf"]:
return self.benchmark_tf(modelname)
@@ -110,8 +85,6 @@ class SharkBenchmarkRunner(SharkRunner):
if self.device == "cuda":
torch.set_default_tensor_type(torch.cuda.FloatTensor)
if self.enable_tf32:
torch.backends.cuda.matmul.allow_tf32 = True
else:
torch.set_default_tensor_type(torch.FloatTensor)
torch_device = torch.device(
@@ -119,7 +92,6 @@ class SharkBenchmarkRunner(SharkRunner):
)
HFmodel, input = get_torch_model(modelname)[:2]
frontend_model = HFmodel.model
frontend_model = dynamo.optimize("inductor")(frontend_model)
frontend_model.to(torch_device)
input.to(torch_device)
@@ -142,45 +114,32 @@ class SharkBenchmarkRunner(SharkRunner):
def benchmark_tf(self, modelname):
import tensorflow as tf
visible_default = tf.config.list_physical_devices("GPU")
try:
tf.config.set_visible_devices([], "GPU")
visible_devices = tf.config.get_visible_devices()
for device in visible_devices:
assert device.device_type != "GPU"
except:
# Invalid device or cannot modify virtual devices once initialized.
pass
from tank.model_utils_tf import get_tf_model
# tf_device = "/GPU:0" if self.device == "cuda" else "/CPU:0"
tf_device = "/CPU:0"
with tf.device(tf_device):
model, input, = get_tf_model(
modelname
)[:2]
frontend_model = model
model, input, = get_tf_model(
modelname
)[:2]
frontend_model = model
for i in range(shark_args.num_warmup_iterations):
frontend_model.forward(*input)
for i in range(shark_args.num_warmup_iterations):
frontend_model.forward(*input)
begin = time.time()
for i in range(shark_args.num_iterations):
out = frontend_model.forward(*input)
if i == shark_args.num_iterations - 1:
end = time.time()
break
print(
f"TF benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
)
return [
f"{shark_args.num_iterations/(end-begin)}",
f"{((end-begin)/shark_args.num_iterations)*1000}",
]
begin = time.time()
for i in range(shark_args.num_iterations):
out = frontend_model.forward(*input)
if i == shark_args.num_iterations - 1:
end = time.time()
break
print(
f"TF benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
)
return [
f"{shark_args.num_iterations/(end-begin)}",
f"{((end-begin)/shark_args.num_iterations)*1000}",
]
def benchmark_c(self):
print(self.benchmark_cl)
result = run_benchmark_module(self.benchmark_cl)
print(f"Shark-IREE-C benchmark:{result} iter/second")
return [f"{result}", f"{1000/result}"]
@@ -188,11 +147,11 @@ class SharkBenchmarkRunner(SharkRunner):
def benchmark_python(self, inputs):
input_list = [x for x in inputs]
for i in range(shark_args.num_warmup_iterations):
self.run("forward", input_list)
self.run(input_list)
begin = time.time()
for i in range(shark_args.num_iterations):
out = self.run("forward", input_list)
out = self.run(input_list)
if i == shark_args.num_iterations - 1:
end = time.time()
print(
@@ -290,15 +249,19 @@ for currently supported models. Exiting benchmark ONNX."
return [param_count, model_tags, model_notes]
def compare_bench_results(self, baseline: str, result: str):
if baseline is not None:
# Takes a baseline and a result string and calculates a comparison, e.g. "1.04x baseline".
a = float(baseline)
b = float(result)
# Takes two numbers represented as strings and returns "<n>x slower/faster", as in "result is <n>x slower than baseline".
a = float(baseline)
b = float(result)
if a < b:
# result slower than baseline
comparison = (b - a) / a
comp_str = f"{round(comparison, 2)}x slower"
elif a > b:
# result faster than baseline
comparison = a / b
comp_str = f"{round(comparison, 2)}x baseline"
comp_str = f"{round(comparison, 2)}x faster"
else:
comp_str = "N/A"
comp_str = "equal"
return comp_str
def benchmark_all_csv(
@@ -339,10 +302,7 @@ for currently supported models. Exiting benchmark ONNX."
else:
bench_result["shape_type"] = "static"
bench_result["device"] = device_str
if "fp16" in modelname:
bench_result["data_type"] = "float16"
else:
bench_result["data_type"] = inputs[0].dtype
bench_result["data_type"] = inputs[0].dtype
for e in engines:
(
bench_result["param_count"],
@@ -351,21 +311,17 @@ for currently supported models. Exiting benchmark ONNX."
) = ["", "", ""]
if e == "frontend":
bench_result["engine"] = frontend
if check_requirements(frontend):
(
bench_result["iter/sec"],
bench_result["ms/iter"],
) = self.benchmark_frontend(modelname)
self.frontend_result = bench_result["ms/iter"]
bench_result["vs. PyTorch/TF"] = "baseline"
(
bench_result["param_count"],
bench_result["tags"],
bench_result["notes"],
) = self.get_metadata(modelname)
else:
self.frontend_result = None
continue
(
bench_result["iter/sec"],
bench_result["ms/iter"],
) = self.benchmark_frontend(modelname)
self.frontend_result = bench_result["ms/iter"]
bench_result["vs. PyTorch/TF"] = "="
(
bench_result["param_count"],
bench_result["tags"],
bench_result["notes"],
) = self.get_metadata(modelname)
elif e == "shark_python":
bench_result["engine"] = "shark_python"

View File

@@ -14,58 +14,11 @@
import numpy as np
import os
from tqdm.std import tqdm
import sys
import urllib.request
import json
import hashlib
from pathlib import Path
from shark.parser import shark_args
from google.cloud import storage
def download_public_file(
    full_gs_url, destination_folder_name, single_file=False
):
    """Download public blobs from a Google Cloud Storage bucket.

    Uses an anonymous storage client, so only publicly readable buckets work.

    Args:
        full_gs_url: URL of the form ``gs://<bucket>/<path>``. The third
            "/"-separated component is taken as the bucket name.
        destination_folder_name: In directory mode, the local directory that
            receives one file per listed blob (named after the blob's last
            path component). In single-file mode, the full local path of the
            file to write.
        single_file: When True, only the blob whose last path component
            equals the last component of ``full_gs_url`` is downloaded, and
            it is written to ``destination_folder_name``.
    """
    storage_client = storage.Client.create_anonymous_client()
    url_parts = full_gs_url.split("/")
    bucket_name = url_parts[2]
    dest_filename = None
    desired_file = None
    if single_file:
        desired_file = url_parts[-1]
        # List the parent "directory" and pick the wanted blob out of it.
        source_blob_name = "/".join(url_parts[3:-1])
        destination_folder_name, dest_filename = os.path.split(
            destination_folder_name
        )
    else:
        source_blob_name = "/".join(url_parts[3:])
    bucket = storage_client.bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=source_blob_name)
    # os.path.split gives "" for a bare filename, which means the current
    # directory; makedirs also copes with nested, not-yet-existing parents.
    if destination_folder_name:
        os.makedirs(destination_folder_name, exist_ok=True)
    for blob in blobs:
        blob_name = blob.name.split("/")[-1]
        if not blob_name:
            # A name ending in "/" has no file component to write to.
            continue
        if single_file:
            if blob_name != desired_file:
                continue
            # Write only to the requested path. Previously the matched blob
            # was also saved a second time under its upstream name, which
            # could overwrite a local file of that name.
            target_name = dest_filename
        else:
            target_name = blob_name
        destination_filename = os.path.join(
            destination_folder_name, target_name
        )
        with open(destination_filename, "wb") as f:
            # wrapattr reports progress on every write to the file object.
            with tqdm.wrapattr(f, "write", total=blob.size) as file_obj:
                storage_client.download_blob_to_file(blob, file_obj)
input_type_to_np_dtype = {
"float32": np.float32,
@@ -77,6 +30,7 @@ input_type_to_np_dtype = {
"int8": np.int8,
}
# Save the model under the user's home directory so it needn't be fetched every time in the CI.
home = str(Path.home())
alt_path = os.path.join(os.path.dirname(__file__), "../gen_shark_tank/")
@@ -96,10 +50,10 @@ if custom_path:
else:
WORKDIR = os.path.join(home, ".local/shark_tank/")
print(
f"shark_tank local cache is located at {WORKDIR} . You may change this by setting the --local_tank_cache= flag"
f"shark_tank local cache is located at {WORKDIR} . You may change this by setting the --local_tank_cache="
" pytest flag"
)
# Checks whether the directory and files exists.
def check_dir_exists(model_name, frontend="torch", dynamic=""):
model_dir = os.path.join(WORKDIR, model_name)
@@ -125,67 +79,66 @@ def check_dir_exists(model_name, frontend="torch", dynamic=""):
and os.path.isfile(os.path.join(model_dir, "golden_out.npz"))
and os.path.isfile(os.path.join(model_dir, "hash.npy"))
):
print(f"""Using cached models from {WORKDIR}...""")
print(
f"""The models are present in the {WORKDIR}. If you want a fresh
download, consider deleting the directory."""
)
return True
return False
# Downloads the torch model from gs://shark_tank dir.
def download_model(
model_name,
dynamic=False,
tank_url="gs://shark_tank/latest",
frontend=None,
tuned=None,
def download_torch_model(
model_name, dynamic=False, tank_url="gs://shark_tank/latest"
):
model_name = model_name.replace("/", "_")
dyn_str = "_dynamic" if dynamic else ""
os.makedirs(WORKDIR, exist_ok=True)
model_dir_name = model_name + "_" + frontend
model_dir = os.path.join(WORKDIR, model_dir_name)
full_gs_url = tank_url.rstrip("/") + "/" + model_dir_name
model_dir_name = model_name + "_torch"
if shark_args.update_tank == True:
print(f"Updating artifacts for model {model_name}...")
download_public_file(full_gs_url, model_dir)
def gs_download_model():
gs_command = (
'gsutil -o "GSUtil:parallel_process_count=1" cp -r '
+ tank_url
+ "/"
+ model_dir_name
+ " "
+ WORKDIR
)
if os.system(gs_command) != 0:
raise Exception("model not present in the tank. Contact Nod Admin")
elif not check_dir_exists(
model_dir_name, frontend=frontend, dynamic=dyn_str
):
print(f"Downloading artifacts for model {model_name}...")
download_public_file(full_gs_url, model_dir)
if not check_dir_exists(model_dir_name, frontend="torch", dynamic=dyn_str):
gs_download_model()
else:
if not _internet_connected():
print(
"No internet connection. Using the model already present in the tank."
)
else:
local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
gs_hash_url = (
tank_url.rstrip("/") + "/" + model_dir_name + "/hash.npy"
)
download_public_file(
gs_hash_url,
os.path.join(model_dir, "upstream_hash.npy"),
single_file=True,
)
try:
upstream_hash = str(
np.load(os.path.join(model_dir, "upstream_hash.npy"))
)
except FileNotFoundError:
upstream_hash = None
if local_hash != upstream_hash:
model_dir = os.path.join(WORKDIR, model_dir_name)
local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
gs_hash = (
'gsutil -o "GSUtil:parallel_process_count=1" cp '
+ tank_url
+ "/"
+ model_dir_name
+ "/hash.npy"
+ " "
+ os.path.join(model_dir, "upstream_hash.npy")
)
if os.system(gs_hash) != 0:
raise Exception("hash of the model not present in the tank.")
upstream_hash = str(
np.load(os.path.join(model_dir, "upstream_hash.npy"))
)
if local_hash != upstream_hash:
if shark_args.update_tank == True:
gs_download_model()
else:
print(
"Hash does not match upstream in gs://shark_tank/latest. If you want to use locally generated artifacts, this is working as intended. Otherwise, run with --update_tank."
"Hash does not match upstream in gs://shark_tank/. If you are using SHARK Downloader with locally generated artifacts, this is working as intended."
)
model_dir = os.path.join(WORKDIR, model_dir_name)
tuned_str = "" if tuned is None else "_" + tuned
suffix = f"{dyn_str}_{frontend}{tuned_str}.mlir"
filename = os.path.join(model_dir, model_name + suffix)
with open(filename, mode="rb") as f:
with open(
os.path.join(model_dir, model_name + dyn_str + "_torch.mlir")
) as f:
mlir_file = f.read()
function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
@@ -197,11 +150,129 @@ def download_model(
return mlir_file, function_name, inputs_tuple, golden_out_tuple
def _internet_connected():
import requests as req
# Downloads the tflite model from gs://shark_tank dir.
def download_tflite_model(
model_name, dynamic=False, tank_url="gs://shark_tank/latest"
):
dyn_str = "_dynamic" if dynamic else ""
os.makedirs(WORKDIR, exist_ok=True)
model_dir_name = model_name + "_tflite"
try:
req.get("http://1.1.1.1")
return True
except:
return False
def gs_download_model():
gs_command = (
'gsutil -o "GSUtil:parallel_process_count=1" cp -r '
+ tank_url
+ "/"
+ model_dir_name
+ " "
+ WORKDIR
)
if os.system(gs_command) != 0:
raise Exception("model not present in the tank. Contact Nod Admin")
if not check_dir_exists(
model_dir_name, frontend="tflite", dynamic=dyn_str
):
gs_download_model()
else:
model_dir = os.path.join(WORKDIR, model_dir_name)
local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
gs_hash = (
'gsutil -o "GSUtil:parallel_process_count=1" cp '
+ tank_url
+ "/"
+ model_dir_name
+ "/hash.npy"
+ " "
+ os.path.join(model_dir, "upstream_hash.npy")
)
if os.system(gs_hash) != 0:
raise Exception("hash of the model not present in the tank.")
upstream_hash = str(
np.load(os.path.join(model_dir, "upstream_hash.npy"))
)
if local_hash != upstream_hash:
if shark_args.update_tank == True:
gs_download_model()
else:
print(
"Hash does not match upstream in gs://shark_tank/. If you are using SHARK Downloader with locally generated artifacts, this is working as intended."
)
model_dir = os.path.join(WORKDIR, model_dir_name)
with open(
os.path.join(model_dir, model_name + dyn_str + "_tflite.mlir")
) as f:
mlir_file = f.read()
function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
inputs = np.load(os.path.join(model_dir, "inputs.npz"))
golden_out = np.load(os.path.join(model_dir, "golden_out.npz"))
inputs_tuple = tuple([inputs[key] for key in inputs])
golden_out_tuple = tuple([golden_out[key] for key in golden_out])
return mlir_file, function_name, inputs_tuple, golden_out_tuple
def download_tf_model(
    model_name, tuned=None, tank_url="gs://shark_tank/latest"
):
    """Fetch a TensorFlow model directory into WORKDIR and load its artifacts.

    Args:
        model_name: Model name; any "/" is replaced by "_" and the directory
            used is `<model_name>_tf`.
        tuned: Optional tag; when given, `<model_name>_tf_<tuned>.mlir` is
            preferred and `<model_name>_tf.mlir` is the fallback.
        tank_url: Bucket prefix the model directory is copied from.

    Returns:
        Tuple `(mlir_file, function_name, inputs_tuple, golden_out_tuple)`.

    Raises:
        Exception: If a `gsutil` command exits with a non-zero status.
    """
    model_name = model_name.replace("/", "_")
    os.makedirs(WORKDIR, exist_ok=True)
    model_dir_name = model_name + "_tf"

    gsutil_cp = 'gsutil -o "GSUtil:parallel_process_count=1" cp '
    remote_dir = tank_url + "/" + model_dir_name

    def gs_download_model():
        # Recursively copy the whole model directory into WORKDIR.
        status = os.system(gsutil_cp + "-r " + remote_dir + " " + WORKDIR)
        if status != 0:
            raise Exception("model not present in the tank. Contact Nod Admin")

    if check_dir_exists(model_dir_name, frontend="tf"):
        # A local copy exists: compare its hash with the upstream one.
        model_dir = os.path.join(WORKDIR, model_dir_name)
        local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
        upstream_hash_path = os.path.join(model_dir, "upstream_hash.npy")
        status = os.system(
            gsutil_cp + remote_dir + "/hash.npy" + " " + upstream_hash_path
        )
        if status != 0:
            raise Exception("hash of the model not present in the tank.")
        upstream_hash = str(
            np.load(os.path.join(model_dir, "upstream_hash.npy"))
        )
        if local_hash != upstream_hash:
            if shark_args.update_tank == True:
                gs_download_model()
            else:
                print(
                    "Hash does not match upstream in gs://shark_tank/. If you are using SHARK Downloader with locally generated artifacts, this is working as intended."
                )
    else:
        gs_download_model()

    model_dir = os.path.join(WORKDIR, model_dir_name)
    if tuned is None:
        suffix = "_tf.mlir"
    else:
        suffix = "_tf_" + tuned + ".mlir"
    filename = os.path.join(model_dir, model_name + suffix)
    if not os.path.isfile(filename):
        # Fall back to the untuned artifact.
        filename = os.path.join(model_dir, model_name + "_tf.mlir")

    with open(filename) as mlir_handle:
        mlir_file = mlir_handle.read()

    function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
    inputs = np.load(os.path.join(model_dir, "inputs.npz"))
    golden_out = np.load(os.path.join(model_dir, "golden_out.npz"))

    inputs_tuple = tuple(inputs[key] for key in inputs)
    golden_out_tuple = tuple(golden_out[key] for key in golden_out)
    return mlir_file, function_name, inputs_tuple, golden_out_tuple

View File

@@ -55,7 +55,6 @@ class SharkImporter:
inputs: tuple = (),
frontend: str = "torch",
raw_model_file: str = "",
return_str: bool = False,
):
self.module = module
self.inputs = None if len(inputs) == 0 else inputs
@@ -66,7 +65,6 @@ class SharkImporter:
)
sys.exit(1)
self.raw_model_file = raw_model_file
self.return_str = return_str
# NOTE: The default function for torch is "forward" and tf-lite is "main".
@@ -74,31 +72,24 @@ class SharkImporter:
from shark.torch_mlir_utils import get_torch_mlir_module
return get_torch_mlir_module(
self.module,
self.inputs,
is_dynamic,
tracing_required,
self.return_str,
self.module, self.inputs, is_dynamic, tracing_required
)
def _tf_mlir(self, func_name, save_dir="./shark_tmp/"):
def _tf_mlir(self, func_name):
from iree.compiler import tf as tfc
return tfc.compile_module(
self.module,
exported_names=[func_name],
import_only=True,
output_file=save_dir,
self.module, exported_names=[func_name], import_only=True
)
def _tflite_mlir(self, func_name, save_dir="./shark_tmp/"):
def _tflite_mlir(self, func_name):
from iree.compiler import tflite as tflitec
from shark.iree_utils._common import IREE_TARGET_MAP
self.mlir_model = tflitec.compile_file(
self.raw_model_file, # in tflite, it is a path to .tflite file, not a tflite interpreter
input_type="tosa",
import_only=True,
output_file=save_dir,
)
return self.mlir_model
@@ -108,7 +99,6 @@ class SharkImporter:
is_dynamic=False,
tracing_required=False,
func_name="forward",
save_dir="./shark_tmp/",
):
if self.frontend in ["torch", "pytorch"]:
if self.inputs == None:
@@ -118,15 +108,15 @@ class SharkImporter:
sys.exit(1)
return self._torch_mlir(is_dynamic, tracing_required), func_name
if self.frontend in ["tf", "tensorflow"]:
return self._tf_mlir(func_name, save_dir), func_name
return self._tf_mlir(func_name), func_name
if self.frontend in ["tflite", "tf-lite"]:
func_name = "main"
return self._tflite_mlir(func_name, save_dir), func_name
return self._tflite_mlir(func_name), func_name
# Converts the frontend specific tensors into np array.
def convert_to_numpy(self, array_tuple: tuple):
if self.frontend in ["torch", "pytorch"]:
return [x.detach().cpu().numpy() for x in array_tuple]
return [x.detach().numpy() for x in array_tuple]
if self.frontend in ["tf", "tensorflow"]:
return [x.numpy() for x in array_tuple]
@@ -140,20 +130,19 @@ class SharkImporter:
outputs_name = "golden_out.npz"
func_file_name = "function_name"
model_name_mlir = model_name + "_" + self.frontend + ".mlir"
try:
inputs = [x.cpu().detach() for x in inputs]
except AttributeError:
try:
inputs = [x.numpy() for x in inputs]
except AttributeError:
inputs = [x for x in inputs]
np.savez(os.path.join(dir, inputs_name), *inputs)
np.savez(os.path.join(dir, outputs_name), *outputs)
np.save(os.path.join(dir, func_file_name), np.array(func_name))
mlir_str = mlir_data
if self.frontend == "torch":
with open(os.path.join(dir, model_name_mlir), "wb") as mlir_file:
mlir_file.write(mlir_data)
mlir_str = mlir_data.operation.get_asm()
elif self.frontend == "tf":
mlir_str = mlir_data.decode("utf-8")
elif self.frontend == "tflite":
mlir_str = mlir_data.decode("utf-8")
with open(os.path.join(dir, model_name_mlir), "w") as mlir_file:
mlir_file.write(mlir_str)
return
@@ -170,13 +159,9 @@ class SharkImporter:
f"There is no input provided: {self.inputs}, please provide inputs or simply run import_mlir."
)
sys.exit(1)
model_name_mlir = model_name + "_" + self.frontend + ".mlir"
artifact_path = os.path.join(dir, model_name_mlir)
imported_mlir = self.import_mlir(
is_dynamic,
tracing_required,
func_name,
save_dir=artifact_path,
is_dynamic, tracing_required, func_name
)
# TODO: Make sure that any generic function name is accepted. Currently takes in the default function names.
# TODO: Check for multiple outputs.
@@ -186,7 +171,7 @@ class SharkImporter:
golden_out = self.module(*self.inputs)
if torch.is_tensor(golden_out):
golden_out = tuple(
golden_out.detach().cpu().numpy(),
golden_out.detach().numpy(),
)
else:
golden_out = self.convert_to_numpy(golden_out)
@@ -249,182 +234,3 @@ class SharkImporter:
self.inputs,
golden_out,
)
def get_f16_inputs(inputs, is_f16, f16_input_mask):
    """Convert selected inputs to half precision via their `.half()` method.

    Args:
        inputs: Sequence of objects exposing `.half()`.
        is_f16: When falsy, `inputs` is returned untouched (same object).
        f16_input_mask: None to convert every input, otherwise an indexable
            of truthy/falsy flags, one per input; only flagged inputs are
            converted. Array-like masks are accepted.

    Returns:
        `inputs` itself when `is_f16` is falsy, otherwise a new tuple.
    """
    if not is_f16:
        return inputs

    # Identity check: `== None` on an array-like mask compares elementwise
    # and cannot be used as a condition.
    if f16_input_mask is None:
        return tuple(x.half() for x in inputs)

    return tuple(
        x.half() if f16_input_mask[i] else x for i, x in enumerate(inputs)
    )
# Rewrites nodes of `fx_g.graph` in place and then lints the graph. Nothing is
# returned. Per node it may:
#   * overwrite the kwargs of aten.arange / aten.empty calls with
#     `kwargs_dict` (float16 dtype, cpu device, pin_memory False);
#   * insert a prims.convert_element_type(..., float32) on the first argument
#     of aten.var_mean;
#   * insert an aten._to_copy(..., dtype=float16) on "getitem*" nodes whose
#     first argument is an aten.var_mean node;
#   * insert an aten.zero_ after aten.empty calls.
# NOTE(review): indentation is not visible in this view, so the exact nesting
# of the `if` blocks below (e.g. whether the "getitem" check sits under the
# `call_function` check) should be confirmed against the real file.
def transform_fx(fx_g):
import torch
# The same dict object is assigned to every matching node below.
kwargs_dict = {
"dtype": torch.float16,
"device": torch.device(type="cpu"),
"pin_memory": False,
}
for node in fx_g.graph.nodes:
if node.op == "call_function":
if node.target in [
torch.ops.aten.arange,
torch.ops.aten.empty,
]:
# Replaces (does not merge with) whatever kwargs the node had.
node.kwargs = kwargs_dict
# The input of aten.var_mean is upcast to fp32 here; its getitem results are
# cast back to fp16 further down.
if node.target in [torch.ops.aten.var_mean]:
with fx_g.graph.inserting_before(node):
new_node = fx_g.graph.call_function(
torch.ops.prims.convert_element_type,
args=(node.args[0], torch.float32),
kwargs={},
)
# Only the first two positional args are kept; the first now goes through
# the fp32 conversion node.
node.args = (new_node, node.args[1])
if node.name.startswith("getitem"):
with fx_g.graph.inserting_before(node):
if node.args[0].target in [torch.ops.aten.var_mean]:
new_node = fx_g.graph.call_function(
torch.ops.aten._to_copy,
args=(node,),
kwargs={"dtype": torch.float16},
)
node.append(new_node)
node.replace_all_uses_with(new_node)
# args/kwargs are re-assigned after replace_all_uses_with -- presumably
# because that call also rewrote new_node's own reference to `node`
# (TODO confirm against torch.fx docs).
new_node.args = (node,)
new_node.kwargs = {"dtype": torch.float16}
# aten.empty should be filled with zeros: an aten.zero_ call on the node is
# inserted after it and takes over all of its uses.
if node.target in [torch.ops.aten.empty]:
with fx_g.graph.inserting_after(node):
new_node = fx_g.graph.call_function(
torch.ops.aten.zero_,
args=(node,),
)
node.append(new_node)
node.replace_all_uses_with(new_node)
# Same re-assignment pattern as for the _to_copy node above.
new_node.args = (node,)
fx_g.graph.lint()
# Doesn't replace the None type.
# Rewrites the "output" node(s) of `fx_g.graph`: when the output argument is a
# list, None entries are dropped and the remainder is returned either as the
# single remaining element or as a tuple. The graph is then linted and
# recompiled, and the same `fx_g` object is returned.
# NOTE(review): indentation is not visible in this view; the
# `len(out_nodes) == 1` check is presumably nested under the
# `isinstance(node_arg, list)` branch -- confirm against the real file.
def change_fx_graph_return_to_tuple(fx_g):
for node in fx_g.graph.nodes:
if node.op == "output":
# output nodes always have one argument
node_arg = node.args[0]
out_nodes = []
if isinstance(node_arg, list):
# Don't return NoneType elements.
for out_node in node_arg:
if not isinstance(out_node, type(None)):
out_nodes.append(out_node)
# If there is a single tensor/element to be returned, don't create
# a tuple for it.
if len(out_nodes) == 1:
node.args = out_nodes
else:
node.args = (tuple(out_nodes),)
fx_g.graph.lint()
fx_g.recompile()
return fx_g
def flatten_training_input(inputs):
    """Flatten one level of dict/tuple nesting in `inputs` into a flat tuple.

    Dict entries contribute their values, each passed through `.detach()`;
    tuple entries contribute their elements unchanged; anything else is kept
    as a single element. Order is preserved.
    """
    flat = []
    for entry in inputs:
        if isinstance(entry, dict):
            flat.extend(item.detach() for item in entry.values())
        elif isinstance(entry, tuple):
            flat.extend(entry)
        else:
            flat.append(entry)
    return tuple(flat)
# Applies fx conversion to the model and imports the mlir.
#
# Steps, in order: trace `model` on `inputs` with make_fx (using the
# decomposition table below), strip op overloads, optionally convert the graph
# and inputs to fp16, optionally reshape outputs/inputs for training, script
# the graph with torch.jit.script, and import it through SharkImporter.
#
# Parameters:
#   model          -- callable traced by make_fx.
#   inputs         -- positional inputs; unpacked into the traced call.
#   is_f16         -- when true, the graph is `.half()`-ed and transform_fx is
#                     applied; also forwarded to get_f16_inputs.
#   f16_input_mask -- forwarded to get_f16_inputs.
#   debug          -- with `is_f16` false, uses import_debug() instead of
#                     import_mlir().
#   training       -- when true, the graph return is rewritten by
#                     change_fx_graph_return_to_tuple and the inputs are
#                     flattened with flatten_training_input.
#   return_str     -- forwarded to SharkImporter.
# Returns the pair (mlir_module, func_name) produced by the importer.
# NOTE(review): indentation is not visible in this view; the extent of the
# `if is_f16:` / `if training:` bodies below is inferred -- confirm against
# the real file.
def import_with_fx(
model,
inputs,
is_f16=False,
f16_input_mask=None,
debug=False,
training=False,
return_str=False,
):
import torch
from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions
# TODO: Control the decompositions.
fx_g = make_fx(
model,
decomposition_table=get_decompositions(
[
torch.ops.aten.embedding_dense_backward,
torch.ops.aten.native_layer_norm_backward,
torch.ops.aten.slice_backward,
torch.ops.aten.select_backward,
torch.ops.aten.norm.ScalarOpt_dim,
torch.ops.aten.native_group_norm,
torch.ops.aten.upsample_bilinear2d.vec,
torch.ops.aten.split.Tensor,
torch.ops.aten.split_with_sizes,
torch.ops.aten.native_layer_norm,
]
),
)(*inputs)
# A fresh CodeGen object is installed before recompiling -- presumably to
# reset any custom codegen left by tracing (TODO confirm).
fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
fx_g.recompile()
def strip_overloads(gm):
"""
Modifies the target of graph nodes in :attr:`gm` to strip overloads.
Args:
gm(fx.GraphModule): The input Fx graph module to be modified
"""
for node in gm.graph.nodes:
if isinstance(node.target, torch._ops.OpOverload):
node.target = node.target.overloadpacket
gm.recompile()
strip_overloads(fx_g)
if is_f16:
fx_g = fx_g.half()
transform_fx(fx_g)
fx_g.recompile()
if training:
change_fx_graph_return_to_tuple(fx_g)
inputs = flatten_training_input(inputs)
ts_graph = torch.jit.script(fx_g)
# Inputs are cast after scripting; a no-op when is_f16 is false.
inputs = get_f16_inputs(inputs, is_f16, f16_input_mask)
mlir_importer = SharkImporter(
ts_graph,
inputs,
frontend="torch",
return_str=return_str,
)
# The debug import path is only taken for non-fp16 imports.
if debug and not is_f16:
(mlir_module, func_name), _, _ = mlir_importer.import_debug()
return mlir_module, func_name
mlir_module, func_name = mlir_importer.import_mlir()
return mlir_module, func_name

View File

@@ -12,8 +12,6 @@
from shark.iree_utils.compile_utils import (
export_iree_module_to_vmfb,
load_flatbuffer,
create_dispatch_dirs,
compile_benchmark_dirs,
)
import os
from shark.shark_runner import SharkRunner
@@ -39,7 +37,9 @@ class SharkInference:
Attributes
----------
mlir_module : str
mlir_module represented in string; modules from torch-mlir are serialized in bytecode format.
mlir_module represented in string.
function_name : str
function to execute in the given mlir_module.
device : str
device to execute the mlir_module on.
currently supports cpu, cuda, vulkan, and metal backends.
@@ -51,10 +51,10 @@ class SharkInference:
Methods
-------
__call__(function_name, inputs=None):
Runs the function with `function_name` within the mlir_module along
with the given inputs, if the inputs are not given it autogenerates the
inputs. Also, the inputs should be a numpy array.
run(inputs=None):
Runs the mlir_module with the given inputs, if the inputs are not
given it autogenerates the inputs. Also, the inputs should be a
numpy array.
input_info():
Gives the information about the inputs required by the `function_name`.
This can be expensive as it does string matching to do so.
@@ -63,94 +63,56 @@ class SharkInference:
def __init__(
self,
mlir_module: bytes,
mlir_module: str,
function_name: str = "forward",
device: str = "none",
mlir_dialect: str = "linalg",
is_benchmark: bool = False,
dispatch_benchmark: str = None,
dispatch_benchmark_dir: str = "temp_dispatch_benchmarks",
):
self.mlir_module = mlir_module
self.function_name = function_name
self.device = shark_args.device if device == "none" else device
self.mlir_dialect = mlir_dialect
self.is_benchmark = is_benchmark
self.dispatch_benchmarks = (
shark_args.dispatch_benchmarks
if dispatch_benchmark is None
else dispatch_benchmark
)
self.dispatch_benchmarks_dir = (
shark_args.dispatch_benchmarks_dir
if dispatch_benchmark_dir == "temp_dispatch_benchmarks"
else dispatch_benchmark_dir
)
self.shark_runner = None
def compile(self, extra_args=[]):
if self.dispatch_benchmarks is not None:
extra_args.append(
f"--iree-hal-dump-executable-sources-to={self.dispatch_benchmarks_dir}"
)
extra_args.append(
f"--iree-hal-dump-executable-binaries-to={self.dispatch_benchmarks_dir}"
)
temp_dir = self.dispatch_benchmarks_dir.split("/")
temp_dir[-1] = "temp_" + temp_dir[-1]
temp_dir = "/".join(temp_dir)
self.temp_dispatch_benchmarks_dir = temp_dir
extra_args.append(
f"--iree-hal-dump-executable-benchmarks-to={self.temp_dispatch_benchmarks_dir}"
)
def compile(self):
if self.is_benchmark == True:
from shark.shark_benchmark_runner import SharkBenchmarkRunner
self.shark_runner = SharkBenchmarkRunner(
self.mlir_module,
self.function_name,
self.device,
self.mlir_dialect,
extra_args=extra_args,
)
else:
self.shark_runner = SharkRunner(
self.mlir_module,
self.function_name,
self.device,
self.mlir_dialect,
extra_args=extra_args,
)
if self.dispatch_benchmarks is not None:
create_dispatch_dirs(self.dispatch_benchmarks_dir, self.device)
compile_benchmark_dirs(
self.dispatch_benchmarks_dir,
self.device,
self.dispatch_benchmarks,
)
os.system(f"rm -rf {self.temp_dispatch_benchmarks_dir}")
# inputs are considered to be tuple of np.array.
def __call__(self, function_name: str, inputs: tuple, send_to_host=True):
return self.shark_runner.run(function_name, inputs, send_to_host)
# Get all function names defined within the compiled module.
def get_functions_in_module(self):
return self.shark_runner.get_functions_in_module()
def forward(self, inputs: tuple):
return self.shark_runner.run(inputs)
# Captures the static input information from the mlir_module.
# TODO(pashu123): Generate the input information for dynamic shapes.
def _input_info(self, function_name):
def _input_info(self):
# func_key to get the line which contains the function.
func_key = "func.func @" + function_name
func_key = "func.func @" + self.function_name
func_header = None
for line in str(self.mlir_module).splitlines():
if func_key in line:
func_header = line
break
if func_header is None:
print(f"Function: {function_name} not found")
print(f"Function: {self.function_name} not found")
import re
@@ -182,22 +144,21 @@ class SharkInference:
# TODO: Instead of passing directory and having names decided by the module
# , user may want to save the module with manual names.
def save_module(self, dir=os.getcwd(), module_name=None, extra_args=[]):
def save_module(self, dir=os.getcwd()):
return export_iree_module_to_vmfb(
self.mlir_module,
self.device,
dir,
self.mlir_dialect,
module_name=module_name,
extra_args=extra_args,
self.function_name,
)
# load and return the module.
def load_module(self, path, extra_args=[]):
def load_module(self, path):
self.shark_runner = SharkRunner(
function_name=self.function_name,
device=self.device,
compile_vmfb=False,
extra_args=extra_args,
)
(
self.shark_runner.iree_compilation_module,
@@ -205,5 +166,6 @@ class SharkInference:
) = load_flatbuffer(
path,
self.device,
self.function_name,
)
return

View File

@@ -25,7 +25,7 @@ import sys
# supported dialects by the shark-runtime.
supported_dialects = {"linalg", "mhlo", "tosa", "tf-lite", "tm_tensor"}
supported_dialects = {"linalg", "mhlo", "tosa", "tf-lite"}
class SharkRunner:
@@ -39,6 +39,8 @@ class SharkRunner:
----------
mlir_module : str
mlir_module represented in string.
function_name : str
function to execute in the given mlir_module.
device : str
device to execute the mlir_module on.
currently supports cpu, cuda, vulkan, and metal backends.
@@ -48,10 +50,10 @@ class SharkRunner:
Methods
-------
run(function_name, inputs=None):
Runs the function with `function_name` within the mlir_module along
with the given inputs, if the inputs are not given it autogenerates the
inputs. Also, the inputs should be a numpy array.
run(inputs=None):
Runs the mlir_module with the given inputs, if the inputs are not
given it autogenerates the inputs. Also, the inputs should be a
numpy array.
input_info():
Gives the information about the inputs required by the `function_name`.
This can be expensive as it does string matching to do so.
@@ -59,19 +61,19 @@ class SharkRunner:
def __init__(
self,
mlir_module: bytes = None,
mlir_module: str = "none",
function_name: str = "forward",
device: str = "none",
mlir_dialect: str = "linalg",
extra_args: list = [],
compile_vmfb: bool = True,
):
self.mlir_module = mlir_module
self.function_name = function_name
self.device = shark_args.device if device == "none" else device
self.mlir_dialect = mlir_dialect
self.extra_args = extra_args
if check_device_drivers(self.device):
print(device_driver_info(self.device))
device_driver_info(self.device)
sys.exit(1)
if compile_vmfb == True:
@@ -83,19 +85,13 @@ class SharkRunner:
self.mlir_module,
self.device,
self.mlir_dialect,
extra_args=self.extra_args,
func_name=self.function_name,
)
def run(self, function_name, inputs: tuple, send_to_host=False):
def run(self, inputs: tuple):
return get_results(
self.iree_compilation_module,
function_name,
inputs,
self.iree_config,
self.mlir_dialect,
send_to_host,
)
# Get all function names defined within the compiled module.
def get_functions_in_module(self):
return self.iree_compilation_module._vm_module.function_names

View File

@@ -15,7 +15,6 @@
from shark.parser import shark_args
from shark.shark_runner import SharkRunner
from shark.backward_makefx import MakeFxModule
from shark.shark_importer import import_with_fx
import numpy as np
from tqdm import tqdm
import sys
@@ -68,21 +67,23 @@ class SharkTrainer:
self.frontend = frontend
# Training function is needed in the case of torch_fn.
def compile(self, training_fn=None, extra_args=[]):
def compile(self, training_fn=None):
if self.frontend in ["torch", "pytorch"]:
packed_inputs = (
dict(self.model.named_parameters()),
dict(self.model.named_buffers()),
tuple(self.input),
)
mlir_module, func_name = import_with_fx(
training_fn, packed_inputs, False, [], training=True
aot_module = MakeFxModule(
self.model, tuple(self.input), custom_inference_fn=training_fn
)
aot_module.generate_graph()
# Returns the backward graph.
training_graph = aot_module.training_graph
weights = self.get_torch_params()
self.shark_runner = SharkRunner(
mlir_module,
training_graph,
weights + self.input,
self.dynamic,
self.device,
"tm_tensor",
extra_args=extra_args,
self.jit_trace,
self.from_aot,
self.frontend,
)
elif self.frontend in ["tensorflow", "tf", "mhlo"]:
self.shark_runner = SharkRunner(
@@ -111,8 +112,8 @@ class SharkTrainer:
params = [x.numpy() for x in params]
print(f"Training started for {num_iters} iterations:")
for i in tqdm(range(num_iters)):
params = self.shark_runner.run(
"forward", params + self.input, self.frontend
params = self.shark_runner.forward(
params + self.input, self.frontend
)
return params

View File

@@ -1,315 +0,0 @@
# Copyright 2022 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from iree.runtime import query_available_drivers, get_driver
from shark.shark_downloader import download_model
from shark.shark_inference import SharkInference
from typing import List, Optional, Tuple
import numpy as np
import argparse
from shark.iree_utils._common import _IREE_DEVICE_MAP
import multiprocessing
from shark.shark_runner import supported_dialects
import logging
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures.thread import ThreadPoolExecutor
import time
import numpy as np
IREE_TO_SHARK_DRIVER_MAP = {v: k for k, v in _IREE_DEVICE_MAP.items()}
def stress_test_compiled_model(
    shark_module_path: str,
    function_name: str,
    device: str,
    inputs: List[np.ndarray],
    golden_out: List[np.ndarray],
    batch_size: int,
    max_iterations: int,
    max_duration_seconds: float,
    inference_timeout_seconds: float,
    tolerance_nulp: int,
    stress_test_index: int,
):
    """Repeatedly run a pre-compiled module on one device and check outputs.

    The first iteration's output is compared with the batched golden output
    within `tolerance_nulp` units in the last place; every later output must
    equal the first iteration's output exactly. The loop stops after
    `max_iterations` iterations or once `max_duration_seconds` have elapsed.
    Each inference is waited on for at most `inference_timeout_seconds`.
    """
    logging.info(
        f"Running stress test {stress_test_index} on device {device}."
    )

    # Every interaction with the module goes through this single worker
    # thread, which lets the inference call be awaited with a timeout.
    module_executor = ThreadPoolExecutor(1)
    construction = module_executor.submit(
        SharkInference,
        mlir_module=bytes(),
        function_name=function_name,
        device=device,
    )
    shark_module = construction.result()
    loading = module_executor.submit(
        shark_module.load_module, shark_module_path
    )
    loading.result()

    # Repeat inputs and expected outputs `batch_size` times along axis 0.
    input_batches = [np.repeat(arr, batch_size, axis=0) for arr in inputs]
    golden_output_batches = np.repeat(golden_out, batch_size, axis=0)

    report_interval_seconds = 10
    start_time = time.time()
    previous_report_time = start_time
    first_iteration_output = None

    for i in range(max_iterations):
        inference = module_executor.submit(shark_module.forward, input_batches)
        output = inference.result(inference_timeout_seconds)

        if first_iteration_output is not None:
            np.testing.assert_array_equal(output, first_iteration_output)
        else:
            np.testing.assert_array_almost_equal_nulp(
                golden_output_batches, output, nulp=tolerance_nulp
            )
            first_iteration_output = output

        current_time = time.time()
        if current_time - previous_report_time > report_interval_seconds:
            logging.info(
                f"Stress test {stress_test_index} on device "
                f"{device} at iteration {i+1}"
            )
            previous_report_time = current_time
        if current_time - start_time > max_duration_seconds:
            return

    logging.info(f"Stress test {stress_test_index} on device {device} done.")
def get_device_type(device_name: str):
    """Return the part of `device_name` before the first "://".

    A name without "://" is returned unchanged.
    """
    device_type, _separator, _path = device_name.partition("://")
    return device_type
def get_device_types(device_names: List[str]) -> List[str]:
    """Map each device name to its device type, preserving order.

    Args:
        device_names: Device names such as "cuda://0" or "cpu".

    Returns:
        One `get_device_type` result per name; duplicates are not removed.
    """
    return [get_device_type(device_name) for device_name in device_names]
def query_devices(device_types: Optional[List[str]] = None) -> List[str]:
    """List device URIs of the form "<device_type>://<path-or-id>".

    Args:
        device_types: Device types to query. When None, every driver reported
            by `query_available_drivers()` that has an entry in
            IREE_TO_SHARK_DRIVER_MAP is used.

    Returns:
        One URI per device reported by each driver, in query order.
    """
    if device_types is None:
        device_types = []
        for name in query_available_drivers():
            if name in IREE_TO_SHARK_DRIVER_MAP:
                device_types.append(IREE_TO_SHARK_DRIVER_MAP[name])

    devices = []
    for device_type in device_types:
        driver = get_driver(_IREE_DEVICE_MAP[device_type])
        for device_info in driver.query_available_devices():
            # Use the device path when there is one, otherwise its id.
            if device_info["path"] != "":
                uri_path = device_info["path"]
            else:
                uri_path = str(device_info["device_id"])
            devices.append(f"{device_type}://{uri_path}")
    return devices
def compile_stress_test_module(
    device_types: List[str], mlir_model: str, func_name: str, mlir_dialect: str
) -> List[str]:
    """Compile and save the model once per device type.

    Returns:
        The values returned by `SharkInference.save_module()`, in the same
        order as `device_types`.
    """

    def compile_for(device_type):
        logging.info(
            f"Compiling stress test model for device type {device_type}."
        )
        module = SharkInference(
            mlir_model,
            func_name,
            mlir_dialect=mlir_dialect,
            device=device_type,
        )
        return module.save_module()

    return [compile_for(device_type) for device_type in device_types]
def stress_test(
    model_name: str,
    dynamic_model: bool = False,
    device_types: Optional[List[str]] = None,
    device_names: Optional[List[str]] = None,
    batch_size: int = 1,
    max_iterations: int = 10**7,
    max_duration_seconds: float = 3600,
    inference_timeout_seconds: float = 60,
    mlir_dialect: str = "linalg",
    frontend: str = "torch",
    oversubscription_factor: int = 1,
    tolerance_nulp: int = 50000,
):
    """Download a model, compile it per device type and stress test devices.

    Args:
        model_name: Name passed to `download_model`.
        dynamic_model: Passed to `download_model` as `dynamic`.
        device_types: Device types whose devices are queried and added to the
            run. Devices are also queried when `device_names` is None.
        device_names: Explicit device names. The caller's list is not
            modified.
        batch_size, max_iterations, max_duration_seconds,
        inference_timeout_seconds, tolerance_nulp: Forwarded to
            `stress_test_compiled_model`.
        mlir_dialect: Dialect passed to `compile_stress_test_module`.
        frontend: Passed to `download_model`.
        oversubscription_factor: Number of concurrent runs per device.

    Raises:
        ValueError: If there is nothing to run (no devices, or an
            oversubscription factor below 1).
    """
    logging.info(f"Downloading stress test model {model_name}.")
    mlir_model, func_name, inputs, golden_out = download_model(
        model_name=model_name, dynamic=dynamic_model, frontend=frontend
    )

    if device_names is None or device_types is not None:
        # Work on a copy so the caller's list is not extended in place.
        device_names = [] if device_names is None else list(device_names)
        with ProcessPoolExecutor() as executor:
            # query_devices needs to run in a separate process,
            # because it will interfere with other processes that are forked later.
            device_names.extend(
                executor.submit(query_devices, device_types).result()
            )

    device_types_set = list(set(get_device_types(device_names)))

    with ProcessPoolExecutor() as executor:
        # This needs to run in a subprocess because when compiling for CUDA,
        # some stuff gets initialized and cuInit will fail in a forked process
        # later. It should be just compiling, but alas.
        shark_module_paths_set = executor.submit(
            compile_stress_test_module,
            device_types_set,
            mlir_model,
            func_name,
            mlir_dialect,
        ).result()

    device_type_shark_module_path_map = {
        device_type: module_path
        for device_type, module_path in zip(
            device_types_set, shark_module_paths_set
        )
    }
    # Keyed by device name, so a name listed more than once runs only once
    # per oversubscription round.
    device_name_shark_module_path_map = {
        device_name: device_type_shark_module_path_map[
            get_device_type(device_name)
        ]
        for device_name in device_names
    }

    process_count = (
        len(device_name_shark_module_path_map) * oversubscription_factor
    )
    if process_count < 1:
        # multiprocessing.Pool raises ValueError for a process count below 1;
        # raise the same type here with an actionable message.
        raise ValueError(
            "No stress tests to run: no devices were found/selected or the "
            "oversubscription factor is less than 1."
        )

    # This needs to run in a separate process, because it uses the driver cache
    # in IREE and a subsequent call to `iree.runtime.SystemContext.add_vm_module`
    # in a forked process will hang.
    with multiprocessing.Pool(process_count) as process_pool:
        process_pool.starmap(
            stress_test_compiled_model,
            [
                (
                    module_path,
                    func_name,
                    device_name,
                    inputs,
                    golden_out,
                    batch_size,
                    max_iterations,
                    max_duration_seconds,
                    inference_timeout_seconds,
                    tolerance_nulp,
                    stress_test_index,
                )
                for stress_test_index, (device_name, module_path) in enumerate(
                    list(device_name_shark_module_path_map.items())
                    * oversubscription_factor
                )
            ],
        )
if __name__ == "__main__":
    # Command-line entry point: parse the options and forward them to
    # stress_test().
    logging.basicConfig(encoding="utf-8", level=logging.INFO)

    parser = argparse.ArgumentParser(
        description="Downloads, compiles and runs a model from the tank to stress test the system."
    )

    # Model selection.
    parser.add_argument(
        "--model", default="alexnet", type=str, help="Model name in the tank."
    )
    parser.add_argument(
        "--dynamic",
        action="store_true",
        default=False,
        help="Use dynamic version of the model.",
    )
    parser.add_argument(
        "--frontend", default="torch", type=str, help="Frontend of the model."
    )
    parser.add_argument(
        "--mlir-dialect",
        default="linalg",
        type=str,
        choices=supported_dialects,
        help="MLIR dialect of the model.",
    )

    # Device selection.
    parser.add_argument(
        "--device-types",
        nargs="*",
        type=str,
        choices=_IREE_DEVICE_MAP.keys(),
        help="Runs the stress test on all devices with that type. "
        "If absent and no deveices are specified "
        "will run against all available devices.",
    )
    parser.add_argument(
        "--devices",
        nargs="*",
        type=str,
        help="List of devices to run the stress test on. "
        "If device-types is specified will run against the union of the two.",
    )

    # Load shape and limits.
    parser.add_argument(
        "--batch-size",
        default=1,
        type=int,
        help="Number of inputs to feed into the model",
    )
    parser.add_argument(
        "--oversubscription",
        default=1,
        type=int,
        help="Oversubscrption factor. Each device will execute the model simultaneously "
        "this many number of times.",
    )
    parser.add_argument(
        "--max-iterations",
        default=10**7,
        type=int,
        help="Maximum number of iterations to run the stress test per device.",
    )
    parser.add_argument(
        "--max-duration",
        default=3600,
        type=float,
        help="Maximum number of seconds to run the stress test.",
    )
    parser.add_argument(
        "--inference-timeout",
        default=60,
        type=float,
        help="Timeout in seconds for a single model inference operation.",
    )
    parser.add_argument(
        "--tolerance-nulp",
        default=50000,
        type=int,
        help="The maximum number of unit in the last place for tolerance "
        "when verifing results with the golden reference output.",
    )

    # parse_known_args: unrecognised command-line arguments are ignored.
    args, _unrecognized = parser.parse_known_args()

    stress_test(
        model_name=args.model,
        dynamic_model=args.dynamic,
        device_types=args.device_types,
        device_names=args.devices,
        batch_size=args.batch_size,
        max_iterations=args.max_iterations,
        max_duration_seconds=args.max_duration,
        inference_timeout_seconds=args.inference_timeout,
        mlir_dialect=args.mlir_dialect,
        frontend=args.frontend,
        oversubscription_factor=args.oversubscription,
        tolerance_nulp=args.tolerance_nulp,
    )

View File

@@ -1,31 +0,0 @@
# Copyright 2022 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import subprocess
import sys
import importlib.util
def test_stress_test():
    """Run the `shark.stress_test` script for one CPU iteration.

    The script is launched with the current interpreter in a subprocess; a
    non-zero exit status makes `check_call` raise and fails the test.
    """
    script_args = ["--model=squeezenet1_0", "--devices", "cpu", "--max-iterations=1"]
    command = [
        sys.executable,
        importlib.util.find_spec("shark.stress_test").origin,
    ] + script_args
    subprocess.check_call(command)

View File

@@ -17,7 +17,6 @@ import torch_mlir
from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
import tempfile
from shark.parser import shark_args
import io
def get_module_name_for_asm_dump(module):
@@ -56,9 +55,9 @@ def get_torch_mlir_module(
input: tuple,
dynamic: bool,
jit_trace: bool,
return_str: bool = False,
from_torchscript: bool = False,
):
"""Get the MLIR's linalg-on-tensors module from the torchscipt module."""
"""Get the MLIR's linalg-on-tensors module from torchscipt module."""
ignore_traced_shapes = False
if dynamic:
input = create_dynamic_placeholders(input)
@@ -67,16 +66,11 @@ def get_torch_mlir_module(
tempfile.tempdir = shark_args.repro_dir
mlir_module = torch_mlir.compile(
module = torch_mlir.compile(
module,
input,
output_type=torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=jit_trace,
ignore_traced_shapes=ignore_traced_shapes,
)
if return_str:
return mlir_module.operation.get_asm()
bytecode_stream = io.BytesIO()
mlir_module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
return bytecode
return module

View File

@@ -1,211 +1,3 @@
## Supported and Validated Models
### PyTorch HuggingFace Models
| PyTorch Language Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
| Albert | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
| BigBird | :green_heart: (AOT) | | | |
| dbmdz/ConvBERT | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| DistilBERT | :broken_heart: (JIT) | | | |
| GPT2 | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| MobileBert | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
| microsoft/beit | :green_heart: | :green_heart: | :broken_heart: | :broken_heart: |
| facebook/deit | :green_heart: | :green_heart: | :broken_heart: | :broken_heart: |
| facebook/convnext | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
### Torchvision Models
| TORCHVISION Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|--------------------|----------------------|----------|----------|-------------|
| AlexNet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| MobileNetV2 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| MobileNetV3 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Unet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnet18 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnet50 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnet101 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnext50_32x4d | :green_heart: (Script) | | | |
| SqueezeNet | :green_heart: (Script) | :green_heart: | :broken_heart: | :broken_heart: |
| EfficientNet | :green_heart: (Script) | | | |
| Regnet | :green_heart: (Script) | | | |
| Resnest | :broken_heart: (Script) | | | |
| Vision Transformer | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| VGG 16 | :green_heart: (Script) | :green_heart: | :green_heart: | |
| Wide Resnet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| RAFT | :broken_heart: (JIT) | | | |
For more information refer to [MODEL TRACKING SHEET](https://docs.google.com/spreadsheets/d/15PcjKeHZIrB5LfDyuw7DGEEE8XnQEX2aX8lm8qbxV8A/edit#gid=0)
### Tensorflow Models (Inference)
| Hugging Face Models | tf-mhlo lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| MiniLM | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| albert-base-v2 | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| DistilBERT | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| CamemBert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| ConvBert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| Deberta | | | | |
| electra | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| funnel | | | | |
| layoutlm | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| longformer | | | | |
| mobile-bert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| rembert | | | | |
| tapas | | | | |
| flaubert | :broken_heart: | :green_heart: | :green_heart: | :green_heart: |
| roberta | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| xlm-roberta | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| mpnet | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
### PyTorch Training Models
| Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :green_heart: | :green_heart: | | |
| FullyConnected | :green_heart: | :green_heart: | | |
### JAX Models
| Models | JAX-MHLO lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| DALL-E | :broken_heart: | :broken_heart: | | |
| FullyConnected | :green_heart: | :green_heart: | | |
<details>
<summary>TFLite Models</summary>
### TFLite Models
| Models | TOSA/LinAlg | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :broken_heart: | :broken_heart: | | |
| FullyConnected | :green_heart: | :green_heart: | | |
| albert | :green_heart: | :green_heart: | | |
| asr_conformer | :green_heart: | :green_heart: | | |
| bird_classifier | :green_heart: | :green_heart: | | |
| cartoon_gan | :green_heart: | :green_heart: | | |
| craft_text | :green_heart: | :green_heart: | | |
| deeplab_v3 | :green_heart: | :green_heart: | | |
| densenet | :green_heart: | :green_heart: | | |
| east_text_detector | :green_heart: | :green_heart: | | |
| efficientnet_lite0_int8 | :green_heart: | :green_heart: | | |
| efficientnet | :green_heart: | :green_heart: | | |
| gpt2 | :green_heart: | :green_heart: | | |
| image_stylization | :green_heart: | :green_heart: | | |
| inception_v4 | :green_heart: | :green_heart: | | |
| inception_v4_uint8 | :green_heart: | :green_heart: | | |
| lightning_fp16 | :green_heart: | :green_heart: | | |
| lightning_i8 | :green_heart: | :green_heart: | | |
| lightning | :green_heart: | :green_heart: | | |
| magenta | :green_heart: | :green_heart: | | |
| midas | :green_heart: | :green_heart: | | |
| mirnet | :green_heart: | :green_heart: | | |
| mnasnet | :green_heart: | :green_heart: | | |
| mobilebert_edgetpu_s_float | :green_heart: | :green_heart: | | |
| mobilebert_edgetpu_s_quant | :green_heart: | :green_heart: | | |
| mobilebert | :green_heart: | :green_heart: | | |
| mobilebert_tf2_float | :green_heart: | :green_heart: | | |
| mobilebert_tf2_quant | :green_heart: | :green_heart: | | |
| mobilenet_ssd_quant | :green_heart: | :green_heart: | | |
| mobilenet_v1 | :green_heart: | :green_heart: | | |
| mobilenet_v1_uint8 | :green_heart: | :green_heart: | | |
| mobilenet_v2_int8 | :green_heart: | :green_heart: | | |
| mobilenet_v2 | :green_heart: | :green_heart: | | |
| mobilenet_v2_uint8 | :green_heart: | :green_heart: | | |
| mobilenet_v3-large | :green_heart: | :green_heart: | | |
| mobilenet_v3-large_uint8 | :green_heart: | :green_heart: | | |
| mobilenet_v35-int8 | :green_heart: | :green_heart: | | |
| nasnet | :green_heart: | :green_heart: | | |
| person_detect | :green_heart: | :green_heart: | | |
| posenet | :green_heart: | :green_heart: | | |
| resnet_50_int8 | :green_heart: | :green_heart: | | |
| rosetta | :green_heart: | :green_heart: | | |
| spice | :green_heart: | :green_heart: | | |
| squeezenet | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v1 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v1_uint8 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2_fpnlite | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2_fpnlite_uint8 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2_int8 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2 | :green_heart: | :green_heart: | | |
| ssd_spaghettinet_large | :green_heart: | :green_heart: | | |
| ssd_spaghettinet_large_uint8 | :green_heart: | :green_heart: | | |
| visual_wake_words_i8 | :green_heart: | :green_heart: | | |
</details>
## Testing and Benchmarks
### Run all model tests on CPU/GPU/VULKAN/Metal
For a list of models included in our pytest model suite, see https://github.com/nod-ai/SHARK/blob/main/tank/all_models.csv
```shell
pytest tank/test_models.py
# Models included in the pytest suite can be found listed in all_models.csv.
# If on Linux for multithreading on CPU (faster results):
pytest tank/test_models.py -n auto
```
### Running specific tests
```shell
# Search for test cases by including a keyword that matches all or part of the test case's name:
pytest tank/test_models.py -k "keyword"
# Test cases are named uniformly by format test_module_<model_name_underscores_only>_<torch/tf>_<static/dynamic>_<device>.
# Example: Test all models on nvidia gpu:
pytest tank/test_models.py -k "cuda"
# Example: Test all tensorflow resnet models on Vulkan backend:
pytest tank/test_models.py -k "resnet and tf and vulkan"
# Exclude a test case:
pytest tank/test_models.py -k "not ..."
```
### Run benchmarks on SHARK tank pytests and generate bench_results.csv with results.
(the following requires source installation with `IMPORTER=1 ./setup_venv.sh`)
```shell
pytest --benchmark tank/test_models.py
# Just do static GPU benchmarks for PyTorch tests:
pytest --benchmark tank/test_models.py -k "pytorch and static and cuda"
```
### Benchmark Resnet50, MiniLM on CPU
(requires source installation with `IMPORTER=1 ./setup_venv.sh`)
```shell
# We suggest running the following commands as root before running benchmarks on CPU:
cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | awk -F, '{print $2}' | sort -n | uniq | ( while read X ; do echo $X ; echo 0 > /sys/devices/system/cpu/cpu$X/online ; done )
echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
# Benchmark canonical Resnet50 on CPU via pytest
pytest --benchmark tank/test_models.py -k "resnet50 and tf_static_cpu"
# Benchmark canonical MiniLM on CPU via pytest
pytest --benchmark tank/test_models.py -k "MiniLM and cpu"
# Benchmark MiniLM on CPU via transformer-benchmarks:
git clone --recursive https://github.com/nod-ai/transformer-benchmarks.git
cd transformer-benchmarks
./perf-ci.sh -n
# Check detail.csv for MLIR/IREE results.
```
To run the fine tuning example, from the root SHARK directory, run:
```shell
@@ -219,5 +11,3 @@ if running from a google vm, you can view jupyter notebooks on your local system
gcloud compute ssh <YOUR_INSTANCE_DETAILS> --ssh-flag="-N -L localhost:8888:localhost:8888"
```

View File

@@ -1,36 +1,34 @@
resnet50,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error: mostly conv"
albert-base-v2,mhlo,tf,1e-2,1e-2,default,None,False,False,False,""
roberta-base,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,False,""
bert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
camembert-base,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
dbmdz/convbert-base-turkish-cased,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,True,True,True,"https://github.com/iree-org/iree/issues/9971"
distilbert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
facebook/convnext-tiny-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,True,True,True,"https://github.com/nod-ai/SHARK/issues/311 & https://github.com/nod-ai/SHARK/issues/342"
funnel-transformer/small,mhlo,tf,1e-2,1e-3,default,None,True,True,True,"https://github.com/nod-ai/SHARK/issues/201"
google/electra-small-discriminator,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
google/mobilebert-uncased,mhlo,tf,1e-2,1e-3,default,None,True,False,False,"Fails during iree-compile."
google/vit-base-patch16-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
microsoft/MiniLM-L12-H384-uncased,mhlo,tf,1e-2,1e-3,tf_hf,None,True,False,False,"Fails during iree-compile."
microsoft/layoutlm-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
microsoft/mpnet-base,mhlo,tf,1e-2,1e-2,default,None,False,False,False,""
albert-base-v2,linalg,torch,1e-2,1e-3,default,None,True,True,True,"issue with aten.tanh in torch-mlir"
alexnet,linalg,torch,1e-2,1e-3,default,None,True,False,True,"https://github.com/nod-ai/SHARK/issues/879"
bert-base-cased,linalg,torch,1e-2,1e-3,default,None,False,False,False,""
bert-base-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,""
bert-base-uncased_fp16,linalg,torch,1e-1,1e-1,default,None,True,False,True,""
facebook/deit-small-distilled-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"Fails during iree-compile."
google/vit-base-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/311"
microsoft/beit-base-patch16-224-pt22k-ft22k,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/390"
microsoft/MiniLM-L12-H384-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,True,""
microsoft/resnet-50,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
google/mobilebert-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,True,"https://github.com/nod-ai/SHARK/issues/344"
mobilenet_v3_small,linalg,torch,1e-1,1e-2,default,nhcw-nhwc,False,True,True,"https://github.com/nod-ai/SHARK/issues/388"
nvidia/mit-b0,linalg,torch,1e-2,1e-3,default,None,True,True,True,"https://github.com/nod-ai/SHARK/issues/343"
resnet101,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
resnet18,linalg,torch,1e-2,1e-3,default,None,True,True,True,""
resnet50,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
resnet50_fp16,linalg,torch,1e-2,1e-2,default,nhcw-nhwc,True,False,True,""
squeezenet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"https://github.com/nod-ai/SHARK/issues/388"
wide_resnet50_2,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
efficientnet-v2-s,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,True,"https://github.com/nod-ai/SHARK/issues/575"
mnasnet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"https://github.com/nod-ai/SHARK/issues/388"
resnet50,mhlo,tf,1e-02,1e-3,default
albert-base-v2,mhlo,tf,1e-02,1e-3,default
roberta-base,mhlo,tf,1e-02,1e-3,default
bert-base-uncased,mhlo,tf,1e-2,1e-3,default
camembert-base,mhlo,tf,1e-2,1e-3,default
dbmdz/convbert-base-turkish-cased,mhlo,tf,1e-2,1e-3,default
distilbert-base-uncased,mhlo,tf,1e-2,1e-3,default
facebook/convnext-tiny-224,mhlo,tf,1e-2,1e-3,tf_vit
funnel-transformer/small,mhlo,tf,1e-2,1e-3,default
google/electra-small-discriminator,mhlo,tf,1e-2,1e-3,default
google/mobilebert-uncased,mhlo,tf,1e-2,1e-3,default
google/vit-base-patch16-224,mhlo,tf,1e-2,1e-3,tf_vit
hf-internal-testing/tiny-random-flaubert,mhlo,tf,1e-2,1e-3,default
microsoft/MiniLM-L12-H384-uncased,mhlo,tf,1e-2,1e-3,tf_hf
microsoft/layoutlm-base-uncased,mhlo,tf,1e-2,1e-3,default
microsoft/mpnet-base,mhlo,tf,1e-2,1e-3,default
albert-base-v2,linalg,torch,1e-2,1e-3,default
alexnet,linalg,torch,1e-2,1e-3,default
bert-base-cased,linalg,torch,1e-2,1e-3,default
bert-base-uncased,linalg,torch,1e-2,1e-3,default
distilbert-base-uncased,linalg,torch,1e-2,1e-3,default
facebook/deit-small-distilled-patch16-224,linalg,torch,1e-2,1e-3,default
google/vit-base-patch16-224,linalg,torch,1e-2,1e-3,default
microsoft/beit-base-patch16-224-pt22k-ft22k,linalg,torch,1e-2,1e-3,default
microsoft/MiniLM-L12-H384-uncased,linalg,torch,1e-2,1e-3,default
microsoft/resnet-50,linalg,torch,1e-2,1e-3,default
google/mobilebert-uncased,linalg,torch,1e-2,1e-3,default
mobilenet_v3_small,linalg,torch,1e-2,1e-3,default
nvidia/mit-b0,linalg,torch,1e-2,1e-3,default
resnet101,linalg,torch,1e-2,1e-3,default
resnet18,linalg,torch,1e-2,1e-3,default
resnet50,linalg,torch,1e-2,1e-3,default
squeezenet1_0,linalg,torch,1e-2,1e-3,default
wide_resnet50_2,linalg,torch,1e-2,1e-3,default
1 resnet50 mhlo tf 1e-2 1e-02 1e-3 default nhcw-nhwc False False True Vulkan Numerical Error: mostly conv
2 albert-base-v2 mhlo tf 1e-2 1e-02 1e-2 1e-3 default None False False False
3 roberta-base mhlo tf 1e-02 1e-3 default nhcw-nhwc False False False
4 bert-base-uncased mhlo tf 1e-2 1e-3 default None False False False
5 camembert-base mhlo tf 1e-2 1e-3 default None False False False
6 dbmdz/convbert-base-turkish-cased mhlo tf 1e-2 1e-3 default nhcw-nhwc True True True https://github.com/iree-org/iree/issues/9971
7 distilbert-base-uncased mhlo tf 1e-2 1e-3 default None False False False
8 facebook/convnext-tiny-224 mhlo tf 1e-2 1e-3 tf_vit nhcw-nhwc True True True https://github.com/nod-ai/SHARK/issues/311 & https://github.com/nod-ai/SHARK/issues/342
9 funnel-transformer/small mhlo tf 1e-2 1e-3 default None True True True https://github.com/nod-ai/SHARK/issues/201
10 google/electra-small-discriminator mhlo tf 1e-2 1e-3 default None False False False
11 google/mobilebert-uncased mhlo tf 1e-2 1e-3 default None True False False Fails during iree-compile.
12 google/vit-base-patch16-224 mhlo tf 1e-2 1e-3 tf_vit nhcw-nhwc False False True Vulkan Numerical Error (mostly conv)
13 microsoft/MiniLM-L12-H384-uncased hf-internal-testing/tiny-random-flaubert mhlo tf 1e-2 1e-3 tf_hf default None True False False Fails during iree-compile.
14 microsoft/layoutlm-base-uncased microsoft/MiniLM-L12-H384-uncased mhlo tf 1e-2 1e-3 default tf_hf None False False False
15 microsoft/mpnet-base microsoft/layoutlm-base-uncased mhlo tf 1e-2 1e-2 1e-3 default None False False False
16 albert-base-v2 microsoft/mpnet-base linalg mhlo torch tf 1e-2 1e-3 default None True True True issue with aten.tanh in torch-mlir
17 alexnet albert-base-v2 linalg torch 1e-2 1e-3 default None True False True https://github.com/nod-ai/SHARK/issues/879
18 bert-base-cased alexnet linalg torch 1e-2 1e-3 default None False False False
19 bert-base-uncased bert-base-cased linalg torch 1e-2 1e-3 default None False False False
20 bert-base-uncased_fp16 bert-base-uncased linalg torch 1e-1 1e-2 1e-1 1e-3 default None True False True
21 facebook/deit-small-distilled-patch16-224 distilbert-base-uncased linalg torch 1e-2 1e-3 default nhcw-nhwc False True False Fails during iree-compile.
22 google/vit-base-patch16-224 facebook/deit-small-distilled-patch16-224 linalg torch 1e-2 1e-3 default nhcw-nhwc False True False https://github.com/nod-ai/SHARK/issues/311
23 microsoft/beit-base-patch16-224-pt22k-ft22k google/vit-base-patch16-224 linalg torch 1e-2 1e-3 default nhcw-nhwc False True False https://github.com/nod-ai/SHARK/issues/390
24 microsoft/MiniLM-L12-H384-uncased microsoft/beit-base-patch16-224-pt22k-ft22k linalg torch 1e-2 1e-3 default None False False True
25 microsoft/resnet-50 microsoft/MiniLM-L12-H384-uncased linalg torch 1e-2 1e-3 default nhcw-nhwc False False True Vulkan Numerical Error (mostly conv)
26 google/mobilebert-uncased microsoft/resnet-50 linalg torch 1e-2 1e-3 default None False False True https://github.com/nod-ai/SHARK/issues/344
27 mobilenet_v3_small google/mobilebert-uncased linalg torch 1e-1 1e-2 1e-2 1e-3 default nhcw-nhwc False True True https://github.com/nod-ai/SHARK/issues/388
28 nvidia/mit-b0 mobilenet_v3_small linalg torch 1e-2 1e-3 default None True True True https://github.com/nod-ai/SHARK/issues/343
29 resnet101 nvidia/mit-b0 linalg torch 1e-2 1e-3 default nhcw-nhwc False False True Vulkan Numerical Error (mostly conv)
30 resnet18 resnet101 linalg torch 1e-2 1e-3 default None True True True
31 resnet50 resnet18 linalg torch 1e-2 1e-3 default nhcw-nhwc False False True Vulkan Numerical Error (mostly conv)
32 resnet50_fp16 resnet50 linalg torch 1e-2 1e-2 1e-3 default nhcw-nhwc True False True
33 squeezenet1_0 linalg torch 1e-2 1e-3 default nhcw-nhwc False False True https://github.com/nod-ai/SHARK/issues/388
34 wide_resnet50_2 linalg torch 1e-2 1e-3 default nhcw-nhwc False False True Vulkan Numerical Error (mostly conv)
efficientnet-v2-s mhlo tf 1e-02 1e-3 default nhcw-nhwc False False True https://github.com/nod-ai/SHARK/issues/575
mnasnet1_0 linalg torch 1e-2 1e-3 default nhcw-nhwc False False True https://github.com/nod-ai/SHARK/issues/388

View File

@@ -1,9 +1,8 @@
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_model
from shark.shark_downloader import download_torch_model
mlir_model, func_name, inputs, golden_out = download_model(
"bert-base-uncased_tosa",
frontend="torch",
mlir_model, func_name, inputs, golden_out = download_torch_model(
"bert-base-uncased_tosa"
)
shark_module = SharkInference(

View File

@@ -72,8 +72,7 @@ class BertModule(tf.Module):
input_signature=[
bert_input, # inputs
tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32), # labels
],
jit_compile=True,
]
)
def learn(self, inputs, labels):
with tf.GradientTape() as tape:

Some files were not shown because too many files have changed in this diff Show More