Compare commits

..

1 Commits

Author SHA1 Message Date
Stanley Winata
c47218c972 WIP: Intel-GPU integration 2022-08-10 11:24:00 -07:00
228 changed files with 3757 additions and 23002 deletions

View File

@@ -1,37 +0,0 @@
# See: https://github.com/llvm/torch-mlir/issues/1374
# Regenerates package-index/index.html on the github-pages branch from the
# output of build_tools/scrape_releases.py, then pushes that branch.
name: Publish releases page
# Triggered manually only (workflow_dispatch).
on:
workflow_dispatch:
jobs:
scrape_and_publish_releases:
name: "Scrape and publish releases"
runs-on: ubuntu-latest
# Don't run this in everyone's forks.
if: github.repository == 'nod-ai/SHARK'
steps:
# Check out the repository, passing the NODAI_INVOCATION_TOKEN secret as the token.
- name: Checking out repository
uses: actions/checkout@v2
with:
token: ${{ secrets.NODAI_INVOCATION_TOKEN }}
# The script's stdout is written to /tmp/index.html; it is moved into the
# working tree only after the switch to github-pages below.
- name: Run scrape releases script
run: python ./build_tools/scrape_releases.py nod-ai SHARK > /tmp/index.html
shell: bash
# Switch to the github-pages branch and set the committer identity.
- run: git fetch --all
- run: git switch github-pages
- run: git config --global user.email "none@none.com"
- run: git config --global user.name "nod-ai"
# Put the freshly generated page in place and stage it.
- run: mv /tmp/index.html package-index/index.html
- run: git add package-index/index.html
# Only try to make a commit if the file has changed.
- run: git diff --cached --exit-code || git commit -m "Update releases."
# Push the github-pages branch with the same token secret.
- name: GitHub Push
uses: ad-m/github-push-action@v0.6.0
with:
github_token: ${{ secrets.NODAI_INVOCATION_TOKEN }}
branch: github-pages

View File

@@ -9,87 +9,13 @@ on:
workflow_dispatch:
jobs:
# Windows job: computes a date-based version, creates a draft release, builds
# the shark_sd executable with PyInstaller, attaches dist/* to the release and
# finally publishes it.
windows-build:
runs-on: windows-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.10"]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
# Exports three variables to later steps through GITHUB_ENV:
# package_version (YYYYMMDD.<run_number>), package_version_
# (YYYYMMDD_<run_number>) and tag_name (same value as package_version).
- name: Compute version
shell: powershell
run: |
$package_version = $(Get-Date -UFormat "%Y%m%d")+"."+${{ github.run_number }}
$package_version_ = $(Get-Date -UFormat "%Y%m%d")+"_"+${{ github.run_number }}
$tag_name=$package_version
echo "package_version=$package_version" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
echo "package_version_=$package_version_" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
echo "tag_name=$tag_name" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
# The release is created as a draft (draft: true); its id is consumed by the
# "Upload Release Assets" and "Publish Release" steps below.
- name: Create Release
id: create_release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:
tag_name: ${{ env.tag_name }}
release_name: nod.ai SHARK ${{ env.tag_name }}
body: |
Automatic snapshot release of nod.ai SHARK.
draft: true
prerelease: false
# Runs setup_venv.ps1, builds web/shark_sd.spec with PyInstaller, and renames
# the resulting exe to include the underscore-separated version.
- name: Build Package
shell: powershell
run: |
./setup_venv.ps1
pyinstaller web/shark_sd.spec
mv ./dist/shark_sd.exe ./dist/shark_sd_${{ env.package_version_ }}.exe
# GHA windows VM OOMs so disable for now
#- name: Build and validate the SHARK Runtime package
# shell: powershell
# run: |
# $env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
# pip wheel -v -w dist . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
# Upload everything under dist/ as a workflow artifact.
- uses: actions/upload-artifact@v2
with:
path: dist/*
# Attach everything under dist/ to the draft release created above.
- name: Upload Release Assets
id: upload-release-assets
uses: dwenegar/upload-release-assets@v1
env:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:
release_id: ${{ steps.create_release.outputs.id }}
assets_path: ./dist/*
# Publish the release that was created as a draft by create_release.
- name: Publish Release
id: publish_release
uses: eregon/publish-release@v1
env:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:
release_id: ${{ steps.create_release.outputs.id }}
linux-build:
build:
runs-on: a100
strategy:
fail-fast: false
matrix:
python-version: ["3.10"]
backend: [IREE, SHARK]
steps:
- uses: actions/checkout@v3
@@ -105,55 +31,63 @@ jobs:
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
- name: Compute version
run: |
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
tag_name="${package_version}"
echo "package_version=${package_version}" >> $GITHUB_ENV
echo "tag_name=${tag_name}" >> $GITHUB_ENV
- name: Create Release
id: create_release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:
tag_name: ${{ env.tag_name }}
release_name: nod.ai SHARK ${{ env.tag_name }}
body: |
Automatic snapshot release of nod.ai SHARK.
draft: true
prerelease: false
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest toml
if [ -f requirements.txt ]; then pip install -r requirements.txt -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html; fi
if [ -f requirements.txt ]; then pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/SHARK-Runtime/releases; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude shark.venv,lit.cfg.py
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude shark.venv,lit.cfg.py
- name: Build and validate the IREE package
if: ${{ matrix.backend == 'IREE' }}
continue-on-error: true
run: |
cd $GITHUB_WORKSPACE
USE_IREE=1 VENV_DIR=iree.venv NIGHTLY=1 ./setup_venv.sh
source iree.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://iree-org.github.io/iree/pip-release-links.html
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models
/bin/bash "$GITHUB_WORKSPACE/build_tools/populate_sharktank_ci.sh"
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" tank/test_models.py |
tail -n 1 |
tee -a pytest_results.txt
if !(grep -Fxq " failed" pytest_results.txt)
then
export SHA=$(git log -1 --format='%h')
gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/${DATE}_$SHA
gsutil -m cp -r gs://shark_tank/${DATE}_$SHA/* gs://shark_tank/latest/
fi
rm -rf ./wheelhouse/nodai*
- name: Build and validate the SHARK Runtime package
if: ${{ matrix.backend == 'SHARK' }}
- name: Build and validate the package
run: |
cd $GITHUB_WORKSPACE
NIGHTLY=1 ./setup_venv.sh
./setup_venv.sh
source shark.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/SHARK-Runtime/releases
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models
pytest --ci --ci_sha=${SHORT_SHA} tank/test_models.py |
tail -n 1 |
tee -a pytest_results.txt
pytest -k 'not benchmark' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/
- name: Upload Release Assets
id: upload-release-assets
uses: dwenegar/upload-release-assets@v1
env:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:
release_id: ${{ steps.create_release.outputs.id }}
assets_path: ./wheelhouse/nodai_*.whl
- name: Publish Release
id: publish_release
uses: eregon/publish-release@v1
env:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:
release_id: ${{ steps.create_release.outputs.id }}

View File

@@ -10,21 +10,13 @@ on:
branches: [ main ]
workflow_dispatch:
# Ensure that only a single job or workflow using the same
# concurrency group will run at a time. This would cancel
# any in-progress jobs in the same github workflow and github
# ref (e.g. refs/heads/main or refs/pull/<pr_number>/merge).
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build-validate:
strategy:
fail-fast: true
matrix:
os: [icelake, a100, MacStudio, ubuntu-latest]
suite: [cpu,cuda,vulkan]
os: [a100, MacStudio, ubuntu-latest]
suite: [cpu,gpu,vulkan]
python-version: ["3.10"]
include:
- os: ubuntu-latest
@@ -33,38 +25,27 @@ jobs:
- os: ubuntu-latest
suite: vulkan
- os: ubuntu-latest
suite: cuda
suite: gpu
- os: ubuntu-latest
suite: cpu
- os: MacStudio
suite: cuda
suite: gpu
- os: MacStudio
suite: cpu
- os: icelake
suite: vulkan
- os: icelake
suite: cuda
- os: a100
suite: cpu
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
- name: Set Environment Variables
run: |
echo "SHORT_SHA=`git rev-parse --short=4 HEAD`" >> $GITHUB_ENV
echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
- name: Set up Python Version File ${{ matrix.python-version }}
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest' || matrix.os == 'icelake'
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest'
run: |
# See https://github.com/actions/setup-python/issues/433
echo ${{ matrix.python-version }} >> $GITHUB_WORKSPACE/.python-version
- name: Set up Python ${{ matrix.python-version }}
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest' || matrix.os == 'icelake'
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest'
uses: actions/setup-python@v4
with:
python-version: '${{ matrix.python-version }}'
@@ -90,45 +71,26 @@ jobs:
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude lit.cfg.py
- name: Validate Models on CPU
- name: Validate CPU Models
if: matrix.suite == 'cpu'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --benchmark --ci --ci_sha=${SHORT_SHA} -s --local_tank_cache="/data/anush/shark_cache" tank/test_models.py -k cpu --update_tank
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv
- name: Validate Models on NVIDIA GPU
if: matrix.suite == 'cuda'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --benchmark --ci --ci_sha=${SHORT_SHA} -s --local_tank_cache="/data/anush/shark_cache" tank/test_models.py -k cuda --update_tank
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
- name: Validate Vulkan Models (MacOS)
if: matrix.suite == 'vulkan' && matrix.os == 'MacStudio'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
echo "VULKAN SDK PATH wo setup: $VULKAN_SDK"
cd /Users/anush/VulkanSDK/1.3.224.1/
source setup-env.sh
cd $GITHUB_WORKSPACE
echo "VULKAN SDK PATH with setup: $VULKAN_SDK"
echo $PATH
pip list | grep -E "torch|iree"
pytest -s --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" tank/test_models.py -k vulkan --update_tank
- name: Validate Vulkan Models (a100)
if: matrix.suite == 'vulkan' && matrix.os != 'MacStudio'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
source shark.venv/bin/activate
pytest --benchmark --ci --ci_sha=${SHORT_SHA} -s --local_tank_cache="/data/anush/shark_cache" tank/test_models.py -k vulkan --update_tank
pytest -k 'cpu' --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py
- name: Validate GPU Models
if: matrix.suite == 'gpu'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
source shark.venv/bin/activate
pytest -k "gpu" --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py
- name: Validate Vulkan Models
if: matrix.suite == 'vulkan'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
source shark.venv/bin/activate
pytest -k 'vulkan' --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py

8
.gitignore vendored
View File

@@ -31,6 +31,7 @@ MANIFEST
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
@@ -162,14 +163,7 @@ cython_debug/
# Shark related artefacts
*venv/
shark_tmp/
*.vmfb
.use-iree
tank/dict_configs.py
# ORT related artefacts
cache_models/
onnx_models/
# Web logging
web/logs/
web/stored_results/stable_diffusion/

218
LICENSE
View File

@@ -1,218 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
---- LLVM Exceptions to the Apache 2.0 License ----
As an exception, if, as a result of your compiling your source code, portions
of this Software are embedded into an Object form of such source code, you
may redistribute such embedded portions in such Object form without complying
with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
In addition, if you combine or link compiled forms of this Software with
software that is licensed under the GPLv2 ("Combined Software") and if a
court of competent jurisdiction determines that the patent provision (Section
3), the indemnity provision (Section 9) or other Section of the License
conflicts with the conditions of the GPLv2, you may retroactively and
prospectively choose to deem waived or otherwise exclude such Section(s) of
the License, but only in their entirety and only with respect to the Combined
Software.

393
README.md
View File

@@ -5,123 +5,25 @@ High Performance Machine Learning and Data Analytics for CPUs, GPUs, Accelerator
[![Nightly Release](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml)
[![Validate torch-models on Shark Runtime](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml)
## Communication Channels
## Installation (Windows, Linux and macOS)
## Check out the code
```shell
git clone https://github.com/nod-ai/SHARK.git
cd SHARK
```
## Setup your Python VirtualEnvironment and Dependencies
### Windows 10/11 Users
* Install the latest Python 3.10.x version from [here](https://www.python.org/downloads/windows/)
* Install Git for Windows from [here](https://git-scm.com/download/win)
#### Allow the install script to run in Powershell
```powershell
set-executionpolicy remotesigned
```
#### Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...)
```powershell
./setup_venv.ps1 #You can re-run this script to get the latest version
```
### Linux / macOS Users
```shell
./setup_venv.sh
source shark.venv/bin/activate
```
* [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real time discussions with the SHARK team and other users
* [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bugs etc
### Run Stable Diffusion on your device - WebUI
#### Windows 10/11 Users
```powershell
(shark.venv) PS C:\Users\nod\SHARK> cd web
(shark.venv) PS C:\Users\nod\SHARK\web> python index.py
```
#### Linux Users
```shell
(shark.venv) > cd web
(shark.venv) > python index.py
```
#### Access Stable Diffusion on http://localhost:8080/?__theme=dark
<img width="1607" alt="webui" src="https://user-images.githubusercontent.com/74956/204939260-b8308bc2-8dc4-47f6-9ac0-f60b66edab99.png">
### Run Stable Diffusion on your device - Commandline
#### Install your hardware drivers
* [AMD RDNA Users] Download the latest driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mril-iree)
* [macOS Users] Download and install the latest Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home)
* [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)
Other users: please ensure you have the latest vendor drivers and the Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home). If you are using Vulkan, check that `vulkaninfo` works in a terminal window.
#### Windows 10/11 Users
```powershell
(shark.venv) PS C:\g\shark> python .\shark\examples\shark_inference\stable_diffusion\main.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
```
#### Linux / macOS Users
```shell
python3.10 shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
```
You can replace `vulkan` with `cpu` to run on your CPU or with `cuda` to run on CUDA devices. If you have multiple Vulkan devices, you can address them with `--device=vulkan://1`, etc.
The output on a 6900XT would look like:
```shell
44it [00:08, 5.14it/s]i = 44 t = 120 (191ms)
45it [00:08, 5.15it/s]i = 45 t = 100 (191ms)
46it [00:08, 5.16it/s]i = 46 t = 80 (191ms)
47it [00:09, 5.16it/s]i = 47 t = 60 (193ms)
48it [00:09, 5.15it/s]i = 48 t = 40 (195ms)
49it [00:09, 5.12it/s]i = 49 t = 20 (196ms)
50it [00:09, 5.14it/s]
Average step time: 192.8154182434082ms/it
Total image generation runtime (s): 10.390909433364868
(shark.venv) PS C:\g\shark>
```
Here are some samples generated:
![tajmahal, snow, sunflowers, oil on canvas_0](https://user-images.githubusercontent.com/74956/204934186-141f7e43-6eb2-4e89-a99c-4704d20444b3.jpg)
![a photo of a crab playing a trumpet](https://user-images.githubusercontent.com/74956/204933258-252e7240-8548-45f7-8253-97647d38313d.jpg)
For more options to the Stable Diffusion model read [this](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md)
Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware.
## Installation
<details>
<summary>Binary Installation</summary>
<summary>Installation (Linux and macOS)</summary>
### Setup a new pip Virtual Environment
This step sets up a new VirtualEnv for Python
```shell
python --version #Check you have 3.10 on Linux, macOS or Windows Powershell
python --version #Check you have 3.7->3.10 on Linux or 3.10 on macOS
python -m venv shark_venv
source shark_venv/bin/activate # Use shark_venv/Scripts/activate on Windows
source shark_venv/bin/activate
# If you are using conda create and activate a new conda env
@@ -136,21 +38,16 @@ python -m pip install --upgrade pip
This step pip installs SHARK and related packages on Linux Python 3.7, 3.8, 3.9, 3.10 and macOS Python 3.10
```shell
pip install nodai-shark -f https://nod-ai.github.io/SHARK/package-index/ -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
pip install nodai-shark -f https://github.com/nod-ai/SHARK/releases -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/shark-runtime/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
```
### Run shark tank model tests.
```shell
pytest tank/test_models.py
```
See tank/README.md for a more detailed walkthrough of our pytest suite and CLI.
If you are on an Intel macOS machine you need this [workaround](https://github.com/nod-ai/SHARK/issues/102) for an upstream issue.
### Download and run Resnet50 sample
```shell
curl -O https://raw.githubusercontent.com/nod-ai/SHARK/main/shark/examples/shark_inference/resnet50_script.py
#Install deps for test script
pip install --pre torch torchvision torchaudio tqdm pillow gsutil --extra-index-url https://download.pytorch.org/whl/nightly/cpu
pip install --pre torch torchvision torchaudio tqdm pillow --extra-index-url https://download.pytorch.org/whl/nightly/cpu
python ./resnet50_script.py --device="cpu" #use cuda or vulkan or metal
```
@@ -164,78 +61,78 @@ python ./minilm_jit.py --device="cpu" #use cuda or vulkan or metal
</details>
<details>
<summary>Development, Testing and Benchmarks</summary>
<summary>Source Installation</summary>
If you want to use Python3.10 and with TF Import tools you can use the environment variables like:
Set `USE_IREE=1` to use upstream IREE
```
# PYTHON=python3.10 VENV_DIR=0617_venv IMPORTER=1 ./setup_venv.sh
```
## Check out the code
### Run any of the hundreds of SHARK tank models via the test framework
```shell
python -m shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
# Or a pytest
pytest tank/test_models.py -k "MiniLM"
git clone https://github.com/nod-ai/SHARK.git
```
If you are a *Torch-mlir developer or an IREE developer* and want to test local changes you can uninstall
## Setup your Python VirtualEnvironment and Dependencies
```shell
# Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...).
./setup_venv.sh
source shark.venv/bin/activate
```
For example if you want to use Python3.10 and upstream IREE with TF Import tools you can use the environment variables like:
```
# PYTHON=python3.10 VENV_DIR=0617_venv IMPORTER=1 USE_IREE=1 ./setup_venv.sh
```
If you are a Torch-mlir developer or an IREE developer and want to test local changes you can uninstall
the provided packages with `pip uninstall torch-mlir` and / or `pip uninstall iree-compiler iree-runtime` and build locally
with Python bindings and set your PYTHONPATH as mentioned [here](https://github.com/iree-org/iree/tree/main/docs/api_docs/python#install-iree-binaries)
with Python bindings and set your PYTHONPATH as mentioned [here](https://google.github.io/iree/bindings/python/)
for IREE and [here](https://github.com/llvm/torch-mlir/blob/main/development.md#setup-python-environment-to-export-the-built-python-packages)
for Torch-MLIR.
### How to use your locally built Torch-MLIR with SHARK
### Run a demo script
```shell
1.) Run `./setup_venv.sh` in SHARK and activate the `shark.venv` virtual env.
2.) Run `pip uninstall torch-mlir`.
3.) Go to your local Torch-MLIR directory.
4.) Activate mlir_venv virtual environment.
5.) Run `pip uninstall -r requirements.txt`.
6.) Run `pip install -r requirements.txt`.
7.) Build Torch-MLIR.
8.) Activate shark.venv virtual environment from the Torch-MLIR directory.
9.) Run `export PYTHONPATH=`pwd`/build/tools/torch-mlir/python_packages/torch_mlir:`pwd`/examples` in the Torch-MLIR directory.
10.) Go to the SHARK directory.
```
SHARK will now use your locally built Torch-MLIR repo.
## Benchmarking Dispatches
To produce benchmarks of individual dispatches, add `--dispatch_benchmarks=All --dispatch_benchmarks_dir=<output_dir>` to your command-line arguments.
If you only want to compile specific dispatches, you can specify them with a space-separated string instead of `"All"`, e.g. `--dispatch_benchmarks="0 1 2 10"`.
If you instead want to incorporate this into a Python script, you can pass the `dispatch_benchmarks` and `dispatch_benchmarks_dir` arguments when initializing `SharkInference`, and the benchmarks will be generated when the module is compiled. For example:
```
shark_module = SharkInference(
mlir_model,
func_name,
device=args.device,
mlir_dialect="tm_tensor",
dispatch_benchmarks="all",
dispatch_benchmarks_dir="results"
)
python -m shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
# Or a pytest
pytest tank/tf/hf_masked_lm/albert-base-v2_test.py::AlbertBaseModuleTest::test_module_static_cpu
```
Output will include:
- An ordered list ordered-dispatches.txt of all the dispatches with their runtime
- Inside the specified directory, there will be a directory for each dispatch (there will be mlir files for all dispatches, but only compiled binaries and benchmark data for the specified dispatches)
- An .mlir file containing the dispatch benchmark
- A compiled .vmfb file containing the dispatch benchmark
- An .mlir file containing just the hal executable
- A compiled .vmfb file of the hal executable
- A .txt file containing benchmark output
See tank/README.md for instructions on how to run model tests and benchmarks from the SHARK tank.
</details>
<details>
<summary>Testing</summary>
### Run all model tests on CPU/GPU/VULKAN/Metal
```shell
pytest tank
# If on Linux for quicker results:
pytest tank -n auto
```
### Running specific tests
```shell
# Run tests for a specific model:
pytest tank/<MODEL_NAME> #i.e., pytest tank/bert-base-uncased
# Run tests for a specific case:
pytest tank/<MODEL_NAME>/<MODEL_TEST>.py::<MODEL>ModuleTest::<CASE>
# i.e., pytest tank/bert-base-uncased/bert-base-uncased_test.py::BertModuleTest::test_module_static_cpu
# For frontends other than pytorch, if available for a model, add frontend to filename: tank/bert-base-uncased/bert-base-uncased_tf_test.py
# Run all tests, including tests for benchmarking and SHARK modules:
# From base SHARK directory,
pytest
```
### Run all model benchmark tests on CPU/GPU/VULKAN/Metal
```shell
pytest benchmarks
```
</details>
<details>
<summary>API Reference</summary>
@@ -286,26 +183,160 @@ result = shark_module.forward((arg0, arg1))
```
</details>
## Supported and Validated Models
SHARK is maintained to support the latest innovations in ML Models:
<details>
<summary>PyTorch Models</summary>
| TF HuggingFace Models | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------|----------|-------------|
| BERT | :green_heart: | :green_heart: | :green_heart: |
| DistilBERT | :green_heart: | :green_heart: | :green_heart: |
| GPT2 | :green_heart: | :green_heart: | :green_heart: |
| BLOOM | :green_heart: | :green_heart: | :green_heart: |
| Stable Diffusion | :green_heart: | :green_heart: | :green_heart: |
| Vision Transformer | :green_heart: | :green_heart: | :green_heart: |
| ResNet50 | :green_heart: | :green_heart: | :green_heart: |
### Huggingface PyTorch Models
For a complete list of the models supported in SHARK, please refer to [tank/README.md](https://github.com/nod-ai/SHARK/blob/main/tank/README.md).
| Hugging Face Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
| Albert | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
| BigBird | :green_heart: (AOT) | | | |
| DistilBERT | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
| GPT2 | :broken_heart: (AOT) | | | |
| MobileBert | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
## Communication Channels
### Torchvision Models
* [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real time discussions with the SHARK team and other users
* [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bugs etc
| TORCHVISION Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|--------------------|----------------------|----------|----------|-------------|
| AlexNet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| DenseNet121 | :green_heart: (Script) | | | |
| MNasNet1_0 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| MobileNetV2 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| MobileNetV3 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Unet | :broken_heart: (Script) | | | |
| Resnet18 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnet50 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnet101 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnext50_32x4d | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| ShuffleNet_v2 | :broken_heart: (Script) | | | |
| SqueezeNet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| EfficientNet | :green_heart: (Script) | | | |
| Regnet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnest | :broken_heart: (Script) | | | |
| Vision Transformer | :green_heart: (Script) | | | |
| VGG 16 | :green_heart: (Script) | :green_heart: | :green_heart: | |
| Wide Resnet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| RAFT | :broken_heart: (JIT) | | | |
For more information refer to [MODEL TRACKING SHEET](https://docs.google.com/spreadsheets/d/15PcjKeHZIrB5LfDyuw7DGEEE8XnQEX2aX8lm8qbxV8A/edit#gid=0)
### PyTorch Training Models
| Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :broken_heart: | :broken_heart: | | |
| FullyConnected | :green_heart: | :green_heart: | | |
</details>
<details>
<summary>JAX Models</summary>
### JAX Models
| Models | JAX-MHLO lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| DALL-E | :broken_heart: | :broken_heart: | | |
| FullyConnected | :green_heart: | :green_heart: | | |
</details>
<details>
<summary>TFLite Models</summary>
### TFLite Models
| Models | TOSA/LinAlg | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :broken_heart: | :broken_heart: | | |
| FullyConnected | :green_heart: | :green_heart: | | |
| albert | :green_heart: | :green_heart: | | |
| asr_conformer | :green_heart: | :green_heart: | | |
| bird_classifier | :green_heart: | :green_heart: | | |
| cartoon_gan | :green_heart: | :green_heart: | | |
| craft_text | :green_heart: | :green_heart: | | |
| deeplab_v3 | :green_heart: | :green_heart: | | |
| densenet | :green_heart: | :green_heart: | | |
| east_text_detector | :green_heart: | :green_heart: | | |
| efficientnet_lite0_int8 | :green_heart: | :green_heart: | | |
| efficientnet | :green_heart: | :green_heart: | | |
| gpt2 | :green_heart: | :green_heart: | | |
| image_stylization | :green_heart: | :green_heart: | | |
| inception_v4 | :green_heart: | :green_heart: | | |
| inception_v4_uint8 | :green_heart: | :green_heart: | | |
| lightning_fp16 | :green_heart: | :green_heart: | | |
| lightning_i8 | :green_heart: | :green_heart: | | |
| lightning | :green_heart: | :green_heart: | | |
| magenta | :green_heart: | :green_heart: | | |
| midas | :green_heart: | :green_heart: | | |
| mirnet | :green_heart: | :green_heart: | | |
| mnasnet | :green_heart: | :green_heart: | | |
| mobilebert_edgetpu_s_float | :green_heart: | :green_heart: | | |
| mobilebert_edgetpu_s_quant | :green_heart: | :green_heart: | | |
| mobilebert | :green_heart: | :green_heart: | | |
| mobilebert_tf2_float | :green_heart: | :green_heart: | | |
| mobilebert_tf2_quant | :green_heart: | :green_heart: | | |
| mobilenet_ssd_quant | :green_heart: | :green_heart: | | |
| mobilenet_v1 | :green_heart: | :green_heart: | | |
| mobilenet_v1_uint8 | :green_heart: | :green_heart: | | |
| mobilenet_v2_int8 | :green_heart: | :green_heart: | | |
| mobilenet_v2 | :green_heart: | :green_heart: | | |
| mobilenet_v2_uint8 | :green_heart: | :green_heart: | | |
| mobilenet_v3-large | :green_heart: | :green_heart: | | |
| mobilenet_v3-large_uint8 | :green_heart: | :green_heart: | | |
| mobilenet_v35-int8 | :green_heart: | :green_heart: | | |
| nasnet | :green_heart: | :green_heart: | | |
| person_detect | :green_heart: | :green_heart: | | |
| posenet | :green_heart: | :green_heart: | | |
| resnet_50_int8 | :green_heart: | :green_heart: | | |
| rosetta | :green_heart: | :green_heart: | | |
| spice | :green_heart: | :green_heart: | | |
| squeezenet | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v1 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v1_uint8 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2_fpnlite | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2_fpnlite_uint8 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2_int8 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2 | :green_heart: | :green_heart: | | |
| ssd_spaghettinet_large | :green_heart: | :green_heart: | | |
| ssd_spaghettinet_large_uint8 | :green_heart: | :green_heart: | | |
| visual_wake_words_i8 | :green_heart: | :green_heart: | | |
</details>
<details>
<summary>TF Models</summary>
### Tensorflow Models (Inference)
| Hugging Face Models | tf-mhlo lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| albert-base-v2 | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| DistilBERT | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| CamemBert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| ConvBert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| Deberta | | | | |
| electra | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| funnel | | | | |
| layoutlm | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| longformer | | | | |
| mobile-bert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| remembert | | | | |
| tapas | | | | |
| flaubert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| roberta | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| xlm-roberta | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| mpnet | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
</details>
## Related Projects

View File

@@ -42,7 +42,7 @@ class TFHuggingFaceLanguage(tf.Module):
input_ids=x, attention_mask=y, token_type_ids=z, training=False
)
@tf.function(input_signature=tf_bert_input, jit_compile=True)
@tf.function(input_signature=tf_bert_input)
def forward(self, input_ids, attention_mask, token_type_ids):
return self.m.predict(input_ids, attention_mask, token_type_ids)

View File

@@ -1,5 +0,0 @@
#!/bin/bash
# CI helper: sets up the SHARK virtual environment with IMPORTER=1 and then
# runs generate_sharktank.py inside it.

# Stop at the first failing step so a broken venv setup does not fall through
# to running the generator against the wrong Python environment.
set -e

IMPORTER=1 ./setup_venv.sh
# Quoted so that a workspace path containing spaces is not word-split.
source "$GITHUB_WORKSPACE/shark.venv/bin/activate"
python generate_sharktank.py --upload=False --ci_tank_dir=True

View File

@@ -1,37 +0,0 @@
"""Scrapes the github releases API to generate a static pip-install-able releases page.
See https://github.com/llvm/torch-mlir/issues/1374
"""
import argparse
import json
import requests
# Parse arguments: the GitHub owner and repository whose releases are scraped.
parser = argparse.ArgumentParser()
parser.add_argument("owner", type=str)
parser.add_argument("repo", type=str)
args = parser.parse_args()

# Get releases
# NOTE(review): only a single request is made, so at most one page of the
# releases listing is scraped; older releases may be missing from the index.
response = requests.get(
    f"https://api.github.com/repos/{args.owner}/{args.repo}/releases",
    # Without a timeout a stalled connection would hang the CI job forever.
    timeout=60,
)
# Fail loudly on HTTP errors (rate limiting, unknown repo, ...). Otherwise the
# JSON error payload, which is a dict, would be iterated below and fail with
# an opaque TypeError on row["assets"].
response.raise_for_status()
body = json.loads(response.content)

# Parse releases: collect (asset name, download URL) for every asset of every
# release, in the order the API returned them.
releases = []
for row in body:
    for asset in row["assets"]:
        releases.append((asset["name"], asset["browser_download_url"]))

# Output HTML: one anchor per asset, written to stdout so the caller can
# redirect it into the package index page.
html = """<!DOCTYPE html>
<html>
<body>
"""
for name, url in releases:
    html += f" <a href='{url}'>{name}</a><br />\n"
html += """ </body>
</html>"""
print(html)

View File

@@ -1,8 +0,0 @@
# IREE Compiler/ Runtime Version:
20221207.350
# SHARK Compiler/ Runtime Version:
20221207.236
# Torch-MLIR Version for IREE:
20221207.680
# Torch-MLIR Version for SHARK:
20221207.680

View File

@@ -1,5 +1,17 @@
def pytest_addoption(parser):
# Attaches SHARK command-line arguments to the pytest machinery.
parser.addoption(
"--save_mlir",
action="store_true",
default="False",
help="Pass option to save input MLIR",
)
parser.addoption(
"--save_vmfb",
action="store_true",
default="False",
help="Pass option to save IREE output .vmfb",
)
parser.addoption(
"--benchmark",
action="store_true",
@@ -7,56 +19,8 @@ def pytest_addoption(parser):
help="Pass option to benchmark and write results.csv",
)
parser.addoption(
"--onnx_bench",
"--save_temps",
action="store_true",
default="False",
help="Add ONNX benchmark results to pytest benchmarks.",
)
parser.addoption(
"--tf32",
action="store_true",
default="False",
help="Use TensorFloat-32 calculations.",
)
parser.addoption(
"--save_repro",
action="store_true",
default="False",
help="Pass option to save reproduction artifacts to SHARK/shark_tmp/test_case/",
)
parser.addoption(
"--save_fails",
action="store_true",
default="False",
help="Save reproduction artifacts for a test case only if it fails. Default is False.",
)
parser.addoption(
"--ci",
action="store_true",
default="False",
help="Enables uploading of reproduction artifacts upon test case failure during iree-compile or validation. Must be passed with --ci_sha option ",
)
parser.addoption(
"--update_tank",
action="store_false",
default="False",
help="Update local shark tank with latest artifacts.",
)
parser.addoption(
"--ci_sha",
action="store",
default="None",
help="Passes the github SHA of the CI workflow to include in google storage directory for reproduction artifacts.",
)
parser.addoption(
"--local_tank_cache",
action="store",
default="",
help="Specify the directory in which all downloaded shark_tank artifacts will be cached.",
)
parser.addoption(
"--tank_url",
type=str,
default="gs://shark_tank/latest",
help="URL to bucket from which to download SHARK tank artifacts. Default is gs://shark_tank/latest",
help="Saves IREE reproduction artifacts for filing upstream issues.",
)

3
cpp/.gitignore vendored
View File

@@ -1,3 +0,0 @@
*.mlir
*.vmfb
*.ini

View File

@@ -1,52 +0,0 @@
# Copyright 2022 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Requires CMake 3.21 or newer; policies are set as of 3.23 when the running
# CMake is at least that new.
cmake_minimum_required(VERSION 3.21...3.23)
#-------------------------------------------------------------------------------
# Project configuration
#-------------------------------------------------------------------------------
project(iree-samples C CXX)
# Default language standards for targets created after this point.
set(CMAKE_C_STANDARD 11)
set(CMAKE_CXX_STANDARD 17)
# Let IDE generators group targets into folders.
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
#-------------------------------------------------------------------------------
# Core project dependency
#-------------------------------------------------------------------------------
message(STATUS "Fetching core IREE repo (this may take a few minutes)...")
# Note: for log output, set -DFETCHCONTENT_QUIET=OFF,
# see https://gitlab.kitware.com/cmake/cmake/-/issues/18238#note_440475
include(FetchContent)
# The dependency named `iree` is fetched from the nod-ai/shark-runtime fork at
# the `shark` ref, as a full (non-shallow) clone and without recursing into
# nested submodules. Download progress is streamed to the terminal.
FetchContent_Declare(
iree
GIT_REPOSITORY https://github.com/nod-ai/shark-runtime.git
GIT_TAG shark
GIT_SUBMODULES_RECURSE OFF
GIT_SHALLOW OFF
GIT_PROGRESS ON
USES_TERMINAL_DOWNLOAD ON
)
# Extend module path to find MLIR CMake modules.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_BINARY_DIR}/lib/cmake/mlir")
# Disable core project features not needed for these out of tree samples.
# These must be set before FetchContent_MakeAvailable() configures the
# dependency, otherwise they would not influence its configuration.
set(IREE_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(IREE_BUILD_SAMPLES OFF CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(iree)
# Expose the fetched checkout location as IREE_SOURCE_DIR
# (presumably consumed by the sample subdirectories -- verify against them).
FetchContent_GetProperties(iree SOURCE_DIR IREE_SOURCE_DIR)
#-------------------------------------------------------------------------------
# Individual samples
#-------------------------------------------------------------------------------
add_subdirectory(vulkan_gui)

View File

@@ -1,82 +0,0 @@
# SHARK C/C++ Samples
These C/C++ samples can be built using CMake. The samples depend on the main
SHARK-Runtime project's C/C++ sources, including both the runtime and the compiler.
Individual samples may require additional dependencies. Watch CMake's output
for information about which you are missing for individual samples.
On Windows we recommend using https://github.com/microsoft/vcpkg to download packages for
your system. The general setup flow looks like
*Install and activate SHARK*
```bash
source shark.venv/bin/activate #follow main repo instructions to setup your venv
```
*Install Dependencies*
```bash
vcpkg install [library] --triplet [your platform]
vcpkg integrate install
# Then pass `-DCMAKE_TOOLCHAIN_FILE=[check logs for path]` when configuring CMake
```
In Ubuntu Linux you can install
```bash
sudo apt install libsdl2-dev
```
*Build*
```bash
cd cpp
cmake -GNinja -B build/
cmake --build build/
```
*Prepare the model*
```bash
wget https://storage.googleapis.com/shark_tank/latest/resnet50_tf/resnet50_tf.mlir
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvm-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 resnet50_tf.mlir -o resnet50_tf.vmfb
```
*Prepare the input*
```bash
python save_img.py
```
Note that this requires tensorflow, e.g.
```bash
python -m pip install tensorflow
```
*Run the vulkan_gui*
```bash
./build/vulkan_gui/iree-samples-resnet-vulkan-gui
```
## Other models
A tool for benchmarking other models is built and can be invoked with a command like the following
```bash
./build/vulkan_gui/iree-vulkan-gui --module-file=path/to/.vmfb --function_input=...
```
see `./build/vulkan_gui/iree-vulkan-gui --help` for an explanation on the function input. For example, stable diffusion unet can be tested with the following commands:
```bash
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/stable_diff_tf.mlir
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 stable_diff_tf.mlir -o stable_diff_tf.vmfb
./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=2x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32
```
VAE and Autoencoder are also available
```bash
# VAE
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/vae_tf/vae.mlir
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 vae.mlir -o vae.vmfb
./build/vulkan_gui/iree-vulkan-gui --module-file=vae.vmfb --function_input=1x4x64x64xf32
# CLIP Autoencoder
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/clip_tf/clip_autoencoder.mlir
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 clip_autoencoder.mlir -o clip_autoencoder.vmfb
./build/vulkan_gui/iree-vulkan-gui --module-file=clip_autoencoder.vmfb --function_input=1x77xi32 --function_input=1x77xi32
```

Binary file not shown.

Before

Width:  |  Height:  |  Size: 26 KiB

View File

@@ -1,18 +0,0 @@
import numpy as np
import tensorflow as tf
from shark.shark_inference import SharkInference
def load_and_preprocess_image(fname: str):
    """Read an image file and return a ResNet50-ready batch tensor.

    The file is decoded with 3 channels, resized to 224x224, given a leading
    axis, and passed through the Keras ResNet50 ``preprocess_input`` routine.
    """
    raw_bytes = tf.io.read_file(fname)
    decoded = tf.image.decode_image(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, (224, 224))
    # Insert a new leading axis in front of the resized image.
    batched = resized[tf.newaxis, :]
    # preprocessing pipeline
    return tf.keras.applications.resnet50.preprocess_input(batched)


# Dump the preprocessed sample image as a raw binary file.
data = load_and_preprocess_image("dog_imagenet.jpg").numpy()
data.tofile("dog.bin")

View File

@@ -1,84 +0,0 @@
# Copyright 2022 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# The sample compiles its model for the llvm-cpu backend; skip the whole
# directory when that backend or the embedded ELF loader is not enabled.
if(NOT IREE_TARGET_BACKEND_LLVM_CPU OR
   NOT IREE_HAL_EXECUTABLE_LOADER_EMBEDDED_ELF)
  message(STATUS "Missing LLVM backend and/or embedded elf loader, skipping vision_inference sample")
  return()
endif()

# vcpkg install stb
# tested with version 2021-09-10
find_package(Stb)
if(NOT Stb_FOUND)
  message(STATUS "Could not find Stb, skipping vision inference sample")
  return()
endif()

# Compile mnist.mlir to mnist.vmfb.
# The relative "-o mnist.vmfb" resolves against the command's working
# directory, which defaults to the current binary directory and therefore
# matches the declared OUTPUT path.
set(_COMPILE_TOOL_EXECUTABLE $<TARGET_FILE:iree-compile>)
set(_COMPILE_ARGS)
list(APPEND _COMPILE_ARGS "--iree-input-type=mhlo")
list(APPEND _COMPILE_ARGS "--iree-hal-target-backends=llvm-cpu")
list(APPEND _COMPILE_ARGS "${IREE_SOURCE_DIR}/samples/models/mnist.mlir")
list(APPEND _COMPILE_ARGS "-o")
list(APPEND _COMPILE_ARGS "mnist.vmfb")
add_custom_command(
  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/mnist.vmfb
  COMMAND ${_COMPILE_TOOL_EXECUTABLE} ${_COMPILE_ARGS}
  DEPENDS ${_COMPILE_TOOL_EXECUTABLE} "${IREE_SOURCE_DIR}/samples/models/mnist.mlir"
)

# Embed mnist.vmfb into a C file as mnist_bytecode_module_c.[h/c]
set(_EMBED_DATA_EXECUTABLE $<TARGET_FILE:generate_embed_data>)
set(_EMBED_ARGS)
list(APPEND _EMBED_ARGS "--output_header=mnist_bytecode_module_c.h")
list(APPEND _EMBED_ARGS "--output_impl=mnist_bytecode_module_c.c")
list(APPEND _EMBED_ARGS "--identifier=iree_samples_vision_inference_mnist_bytecode_module")
list(APPEND _EMBED_ARGS "--flatten")
list(APPEND _EMBED_ARGS "${CMAKE_CURRENT_BINARY_DIR}/mnist.vmfb")
add_custom_command(
  OUTPUT "mnist_bytecode_module_c.h" "mnist_bytecode_module_c.c"
  COMMAND ${_EMBED_DATA_EXECUTABLE} ${_EMBED_ARGS}
  DEPENDS ${_EMBED_DATA_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/mnist.vmfb
)

# Define a library target for mnist_bytecode_module_c.
# An OBJECT library is used so the generated sources are compiled once and
# linked straight into the sample executable below.
add_library(iree_samples_vision_inference_mnist_bytecode_module_c OBJECT)
target_sources(iree_samples_vision_inference_mnist_bytecode_module_c
  PRIVATE
    mnist_bytecode_module_c.h
    mnist_bytecode_module_c.c
)

# Define the sample executable.
set(_NAME "iree-run-mnist-module")
add_executable(${_NAME} "")
target_sources(${_NAME}
  PRIVATE
    "image_util.h"
    "image_util.c"
    "iree-run-mnist-module.c"
)
set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "iree-run-mnist-module")

# The generated mnist_bytecode_module_c.h lives in the binary directory.
target_include_directories(${_NAME} PUBLIC
  $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
)
target_include_directories(${_NAME} PRIVATE
  ${Stb_INCLUDE_DIR}
)
target_link_libraries(${_NAME}
  iree_base_base
  iree_base_tracing
  iree_hal_hal
  iree_runtime_runtime
  iree_samples_vision_inference_mnist_bytecode_module_c
)

# Define a target that copies the test image into the build directory.
add_custom_target(iree_samples_vision_inference_test_image
  COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/mnist_test.png" "${CMAKE_CURRENT_BINARY_DIR}/mnist_test.png")
add_dependencies(${_NAME} iree_samples_vision_inference_test_image)

message(STATUS "Configured vision_inference sample successfully")

View File

@@ -1,8 +0,0 @@
# Vision Inference Sample (C code)
This sample demonstrates how to run a MNIST handwritten digit detection vision
model on an image using IREE's C API.
A similar sample is implemented using a Python script and IREE's command line
tools over in the primary iree repository at
https://github.com/iree-org/iree/tree/main/samples/vision_inference

View File

@@ -1,224 +0,0 @@
// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "image_util.h"
#include <math.h>
#include "iree/base/internal/flags.h"
#include "iree/base/tracing.h"
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
// Converts 8-bit pixel values to floats rescaled into |input_range|.
//
// |input_range| must hold exactly two values (|range_length| == 2). Pixel
// value 0 maps to the smaller of the two bounds and 255 to the larger, with
// linear interpolation in between. |out_buffer| receives |buffer_length|
// floats, one per entry of |pixel_data|.
//
// Returns IREE_STATUS_INVALID_ARGUMENT when |range_length| is not 2.
iree_status_t iree_tools_utils_pixel_rescaled_to_buffer(
    const uint8_t* pixel_data, iree_host_size_t buffer_length,
    const float* input_range, iree_host_size_t range_length,
    float* out_buffer) {
  IREE_TRACE_ZONE_BEGIN(z0);
  if (range_length != 2) {
    IREE_TRACE_ZONE_END(z0);
    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                            "range defined as 2-element [min, max] array.");
  }
  // Half-width and midpoint of the target range.
  float input_scale = fabsf(input_range[1] - input_range[0]) / 2.0f;
  float input_offset = (input_range[0] + input_range[1]) / 2.0f;
  // Midpoint of the uint8 value range [0, 255].
  const float kUint8Mean = 127.5f;
  // The index uses the same unsigned type as |buffer_length|; a signed int
  // index would mix signedness in the comparison and could overflow for
  // buffers larger than INT_MAX elements.
  for (iree_host_size_t i = 0; i < buffer_length; ++i) {
    out_buffer[i] =
        (((float)(pixel_data[i])) - kUint8Mean) / kUint8Mean * input_scale +
        input_offset;
  }
  IREE_TRACE_ZONE_END(z0);
  return iree_ok_status();
}
// Validates the image file |filename| against the expected tensor |shape| and
// |element_type|, then loads its 8-bit pixel data with stb_image.
//
// On success |*out_pixel_data| points at the stb_image allocation and
// |*out_buffer_length| holds the number of uint8 values in it. Callers in this
// file release the allocation with stbi_image_free().
//
// Returns NOT_FOUND when the file cannot be inspected or loaded,
// UNIMPLEMENTED for element types other than f32/i8/u8, and INVALID_ARGUMENT
// when the rank is unsupported or the image dimensions do not match |shape|.
//
// NOTE(review): |filename.data| is handed to stb_image as a C string, which
// assumes the string view is NUL-terminated -- confirm at the call sites.
iree_status_t iree_tools_utils_load_pixel_data_impl(
const iree_string_view_t filename, const iree_hal_dim_t* shape,
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
uint8_t** out_pixel_data, iree_host_size_t* out_buffer_length) {
// Filled by stb_image as {width, height, channels}.
int img_dims[3];
if (stbi_info(filename.data, img_dims, &(img_dims[1]), &(img_dims[2])) == 0) {
return iree_make_status(IREE_STATUS_NOT_FOUND, "can't load image %.*s",
(int)filename.size, filename.data);
}
if (!(element_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32 ||
element_type == IREE_HAL_ELEMENT_TYPE_SINT_8 ||
element_type == IREE_HAL_ELEMENT_TYPE_UINT_8)) {
char element_type_str[16];
IREE_RETURN_IF_ERROR(iree_hal_format_element_type(
element_type, sizeof(element_type_str), element_type_str, NULL));
return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
"element type %s not supported", element_type_str);
}
// Tensor shapes are height-major while img_dims is width-first, hence the
// swapped indices in the comparisons below.
switch (shape_rank) {
case 2: { // Assume tensor <height x width>
if (img_dims[2] != 1 || (shape[0] != img_dims[1]) ||
(shape[1] != img_dims[0])) {
return iree_make_status(
IREE_STATUS_INVALID_ARGUMENT,
"image size: %dx%dx%d, expected: %" PRIdim "x%" PRIdim, img_dims[0],
img_dims[1], img_dims[2], shape[1], shape[0]);
}
break;
}
case 3: { // Assume tensor <height x width x channel>
if (shape[0] != img_dims[1] || shape[1] != img_dims[0] ||
shape[2] != img_dims[2]) {
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"image size: %dx%dx%d, expected: %" PRIdim
"x%" PRIdim "x%" PRIdim,
img_dims[0], img_dims[1], img_dims[2], shape[1],
shape[0], shape[2]);
}
break;
}
// The batch dimension shape[0] is not checked against the image.
case 4: { // Assume tensor <batch x height x width x channel>
if (shape[1] != img_dims[1] || shape[2] != img_dims[0] ||
shape[3] != img_dims[2]) {
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"image size: %dx%dx%d, expected: %" PRIdim
"x%" PRIdim "x%" PRIdim,
img_dims[0], img_dims[1], img_dims[2], shape[2],
shape[1], shape[3]);
}
break;
}
default:
return iree_make_status(
IREE_STATUS_INVALID_ARGUMENT,
"Input buffer shape rank %" PRIhsz " not supported", shape_rank);
}
// Drop the alpha channel if present.
// A request of 0 asks stb_image to keep the file's own channel count.
int req_ch = (img_dims[2] >= 3) ? 3 : 0;
*out_pixel_data = stbi_load(filename.data, img_dims, &(img_dims[1]),
&(img_dims[2]), req_ch);
if (*out_pixel_data == NULL) {
return iree_make_status(IREE_STATUS_NOT_FOUND, "can't load image %.*s",
(int)filename.size, filename.data);
}
// The channel count is clamped to 3 to mirror the req_ch request above.
*out_buffer_length =
img_dims[0] * img_dims[1] * (img_dims[2] > 3 ? 3 : img_dims[2]);
return iree_ok_status();
}
// Tracing wrapper: runs iree_tools_utils_load_pixel_data_impl inside a trace
// zone and forwards its status unchanged.
iree_status_t iree_tools_utils_load_pixel_data(
    const iree_string_view_t filename, const iree_hal_dim_t* shape,
    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
    uint8_t** out_pixel_data, iree_host_size_t* out_buffer_length) {
  iree_status_t status;
  IREE_TRACE_ZONE_BEGIN(z0);
  status = iree_tools_utils_load_pixel_data_impl(filename, shape, shape_rank,
                                                 element_type, out_pixel_data,
                                                 out_buffer_length);
  IREE_TRACE_ZONE_END(z0);
  return status;
}
// Loads the image |filename| and creates a buffer view of the given |shape|
// and 8-bit |element_type| populated with its pixel data.
//
// |*out_buffer_view| is reset to NULL up front and only set when allocation
// succeeds. Returns INVALID_ARGUMENT for element types other than i8/u8;
// otherwise the status of the load or the allocation.
iree_status_t iree_tools_utils_buffer_view_from_image(
    const iree_string_view_t filename, const iree_hal_dim_t* shape,
    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
    iree_hal_allocator_t* allocator, iree_hal_buffer_view_t** out_buffer_view) {
  IREE_TRACE_ZONE_BEGIN(z0);
  *out_buffer_view = NULL;

  // Only the two 8-bit integer element types are accepted here.
  if (!(element_type == IREE_HAL_ELEMENT_TYPE_SINT_8 ||
        element_type == IREE_HAL_ELEMENT_TYPE_UINT_8)) {
    IREE_TRACE_ZONE_END(z0);
    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                            "element type should be i8 or u8");
  }

  uint8_t* pixels = NULL;
  iree_host_size_t element_count;
  iree_status_t status = iree_tools_utils_load_pixel_data(
      filename, shape, shape_rank, element_type, &pixels, &element_count);
  if (iree_status_is_ok(status)) {
    iree_host_size_t bytes_per_element =
        iree_hal_element_dense_byte_count(element_type);
    iree_hal_buffer_params_t buffer_params = {
        .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
        .access = IREE_HAL_MEMORY_ACCESS_READ,
        .usage = IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE |
                 IREE_HAL_BUFFER_USAGE_TRANSFER,
    };
    // The loaded 8-bit pixels are passed straight through as the initial
    // contents; no conversion is applied.
    status = iree_hal_buffer_view_allocate_buffer(
        allocator, shape_rank, shape, element_type,
        IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params,
        iree_make_const_byte_span(pixels, bytes_per_element * element_count),
        out_buffer_view);
  }

  // Released on every path that reaches this point; |pixels| is still NULL if
  // the load never stored a pointer.
  stbi_image_free(pixels);
  IREE_TRACE_ZONE_END(z0);
  return status;
}
// Bundle of arguments handed through the |user_data| pointer to
// iree_tools_utils_buffer_view_load_image_rescaled, which forwards each field
// to iree_tools_utils_pixel_rescaled_to_buffer.
typedef struct iree_tools_utils_buffer_view_load_params_t {
// Decoded 8-bit source pixels (not owned by this struct).
const uint8_t* pixel_data;
// Length of |pixel_data| as reported by iree_tools_utils_load_pixel_data.
iree_host_size_t pixel_data_length;
// Target value range used when rescaling pixels to float.
const float* input_range;
// Number of entries in |input_range|.
iree_host_size_t input_range_length;
} iree_tools_utils_buffer_view_load_params_t;
// Buffer-generation callback: fills the mapped buffer with rescaled pixels.
//
// |user_data| must point at an iree_tools_utils_buffer_view_load_params_t;
// the mapped contents of |mapping| are written as float values.
static iree_status_t iree_tools_utils_buffer_view_load_image_rescaled(
    iree_hal_buffer_mapping_t* mapping, void* user_data) {
  iree_tools_utils_buffer_view_load_params_t* load_params = user_data;
  float* destination = (float*)mapping->contents.data;
  return iree_tools_utils_pixel_rescaled_to_buffer(
      load_params->pixel_data, load_params->pixel_data_length,
      load_params->input_range, load_params->input_range_length, destination);
}
// Builds an f32 HAL buffer view from the image at |filename|, rescaling the
// decoded 8-bit pixels with |input_range|.
//
// |element_type| must be FLOAT_32, otherwise IREE_STATUS_INVALID_ARGUMENT is
// returned. |out_buffer_view| is reset to NULL on entry and receives the new
// view on success.
iree_status_t iree_tools_utils_buffer_view_from_image_rescaled(
    const iree_string_view_t filename, const iree_hal_dim_t* shape,
    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
    iree_hal_allocator_t* allocator, const float* input_range,
    iree_host_size_t input_range_length,
    iree_hal_buffer_view_t** out_buffer_view) {
  IREE_TRACE_ZONE_BEGIN(z0);
  *out_buffer_view = NULL;

  if (element_type != IREE_HAL_ELEMENT_TYPE_FLOAT_32) {
    IREE_TRACE_ZONE_END(z0);
    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                            "element type should be f32");
  }

  // stb_image only decodes into a fresh host allocation, so the pixels are
  // staged there first and converted by the generation callback afterwards.
  // A real application would decode straight into the mapped device buffer.
  uint8_t* pixels = NULL;
  iree_host_size_t pixel_count = 0;
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_tools_utils_load_pixel_data(filename, shape, shape_rank,
                                           element_type, &pixels,
                                           &pixel_count));

  // Everything the callback needs travels through this struct.
  iree_tools_utils_buffer_view_load_params_t callback_args = {
      .pixel_data = pixels,
      .pixel_data_length = pixel_count,
      .input_range = input_range,
      .input_range_length = input_range_length,
  };

  // The buffer must be mappable so the callback can write into it.
  iree_hal_buffer_params_t buffer_params = {
      .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL |
              IREE_HAL_MEMORY_TYPE_HOST_VISIBLE,
      .usage = IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE |
               IREE_HAL_BUFFER_USAGE_TRANSFER | IREE_HAL_BUFFER_USAGE_MAPPING,
  };

  // Classic row-major image layout.
  iree_status_t generate_status = iree_hal_buffer_view_generate_buffer(
      allocator, shape_rank, shape, element_type,
      IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params,
      iree_tools_utils_buffer_view_load_image_rescaled, &callback_args,
      out_buffer_view);

  stbi_image_free(pixels);
  IREE_TRACE_ZONE_END(z0);
  return generate_status;
}

View File

@@ -1,77 +0,0 @@
// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#ifndef IREE_SAMPLES_VISION_INFERENCE_IMAGE_UTIL_H_
#define IREE_SAMPLES_VISION_INFERENCE_IMAGE_UTIL_H_
#include "iree/base/api.h"
#include "iree/hal/api.h"
#include "iree/hal/buffer_view.h"
#if __cplusplus
extern "C" {
#endif // __cplusplus
// Loads the image at |filename| into |out_pixel_data| and sets
// |out_buffer_length| to its length.
//
// The image dimensions must match the width, height, and channel count in
// |shape|, where 2 <= |shape_rank| <= 4 to match the image tensor format.
//
// The file must be in a format supported by stb_image.h.
// The returned |out_pixel_data| buffer must be released by the caller.
iree_status_t iree_tools_utils_load_pixel_data(
const iree_string_view_t filename, const iree_hal_dim_t* shape,
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
uint8_t** out_pixel_data, iree_host_size_t* out_buffer_length);
// Parse the content in an image file in |filename| into a HAL buffer view
// |out_buffer_view|. |out_buffer_view| properties are defined by |shape|,
// |shape_rank|, and |element_type|, while being allocated by |allocator|.
//
// The |element_type| has to be SINT_8 or UINT_8. For FLOAT_32, use
// |iree_tools_utils_buffer_view_from_image_rescaled| instead.
//
// The returned |out_buffer_view| must be released by the caller.
iree_status_t iree_tools_utils_buffer_view_from_image(
const iree_string_view_t filename, const iree_hal_dim_t* shape,
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
iree_hal_allocator_t* allocator, iree_hal_buffer_view_t** out_buffer_view);
// Parse the content in an image file in |filename| into a HAL buffer view
// |out_buffer_view|. |out_buffer_view| properties are defined by |shape|,
// |shape_rank|, and |element_type|, while being allocated by |allocator|.
// The value in |out_buffer_view| is rescaled with |input_range|.
//
// The |element_type| has to be FLOAT_32. For SINT_8 or UINT_8, use
// |iree_tools_utils_buffer_view_from_image| instead.
//
// The returned |out_buffer_view| must be released by the caller.
iree_status_t iree_tools_utils_buffer_view_from_image_rescaled(
const iree_string_view_t filename, const iree_hal_dim_t* shape,
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
iree_hal_allocator_t* allocator, const float* input_range,
iree_host_size_t input_range_length,
iree_hal_buffer_view_t** out_buffer_view);
// Normalize uint8_t |pixel_data| of the size |buffer_length| to float buffer
// |out_buffer| with the range |input_range|.
//
// float32_x = (uint8_x - 127.5) / 127.5 * input_scale + input_offset, where
// input_scale = abs(|input_range[0]| - |input_range[1]|) / 2
// input_offset = (|input_range[0]| + |input_range[1]|) / 2
//
// |out_buffer| needs to be allocated before the call.
iree_status_t iree_tools_utils_pixel_rescaled_to_buffer(
const uint8_t* pixel_data, iree_host_size_t pixel_count,
const float* input_range, iree_host_size_t input_range_length,
float* out_buffer);
#if __cplusplus
}
#endif // __cplusplus
#endif // IREE_SAMPLES_VISION_INFERENCE_IMAGE_UTIL_H_

View File

@@ -1,121 +0,0 @@
// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This sample uses image_util to load a hand-written image as an
// iree_hal_buffer_view_t then passes it to the bytecode module built from
// mnist.mlir on the CPU backend with the local-task driver.
#include <float.h>
#include "image_util.h"
#include "iree/runtime/api.h"
#include "mnist_bytecode_module_c.h"
// Runs one MNIST inference on the image at |image_path| and prints the
// detected digit to stdout.
//
// Creates a runtime instance, a "local-task" device and a session, loads the
// embedded bytecode module, feeds the image as a 1x28x28x1 f32 tensor to
// "module.predict" and reports the index of the largest of the 10 outputs.
//
// Returns the first failing status, or OK.
// NOTE(review): the IREE_RETURN_IF_ERROR early exits appear to skip the
// release calls at the end of the function, so resources created before a
// failure are not released — acceptable for a sample that exits right after,
// but confirm before reusing this code elsewhere.
iree_status_t Run(const iree_string_view_t image_path) {
  iree_runtime_instance_options_t instance_options;
  iree_runtime_instance_options_initialize(IREE_API_VERSION_LATEST,
                                           &instance_options);
  iree_runtime_instance_options_use_all_available_drivers(&instance_options);
  iree_runtime_instance_t* instance = NULL;
  IREE_RETURN_IF_ERROR(iree_runtime_instance_create(
      &instance_options, iree_allocator_system(), &instance));

  // TODO(#5724): move device selection into the compiled modules.
  iree_hal_device_t* device = NULL;
  IREE_RETURN_IF_ERROR(iree_runtime_instance_try_create_default_device(
      instance, iree_make_cstring_view("local-task"), &device));

  // Create one session per loaded module to hold the module state.
  iree_runtime_session_options_t session_options;
  iree_runtime_session_options_initialize(&session_options);
  iree_runtime_session_t* session = NULL;
  IREE_RETURN_IF_ERROR(iree_runtime_session_create_with_device(
      instance, &session_options, device,
      iree_runtime_instance_host_allocator(instance), &session));
  // Our reference is dropped here; from now on the device is only reached
  // through the session so that no released handle is ever dereferenced.
  iree_hal_device_release(device);
  device = NULL;

  const struct iree_file_toc_t* module_file =
      iree_samples_vision_inference_mnist_bytecode_module_create();
  IREE_RETURN_IF_ERROR(iree_runtime_session_append_bytecode_module_from_memory(
      session, iree_make_const_byte_span(module_file->data, module_file->size),
      iree_allocator_null()));

  iree_runtime_call_t call;
  IREE_RETURN_IF_ERROR(iree_runtime_call_initialize_by_name(
      session, iree_make_cstring_view("module.predict"), &call));

  // Prepare the input hal buffer view with image_util library.
  // The input of the mnist model is single 28x28 pixel image as a
  // tensor<1x28x28x1xf32>, with pixels in [0.0, 1.0].
  iree_hal_buffer_view_t* buffer_view = NULL;
  iree_hal_dim_t buffer_shape[] = {1, 28, 28, 1};
  iree_hal_element_type_t hal_element_type = IREE_HAL_ELEMENT_TYPE_FLOAT_32;
  float input_range[2] = {0.0f, 1.0f};
  IREE_RETURN_IF_ERROR(
      iree_tools_utils_buffer_view_from_image_rescaled(
          image_path, buffer_shape, IREE_ARRAYSIZE(buffer_shape),
          hal_element_type,
          iree_hal_device_allocator(iree_runtime_session_device(session)),
          input_range, IREE_ARRAYSIZE(input_range), &buffer_view),
      "load image");
  IREE_RETURN_IF_ERROR(
      iree_runtime_call_inputs_push_back_buffer_view(&call, buffer_view));
  iree_hal_buffer_view_release(buffer_view);

  IREE_RETURN_IF_ERROR(iree_runtime_call_invoke(&call, /*flags=*/0));

  // Get the result buffers from the invocation.
  iree_hal_buffer_view_t* ret_buffer_view = NULL;
  IREE_RETURN_IF_ERROR(
      iree_runtime_call_outputs_pop_front_buffer_view(&call, &ret_buffer_view));

  // Read back the results. The output of the mnist model is a 1x10 prediction
  // confidence values for each digit in [0, 9].
  float predictions[1 * 10] = {0.0f};
  IREE_RETURN_IF_ERROR(iree_hal_device_transfer_d2h(
      iree_runtime_session_device(session),
      iree_hal_buffer_view_buffer(ret_buffer_view), 0, predictions,
      sizeof(predictions), IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT,
      iree_infinite_timeout()));
  iree_hal_buffer_view_release(ret_buffer_view);

  // Get the highest index from the output. The running maximum starts at the
  // most negative finite float: FLT_MIN is the smallest *positive* value and
  // would never be beaten when every prediction is zero or negative.
  float result_val = -FLT_MAX;
  int result_idx = 0;
  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(predictions); ++i) {
    if (predictions[i] > result_val) {
      result_val = predictions[i];
      result_idx = (int)i;
    }
  }
  fprintf(stdout, "Detected number: %d\n", result_idx);

  iree_runtime_call_deinitialize(&call);
  iree_runtime_session_release(session);
  iree_runtime_instance_release(instance);
  return iree_ok_status();
}
// Program entry point: runs inference on the image named by the optional
// single argument, or on "mnist_test.png" when no argument is given.
// Returns 0 on success and -1 on a usage error or inference failure.
int main(int argc, char** argv) {
  if (argc > 2) {
    fprintf(stderr, "Usage: iree-run-mnist-module <image file>\n");
    return -1;
  }

  // With no argument fall back to the bundled test image.
  const char* path = (argc == 1) ? "mnist_test.png" : argv[1];
  iree_status_t status = Run(iree_make_cstring_view(path));

  if (iree_status_is_ok(status)) {
    iree_status_ignore(status);
    return 0;
  }

  // Report the failure, then drop the status.
  iree_status_fprint(stderr, status);
  iree_status_ignore(status);
  return -1;
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 261 B

View File

@@ -1,116 +0,0 @@
# Copyright 2022 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# The sample needs both the Vulkan SPIR-V compiler backend and the Vulkan HAL
# driver; bail out quietly when either is disabled.
if(NOT IREE_TARGET_BACKEND_VULKAN_SPIRV OR
   NOT IREE_HAL_DRIVER_VULKAN)
  message(STATUS "Missing Vulkan backend and/or driver, skipping vulkan_gui sample")
  return()
endif()

# This target statically links against Vulkan.
# One way to achieve this is by installing the Vulkan SDK from
# https://vulkan.lunarg.com/.
find_package(Vulkan)
if(NOT Vulkan_FOUND)
  message(STATUS "Could not find Vulkan, skipping vulkan_gui sample")
  return()
endif()

# vcpkg install sdl2[vulkan]
# tested with versions 2.0.14#4 - 2.0.22#1
find_package(SDL2)
if(NOT SDL2_FOUND)
  message(STATUS "Could not find SDL2, skipping vulkan_gui sample")
  return()
endif()

# Dear ImGui is fetched at configure time. It is pinned to a release tag:
# tracking `master` makes the build non-reproducible, and the backend source
# file this sample compiles (backends/imgui_impl_sdl.cpp) was renamed in later
# imgui releases.
include(FetchContent)
FetchContent_Declare(
  imgui
  GIT_REPOSITORY https://github.com/ocornut/imgui
  GIT_TAG v1.88
)
FetchContent_MakeAvailable(imgui)

# Dear ImGui
# Use the directory FetchContent actually populated rather than assuming the
# default `_deps/imgui-src` layout (FETCHCONTENT_BASE_DIR may be overridden).
set(IMGUI_DIR ${imgui_SOURCE_DIR})
message("Looking for Imgui in ${IMGUI_DIR}")
include_directories(${IMGUI_DIR} ${IMGUI_DIR}/backends ..)
# iree_vulkan_sample(NAME <target> SRCS <source>...)
#
# Defines one Vulkan + Dear ImGui sample executable named <target>, built
# from the given sources plus the Dear ImGui core and its SDL/Vulkan backends
# found under IMGUI_DIR, and linked against SDL2, Vulkan and the IREE runtime
# libraries listed below.
function(iree_vulkan_sample)
  cmake_parse_arguments(
    _RULE
    ""
    "NAME"
    "SRCS"
    ${ARGN}
  )

  # Define the sample executable.
  set(_NAME "${_RULE_NAME}")
  set(SRCS "${_RULE_SRCS}")
  add_executable(${_NAME} "")
  target_sources(${_NAME}
    PRIVATE
      ${SRCS}
      "${IMGUI_DIR}/backends/imgui_impl_sdl.cpp"
      "${IMGUI_DIR}/backends/imgui_impl_vulkan.cpp"
      "${IMGUI_DIR}/imgui.cpp"
      "${IMGUI_DIR}/imgui_draw.cpp"
      "${IMGUI_DIR}/imgui_demo.cpp"
      "${IMGUI_DIR}/imgui_tables.cpp"
      "${IMGUI_DIR}/imgui_widgets.cpp"
  )
  set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "${_NAME}")
  target_include_directories(${_NAME} PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
  )
  target_link_libraries(${_NAME}
    SDL2::SDL2
    Vulkan::Vulkan
    iree_runtime_runtime
    iree_base_internal_main
    iree_hal_drivers_vulkan_registration_registration
    iree_modules_hal_hal
    iree_vm_vm
    iree_vm_bytecode_module
    iree_vm_cc
    iree_tooling_vm_util_cc
    iree_tooling_context_util
  )

  # Keep a console attached on Windows so stdout/stderr stay visible.
  # The variable is named without ${} on purpose: if() dereferences it itself,
  # whereas an expanded value could be dereferenced a second time or, when
  # empty, turn the condition into a syntax error.
  if(CMAKE_SYSTEM_NAME STREQUAL "Windows")
    set(_GUI_LINKOPTS "-SUBSYSTEM:CONSOLE")
  else()
    set(_GUI_LINKOPTS "")
  endif()
  target_link_options(${_NAME}
    PRIVATE
      ${_GUI_LINKOPTS}
  )
endfunction()
# ResNet inference GUI sample.
iree_vulkan_sample(NAME iree-samples-resnet-vulkan-gui
                   SRCS vulkan_resnet_inference_gui.cc)

# Generic module inference GUI sample.
iree_vulkan_sample(NAME iree-vulkan-gui
                   SRCS vulkan_inference_gui.cc)

message(STATUS "Configured vulkan_gui sample successfully")

View File

@@ -1,4 +0,0 @@
// Element-wise product of two 4-element f32 tensors.
func.func @simple_mul(%lhs: tensor<4xf32>, %rhs: tensor<4xf32>) -> tensor<4xf32> {
  %product = arith.mulf %lhs, %rhs : tensor<4xf32>
  return %product : tensor<4xf32>
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 14 KiB

File diff suppressed because it is too large Load Diff

View File

@@ -1,957 +0,0 @@
// Copyright 2019 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Vulkan Graphics + IREE API Integration Sample.
#include <SDL.h>
#include <SDL_vulkan.h>
#include <imgui.h>
#include <imgui_impl_sdl.h>
#include <imgui_impl_vulkan.h>
#include <vulkan/vulkan.h>
#include <cstring>
#include <set>
#include <vector>
#include <fstream>
#include <array>
#include <cstdio>
#include <cstdlib>
#include <iterator>
#include <string>
#include <utility>
#include "iree/hal/drivers/vulkan/api.h"
// IREE's C API:
#include "iree/base/api.h"
#include "iree/hal/api.h"
#include "iree/hal/drivers/vulkan/registration/driver_module.h"
#include "iree/modules/hal/module.h"
#include "iree/vm/api.h"
#include "iree/vm/bytecode_module.h"
#include "iree/vm/ref_cc.h"
// iree-run-module
#include "iree/base/internal/flags.h"
#include "iree/base/status_cc.h"
#include "iree/base/tracing.h"
#include "iree/modules/hal/types.h"
#include "iree/tooling/comparison.h"
#include "iree/tooling/context_util.h"
#include "iree/tooling/vm_util_cc.h"
// Other dependencies (helpers, etc.)
#include "iree/base/internal/main.h"
#define IMGUI_UNLIMITED_FRAME_RATE
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
// --entry_function: string flag, empty by default, naming the module function
// to invoke.
IREE_FLAG(string, entry_function, "",
"Name of a function contained in the module specified by module_file "
"to run.");
// TODO(benvanik): move --function_input= flag into a util.
// Flag parse callback: appends |value| to the std::vector<std::string> that
// |storage| points at. |flag_name| is unused. Always returns OK.
static iree_status_t parse_function_io(iree_string_view_t flag_name,
                                       void* storage,
                                       iree_string_view_t value) {
  auto& values = *static_cast<std::vector<std::string>*>(storage);
  values.emplace_back(value.data, value.size);
  return iree_ok_status();
}
// Flag print callback: writes the current values of a repeated string flag to
// |file|, one `--name="value"` line per entry, or a commented-out placeholder
// line when the list that |storage| points at is empty.
static void print_function_io(iree_string_view_t flag_name, void* storage,
                              FILE* file) {
  const auto& values = *static_cast<std::vector<std::string>*>(storage);
  const int name_length = (int)flag_name.size;

  if (values.empty()) {
    fprintf(file, "# --%.*s=\n", name_length, flag_name.data);
    return;
  }

  for (const std::string& value : values) {
    fprintf(file, "--%.*s=\"%s\"\n", name_length, flag_name.data,
            value.c_str());
  }
}
// Backing storage for the repeatable --function_input flag; each occurrence
// on the command line is appended by parse_function_io in the order given.
static std::vector<std::string> FLAG_function_inputs;
// Registers --function_input with custom parse/print callbacks so that the
// flag may be passed multiple times.
IREE_FLAG_CALLBACK(
parse_function_io, print_function_io, &FLAG_function_inputs, function_input,
"An input (a) value or (b) buffer of the format:\n"
" (a) scalar value\n"
" value\n"
" e.g.: --function_input=\"3.14\"\n"
" (b) buffer:\n"
" [shape]xtype=[value]\n"
" e.g.: --function_input=\"2x2xi32=1 2 3 4\"\n"
"Optionally, brackets may be used to separate the element values:\n"
" 2x2xi32=[[1 2][3 4]]\n"
"Raw binary files can be read to provide buffer contents:\n"
" 2x2xi32=@some/file.bin\n"
"numpy npy files (from numpy.save) can be read to provide 1+ values:\n"
" @some.npy\n"
"Each occurrence of the flag indicates an input in the order they were\n"
"specified on the command line.");
// Describes one file held in memory: its name plus a pointer/length pair for
// its contents.
// NOTE(review): presumably mirrors the table-of-contents struct emitted for
// embedded files, but with a mutable |data| pointer — confirm against users.
typedef struct iree_file_toc_t {
const char* name; // the file's original name
char* data; // beginning of the file
size_t size; // length of the file
} iree_file_toc_t;
// Reads the entire file |filename| into a newly malloc'ed buffer.
//
// On success returns true, stores the buffer in |*pOut| and its length in
// bytes in |*pSize|; the caller owns the buffer and must free() it.
// On any failure (file cannot be opened, its size cannot be determined, it is
// empty, allocation fails, or the read is short) returns false with
// |*pOut| == NULL and |*pSize| == 0, and nothing is leaked.
bool load_file(const char* filename, char** pOut, size_t* pSize)
{
  *pOut = NULL;
  *pSize = 0;

  FILE* f = fopen(filename, "rb");
  if (f == NULL)
  {
    fprintf(stderr, "Can't open %s\n", filename);
    return false;
  }

  // Determine the file length; ftell reports -1 on error, which must not be
  // converted to a (huge) size_t.
  long length = -1;
  if (fseek(f, 0L, SEEK_END) == 0)
  {
    length = ftell(f);
  }

  char* buffer = NULL;
  bool ok = false;
  // An empty file is treated as a failure, as it always has been.
  if (length > 0 && fseek(f, 0L, SEEK_SET) == 0)
  {
    buffer = (char*)malloc((size_t)length);
    // Read everything as one item so a short read is reported as failure.
    ok = buffer != NULL && fread(buffer, (size_t)length, 1, f) == 1;
  }
  fclose(f);

  if (!ok)
  {
    free(buffer);
    return false;
  }
  *pOut = buffer;
  *pSize = (size_t)length;
  return true;
}
// Process-wide Vulkan / window state shared by the setup, render and cleanup
// helpers in this file.
// Host allocation callbacks passed to Vulkan create/destroy calls (NULL here).
static VkAllocationCallbacks* g_Allocator = NULL;
static VkInstance g_Instance = VK_NULL_HANDLE;
static VkPhysicalDevice g_PhysicalDevice = VK_NULL_HANDLE;
static VkDevice g_Device = VK_NULL_HANDLE;
// (uint32_t)-1 is the "no queue family selected yet" sentinel that
// SetupVulkan asserts against.
static uint32_t g_QueueFamily = (uint32_t)-1;
static VkQueue g_Queue = VK_NULL_HANDLE;
static VkPipelineCache g_PipelineCache = VK_NULL_HANDLE;
static VkDescriptorPool g_DescriptorPool = VK_NULL_HANDLE;
// Dear ImGui helper window (swapchain, frames, semaphores).
static ImGui_ImplVulkanH_Window g_MainWindowData;
static uint32_t g_MinImageCount = 2;
// Swapchain resize bookkeeping.
// NOTE(review): not referenced in the visible part of the file — presumably
// consumed by the main loop; confirm.
static bool g_SwapChainRebuild = false;
static int g_SwapChainResizeWidth = 0;
static int g_SwapChainResizeHeight = 0;
// Aborts the process after printing the numeric code whenever |err| is
// anything other than 0 (success); returns normally otherwise.
static void check_vk_result(VkResult err) {
  if (err != 0) {
    fprintf(stderr, "VkResult: %d\n", err);
    abort();
  }
}
// Returns the names of the Vulkan layers used for the given IREE
// |extensibility_set| and |features|.
// Returns the names of the Vulkan layers used for the given IREE
// |extensibility_set| and |features|.
//
// The set is queried twice: once with zero capacity to learn how many names
// there are, then again with storage of exactly that size.
std::vector<const char*> GetIreeLayers(
    iree_hal_vulkan_extensibility_set_t extensibility_set,
    iree_hal_vulkan_features_t features) {
  // Start from zero so that a query which returns without writing the count
  // yields an empty list instead of an indeterminate vector size.
  iree_host_size_t required_count = 0;
  iree_hal_vulkan_query_extensibility_set(
      features, extensibility_set, /*string_capacity=*/0, &required_count,
      /*out_string_values=*/NULL);
  std::vector<const char*> layers(required_count);
  iree_hal_vulkan_query_extensibility_set(features, extensibility_set,
                                          layers.size(), &required_count,
                                          layers.data());
  return layers;
}
// Returns the names of the Vulkan extensions used for the given IREE
// |extensibility_set| and |features|.
// Returns the names of the Vulkan extensions used for the given IREE
// |extensibility_set| and |features|.
//
// The set is queried twice: once with zero capacity to learn how many names
// there are, then again with storage of exactly that size.
std::vector<const char*> GetIreeExtensions(
    iree_hal_vulkan_extensibility_set_t extensibility_set,
    iree_hal_vulkan_features_t features) {
  // Start from zero so that a query which returns without writing the count
  // yields an empty list instead of an indeterminate vector size.
  iree_host_size_t required_count = 0;
  iree_hal_vulkan_query_extensibility_set(
      features, extensibility_set, /*string_capacity=*/0, &required_count,
      /*out_string_values=*/NULL);
  std::vector<const char*> extensions(required_count);
  iree_hal_vulkan_query_extensibility_set(features, extensibility_set,
                                          extensions.size(), &required_count,
                                          extensions.data());
  return extensions;
}
// Returns the names of the Vulkan extensions used for the given IREE
// |vulkan_features|.
// Returns the device extension names to enable on |physical_device| for the
// given IREE |vulkan_features|.
//
// The result always contains "VK_KHR_swapchain" and every extension IREE
// requires, plus each IREE-optional extension that the device reports as
// available. Names are unique by string contents and sorted.
std::vector<const char*> GetDeviceExtensions(
    VkPhysicalDevice physical_device,
    iree_hal_vulkan_features_t vulkan_features) {
  // Compare names by contents. A plain std::set<const char*> orders by
  // pointer value, so equal names stored at different addresses would both be
  // kept and the resulting order would vary from run to run.
  struct CStringLess {
    bool operator()(const char* lhs, const char* rhs) const {
      return strcmp(lhs, rhs) < 0;
    }
  };

  std::vector<const char*> iree_required_extensions = GetIreeExtensions(
      IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_REQUIRED,
      vulkan_features);
  std::vector<const char*> iree_optional_extensions = GetIreeExtensions(
      IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
      vulkan_features);

  // Ask the device which extensions it actually offers.
  uint32_t extension_count = 0;
  check_vk_result(vkEnumerateDeviceExtensionProperties(
      physical_device, nullptr, &extension_count, nullptr));
  std::vector<VkExtensionProperties> extension_properties(extension_count);
  check_vk_result(vkEnumerateDeviceExtensionProperties(
      physical_device, nullptr, &extension_count, extension_properties.data()));

  // Merge extensions lists, including optional and required for simplicity.
  std::set<const char*, CStringLess> ext_set;
  ext_set.insert("VK_KHR_swapchain");
  ext_set.insert(iree_required_extensions.begin(),
                 iree_required_extensions.end());
  // Optional extensions are only enabled when the device supports them.
  for (const char* optional_extension : iree_optional_extensions) {
    for (uint32_t j = 0; j < extension_count; ++j) {
      if (strcmp(optional_extension, extension_properties[j].extensionName) ==
          0) {
        ext_set.insert(optional_extension);
        break;
      }
    }
  }

  std::vector<const char*> extensions(ext_set.begin(), ext_set.end());
  return extensions;
}
// Returns the instance layers to enable for the given IREE |vulkan_features|:
// every layer IREE requires (aborting if one is unavailable) followed by each
// IREE-optional layer that the Vulkan installation reports.
std::vector<const char*> GetInstanceLayers(
    iree_hal_vulkan_features_t vulkan_features) {
  // Layers IREE asks for.
  const std::vector<const char*> wanted_required = GetIreeLayers(
      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_REQUIRED, vulkan_features);
  const std::vector<const char*> wanted_optional = GetIreeLayers(
      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_OPTIONAL, vulkan_features);

  // Layers the Vulkan installation offers.
  uint32_t available_count = 0;
  check_vk_result(vkEnumerateInstanceLayerProperties(&available_count, NULL));
  std::vector<VkLayerProperties> available(available_count);
  check_vk_result(
      vkEnumerateInstanceLayerProperties(&available_count, available.data()));

  const auto is_available = [&available](const char* name) {
    for (const auto& candidate : available) {
      if (std::strcmp(name, candidate.layerName) == 0) return true;
    }
    return false;
  };

  std::vector<const char*> enabled;

  // A missing required layer is fatal.
  for (const char* name : wanted_required) {
    if (!is_available(name)) {
      fprintf(stderr, "Required layer %s not available\n", name);
      abort();
    }
    enabled.push_back(name);
  }

  // Optional layers are enabled opportunistically.
  for (const char* name : wanted_optional) {
    if (is_available(name)) enabled.push_back(name);
  }

  return enabled;
}
// Returns the instance extension names to enable: everything SDL needs to
// present to |window| plus IREE's required and optional instance extensions
// for |vulkan_features|. Names are unique by string contents and sorted.
std::vector<const char*> GetInstanceExtensions(
    SDL_Window* window, iree_hal_vulkan_features_t vulkan_features) {
  // Compare names by contents. A plain std::set<const char*> orders by
  // pointer value, so an extension requested by both SDL and IREE (stored at
  // different addresses) would be listed twice.
  struct CStringLess {
    bool operator()(const char* lhs, const char* rhs) const {
      return strcmp(lhs, rhs) < 0;
    }
  };

  // Ask SDL for its list of required instance extensions.
  uint32_t sdl_extensions_count = 0;
  SDL_Vulkan_GetInstanceExtensions(window, &sdl_extensions_count, NULL);
  std::vector<const char*> sdl_extensions(sdl_extensions_count);
  SDL_Vulkan_GetInstanceExtensions(window, &sdl_extensions_count,
                                   sdl_extensions.data());

  std::vector<const char*> iree_required_extensions = GetIreeExtensions(
      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_REQUIRED,
      vulkan_features);
  std::vector<const char*> iree_optional_extensions = GetIreeExtensions(
      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_OPTIONAL,
      vulkan_features);

  // Merge extensions lists, including optional and required for simplicity.
  std::set<const char*, CStringLess> ext_set;
  ext_set.insert(sdl_extensions.begin(), sdl_extensions.end());
  ext_set.insert(iree_required_extensions.begin(),
                 iree_required_extensions.end());
  ext_set.insert(iree_optional_extensions.begin(),
                 iree_optional_extensions.end());

  std::vector<const char*> extensions(ext_set.begin(), ext_set.end());
  return extensions;
}
// Creates the core Vulkan objects shared by the renderer and IREE.
//
// In order: a VkInstance with the given layers/extensions, the first reported
// physical device, a queue family supporting both graphics and compute, a
// logical device with one queue from that family and timeline semaphores
// enabled, and a generously sized descriptor pool.
//
// All results are written through the out-pointers. |*queue_family_index| is
// only written when a suitable family is found; the caller is expected to
// pass it in as (uint32_t)-1. Any Vulkan failure aborts via check_vk_result.
void SetupVulkan(iree_hal_vulkan_features_t vulkan_features,
                 const char** instance_layers, uint32_t instance_layers_count,
                 const char** instance_extensions,
                 uint32_t instance_extensions_count,
                 const VkAllocationCallbacks* allocator, VkInstance* instance,
                 uint32_t* queue_family_index,
                 VkPhysicalDevice* physical_device, VkQueue* queue,
                 VkDevice* device, VkDescriptorPool* descriptor_pool) {
  VkResult err;

  // Create Vulkan Instance
  {
    VkInstanceCreateInfo create_info = {};
    create_info.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
    create_info.enabledLayerCount = instance_layers_count;
    create_info.ppEnabledLayerNames = instance_layers;
    create_info.enabledExtensionCount = instance_extensions_count;
    create_info.ppEnabledExtensionNames = instance_extensions;
    err = vkCreateInstance(&create_info, allocator, instance);
    check_vk_result(err);
  }

  // Select GPU
  {
    uint32_t gpu_count = 0;
    err = vkEnumeratePhysicalDevices(*instance, &gpu_count, NULL);
    check_vk_result(err);
    IM_ASSERT(gpu_count > 0);

    std::vector<VkPhysicalDevice> gpus(gpu_count);
    err = vkEnumeratePhysicalDevices(*instance, &gpu_count, gpus.data());
    check_vk_result(err);

    // Use the first reported GPU for simplicity.
    *physical_device = gpus[0];

    VkPhysicalDeviceProperties properties;
    vkGetPhysicalDeviceProperties(*physical_device, &properties);
    fprintf(stdout, "Selected Vulkan device: '%s'\n", properties.deviceName);
  }

  // Select queue family. We want a single queue with graphics and compute for
  // simplicity, but we could also discover and use separate queues for each.
  {
    uint32_t count = 0;
    vkGetPhysicalDeviceQueueFamilyProperties(*physical_device, &count, NULL);
    std::vector<VkQueueFamilyProperties> queues(count);
    vkGetPhysicalDeviceQueueFamilyProperties(*physical_device, &count,
                                             queues.data());
    // Both bits must be present: a bare `flags & mask` test would also accept
    // a compute-only or graphics-only family.
    const uint32_t required_flags =
        VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
    for (uint32_t i = 0; i < count; i++) {
      if ((queues[i].queueFlags & required_flags) == required_flags) {
        *queue_family_index = i;
        break;
      }
    }
    IM_ASSERT(*queue_family_index != (uint32_t)-1);
  }

  // Create Logical Device (with 1 queue)
  {
    std::vector<const char*> device_extensions =
        GetDeviceExtensions(*physical_device, vulkan_features);
    const float queue_priority[] = {1.0f};
    VkDeviceQueueCreateInfo queue_info = {};
    queue_info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
    queue_info.queueFamilyIndex = *queue_family_index;
    queue_info.queueCount = 1;
    queue_info.pQueuePriorities = queue_priority;
    VkDeviceCreateInfo create_info = {};
    create_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
    create_info.queueCreateInfoCount = 1;
    create_info.pQueueCreateInfos = &queue_info;
    create_info.enabledExtensionCount =
        static_cast<uint32_t>(device_extensions.size());
    create_info.ppEnabledExtensionNames = device_extensions.data();

    // Enable timeline semaphores. The feature structs are chained through
    // pNext: create_info -> features2 -> semaphore_features.
    VkPhysicalDeviceFeatures2 features2;
    memset(&features2, 0, sizeof(features2));
    features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
    create_info.pNext = &features2;
    VkPhysicalDeviceTimelineSemaphoreFeatures semaphore_features;
    memset(&semaphore_features, 0, sizeof(semaphore_features));
    semaphore_features.sType =
        VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES;
    semaphore_features.pNext = features2.pNext;
    features2.pNext = &semaphore_features;
    semaphore_features.timelineSemaphore = VK_TRUE;

    err = vkCreateDevice(*physical_device, &create_info, allocator, device);
    check_vk_result(err);
    vkGetDeviceQueue(*device, *queue_family_index, 0, queue);
  }

  // Create Descriptor Pool
  {
    VkDescriptorPoolSize pool_sizes[] = {
        {VK_DESCRIPTOR_TYPE_SAMPLER, 1000},
        {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1000},
        {VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 1000},
        {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1000},
        {VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, 1000},
        {VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, 1000},
        {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1000},
        {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1000},
        {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, 1000},
        {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC, 1000},
        {VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT, 1000}};
    VkDescriptorPoolCreateInfo pool_info = {};
    pool_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
    pool_info.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT;
    pool_info.maxSets = 1000 * IREE_ARRAYSIZE(pool_sizes);
    pool_info.poolSizeCount = (uint32_t)IREE_ARRAYSIZE(pool_sizes);
    pool_info.pPoolSizes = pool_sizes;
    err =
        vkCreateDescriptorPool(*device, &pool_info, allocator, descriptor_pool);
    check_vk_result(err);
  }
}
// Prepares the Dear ImGui helper window |wd| for rendering to |surface|.
//
// Verifies that the queue family can present to the surface (exiting the
// process otherwise), chooses a surface format and present mode, creates the
// swapchain and its dependent objects at |width| x |height|, and stores the
// clear color.
void SetupVulkanWindow(ImGui_ImplVulkanH_Window* wd,
                       const VkAllocationCallbacks* allocator,
                       VkInstance instance, uint32_t queue_family_index,
                       VkPhysicalDevice physical_device, VkDevice device,
                       VkSurfaceKHR surface, int width, int height,
                       uint32_t min_image_count) {
  wd->Surface = surface;

  // Check for WSI support
  VkBool32 can_present;
  vkGetPhysicalDeviceSurfaceSupportKHR(physical_device, queue_family_index,
                                       wd->Surface, &can_present);
  if (can_present != VK_TRUE) {
    fprintf(stderr, "Error no WSI support on physical device 0\n");
    exit(-1);
  }

  // Select Surface Format: candidate formats in order of preference, all in
  // the sRGB non-linear color space.
  const VkFormat preferred_formats[] = {
      VK_FORMAT_B8G8R8A8_UNORM, VK_FORMAT_R8G8B8A8_UNORM,
      VK_FORMAT_B8G8R8_UNORM, VK_FORMAT_R8G8B8_UNORM};
  const VkColorSpaceKHR preferred_color_space =
      VK_COLORSPACE_SRGB_NONLINEAR_KHR;
  wd->SurfaceFormat = ImGui_ImplVulkanH_SelectSurfaceFormat(
      physical_device, wd->Surface, preferred_formats,
      (size_t)IREE_ARRAYSIZE(preferred_formats), preferred_color_space);

  // Select Present Mode
#ifdef IMGUI_UNLIMITED_FRAME_RATE
  VkPresentModeKHR candidate_modes[] = {VK_PRESENT_MODE_MAILBOX_KHR,
                                        VK_PRESENT_MODE_IMMEDIATE_KHR,
                                        VK_PRESENT_MODE_FIFO_KHR};
#else
  VkPresentModeKHR candidate_modes[] = {VK_PRESENT_MODE_FIFO_KHR};
#endif
  wd->PresentMode = ImGui_ImplVulkanH_SelectPresentMode(
      physical_device, wd->Surface, candidate_modes,
      IREE_ARRAYSIZE(candidate_modes));

  // Create SwapChain, RenderPass, Framebuffer, etc.
  IM_ASSERT(min_image_count >= 2);
  ImGui_ImplVulkanH_CreateOrResizeWindow(instance, physical_device, device, wd,
                                         queue_family_index, allocator, width,
                                         height, min_image_count);

  // Set clear color (RGBA).
  wd->ClearValue.color.float32[0] = 0.45f;
  wd->ClearValue.color.float32[1] = 0.55f;
  wd->ClearValue.color.float32[2] = 0.60f;
  wd->ClearValue.color.float32[3] = 1.00f;
}
// Records and submits the command buffer that draws the current Dear ImGui
// frame into the next swapchain image of |wd|.
//
// The submission waits on the frame's image-acquired semaphore and signals
// its render-complete semaphore and fence. Every Vulkan call is checked with
// check_vk_result, which aborts on any non-zero result.
// NOTE(review): vkAcquireNextImageKHR can return non-zero "out of date" /
// "suboptimal" codes for a resized window, which would abort here — confirm
// whether the caller rebuilds the swapchain before that can happen.
void RenderFrame(ImGui_ImplVulkanH_Window* wd, VkDevice device, VkQueue queue) {
VkResult err;
// Semaphore pair for the current position in the semaphore ring.
VkSemaphore image_acquired_semaphore =
wd->FrameSemaphores[wd->SemaphoreIndex].ImageAcquiredSemaphore;
VkSemaphore render_complete_semaphore =
wd->FrameSemaphores[wd->SemaphoreIndex].RenderCompleteSemaphore;
// Acquire the next image; its index is stored in wd->FrameIndex.
err = vkAcquireNextImageKHR(device, wd->Swapchain, UINT64_MAX,
image_acquired_semaphore, VK_NULL_HANDLE,
&wd->FrameIndex);
check_vk_result(err);
ImGui_ImplVulkanH_Frame* fd = &wd->Frames[wd->FrameIndex];
// Wait until the previous submission using this frame's resources is done,
// then re-arm the fence for this submission.
{
err = vkWaitForFences(
device, 1, &fd->Fence, VK_TRUE,
UINT64_MAX); // wait indefinitely instead of periodically checking
check_vk_result(err);
err = vkResetFences(device, 1, &fd->Fence);
check_vk_result(err);
}
// Recycle the frame's command pool and begin a one-time-submit recording.
{
err = vkResetCommandPool(device, fd->CommandPool, 0);
check_vk_result(err);
VkCommandBufferBeginInfo info = {};
info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
info.flags |= VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
err = vkBeginCommandBuffer(fd->CommandBuffer, &info);
check_vk_result(err);
}
// Begin the render pass covering the whole window with the stored clear
// value.
{
VkRenderPassBeginInfo info = {};
info.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
info.renderPass = wd->RenderPass;
info.framebuffer = fd->Framebuffer;
info.renderArea.extent.width = wd->Width;
info.renderArea.extent.height = wd->Height;
info.clearValueCount = 1;
info.pClearValues = &wd->ClearValue;
vkCmdBeginRenderPass(fd->CommandBuffer, &info, VK_SUBPASS_CONTENTS_INLINE);
}
// Record Imgui Draw Data and draw funcs into command buffer
ImGui_ImplVulkan_RenderDrawData(ImGui::GetDrawData(), fd->CommandBuffer);
// Submit command buffer
vkCmdEndRenderPass(fd->CommandBuffer);
{
// Color output waits for the acquired image; completion signals the
// render-complete semaphore and the frame fence.
VkPipelineStageFlags wait_stage =
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
VkSubmitInfo info = {};
info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
info.waitSemaphoreCount = 1;
info.pWaitSemaphores = &image_acquired_semaphore;
info.pWaitDstStageMask = &wait_stage;
info.commandBufferCount = 1;
info.pCommandBuffers = &fd->CommandBuffer;
info.signalSemaphoreCount = 1;
info.pSignalSemaphores = &render_complete_semaphore;
err = vkEndCommandBuffer(fd->CommandBuffer);
check_vk_result(err);
err = vkQueueSubmit(queue, 1, &info, fd->Fence);
check_vk_result(err);
}
}
// Presents the image rendered for the current frame of |wd| on |queue| and
// advances to the next semaphore pair. Presentation waits on the frame's
// render-complete semaphore; any non-zero result aborts via check_vk_result.
void PresentFrame(ImGui_ImplVulkanH_Window* wd, VkQueue queue) {
  VkSemaphore wait_semaphore =
      wd->FrameSemaphores[wd->SemaphoreIndex].RenderCompleteSemaphore;

  VkPresentInfoKHR present_info = {};
  present_info.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
  present_info.waitSemaphoreCount = 1;
  present_info.pWaitSemaphores = &wait_semaphore;
  present_info.swapchainCount = 1;
  present_info.pSwapchains = &wd->Swapchain;
  present_info.pImageIndices = &wd->FrameIndex;

  check_vk_result(vkQueuePresentKHR(queue, &present_info));

  // Now we can use the next set of semaphores.
  wd->SemaphoreIndex = (wd->SemaphoreIndex + 1) % wd->ImageCount;
}
// Destroys the global Vulkan objects owned by this sample: the descriptor
// pool, then the logical device, then the instance. All three calls use the
// global g_Allocator callbacks.
static void CleanupVulkan() {
// The pool is destroyed through g_Device, so it has to go before the device.
vkDestroyDescriptorPool(g_Device, g_DescriptorPool, g_Allocator);
vkDestroyDevice(g_Device, g_Allocator);
// The instance is destroyed last, after the device created from it.
vkDestroyInstance(g_Instance, g_Allocator);
}
// Hands the main window data (g_MainWindowData) to the ImGui Vulkan helper
// for destruction, using the global instance, device and allocator.
// In iree_main this is called before CleanupVulkan().
static void CleanupVulkanWindow() {
ImGui_ImplVulkanH_DestroyWindow(g_Instance, g_Device, &g_MainWindowData,
g_Allocator);
}
namespace iree {

// Entry point of the Vulkan inference GUI sample.
//
// Steps performed:
//   1. Parses IREE flags from argv; any leftover positional argument is an
//      error.
//   2. Creates an SDL window, the Vulkan instance/device (SetupVulkan), the
//      window surface/swapchain (SetupVulkanWindow) and a Dear ImGui context.
//   3. Creates an IREE VM instance plus a Vulkan HAL driver/device that wrap
//      the VkInstance/VkDevice created in step 2.
//   4. Loads the module selected by flags, resolves "module.forward" and
//      parses the function inputs given by flags.
//   5. Runs the main loop: every frame the function is invoked synchronously
//      and an ImGui window with a frame-rate counter is rendered.
//   6. Releases IREE, ImGui, Vulkan and SDL resources.
//
// Args:
//   argc/argv: process command line (consumed by iree_flags_parse_checked).
//
// Returns:
//   0 on normal exit, 1 when an unexpected positional argument is present,
//   -1 when the module cannot be loaded or the inputs cannot be parsed.
//   SDL setup failures and an unsupported runtime API version call abort().
//   Other IREE failures go through IREE_CHECK_OK and Vulkan failures through
//   check_vk_result (both defined elsewhere).
extern "C" int iree_main(int argc, char** argv) {
  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv);
  if (argc > 1) {
    // Avoid iree-run-module spinning endlessly on stdin if the user uses
    // single dashes for flags.
    printf(
        "[ERROR] unexpected positional argument (expected none)."
        " Did you use pass a flag with a single dash ('-')?"
        " Use '--' instead.\n");
    return 1;
  }

  // --------------------------------------------------------------------------
  // Create a window.
  if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) != 0) {
    fprintf(stderr, "Failed to initialize SDL\n");
    abort();
  }

  // Setup window
  // clang-format off
  SDL_WindowFlags window_flags = (SDL_WindowFlags)(
      SDL_WINDOW_VULKAN | SDL_WINDOW_RESIZABLE | SDL_WINDOW_ALLOW_HIGHDPI);
  // clang-format on
  SDL_Window* window = SDL_CreateWindow(
      "IREE Samples - Vulkan Inference GUI", SDL_WINDOWPOS_CENTERED,
      SDL_WINDOWPOS_CENTERED, 1280, 720, window_flags);
  if (window == nullptr) {
    const char* sdl_err = SDL_GetError();
    fprintf(stderr, "Error, SDL_CreateWindow returned: %s\n", sdl_err);
    abort();
  }

  // Setup Vulkan
  iree_hal_vulkan_features_t iree_vulkan_features =
      static_cast<iree_hal_vulkan_features_t>(
          IREE_HAL_VULKAN_FEATURE_ENABLE_VALIDATION_LAYERS |
          IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS);
  std::vector<const char*> layers = GetInstanceLayers(iree_vulkan_features);
  std::vector<const char*> extensions =
      GetInstanceExtensions(window, iree_vulkan_features);
  SetupVulkan(iree_vulkan_features, layers.data(),
              static_cast<uint32_t>(layers.size()), extensions.data(),
              static_cast<uint32_t>(extensions.size()), g_Allocator,
              &g_Instance, &g_QueueFamily, &g_PhysicalDevice, &g_Queue,
              &g_Device, &g_DescriptorPool);

  // Create Window Surface
  VkSurfaceKHR surface;
  VkResult err;
  if (SDL_Vulkan_CreateSurface(window, g_Instance, &surface) == 0) {
    fprintf(stderr, "Failed to create Vulkan surface.\n");
    abort();
  }

  // Create Framebuffers
  int w, h;
  SDL_GetWindowSize(window, &w, &h);
  ImGui_ImplVulkanH_Window* wd = &g_MainWindowData;
  SetupVulkanWindow(wd, g_Allocator, g_Instance, g_QueueFamily,
                    g_PhysicalDevice, g_Device, surface, w, h, g_MinImageCount);

  // Setup Dear ImGui context
  IMGUI_CHECKVERSION();
  ImGui::CreateContext();
  ImGuiIO& io = ImGui::GetIO();
  (void)io;
  ImGui::StyleColorsDark();

  // Setup Platform/Renderer bindings
  ImGui_ImplSDL2_InitForVulkan(window);
  ImGui_ImplVulkan_InitInfo init_info = {};
  init_info.Instance = g_Instance;
  init_info.PhysicalDevice = g_PhysicalDevice;
  init_info.Device = g_Device;
  init_info.QueueFamily = g_QueueFamily;
  init_info.Queue = g_Queue;
  init_info.PipelineCache = g_PipelineCache;
  init_info.DescriptorPool = g_DescriptorPool;
  init_info.Allocator = g_Allocator;
  init_info.MinImageCount = g_MinImageCount;
  init_info.ImageCount = wd->ImageCount;
  init_info.CheckVkResultFn = check_vk_result;
  ImGui_ImplVulkan_Init(&init_info, wd->RenderPass);

  // Upload Fonts
  {
    // Use any command queue
    VkCommandPool command_pool = wd->Frames[wd->FrameIndex].CommandPool;
    VkCommandBuffer command_buffer = wd->Frames[wd->FrameIndex].CommandBuffer;

    err = vkResetCommandPool(g_Device, command_pool, 0);
    check_vk_result(err);
    VkCommandBufferBeginInfo begin_info = {};
    begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
    begin_info.flags |= VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
    err = vkBeginCommandBuffer(command_buffer, &begin_info);
    check_vk_result(err);

    ImGui_ImplVulkan_CreateFontsTexture(command_buffer);

    VkSubmitInfo end_info = {};
    end_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    end_info.commandBufferCount = 1;
    end_info.pCommandBuffers = &command_buffer;
    err = vkEndCommandBuffer(command_buffer);
    check_vk_result(err);
    err = vkQueueSubmit(g_Queue, 1, &end_info, VK_NULL_HANDLE);
    check_vk_result(err);

    // Block until the upload has finished so the staging objects can go.
    err = vkDeviceWaitIdle(g_Device);
    check_vk_result(err);
    ImGui_ImplVulkan_DestroyFontUploadObjects();
  }

  // Demo state.
  bool show_iree_window = true;

  // --------------------------------------------------------------------------
  // Setup IREE.

  // Check API version.
  iree_api_version_t actual_version;
  iree_status_t status =
      iree_api_version_check(IREE_API_VERSION_LATEST, &actual_version);
  if (iree_status_is_ok(status)) {
    fprintf(stdout, "IREE runtime API version: %d\n", actual_version);
  } else {
    fprintf(stderr, "Unsupported runtime API version: %d\n", actual_version);
    abort();
  }

  // Create a runtime Instance.
  iree_vm_instance_t* iree_instance = nullptr;
  IREE_CHECK_OK(
      iree_vm_instance_create(iree_allocator_system(), &iree_instance));

  // Register HAL drivers and VM module types.
  IREE_CHECK_OK(iree_hal_vulkan_driver_module_register(
      iree_hal_driver_registry_default()));
  IREE_CHECK_OK(iree_hal_module_register_all_types(iree_instance));

  // Create IREE Vulkan Driver and Device, sharing our VkInstance/VkDevice.
  fprintf(stdout, "Creating Vulkan driver/device\n");
  // Load symbols from our static `vkGetInstanceProcAddr` for IREE to use.
  iree_hal_vulkan_syms_t* iree_vk_syms = nullptr;
  IREE_CHECK_OK(iree_hal_vulkan_syms_create(
      reinterpret_cast<void*>(&vkGetInstanceProcAddr), iree_allocator_system(),
      &iree_vk_syms));

  // Create the driver sharing our VkInstance.
  iree_hal_driver_t* iree_vk_driver = nullptr;
  iree_string_view_t driver_identifier = iree_make_cstring_view("vulkan");
  // Value-initialize: only two fields are assigned below, yet
  // driver_options.device_options is also handed to
  // iree_hal_vulkan_wrap_device() and must not contain indeterminate values.
  iree_hal_vulkan_driver_options_t driver_options = {};
  driver_options.api_version = VK_API_VERSION_1_0;
  driver_options.requested_features = static_cast<iree_hal_vulkan_features_t>(
      IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS);
  IREE_CHECK_OK(iree_hal_vulkan_driver_create_using_instance(
      driver_identifier, &driver_options, iree_vk_syms, g_Instance,
      iree_allocator_system(), &iree_vk_driver));

  // Create a device sharing our VkDevice and queue.
  // We could also create a separate (possibly low priority) compute queue for
  // IREE, and/or provide a dedicated transfer queue.
  iree_string_view_t device_identifier = iree_make_cstring_view("vulkan");
  // Both queue sets are value-initialized so that no field is passed to IREE
  // with an indeterminate value (transfer_queue_set previously left
  // queue_family_index unset).
  iree_hal_vulkan_queue_set_t compute_queue_set = {};
  compute_queue_set.queue_family_index = g_QueueFamily;
  compute_queue_set.queue_indices = 1 << 0;  // bit 0 set
  iree_hal_vulkan_queue_set_t transfer_queue_set = {};
  transfer_queue_set.queue_indices = 0;  // no queue bits set
  iree_hal_device_t* iree_vk_device = nullptr;
  IREE_CHECK_OK(iree_hal_vulkan_wrap_device(
      device_identifier, &driver_options.device_options, iree_vk_syms,
      g_Instance, g_PhysicalDevice, g_Device, &compute_queue_set,
      &transfer_queue_set, iree_allocator_system(), &iree_vk_device));

  // Create a HAL module using the HAL device.
  iree_vm_module_t* hal_module = nullptr;
  IREE_CHECK_OK(iree_hal_module_create(iree_instance, iree_vk_device,
                                       IREE_HAL_MODULE_FLAG_NONE,
                                       iree_allocator_system(), &hal_module));

  // Load the bytecode module selected on the command line.
  iree_vm_module_t* bytecode_module = nullptr;
  iree_status_t module_status = iree_tooling_load_module_from_flags(
      iree_instance, iree_allocator_system(), &bytecode_module);
  if (!iree_status_is_ok(module_status)) {
    // NOTE(review): the status object and everything created above are not
    // released on this path; the process is expected to exit right away.
    fprintf(stderr, "Failed to load the module specified by flags\n");
    return -1;
  }

  // Allocate a context that will hold the module state across invocations.
  iree_vm_context_t* iree_context = nullptr;
  std::vector<iree_vm_module_t*> modules = {hal_module, bytecode_module};
  IREE_CHECK_OK(iree_vm_context_create_with_modules(
      iree_instance, IREE_VM_CONTEXT_FLAG_NONE, modules.size(), modules.data(),
      iree_allocator_system(), &iree_context));
  fprintf(stdout, "Context with modules is ready for use\n");

  // Lookup the entry point function.
  iree_vm_function_t main_function;
  const char kMainFunctionName[] = "module.forward";
  IREE_CHECK_OK(iree_vm_context_resolve_function(
      iree_context,
      iree_string_view_t{kMainFunctionName, sizeof(kMainFunctionName) - 1},
      &main_function));
  iree_string_view_t main_function_name = iree_vm_function_name(&main_function);
  fprintf(stdout, "Resolved main function named '%.*s'\n",
          (int)main_function_name.size, main_function_name.data);

  // --------------------------------------------------------------------------
  // Parse the function inputs given by flags into a variant list.
  iree_hal_allocator_t* allocator =
      iree_hal_device_allocator(iree_vk_device);
  vm::ref<iree_vm_list_t> inputs;
  iree_status_t input_status = ParseToVariantList(
      allocator,
      iree::span<const std::string>{FLAG_function_inputs.data(),
                                    FLAG_function_inputs.size()},
      iree_allocator_system(), &inputs);
  if (!iree_status_is_ok(input_status)) {
    // NOTE(review): same caveat as the module-load failure path above.
    fprintf(stderr, "Failed to parse the function inputs given by flags\n");
    return -1;
  }

  // Initial element capacity of the per-invocation outputs list. This is
  // only a starting reservation. The previous value was a byte count
  // (kOutputCount * sizeof(float)), which does not match the parameter's
  // meaning (a number of list elements).
  constexpr size_t kOutputListInitialCapacity = 8;

  // --------------------------------------------------------------------------
  // Main loop.
  bool done = false;
  while (!done) {
    SDL_Event event;
    while (SDL_PollEvent(&event)) {
      ImGui_ImplSDL2_ProcessEvent(&event);
      if (event.type == SDL_QUIT) done = true;
      if (event.type == SDL_WINDOWEVENT &&
          event.window.event == SDL_WINDOWEVENT_RESIZED &&
          event.window.windowID == SDL_GetWindowID(window)) {
        g_SwapChainResizeWidth = (int)event.window.data1;
        g_SwapChainResizeHeight = (int)event.window.data2;
        g_SwapChainRebuild = true;
      }
    }

    if (g_SwapChainRebuild) {
      g_SwapChainRebuild = false;
      ImGui_ImplVulkan_SetMinImageCount(g_MinImageCount);
      ImGui_ImplVulkanH_CreateOrResizeWindow(
          g_Instance, g_PhysicalDevice, g_Device, &g_MainWindowData,
          g_QueueFamily, g_Allocator, g_SwapChainResizeWidth,
          g_SwapChainResizeHeight, g_MinImageCount);
      g_MainWindowData.FrameIndex = 0;
    }

    // Start the Dear ImGui frame
    ImGui_ImplVulkan_NewFrame();
    ImGui_ImplSDL2_NewFrame(window);
    ImGui::NewFrame();

    // Custom window.
    {
      ImGui::Begin("IREE Vulkan Integration Demo", &show_iree_window);
      ImGui::Separator();

      static bool dirty = true;
      if (dirty) {
        // Use a fresh outputs list for every invocation. Reusing a single
        // list across frames makes it accumulate the results of every call,
        // so it (and the buffers it retains) would grow without bound while
        // the loop runs. The list is released at the end of this scope.
        vm::ref<iree_vm_list_t> outputs;
        IREE_CHECK_OK(iree_vm_list_create(
            /*element_type=*/nullptr, kOutputListInitialCapacity,
            iree_allocator_system(), &outputs));

        // Synchronously invoke the function.
        IREE_CHECK_OK(iree_vm_invoke(iree_context, main_function,
                                     IREE_VM_INVOCATION_FLAG_NONE,
                                     /*policy=*/nullptr, inputs.get(),
                                     outputs.get(), iree_allocator_system()));

        // we want to run continuously so we can use tools like RenderDoc,
        // RGP, etc...
        dirty = true;
      }

      // Framerate counter.
      ImGui::Text("Application average %.3f ms/frame (%.1f FPS)",
                  1000.0f / ImGui::GetIO().Framerate, ImGui::GetIO().Framerate);
      ImGui::End();
    }

    // Rendering
    ImGui::Render();
    RenderFrame(wd, g_Device, g_Queue);
    PresentFrame(wd, g_Queue);
  }

  // --------------------------------------------------------------------------
  // Cleanup

  // Drop the input list (and whatever it retains) while the IREE device and
  // the VkDevice are still alive; otherwise it would only be released when
  // this function returns, after vkDestroyDevice() in CleanupVulkan().
  inputs.reset();

  iree_vm_module_release(hal_module);
  iree_vm_module_release(bytecode_module);
  iree_vm_context_release(iree_context);
  iree_hal_device_release(iree_vk_device);
  // NOTE(review): `allocator` came from iree_hal_device_allocator(); confirm
  // that call hands out an owned reference. If it is borrowed, this release
  // is unbalanced and should be removed.
  iree_hal_allocator_release(allocator);
  iree_hal_driver_release(iree_vk_driver);
  iree_hal_vulkan_syms_release(iree_vk_syms);
  iree_vm_instance_release(iree_instance);

  err = vkDeviceWaitIdle(g_Device);
  check_vk_result(err);
  ImGui_ImplVulkan_Shutdown();
  ImGui_ImplSDL2_Shutdown();
  ImGui::DestroyContext();

  CleanupVulkanWindow();
  CleanupVulkan();

  SDL_DestroyWindow(window);
  SDL_Quit();
  // --------------------------------------------------------------------------
  return 0;
}

}  // namespace iree

File diff suppressed because it is too large Load Diff

View File

@@ -2,23 +2,19 @@
"""SHARK Tank"""
# python generate_sharktank.py, you have to give a csv file with [model_name, model_download_url]
# will generate local shark tank folder like this:
# HOME
# /.local
# /shark_tank
# /albert_lite_base
# /...model_name...
# /SHARK
# /gen_shark_tank
# /albert_lite_base
# /...model_name...
#
import os
import csv
import argparse
from shark.shark_importer import SharkImporter
from shark.parser import shark_args
import tensorflow as tf
import subprocess as sp
import hashlib
import numpy as np
from pathlib import Path
visible_default = tf.config.list_physical_devices("GPU")
try:
@@ -30,6 +26,9 @@ except:
# Invalid device or cannot modify virtual devices once initialized.
pass
# All generated models and metadata will be saved under this directory.
WORKDIR = os.path.join(os.path.dirname(__file__), "gen_shark_tank")
def create_hash(file_name):
with open(file_name, "rb") as f:
@@ -43,7 +42,6 @@ def create_hash(file_name):
def save_torch_model(torch_model_list):
from tank.model_utils import get_hf_model
from tank.model_utils import get_vision_model
from tank.model_utils import get_hf_img_cls_model
with open(torch_model_list) as csvfile:
torch_reader = csv.reader(csvfile, delimiter=",")
@@ -52,10 +50,8 @@ def save_torch_model(torch_model_list):
torch_model_name = row[0]
tracing_required = row[1]
model_type = row[2]
is_dynamic = row[3]
tracing_required = False if tracing_required == "False" else True
is_dynamic = False if is_dynamic == "False" else True
model = None
input = None
@@ -63,8 +59,6 @@ def save_torch_model(torch_model_list):
model, input, _ = get_vision_model(torch_model_name)
elif model_type == "hf":
model, input, _ = get_hf_model(torch_model_name)
elif model_type == "hf_img_cls":
model, input, _ = get_hf_img_cls_model(torch_model_name)
torch_model_name = torch_model_name.replace("/", "_")
torch_model_dir = os.path.join(
@@ -90,22 +84,17 @@ def save_torch_model(torch_model_list):
)
np.save(os.path.join(torch_model_dir, "hash"), np.array(mlir_hash))
# Generate torch dynamic models.
if is_dynamic:
mlir_importer.import_debug(
is_dynamic=True,
tracing_required=tracing_required,
dir=torch_model_dir,
model_name=torch_model_name + "_dynamic",
)
mlir_importer.import_debug(
is_dynamic=True,
tracing_required=tracing_required,
dir=torch_model_dir,
model_name=torch_model_name + "_dynamic",
)
def save_tf_model(tf_model_list):
from tank.model_utils_tf import (
get_causal_image_model,
get_causal_lm_model,
get_keras_model,
get_TFhf_model,
)
from tank.model_utils_tf import get_causal_lm_model
from tank.model_utils_tf import get_causal_image_model
with open(tf_model_list) as csvfile:
tf_reader = csv.reader(csvfile, delimiter=",")
@@ -116,15 +105,11 @@ def save_tf_model(tf_model_list):
model = None
input = None
print(f"Generating artifacts for model {tf_model_name}")
print(model_type)
if model_type == "hf":
model, input, _ = get_causal_lm_model(tf_model_name)
if model_type == "img":
model, input, _ = get_causal_image_model(tf_model_name)
if model_type == "keras":
model, input, _ = get_keras_model(tf_model_name)
if model_type == "TFhf":
model, input, _ = get_TFhf_model(tf_model_name)
tf_model_name = tf_model_name.replace("/", "_")
tf_model_dir = os.path.join(WORKDIR, str(tf_model_name) + "_tf")
@@ -205,14 +190,14 @@ if __name__ == "__main__":
parser.add_argument(
"--torch_model_csv",
type=lambda x: is_valid_file(x),
default="./tank/torch_model_list.csv",
default="./tank/pytorch/torch_model_list.csv",
help="""Contains the file with torch_model name and args.
Please see: https://github.com/nod-ai/SHARK/blob/main/tank/torch_model_list.csv""",
Please see: https://github.com/nod-ai/SHARK/blob/main/tank/pytorch/torch_model_list.csv""",
)
parser.add_argument(
"--tf_model_csv",
type=lambda x: is_valid_file(x),
default="./tank/tf_model_list.csv",
default="./tank/tf/tf_model_list.csv",
help="Contains the file with tf model name and args.",
)
parser.add_argument(
@@ -221,21 +206,9 @@ if __name__ == "__main__":
default="./tank/tflite/tflite_model_list.csv",
help="Contains the file with tf model name and args.",
)
parser.add_argument(
"--ci_tank_dir",
type=bool,
default=False,
)
parser.add_argument("--upload", type=bool, default=False)
args = parser.parse_args()
home = str(Path.home())
if args.ci_tank_dir == True:
WORKDIR = os.path.join(os.path.dirname(__file__), "gen_shark_tank")
else:
WORKDIR = os.path.join(home, ".local/shark_tank/")
if args.torch_model_csv:
save_torch_model(args.torch_model_csv)
@@ -246,6 +219,5 @@ if __name__ == "__main__":
save_tflite_model(args.tflite_model_csv)
if args.upload:
git_hash = sp.getoutput("git log -1 --format='%h'") + "/"
print("uploading files to gs://shark_tank/" + git_hash)
os.system(f"gsutil cp -r {WORKDIR}* gs://shark_tank/" + git_hash)
print("uploading files to gs://shark_tank/")
os.system("gsutil cp -r ./gen_shark_tank/* gs://shark_tank/")

View File

@@ -4,9 +4,9 @@ requires = [
"wheel",
"packaging",
"numpy>=1.22.4",
"torch-mlir>=20221021.633",
"iree-compiler>=20221022.190",
"iree-runtime>=20221022.190",
"numpy==1.22.4",
"torch-mlir>=20220428.420",
"iree-compiler>=20220427.13",
"iree-runtime>=20220427.13",
]
build-backend = "setuptools.build_meta"

View File

@@ -1,3 +1,3 @@
[pytest]
addopts = --verbose -p no:warnings
norecursedirs = inference tank/tflite examples benchmarks shark
norecursedirs = inference tank/tflite

View File

@@ -1,4 +1,4 @@
-f https://download.pytorch.org/whl/nightly/cpu/
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
--pre
numpy
@@ -19,17 +19,13 @@ tensorflow-macos
tensorflow-metal
#tf-models-nightly
#tensorflow-text-nightly
transformers
transformers==4.18.0
tensorflow-probability
#jax[cpu]
# tflitehub dependencies.
Pillow
# web dependencies.
gradio
altair
# Testing and support.
#lit
#pyyaml

View File

@@ -2,6 +2,7 @@
--pre
numpy==1.22.4
torch
torchvision
tqdm
@@ -13,12 +14,10 @@ iree-tools-tf
# TensorFlow and JAX.
gin-config
tensorflow==2.10
keras==2.10
tensorflow
#tf-models-nightly
#tensorflow-text-nightly
transformers
diffusers
transformers==4.18.0
#tensorflow-probability
#jax[cpu]
@@ -29,13 +28,6 @@ Pillow
# Testing and support.
lit
pyyaml
python-dateutil
sacremoses
# web dependencies.
gradio
altair
scipy
#ONNX and ORT for benchmarking
#--extra-index-url https://test.pypi.org/simple/

View File

@@ -1,23 +1,13 @@
setuptools
wheel
pyinstaller
# SHARK Runner
tqdm
# SHARK Downloader
google-cloud-storage
gsutil
# Testing
pytest
pytest-xdist
Pillow
parameterized
# Add transformers, diffusers and scipy since it most commonly used
transformers
diffusers
scipy
ftfy
gradio
altair

View File

@@ -6,19 +6,7 @@ import os
with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()
with open("build_tools/shark_versions.txt", "r") as sv:
lines = [line.rstrip() for line in sv]
TM_VERSION = lines[7]
IREE_VERSION = lines[3]
PACKAGE_VERSION = os.environ.get("SHARK_PACKAGE_VERSION") or "0.0.4"
backend_deps = []
if "NO_BACKEND" in os.environ.keys():
backend_deps = [
f"iree-compiler=={IREE_VERSION}",
f"iree-runtime>={IREE_VERSION}",
]
setup(
name="nodai-SHARK",
@@ -39,11 +27,12 @@ setup(
"Operating System :: OS Independent",
],
packages=find_packages(exclude=("examples")),
python_requires=">=3.9",
python_requires=">=3.7",
install_requires=[
"numpy",
"PyYAML",
f"torch-mlir=={TM_VERSION}",
]
+ backend_deps,
"torch-mlir>=20220428.420",
"iree-compiler>=20220427.13",
"iree-runtime>=20220427.13",
],
)

View File

@@ -1,39 +0,0 @@
#Write-Host "Installing python"
#Start-Process winget install Python.Python.3.10 '/quiet InstallAllUsers=1 PrependPath=1' -wait -NoNewWindow
#Write-Host "python installation completed successfully"
#Write-Host "Reload environment variables"
#$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
#Write-Host "Reloaded environment variables"
# Run `python -V` with stderr redirected into stdout, so the version text is
# captured regardless of which stream the interpreter writes it to.
$p = &{python -V} 2>&1

# When the output arrived on stderr PowerShell wraps it in an ErrorRecord;
# in that case take the version string from the record's message.
$version = if ($p -is [System.Management.Automation.ErrorRecord])
{
    # grab the version string from the error message
    $p.Exception.Message
}
else
{
    # otherwise return as is
    $p
}

Write-Host "Python version found is"
# Print the extracted string. The script previously printed the raw $p and
# never used $version.
Write-Host $version

# Create and activate a local virtual environment, then install the build
# dependencies, torch-mlir/torch nightlies and the IREE compiler/runtime.
Write-Host "Installing Build Dependencies"
python -m venv .\shark.venv\
.\shark.venv\Scripts\activate
pip install -r requirements.txt
pip install --pre torch-mlir torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/
pip install --upgrade -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html iree-compiler iree-runtime

# Install SHARK itself in editable mode, using the same package indexes.
Write-Host "Building SHARK..."
pip install -e . -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
Write-Host "Build and installation completed successfully"
Write-Host "Source your venv with ./shark.venv/Scripts/activate"

View File

@@ -7,8 +7,6 @@
# VENV_DIR=myshark.venv #create a venv called myshark.venv
# USE_IREE=1 #use stock IREE instead of Nod.ai's SHARK build
# IMPORTER=1 #Install importer deps
# BENCHMARK=1 #Install benchmark deps
# NO_BACKEND=1 #Don't install iree or shark backend
# if you run the script from a conda env it will install in your conda env
TD="$(cd $(dirname $0) && pwd)"
@@ -76,19 +74,11 @@ fi
$PYTHON -m pip install --upgrade pip || die "Could not upgrade pip"
$PYTHON -m pip install --upgrade -r "$TD/requirements.txt"
if [ "$torch_mlir_bin" = true ]; then
TM_VERSION=$(sed '8q;d' build_tools/shark_versions.txt)
if [[ $(uname -s) = 'Darwin' ]]; then
echo "MacOS detected. Installing torch-mlir from .whl, to avoid dependency problems with torch."
$PYTHON -m pip install --pre --no-cache-dir torch-mlir==${TM_VERSION} -f https://llvm.github.io/torch-mlir/package-index/ -f https://download.pytorch.org/whl/nightly/torch/
elif [[ ! -z "${NIGHTLY}" ]]; then
$PYTHON -m pip install --pre torch-mlir -f https://llvm.github.io/torch-mlir/package-index/
$PYTHON -m pip install --find-links https://github.com/llvm/torch-mlir/releases torch-mlir --extra-index-url https://download.pytorch.org/whl/nightly/cpu
if [ $? -eq 0 ];then
echo "Successfully Installed torch-mlir"
else
$PYTHON -m pip install --pre torch-mlir==${TM_VERSION} -f https://llvm.github.io/torch-mlir/package-index/
if [ $? -eq 0 ]; then
echo "Successfully Installed torch-mlir"
else
echo "Could not install torch-mlir" >&2
fi
echo "Could not install torch-mlir" >&2
fi
else
echo "${Red}No binaries found for Python $PYTHON_VERSION_X_Y on $(uname -s)"
@@ -97,65 +87,34 @@ else
exit 1
fi
if [[ -z "${USE_IREE}" ]]; then
rm .use-iree
RUNTIME="https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html"
RUNTIME_VERSION=$(sed '4q;d' build_tools/shark_versions.txt)
TM_VERSION=$(sed '8q;d' build_tools/shark_versions.txt)
RUNTIME="nod-ai/SHARK-Runtime"
else
touch ./.use-iree
RUNTIME="https://iree-org.github.io/iree/pip-release-links.html"
RUNTIME_VERSION=$(sed '2q;d' build_tools/shark_versions.txt)
TM_VERSION=$(sed '6q;d' build_tools/shark_versions.txt)
fi
if [[ -z "${NO_BACKEND}" ]]; then
echo "Installing ${RUNTIME}..."
$PYTHON -m pip install --upgrade --find-links ${RUNTIME} iree-compiler==${RUNTIME_VERSION} iree-runtime==${RUNTIME_VERSION}
else
echo "Not installing a backend, please make sure to add your backend to PYTHONPATH"
RUNTIME="google/iree"
fi
echo "Installing ${RUNTIME}..."
$PYTHON -m pip install --find-links https://github.com/${RUNTIME}/releases iree-compiler iree-runtime
if [[ ! -z "${IMPORTER}" ]]; then
echo "${Yellow}Installing importer tools.."
if [[ $(uname -s) = 'Linux' ]]; then
echo "${Yellow}Linux detected.. installing Linux importer tools"
#Always get the importer tools from upstream IREE
$PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer.txt" -f https://iree-org.github.io/iree/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
$PYTHON -m pip install --upgrade -r "$TD/requirements-importer.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://test.pypi.org/simple/ --extra-index-url https://download.pytorch.org/whl/nightly/cu116
elif [[ $(uname -s) = 'Darwin' ]]; then
echo "${Yellow}macOS detected.. installing macOS importer tools"
#Conda seems to have some problems installing these packages and hope they get resolved upstream.
$PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer-macos.txt" -f ${RUNTIME} --extra-index-url https://download.pytorch.org/whl/nightly/cpu
$PYTHON -m pip install --upgrade -r "$TD/requirements-importer-macos.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
fi
fi
$PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f https://download.pytorch.org/whl/nightly/torch/
$PYTHON -m pip install -e . --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://github.com/llvm/torch-mlir/releases -f https://github.com/${RUNTIME}/releases
if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
if [[ $(uname -s) = 'Linux' && ! -z "${IMPORTER}" ]]; then
$PYTHON -m pip uninstall -y torch torchvision
$PYTHON -m pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu117
$PYTHON -m pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116
if [ $? -eq 0 ];then
echo "Successfully Installed torch + cu117."
echo "Successfully Installed torch + cu116."
else
echo "Could not install torch + cu117." >&2
fi
fi
if [[ ! -z "${ONNX}" ]]; then
echo "${Yellow}Installing ONNX and onnxruntime for benchmarks..."
$PYTHON -m pip install onnx onnxruntime psutil
if [ $? -eq 0 ];then
echo "Successfully installed ONNX and ONNX runtime."
else
echo "Could not install ONNX." >&2
fi
fi
if [[ ! -z "${NIGHTLY}" ]]; then
$PYTHON -m pip install --upgrade --pre iree-compiler iree-runtime torch-mlir -f https://llvm.github.io/torch-mlir/package-index/ -f $RUNTIME -f https://download.pytorch.org/whl/nightly/torch/
if [ $? -eq 0 ];then
echo "Successfully Installed latest packages for nightly job."
else
echo "Could not install latest IREE and Torch-MLIR." >&2
echo "Could not install torch + cu116." >&2
fi
fi

View File

@@ -1,70 +0,0 @@
import torchdynamo
import torch
import torch_mlir
from shark.sharkdynamo.utils import make_shark_compiler
import warnings, logging
# Silence Python warnings and restrict TorchDynamo logging to errors.
warnings.simplefilter("ignore")
torchdynamo.config.log_level = logging.ERROR


# --- Example 1: a single elementwise multiply. ------------------------------
torchdynamo.reset()


@torchdynamo.optimize(
    make_shark_compiler(use_tracing=False, device="cuda", verbose=False)
)
def foo(t):
    return 2 * t


example_input = torch.rand((2, 3))
print(foo(example_input))


# --- Example 2: a branch that depends on tensor data. -----------------------
torchdynamo.reset()


@torchdynamo.optimize(
    make_shark_compiler(use_tracing=False, device="cuda", verbose=False)
)
def foo(a, b):
    x = a / (a + 1)
    if b.sum() < 0:
        b = b * -1
    return x * b


lhs = torch.rand((2, 3))
rhs = -torch.rand((2, 3))
print(foo(lhs, rhs))


# --- Example 3: a Python loop with in-place adds (verbose compiler). --------
torchdynamo.reset()


@torchdynamo.optimize(
    make_shark_compiler(use_tracing=False, device="cuda", verbose=True)
)
def foo(a):
    for _ in range(10):
        a += 1.0
    return a


loop_input = torch.rand((1, 2))
print(foo(loop_input))


# --- Example 4: a non-tensor (str) argument passed alongside a tensor. ------
torchdynamo.reset()


@torchdynamo.optimize(
    make_shark_compiler(use_tracing=False, device="cuda", verbose=True)
)
def test_unsupported_types(t, y):
    return t, 2 * y


str_input = "hello"
tensor_input = torch.randn(2)
mixed_result = test_unsupported_types(str_input, tensor_input)
print(mixed_result)

View File

@@ -1,73 +0,0 @@
import torch
import numpy as np
# Fetch the pretrained SqueezeNet 1.0 model from torch.hub and switch it to
# evaluation mode.
model = torch.hub.load(
    "pytorch/vision:v0.10.0", "squeezenet1_0", pretrained=True
)
model.eval()

# Reference code for building an input batch from a real image. Random values
# are used below instead, which is enough for accuracy comparison.
#
# from PIL import Image
# from torchvision import transforms
# import urllib
#
# url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg")
# try: urllib.URLopener().retrieve(url, filename)
# except: urllib.request.urlretrieve(url, filename)
#
#
# input_image = Image.open(filename)
# preprocess = transforms.Compose([
#     transforms.Resize(256),
#     transforms.CenterCrop(224),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
# ])
# input_tensor = preprocess(input_image)
# input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model
# print(input_batch.shape) # size = [1, 3, 224, 224]

input_batch = torch.randn(1, 3, 224, 224)

# Focus on CPU for now: the leading `False` keeps the CUDA path disabled.
use_cuda = False and torch.cuda.is_available()
if use_cuda:
    input_batch = input_batch.to("cuda")
    model.to("cuda")

# Golden run with stock PyTorch.
with torch.no_grad():
    output = model(input_batch)

# First row of the model output: unnormalized per-class scores.
golden_confidences = output[0]
# Softmax turns the unnormalized scores into probabilities.
golden_softmax = torch.nn.functional.softmax(golden_confidences, dim=0)
golden_probabilities = golden_softmax.numpy()
golden_confidences = golden_confidences.numpy()

from shark.torch_mlir_lockstep_tensor import TorchMLIRLockstepTensor

# Run the same model on a copy of the input wrapped in the lockstep tensor.
input_detached_clone = input_batch.clone()
eager_input_batch = TorchMLIRLockstepTensor(input_detached_clone)

print("getting torch-mlir result")

output = model(eager_input_batch)

static_output = output.elem
confidences = static_output[0]
shark_softmax = torch.nn.functional.softmax(
    torch.from_numpy(confidences), dim=0
)
probabilities = shark_softmax.numpy()

print("The obtained result via shark is: ", confidences)
print("The golden result is:", golden_confidences)

# Both the raw scores and the probabilities must agree within tolerance.
for expected, actual in (
    (golden_confidences, confidences),
    (golden_probabilities, probabilities),
):
    np.testing.assert_allclose(expected, actual, rtol=1e-02, atol=1e-03)

View File

@@ -22,7 +22,7 @@ class CLIPModule(tf.Module):
input_ids=x, attention_mask=y, pixel_values=z
)
@tf.function(input_signature=clip_vit_inputs, jit_compile=True)
@tf.function(input_signature=clip_vit_inputs)
def forward(self, input_ids, attention_mask, pixel_values):
return self.m.predict(
input_ids, attention_mask, pixel_values

View File

@@ -1,15 +0,0 @@
## Running ESRGAN
```
1. pip install numpy opencv-python
2. mkdir InputImages
(this is where all the input images will reside)
3. mkdir OutputImages
(this is where the model will generate all the images)
4. mkdir models
(save the .pth checkpoint file here)
5. python esrgan.py
```
- Download [RRDB_ESRGAN_x4.pth](https://drive.google.com/drive/u/0/folders/17VYV_SoZZesU6mbxz2dMAIccSSlqLecY) and place it in the `models` directory as mentioned above in step 4.
- Credits : [ESRGAN](https://github.com/xinntao/ESRGAN)

View File

@@ -1,240 +0,0 @@
from ast import arg
import os.path as osp
import glob
import cv2
import numpy as np
import torch
from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions
from shark.shark_inference import SharkInference
import torch_mlir
import tempfile
import functools
import torch
import torch.nn as nn
import torch.nn.functional as F
def make_layer(block, n_layers):
    """Stack ``n_layers`` freshly built modules into one ``nn.Sequential``.

    Args:
        block: zero-argument callable returning a new module on each call.
        n_layers: how many times ``block`` is invoked.

    Returns:
        An ``nn.Sequential`` holding the created modules in creation order.
    """
    return nn.Sequential(*(block() for _ in range(n_layers)))
class ResidualDenseBlock_5C(nn.Module):
    """Five-convolution dense block with a scaled residual connection.

    Each convolution sees the block input concatenated (along dim 1) with the
    activations of all previous convolutions in the block.
    """

    def __init__(self, nf=64, gc=32, bias=True):
        """
        Args:
            nf: number of channels of the block input and output.
            gc: growth channel, i.e. intermediate channels.
            bias: whether the convolutions carry a bias term.
        """
        super().__init__()
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1, bias=bias)
        self.conv1 = nn.Conv2d(nf, gc, **conv_kwargs)
        self.conv2 = nn.Conv2d(nf + gc, gc, **conv_kwargs)
        self.conv3 = nn.Conv2d(nf + 2 * gc, gc, **conv_kwargs)
        self.conv4 = nn.Conv2d(nf + 3 * gc, gc, **conv_kwargs)
        self.conv5 = nn.Conv2d(nf + 4 * gc, nf, **conv_kwargs)
        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)

        # initialization
        # mutil.initialize_weights([self.conv1, self.conv2, self.conv3, self.conv4, self.conv5], 0.1)

    def forward(self, x):
        """Return ``conv5(cat(x, x1..x4)) * 0.2 + x``."""
        dense = [x]
        latest = self.lrelu(self.conv1(x))
        for conv in (self.conv2, self.conv3, self.conv4):
            dense.append(latest)
            latest = self.lrelu(conv(torch.cat(dense, 1)))
        dense.append(latest)
        return self.conv5(torch.cat(dense, 1)) * 0.2 + x
class RRDB(nn.Module):
    """Residual in Residual Dense Block"""

    def __init__(self, nf, gc=32):
        """Create three chained dense blocks with ``nf`` channels and growth ``gc``."""
        super().__init__()
        self.RDB1 = ResidualDenseBlock_5C(nf, gc)
        self.RDB2 = ResidualDenseBlock_5C(nf, gc)
        self.RDB3 = ResidualDenseBlock_5C(nf, gc)

    def forward(self, x):
        """Run the three dense blocks in sequence and add the scaled result to ``x``."""
        out = x
        for dense_block in (self.RDB1, self.RDB2, self.RDB3):
            out = dense_block(out)
        return out * 0.2 + x
class RRDBNet(nn.Module):
    """Trunk of RRDB blocks followed by two 2x nearest-neighbour upsampling stages."""

    def __init__(self, in_nc, out_nc, nf, nb, gc=32):
        """
        Args:
            in_nc: input channels.
            out_nc: output channels.
            nf: feature channels used throughout the network.
            nb: number of RRDB blocks in the trunk.
            gc: growth channels forwarded to every RRDB.
        """
        super().__init__()

        def conv3x3(c_in, c_out):
            # Every convolution here is 3x3, stride 1, padding 1, with bias.
            return nn.Conv2d(c_in, c_out, 3, 1, 1, bias=True)

        self.conv_first = conv3x3(in_nc, nf)
        self.RRDB_trunk = make_layer(functools.partial(RRDB, nf=nf, gc=gc), nb)
        self.trunk_conv = conv3x3(nf, nf)
        #### upsampling
        self.upconv1 = conv3x3(nf, nf)
        self.upconv2 = conv3x3(nf, nf)
        self.HRconv = conv3x3(nf, nf)
        self.conv_last = conv3x3(nf, out_nc)

        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)

    def forward(self, x):
        """Return the network output for input ``x`` (two x2 interpolations are applied)."""
        shallow = self.conv_first(x)
        fea = shallow + self.trunk_conv(self.RRDB_trunk(shallow))

        for upconv in (self.upconv1, self.upconv2):
            upsampled = F.interpolate(fea, scale_factor=2, mode="nearest")
            fea = self.lrelu(upconv(upsampled))

        return self.conv_last(self.lrelu(self.HRconv(fea)))
############### Parsing args #####################
import argparse

# Command-line options are parsed at import time; the result is kept in the
# module-level ``args``.
p = argparse.ArgumentParser(
    # __doc__ is this module's docstring (None when the module has none).
    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
# Device name; read later as ``args.device``.
p.add_argument("--device", type=str, default="cpu", help="the device to use")
# Optional path to an MLIR file for the model; defaults to None.
p.add_argument(
    "--mlir_loc",
    type=str,
    default=None,
    help="location of the model's mlir file",
)
args = p.parse_args()
###################################################
def inference(input_m):
    """Call the module-level ``model`` on ``input_m`` and return its output."""
    prediction = model(input_m)
    return prediction
def load_mlir(mlir_loc):
    """Read a textual MLIR module from disk.

    Args:
        mlir_loc: path of the MLIR file, or ``None`` when no file was given.

    Returns:
        The file contents as a string, or ``None`` if ``mlir_loc`` is ``None``.

    Raises:
        OSError: if the file cannot be opened.
    """
    # Identity test instead of ``== None``: equality can be overridden by the
    # operand's ``__eq__``.
    if mlir_loc is None:
        return None
    print(f"Trying to load the model from {mlir_loc}.")
    # A single-argument os.path.join() is a no-op, so open the path directly.
    with open(mlir_loc) as f:
        return f.read()
def compile_through_fx(model, inputs, mlir_loc=None):
    """Produce a compiled ``SharkInference`` module for ``model``.

    Args:
        model: callable traced with ``make_fx`` when no MLIR file is supplied.
        inputs: example input used for tracing and for ``torch_mlir.compile``.
        mlir_loc: optional path of an MLIR file to use instead of tracing.

    Returns:
        The ``SharkInference`` object after ``compile()`` has been called.
        The target device is taken from the module-level ``args.device``.
    """
    # Use the MLIR text from disk when a location was given.
    module = load_mlir(mlir_loc)
    if module == None:
        # No MLIR on disk: trace the model into an FX graph, decomposing the
        # listed aten ops via the decomposition table.
        fx_g = make_fx(
            model,
            decomposition_table=get_decompositions(
                [
                    torch.ops.aten.embedding_dense_backward,
                    torch.ops.aten.native_layer_norm_backward,
                    torch.ops.aten.slice_backward,
                    torch.ops.aten.select_backward,
                    torch.ops.aten.norm.ScalarOpt_dim,
                    torch.ops.aten.native_group_norm,
                    torch.ops.aten.upsample_bilinear2d.vec,
                    torch.ops.aten.split.Tensor,
                    torch.ops.aten.split_with_sizes,
                ]
            ),
        )(inputs)

        # Reset the graph's code generator to the default one and regenerate
        # the module's code.
        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
        fx_g.recompile()

        def strip_overloads(gm):
            """
            Modifies the target of graph nodes in :attr:`gm` to strip overloads.
            Args:
            gm(fx.GraphModule): The input Fx graph module to be modified
            """
            for node in gm.graph.nodes:
                if isinstance(node.target, torch._ops.OpOverload):
                    node.target = node.target.overloadpacket
            gm.recompile()

        strip_overloads(fx_g)

        # Script the FX module, then lower it to linalg-on-tensors.
        ts_g = torch.jit.script(fx_g)

        print("Torchscript graph generated successfully")
        module = torch_mlir.compile(
            ts_g,
            inputs,
            torch_mlir.OutputType.LINALG_ON_TENSORS,
            use_tracing=False,
            verbose=False,
        )

    # SharkInference is given the textual form of the module.
    mlir_model = str(module)
    func_name = "forward"
    shark_module = SharkInference(
        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
    )
    shark_module.compile()

    return shark_module
# Checkpoint to load: models/RRDB_ESRGAN_x4.pth OR models/RRDB_PSNR_x4.pth
model_path = "models/RRDB_ESRGAN_x4.pth"
# device = torch.device('cuda')  # if you want to run on CPU, change 'cuda' -> cpu
device = torch.device("cpu")

# Glob pattern selecting the input images.
test_img_folder = "InputImages/*"

model = RRDBNet(3, 3, 64, 23, gc=32)
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved from a CUDA device still loads on a CPU-only machine.
model.load_state_dict(
    torch.load(model_path, map_location=device), strict=True
)
model.eval()
model = model.to(device)

print("Model path {:s}. \nTesting...".format(model_path))
if __name__ == "__main__":
    # For every file matched by `test_img_folder`: run the image through the
    # SHARK-compiled module and through the eager PyTorch model, and write
    # both results to OutputImages/.
    for idx, path in enumerate(glob.glob(test_img_folder), start=1):
        base = osp.splitext(osp.basename(path))[0]
        print(idx, base)
        # read images
        img = cv2.imread(path, cv2.IMREAD_COLOR)
        if img is None:
            # cv2.imread reports failure by returning None (e.g. for a
            # non-image file); skip it instead of crashing on the arithmetic
            # below.
            print("Skipping {:s}: could not be read as an image".format(path))
            continue
        img = img * 1.0 / 255
        # BGR -> RGB, HWC -> CHW, then add a leading batch dimension.
        img = torch.from_numpy(
            np.transpose(img[:, :, [2, 1, 0]], (2, 0, 1))
        ).float()
        img_LR = img.unsqueeze(0)
        img_LR = img_LR.to(device)

        with torch.no_grad():
            # Honour --mlir_loc; it defaults to None, which keeps the
            # trace-and-compile path.
            shark_module = compile_through_fx(
                inference, img_LR, mlir_loc=args.mlir_loc
            )
            shark_output = shark_module.forward((img_LR,))
            shark_output = torch.from_numpy(shark_output)
            shark_output = (
                shark_output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
            )
            esrgan_output = (
                model(img_LR).data.squeeze().float().cpu().clamp_(0, 1).numpy()
            )

        # SHARK OUTPUT
        # RGB -> BGR and CHW -> HWC, the layout used when the image was read.
        shark_output = np.transpose(shark_output[[2, 1, 0], :, :], (1, 2, 0))
        shark_output = (shark_output * 255.0).round()
        cv2.imwrite(
            "OutputImages/{:s}_rlt_shark_output.png".format(base), shark_output
        )
        print("Generated SHARK's output")

        # ESRGAN OUTPUT
        esrgan_output = np.transpose(esrgan_output[[2, 1, 0], :, :], (1, 2, 0))
        esrgan_output = (esrgan_output * 255.0).round()
        cv2.imwrite(
            "OutputImages/{:s}_rlt_esrgan_output.png".format(base),
            esrgan_output,
        )
        print("Generated ESRGAN's output")

View File

@@ -18,23 +18,14 @@ class AlbertModule(torch.nn.Module):
self.model.eval()
def forward(self, input_ids, attention_mask):
return self.model(
input_ids=input_ids, attention_mask=attention_mask
).logits
return self.model(input_ids=input_ids, attention_mask=attention_mask).logits
if __name__ == "__main__":
# Prepping Data
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
text = "This [MASK] is very tasty."
encoded_inputs = tokenizer(
text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
return_tensors="pt",
)
inputs = (encoded_inputs["input_ids"], encoded_inputs["attention_mask"])
encoded_inputs = tokenizer(text, padding='max_length', truncation=True, max_length=MAX_SEQUENCE_LENGTH, return_tensors="pt")
inputs = (encoded_inputs["input_ids"],encoded_inputs["attention_mask"])
mlir_importer = SharkImporter(
AlbertModule(),
inputs,
@@ -43,46 +34,26 @@ if __name__ == "__main__":
minilm_mlir, func_name = mlir_importer.import_mlir(
is_dynamic=False, tracing_required=True
)
shark_module = SharkInference(
minilm_mlir, func_name, mlir_dialect="linalg"
)
shark_module = SharkInference(minilm_mlir, func_name, mlir_dialect="linalg")
shark_module.compile()
token_logits = torch.tensor(shark_module.forward(inputs))
mask_id = torch.where(
encoded_inputs["input_ids"] == tokenizer.mask_token_id
)[1]
mask_id = torch.where(encoded_inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_id, :]
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
for token in top_5_tokens:
print(
f"'>>> Sample/Warmup output: {text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
)
print(f"'>>> Sample/Warmup output: {text.replace(tokenizer.mask_token, tokenizer.decode(token))}'")
while True:
try:
new_text = input("Give me a sentence with [MASK] to fill: ")
encoded_inputs = tokenizer(
new_text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
return_tensors="pt",
)
inputs = (
encoded_inputs["input_ids"],
encoded_inputs["attention_mask"],
)
encoded_inputs = tokenizer(new_text, padding='max_length', truncation=True, max_length=MAX_SEQUENCE_LENGTH, return_tensors="pt")
inputs = (encoded_inputs["input_ids"],encoded_inputs["attention_mask"])
token_logits = torch.tensor(shark_module.forward(inputs))
mask_id = torch.where(
encoded_inputs["input_ids"] == tokenizer.mask_token_id
)[1]
mask_id = torch.where(encoded_inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_id, :]
top_5_tokens = (
torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
)
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
for token in top_5_tokens:
print(
f"'>>> {new_text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
)
print(f"'>>> {new_text.replace(tokenizer.mask_token, tokenizer.decode(token))}'")
except KeyboardInterrupt:
print("Exiting program.")
break

View File

@@ -18,17 +18,15 @@ BATCH_SIZE = 1
# Create a set of inputs
t5_inputs = [
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32)
]
class AlbertModule(tf.Module):
def __init__(self):
super(AlbertModule, self).__init__()
self.m = TFAutoModelForMaskedLM.from_pretrained("albert-base-v2")
self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)
self.m.predict = lambda x,y: self.m(input_ids=x, attention_mask=y)
@tf.function(input_signature=t5_inputs, jit_compile=True)
@tf.function(input_signature=t5_inputs)
def forward(self, input_ids, attention_mask):
return self.m.predict(input_ids, attention_mask)
@@ -38,14 +36,8 @@ if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
# text = "This is a great [MASK]."
text = "This [MASK] is very tasty."
encoded_inputs = tokenizer(
text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
return_tensors="tf",
)
inputs = (encoded_inputs["input_ids"], encoded_inputs["attention_mask"])
encoded_inputs = tokenizer(text, padding='max_length', truncation=True, max_length=MAX_SEQUENCE_LENGTH, return_tensors="tf")
inputs = (encoded_inputs["input_ids"],encoded_inputs["attention_mask"])
mlir_importer = SharkImporter(
AlbertModule(),
inputs,
@@ -59,42 +51,22 @@ if __name__ == "__main__":
output_idx = 0
data_idx = 1
token_logits = shark_module.forward(inputs)[output_idx][data_idx]
mask_id = np.where(
tf.squeeze(encoded_inputs["input_ids"]) == tokenizer.mask_token_id
)
mask_id = np.where(tf.squeeze(encoded_inputs["input_ids"]) == tokenizer.mask_token_id)
mask_token_logits = token_logits[0, mask_id, :]
top_5_tokens = np.flip(np.argsort(mask_token_logits)).squeeze()[0:5]
for token in top_5_tokens:
print(
f"'>>> Sample/Warmup output: {text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
)
print(f"'>>> Sample/Warmup output: {text.replace(tokenizer.mask_token, tokenizer.decode(token))}'")
while True:
try:
new_text = input("Give me a sentence with [MASK] to fill: ")
encoded_inputs = tokenizer(
new_text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
return_tensors="tf",
)
inputs = (
encoded_inputs["input_ids"],
encoded_inputs["attention_mask"],
)
encoded_inputs = tokenizer(new_text, padding='max_length', truncation=True, max_length=MAX_SEQUENCE_LENGTH, return_tensors="tf")
inputs = (encoded_inputs["input_ids"],encoded_inputs["attention_mask"])
token_logits = shark_module.forward(inputs)[output_idx][data_idx]
mask_id = np.where(
tf.squeeze(encoded_inputs["input_ids"])
== tokenizer.mask_token_id
)
mask_id = np.where(tf.squeeze(encoded_inputs["input_ids"]) == tokenizer.mask_token_id)
mask_token_logits = token_logits[0, mask_id, :]
top_5_tokens = np.flip(np.argsort(mask_token_logits)).squeeze()[
0:5
]
top_5_tokens = np.flip(np.argsort(mask_token_logits)).squeeze()[0:5]
for token in top_5_tokens:
print(
f"'>>> {new_text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
)
print(f"'>>> {new_text.replace(tokenizer.mask_token, tokenizer.decode(token))}'")
except KeyboardInterrupt:
print("Exiting program.")
sys.exit()

View File

@@ -0,0 +1,35 @@
from PIL import Image
import requests
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
from shark.shark_inference import SharkInference
from shark.shark_importer import SharkImporter
from iree.compiler import tf as tfc
from iree.compiler import compile_str
from iree import runtime as ireert
import os
import numpy as np
# Inputs are padded/truncated to this many tokens.
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1

if __name__ == "__main__":
    # Reference run of albert-base-v2 masked-LM directly in PyTorch.
    # Prepping Data
    model = AutoModelForMaskedLM.from_pretrained("albert-base-v2")
    tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
    text = "This [MASK] is very tasty."
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=MAX_SEQUENCE_LENGTH, return_tensors="pt")
    token_logits = model(**inputs).logits
    print(token_logits)
    # Find the location of [MASK] and extract its logits
    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
    mask_token_logits = token_logits[0, mask_token_index, :]
    # print(mask_token_logits)
    # Pick the [MASK] candidates with the highest logits
    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
    # Token ids sorted by ascending logit, printed for inspection.
    print(np.argsort(mask_token_logits.detach().numpy()))
    # print(top_5_tokens)
    # Print the sentence once per candidate, with [MASK] replaced by it.
    for token in top_5_tokens:
        print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

View File

@@ -1,14 +0,0 @@
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_model
# Download the "bloom" model for the torch frontend; the call returns the MLIR
# module, its entry-function name, sample inputs and the golden output.
mlir_model, func_name, inputs, golden_out = download_model(
    "bloom", frontend="torch"
)

# Compile for CPU; the module is passed with the "tm_tensor" dialect.
shark_module = SharkInference(
    mlir_model, func_name, device="cpu", mlir_dialect="tm_tensor"
)
shark_module.compile()
result = shark_module.forward(inputs)
# Both results are only printed; no numerical comparison is made here.
print("The obtained result via shark is: ", result)
print("The golden result is:", golden_out)

View File

@@ -19,7 +19,7 @@ class GPT2Module(tf.Module):
self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)
@tf.function(input_signature=gpt2_inputs, jit_compile=True)
@tf.function(input_signature=gpt2_inputs)
def forward(self, input_ids, attention_mask):
return self.m.predict(input_ids, attention_mask)

View File

@@ -26,7 +26,7 @@ class BertModule(tf.Module):
input_ids=x, attention_mask=y, token_type_ids=z, training=False
)
@tf.function(input_signature=bert_input, jit_compile=True)
@tf.function(input_signature=bert_input)
def forward(self, input_ids, attention_mask, token_type_ids):
return self.m.predict(input_ids, attention_mask, token_type_ids)

View File

@@ -1,15 +1,14 @@
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_model
from shark.shark_downloader import download_torch_model
mlir_model, func_name, inputs, golden_out = download_model(
"microsoft/MiniLM-L12-H384-uncased",
frontend="torch",
mlir_model, func_name, inputs, golden_out = download_torch_model(
"microsoft/MiniLM-L12-H384-uncased"
)
shark_module = SharkInference(
mlir_model, func_name, device="cpu", mlir_dialect="linalg"
mlir_model, func_name, mlir_dialect="linalg"
)
shark_module.compile()
result = shark_module.forward(inputs)

View File

@@ -26,7 +26,7 @@ class BertModule(tf.Module):
input_ids=x, attention_mask=y, token_type_ids=z, training=False
)
@tf.function(input_signature=bert_input, jit_compile=True)
@tf.function(input_signature=bert_input)
def forward(self, input_ids, attention_mask, token_type_ids):
return self.m.predict(input_ids, attention_mask, token_type_ids)

View File

@@ -23,7 +23,7 @@ input = torch.randn(1, 3, 224, 224)
mlir_importer = SharkImporter(
ResnestModule(),
(input,),
(input),
frontend="torch",
)
@@ -33,7 +33,9 @@ mlir_importer = SharkImporter(
print(golden_out)
shark_module = SharkInference(vision_mlir, func_name, mlir_dialect="linalg")
shark_module = SharkInference(
vision_mlir, func_name, device="cpu", mlir_dialect="linalg"
)
shark_module.compile()
result = shark_module.forward((input,))
result = shark_module.forward((input))
print("Obtained result", result)

View File

@@ -1,76 +0,0 @@
from shark.shark_inference import SharkInference
from shark.parser import shark_args
import torch
import numpy as np
import sys
import torchvision.models as models
import torch_mlir
torch.manual_seed(0)
class VisionModule(torch.nn.Module):
    """Wraps torchvision's pretrained ResNet-50 with training mode switched off."""

    def __init__(self):
        super(VisionModule, self).__init__()
        self.model = models.resnet50(pretrained=True)
        # eval() is shorthand for train(False).
        self.eval()

    def forward(self, input):
        """Delegate to the wrapped model's ``forward``."""
        prediction = self.model.forward(input)
        return prediction
# Output of the original model on a random input.
model = VisionModule()
test_input = torch.randn(1, 3, 224, 224)
actual_out = model(test_input)

# Half-precision copy of the input and the model, both moved to CUDA.
test_input_fp16 = test_input.to(device=torch.device("cuda"), dtype=torch.half)
model_fp16 = model.half()
model_fp16.eval()
model_fp16.to("cuda")
# fp16 PyTorch output; used below as the golden result.
actual_out_fp16 = model_fp16(test_input_fp16)

# Trace the fp16 model and lower it to linalg-on-tensors.
ts_g = torch.jit.trace(model_fp16, [test_input_fp16])

module = torch_mlir.compile(
    ts_g,
    (test_input_fp16),
    torch_mlir.OutputType.LINALG_ON_TENSORS,
    use_tracing=True,
    verbose=False,
)

# from contextlib import redirect_stdout

# with open('resnet50_fp16_linalg_ir.mlir', 'w') as f:
#     with redirect_stdout(f):
#         print(module.operation.get_asm())

mlir_model = module
func_name = "forward"

# Compile the lowered module for the "cuda" device.
shark_module = SharkInference(
    mlir_model, func_name, device="cuda", mlir_dialect="linalg"
)
shark_module.compile()
def shark_result(x):
    """Run tensor ``x`` through the module-level ``shark_module``.

    ``x`` is converted to a NumPy array on the host, passed as a one-element
    tuple, and the result is wrapped back into a torch tensor.
    """
    host_array = x.cpu().detach().numpy()
    raw = shark_module.forward((host_array,))
    return torch.from_numpy(raw)
# Compare the SHARK output with the fp16 PyTorch output.
observed_out = shark_result(test_input_fp16)

print("Golden result:", actual_out_fp16)
print("SHARK result:", observed_out)

# Move the golden output to the CPU, where observed_out lives.
actual_out_fp16 = actual_out_fp16.to(device=torch.device("cpu"))

# assert_allclose returns None and raises on mismatch, so wrapping it in
# print() only ever emitted a stray "None".
torch.testing.assert_allclose(
    actual_out_fp16, observed_out, rtol=1e-2, atol=1e-2
)

View File

@@ -5,7 +5,7 @@ import torchvision.models as models
from torchvision import transforms
import sys
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_model
from shark.shark_downloader import download_torch_model
################################## Preprocessing inputs and model ############
@@ -66,14 +66,10 @@ labels = load_labels()
## Can pass any img or input to the forward module.
mlir_model, func_name, inputs, golden_out = download_model(
"resnet50", frontend="torch"
)
mlir_model, func_name, inputs, golden_out = download_torch_model("resnet50")
shark_module = SharkInference(mlir_model, func_name, mlir_dialect="linalg")
shark_module.compile()
path = shark_module.save_module()
shark_module.load_module(path)
result = shark_module.forward((img.detach().numpy(),))
print("The top 3 results obtained via shark_runner is:")

View File

@@ -1,392 +0,0 @@
# Description: an implementation of a deep learning recommendation model (DLRM)
# The model input consists of dense and sparse features. The former is a vector
# of floating point values. The latter is a list of sparse indices into
# embedding tables, which consist of vectors of floating point values.
# The selected vectors are passed to mlp networks denoted by triangles,
# in some cases the vectors are interacted through operators (Ops).
#
# output:
# vector of values
# model: |
# /\
# /__\
# |
# _____________________> Op <___________________
# / | \
# /\ /\ /\
# /__\ /__\ ... /__\
# | | |
# | Op Op
# | ____/__\_____ ____/__\____
# | |_Emb_|____|__| ... |_Emb_|__|___|
# input:
# [ dense features ] [sparse indices] , ..., [sparse indices]
#
# More precise definition of model layers:
# 1) fully connected layers of an mlp
# z = f(y)
# y = Wx + b
#
# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk])
# z = Op(e1,...,ek)
# obtain vectors e1=E[:,p1], ..., ek=E[:,pk]
#
# 3) Operator Op can be one of the following
# Sum(e1,...,ek) = e1 + ... + ek
# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek]
# Cat(e1,...,ek) = [e1', ..., ek']'
# where ' denotes transpose operation
#
# References:
# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang,
# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu,
# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii,
# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko,
# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong,
# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and
# Recommendation Systems", CoRR, arXiv:1906.00091, 2019
import argparse
import sys
import numpy as np
import torch
import torch.nn as nn
from shark.shark_inference import SharkInference
from shark.shark_importer import SharkImporter
torch.manual_seed(0)
np.random.seed(0)
### define dlrm in PyTorch ###
class DLRM_Net(nn.Module):
    """DLRM model: a bottom MLP for dense features, EmbeddingBag tables for
    sparse features, a feature-interaction step ("dot" or "cat"), and a top MLP.
    """

    def create_mlp(self, ln, sigmoid_layer):
        """Build an MLP from the layer sizes in ``ln``.

        One ``nn.Linear`` is created per consecutive pair of sizes, with
        weights drawn from ``np.random.normal``. Layer ``sigmoid_layer`` is
        followed by a Sigmoid, every other layer by a ReLU.
        Returns the layers wrapped in ``torch.nn.Sequential``.
        """
        # build MLP layer by layer
        layers = nn.ModuleList()
        for i in range(0, ln.size - 1):
            n = ln[i]
            m = ln[i + 1]

            # construct fully connected operator
            LL = nn.Linear(int(n), int(m), bias=True)

            # initialize the weights
            # with torch.no_grad():
            # custom Xavier input, output or two-sided fill
            mean = 0.0  # std_dev = np.sqrt(variance)
            std_dev = np.sqrt(2 / (m + n))  # np.sqrt(1 / m) # np.sqrt(1 / n)
            W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32)
            std_dev = np.sqrt(1 / m)  # np.sqrt(2 / (m + 1))
            bt = np.random.normal(mean, std_dev, size=m).astype(np.float32)
            LL.weight.data = torch.tensor(W, requires_grad=True)
            LL.bias.data = torch.tensor(bt, requires_grad=True)
            # approach 2
            # LL.weight.data.copy_(torch.tensor(W))
            # LL.bias.data.copy_(torch.tensor(bt))
            # approach 3
            # LL.weight = Parameter(torch.tensor(W),requires_grad=True)
            # LL.bias = Parameter(torch.tensor(bt),requires_grad=True)
            layers.append(LL)

            # construct sigmoid or relu operator
            if i == sigmoid_layer:
                layers.append(nn.Sigmoid())
            else:
                layers.append(nn.ReLU())

        # approach 1: use ModuleList
        # return layers
        # approach 2: use Sequential container to wrap all layers
        return torch.nn.Sequential(*layers)

    def create_emb(self, m, ln, weighted_pooling=None):
        """Create one sum-mode ``nn.EmbeddingBag`` per entry of ``ln``.

        Each table has ``ln[i]`` rows of width ``m``, initialised uniformly in
        ``[-sqrt(1/n), sqrt(1/n)]``. Returns ``(emb_l, v_W_l)``; ``v_W_l``
        holds ``None`` per table when ``weighted_pooling`` is None, otherwise
        a vector of ones of length ``n``.
        """
        emb_l = nn.ModuleList()
        v_W_l = []
        for i in range(0, ln.size):
            n = ln[i]

            # construct embedding operator
            EE = nn.EmbeddingBag(n, m, mode="sum")
            # initialize embeddings
            # nn.init.uniform_(EE.weight, a=-np.sqrt(1 / n), b=np.sqrt(1 / n))
            W = np.random.uniform(
                low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m)
            ).astype(np.float32)
            # approach 1
            # NOTE(review): prints the full initial table; looks like leftover
            # debug output - confirm before removing.
            print(W)
            EE.weight.data = torch.tensor(W, requires_grad=True)
            # approach 2
            # EE.weight.data.copy_(torch.tensor(W))
            # approach 3
            # EE.weight = Parameter(torch.tensor(W),requires_grad=True)
            if weighted_pooling is None:
                v_W_l.append(None)
            else:
                v_W_l.append(torch.ones(n, dtype=torch.float32))
            emb_l.append(EE)
        return emb_l, v_W_l

    def __init__(
        self,
        m_spa=None,
        ln_emb=None,
        ln_bot=None,
        ln_top=None,
        arch_interaction_op=None,
        arch_interaction_itself=False,
        sigmoid_bot=-1,
        sigmoid_top=-1,
        weighted_pooling=None,
    ):
        """Build the embeddings and both MLPs.

        Nothing is built unless ``m_spa``, ``ln_emb``, ``ln_bot``, ``ln_top``
        and ``arch_interaction_op`` are all non-None.
        """
        super(DLRM_Net, self).__init__()

        if (
            (m_spa is not None)
            and (ln_emb is not None)
            and (ln_bot is not None)
            and (ln_top is not None)
            and (arch_interaction_op is not None)
        ):
            # save arguments
            self.output_d = 0
            self.arch_interaction_op = arch_interaction_op
            self.arch_interaction_itself = arch_interaction_itself
            # Any value other than None or "fixed" is treated as "learned".
            if weighted_pooling is not None and weighted_pooling != "fixed":
                self.weighted_pooling = "learned"
            else:
                self.weighted_pooling = weighted_pooling

            # create operators
            self.emb_l, w_list = self.create_emb(
                m_spa, ln_emb, weighted_pooling
            )
            # Learned pooling weights are registered as parameters; otherwise
            # the plain list from create_emb is kept.
            if self.weighted_pooling == "learned":
                self.v_W_l = nn.ParameterList()
                for w in w_list:
                    self.v_W_l.append(nn.Parameter(w))
            else:
                self.v_W_l = w_list
            self.bot_l = self.create_mlp(ln_bot, sigmoid_bot)
            self.top_l = self.create_mlp(ln_top, sigmoid_top)

    def apply_mlp(self, x, layers):
        """Apply ``layers`` to ``x``."""
        return layers(x)

    def apply_emb(self, lS_o, lS_i, emb_l, v_W_l):
        """Look up every index group in ``lS_i`` in its embedding table.

        Table ``k`` is queried with indices ``lS_i[k]`` and offsets
        ``lS_o[k]``. Returns the list of lookup results, one per table.
        """
        # WARNING: notice that we are processing the batch at once. We implicitly
        # assume that the data is laid out such that:
        # 1. each embedding is indexed with a group of sparse indices,
        #   corresponding to a single lookup
        # 2. for each embedding the lookups are further organized into a batch
        # 3. for a list of embedding tables there is a list of batched lookups
        # TORCH-MLIR
        # We are passing all the embeddings as arguments for easy parsing.

        ly = []
        for k, sparse_index_group_batch in enumerate(lS_i):
            sparse_offset_group_batch = lS_o[k]

            # embedding lookup
            # We are using EmbeddingBag, which implicitly uses sum operator.
            # The embeddings are represented as tall matrices, with sum
            # happening vertically across 0 axis, resulting in a row vector
            # E = emb_l[k]

            if v_W_l[k] is not None:
                per_sample_weights = v_W_l[k].gather(
                    0, sparse_index_group_batch
                )
            else:
                per_sample_weights = None

            E = emb_l[k]
            V = E(
                sparse_index_group_batch,
                sparse_offset_group_batch,
                per_sample_weights=per_sample_weights,
            )

            ly.append(V)

        return ly

    def interact_features(self, x, ly):
        """Combine the dense output ``x`` with the embedding outputs ``ly``.

        With "dot", ``x`` is concatenated with selected pairwise dot products;
        with "cat", ``x`` and ``ly`` are concatenated. Any other value of
        ``arch_interaction_op`` terminates the process via ``sys.exit``.
        """
        if self.arch_interaction_op == "dot":
            # concatenate dense and sparse features
            (batch_size, d) = x.shape
            T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d))
            # perform a dot product
            Z = torch.bmm(T, torch.transpose(T, 1, 2))
            # append dense feature with the interactions (into a row vector)
            # approach 1: all
            # Zflat = Z.view((batch_size, -1))
            # approach 2: unique
            _, ni, nj = Z.shape
            # approach 1: tril_indices
            # offset = 0 if self.arch_interaction_itself else -1
            # li, lj = torch.tril_indices(ni, nj, offset=offset)
            # approach 2: custom
            # offset 1 includes the pairs with i == j, offset 0 leaves them out.
            offset = 1 if self.arch_interaction_itself else 0
            li = torch.tensor(
                [i for i in range(ni) for j in range(i + offset)]
            )
            lj = torch.tensor(
                [j for i in range(nj) for j in range(i + offset)]
            )
            Zflat = Z[:, li, lj]
            # concatenate dense features and interactions
            R = torch.cat([x] + [Zflat], dim=1)
        elif self.arch_interaction_op == "cat":
            # concatenation features (into a row vector)
            R = torch.cat([x] + ly, dim=1)
        else:
            sys.exit(
                "ERROR: --arch-interaction-op="
                + self.arch_interaction_op
                + " is not supported"
            )

        return R

    def forward(self, dense_x, lS_o, *lS_i):
        """Entry point; the sparse index tensors arrive as separate positional args."""
        return self.sequential_forward(dense_x, lS_o, lS_i)

    def sequential_forward(self, dense_x, lS_o, lS_i):
        """Bottom MLP -> embedding lookups -> interaction -> top MLP; returns the top MLP output."""
        # process dense features (using bottom mlp), resulting in a row vector
        x = self.apply_mlp(dense_x, self.bot_l)
        # debug prints
        # print("intermediate")
        # print(x.detach().cpu().numpy())

        # process sparse features(using embeddings), resulting in a list of row vectors
        ly = self.apply_emb(lS_o, lS_i, self.emb_l, self.v_W_l)
        # for y in ly:
        #     print(y.detach().cpu().numpy())

        # interact features (dense and sparse)
        z = self.interact_features(x, ly)
        # print(z.detach().cpu().numpy())

        # obtain probability of a click (using top mlp)
        p = self.apply_mlp(z, self.top_l)

        # # clamp output if needed
        # if 0.0 < self.loss_threshold and self.loss_threshold < 1.0:
        #     z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold))
        # else:
        #     z = p

        return p
def dash_separated_ints(value):
    """argparse ``type=`` validator for strings such as ``"4-3-2"``.

    Returns ``value`` unchanged when every dash-separated piece parses as an
    int; otherwise raises ``argparse.ArgumentTypeError``.
    """
    try:
        for piece in value.split("-"):
            int(piece)
    except ValueError:
        raise argparse.ArgumentTypeError(
            "%s is not a valid dash separated list of ints" % value
        )
    return value
# model related parameters
parser = argparse.ArgumentParser(
    description="Train Deep Learning Recommendation Model (DLRM)"
)
parser.add_argument("--arch-sparse-feature-size", type=int, default=2)
parser.add_argument(
    "--arch-embedding-size", type=dash_separated_ints, default="4-3-2"
)
# j will be replaced with the table number
parser.add_argument(
    "--arch-mlp-bot", type=dash_separated_ints, default="4-3-2"
)
parser.add_argument(
    "--arch-mlp-top", type=dash_separated_ints, default="8-2-1"
)
parser.add_argument(
    "--arch-interaction-op", type=str, choices=["dot", "cat"], default="dot"
)
parser.add_argument(
    "--arch-interaction-itself", action="store_true", default=False
)
parser.add_argument("--weighted-pooling", type=str, default=None)

args = parser.parse_args()

# Turn the dash-separated size strings into integer arrays.
ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")
ln_top = np.fromstring(args.arch_mlp_top, dtype=int, sep="-")
m_den = ln_bot[0]
ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
m_spa = args.arch_sparse_feature_size
ln_emb = np.asarray(ln_emb)
num_fea = ln_emb.size + 1  # num sparse + num dense features


# Initialize the model.
dlrm_model = DLRM_Net(
    m_spa=m_spa,
    ln_emb=ln_emb,
    ln_bot=ln_bot,
    ln_top=ln_top,
    arch_interaction_op=args.arch_interaction_op,
)


# Inputs to the model.
dense_inp = torch.tensor([[0.6965, 0.2861, 0.2269, 0.5513]])
vs0 = torch.tensor([[0], [0], [0]], dtype=torch.int64)
vsi = torch.tensor([1, 2, 3]), torch.tensor([1]), torch.tensor([1])

input_dlrm = (dense_inp, vs0, *vsi)

# Eager PyTorch output, used as the reference in the assertion below.
golden_output = dlrm_model(dense_inp, vs0, *vsi)

mlir_importer = SharkImporter(
    dlrm_model,
    input_dlrm,
    frontend="torch",
)

(dlrm_mlir, func_name), inputs, golden_out = mlir_importer.import_debug(
    tracing_required=True
)

# Compile the imported module for the "vulkan" device and run it.
shark_module = SharkInference(
    dlrm_mlir, func_name, device="vulkan", mlir_dialect="linalg"
)
shark_module.compile()
result = shark_module.forward(input_dlrm)
# Raises if the SHARK result differs from the PyTorch result beyond tolerance.
np.testing.assert_allclose(
    golden_output.detach().numpy(), result, rtol=1e-02, atol=1e-03
)
# Verified via torch-mlir.
# import torch_mlir
# from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
# module = torch_mlir.compile(
# dlrm_model, inputs, use_tracing=True, output_type="linalg-on-tensors"
# )
# backend = refbackend.RefBackendLinalgOnTensorsBackend()
# compiled = backend.compile(module)
# jit_module = backend.load(compiled)
# dense_numpy = dense_inp.numpy()
# vs0_numpy = vs0.numpy()
# vsi_numpy = [inp.numpy() for inp in vsi]
# numpy_inp = (dense_numpy, vs0_numpy, *vsi_numpy)
# print(jit_module.forward(*numpy_inp))

View File

@@ -1,314 +0,0 @@
import torch
from torch import nn
from torchrec.datasets.utils import Batch
from torchrec.modules.crossnet import LowRankCrossNet
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor, KeyedTensor
from torchrec.modules.embedding_configs import EmbeddingBagConfig
from torchrec.modules.embedding_modules import EmbeddingBagCollection
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
from typing import Dict, List, Optional, Tuple
from torchrec.models.dlrm import (
choose,
DenseArch,
DLRM,
InteractionArch,
SparseArch,
OverArch,
)
from shark.shark_inference import SharkInference
from shark.shark_importer import SharkImporter
import numpy as np
torch.manual_seed(0)
np.random.seed(0)
def calculate_offsets(tensor_list, prev_values, prev_offsets):
    """Concatenate ``tensor_list`` and compute the start offset of each tensor.

    Args:
        tensor_list: sequence of tensors to concatenate with ``torch.cat``.
        prev_values: tensor of earlier values to prepend, or ``None``.
        prev_offsets: tensor of earlier offsets to prepend, or ``None``.
            When given, new offsets start at ``prev_values.shape[-1]``, so
            ``prev_values`` must be given too.

    Returns:
        ``(values, offsets)``: the concatenated values and a tensor with the
        start position of every input tensor inside ``values``.
    """
    # Identity checks rather than ``!= None``: tensors overload the comparison
    # operators, so comparing one to None only works through a fallback.
    offset = 0 if prev_offsets is None else prev_values.shape[-1]

    offset_list = []
    for tensor in tensor_list:
        offset_list.append(offset)
        offset += tensor.shape[0]

    concatenated_values = torch.cat(tensor_list)
    if prev_values is not None:
        concatenated_values = torch.cat([prev_values, concatenated_values])

    concatenated_offsets = torch.tensor(offset_list)
    if prev_offsets is not None:
        concatenated_offsets = torch.cat([prev_offsets, concatenated_offsets])

    return concatenated_values, concatenated_offsets
# Have to make combined_keys as dict as to which embedding bags they
# point to. {f1: 0, f3: 0, f2: 1}
# The result will be a triple containing values, indices and pointer tensor.
def to_list(key_jagged, combined_keys):
    """Convert a keyed jagged tensor into a flat list of tensor triples.

    Args:
        key_jagged: Object whose ``to_dict()`` maps feature keys to jagged
            tensors exposing ``to_dense()``.
        combined_keys: Dict mapping each feature key to the index of the
            embedding bag that feature uses.

    Returns:
        A flat list with three entries per key, in ``combined_keys`` order:
        values tensor, offsets tensor, and a scalar tensor holding the
        embedding-bag index.
    """
    per_feature = key_jagged.to_dict()
    flat_inputs = []
    for feature, bag_index in combined_keys.items():
        values, offsets = calculate_offsets(
            per_feature[feature].to_dense(), None, None
        )
        print(values)
        print(offsets)
        flat_inputs += [values, offsets, torch.tensor(bag_index)]
    return flat_inputs
class SparseArchShark(nn.Module):
    """Sparse arch built from plain ``nn.EmbeddingBag`` tables.

    ``forward`` takes a flat sequence of (values, offsets, table index)
    triples, pools each triple with the selected table and stacks the results
    into a ``(-1, num_features, embedding_dim)`` tensor.
    """

    def __init__(
        self,
        embedding_dim,
        total_features,
        num_embeddings_list,
    ):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.num_features = total_features
        self.embedding_list = self.create_emb(
            embedding_dim, num_embeddings_list
        )

    def create_emb(self, embedding_dim, num_embeddings_list):
        """Build one sum-mode EmbeddingBag per entry of ``num_embeddings_list``.

        Weights are drawn uniformly from ``[-sqrt(1/n), sqrt(1/n)]`` using the
        NumPy global RNG, where ``n`` is the number of rows of the table.
        """
        tables = nn.ModuleList()
        for table_idx in range(num_embeddings_list.size):
            rows = num_embeddings_list[table_idx]
            bag = nn.EmbeddingBag(rows, embedding_dim, mode="sum")
            bound = np.sqrt(1 / rows)
            init = np.random.uniform(
                low=-bound, high=bound, size=(rows, embedding_dim)
            ).astype(np.float32)
            bag.weight.data = torch.tensor(init, requires_grad=True)
            tables.append(bag)
        return tables

    def forward(self, *batched_inputs):
        """Pool every (values, offsets, table index) triple and stack them."""
        pooled = []
        # Inputs arrive flattened; walk them three at a time and ignore any
        # trailing partial triple.
        for start in range(0, 3 * (len(batched_inputs) // 3), 3):
            values, offsets, pointer = batched_inputs[start : start + 3]
            bag = self.embedding_list[int(pointer)]
            pooled.append(bag(values, offsets))
        return torch.cat(pooled, dim=1).reshape(
            -1, self.num_features, self.embedding_dim
        )
def test_sparse_arch() -> None:
    """Check SparseArchShark against torchrec's SparseArch.

    Both archs share the same embedding weights and receive the same sparse
    features; their outputs must agree within tolerance.
    """
    D = 3
    eb1_config = EmbeddingBagConfig(
        name="t1",
        embedding_dim=D,
        num_embeddings=10,
        feature_names=["f1", "f3"],
    )
    eb2_config = EmbeddingBagConfig(
        name="t2",
        embedding_dim=D,
        num_embeddings=10,
        feature_names=["f2"],
    )

    ebc = EmbeddingBagCollection(tables=[eb1_config, eb2_config])

    w1 = ebc.embedding_bags["t1"].weight
    w2 = ebc.embedding_bags["t2"].weight

    sparse_arch = SparseArch(ebc)

    keys = ["f1", "f2", "f3", "f4", "f5"]
    offsets = torch.tensor([0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 19])
    features = KeyedJaggedTensor.from_offsets_sync(
        keys=keys,
        values=torch.tensor(
            [1, 2, 4, 5, 4, 3, 2, 9, 1, 2, 4, 5, 4, 3, 2, 9, 1, 2, 3]
        ),
        offsets=offsets,
    )

    # Share the torchrec weights so both implementations are comparable.
    sparse_archi = SparseArchShark(D, 3, np.array([10, 10]))
    sparse_archi.embedding_list[0].weight = w1
    sparse_archi.embedding_list[1].weight = w2
    inputs = to_list(features, {"f1": 0, "f3": 0, "f2": 1})

    test_results = sparse_archi(*inputs)
    sparse_features = sparse_arch(features)

    # torch.allclose only returns a bool; without the assert a mismatch would
    # pass silently.
    assert torch.allclose(
        sparse_features,
        test_results,
        rtol=1e-4,
        atol=1e-4,
    ), "SparseArchShark output differs from torchrec SparseArch"


test_sparse_arch()
class DLRMShark(nn.Module):
    """DLRM variant whose sparse arch is a SparseArchShark.

    The dense, interaction and over archs are the torchrec building blocks;
    only the sparse path is replaced so the model accepts the flat tensor
    triples that SparseArchShark consumes.
    """

    def __init__(
        self,
        embedding_dim,
        total_features,
        num_embeddings_list,
        dense_in_features: int,
        dense_arch_layer_sizes: List[int],
        over_arch_layer_sizes: List[int],
    ) -> None:
        super().__init__()

        # Sub-modules are created in the same order as before: sparse,
        # dense, interaction, over.
        self.sparse_arch: SparseArchShark = SparseArchShark(
            embedding_dim, total_features, num_embeddings_list
        )
        self.dense_arch = DenseArch(
            in_features=dense_in_features,
            layer_sizes=dense_arch_layer_sizes,
        )
        self.inter_arch = InteractionArch(
            num_sparse_features=total_features,
        )

        # Width fed to the over arch: embedding_dim + C(F, 2) + F, with F the
        # number of sparse features.
        pairwise_terms = choose(total_features, 2)
        self.over_arch = OverArch(
            in_features=embedding_dim + pairwise_terms + total_features,
            layer_sizes=over_arch_layer_sizes,
        )

    def forward(
        self, dense_features: torch.Tensor, *sparse_features
    ) -> torch.Tensor:
        """Return the logits for one batch.

        Args:
            dense_features: Dense input tensor passed to the dense arch.
            *sparse_features: Flat sequence of tensors forwarded unchanged to
                the sparse arch.
        """
        dense_embedding = self.dense_arch(dense_features)
        sparse_embedding = self.sparse_arch(*sparse_features)
        interactions = self.inter_arch(
            dense_features=dense_embedding, sparse_features=sparse_embedding
        )
        return self.over_arch(interactions)
def test_dlrm() -> None:
    """Check DLRMShark against torchrec's DLRM and against SHARK on CPU.

    The two models share all weights. The test asserts that (1) the SHARK
    compiled module reproduces the golden output produced by the importer and
    (2) DLRMShark and torchrec DLRM produce the same logits.
    """
    B = 2
    D = 8
    dense_in_features = 100

    eb1_config = EmbeddingBagConfig(
        name="t1",
        embedding_dim=D,
        num_embeddings=100,
        feature_names=["f1", "f3"],
    )
    eb2_config = EmbeddingBagConfig(
        name="t2",
        embedding_dim=D,
        num_embeddings=100,
        feature_names=["f2"],
    )

    sparse_features = KeyedJaggedTensor.from_offsets_sync(
        keys=["f1", "f3", "f2"],
        values=torch.tensor([1, 2, 4, 5, 4, 3, 2, 9, 1, 2, 3]),
        offsets=torch.tensor([0, 2, 4, 6, 8, 10, 11]),
    )

    # Built once; the previous duplicate construction was discarded unused.
    ebc = EmbeddingBagCollection(tables=[eb1_config, eb2_config])
    sparse_nn = DLRM(
        embedding_bag_collection=ebc,
        dense_in_features=dense_in_features,
        dense_arch_layer_sizes=[20, D],
        over_arch_layer_sizes=[5, 1],
    )
    sparse_nn_nod = DLRMShark(
        embedding_dim=D,
        total_features=3,
        num_embeddings_list=np.array([100, 100]),
        dense_in_features=dense_in_features,
        dense_arch_layer_sizes=[20, D],
        over_arch_layer_sizes=[5, 1],
    )

    dense_features = torch.rand((B, dense_in_features))
    x = to_list(sparse_features, {"f1": 0, "f3": 0, "f2": 1})

    # Copy every weight from the torchrec model into the SHARK-friendly one.
    w1 = ebc.embedding_bags["t1"].weight
    w2 = ebc.embedding_bags["t2"].weight
    sparse_nn_nod.sparse_arch.embedding_list[0].weight = w1
    sparse_nn_nod.sparse_arch.embedding_list[1].weight = w2
    sparse_nn_nod.dense_arch.load_state_dict(sparse_nn.dense_arch.state_dict())
    sparse_nn_nod.inter_arch.load_state_dict(sparse_nn.inter_arch.state_dict())
    sparse_nn_nod.over_arch.load_state_dict(sparse_nn.over_arch.state_dict())

    logits = sparse_nn(
        dense_features=dense_features,
        sparse_features=sparse_features,
    )
    logits_nod = sparse_nn_nod(dense_features, *x)

    # Import the module and run it through SHARK.
    mlir_importer = SharkImporter(
        sparse_nn_nod,
        (dense_features, *x),
        frontend="torch",
    )
    (dlrm_mlir, func_name), inputs, golden_out = mlir_importer.import_debug(
        tracing_required=True
    )

    shark_module = SharkInference(
        dlrm_mlir, func_name, device="cpu", mlir_dialect="linalg"
    )
    shark_module.compile()
    result = shark_module.forward(inputs)
    np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)

    # torch.allclose only returns a bool; without the assert a mismatch would
    # pass silently.
    assert torch.allclose(
        logits,
        logits_nod,
        rtol=1e-4,
        atol=1e-4,
    ), "DLRMShark logits differ from torchrec DLRM"


test_dlrm()

View File

@@ -1,272 +0,0 @@
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
import torch
from PIL import Image
from diffusers import LMSDiscreteScheduler
from tqdm.auto import tqdm
from shark.shark_inference import SharkInference
from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions
import torch_mlir
import tempfile
import numpy as np
# pip install diffusers
# pip install scipy
############### Parsing args #####################
import argparse

# Command-line options for the example; parsed once at import time.
p = argparse.ArgumentParser(
    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

p.add_argument(
    "--prompt",
    type=str,
    default="a photograph of an astronaut riding a horse",
    help="the text prompt to use",
)
p.add_argument("--device", type=str, default="cpu", help="the device to use")
p.add_argument(
    "--steps", type=int, default=10, help="the number of denoising steps"
)
p.add_argument(
    "--mlir_loc",
    type=str,
    default=None,
    help="path to a UNet MLIR file to load instead of importing the model",
)
p.add_argument(
    "--vae_loc",
    type=str,
    default=None,
    help="path to a VAE MLIR file to load instead of importing the model",
)

args = p.parse_args()
#####################################################
def load_mlir(mlir_loc):
    """Read a textual MLIR module from disk.

    Args:
        mlir_loc: Path of the MLIR file, or None when no file was supplied.

    Returns:
        The file contents as a string, or None if ``mlir_loc`` is None.
    """
    if mlir_loc is None:
        return None
    print(f"Trying to load the model from {mlir_loc}.")
    # The former single-argument os.path.join() returned the path unchanged,
    # so the path is opened directly.
    with open(mlir_loc) as f:
        return f.read()
def compile_through_fx(model, inputs, mlir_loc=None, extra_args=None):
    """Compile ``model`` with SHARK, importing it through FX if necessary.

    Args:
        model: Callable/module to trace with ``make_fx``.
        inputs: Tuple of example inputs used for tracing and for
            ``torch_mlir.compile``.
        mlir_loc: Optional path of an MLIR file. When given, the file is
            loaded and the FX/torch-mlir import is skipped.
        extra_args: Optional list of extra arguments passed to
            ``SharkInference.compile``. Defaults to an empty list.

    Returns:
        The compiled ``SharkInference`` module.
    """
    # A fresh list per call avoids sharing one mutable default across calls.
    if extra_args is None:
        extra_args = []

    module = load_mlir(mlir_loc)
    if mlir_loc is None:
        fx_g = make_fx(
            model,
            decomposition_table=get_decompositions(
                [
                    torch.ops.aten.embedding_dense_backward,
                    torch.ops.aten.native_layer_norm_backward,
                    torch.ops.aten.slice_backward,
                    torch.ops.aten.select_backward,
                    torch.ops.aten.norm.ScalarOpt_dim,
                    torch.ops.aten.native_group_norm,
                    torch.ops.aten.upsample_bilinear2d.vec,
                    torch.ops.aten.split.Tensor,
                    torch.ops.aten.split_with_sizes,
                ]
            ),
        )(*inputs)

        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
        fx_g.recompile()

        def strip_overloads(gm):
            """
            Modifies the target of graph nodes in :attr:`gm` to strip overloads.
            Args:
                gm(fx.GraphModule): The input Fx graph module to be modified
            """
            for node in gm.graph.nodes:
                if isinstance(node.target, torch._ops.OpOverload):
                    node.target = node.target.overloadpacket
            gm.recompile()

        strip_overloads(fx_g)

        ts_g = torch.jit.script(fx_g)

        module = torch_mlir.compile(
            ts_g,
            inputs,
            torch_mlir.OutputType.LINALG_ON_TENSORS,
            use_tracing=False,
            verbose=False,
        )

    mlir_model = module
    func_name = "forward"

    shark_module = SharkInference(
        mlir_model,
        func_name,
        device=args.device,
        mlir_dialect="tm_tensor",
    )
    shark_module.compile(extra_args)

    return shark_module
if __name__ == "__main__":
    import os

    # Hugging Face access token for the model download. It is read from the
    # environment so that no credential lives in the source tree; None is
    # passed through when the variable is unset.
    # NOTE(review): a real token used to be hard-coded here; it should be
    # treated as leaked and revoked.
    YOUR_TOKEN = os.environ.get("HUGGING_FACE_HUB_TOKEN")

    # 1. The autoencoder that decodes the latents into image space is loaded
    #    inside VaeModel below (a separate, unused copy was loaded here before).

    # 2. Load the tokenizer and text encoder to tokenize and encode the text.
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    text_encoder = CLIPTextModel.from_pretrained(
        "openai/clip-vit-large-patch14"
    )

    class VaeModel(torch.nn.Module):
        """Wrap the VAE so that forward() decodes latents and returns a tensor."""

        def __init__(self):
            super().__init__()
            self.vae = AutoencoderKL.from_pretrained(
                "CompVis/stable-diffusion-v1-4",
                subfolder="vae",
                use_auth_token=YOUR_TOKEN,
            )

        def forward(self, input):
            return self.vae.decode(input, return_dict=False)[0]

    vae = VaeModel()
    vae_input = torch.rand(1, 4, 64, 64)
    shark_vae = compile_through_fx(vae, (vae_input,), args.vae_loc)

    # Wrap the unet model to return tuples.
    class UnetModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.unet = UNet2DConditionModel.from_pretrained(
                "CompVis/stable-diffusion-v1-4",
                subfolder="unet",
                use_auth_token=YOUR_TOKEN,
            )
            self.in_channels = self.unet.in_channels
            self.train(False)

        def forward(self, x, y, z):
            return self.unet.forward(x, y, z, return_dict=False)[0]

    # 3. The UNet model for generating the latents.
    unet = UnetModel()
    # Example inputs used only for tracing/compiling the UNet.
    latent_model_input = torch.rand([2, 4, 64, 64])
    text_embeddings = torch.rand([2, 77, 768])
    shark_unet = compile_through_fx(
        unet,
        (latent_model_input, torch.tensor([1.0]), text_embeddings),
        args.mlir_loc,
        ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
    )

    # torch.jit.script(unet)

    scheduler = LMSDiscreteScheduler(
        beta_start=0.00085,
        beta_end=0.012,
        beta_schedule="scaled_linear",
        num_train_timesteps=1000,
    )

    prompt = [args.prompt]

    height = 512  # default height of Stable Diffusion
    width = 512  # default width of Stable Diffusion

    num_inference_steps = args.steps  # Number of denoising steps

    guidance_scale = 7.5  # Scale for classifier-free guidance

    generator = torch.manual_seed(
        42
    )  # Seed generator to create the initial latent noise

    batch_size = len(prompt)

    text_input = tokenizer(
        prompt,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )

    text_embeddings = text_encoder(text_input.input_ids)[0]

    max_length = text_input.input_ids.shape[-1]
    uncond_input = tokenizer(
        [""] * batch_size,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
    uncond_embeddings = text_encoder(uncond_input.input_ids)[0]

    # Unconditional embeddings first, prompt embeddings second; the noise
    # prediction is split in the same order below.
    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

    latents = torch.randn(
        (batch_size, unet.in_channels, height // 8, width // 8),
        generator=generator,
    )
    # latents = latents.to(torch_device)

    scheduler.set_timesteps(num_inference_steps)

    latents = latents * scheduler.sigmas[0]
    # print(latents, latents.shape)

    for i, t in tqdm(enumerate(scheduler.timesteps)):

        print(f"i = {i} t = {t}")
        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
        latent_model_input = torch.cat([latents] * 2)
        sigma = scheduler.sigmas[i]
        latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)

        # predict the noise residual

        # with torch.no_grad():
        # noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)

        latent_model_input_numpy = latent_model_input.detach().numpy()
        text_embeddings_numpy = text_embeddings.detach().numpy()

        noise_pred = shark_unet.forward(
            (
                latent_model_input_numpy,
                np.array([t]).astype(np.float32),
                text_embeddings_numpy,
            )
        )
        noise_pred = torch.from_numpy(noise_pred)

        # perform guidance
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (
            noise_pred_text - noise_pred_uncond
        )

        # compute the previous noisy sample x_t -> x_t-1
        latents = scheduler.step(noise_pred, i, latents)["prev_sample"]

    # print("Latents shape : ", latents.shape)

    # scale and decode the image latents with vae
    latents = 1 / 0.18215 * latents
    latents_numpy = latents.detach().numpy()
    image = shark_vae.forward((latents_numpy,))
    image = torch.from_numpy(image)

    # Map from [-1, 1] to [0, 1], move channels last and convert to uint8.
    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
    images = (image * 255).round().astype("uint8")
    pil_images = [Image.fromarray(image) for image in images]
    pil_images[0].save("astro.jpg")

View File

@@ -1,280 +0,0 @@
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
import torch
from PIL import Image
from diffusers import LMSDiscreteScheduler
from tqdm.auto import tqdm
from shark.shark_inference import SharkInference
from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions
import torch_mlir
import tempfile
import numpy as np
# pip install diffusers
# pip install scipy
############### Parsing args #####################
import argparse

# Command-line options for the example; parsed once at import time.
p = argparse.ArgumentParser(
    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

p.add_argument(
    "--prompt",
    type=str,
    default="a photograph of an astronaut riding a horse",
    help="the text prompt to use",
)
p.add_argument("--device", type=str, default="cpu", help="the device to use")
p.add_argument(
    "--steps", type=int, default=50, help="the number of denoising steps"
)
p.add_argument(
    "--mlir_loc",
    type=str,
    default=None,
    help="path to a UNet MLIR file (not read by this script)",
)
p.add_argument(
    "--vae_loc",
    type=str,
    default=None,
    help="path to a VAE MLIR file to load instead of importing the model",
)

args = p.parse_args()
#####################################################
def fp16_unet():
    """Download the fp16 UNet from the shark tank and compile it.

    Returns:
        A compiled ``SharkInference`` module targeting ``args.device``.
    """
    from shark.shark_downloader import download_model

    # Only the MLIR text and the entry-point name are needed here; the
    # downloaded sample inputs and golden outputs are ignored.
    downloaded = download_model(
        "stable_diff_f16_18_OCT",
        tank_url="gs://shark_tank/prashant_nod",
        frontend="torch",
    )
    mlir_model, func_name, _inputs, _golden_out = downloaded

    unet_module = SharkInference(
        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
    )
    unet_module.compile()
    return unet_module
def load_mlir(mlir_loc):
    """Read a textual MLIR module from disk.

    Args:
        mlir_loc: Path of the MLIR file, or None when no file was supplied.

    Returns:
        The file contents as a string, or None if ``mlir_loc`` is None.
    """
    if mlir_loc is None:
        return None
    print(f"Trying to load the model from {mlir_loc}.")
    # The former single-argument os.path.join() returned the path unchanged,
    # so the path is opened directly.
    with open(mlir_loc) as f:
        return f.read()
def compile_through_fx(model, inputs, mlir_loc=None):
    """Compile ``model`` with SHARK, importing it through FX if necessary.

    Args:
        model: Callable/module to trace with ``make_fx``.
        inputs: Tuple of example inputs used for tracing and for
            ``torch_mlir.compile``.
        mlir_loc: Optional path of an MLIR file. When given, the file is
            loaded and the FX/torch-mlir import is skipped.

    Returns:
        The compiled ``SharkInference`` module.
    """
    module = load_mlir(mlir_loc)
    # Identity comparison is the idiomatic None check.
    if mlir_loc is None:
        fx_g = make_fx(
            model,
            decomposition_table=get_decompositions(
                [
                    torch.ops.aten.embedding_dense_backward,
                    torch.ops.aten.native_layer_norm_backward,
                    torch.ops.aten.slice_backward,
                    torch.ops.aten.select_backward,
                    torch.ops.aten.norm.ScalarOpt_dim,
                    torch.ops.aten.native_group_norm,
                    torch.ops.aten.upsample_bilinear2d.vec,
                    torch.ops.aten.split.Tensor,
                    torch.ops.aten.split_with_sizes,
                ]
            ),
        )(*inputs)

        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
        fx_g.recompile()

        def strip_overloads(gm):
            """
            Modifies the target of graph nodes in :attr:`gm` to strip overloads.
            Args:
                gm(fx.GraphModule): The input Fx graph module to be modified
            """
            for node in gm.graph.nodes:
                if isinstance(node.target, torch._ops.OpOverload):
                    node.target = node.target.overloadpacket
            gm.recompile()

        strip_overloads(fx_g)

        ts_g = torch.jit.script(fx_g)

        module = torch_mlir.compile(
            ts_g,
            inputs,
            torch_mlir.OutputType.LINALG_ON_TENSORS,
            use_tracing=False,
            verbose=False,
        )

    mlir_model = module
    func_name = "forward"

    shark_module = SharkInference(
        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
    )
    shark_module.compile()

    return shark_module
if __name__ == "__main__":
    import os

    # Hugging Face access token for the model download. It is read from the
    # environment so that no credential lives in the source tree; None is
    # passed through when the variable is unset.
    # NOTE(review): a real token used to be hard-coded here; it should be
    # treated as leaked and revoked.
    YOUR_TOKEN = os.environ.get("HUGGING_FACE_HUB_TOKEN")

    # 1. The autoencoder that decodes the latents into image space is loaded
    #    inside VaeModel below (a separate, unused copy was loaded here before).

    # 2. Load the tokenizer and text encoder to tokenize and encode the text.
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    text_encoder = CLIPTextModel.from_pretrained(
        "openai/clip-vit-large-patch14"
    )

    class VaeModel(torch.nn.Module):
        """Wrap the VAE so that forward() decodes latents and returns a tensor."""

        def __init__(self):
            super().__init__()
            self.vae = AutoencoderKL.from_pretrained(
                "CompVis/stable-diffusion-v1-4",
                subfolder="vae",
                use_auth_token=YOUR_TOKEN,
            )

        def forward(self, input):
            return self.vae.decode(input, return_dict=False)[0]

    vae = VaeModel()
    vae_input = torch.rand(1, 4, 64, 64)
    shark_vae = compile_through_fx(vae, (vae_input,), args.vae_loc)

    # Wrap the unet model to return tuples.
    class UnetModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.unet = UNet2DConditionModel.from_pretrained(
                "CompVis/stable-diffusion-v1-4",
                subfolder="unet",
                use_auth_token=YOUR_TOKEN,
            )
            self.in_channels = self.unet.in_channels
            self.train(False)

        def forward(self, x, y, z):
            return self.unet.forward(x, y, z, return_dict=False)[0]

    # 3. The UNet model for generating the latents. The torch module is only
    #    used for its `in_channels`; inference runs on the fp16 SHARK module.
    unet = UnetModel()
    shark_unet = fp16_unet()

    scheduler = LMSDiscreteScheduler(
        beta_start=0.00085,
        beta_end=0.012,
        beta_schedule="scaled_linear",
        num_train_timesteps=1000,
    )

    prompt = [args.prompt]

    height = 512  # default height of Stable Diffusion
    width = 512  # default width of Stable Diffusion

    num_inference_steps = args.steps  # Number of denoising steps

    guidance_scale = 7.5  # Scale for classifier-free guidance

    generator = torch.manual_seed(
        42
    )  # Seed generator to create the initial latent noise

    batch_size = len(prompt)

    text_input = tokenizer(
        prompt,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )

    text_embeddings = text_encoder(text_input.input_ids)[0]

    max_length = text_input.input_ids.shape[-1]
    uncond_input = tokenizer(
        [""] * batch_size,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
    uncond_embeddings = text_encoder(uncond_input.input_ids)[0]

    # Unconditional embeddings first, prompt embeddings second; the noise
    # prediction is split in the same order below.
    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

    latents = torch.randn(
        (batch_size, unet.in_channels, height // 8, width // 8),
        generator=generator,
    )
    # latents = latents.to(torch_device)

    scheduler.set_timesteps(num_inference_steps)

    latents = latents * scheduler.sigmas[0]
    # print(latents, latents.shape)

    for i, t in tqdm(enumerate(scheduler.timesteps)):

        print(f"i = {i} t = {t}")
        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
        latent_model_input = torch.cat([latents] * 2)
        sigma = scheduler.sigmas[i]
        latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)

        # predict the noise residual

        # with torch.no_grad():
        # noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)

        # The SHARK UNet is fed half-precision inputs.
        latent_model_input_numpy = (
            latent_model_input.detach().numpy().astype(np.half)
        )
        text_embeddings_numpy = (
            text_embeddings.detach().numpy().astype(np.half)
        )

        noise_pred = shark_unet.forward(
            (
                latent_model_input_numpy,
                np.array([t]).astype(np.half),
                text_embeddings_numpy,
            )
        )
        # Back to float32 for the guidance and scheduler arithmetic.
        noise_pred = torch.from_numpy(noise_pred).to(torch.float32)

        # perform guidance
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (
            noise_pred_text - noise_pred_uncond
        )

        # compute the previous noisy sample x_t -> x_t-1
        latents = scheduler.step(noise_pred, i, latents)["prev_sample"]

    # print("Latents shape : ", latents.shape)

    # scale and decode the image latents with vae
    latents = 1 / 0.18215 * latents
    latents_numpy = latents.detach().numpy()
    image = shark_vae.forward((latents_numpy,))
    image = torch.from_numpy(image)

    # Map from [-1, 1] to [0, 1], move channels last and convert to uint8.
    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
    images = (image * 255).round().astype("uint8")
    pil_images = [Image.fromarray(image) for image in images]
    pil_images[0].save("astro.jpg")

View File

@@ -1,313 +0,0 @@
import math
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras_cv.models.generative.stable_diffusion.clip_tokenizer import (
SimpleTokenizer,
)
from keras_cv.models.generative.stable_diffusion.constants import (
_ALPHAS_CUMPROD,
)
from keras_cv.models.generative.stable_diffusion.constants import (
_UNCONDITIONAL_TOKENS,
)
from keras_cv.models.generative.stable_diffusion.decoder import Decoder
from keras_cv.models.generative.stable_diffusion.text_encoder import (
TextEncoder,
)
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_model
from PIL import Image
# pip install "git+https://github.com/keras-team/keras-cv.git"
# pip install tensorflow_dataset
############### Parsing args #####################
import argparse

# Command-line interface of the example; options are parsed at import time.
p = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    description=__doc__,
)

# (flag, keyword arguments) pairs, registered in declaration order.
for _flag, _options in (
    (
        "--prompt",
        dict(
            type=str,
            default="a photograph of an astronaut riding a horse",
            help="the text prompt to use",
        ),
    ),
    ("--device", dict(type=str, default="cpu", help="the device to use")),
    (
        "--steps",
        dict(type=int, default=10, help="the number of steps to use"),
    ),
    (
        "--save_path",
        dict(
            type=str,
            default=None,
            help="the file to save the resulting image to. (default to <input prompt>.jpg)",
        ),
    ),
):
    p.add_argument(_flag, **_options)

args = p.parse_args()
#####################################################
MAX_PROMPT_LENGTH = 77
class SharkStableDiffusion:
    """Shark implementation of Stable Diffusion based on model from keras_cv.

    The tokenizer, text encoder and decoder are the keras_cv components; the
    diffusion model is downloaded as MLIR and executed through SharkInference.

    Arguments:
        device: Device to use with SHARK. Default: cpu
        jit_compile: Whether to compile the text encoder and decoder with
            ``jit_compile=True``. Default: True.

    References:
    - [About Stable Diffusion](https://stability.ai/blog/stable-diffusion-announcement)
    - [Original implementation](https://github.com/CompVis/stable-diffusion)
    """

    def __init__(self, device="cpu", jit_compile=True):
        self.img_height = 512
        self.img_width = 512
        self.tokenizer = SimpleTokenizer()

        # Create models
        self.text_encoder = TextEncoder(MAX_PROMPT_LENGTH)

        # Only the MLIR module and its entry point are needed from the
        # download; sample inputs and golden outputs are ignored.
        mlir_model, func_name, _inputs, _golden_out = download_model(
            "stable_diff", tank_url="gs://shark_tank/quinn", frontend="tf"
        )
        diffusion_module = SharkInference(
            mlir_model, func_name, device=device, mlir_dialect="mhlo"
        )
        diffusion_module.compile()
        self.diffusion_model = diffusion_module

        self.decoder = Decoder(self.img_height, self.img_width)
        if jit_compile:
            self.text_encoder.compile(jit_compile=True)
            self.decoder.compile(jit_compile=True)

        print(
            "By using this model checkpoint, you acknowledge that its usage is "
            "subject to the terms of the CreativeML Open RAIL-M license at "
            "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/LICENSE"
        )

        # Load weights
        encoder_weights = keras.utils.get_file(
            origin="https://huggingface.co/fchollet/stable-diffusion/resolve/main/kcv_encoder.h5",
            file_hash="4789e63e07c0e54d6a34a29b45ce81ece27060c499a709d556c7755b42bb0dc4",
        )
        decoder_weights = keras.utils.get_file(
            origin="https://huggingface.co/fchollet/stable-diffusion/resolve/main/kcv_decoder.h5",
            file_hash="ad350a65cc8bc4a80c8103367e039a3329b4231c2469a1093869a345f55b1962",
        )
        self.text_encoder.load_weights(encoder_weights)
        self.decoder.load_weights(decoder_weights)

    def text_to_image(
        self,
        prompt,
        batch_size=1,
        num_steps=25,
        unconditional_guidance_scale=7.5,
        seed=None,
    ):
        """Encode ``prompt`` and generate images from the encoding.

        Thin convenience wrapper: calls ``encode_text`` and forwards the
        result, together with the remaining arguments, to ``generate_image``.
        """
        return self.generate_image(
            self.encode_text(prompt),
            batch_size=batch_size,
            num_steps=num_steps,
            unconditional_guidance_scale=unconditional_guidance_scale,
            seed=seed,
        )

    def encode_text(self, prompt):
        """Encodes a prompt into a latent text encoding.

        The returned encoding is meant to be passed as the `encoded_text`
        argument of `generate_image`. Encoding text separately from generating
        an image makes it possible to modify the encoding prior to image
        generation, e.g. for walking between two prompts.

        Args:
            prompt: a string to encode, must be 77 tokens or shorter.

        Raises:
            ValueError: if the tokenized prompt is longer than
                MAX_PROMPT_LENGTH tokens.

        Example:

        ```python
        model = SharkStableDiffusion(device="cpu")
        encoded_text = model.encode_text("Tacos at dawn")
        img = model.generate_image(encoded_text)
        ```
        """
        # Tokenize prompt (i.e. starting context)
        tokens = self.tokenizer.encode(prompt)
        if len(tokens) > MAX_PROMPT_LENGTH:
            raise ValueError(
                f"Prompt is too long (should be <= {MAX_PROMPT_LENGTH} tokens)"
            )

        # Right-pad with token id 49407 up to the fixed prompt length.
        padding = [49407] * (MAX_PROMPT_LENGTH - len(tokens))
        phrase = tf.convert_to_tensor([tokens + padding], dtype=tf.int32)

        return self.text_encoder.predict_on_batch(
            [phrase, self._get_pos_ids()]
        )

    def generate_image(
        self,
        encoded_text,
        batch_size=1,
        num_steps=25,
        unconditional_guidance_scale=7.5,
        diffusion_noise=None,
        seed=None,
    ):
        """Generates an image based on encoded text.

        The encoding passed to this method should be derived from
        `encode_text`.

        Args:
            encoded_text: Tensor of shape (`batch_size`, 77, 768), or a Tensor
                of shape (77, 768). When the batch axis is omitted, the same
                encoded text will be used to produce every generated image.
            batch_size: number of images to generate. Default: 1.
            num_steps: number of diffusion steps (controls image quality).
                Default: 25.
            unconditional_guidance_scale: float controlling how closely the
                image should adhere to the prompt. Larger values result in
                more closely adhering to the prompt, but will make the image
                noisier. Default: 7.5.
            diffusion_noise: Tensor of shape (`batch_size`, img_height // 8,
                img_width // 8, 4), or a Tensor of shape (img_height // 8,
                img_width // 8, 4). Optional custom noise to seed the
                diffusion process. When the batch axis is omitted, the same
                noise will be used to seed diffusion for every generated
                image.
            seed: integer which is used to seed the random generation of
                diffusion noise, only to be specified if `diffusion_noise` is
                None.

        Returns:
            A uint8 array of decoded images with values in [0, 255].

        Raises:
            ValueError: if both `diffusion_noise` and `seed` are given.

        Example:

        ```python
        batch_size = 8
        model = SharkStableDiffusion(device="cpu")
        e_tacos = model.encode_text("Tacos at dawn")
        e_watermelons = model.encode_text("Watermelons at dusk")
        e_interpolated = tf.linspace(e_tacos, e_watermelons, batch_size)
        images = model.generate_image(e_interpolated, batch_size=batch_size)
        ```
        """
        if diffusion_noise is not None and seed is not None:
            raise ValueError(
                "`diffusion_noise` and `seed` should not both be passed to "
                "`generate_image`. `seed` is only used to generate diffusion "
                "noise when it's not already user-specified."
            )

        # Broadcast an un-batched encoding to the requested batch size.
        encoded_text = tf.squeeze(encoded_text)
        if encoded_text.shape.rank == 2:
            encoded_text = tf.repeat(
                tf.expand_dims(encoded_text, axis=0), batch_size, axis=0
            )

        # Conditional context first, unconditional context second; the model
        # output is split in the same order inside the loop.
        unconditional_context = tf.repeat(
            self._get_unconditional_context(), batch_size, axis=0
        )
        context = tf.concat([encoded_text, unconditional_context], 0)

        if diffusion_noise is None:
            latent = self._get_initial_diffusion_noise(batch_size, seed)
        else:
            diffusion_noise = tf.squeeze(diffusion_noise)
            if diffusion_noise.shape.rank == 3:
                diffusion_noise = tf.repeat(
                    tf.expand_dims(diffusion_noise, axis=0), batch_size, axis=0
                )
            latent = diffusion_noise

        # Iterative reverse diffusion stage
        timesteps = tf.range(1, 1000, 1000 // num_steps)
        alphas, alphas_prev = self._get_initial_alphas(timesteps)
        progbar = keras.utils.Progbar(len(timesteps))
        schedule = list(enumerate(timesteps))
        for done, (index, timestep) in enumerate(reversed(schedule), start=1):
            latent_prev = latent  # Set aside the previous latent vector
            t_emb = self._get_timestep_embedding(timestep, batch_size)

            # Prepare the latent and unconditional latent to be run with a single forward call
            latent = tf.concat([latent, latent], 0)
            t_emb = tf.concat([t_emb, t_emb], 0)
            model_output = self.diffusion_model.forward(
                [latent.numpy(), t_emb.numpy(), context.numpy()]
            )
            latent = tf.convert_to_tensor(model_output, dtype=tf.float32)

            # Classifier-free guidance between the two halves of the batch.
            latent, unconditional_latent = tf.split(latent, 2)
            latent = unconditional_latent + unconditional_guidance_scale * (
                latent - unconditional_latent
            )

            a_t = alphas[index]
            a_prev = alphas_prev[index]
            pred_x0 = (latent_prev - math.sqrt(1 - a_t) * latent) / math.sqrt(
                a_t
            )
            latent = (
                latent * math.sqrt(1.0 - a_prev) + math.sqrt(a_prev) * pred_x0
            )
            progbar.update(done)

        # Decoding stage
        decoded = self.decoder.predict_on_batch(latent)
        decoded = ((decoded + 1) / 2) * 255
        return np.clip(decoded, 0, 255).astype("uint8")

    def _get_unconditional_context(self):
        """Encode the unconditional tokens with the text encoder."""
        tokens = tf.convert_to_tensor([_UNCONDITIONAL_TOKENS], dtype=tf.int32)
        return self.text_encoder.predict_on_batch(
            [tokens, self._get_pos_ids()]
        )

    def _get_timestep_embedding(
        self, timestep, batch_size, dim=320, max_period=10000
    ):
        """Return the cos/sin embedding of ``timestep`` repeated per batch item."""
        half = dim // 2
        freqs = tf.math.exp(
            -math.log(max_period) * tf.range(0, half, dtype=tf.float32) / half
        )
        angles = tf.convert_to_tensor([timestep], dtype=tf.float32) * freqs
        embedding = tf.reshape(
            tf.concat([tf.math.cos(angles), tf.math.sin(angles)], 0), [1, -1]
        )
        return tf.repeat(embedding, batch_size, axis=0)

    def _get_initial_alphas(self, timesteps):
        """Look up the alphas for ``timesteps`` and their predecessors.

        The predecessor list is the alpha list shifted right by one, with 1.0
        in front.
        """
        alphas = [_ALPHAS_CUMPROD[t] for t in timesteps]
        return alphas, [1.0] + alphas[:-1]

    def _get_initial_diffusion_noise(self, batch_size, seed):
        """Sample normal latent noise at one eighth of the image resolution."""
        noise_shape = (
            batch_size,
            self.img_height // 8,
            self.img_width // 8,
            4,
        )
        return tf.random.normal(noise_shape, seed=seed)

    @staticmethod
    def _get_pos_ids():
        """Return position ids 0..MAX_PROMPT_LENGTH-1 with a leading batch axis."""
        return tf.convert_to_tensor(
            [list(range(MAX_PROMPT_LENGTH))], dtype=tf.int32
        )
if __name__ == "__main__":
    # Generate images for the prompt and save the first one.
    SD = SharkStableDiffusion(device=args.device)
    images = SD.text_to_image(args.prompt, num_steps=args.steps)
    pil_images = [Image.fromarray(image) for image in images]

    # Fall back to "<prompt>.jpg" when no explicit path was requested.
    if args.save_path is None:
        save_fname = args.prompt + ".jpg"
    else:
        save_fname = args.save_path
    pil_images[0].save(save_fname)

View File

@@ -1,2 +0,0 @@
*.vmfb
*.jpg

View File

@@ -1,44 +0,0 @@
# STABLE DIFFUSION
## Installation
Follow setup instructions in the main [README.md](https://github.com/nod-ai/SHARK#readme) for regular usage.
## Debug commands and other advanced usage follow.
```shell
python main.py --precision="fp32"|"fp16" --device="cpu"|"cuda"|"vulkan" --import_mlir|--no-import_mlir --prompt "enter the text"
```
## Dump all dispatch .spv and ISA files using amdllpc
```shell
python main.py --precision="fp16" --device="vulkan" --iree-vulkan-target-triple=rdna3-unknown-linux --no-load_vmfb --dispatch_benchmarks="all" --dispatch_benchmarks_dir="SD_dispatches" --dump_isa
```
## Compile and save the .vmfb (using vulkan fp16 as an example):
```shell
python shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb
```
## Capture an RGP trace
```shell
python shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb --enable_rgp
```
## Run the vae module with iree-benchmark-module (NCHW, fp16, vulkan, for example):
```shell
iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --device=vulkan --function_input=1x4x64x64xf16
```
## Run the unet module with iree-benchmark-module (same config as above):
```shell
##if you want to use .npz inputs:
unzip ~/.local/shark_tank/<your unet>/inputs.npz
iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --function_input=@arr_0.npy --function_input=1xf16 --function_input=@arr_2.npy --function_input=@arr_3.npy --function_input=@arr_4.npy
```

View File

@@ -1,25 +0,0 @@
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel
# Load the pretrained CLIP checkpoint together with its matching processor.
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Fetch the sample image over HTTP and open it directly from the raw stream.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)

# Candidate captions that will be scored against the image.
captions = ["a photo of a cat", "a photo of a dog"]
inputs = processor(
    text=captions, images=image, return_tensors="pt", padding=True
)

outputs = model(**inputs)

# Image-text similarity scores.
logits_per_image = outputs.logits_per_image

# A softmax over dim 1 turns the scores into label probabilities.
probs = logits_per_image.softmax(dim=1)

View File

@@ -1,188 +0,0 @@
from transformers import CLIPTextModel, CLIPTokenizer
import torch
from PIL import Image
from diffusers import (
LMSDiscreteScheduler,
PNDMScheduler,
DDIMScheduler,
DPMSolverMultistepScheduler,
EulerDiscreteScheduler,
)
from tqdm.auto import tqdm
import numpy as np
from stable_args import args
from utils import get_shark_model, set_iree_runtime_flags
from opt_params import get_unet, get_vae, get_clip
import time
from model_wrappers import get_vae_mlir
from shark.iree_utils.compile_utils import dump_isas
# Helper function to profile the vulkan device.
def start_profiling(file_path="foo.rdc", profiling_mode="queue"):
    """Begin a profiling capture on the Vulkan device, if profiling is enabled.

    Profiling only starts when ``args.vulkan_debug_utils`` is set and
    ``args.device`` contains ``"vulkan"``.

    Args:
        file_path: Capture file path handed to the device's ``begin_profiling``.
        profiling_mode: Mode string handed to the device's ``begin_profiling``.

    Returns:
        The device object on which profiling was started, or ``None`` when
        profiling is disabled. Pass the result to ``end_profiling``.
    """
    if args.vulkan_debug_utils and "vulkan" in args.device:
        # Import the submodule explicitly: a bare `import iree` does not by
        # itself load `iree.runtime`, so the attribute access below would
        # only work if some other module had already imported it.
        import iree.runtime

        print(f"Profiling and saving to {file_path}.")
        vulkan_device = iree.runtime.get_device(args.device)
        vulkan_device.begin_profiling(mode=profiling_mode, file_path=file_path)
        return vulkan_device
    return None
def end_profiling(device):
    """Stop a profiling capture started by ``start_profiling``.

    Args:
        device: The object returned by ``start_profiling``. Any falsy value
            (such as the ``None`` returned when profiling is disabled)
            makes this call a no-op.

    Returns:
        The result of ``device.end_profiling()``, or ``None`` for a falsy
        ``device``.
    """
    if not device:
        return None
    return device.end_profiling()
if __name__ == "__main__":
    # End-to-end text-to-image run: encode the prompt with CLIP, run the
    # UNet denoising loop driven by the scheduler, decode the latents with
    # the VAE, print timing information and save one JPEG per prompt.

    # Torch dtype used for the embeddings, latents and timesteps below.
    dtype = torch.float32 if args.precision == "fp32" else torch.half

    prompt = args.prompts
    height = 512  # default height of Stable Diffusion
    width = 512  # default width of Stable Diffusion
    if args.version == "v2":
        height = 768
        width = 768

    num_inference_steps = args.steps  # Number of denoising steps

    # Scale for classifier-free guidance
    guidance_scale = torch.tensor(args.guidance_scale).to(torch.float32)

    generator = torch.manual_seed(
        args.seed
    )  # Seed generator to create the initial latent noise

    batch_size = len(prompt)

    # Runtime flags are set before any of the modules are obtained.
    set_iree_runtime_flags()
    unet = get_unet()
    vae = get_vae()
    clip = get_clip()
    if args.dump_isa:
        dump_isas(args.dispatch_benchmarks_dir)

    # Default tokenizer/scheduler; replaced below for the "v2" and
    # "v2.1base" versions.
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    scheduler = DPMSolverMultistepScheduler.from_pretrained(
        "CompVis/stable-diffusion-v1-4",
        subfolder="scheduler",
    )

    if args.version == "v2":
        tokenizer = CLIPTokenizer.from_pretrained(
            "stabilityai/stable-diffusion-2", subfolder="tokenizer"
        )
        scheduler = DPMSolverMultistepScheduler.from_pretrained(
            "stabilityai/stable-diffusion-2",
            subfolder="scheduler",
        )

    if args.version == "v2.1base":
        tokenizer = CLIPTokenizer.from_pretrained(
            "stabilityai/stable-diffusion-2-1-base", subfolder="tokenizer"
        )
        scheduler = EulerDiscreteScheduler.from_pretrained(
            "stabilityai/stable-diffusion-2-1-base",
            subfolder="scheduler",
        )

    # `start` marks the beginning of the "total image generation runtime"
    # reported at the end; model loading above is not included.
    start = time.time()

    text_input = tokenizer(
        prompt,
        padding="max_length",
        max_length=args.max_length,
        truncation=True,
        return_tensors="pt",
    )

    # Encode the prompt, timing only the CLIP forward call.
    clip_inf_start = time.time()
    text_embeddings = clip.forward((text_input.input_ids,))
    clip_inf_end = time.time()
    text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
    max_length = text_input.input_ids.shape[-1]

    # Encode one empty prompt per batch entry, padded to the same length.
    uncond_input = tokenizer(
        [""] * batch_size,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
    uncond_clip_inf_start = time.time()
    uncond_embeddings = clip.forward((uncond_input.input_ids,))
    uncond_clip_inf_end = time.time()
    uncond_embeddings = torch.from_numpy(uncond_embeddings).to(dtype)

    # Unconditional embeddings first, prompt embeddings second.
    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

    # Initial noise is drawn in float32 from the seeded generator and then
    # cast to the working dtype.
    latents = torch.randn(
        (batch_size, 4, height // 8, width // 8),
        generator=generator,
        dtype=torch.float32,
    ).to(dtype)

    scheduler.set_timesteps(num_inference_steps)
    # NOTE(review): presumably set to silence a scheduler check that
    # scale_model_input() has been called - confirm against diffusers.
    scheduler.is_scale_input_called = True

    latents = latents * scheduler.init_noise_sigma
    text_embeddings_numpy = text_embeddings.detach().numpy()
    # Accumulates per-step wall time in seconds; converted to an average in
    # milliseconds after the loop.
    avg_ms = 0

    for i, t in tqdm(enumerate(scheduler.timesteps)):
        step_start = time.time()
        print(f"i = {i} t = {t}", end="")
        timestep = torch.tensor([t]).to(dtype).detach().numpy()
        latent_model_input = scheduler.scale_model_input(latents, t)
        latents_numpy = latent_model_input.detach().numpy()

        # Only the UNet forward call is wrapped by the optional profiler.
        profile_device = start_profiling(file_path="unet.rdc")

        noise_pred = unet.forward(
            (
                latents_numpy,
                timestep,
                text_embeddings_numpy,
                guidance_scale,
            )
        )

        end_profiling(profile_device)
        noise_pred = torch.from_numpy(noise_pred)
        # The step time is taken before scheduler.step(), so the scheduler
        # update is not part of the reported per-step time.
        step_time = time.time() - step_start
        avg_ms += step_time
        step_ms = int((step_time) * 1000)
        print(f" ({step_ms}ms)")

        latents = scheduler.step(noise_pred, t, latents).prev_sample

    # NOTE(review): divides by args.steps rather than the number of loop
    # iterations - assumes the scheduler yields exactly args.steps timesteps.
    avg_ms = 1000 * avg_ms / args.steps
    print(f"Average step time: {avg_ms}ms/it")

    # scale and decode the image latents with vae
    latents = 1 / 0.18215 * latents
    latents_numpy = latents.detach().numpy()
    profile_device = start_profiling(file_path="vae.rdc")
    vae_start = time.time()
    image = vae.forward((latents_numpy,))
    vae_end = time.time()
    end_profiling(profile_device)

    # Move the channel axis last and scale to 0..255 before converting to
    # uint8 for PIL.
    image = torch.from_numpy(image)
    image = image.detach().cpu().permute(0, 2, 3, 1) * 255.0
    images = image.numpy().round().astype("uint8")
    total_end = time.time()

    # Timing summary; all values are in milliseconds except the total.
    clip_inf_time = (clip_inf_end - clip_inf_start) * 1000
    uncond_clip_inf_time = (uncond_clip_inf_end - uncond_clip_inf_start) * 1000
    avg_clip_inf = (clip_inf_time + uncond_clip_inf_time) / 2
    vae_inf_time = (vae_end - vae_start) * 1000
    print(
        f"Clip Inference Avg time (ms) = ({clip_inf_time:.3f} + {uncond_clip_inf_time:.3f}) / 2 = {avg_clip_inf:.3f}"
    )
    print(f"VAE Inference time (ms): {vae_inf_time:.3f}")
    print(f"Total image generation runtime (s): {total_end - start:.4f}")

    # One output file per prompt, named "<prompt>_<index>.jpg" in the
    # current working directory.
    pil_images = [Image.fromarray(image) for image in images]
    for i in range(batch_size):
        pil_images[i].save(f"{args.prompts[i]}_{i}.jpg")

View File

@@ -1,184 +0,0 @@
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
from transformers import CLIPTextModel
from utils import compile_through_fx
from stable_args import args
import torch
# Number of prompts given on the command line.
BATCH_SIZE = len(args.prompts)

# Maps each supported `--version` value to the model id that the wrappers
# below pass to `from_pretrained`.
model_config = {
    "v2": "stabilityai/stable-diffusion-2",
    "v2.1base": "stabilityai/stable-diffusion-2-1-base",
    "v1.4": "CompVis/stable-diffusion-v1-4",
}

# Sample inputs, per version and per sub-model, that the wrappers below hand
# to `compile_through_fx` together with the torch module.
model_input = {
    "v2": {
        "clip": (torch.randint(1, 2, (1, 77)),),
        "vae": (torch.randn(1, 4, 96, 96),),
        "unet": (
            torch.randn(1, 4, 96, 96),  # latents
            torch.tensor([1]).to(torch.float32),  # timestep
            torch.randn(2, 77, 1024),  # embedding
            torch.tensor(1).to(torch.float32),  # guidance_scale
        ),
    },
    "v2.1base": {
        "clip": (torch.randint(1, 2, (1, 77)),),
        "vae": (torch.randn(1, 4, 64, 64),),
        "unet": (
            torch.randn(1, 4, 64, 64),  # latents
            torch.tensor([1]).to(torch.float32),  # timestep
            torch.randn(2, 77, 1024),  # embedding
            torch.tensor(1).to(torch.float32),  # guidance_scale
        ),
    },
    "v1.4": {
        "clip": (torch.randint(1, 2, (1, 77)),),
        "vae": (torch.randn(1, 4, 64, 64),),
        "unet": (
            torch.randn(1, 4, 64, 64),  # latents
            torch.tensor([1]).to(torch.float32),  # timestep
            torch.randn(2, 77, 768),  # embedding
            torch.tensor(1).to(torch.float32),  # guidance_scale
        ),
    },
}

# revision param for from_pretrained defaults to "main" => fp32
model_revision = "fp16" if args.precision == "fp16" else "main"
def get_clip_mlir(model_name="clip_text", extra_args=[]):
    """Build the CLIP text encoder for ``args.version`` and compile it.

    Args:
        model_name: Name forwarded to ``compile_through_fx``.
        extra_args: Extra compile flags forwarded unchanged to
            ``compile_through_fx``.

    Returns:
        The module returned by ``compile_through_fx`` for the text encoder.
    """
    if args.version == "v1.4":
        text_encoder = CLIPTextModel.from_pretrained(
            "openai/clip-vit-large-patch14"
        )
    else:
        # Every non-v1.4 version loads the text encoder from its own model
        # repository. Previously only "v2" did so, which left "v2.1base"
        # on the v1 encoder even though its UNet sample input in
        # `model_input` uses the same 1024-wide embedding as "v2".
        text_encoder = CLIPTextModel.from_pretrained(
            model_config[args.version], subfolder="text_encoder"
        )

    class CLIPText(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.text_encoder = text_encoder

        def forward(self, input):
            # Only the first element of the encoder output is returned.
            return self.text_encoder(input)[0]

    clip_model = CLIPText()
    shark_clip = compile_through_fx(
        clip_model,
        model_input[args.version]["clip"],
        model_name=model_name,
        extra_args=extra_args,
    )
    return shark_clip
def get_vae_mlir(model_name="vae", extra_args=[]):
    """Wrap the VAE decoder for ``args.version`` and compile it.

    For ``fp16`` precision the module and its sample inputs are converted
    to half precision and moved to CUDA before compilation.

    Args:
        model_name: Name forwarded to ``compile_through_fx``.
        extra_args: Extra compile flags forwarded to ``compile_through_fx``.

    Returns:
        The module returned by ``compile_through_fx``.
    """

    class VaeModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.vae = AutoencoderKL.from_pretrained(
                model_config[args.version],
                subfolder="vae",
                revision=model_revision,
            )

        def forward(self, input):
            # Decode, then map the result through x / 2 + 0.5 and clamp it
            # to the [0, 1] interval.
            decoded = self.vae.decode(input, return_dict=False)[0]
            return (decoded / 2 + 0.5).clamp(0, 1)

    vae = VaeModel()
    if args.precision != "fp16":
        sample_inputs = model_input[args.version]["vae"]
    else:
        vae = vae.half().cuda()
        sample_inputs = tuple(
            tensor.half().cuda()
            for tensor in model_input[args.version]["vae"]
        )

    return compile_through_fx(
        vae,
        sample_inputs,
        model_name=model_name,
        extra_args=extra_args,
    )
def get_vae_encode_mlir(model_name="vae_encode", extra_args=[]):
    """Wrap the VAE encoder for ``args.version`` and compile it.

    This wrapper always loads the ``fp16`` revision and always converts the
    module and its sample inputs to half precision on CUDA, regardless of
    ``args.precision``.

    Args:
        model_name: Name forwarded to ``compile_through_fx``.
        extra_args: Extra compile flags forwarded to ``compile_through_fx``.

    Returns:
        The module returned by ``compile_through_fx``.
    """

    class VaeEncodeModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.vae = AutoencoderKL.from_pretrained(
                model_config[args.version],
                subfolder="vae",
                revision="fp16",
            )

        def forward(self, x):
            # Shift and scale the input with 2 * (x - 0.5) before encoding.
            input = 2 * (x - 0.5)
            return self.vae.encode(input, return_dict=False)[0]

    encoder = VaeEncodeModel().half().cuda()
    # NOTE(review): the sample inputs are taken from the "vae" entry of
    # `model_input` - confirm these are the right shape for the encoder.
    sample_inputs = tuple(
        tensor.half().cuda() for tensor in model_input[args.version]["vae"]
    )
    return compile_through_fx(
        encoder,
        sample_inputs,
        model_name=model_name,
        extra_args=extra_args,
    )
def get_unet_mlir(model_name="unet", extra_args=[]):
    """Wrap the UNet for ``args.version`` (guidance included) and compile it.

    For ``fp16`` precision the module is converted to half precision on
    CUDA; sample inputs are converted the same way except for 0-dim tensors,
    which are passed through unchanged.

    Args:
        model_name: Name forwarded to ``compile_through_fx``.
        extra_args: Extra compile flags forwarded to ``compile_through_fx``.

    Returns:
        The module returned by ``compile_through_fx``.
    """

    class UnetModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.unet = UNet2DConditionModel.from_pretrained(
                model_config[args.version],
                subfolder="unet",
                revision=model_revision,
            )
            self.in_channels = self.unet.in_channels
            self.train(False)

        def forward(self, latent, timestep, text_embedding, guidance_scale):
            # Duplicate the latent so that both halves of classifier-free
            # guidance come out of a single UNet call instead of two.
            doubled = torch.cat([latent, latent])
            prediction = self.unet.forward(
                doubled, timestep, text_embedding, return_dict=False
            )[0]
            uncond, conditioned = prediction.chunk(2)
            return uncond + guidance_scale * (conditioned - uncond)

    unet = UnetModel()
    if args.precision != "fp16":
        sample_inputs = model_input[args.version]["unet"]
    else:
        unet = unet.half().cuda()
        sample_inputs = tuple(
            tensor if len(tensor.shape) == 0 else tensor.half().cuda()
            for tensor in model_input[args.version]["unet"]
        )

    return compile_through_fx(
        unet,
        sample_inputs,
        model_name=model_name,
        extra_args=extra_args,
    )

View File

@@ -1,153 +0,0 @@
import sys
from model_wrappers import (
get_vae_mlir,
get_vae_encode_mlir,
get_unet_mlir,
get_clip_mlir,
)
from stable_args import args
from utils import get_shark_model
# Number of prompts given on the command line.
BATCH_SIZE = len(args.prompts)

# Stop at import time for anything other than a single prompt.
if not BATCH_SIZE == 1:
    sys.exit("Only batch size 1 is supported.")
def get_unet():
    """Return the UNet module selected by the command-line arguments.

    The model name and compile flags depend on ``args.precision``,
    ``args.version`` and ``args.use_tuned``. With ``--import_mlir`` the
    model is compiled from the torch module; otherwise it is fetched with
    ``get_shark_model``. ``int8`` exits the process, and any other
    unrecognised precision yields ``None``.
    """
    flags = []
    triple = args.iree_vulkan_target_triple
    if len(triple) > 0:
        flags.append(f"-iree-vulkan-target-triple={triple}")

    precision = args.precision
    if precision == "fp16":
        # A tuned model exists only for fp16; it is always downloaded and
        # ignores both --version and --import_mlir.
        if args.use_tuned:
            return get_shark_model(
                "gs://shark_tank/vivian", "unet_1dec_fp16_tuned", flags
            )
        model_name = (
            "unet2base_8dec_fp16"
            if args.version == "v2.1base"
            else "unet_8dec_fp16"
        )
        flags.extend(
            [
                "--iree-flow-enable-padding-linalg-ops",
                "--iree-flow-linalg-ops-padding-size=32",
                "--iree-flow-enable-conv-img2col-transform",
            ]
        )
    elif precision == "fp32":
        # No tuned model is available for fp32.
        model_name = "unet_1dec_fp32"
        flags.extend(
            [
                "--iree-flow-enable-conv-nchw-to-nhwc-transform",
                "--iree-flow-enable-padding-linalg-ops",
                "--iree-flow-linalg-ops-padding-size=16",
            ]
        )
    elif precision == "int8":
        # The int8 model ("unet_int8" in gs://shark_tank/prashant_nod,
        # compiled with the padding flags at size 32) is disabled for now.
        # TODO: Pass iree_flags to the exported model and re-enable the
        # download path; --import_mlir is not supported for int8.
        sys.exit("int8 model is currently in maintenance.")
    else:
        return None

    if args.import_mlir:
        return get_unet_mlir(model_name, flags)
    return get_shark_model("gs://shark_tank/stable_diffusion", model_name, flags)
def get_vae():
    """Return the VAE decoder module selected by the command-line arguments.

    ``fp16`` and ``int8`` share the fp16 model; ``fp32`` uses its own model
    and flags. With ``--import_mlir`` the model is compiled from the torch
    module; otherwise it is fetched with ``get_shark_model``. Any other
    precision yields ``None``.
    """
    flags = []
    triple = args.iree_vulkan_target_triple
    if len(triple) > 0:
        flags.append(f"-iree-vulkan-target-triple={triple}")

    precision = args.precision
    if precision in ("fp16", "int8"):
        model_name = (
            "vae2base_8dec_fp16"
            if args.version == "v2.1base"
            else "vae_8dec_fp16"
        )
        flags.extend(
            [
                "--iree-flow-enable-padding-linalg-ops",
                "--iree-flow-linalg-ops-padding-size=32",
                "--iree-flow-enable-conv-img2col-transform",
            ]
        )
    elif precision == "fp32":
        model_name = "vae_1dec_fp32"
        flags.extend(
            [
                "--iree-flow-enable-conv-nchw-to-nhwc-transform",
                "--iree-flow-enable-padding-linalg-ops",
                "--iree-flow-linalg-ops-padding-size=16",
            ]
        )
    else:
        return None

    if args.import_mlir:
        return get_vae_mlir(model_name, flags)
    return get_shark_model("gs://shark_tank/stable_diffusion", model_name, flags)
def get_vae_encode():
    """Return the VAE encoder module selected by the command-line arguments.

    ``fp16`` and ``int8`` share the fp16 encoder; ``fp32`` uses its own
    model name and flags. With ``--import_mlir`` the model is compiled from
    the torch module via ``get_vae_encode_mlir``; otherwise it is fetched
    with ``get_shark_model``. Any other precision yields ``None``.
    """
    iree_flags = []
    if len(args.iree_vulkan_target_triple) > 0:
        iree_flags.append(
            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
        )

    if args.precision in ["fp16", "int8"]:
        bucket = "gs://shark_tank/stable_diffusion"
        model_name = "vae_encode_1dec_fp16"
        if args.version == "v2":
            model_name = "vae2_encode_29nov_fp16"
        iree_flags += [
            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
            "--iree-flow-enable-padding-linalg-ops",
            "--iree-flow-linalg-ops-padding-size=32",
        ]
        if args.import_mlir:
            return get_vae_encode_mlir(model_name, iree_flags)
        return get_shark_model(bucket, model_name, iree_flags)

    if args.precision == "fp32":
        bucket = "gs://shark_tank/stable_diffusion"
        model_name = "vae_encode_1dec_fp32"
        iree_flags += [
            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
            "--iree-flow-enable-padding-linalg-ops",
            "--iree-flow-linalg-ops-padding-size=16",
        ]
        if args.import_mlir:
            # Use the encoder wrapper here. This branch previously called
            # get_vae_mlir, which compiles the *decoder* under the encoder's
            # model name.
            # NOTE(review): get_vae_encode_mlir currently hard-codes the
            # fp16 revision - confirm whether an fp32 import path is needed.
            return get_vae_encode_mlir(model_name, iree_flags)
        return get_shark_model(bucket, model_name, iree_flags)

    return None
def get_clip():
    """Return the CLIP text-encoder module selected by the arguments.

    The fp32 CLIP model is used for every precision; only ``args.version``
    changes the model name. With ``--import_mlir`` the model is compiled
    from the torch module; otherwise it is fetched with ``get_shark_model``.
    """
    flags = []
    triple = args.iree_vulkan_target_triple
    if len(triple) > 0:
        flags.append(f"-iree-vulkan-target-triple={triple}")

    model_name = (
        "clip2base_8dec_fp32"
        if args.version == "v2.1base"
        else "clip_8dec_fp32"
    )
    flags.extend(
        [
            "--iree-flow-linalg-ops-padding-size=16",
            "--iree-flow-enable-padding-linalg-ops",
        ]
    )

    if args.import_mlir:
        return get_clip_mlir(model_name, flags)
    return get_shark_model("gs://shark_tank/stable_diffusion", model_name, flags)

View File

@@ -1,44 +0,0 @@
Compile / Run Instructions:
To compile .vmfb for SD (vae, unet, CLIP), run the following commands with the .mlir in your local shark_tank cache (default location for Linux users is `~/.local/shark_tank`). These will be available once the script from [this README](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md) is run once.
Running the script mentioned above with the `--save_vmfb` flag will also save the .vmfb in your SHARK base directory if you want to skip straight to benchmarks.
Compile Commands FP32/FP16:
```shell
Vulkan AMD:
iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
# add --mlir-print-debuginfo --mlir-print-op-on-diagnostic=true for debug
# use iree-input-type=mhlo for tf models
CUDA NVIDIA:
iree-compile --iree-input-type=none --iree-hal-target-backends=cuda --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
CPU:
iree-compile --iree-input-type=none --iree-hal-target-backends=llvm-cpu --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
```
Run / Benchmark Command (FP32 - NCHW):
(Note: a batch size of 2 is needed because classifier-free guidance requires two forward passes through the UNet.)
```shell
## Vulkan AMD:
iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --device=vulkan --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
## CUDA:
iree-benchmark-module --module_file=/path/to/vmfb --entry_function=forward --device=cuda --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
## CPU:
iree-benchmark-module --module_file=/path/to/vmfb --entry_function=forward --device=local-task --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
```
Run via vulkan_gui for RGP Profiling:
To build the vulkan app for profiling UNet follow the instructions [here](https://github.com/nod-ai/SHARK/tree/main/cpp) and then run the following command from the cpp directory with your compiled stable_diff.vmfb
```shell
./build/vulkan_gui/iree-vulkan-gui --module_file=/path/to/unet.vmfb --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
```

View File

@@ -1,128 +0,0 @@
import argparse
# Command-line options for the Stable Diffusion example. The options are
# parsed at import time and exposed to the other modules as `args`.
p = argparse.ArgumentParser(
    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

# --- Generation options -----------------------------------------------------

p.add_argument(
    "--prompts",
    nargs="+",
    default=["a photograph of an astronaut riding a horse"],
    help="text of which images to be generated.",
)

p.add_argument(
    "--device", type=str, default="cpu", help="device to run the model."
)

p.add_argument(
    "--steps",
    type=int,
    default=50,
    help="the no. of steps to do the sampling.",
)

p.add_argument(
    "--version",
    type=str,
    default="v2.1base",
    help="Specify version of stable diffusion model",
)

p.add_argument(
    "--seed",
    type=int,
    default=42,
    help="the seed to use.",
)

p.add_argument(
    "--guidance_scale",
    type=float,
    default=7.5,
    help="the value to be used for guidance scaling.",
)

# --- Model source and precision ---------------------------------------------

p.add_argument(
    "--import_mlir",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="imports the model from torch module to shark_module otherwise downloads the model from shark_tank.",
)

p.add_argument(
    "--precision", type=str, default="fp16", help="precision to run the model."
)

p.add_argument(
    "--max_length",
    type=int,
    default=77,
    help="max length of the tokenizer output.",
)

# --- Compiled-module (.vmfb) caching ----------------------------------------

p.add_argument(
    "--load_vmfb",
    default=True,
    action=argparse.BooleanOptionalAction,
    help="attempts to load the model from a precompiled flatbuffer and compiles + saves it if not found.",
)

p.add_argument(
    "--save_vmfb",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="saves the compiled flatbuffer to the local directory",
)

# --- Vulkan, tuning and debugging -------------------------------------------

# Spelled with dashes on the command line; argparse still exposes it as
# `args.iree_vulkan_target_triple`.
p.add_argument(
    "--iree-vulkan-target-triple",
    type=str,
    default="",
    help="Specify target triple for vulkan",
)

p.add_argument(
    "--vulkan_debug_utils",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="Profiles vulkan device and collects the .rdc info",
)

p.add_argument(
    "--use_tuned",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="Download and use the tuned version of the model if available",
)

p.add_argument(
    "--dump_isa",
    default=False,
    action="store_true",
    help="When enabled call amdllpc to get ISA dumps. use with dispatch benchmarks.",
)

p.add_argument(
    "--dispatch_benchmarks",
    default=None,
    help='dispatches to return benchmark data on. use "All" for all, and None for none.',
)

p.add_argument(
    "--dispatch_benchmarks_dir",
    default="temp_dispatch_benchmarks",
    help='directory where you want to store dispatch data generated with "--dispatch_benchmarks"',
)

# The default is kept as a string; it is interpolated verbatim into a
# runtime flag by the caller.
p.add_argument(
    "--vulkan_large_heap_block_size",
    default="4294967296",
    help="flag for setting VMA preferredLargeHeapBlockSize for vulkan device, default is 4G",
)

p.add_argument(
    "--enable_rgp",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="flag for inserting debug frames between iterations for use with rgp.",
)

args = p.parse_args()

View File

@@ -1,111 +0,0 @@
# Stable Diffusion optimized for AMD RDNA2/RDNA3 GPUs
## Install the latest AMD Drivers
### RDNA2 Drivers:
*AMD Software: Adrenalin Edition 22.11.1 for MLIR/IREE Driver Version 22.20.29.09 for Windows® 10 and Windows® 11 (Windows Driver Store Version 31.0.12029.9003)*
https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mlir-iree
Note that if you previously tried Stable Diffusion with a different driver, it may be necessary to clear vulkan cache after changing drivers.
For Windows users this can be done by clearing the contents of `C:\Users\<username>\AppData\Local\AMD\VkCache\`. On Linux the same cache is typically located at `~/.cache/AMD/VkCache/`.
## Installation
Download the latest Windows SHARK SD binary [here](https://github.com/nod-ai/SHARK/releases/download/20221213.383/shark_sd_20221213_383.exe). Accept if Windows warns of an unsigned .exe.
#### Access Stable Diffusion on http://localhost:8080/?__theme=dark
<img width="1607" alt="webui" src="https://user-images.githubusercontent.com/74956/204939260-b8308bc2-8dc4-47f6-9ac0-f60b66edab99.png">
Here are some samples generated:
![tajmahal, snow, sunflowers, oil on canvas_0](https://user-images.githubusercontent.com/74956/204934186-141f7e43-6eb2-4e89-a99c-4704d20444b3.jpg)
![a photo of a crab playing a trumpet](https://user-images.githubusercontent.com/74956/204933258-252e7240-8548-45f7-8253-97647d38313d.jpg)
<details>
<summary>Advanced Installation </summary>
## Setup your Python VirtualEnvironment and Dependencies
### Windows 10/11 Users
* Install the latest Python 3.10.x version from [here](https://www.python.org/downloads/windows/)
* Install Git for Windows from [here](https://git-scm.com/download/win)
#### Allow the install script to run in Powershell
```powershell
set-executionpolicy remotesigned
```
#### Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...)
```powershell
git clone https://github.com/nod-ai/SHARK.git
cd SHARK
./setup_venv.ps1 #You can re-run this script to get the latest version
```
### Linux
```shell
git clone https://github.com/nod-ai/SHARK.git
cd SHARK
./setup_venv.sh
source shark.venv/bin/activate
```
### Run Stable Diffusion on your device - WebUI
#### Windows 10/11 Users
```powershell
(shark.venv) PS C:\Users\nod\SHARK> cd web
(shark.venv) PS C:\Users\nod\SHARK\web> python index.py
```
#### Linux Users
```shell
(shark.venv) > cd web
(shark.venv) > python index.py
```
### Run Stable Diffusion on your device - Commandline
#### Windows 10/11 Users
```powershell
(shark.venv) PS C:\g\shark> python .\shark\examples\shark_inference\stable_diffusion\main.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
```
#### Linux
```shell
python3.10 shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
```
The output on a 6900XT would look like:
```shell
44it [00:08, 5.14it/s]i = 44 t = 120 (191ms)
45it [00:08, 5.15it/s]i = 45 t = 100 (191ms)
46it [00:08, 5.16it/s]i = 46 t = 80 (191ms)
47it [00:09, 5.16it/s]i = 47 t = 60 (193ms)
48it [00:09, 5.15it/s]i = 48 t = 40 (195ms)
49it [00:09, 5.12it/s]i = 49 t = 20 (196ms)
50it [00:09, 5.14it/s]
Average step time: 192.8154182434082ms/it
Total image generation runtime (s): 10.390909433364868
(shark.venv) PS C:\g\shark>
```
For more options to the Stable Diffusion model read [this](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md)
</details>
<details>
<summary>Discord link</summary>
Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware.
</details>

View File

@@ -1,83 +0,0 @@
import os
import torch
from shark.shark_inference import SharkInference
from stable_args import args
from shark.shark_importer import import_with_fx
from shark.iree_utils.vulkan_utils import set_iree_vulkan_runtime_flags
def _compile_module(shark_module, model_name, extra_args=[]):
    """Compile ``shark_module``, going through a .vmfb file when requested.

    When neither ``--load_vmfb`` nor ``--save_vmfb`` is set the module is
    compiled in memory. Otherwise a ``<model_name>_<device>.vmfb`` file in
    the current working directory is loaded if it exists (and saving was not
    requested), or else written and then loaded.

    Args:
        shark_module: Module object to compile or load into.
        model_name: Base name used for the .vmfb file.
        extra_args: Extra flags forwarded to the compile/save/load calls.

    Returns:
        The same ``shark_module`` that was passed in.
    """
    if not (args.load_vmfb or args.save_vmfb):
        shark_module.compile(extra_args)
        return shark_module

    # Turn a "driver://id" style device string into "driver-id" so that it
    # can be used as part of a file name.
    device_tag = args.device.replace("://", "-")
    extended_name = f"{model_name}_{device_tag}"
    vmfb_path = os.path.join(os.getcwd(), extended_name + ".vmfb")

    if args.load_vmfb and os.path.isfile(vmfb_path) and not args.save_vmfb:
        print(f"loading existing vmfb from: {vmfb_path}")
        shark_module.load_module(vmfb_path, extra_args=extra_args)
        return shark_module

    if args.save_vmfb:
        print("Saving to {}".format(vmfb_path))
    else:
        print("No vmfb found. Compiling and saving to {}".format(vmfb_path))
    saved_path = shark_module.save_module(
        os.getcwd(), extended_name, extra_args
    )
    shark_module.load_module(saved_path, extra_args=extra_args)
    return shark_module
# Downloads the model from shark_tank and returns the shark_module.
def get_shark_model(tank_url, model_name, extra_args=[]):
    """Download ``model_name`` from ``tank_url`` and compile it.

    Args:
        tank_url: Location passed to ``download_model`` as ``tank_url``.
        model_name: Name of the model to download; also used when naming
            the compiled module.
        extra_args: Extra flags forwarded to ``_compile_module``.

    Returns:
        The compiled module returned by ``_compile_module``.
    """
    from shark.shark_downloader import download_model

    # Only the MLIR module and its entry-point name are needed here; the
    # remaining two values returned by the download are ignored.
    downloaded = download_model(
        model_name,
        tank_url=tank_url,
        frontend="torch",
    )
    mlir_model, func_name, _, _ = downloaded

    module = SharkInference(
        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
    )
    return _compile_module(module, model_name, extra_args)
# Converts the torch-module into a shark_module.
def compile_through_fx(model, inputs, model_name, extra_args=[]):
    """Import a torch module with ``import_with_fx`` and compile it.

    Args:
        model: Torch module to import.
        inputs: Sample inputs handed to ``import_with_fx``.
        model_name: Name used by ``_compile_module`` for the compiled module.
        extra_args: Extra flags forwarded to ``_compile_module``.

    Returns:
        The compiled module returned by ``_compile_module``.
    """
    imported = import_with_fx(model, inputs)
    mlir_module, func_name = imported

    module = SharkInference(
        mlir_module, func_name, device=args.device, mlir_dialect="linalg"
    )
    return _compile_module(module, model_name, extra_args)
def set_iree_runtime_flags():
    """Apply the Vulkan runtime flags derived from the command-line options.

    The flag list is always built, but it is only applied when
    ``args.device`` contains ``"vulkan"``. Returns ``None``.
    """
    heap_flag = (
        f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}"
    )
    flags = [heap_flag]

    # RGP captures also turn on the Vulkan debug utils.
    if args.enable_rgp:
        flags.extend(["--enable_rgp=true", "--vulkan_debug_utils=true"])

    if "vulkan" not in args.device:
        return
    set_iree_vulkan_runtime_flags(flags=flags)

View File

@@ -0,0 +1,47 @@
from PIL import Image
import requests
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Model
import torch
from shark.shark_inference import SharkInference
from shark.shark_importer import SharkImporter
from iree.compiler import tf as tfc
from iree.compiler import compile_str
from iree import runtime as ireert
import os
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
class T5Module(torch.nn.Module):
    """Torch wrapper whose forward pass runs ``generate`` on t5-small."""

    def __init__(self):
        super().__init__()
        pretrained = T5ForConditionalGeneration.from_pretrained("t5-small")
        # Switch the loaded model to evaluation mode before storing it.
        pretrained.eval()
        self.model = pretrained

    def forward(self, input_ids):
        """Return the output of ``generate`` for ``input_ids``."""
        generated = self.model.generate(input_ids)
        return generated
if __name__ == "__main__":
    # Tokenize one translation prompt, import the torch T5 wrapper, compile
    # it and print the decoded output.

    # Prepping Data
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    text = "I love the distilled version of models."
    task_prefix = "translate English to German: "
    encoded_input = tokenizer(
        task_prefix + text,
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        return_tensors="pt",
    ).input_ids
    # The trailing comma matters: `(encoded_input)` is just the tensor,
    # whereas the importer and `forward` are given a tuple of inputs.
    inputs = (encoded_input,)

    mlir_importer = SharkImporter(
        T5Module(),
        inputs,
        frontend="torch",
    )
    # The leftover pdb.set_trace() breakpoints were removed so the script
    # can run unattended.
    t5_mlir, func_name = mlir_importer.import_mlir(
        is_dynamic=True, tracing_required=True
    )
    shark_module = SharkInference(t5_mlir, func_name, mlir_dialect="linalg")
    shark_module.compile()
    output = shark_module.forward(inputs)
    print(tokenizer.batch_decode(output, skip_special_tokens=True))

View File

@@ -18,7 +18,7 @@ class T5Module(tf.Module):
self.m = TFT5Model.from_pretrained("t5-small")
self.m.predict = lambda x, y: self.m(input_ids=x, decoder_input_ids=y)
@tf.function(input_signature=t5_inputs, jit_compile=True)
@tf.function(input_signature=t5_inputs)
def forward(self, input_ids, decoder_input_ids):
return self.m.predict(input_ids, decoder_input_ids)

View File

@@ -0,0 +1,51 @@
from PIL import Image
import requests
from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration
import tensorflow as tf
from shark.shark_inference import SharkInference
from shark.shark_importer import SharkImporter
from iree.compiler import tf as tfc
from iree.compiler import compile_str
from iree import runtime as ireert
import os
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of inputs
t5_inputs = [
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32)
]
class T5Module(tf.Module):
    """TensorFlow wrapper whose forward pass runs ``generate`` on t5-small."""

    def __init__(self):
        super().__init__()
        self.m = TFT5ForConditionalGeneration.from_pretrained("t5-small")

        def _generate(x):
            return self.m.generate(input_ids=x)

        # Rebind `predict` on the loaded model so that it runs generation.
        self.m.predict = _generate

    @tf.function(input_signature=t5_inputs)
    def forward(self, input_ids):
        """Return the output of ``generate`` for ``input_ids``."""
        return self.m.predict(input_ids)
if __name__ == "__main__":
    # Tokenize one translation prompt, import the TensorFlow T5 wrapper,
    # compile it and print the decoded output.

    # Prepping Data
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    text = "I love the distilled version of models."
    task_prefix = "translate English to German: "
    encoded_input = tokenizer(
        task_prefix + text,
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        return_tensors="tf",
    ).input_ids
    # The trailing comma matters: `(encoded_input)` is just the tensor,
    # whereas the importer and `forward` are given a tuple of inputs.
    inputs = (encoded_input,)

    mlir_importer = SharkImporter(
        T5Module(),
        inputs,
        frontend="tf",
    )
    t5_mlir, func_name = mlir_importer.import_mlir(
        is_dynamic=False, tracing_required=False
    )
    shark_module = SharkInference(t5_mlir, func_name, mlir_dialect="mhlo")
    shark_module.compile()
    # The leftover pdb.set_trace() breakpoint was removed so the script can
    # run unattended.
    output = shark_module.forward(inputs)
    print(tokenizer.batch_decode(output, skip_special_tokens=True))

View File

@@ -1,9 +1,8 @@
import torch
import numpy as np
from shark.shark_inference import SharkInference
from shark.shark_importer import SharkImporter
from shark_runner import SharkInference
# Currently not supported aten.transpose_conv2d missing.
class UnetModule(torch.nn.Module):
def __init__(self):
super().__init__()
@@ -15,7 +14,7 @@ class UnetModule(torch.nn.Module):
init_features=32,
pretrained=True,
)
self.model.eval()
self.train(False)
def forward(self, input):
return self.model(input)
@@ -23,17 +22,10 @@ class UnetModule(torch.nn.Module):
input = torch.randn(1, 3, 224, 224)
mlir_importer = SharkImporter(
print(input)
shark_module = SharkInference(
UnetModule(),
(input,),
frontend="torch",
)
(vision_mlir, func_name), inputs, golden_out = mlir_importer.import_debug(
tracing_required=False
)
shark_module = SharkInference(vision_mlir, func_name, mlir_dialect="linalg")
shark_module.compile()
result = shark_module.forward((input,))
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
shark_module.benchmark_forward((input,))
print(input)

View File

@@ -1,13 +1,11 @@
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_model
from shark.shark_downloader import download_torch_model
mlir_model, func_name, inputs, golden_out = download_model(
"v_diffusion", frontend="torch"
)
mlir_model, func_name, inputs, golden_out = download_torch_model("v_diffusion")
shark_module = SharkInference(
mlir_model, func_name, device="vulkan", mlir_dialect="linalg"
mlir_model, func_name, mlir_dialect="linalg"
)
shark_module.compile()
result = shark_module.forward(inputs)

View File

@@ -52,8 +52,7 @@ class BertModule(tf.Module):
input_signature=[
bert_input, # inputs
tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32), # labels
],
jit_compile=True,
]
)
def forward(self, inputs, labels):
with tf.GradientTape() as tape:

View File

@@ -1,41 +0,0 @@
# Stable Diffusion Img2Img model
## Installation
<details>
<summary>Installation (Linux)</summary>
### Activate shark.venv Virtual Environment
```shell
source shark.venv/bin/activate
# Some older pip installs may not be able to handle the recent PyTorch deps
python -m pip install --upgrade pip
```
### Install dependencies
Run the `setup.sh` script:
```shell
./setup.sh
```
### Run the Stable diffusion Img2Img model
To run the model with the default set of images and params, run:
```shell
python stable_diffusion_img2img.py
```
To run the model with your set of images, and parameters you need to specify the following params:
1.) Input images directory with the arg `--input_dir` containing 3-5 images.
2.) What to teach the model? Using the arg `--what_to_teach`, allowed values are `object` or `style`.
3.) Placeholder token using the arg `--placeholder_token`, that represents your new concept. It should be passed with the opening and closing angle brackets. For ex: token is `cat-toy`, it should be passed as `<cat-toy>`.
4.) Initializer token using the arg `--initializer_token`, which summarises what your new concept is.
For the result, you need to pass the text prompt with the arg: `--prompt`. The prompt string should contain a "*s" in it, which will be replaced by the placeholder token during the inference.
By default the result images will go into the `sd_result` dir. To specify your output dir use the arg: `--output_dir`.
The default value of max_training_steps is `3000`, which takes some hours to complete. You can pass a smaller value with the arg `--training_steps`. Specify the number of images to be sampled for the result with the `--num_inference_samples` arg.

View File

@@ -1,25 +0,0 @@
#!/bin/bash
# Prepares the Stable Diffusion Img2Img example: downloads the sample training
# images into ./input_images (relative to the current directory) and installs
# the Python dependencies with pip.

TD="$(cd "$(dirname "$0")" && pwd)"
if [ -z "$PYTHON" ]; then
  PYTHON="$(which python3)"
fi

# Report the failing command on stderr and abort the script.
function die() {
  echo "Error executing command: $*" >&2
  exit 1
}

PYTHON_VERSION_X_Y=`${PYTHON} -c 'import sys; version=sys.version_info[:2]; print("{0}.{1}".format(*version))'`
echo "Python: $PYTHON"
echo "Python version: $PYTHON_VERSION_X_Y"

# -p keeps a second run from failing because the directory already exists.
mkdir -p input_images || die "mkdir -p input_images"

IMAGE_BASE_URL="https://huggingface.co/datasets/valhalla/images/resolve/main"
for image_id in 2 3 5 6; do
  # Skip files that are already present so a re-run does not leave numbered
  # duplicates (e.g. 2.jpeg.1) that would be picked up as extra training images.
  if [ ! -f "input_images/${image_id}.jpeg" ]; then
    wget "${IMAGE_BASE_URL}/${image_id}.jpeg" -P input_images/ \
      || die "wget ${IMAGE_BASE_URL}/${image_id}.jpeg"
  fi
done

# The extras spec is quoted so the shell never treats [training] as a glob.
pip install 'diffusers[training]==0.4.1' transformers ftfy opencv-python \
  || die "pip install"

View File

@@ -1,597 +0,0 @@
# Textual-inversion fine-tuning for Stable Diffusion using diffusers
# This script shows how to "teach" Stable Diffusion a new concept via
# textual-inversion using 🤗 Hugging Face [🧨 Diffusers library](https://github.com/huggingface/diffusers).
# By using just 3-5 images you can teach new concepts to Stable Diffusion
# and personalize the model on your own images.
import argparse
import itertools
import math
import os
import random
import cv2
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch.utils.data import Dataset
import PIL
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from diffusers import (
AutoencoderKL,
DDPMScheduler,
PNDMScheduler,
StableDiffusionPipeline,
UNet2DConditionModel,
)
from diffusers.hub_utils import init_git_repo, push_to_hub
from diffusers.optimization import get_scheduler
from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
from PIL import Image
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
YOUR_TOKEN = "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"
p = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
p.add_argument(
"--input_dir",
type=str,
default="input_images/",
help="the directory contains the images used for fine tuning",
)
p.add_argument(
"--output_dir",
type=str,
default="sd_result",
help="the directory contains the images used for fine tuning",
)
p.add_argument(
"--training_steps",
type=int,
default=3000,
help="the maximum number of training steps",
)
p.add_argument("--seed", type=int, default=42, help="the random seed")
p.add_argument(
"--what_to_teach",
type=str,
choices=["object", "style"],
default="object",
help="what is it that you are teaching?",
)
p.add_argument(
"--placeholder_token",
type=str,
default="<cat-toy>",
help="It is the token you are going to use to represent your new concept",
)
p.add_argument(
"--initializer_token",
type=str,
default="toy",
help="It is a word that can summarise what is your new concept",
)
p.add_argument(
"--inference_steps",
type=int,
default=50,
help="the number of steps for inference",
)
p.add_argument(
"--num_inference_samples",
type=int,
default=4,
help="the number of samples for inference",
)
p.add_argument(
"--prompt",
type=str,
default="a grafitti in a wall with a *s on it",
help="the text prompt to use",
)
args = p.parse_args()
if "*s" not in args.prompt:
raise ValueError(
f'The prompt should have a "*s" which will be replaced by a placeholder token.'
)
prompt1, prompt2 = args.prompt.split("*s")
args.prompt = prompt1 + args.placeholder_token + prompt2
pretrained_model_name_or_path = "CompVis/stable-diffusion-v1-4"
# Read every file in the input directory with OpenCV and keep the ones it could
# decode (cv2.imread returns None for unreadable files).
images = [
    decoded
    for decoded in (
        cv2.imread(os.path.join(args.input_dir, entry))
        for entry in os.listdir(args.input_dir)
    )
    if decoded is not None
]

# Prompt templates used while training; "{}" is filled with the placeholder
# token. This first list is used when teaching an object.
imagenet_templates_small = [
    "a photo of a {}",
    "a rendering of a {}",
    "a cropped photo of the {}",
    "the photo of a {}",
    "a photo of a clean {}",
    "a photo of a dirty {}",
    "a dark photo of the {}",
    "a photo of my {}",
    "a photo of the cool {}",
    "a close-up photo of a {}",
    "a bright photo of the {}",
    "a cropped photo of a {}",
    "a photo of the {}",
    "a good photo of the {}",
    "a photo of one {}",
    "a close-up photo of the {}",
    "a rendition of the {}",
    "a photo of the clean {}",
    "a rendition of a {}",
    "a photo of a nice {}",
    "a good photo of a {}",
    "a photo of the nice {}",
    "a photo of the small {}",
    "a photo of the weird {}",
    "a photo of the large {}",
    "a photo of a cool {}",
    "a photo of a small {}",
]

# Templates used when teaching a style. Two entries appear twice, so a uniform
# random choice picks them twice as often as the others.
imagenet_style_templates_small = [
    "a painting in the style of {}",
    "a rendering in the style of {}",
    "a cropped painting in the style of {}",
    "the painting in the style of {}",
    "a clean painting in the style of {}",
    "a dirty painting in the style of {}",
    "a dark painting in the style of {}",
    "a picture in the style of {}",
    "a cool painting in the style of {}",
    "a close-up painting in the style of {}",
    "a bright painting in the style of {}",
    "a cropped painting in the style of {}",
    "a good painting in the style of {}",
    "a close-up painting in the style of {}",
    "a rendition in the style of {}",
    "a nice painting in the style of {}",
    "a small painting in the style of {}",
    "a weird painting in the style of {}",
    "a large painting in the style of {}",
]
# Setup the dataset
class TextualInversionDataset(Dataset):
    """Pairs each training image with a randomly templated prompt.

    Every entry of ``data_root`` is treated as an image path. An item is a dict
    with:

    * ``input_ids``: the tokenized prompt, padded/truncated to the tokenizer's
      ``model_max_length``;
    * ``pixel_values``: the image converted to RGB, optionally center-cropped,
      resized to ``size`` x ``size``, randomly flipped horizontally, scaled to
      [-1, 1] and laid out channels-first.

    Args:
        data_root: directory whose entries are the training images.
        tokenizer: tokenizer used to encode the prompt.
        learnable_property: ``"style"`` selects the style templates, anything
            else the object templates.
        size: edge length of the square output image.
        repeats: how many times the image list is repeated when ``set`` is
            ``"train"``.
        interpolation: one of ``"linear"``, ``"bilinear"``, ``"bicubic"``,
            ``"lanczos"``.
        flip_p: probability of the random horizontal flip.
        set: ``"train"`` multiplies the dataset length by ``repeats``.
        placeholder_token: text substituted into the prompt template.
        center_crop: crop the largest centered square before resizing.

    Raises:
        KeyError: if ``interpolation`` is not one of the supported names.
    """

    def __init__(
        self,
        data_root,
        tokenizer,
        learnable_property="object",  # [object, style]
        size=512,
        repeats=100,
        interpolation="bicubic",
        flip_p=0.5,
        set="train",
        placeholder_token="*",
        center_crop=False,
    ):
        self.data_root = data_root
        self.tokenizer = tokenizer
        self.learnable_property = learnable_property
        self.size = size
        self.placeholder_token = placeholder_token
        self.center_crop = center_crop
        self.flip_p = flip_p

        self.image_paths = [
            os.path.join(self.data_root, file_path)
            for file_path in os.listdir(self.data_root)
        ]

        self.num_images = len(self.image_paths)
        self._length = self.num_images

        if set == "train":
            self._length = self.num_images * repeats

        # "linear" maps to BILINEAR: the old `PIL.Image.LINEAR` name was only
        # an alias for it and is no longer available in recent Pillow releases.
        # Because this dict is built eagerly, referencing the alias would break
        # construction for every interpolation choice, not just "linear".
        self.interpolation = {
            "linear": PIL.Image.BILINEAR,
            "bilinear": PIL.Image.BILINEAR,
            "bicubic": PIL.Image.BICUBIC,
            "lanczos": PIL.Image.LANCZOS,
        }[interpolation]

        self.templates = (
            imagenet_style_templates_small
            if learnable_property == "style"
            else imagenet_templates_small
        )
        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)

    def __len__(self):
        return self._length

    def __getitem__(self, i):
        example = {}

        # Indices wrap around the image list so `repeats` can extend the epoch.
        # The pixel data is copied into a NumPy array inside the `with` block so
        # the underlying file handle is released right away.
        with Image.open(self.image_paths[i % self.num_images]) as source:
            rgb = source if source.mode == "RGB" else source.convert("RGB")
            img = np.array(rgb).astype(np.uint8)

        placeholder_string = self.placeholder_token
        text = random.choice(self.templates).format(placeholder_string)

        example["input_ids"] = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.tokenizer.model_max_length,
            return_tensors="pt",
        ).input_ids[0]

        # default to score-sde preprocessing
        if self.center_crop:
            crop = min(img.shape[0], img.shape[1])
            h, w = img.shape[0], img.shape[1]
            img = img[
                (h - crop) // 2 : (h + crop) // 2,
                (w - crop) // 2 : (w + crop) // 2,
            ]

        image = Image.fromarray(img)
        image = image.resize(
            (self.size, self.size), resample=self.interpolation
        )

        image = self.flip_transform(image)
        image = np.array(image).astype(np.uint8)
        # Map uint8 [0, 255] to float32 [-1, 1].
        image = (image / 127.5 - 1.0).astype(np.float32)

        # (H, W, C) -> (C, H, W)
        example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
        return example
# Setting up the model
# Load the tokenizer and add the placeholder token as an additional special token.
# Please read and, if you agree, accept the LICENSE
# [here](https://huggingface.co/CompVis/stable-diffusion-v1-4) if you see an error
tokenizer = CLIPTokenizer.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="tokenizer",
    use_auth_token=YOUR_TOKEN,
)

# Add the placeholder token to the tokenizer
num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
if num_added_tokens == 0:
    raise ValueError(
        f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
        " `placeholder_token` that is not already in the tokenizer."
    )

# Get token ids for our placeholder and initializer token.
# Convert the initializer_token and placeholder_token to ids.
token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
# The initializer must encode to exactly one token. Checking `!= 1` (rather
# than `> 1`) also rejects an initializer that encodes to nothing, which would
# otherwise fail below with an unhelpful IndexError.
if len(token_ids) != 1:
    raise ValueError("The initializer token must be a single token.")

initializer_token_id = token_ids[0]
placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)

# Load the Stable Diffusion model
# Load models and create wrapper for stable diffusion
text_encoder = CLIPTextModel.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="text_encoder",
    use_auth_token=YOUR_TOKEN,
)
vae = AutoencoderKL.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="vae",
    use_auth_token=YOUR_TOKEN,
)
unet = UNet2DConditionModel.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="unet",
    use_auth_token=YOUR_TOKEN,
)

# We have added the `placeholder_token` to the `tokenizer`, so we resize the token embeddings here;
# this adds a new embedding vector to the token embeddings for our `placeholder_token`.
text_encoder.resize_token_embeddings(len(tokenizer))

# Initialise the newly added placeholder token with the embeddings of the initializer token
token_embeds = text_encoder.get_input_embeddings().weight.data
token_embeds[placeholder_token_id] = token_embeds[initializer_token_id]

# In Textual-Inversion we only train the newly added embedding vector,
# so let's freeze the rest of the model parameters here.
def freeze_params(params):
    """Turn off gradient tracking for each parameter in ``params``.

    ``params`` may be any iterable (including a one-shot iterator); each item
    gets its ``requires_grad`` attribute set to ``False`` in place.
    """
    for frozen in params:
        setattr(frozen, "requires_grad", False)
# The VAE and the UNet are not trained at all.
for frozen_model in (vae, unet):
    freeze_params(frozen_model.parameters())

# In the text encoder everything is frozen except the token embeddings.
text_model = text_encoder.text_model
params_to_freeze = itertools.chain(
    text_model.encoder.parameters(),
    text_model.final_layer_norm.parameters(),
    text_model.embeddings.position_embedding.parameters(),
)
freeze_params(params_to_freeze)

# Training data: the images in the input directory, paired with prompts built
# from the object or style templates depending on --what_to_teach.
train_dataset = TextualInversionDataset(
    data_root=args.input_dir,
    tokenizer=tokenizer,
    learnable_property=args.what_to_teach,
    placeholder_token=args.placeholder_token,
    size=512,
    repeats=100,
    center_crop=False,
    set="train",
)
def create_dataloader(train_batch_size=1):
    """Build a shuffling DataLoader over the module-level ``train_dataset``.

    Args:
        train_batch_size: number of examples per batch.

    Returns:
        A ``torch.utils.data.DataLoader`` that reshuffles on every iteration.
    """
    loader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=train_batch_size,
    )
    return loader
# Scheduler that adds noise to the latents during training.
noise_scheduler = DDPMScheduler(
    num_train_timesteps=1000,
    beta_schedule="scaled_linear",
    beta_start=0.00085,
    beta_end=0.012,
    tensor_format="pt",
)

# Training hyperparameters, read by `training_function`. "output_dir" is where
# the trained pipeline is saved and later reloaded from for inference.
hyperparameters = dict(
    learning_rate=5e-04,
    scale_lr=True,
    max_train_steps=args.training_steps,
    train_batch_size=1,
    gradient_accumulation_steps=4,
    seed=args.seed,
    output_dir="sd-concept-output",
)
def training_function(text_encoder, vae, unet):
    """Train the placeholder-token embedding and save the resulting pipeline.

    Only the text encoder's input embeddings are handed to the optimizer, and
    after every backward pass the gradients of all embedding rows except the
    placeholder token's are zeroed, so only that one row is trained by the
    gradient. The VAE and UNet are used in eval mode.

    Reads the module-level ``hyperparameters``, ``tokenizer``,
    ``noise_scheduler``, ``train_dataset``, ``placeholder_token_id`` and
    ``args``.

    On the main process, writes a ``StableDiffusionPipeline`` and a
    ``learned_embeds.bin`` file (placeholder token -> learned embedding) into
    ``hyperparameters["output_dir"]``.

    NOTE(review): ``hyperparameters["seed"]`` is never read in this function --
    confirm whether seeding was intended.

    Args:
        text_encoder: text encoder whose input embeddings are optimized.
        vae: autoencoder used to encode images to latents.
        unet: denoising model used to predict the added noise.
    """
    logger = get_logger(__name__)
    train_batch_size = hyperparameters["train_batch_size"]
    gradient_accumulation_steps = hyperparameters[
        "gradient_accumulation_steps"
    ]
    learning_rate = hyperparameters["learning_rate"]
    max_train_steps = hyperparameters["max_train_steps"]
    output_dir = hyperparameters["output_dir"]
    accelerator = Accelerator(
        gradient_accumulation_steps=gradient_accumulation_steps,
    )
    train_dataloader = create_dataloader(train_batch_size)
    # Scale the base learning rate by the effective batch size.
    if hyperparameters["scale_lr"]:
        learning_rate = (
            learning_rate
            * gradient_accumulation_steps
            * train_batch_size
            * accelerator.num_processes
        )
    # Initialize the optimizer
    optimizer = torch.optim.AdamW(
        text_encoder.get_input_embeddings().parameters(),  # only optimize the embeddings
        lr=learning_rate,
    )
    text_encoder, optimizer, train_dataloader = accelerator.prepare(
        text_encoder, optimizer, train_dataloader
    )
    # Move vae and unet to device
    vae.to(accelerator.device)
    unet.to(accelerator.device)
    # Keep vae and unet in eval mode as we don't train these
    vae.eval()
    unet.eval()
    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / gradient_accumulation_steps
    )
    num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
    # Train!
    total_batch_size = (
        train_batch_size
        * accelerator.num_processes
        * gradient_accumulation_steps
    )
    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len(train_dataset)}")
    logger.info(f" Instantaneous batch size per device = {train_batch_size}")
    logger.info(
        f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f" Gradient Accumulation steps = {gradient_accumulation_steps}"
    )
    logger.info(f" Total optimization steps = {max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(
        range(max_train_steps), disable=not accelerator.is_local_main_process
    )
    progress_bar.set_description("Steps")
    # Counts optimizer updates (not batches); compared against max_train_steps.
    global_step = 0
    for epoch in range(num_train_epochs):
        text_encoder.train()
        for step, batch in enumerate(train_dataloader):
            with accelerator.accumulate(text_encoder):
                # Convert images to latent space
                latents = (
                    vae.encode(batch["pixel_values"])
                    .latent_dist.sample()
                    .detach()
                )
                # presumably the latent scaling factor expected by the UNet -- TODO confirm
                latents = latents * 0.18215
                # Sample noise that we'll add to the latents
                noise = torch.randn(latents.shape).to(latents.device)
                bsz = latents.shape[0]
                # Sample a random timestep for each image
                timesteps = torch.randint(
                    0,
                    noise_scheduler.num_train_timesteps,
                    (bsz,),
                    device=latents.device,
                ).long()
                # Add noise to the latents according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
                noisy_latents = noise_scheduler.add_noise(
                    latents, noise, timesteps
                )
                # Get the text embedding for conditioning
                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
                # Predict the noise residual
                noise_pred = unet(
                    noisy_latents, timesteps, encoder_hidden_states
                ).sample
                # Mean squared error between predicted and sampled noise:
                # averaged per example first, then over the batch.
                loss = (
                    F.mse_loss(noise_pred, noise, reduction="none")
                    .mean([1, 2, 3])
                    .mean()
                )
                accelerator.backward(loss)
                # Zero out the gradients for all token embeddings except the newly added
                # embeddings for the concept, as we only want to optimize the concept embeddings
                if accelerator.num_processes > 1:
                    # With several processes the encoder is wrapped, so the
                    # real model is reached through `.module`.
                    grads = (
                        text_encoder.module.get_input_embeddings().weight.grad
                    )
                else:
                    grads = text_encoder.get_input_embeddings().weight.grad
                # Get the index for tokens that we want to zero the grads for
                index_grads_to_zero = (
                    torch.arange(len(tokenizer)) != placeholder_token_id
                )
                grads.data[index_grads_to_zero, :] = grads.data[
                    index_grads_to_zero, :
                ].fill_(0)
                optimizer.step()
                optimizer.zero_grad()
            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                progress_bar.update(1)
                global_step += 1
            logs = {"loss": loss.detach().item()}
            progress_bar.set_postfix(**logs)
            # This only leaves the inner (batch) loop; the epoch loop ends on
            # its own because num_train_epochs is derived from max_train_steps.
            if global_step >= max_train_steps:
                break
        accelerator.wait_for_everyone()
    # Create the pipeline using the trained modules and save it.
    if accelerator.is_main_process:
        pipeline = StableDiffusionPipeline(
            text_encoder=accelerator.unwrap_model(text_encoder),
            vae=vae,
            unet=unet,
            tokenizer=tokenizer,
            scheduler=PNDMScheduler(
                beta_start=0.00085,
                beta_end=0.012,
                beta_schedule="scaled_linear",
                skip_prk_steps=True,
            ),
            safety_checker=StableDiffusionSafetyChecker.from_pretrained(
                "CompVis/stable-diffusion-safety-checker"
            ),
            feature_extractor=CLIPFeatureExtractor.from_pretrained(
                "openai/clip-vit-base-patch32"
            ),
        )
        pipeline.save_pretrained(output_dir)
        # Also save the newly trained embeddings
        learned_embeds = (
            accelerator.unwrap_model(text_encoder)
            .get_input_embeddings()
            .weight[placeholder_token_id]
        )
        learned_embeds_dict = {
            args.placeholder_token: learned_embeds.detach().cpu()
        }
        torch.save(
            learned_embeds_dict, os.path.join(output_dir, "learned_embeds.bin")
        )
import accelerate

# Run the fine-tuning in a single process.
accelerate.notebook_launcher(
    training_function, args=(text_encoder, vae, unet), num_processes=1
)

# Reload the pipeline that training_function saved to disk.
pipe = StableDiffusionPipeline.from_pretrained(
    hyperparameters["output_dir"],
    # torch_dtype=torch.float16,
)

# Generate --num_inference_samples batches from the prompt (the "*s" marker
# has already been replaced by the placeholder token).
all_images = []
for _ in range(args.num_inference_samples):
    images = pipe(
        [args.prompt],
        num_inference_steps=args.inference_steps,
        guidance_scale=7.5,
    ).images
    all_images.extend(images)

# makedirs creates missing parent directories too and tolerates an existing
# directory, so nested --output_dir values such as "runs/cat/out" work.
os.makedirs(args.output_dir, exist_ok=True)

# Save the results as 0.jpeg, 1.jpeg, ... inside the output directory.
for i, image in enumerate(all_images):
    image.save(os.path.join(args.output_dir, f"{i}.jpeg"))

View File

@@ -48,8 +48,8 @@ class EagerModeIREELinalgOnTensorsBackend(TorchMLIREagerBackend):
def __init__(self, device: str):
self.torch_device_str = device
self.config = ireert.Config(IREE_DEVICE_MAP[device])
self.raw_device_str = device
self.iree_device_str = IREE_DEVICE_MAP[device]
self.config = ireert.Config(self.iree_device_str)
def get_torch_metadata(
self, tensor: DeviceArray, kwargs: Dict[str, Any]
@@ -71,7 +71,7 @@ class EagerModeIREELinalgOnTensorsBackend(TorchMLIREagerBackend):
"EagerMode",
)
callable, _ = get_iree_compiled_module(
imported_module, self.raw_device_str, func_name=fn_name
imported_module, self.iree_device_str, func_name=fn_name
)
return callable

View File

@@ -37,51 +37,30 @@ def run_cmd(cmd):
sys.exit("Exiting program due to error running:", cmd)
def iree_device_map(device):
    """Translate a SHARK device string into the IREE driver string.

    ``device`` is either a bare device name (looked up in ``_IREE_DEVICE_MAP``)
    or a ``name://path`` URI, in which case only the name is translated and the
    path is appended unchanged.

    Raises:
        KeyError: if the device name is not in ``_IREE_DEVICE_MAP``.
    """
    # Split on the first "://" only. Allowing two splits would produce a third
    # part that the original code never used, silently dropping everything
    # after a second "://" in the path.
    uri_parts = device.split("://", 1)
    if len(uri_parts) == 1:
        return _IREE_DEVICE_MAP[uri_parts[0]]
    return f"{_IREE_DEVICE_MAP[uri_parts[0]]}://{uri_parts[1]}"
def get_supported_device_list():
return list(_IREE_DEVICE_MAP.keys())
_IREE_DEVICE_MAP = {
IREE_DEVICE_MAP = {
"cpu": "local-task",
"gpu": "cuda",
"cuda": "cuda",
"vulkan": "vulkan",
"metal": "vulkan",
"rocm": "rocm",
"intel-gpu": "level_zero",
"intel-gpu" : "level_zero",
}
def iree_target_map(device):
if "://" in device:
device = device.split("://")[0]
return _IREE_TARGET_MAP[device]
_IREE_TARGET_MAP = {
"cpu": "llvm-cpu",
IREE_TARGET_MAP = {
"cpu": "dylib",
"gpu": "cuda",
"cuda": "cuda",
"vulkan": "vulkan",
"metal": "vulkan",
"rocm": "rocm",
"intel-gpu": "opencl-spirv",
"intel-gpu" : "opencl-spirv",
}
# Finds whether the required drivers are installed for the given device.
def check_device_drivers(device):
"""Checks necessary drivers present for gpu and vulkan devices"""
if "://" in device:
device = device.split("://")[0]
if device == "cuda":
if device in ["gpu", "cuda"]:
try:
subprocess.check_output("nvidia-smi")
except Exception:
@@ -92,18 +71,10 @@ def check_device_drivers(device):
except Exception:
return True
elif device in ["intel-gpu"]:
try:
subprocess.check_output(["dpkg", "-L", "intel-level-zero-gpu"])
return False
except Exception:
return True
# TODO: Add intel gpu check.
return False
elif device == "cpu":
return False
elif device == "rocm":
try:
subprocess.check_output("rocminfo")
except Exception:
return True
# Unknown device.
else:
return True
@@ -113,11 +84,9 @@ def check_device_drivers(device):
# Installation info for the missing device drivers.
def device_driver_info(device):
if device == "cuda":
if device in ["gpu", "cuda"]:
return "nvidia-smi not found, please install the required drivers from https://www.nvidia.in/Download/index.aspx?lang=en-in"
elif device in ["metal", "vulkan"]:
return "vulkaninfo not found, Install from https://vulkan.lunarg.com/sdk/home or your distribution"
elif device == "rocm":
return "rocm info not found. Please install rocm"
else:
return f"{device} is not supported."

View File

@@ -13,13 +13,12 @@
# limitations under the License.
import iree.runtime.scripts.iree_benchmark_module as benchmark_module
from shark.iree_utils._common import run_cmd, iree_device_map
from shark.iree_utils.cpu_utils import get_cpu_count
from shark.iree_utils._common import run_cmd, IREE_DEVICE_MAP
import numpy as np
import os
import re
UNIT_TO_SECOND_MAP = {"us": 1e-6, "ms": 0.001, "s": 1}
UNIT_TO_SECOND_MAP = {"ms": 0.001, "s": 1}
def tensor_to_type_str(input_tensors: tuple, mlir_dialect: str):
@@ -35,12 +34,9 @@ def tensor_to_type_str(input_tensors: tuple, mlir_dialect: str):
dtype_string = str(input_tensor.dtype).replace("torch.", "")
elif mlir_dialect in ["mhlo", "tflite"]:
dtype = input_tensor.dtype
try:
dtype_string = re.findall("'[^\"]*'", str(dtype))[0].replace(
"'", ""
)
except IndexError:
dtype_string = str(dtype)
dtype_string = re.findall("'[^\"]*'", str(dtype))[0].replace(
"'", ""
)
regex_split = re.compile("([a-zA-Z]+)([0-9]+)")
match = regex_split.match(dtype_string)
mlir_type_string = str(match.group(1)[0]) + str(match.group(2))
@@ -70,40 +66,10 @@ def build_benchmark_args(
# TODO: Replace name of train with actual train fn name.
fn_name = "train"
benchmark_cl.append(f"--entry_function={fn_name}")
benchmark_cl.append(f"--device={iree_device_map(device)}")
benchmark_cl.append(f"--device={IREE_DEVICE_MAP[device]}")
mlir_input_types = tensor_to_type_str(input_tensors, mlir_dialect)
for mlir_input in mlir_input_types:
benchmark_cl.append(f"--function_input={mlir_input}")
if device == "cpu":
num_cpus = get_cpu_count()
if num_cpus is not None:
benchmark_cl.append(f"--task_topology_max_group_count={num_cpus}")
time_extractor = "| awk 'END{{print $2 $3}}'"
benchmark_cl.append(time_extractor)
return benchmark_cl
def build_benchmark_args_non_tensor_input(
input_file: str,
device: str,
inputs: tuple,
mlir_dialect: str,
function_name: str,
):
"""
Inputs: input_file leading to vmfb, input_tensor to function, target device,
and whether it is training or not.
Outputs: string that execute benchmark-module on target model.
"""
path = benchmark_module.__path__[0]
benchmarker_path = os.path.join(path, "..", "..", "iree-benchmark-module")
benchmark_cl = [benchmarker_path, f"--module_file={input_file}"]
# TODO: The function named can be passed as one of the args.
if function_name:
benchmark_cl.append(f"--entry_function={function_name}")
benchmark_cl.append(f"--device={iree_device_map(device)}")
for input in inputs:
benchmark_cl.append(f"--function_input={input}")
time_extractor = "| awk 'END{{print $2 $3}}'"
benchmark_cl.append(time_extractor)
return benchmark_cl

View File

@@ -13,34 +13,24 @@
# limitations under the License.
import iree.runtime as ireert
import iree.compiler as ireec
from shark.iree_utils._common import iree_device_map, iree_target_map
from shark.iree_utils.benchmark_utils import *
from shark.parser import shark_args
from shark.iree_utils._common import IREE_DEVICE_MAP, IREE_TARGET_MAP
import numpy as np
import os
import re
# Get the iree-compile arguments given device.
def get_iree_device_args(device, extra_args=[]):
if "://" in device:
device = device.split("://")[0]
def get_iree_device_args(device):
if device == "cpu":
from shark.iree_utils.cpu_utils import get_iree_cpu_args
return get_iree_cpu_args()
if device == "cuda":
if device in ["gpu", "cuda"]:
from shark.iree_utils.gpu_utils import get_iree_gpu_args
return get_iree_gpu_args()
if device in ["metal", "vulkan"]:
from shark.iree_utils.vulkan_utils import get_iree_vulkan_args
return get_iree_vulkan_args(extra_args=extra_args)
if device == "rocm":
from shark.iree_utils.gpu_utils import get_iree_rocm_args
return get_iree_rocm_args()
return get_iree_vulkan_args()
return []
@@ -64,182 +54,17 @@ def get_iree_common_args():
return [
"--iree-stream-resource-index-bits=64",
"--iree-vm-target-index-bits=64",
"--iree-util-zero-fill-elided-attrs",
]
# Args that are suitable only for certain models or groups of models.
# shark_args are passed down from pytests to control which models compile with these flags,
# but they can also be set in shark/parser.py
def get_model_specific_args():
ms_args = []
if shark_args.enable_conv_transform == True:
ms_args += ["--iree-flow-enable-conv-nchw-to-nhwc-transform"]
return ms_args
def create_dispatch_dirs(bench_dir, device):
protected_files = ["ordered-dispatches.txt"]
bench_dir_path = bench_dir.split("/")
bench_dir_path[-1] = "temp_" + bench_dir_path[-1]
tmp_bench_dir = "/".join(bench_dir_path)
for f_ in os.listdir(bench_dir):
if os.path.isfile(f"{bench_dir}/{f_}") and f_ not in protected_files:
dir_name = re.sub("\.\S*$", "", f_)
if os.path.exists(f"{bench_dir}/{dir_name}"):
os.system(f"rm -rf {bench_dir}/{dir_name}")
os.system(f"mkdir {bench_dir}/{dir_name}")
os.system(f"mv {bench_dir}/{f_} {bench_dir}/{dir_name}/{f_}")
for f_ in os.listdir(tmp_bench_dir):
if os.path.isfile(f"{tmp_bench_dir}/{f_}"):
dir_name = ""
for d_ in os.listdir(bench_dir):
if re.search(f"{d_}(?=\D)", f_):
dir_name = d_
if dir_name != "":
os.system(
f"mv {tmp_bench_dir}/{f_} {bench_dir}/{dir_name}/{dir_name}_benchmark.mlir"
)
def dump_isas(bench_dir):
for d_ in os.listdir(bench_dir):
if os.path.isdir(f"{bench_dir}/{d_}"):
for f_ in os.listdir(f"{bench_dir}/{d_}"):
if f_.endswith(".spv"):
os.system(
f"amdllpc -gfxip 11.0 {bench_dir}/{d_}/{f_} -v > \
{bench_dir}/{d_}/isa.txt"
)
def compile_benchmark_dirs(bench_dir, device, dispatch_benchmarks):
benchmark_runtimes = {}
dispatch_list = []
all_dispatches = False
if dispatch_benchmarks.lower().strip() == "all":
all_dispatches = True
else:
try:
dispatch_list = [
int(dispatch_index)
for dispatch_index in dispatch_benchmarks.split(" ")
]
except:
print("ERROR: Invalid dispatch benchmarks")
return None
for d_ in os.listdir(bench_dir):
if os.path.isdir(f"{bench_dir}/{d_}"):
in_dispatches = False
for dispatch in dispatch_list:
if str(dispatch) in d_:
in_dispatches = True
if all_dispatches or in_dispatches:
for f_ in os.listdir(f"{bench_dir}/{d_}"):
if "benchmark.mlir" in f_:
dispatch_file = open(f"{bench_dir}/{d_}/{f_}", "r")
module = dispatch_file.read()
dispatch_file.close()
flatbuffer_blob = ireec.compile_str(
module, target_backends=[iree_target_map(device)]
)
vmfb_file = open(
f"{bench_dir}/{d_}/{d_}_benchmark.vmfb", "wb"
)
vmfb_file.write(flatbuffer_blob)
vmfb_file.close()
config = get_iree_runtime_config(device)
vm_module = ireert.VmModule.from_flatbuffer(
config.vm_instance, flatbuffer_blob
)
benchmark_cl = build_benchmark_args_non_tensor_input(
input_file=f"{bench_dir}/{d_}/{d_}_benchmark.vmfb",
device=device,
inputs=(0,),
mlir_dialect="linalg",
function_name="",
)
benchmark_bash = open(
f"{bench_dir}/{d_}/{d_}_benchmark.sh", "w+"
)
benchmark_bash.write("#!/bin/bash\n")
benchmark_bash.write(" ".join(benchmark_cl))
benchmark_bash.close()
benchmark_data = run_benchmark_module(benchmark_cl)
benchmark_file = open(
f"{bench_dir}/{d_}/{d_}_data.txt", "w+"
)
benchmark_file.write(f"DISPATCH: {d_}\n")
benchmark_file.write(str(benchmark_data) + "\n")
benchmark_file.write(
"SHARK BENCHMARK RESULT: "
+ str(1 / (benchmark_data * 0.001))
+ "\n"
)
benchmark_file.close()
benchmark_runtimes[d_] = 1 / (benchmark_data * 0.001)
elif ".mlir" in f_ and "benchmark" not in f_:
dispatch_file = open(f"{bench_dir}/{d_}/{f_}", "r")
module = dispatch_file.read()
dispatch_file.close()
module = re.sub(
"hal.executable private",
"hal.executable public",
module,
)
flatbuffer_blob = ireec.compile_str(
module,
target_backends=[iree_target_map(device)],
extra_args=["--compile-mode=hal-executable"],
)
spirv_file = open(
f"{bench_dir}/{d_}/{d_}_spirv.vmfb", "wb"
)
spirv_file.write(flatbuffer_blob)
spirv_file.close()
ordered_dispatches = [
(k, v)
for k, v in sorted(
benchmark_runtimes.items(), key=lambda item: item[1]
)
][::-1]
f_ = open(f"{bench_dir}/ordered-dispatches.txt", "w+")
for dispatch in ordered_dispatches:
f_.write(f"{dispatch[0]}: {dispatch[1]}ms\n")
f_.close()
def compile_module_to_flatbuffer(
module,
device,
frontend,
func_name,
model_config_path,
extra_args,
model_name="None",
module, device, frontend, func_name, model_config_path
):
# Setup Compile arguments wrt to frontends.
input_type = ""
args = get_iree_frontend_args(frontend)
args += get_iree_device_args(device, extra_args)
args += get_iree_common_args()
args += get_model_specific_args()
args += extra_args
args += get_iree_device_args(device)
# args += get_iree_common_args()
if frontend in ["tensorflow", "tf"]:
input_type = "mhlo"
@@ -247,24 +72,24 @@ def compile_module_to_flatbuffer(
input_type = frontend
elif frontend in ["tflite", "tflite-tosa"]:
input_type = "tosa"
elif frontend in ["tm_tensor"]:
input_type = ireec.InputType.TM_TENSOR
# TODO: make it simpler.
# Compile according to the input type, else just try compiling.
if input_type not in ["mhlo", "tosa"]:
module = str(module)
if input_type != "":
# Currently for MHLO/TOSA.
flatbuffer_blob = ireec.compile_str(
module,
target_backends=[iree_target_map(device)],
target_backends=[IREE_TARGET_MAP[device]],
extra_args=args,
input_type=input_type,
)
else:
# Currently for Torch.
flatbuffer_blob = ireec.compile_str(
module,
target_backends=[iree_target_map(device)],
str(module),
target_backends=[IREE_TARGET_MAP[device]],
extra_args=args,
)
@@ -273,10 +98,8 @@ def compile_module_to_flatbuffer(
def get_iree_module(flatbuffer_blob, device, func_name):
# Returns the compiled module and the configs.
config = get_iree_runtime_config(device)
vm_module = ireert.VmModule.from_flatbuffer(
config.vm_instance, flatbuffer_blob
)
vm_module = ireert.VmModule.from_flatbuffer(flatbuffer_blob)
config = ireert.Config(IREE_DEVICE_MAP[device])
ctx = ireert.SystemContext(config=config)
ctx.add_vm_module(vm_module)
ModuleCompiled = ctx.modules.module[func_name]
@@ -289,44 +112,27 @@ def get_iree_compiled_module(
frontend: str = "torch",
func_name: str = "forward",
model_config_path: str = None,
extra_args: list = [],
):
"""Given a module returns the compiled .vmfb and configs"""
flatbuffer_blob = compile_module_to_flatbuffer(
module, device, frontend, func_name, model_config_path, extra_args
module, device, frontend, func_name, model_config_path
)
return get_iree_module(flatbuffer_blob, device, func_name)
def load_flatbuffer(
flatbuffer_path: str, device: str, func_name: str = "forward"
):
with open(os.path.join(flatbuffer_path), "rb") as f:
flatbuffer_blob = f.read()
return get_iree_module(flatbuffer_blob, device, func_name)
def export_iree_module_to_vmfb(
module,
device: str,
directory: str,
mlir_dialect: str = "linalg",
frontend: str = "torch",
func_name: str = "forward",
model_config_path: str = None,
module_name: str = None,
extra_args: list = [],
):
# Compiles the module given specs and saves it as .vmfb file.
flatbuffer_blob = compile_module_to_flatbuffer(
module, device, mlir_dialect, func_name, model_config_path, extra_args
module, device, frontend, func_name, model_config_path
)
if module_name is None:
device_name = (
device if "://" not in device else "-".join(device.split("://"))
)
module_name = f"{mlir_dialect}_{func_name}_{device_name}"
module_name = f"{frontend}_{func_name}_{device}"
filename = os.path.join(directory, module_name + ".vmfb")
print(f"Saved vmfb in {filename}.")
with open(filename, "wb") as f:
@@ -362,10 +168,4 @@ def get_results(compiled_vm, input, config, frontend="torch"):
res = np.array(data, dtype=object)
return np.copy(res)
else:
return result.to_host()
def get_iree_runtime_config(device):
device = iree_device_map(device)
config = ireert.Config(device=ireert.get_device(device))
return config
return np.copy(np.asarray(result, dtype=result.dtype))

View File

@@ -16,17 +16,6 @@
import subprocess
def get_cpu_count():
    """Return the CPU count reported by ``multiprocessing``.

    Returns:
        The integer from ``multiprocessing.cpu_count()``, or ``None`` when that
        call raises ``NotImplementedError``.
    """
    from multiprocessing import cpu_count as query_cpu_count

    try:
        return query_cpu_count()
    except NotImplementedError:
        return None
# Get the default cpu args.
def get_iree_cpu_args():
find_triple_cmd = "uname -s -m"

View File

@@ -16,7 +16,6 @@
import iree.runtime as ireert
import ctypes
from shark.parser import shark_args
# Get the default gpu args given the architecture.
def get_iree_gpu_args():
@@ -24,9 +23,7 @@ def get_iree_gpu_args():
ireert.flags.parse_flags("--cuda_allow_inline_execution")
# TODO: Give the user_interface to pass the sm_arch.
sm_arch = get_cuda_sm_cc()
if (
sm_arch in ["sm_70", "sm_72", "sm_75", "sm_80", "sm_84", "sm_86"]
) and (shark_args.enable_tf32 == True):
if sm_arch in ["sm_70", "sm_72", "sm_75", "sm_80", "sm_84", "sm_86"]:
return [
"--iree-hal-cuda-disable-loop-nounroll-wa",
f"--iree-hal-cuda-llvm-target-arch={sm_arch}",
@@ -35,18 +32,6 @@ def get_iree_gpu_args():
return ["--iree-hal-cuda-disable-loop-nounroll-wa"]
# Get the default ROCm args; the target chip is currently hard-coded.
def get_iree_rocm_args():
    """Return the default compiler flags for the ROCm backend.

    As a side effect this sets ``ireert.flags.FUNCTION_INPUT_VALIDATION`` to
    ``False``. The target chip is hard-coded to ``gfx908``.
    """
    ireert.flags.FUNCTION_INPUT_VALIDATION = False
    # TODO: find a way to get arch from code.
    target_chip = "gfx908"
    rocm_flags = [f"--iree-rocm-target-chip={target_chip}"]
    rocm_flags.append("--iree-rocm-link-bc=true")
    rocm_flags.append("--iree-rocm-bc-dir=/opt/rocm/amdgcn/bitcode")
    return rocm_flags
# Some constants taken from cuda.h
CUDA_SUCCESS = 0
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16

View File

@@ -14,69 +14,27 @@
# All the iree_vulkan related functionalities go here.
from os import linesep
from shark.iree_utils._common import run_cmd
import iree.runtime as ireert
from sys import platform
def get_vulkan_device_name():
    """Return the first ``deviceName`` line printed by ``vulkaninfo``.

    The output of ``run_cmd("vulkaninfo")`` is split on ``linesep`` and every
    line containing ``deviceName`` is kept, stripped of surrounding
    whitespace. When several lines match, a notice is printed and the first
    one wins.

    Raises:
        ValueError: if no line contains ``deviceName``.
    """
    device_lines = []
    for line in run_cmd("vulkaninfo").split(linesep):
        if "deviceName" in line:
            device_lines.append(line.strip())

    if not device_lines:
        raise ValueError("No device name found in VulkanInfo!")
    if len(device_lines) > 1:
        print(
            f"Found {len(device_lines)} device names. choosing first one: {device_lines[0]}"
        )
    return device_lines[0]
def get_os_name():
    """Map ``sys.platform`` to one of ``"linux"``, ``"macos"`` or ``"windows"``.

    Any platform string that is not recognised prints a notice and is
    reported as ``"linux"``.
    """
    if platform.startswith("linux"):
        return "linux"

    # Exact-match platforms.
    known_platforms = {"darwin": "macos", "win32": "windows"}
    if platform in known_platforms:
        return known_platforms[platform]

    print("Cannot detect OS type, defaulting to linux.")
    return "linux"
def get_vulkan_triple_flag(extra_args=[]):
if "-iree-vulkan-target-triple=" in " ".join(extra_args):
print(f"Using target triple from command line args")
return None
system_os = get_os_name()
vulkan_device = get_vulkan_device_name()
if all(x in vulkan_device for x in ("Apple", "M1")):
print(f"Found {vulkan_device} Device. Using m1-moltenvk-macos")
def get_vulkan_triple_flag():
vulkan_device_cmd = "vulkaninfo | grep deviceName | awk 'END{{print $NF}}'"
vulkan_device = run_cmd(vulkan_device_cmd).strip()
if vulkan_device == "Ultra":
print("Found MacStudio M1 Device. Using m1-moltenvk-macos")
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
elif all(x in vulkan_device for x in ("Apple", "M2")):
elif vulkan_device == "M2":
print("Found Apple M2 Device. Using m1-moltenvk-macos")
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
elif all(x in vulkan_device for x in ("A100", "SXM4")):
print(
f"Found {vulkan_device} Device. Using ampere-rtx3080-{system_os}"
)
return f"-iree-vulkan-target-triple=ampere-rtx3080-{system_os}"
elif all(x in vulkan_device for x in ("RTX", "3090")):
print(
f"Found {vulkan_device} Device. Using ampere-rtx3090-{system_os}"
)
return f"-iree-vulkan-target-triple=ampere-rtx3090-{system_os}"
elif all(x in vulkan_device for x in ("RTX", "4090")):
print(
f"Found {vulkan_device} Device. Using ampere-rtx3090-{system_os}"
)
return f"-iree-vulkan-target-triple=ampere-rtx3090-{system_os}"
elif all(x in vulkan_device for x in ("AMD", "7900")):
print(f"Found {vulkan_device} Device. Using rdna3-7900-{system_os}")
return f"-iree-vulkan-target-triple=rdna3-7900-{system_os}"
elif any(x in vulkan_device for x in ("AMD", "Radeon")):
print(f"Found AMD device. Using rdna2-unknown-{system_os}")
return f"-iree-vulkan-target-triple=rdna2-unknown-{system_os}"
elif vulkan_device == "M1":
print("Found Apple M1 Device. Using m1-moltenvk-macos")
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
elif vulkan_device == "A100-SXM4-40GB":
print("Found Nvidia Device. Using ampere-rtx3080-linux")
return "-iree-vulkan-target-triple=ampere-rtx3080-linux"
elif vulkan_device == "3090":
print("Found Nvidia Device. Using ampere-rtx3090-linux")
return "-iree-vulkan-target-triple=ampere-rtx3090-linux"
else:
print(
"""Optimized kernel for your target device is not added yet.
@@ -87,16 +45,10 @@ def get_vulkan_triple_flag(extra_args=[]):
return None
def get_iree_vulkan_args(extra_args=[]):
def get_iree_vulkan_args():
# vulkan_flag = ["--iree-flow-demote-i64-to-i32"]
vulkan_flag = []
vulkan_triple_flag = get_vulkan_triple_flag(extra_args)
vulkan_triple_flag = get_vulkan_triple_flag()
if vulkan_triple_flag is not None:
vulkan_flag.append(vulkan_triple_flag)
return vulkan_flag
def set_iree_vulkan_runtime_flags(flags):
    """Hand each entry of ``flags`` to ``ireert.flags.parse_flags``, one at a time.

    Returns ``None``.
    """
    for runtime_flag in flags:
        ireert.flags.parse_flags(runtime_flag)

View File

@@ -12,34 +12,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
This function takes the model mlir file and the tuned config file as input,
and output a new mlir file with lowering configs annotated on certain ops.
There are two ways to utilize the function:
1. Call model_annotation function within another python script
from shark.model_annotation import model_annotation
with create_context() as ctx:
module = model_annotation(ctx, input_contents=..., config_path=..., search_op=...)
2. Run model_annotation.py directly
python model_annotation.py path_to_original_mlir path_to_config_file
"""
import sys
import json
import os
import sys
from typing import Dict, List
from typing import List, Dict
from iree.compiler import ir
from iree.compiler.transforms import ireec as ireec_trans
MATMUL_OP_NAMES = set(
["linalg.matmul", "linalg.batch_matmul", "mhlo.dot", "mhlo.dot_general"]
)
idx = 0
def model_annotation(
ctx: ir.Context,
*,
input_contents: str,
config_path: str,
search_op: str = "matmul",
ctx: ir.Context, *, input_contents: str, config_path: str
):
if os.path.isfile(input_contents):
with open(input_contents, "rb") as f:
@@ -53,35 +41,21 @@ def model_annotation(
# The Python API does not expose a general walk() function, so we just
# do it ourselves.
walk_children(module.operation, configs, 0, search_op)
walk_children(module.operation, configs)
if not module.operation.verify():
raise RuntimeError("Modified program does not verify!")
# More efficient than: print(module)
# - Disables verification (already done above)
# - Writes as binary, avoiding costly unicode conversions
sys.stdout.buffer.write(
module.operation.get_asm(assume_verified=True, binary=True)
)
return module
def walk_children(
op: ir.Operation, configs: List[Dict], idx: int, search_op: str
):
if search_op == "matmul":
op_names = ["linalg.matmul", "mhlo.dot"]
elif search_op == "bmm":
op_names = ["linalg.batch_matmul", "mhlo.dot_general"]
elif search_op == "conv":
op_names = ["mhlo.convolution", "linalg.conv_2d_nhwc_hwcf"]
elif search_op == "all":
op_names = [
"mhlo.dot",
"mhlo.dot_general",
"mhlo.convolution",
"linalg.matmul",
"linalg.batch_matmul",
"linalg.conv_2d_nhwc_hwcf",
]
else:
raise ValueError(f"{search_op} op is not tunable.")
def walk_children(op: ir.Operation, configs: List[Dict]):
for region in op.regions:
for block in region.blocks:
for child_op in block.operations:
@@ -89,38 +63,34 @@ def walk_children(
# 'operation' and 'name' attributes.
if isinstance(child_op, ir.OpView):
child_op = child_op.operation
if child_op.name in op_names and idx < len(configs):
add_attributes(child_op, configs[idx])
if child_op.name in MATMUL_OP_NAMES:
global idx
(
tile_sizes,
pipeline,
workgroup_size,
split_k,
pipeline_depth,
) = parse_config(configs[idx])
add_compilation_info(
child_op,
tile_sizes=tile_sizes,
pipeline=pipeline,
workgroup_size=workgroup_size,
pipeline_depth=pipeline_depth,
)
if split_k:
add_split_k(child_op, split_k)
idx = idx + 1
print(f"Updated op {child_op}", file=sys.stderr)
walk_children(child_op, configs, idx, search_op)
def add_attributes(op: ir.Operation, config: Dict):
    """Annotate ``op`` with the lowering settings described by ``config``.

    ``config`` is decoded with ``parse_config``; the tile sizes, pipeline,
    workgroup size and pipeline depth are attached via
    ``add_compilation_info``. When the parsed split-k value is truthy it is
    additionally stored under the ``iree_flow_split_k`` attribute.
    """
    parsed = parse_config(config)
    tile_sizes, pipeline, workgroup_size, split_k, pipeline_depth = parsed

    add_compilation_info(
        op,
        tile_sizes=tile_sizes,
        pipeline=pipeline,
        workgroup_size=workgroup_size,
        pipeline_depth=pipeline_depth,
    )

    if not split_k:
        return
    add_attribute_by_name(op, "iree_flow_split_k", split_k)
walk_children(child_op, configs)
def parse_config(config: Dict):
split_k = None
pipeline_depth = None
if "GPU" in config["pipeline"]:
if config["pipeline"] == "GPU" or config["pipeline"] == "GPU_TENSORCORE":
pipeline = (
"LLVMGPUMatmulSimt"
if config["pipeline"] == "GPU"
@@ -128,31 +98,24 @@ def parse_config(config: Dict):
)
tile_sizes = [config["work_group_tile_sizes"]]
workgroup_size = config["work_group_sizes"]
if "pipeline_depth" in config.keys():
try:
pipeline_depth = config["pipeline_depth"]
if "split_k" in config.keys():
except:
pipeline_depth = None
try:
split_k = config["split_k"]
elif "SPIRV" in config["pipeline"]:
pipeline = config["pipeline"]
tile_sizes = [
config["work_group_tile_sizes"],
config["parallel_tile_sizes"],
config["reduction_tile_sizes"],
]
if "vector_tile_sizes" in config.keys():
tile_sizes += [config["vector_tile_sizes"]]
if "window_tile_sizes" in config.keys():
tile_sizes += [config["window_tile_sizes"]]
workgroup_size = config["work_group_sizes"]
except:
split_k = None
else:
# For IREE CPU pipelines
pipeline = config["pipeline"]
tile_sizes = [
config["work_group_tile_sizes"],
config["parallel_tile_sizes"],
config["reduction_tile_sizes"],
config["l1_tile_sizes"],
config["vector_tile_sizes"],
]
workgroup_size = []
split_k = None
pipeline_depth = None
return tile_sizes, pipeline, workgroup_size, split_k, pipeline_depth
@@ -182,9 +145,9 @@ def add_compilation_info(
op.attributes["compilation_info"] = attr
def add_attribute_by_name(op: ir.Operation, name: str, val: int):
    """Store ``val`` on ``op`` as a signless 64-bit integer attribute named ``name``."""
    i64 = ir.IntegerType.get_signless(64)
    op.attributes[name] = ir.IntegerAttr.get(i64, val)
def add_split_k(op: ir.Operation, k: int):
    """Store ``k`` on ``op`` as the signless 64-bit ``iree_flow_split_k`` attribute."""
    i64 = ir.IntegerType.get_signless(64)
    op.attributes["iree_flow_split_k"] = ir.IntegerAttr.get(i64, k)
def create_context() -> ir.Context:
@@ -196,14 +159,6 @@ def create_context() -> ir.Context:
if __name__ == "__main__":
with create_context() as ctx:
module = model_annotation(
ctx,
input_contents=sys.argv[1],
config_path=sys.argv[2],
search_op="all",
model_annotation(
ctx, input_contents=sys.argv[1], config_path=sys.argv[2]
)
mlir_str = str(module)
filename = "tuned_model.mlir"
with open(filename, "w") as f:
f.write(mlir_str)
print(f"Saved mlir in {filename}.")

View File

@@ -38,7 +38,7 @@ parser.add_argument(
"--device",
type=str,
default="cpu",
help="Device on which shark_runner runs. options are cpu, cuda, and vulkan",
help="Device on which shark_runner runs. options are cpu, gpu, and vulkan",
)
parser.add_argument(
"--repro_dir",
@@ -47,10 +47,16 @@ parser.add_argument(
default="./shark_tmp",
)
parser.add_argument(
"--enable_tf32",
type=bool,
"--save_mlir",
default=False,
help="Enables TF32 precision calculations on supported GPUs.",
action="store_true",
help="Saves input MLIR module to /tmp/ directory.",
)
parser.add_argument(
"--save_vmfb",
default=False,
action="store_true",
help="Saves iree .vmfb module to /tmp/ directory.",
)
parser.add_argument(
"--model_config_path",
@@ -61,55 +67,14 @@ parser.add_argument(
parser.add_argument(
"--num_warmup_iterations",
type=int,
default=5,
default=2,
help="Run the model for the specified number of warmup iterations.",
)
parser.add_argument(
"--num_iterations",
type=int,
default=100,
default=1,
help="Run the model for the specified number of iterations.",
)
# Opt-in: also benchmark with ONNX Runtime.
parser.add_argument(
    "--onnx_bench",
    default=False,
    action="store_true",
    help="When enabled, pytest bench results will include ONNX benchmark results.",
)
# Sub-directory of the gs://shark_tank bucket to pull artifacts from.
parser.add_argument(
    "--shark_prefix",
    default="latest",
    help="gs://shark_tank/<this_flag>/model_directories",
)
# Opt-in: refresh local artifacts when the upstream hash differs.
parser.add_argument(
    "--update_tank",
    default=False,
    action="store_true",
    help="When enabled, SHARK downloader will update local shark_tank if local hash is different from latest upstream hash.",
)
# Empty string means "use the default cache location".
parser.add_argument(
    "--local_tank_cache",
    default="",
    help="Specify where to save downloaded shark_tank artifacts. If this is not set, the default is ~/.local/shark_tank/.",
)
parser.add_argument(
    "--dispatch_benchmarks",
    default=None,
    help='dispatches to return benchmark data on. use "All" for all, and None for none.',
)
parser.add_argument(
    "--dispatch_benchmarks_dir",
    default="temp_dispatch_benchmarks",
    help='directory where you want to store dispatch data generated with "--dispatch_benchmarks"',
)
# Opt-in switch. It must be "store_true": with default=False, "store_false"
# would leave the value False whether or not the flag is passed, so the
# option could never be enabled.
parser.add_argument(
    "--enable_conv_transform",
    default=False,
    action="store_true",
    help="Enables the --iree-flow-enable-conv-nchw-to-nhwc-transform flag.",
)
shark_args, unknown = parser.parse_known_args()

View File

@@ -19,74 +19,37 @@ from shark.iree_utils.benchmark_utils import (
run_benchmark_module,
)
from shark.parser import shark_args
from tank.model_utils import get_torch_model
from datetime import datetime
import time
import csv
import os
class OnnxFusionOptions(object):
    """Plain bundle of fusion switches; every switch starts out as ``False``."""

    def __init__(self):
        # Assigned in a fixed order so the instance attributes are created
        # in the same sequence every time.
        for switch in (
            "disable_gelu",
            "disable_layer_norm",
            "disable_attention",
            "disable_skip_layer_norm",
            "disable_embed_layer_norm",
            "disable_bias_skip_layer_norm",
            "disable_bias_gelu",
            "enable_gelu_approximation",
            "use_mask_index",
            "no_attention_mask",
        ):
            setattr(self, switch, False)
def check_requirements(frontend):
    """Report whether the packages needed to benchmark ``frontend`` can be found.

    Args:
        frontend: ``"torch"`` requires ``torchvision``; ``"tensorflow"`` or
            ``"tf"`` requires both ``keras`` and ``tensorflow``.

    Returns:
        ``True`` when every required package has an import spec, ``False``
        otherwise (including for any other ``frontend`` value).
    """
    # Import the submodule explicitly: a bare ``import importlib`` does not
    # guarantee that ``importlib.util`` has been loaded as an attribute.
    import importlib.util

    if frontend == "torch":
        required = ("torchvision",)
    elif frontend in ["tensorflow", "tf"]:
        required = ("keras", "tensorflow")
    else:
        return False
    return all(importlib.util.find_spec(pkg) is not None for pkg in required)
class SharkBenchmarkRunner(SharkRunner):
# SharkRunner derived class with Benchmarking capabilities.
def __init__(
self,
mlir_module: bytes,
mlir_module: str,
function_name: str = "forward",
device: str = "none",
mlir_dialect: str = "linalg",
extra_args: list = [],
frontend: str = "torch",
):
self.device = shark_args.device if device == "none" else device
self.frontend = frontend
self.frontend_model = None
self.vmfb_file = None
self.mlir_dialect = mlir_dialect
self.extra_args = extra_args
SharkRunner.__init__(
self,
mlir_module,
function_name,
device,
self.mlir_dialect,
self.extra_args,
compile_vmfb=True,
mlir_dialect,
)
if self.vmfb_file == None:
self.vmfb_file = export_iree_module_to_vmfb(
mlir_module,
device,
shark_args.repro_dir,
self.mlir_dialect,
function_name,
extra_args=self.extra_args,
mlir_module, device, shark_args.repro_dir, self.frontend
)
def setup_cl(self, input_tensors):
@@ -97,25 +60,23 @@ class SharkBenchmarkRunner(SharkRunner):
mlir_dialect=self.mlir_dialect,
)
def benchmark_frontend(self, modelname):
if self.mlir_dialect in ["linalg", "torch"]:
def benchmark_frontend(self, inputs, modelname):
if self.frontend in ["pytorch", "torch"]:
return self.benchmark_torch(modelname)
elif self.mlir_dialect in ["mhlo", "tf"]:
return self.benchmark_tf(modelname)
elif self.frontend in ["tensorflow", "tf"]:
return self.benchmark_tf(inputs, modelname)
def benchmark_torch(self, modelname):
import torch
from tank.model_utils import get_torch_model
if self.device == "cuda":
if self.device == "gpu":
torch.set_default_tensor_type(torch.cuda.FloatTensor)
else:
torch.set_default_tensor_type(torch.FloatTensor)
torch_device = torch.device(
"cuda:0" if self.device == "cuda" else "cpu"
"cuda:0" if self.device == "gpu" else "cpu"
)
HFmodel, input = get_torch_model(modelname)[:2]
HFmodel, input, act_out = get_torch_model(modelname)
frontend_model = HFmodel.model
frontend_model.to(torch_device)
input.to(torch_device)
@@ -137,49 +98,27 @@ class SharkBenchmarkRunner(SharkRunner):
f"{((end-begin)/shark_args.num_iterations)*1000}",
]
def benchmark_tf(self, modelname):
import tensorflow as tf
def benchmark_tf(self, frontend_model, inputs):
for i in range(shark_args.num_warmup_iterations):
frontend_model.forward(*inputs)
visible_default = tf.config.list_physical_devices("GPU")
try:
tf.config.set_visible_devices([], "GPU")
visible_devices = tf.config.get_visible_devices()
for device in visible_devices:
assert device.device_type != "GPU"
except:
# Invalid device or cannot modify virtual devices once initialized.
pass
from tank.model_utils_tf import get_tf_model
# tf_device = "/GPU:0" if self.device == "cuda" else "/CPU:0"
tf_device = "/CPU:0"
with tf.device(tf_device):
model, input, = get_tf_model(
modelname
)[:2]
frontend_model = model
for i in range(shark_args.num_warmup_iterations):
frontend_model.forward(*input)
begin = time.time()
for i in range(shark_args.num_iterations):
out = frontend_model.forward(*input)
if i == shark_args.num_iterations - 1:
end = time.time()
break
print(
f"TF benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
)
return [
f"{shark_args.num_iterations/(end-begin)}",
f"{((end-begin)/shark_args.num_iterations)*1000}",
]
begin = time.time()
for i in range(shark_args.num_iterations):
out = frontend_model.forward(*inputs)
if i == shark_args.num_iterations - 1:
end = time.time()
break
print(
f"TF benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
)
return [
f"{shark_args.num_iterations/(end-begin)}",
f"{((end-begin)/shark_args.num_iterations)*1000}",
]
def benchmark_c(self):
result = run_benchmark_module(self.benchmark_cl)
print(f"Shark-IREE-C benchmark:{result} iter/second")
print(f"Shark-{self.frontend} C-benchmark:{result} iter/second")
return [f"{result}", f"{1000/result}"]
def benchmark_python(self, inputs):
@@ -193,134 +132,32 @@ class SharkBenchmarkRunner(SharkRunner):
if i == shark_args.num_iterations - 1:
end = time.time()
print(
f"Shark-IREE Python benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
f"Shark-{self.frontend} Python-benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
)
return [
f"{shark_args.num_iterations/(end-begin)}",
f"{((end-begin)/shark_args.num_iterations)*1000}",
]
def benchmark_onnx(self, modelname, inputs):
    """Benchmark ``modelname`` through onnxruntime's transformers benchmark.

    Returns:
        ``[QPS, average_latency_ms]`` taken from the first result entry, or
        ``["N/A", "N/A"]`` when ``self.device`` is ``"cuda"`` or the model is
        not listed in onnxruntime's ``MODELS`` table.
    """
    if self.device == "cuda":
        print(
            "Currently GPU benchmarking on ONNX is not supported in SHARK."
        )
        return ["N/A", "N/A"]

    from onnxruntime.transformers.benchmark import run_onnxruntime
    from onnxruntime.transformers.huggingface_models import MODELS
    from onnxruntime.transformers.benchmark_helper import (
        ConfigModifier,
        Precision,
    )
    import psutil

    # This MiniLM checkpoint is looked up under the bert-base-uncased name.
    if modelname == "microsoft/MiniLM-L12-H384-uncased":
        modelname = "bert-base-uncased"
    if modelname not in MODELS:
        print(
            f"{modelname} is currently not supported in ORT's HF. Check \
https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/huggingface_models.py \
for currently supported models. Exiting benchmark ONNX."
        )
        return ["N/A", "N/A"]

    on_gpu = self.device == "cuda"
    physical_cores = psutil.cpu_count(logical=False)
    model_cache = os.path.join(".", "cache_models")
    onnx_export_dir = os.path.join(".", "onnx_models")
    fusion_stats = {}
    modifier = ConfigModifier(None)
    fusion_options = OnnxFusionOptions()

    # run_onnxruntime is called positionally; the trailing comments name
    # the role of each literal.
    stats = run_onnxruntime(
        on_gpu,  # use_gpu
        None,  # provider
        (modelname,),
        None,
        modifier,
        Precision.FLOAT32,
        physical_cores,  # num_threads
        [1],  # batch_sizes
        [128],  # sequence_lengths
        shark_args.num_iterations,
        [1],  # input_counts
        True,  # optimize_onnx
        False,  # validate_onnx
        model_cache,  # cache_dir
        onnx_export_dir,  # onnx_dir
        False,  # verbose
        False,  # overwrite
        False,  # disable_ort_io_binding
        True,  # use_raw_attention_mask
        fusion_stats,  # model_fusion_statistics
        "pt",  # model_source: either "pt" or "tf"
        fusion_options,
    )
    first = stats[0]
    print(
        f"ONNX ORT-benchmark:{first['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
    )
    return [first["QPS"], first["average_latency_ms"]]
def get_metadata(self, modelname):
    """Look up ``modelname`` in ``./tank/model_metadata.csv``.

    The first row of the file is treated as a header and skipped. The first
    remaining row whose column 0 equals ``modelname`` supplies the result.

    Returns:
        ``[param_count, model_tags, model_notes]`` (columns 3, 4 and 5 of the
        matching row). When no row matches, three empty strings are returned
        so that callers unpacking three values do not fail on ``None``.
    """
    # newline="" is the csv module's recommended way to open files.
    with open("./tank/model_metadata.csv", mode="r", newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=",")
        next(reader, None)  # skip the header row (tolerates an empty file)
        for row in reader:
            # Blank lines come back as empty lists; ignore them.
            if row and row[0] == modelname:
                return [row[3], row[4], row[5]]
    return ["", "", ""]
def compare_bench_results(self, baseline: str, result: str):
    """Express ``baseline / result`` as a string such as ``"1.04x baseline"``.

    Both arguments are converted with ``float``; the ratio is rounded to two
    decimals. Returns ``"N/A"`` when ``baseline`` is ``None``.
    """
    if baseline is None:
        return "N/A"
    ratio = float(baseline) / float(result)
    return f"{round(ratio, 2)}x baseline"
# Runs the three benchmarks back to back: the frontend benchmark, the SHARK
# Python benchmark and the SHARK C benchmark. Their return values are
# discarded here.
# NOTE(review): benchmark_frontend is declared elsewhere in this file with a
# `modelname` parameter; confirm that passing only `inputs` matches the
# signature actually in effect.
def benchmark_all(self, inputs: tuple):
self.benchmark_frontend(inputs)
self.benchmark_python(inputs)
self.benchmark_c()
def benchmark_all_csv(
self, inputs: tuple, modelname, dynamic, device_str, frontend
):
self.setup_cl(inputs)
field_names = [
"platform",
"model",
"engine",
"dialect",
"dynamic",
"device",
"shape_type",
"data_type",
"iter/sec",
"ms/iter",
"vs. PyTorch/TF",
"iterations",
"param_count",
"tags",
"notes",
"datetime",
]
engines = ["frontend", "shark_python", "shark_iree_c"]
if shark_args.onnx_bench == True:
engines.append("onnxruntime")
platforms = ["frontend", "shark_python", "shark_iree_c"]
if not os.path.exists("bench_results.csv"):
with open("bench_results.csv", mode="w", newline="") as f:
@@ -332,69 +169,26 @@ for currently supported models. Exiting benchmark ONNX."
bench_result = {}
bench_result["model"] = modelname
if dynamic == True:
bench_result["shape_type"] = "dynamic"
bench_result["dynamic"] = "True"
else:
bench_result["shape_type"] = "static"
bench_result["dynamic"] = "False"
bench_result["device"] = device_str
bench_result["data_type"] = inputs[0].dtype
for e in engines:
(
bench_result["param_count"],
bench_result["tags"],
bench_result["notes"],
) = ["", "", ""]
if e == "frontend":
bench_result["engine"] = frontend
if check_requirements(frontend):
(
bench_result["iter/sec"],
bench_result["ms/iter"],
) = self.benchmark_frontend(modelname)
self.frontend_result = bench_result["ms/iter"]
bench_result["vs. PyTorch/TF"] = "baseline"
(
bench_result["param_count"],
bench_result["tags"],
bench_result["notes"],
) = self.get_metadata(modelname)
else:
self.frontend_result = None
continue
elif e == "shark_python":
bench_result["engine"] = "shark_python"
(
bench_result["iter/sec"],
bench_result["ms/iter"],
) = self.benchmark_python(inputs)
bench_result[
"vs. PyTorch/TF"
] = self.compare_bench_results(
self.frontend_result, bench_result["ms/iter"]
)
elif e == "shark_iree_c":
bench_result["engine"] = "shark_iree_c"
(
bench_result["iter/sec"],
bench_result["ms/iter"],
) = self.benchmark_c()
bench_result[
"vs. PyTorch/TF"
] = self.compare_bench_results(
self.frontend_result, bench_result["ms/iter"]
)
elif e == "onnxruntime":
bench_result["engine"] = "onnxruntime"
(
bench_result["iter/sec"],
bench_result["ms/iter"],
) = self.benchmark_onnx(modelname, inputs)
bench_result["dialect"] = self.mlir_dialect
bench_result["iterations"] = shark_args.num_iterations
for p in platforms:
if p == "frontend":
bench_result["platform"] = frontend
bench_result["iter/sec"] = self.benchmark_frontend(
inputs, modelname
)[0]
bench_result["ms/iter"] = self.benchmark_frontend(
inputs, modelname
)[1]
elif p == "shark_python":
bench_result["platform"] = "shark_python"
bench_result["iter/sec"] = self.benchmark_python(inputs)[0]
bench_result["ms/iter"] = self.benchmark_python(inputs)[1]
else:
bench_result["platform"] = "shark_iree_c"
bench_result["iter/sec"] = self.benchmark_c()[0]
bench_result["ms/iter"] = self.benchmark_c()[1]
bench_result["datetime"] = str(datetime.now())
writer.writerow(bench_result)

View File

@@ -14,51 +14,10 @@
import numpy as np
import os
import sys
import urllib.request
import json
import hashlib
from pathlib import Path
from shark.parser import shark_args
from google.cloud import storage
def download_public_file(
    full_gs_url, destination_folder_name, single_file=False
):
    """Downloads public blob(s) from a bucket using an anonymous client.

    Args:
        full_gs_url: URL of the form ``gs://bucket-name/path``. Everything
            after the bucket name is used as the listing prefix; in
            single-file mode the last path component is the wanted file and
            the prefix is its parent path.
        destination_folder_name: local directory to download into. In
            single-file mode this is instead the full local file path, which
            is split into directory and file name.
        single_file: when True, only the blob whose base name equals the last
            URL component is downloaded, and it is saved under the file name
            taken from ``destination_folder_name``.
    """
    storage_client = storage.Client.create_anonymous_client()

    url_parts = full_gs_url.split("/")
    bucket_name = url_parts[2]
    dest_filename = None
    desired_file = None
    if single_file:
        desired_file = url_parts[-1]
        source_blob_name = "/".join(url_parts[3:-1])
        destination_folder_name, dest_filename = os.path.split(
            destination_folder_name
        )
    else:
        source_blob_name = "/".join(url_parts[3:])

    bucket = storage_client.bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=source_blob_name)

    # makedirs copes with nested paths and with an already existing
    # directory; an empty directory part means "current directory".
    if destination_folder_name:
        os.makedirs(destination_folder_name, exist_ok=True)

    for blob in blobs:
        blob_name = blob.name.split("/")[-1]
        if single_file:
            if blob_name != desired_file:
                continue
            # Save only under the requested file name. Previously the blob
            # was then downloaded a second time under its own name, which
            # could overwrite a different local file of that name.
            target_name = dest_filename
        else:
            if not blob_name:
                # Blob names ending in "/" are directory placeholders.
                continue
            target_name = blob_name
        blob.download_to_filename(
            os.path.join(destination_folder_name, target_name)
        )
input_type_to_np_dtype = {
"float32": np.float32,
@@ -70,27 +29,11 @@ input_type_to_np_dtype = {
"int8": np.int8,
}
# Save the model in the home local so it needn't be fetched everytime in the CI.
home = str(Path.home())
alt_path = os.path.join(os.path.dirname(__file__), "../gen_shark_tank/")
custom_path = shark_args.local_tank_cache
if os.path.exists(alt_path):
WORKDIR = alt_path
print(
f"Using {WORKDIR} as shark_tank directory. Delete this directory if you aren't working from locally generated shark_tank."
)
if custom_path:
if not os.path.exists(custom_path):
os.mkdir(custom_path)
WORKDIR = custom_path
print(f"Using {WORKDIR} as local shark_tank cache directory.")
else:
WORKDIR = os.path.join(home, ".local/shark_tank/")
print(
f"shark_tank local cache is located at {WORKDIR} . You may change this by setting the --local_tank_cache= flag"
)
WORKDIR = os.path.join(home, ".local/shark_tank/")
print(WORKDIR)
# Checks whether the directory and files exists.
@@ -118,64 +61,57 @@ def check_dir_exists(model_name, frontend="torch", dynamic=""):
and os.path.isfile(os.path.join(model_dir, "golden_out.npz"))
and os.path.isfile(os.path.join(model_dir, "hash.npy"))
):
print(f"""Using cached models from {WORKDIR}...""")
print(
f"""The models are present in the {WORKDIR}. If you want a fresh
download, consider deleting the directory."""
)
return True
return False
# Downloads the torch model from gs://shark_tank dir.
def download_model(
model_name,
dynamic=False,
tank_url="gs://shark_tank/latest",
frontend=None,
tuned=None,
):
def download_torch_model(model_name, dynamic=False):
model_name = model_name.replace("/", "_")
dyn_str = "_dynamic" if dynamic else ""
os.makedirs(WORKDIR, exist_ok=True)
model_dir_name = model_name + "_" + frontend
model_dir = os.path.join(WORKDIR, model_dir_name)
full_gs_url = tank_url.rstrip("/") + "/" + model_dir_name
model_dir_name = model_name + "_torch"
if not check_dir_exists(
model_dir_name, frontend=frontend, dynamic=dyn_str
):
print(f"Downloading artifacts for model {model_name}...")
download_public_file(full_gs_url, model_dir)
def gs_download_model():
gs_command = (
'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank'
+ "/"
+ model_dir_name
+ " "
+ WORKDIR
)
if os.system(gs_command) != 0:
raise Exception("model not present in the tank. Contact Nod Admin")
if not check_dir_exists(model_dir_name, frontend="torch", dynamic=dyn_str):
gs_download_model()
else:
if not _internet_connected():
print(
"No internet connection. Using the model already present in the tank."
)
else:
local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
gs_hash_url = (
tank_url.rstrip("/") + "/" + model_dir_name + "/hash.npy"
)
download_public_file(
gs_hash_url,
os.path.join(model_dir, "upstream_hash.npy"),
single_file=True,
)
upstream_hash = str(
np.load(os.path.join(model_dir, "upstream_hash.npy"))
)
if local_hash != upstream_hash:
if shark_args.update_tank == True:
print(f"Updating artifacts for model {model_name}...")
download_public_file(full_gs_url, WORKDIR)
else:
print(
"Hash does not match upstream in gs://shark_tank/. If you are using SHARK Downloader with locally generated artifacts, this is working as intended."
)
model_dir = os.path.join(WORKDIR, model_dir_name)
local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
gs_hash = (
'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank'
+ "/"
+ model_dir_name
+ "/hash.npy"
+ " "
+ os.path.join(model_dir, "upstream_hash.npy")
)
if os.system(gs_hash) != 0:
raise Exception("hash of the model not present in the tank.")
upstream_hash = str(
np.load(os.path.join(model_dir, "upstream_hash.npy"))
)
if local_hash != upstream_hash:
gs_download_model()
model_dir = os.path.join(WORKDIR, model_dir_name)
tuned_str = "" if tuned is None else "_" + tuned
suffix = f"{dyn_str}_{frontend}{tuned_str}.mlir"
filename = os.path.join(model_dir, model_name + suffix)
with open(filename, mode="rb") as f:
with open(
os.path.join(model_dir, model_name + dyn_str + "_torch.mlir")
) as f:
mlir_file = f.read()
function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
@@ -187,11 +123,106 @@ def download_model(
return mlir_file, function_name, inputs_tuple, golden_out_tuple
def _internet_connected():
import requests as req
# Downloads the tflite model from gs://shark_tank dir.
def download_tflite_model(model_name, dynamic=False):
dyn_str = "_dynamic" if dynamic else ""
os.makedirs(WORKDIR, exist_ok=True)
model_dir_name = model_name + "_tflite"
try:
req.get("http://1.1.1.1")
return True
except:
return False
def gs_download_model():
gs_command = (
'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank'
+ "/"
+ model_dir_name
+ " "
+ WORKDIR
)
if os.system(gs_command) != 0:
raise Exception("model not present in the tank. Contact Nod Admin")
if not check_dir_exists(
model_dir_name, frontend="tflite", dynamic=dyn_str
):
gs_download_model()
else:
model_dir = os.path.join(WORKDIR, model_dir_name)
local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
gs_hash = (
'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank'
+ "/"
+ model_dir_name
+ "/hash.npy"
+ " "
+ os.path.join(model_dir, "upstream_hash.npy")
)
if os.system(gs_hash) != 0:
raise Exception("hash of the model not present in the tank.")
upstream_hash = str(
np.load(os.path.join(model_dir, "upstream_hash.npy"))
)
if local_hash != upstream_hash:
gs_download_model()
model_dir = os.path.join(WORKDIR, model_dir_name)
with open(
os.path.join(model_dir, model_name + dyn_str + "_tflite.mlir")
) as f:
mlir_file = f.read()
function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
inputs = np.load(os.path.join(model_dir, "inputs.npz"))
golden_out = np.load(os.path.join(model_dir, "golden_out.npz"))
inputs_tuple = tuple([inputs[key] for key in inputs])
golden_out_tuple = tuple([golden_out[key] for key in golden_out])
return mlir_file, function_name, inputs_tuple, golden_out_tuple
def download_tf_model(model_name):
    """Fetch the ``<model_name>_tf`` artifacts from gs://shark_tank and load them.

    The artifacts are downloaded with ``gsutil`` when ``check_dir_exists``
    reports them missing, or when the local ``hash.npy`` differs from the
    freshly fetched upstream hash.

    Returns:
        ``(mlir_file, function_name, inputs_tuple, golden_out_tuple)`` read
        from the model directory under ``WORKDIR``.

    Raises:
        Exception: when either ``gsutil`` command exits with a non-zero status.
    """
    model_name = model_name.replace("/", "_")
    os.makedirs(WORKDIR, exist_ok=True)
    model_dir_name = model_name + "_tf"
    gsutil_cp = 'gsutil -o "GSUtil:parallel_process_count=1" cp'

    def gs_download_model():
        gs_command = f"{gsutil_cp} -r gs://shark_tank/{model_dir_name} {WORKDIR}"
        if os.system(gs_command) != 0:
            raise Exception("model not present in the tank. Contact Nod Admin")

    if check_dir_exists(model_dir_name, frontend="tf"):
        cached_dir = os.path.join(WORKDIR, model_dir_name)
        # Read the local hash before fetching the upstream one.
        local_hash = str(np.load(os.path.join(cached_dir, "hash.npy")))
        upstream_path = os.path.join(cached_dir, "upstream_hash.npy")
        gs_hash = (
            f"{gsutil_cp} gs://shark_tank/{model_dir_name}/hash.npy"
            + " "
            + upstream_path
        )
        if os.system(gs_hash) != 0:
            raise Exception("hash of the model not present in the tank.")
        upstream_hash = str(np.load(upstream_path))
        if local_hash != upstream_hash:
            gs_download_model()
    else:
        gs_download_model()

    model_dir = os.path.join(WORKDIR, model_dir_name)
    mlir_path = os.path.join(model_dir, model_name + "_tf.mlir")
    with open(mlir_path) as f:
        mlir_file = f.read()

    function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
    inputs = np.load(os.path.join(model_dir, "inputs.npz"))
    golden_out = np.load(os.path.join(model_dir, "golden_out.npz"))

    inputs_tuple = tuple(inputs[key] for key in inputs)
    golden_out_tuple = tuple(golden_out[key] for key in golden_out)
    return mlir_file, function_name, inputs_tuple, golden_out_tuple

View File

@@ -75,24 +75,21 @@ class SharkImporter:
self.module, self.inputs, is_dynamic, tracing_required
)
def _tf_mlir(self, func_name, save_dir="./shark_tmp/"):
def _tf_mlir(self, func_name):
from iree.compiler import tf as tfc
return tfc.compile_module(
self.module,
exported_names=[func_name],
import_only=True,
output_file=save_dir,
self.module, exported_names=[func_name], import_only=True
)
def _tflite_mlir(self, func_name, save_dir="./shark_tmp/"):
def _tflite_mlir(self, func_name):
from iree.compiler import tflite as tflitec
from shark.iree_utils._common import IREE_TARGET_MAP
self.mlir_model = tflitec.compile_file(
self.raw_model_file, # in tflite, it is a path to .tflite file, not a tflite interpreter
input_type="tosa",
import_only=True,
output_file=save_dir,
)
return self.mlir_model
@@ -102,7 +99,6 @@ class SharkImporter:
is_dynamic=False,
tracing_required=False,
func_name="forward",
save_dir="./shark_tmp/",
):
if self.frontend in ["torch", "pytorch"]:
if self.inputs == None:
@@ -112,15 +108,15 @@ class SharkImporter:
sys.exit(1)
return self._torch_mlir(is_dynamic, tracing_required), func_name
if self.frontend in ["tf", "tensorflow"]:
return self._tf_mlir(func_name, save_dir), func_name
return self._tf_mlir(func_name), func_name
if self.frontend in ["tflite", "tf-lite"]:
func_name = "main"
return self._tflite_mlir(func_name, save_dir), func_name
return self._tflite_mlir(func_name), func_name
# Converts the frontend specific tensors into np array.
def convert_to_numpy(self, array_tuple: tuple):
if self.frontend in ["torch", "pytorch"]:
return [x.detach().cpu().numpy() for x in array_tuple]
return [x.detach().numpy() for x in array_tuple]
if self.frontend in ["tf", "tensorflow"]:
return [x.numpy() for x in array_tuple]
@@ -134,20 +130,19 @@ class SharkImporter:
outputs_name = "golden_out.npz"
func_file_name = "function_name"
model_name_mlir = model_name + "_" + self.frontend + ".mlir"
try:
inputs = [x.cpu().detach() for x in inputs]
except AttributeError:
try:
inputs = [x.numpy() for x in inputs]
except AttributeError:
inputs = [x for x in inputs]
np.savez(os.path.join(dir, inputs_name), *inputs)
np.savez(os.path.join(dir, outputs_name), *outputs)
np.save(os.path.join(dir, func_file_name), np.array(func_name))
mlir_str = mlir_data
if self.frontend == "torch":
with open(os.path.join(dir, model_name_mlir), "wb") as mlir_file:
mlir_file.write(mlir_data)
mlir_str = mlir_data.operation.get_asm()
elif self.frontend == "tf":
mlir_str = mlir_data.decode("utf-8")
elif self.frontend == "tflite":
mlir_str = mlir_data.decode("utf-8")
with open(os.path.join(dir, model_name_mlir), "w") as mlir_file:
mlir_file.write(mlir_str)
return
@@ -164,13 +159,9 @@ class SharkImporter:
f"There is no input provided: {self.inputs}, please provide inputs or simply run import_mlir."
)
sys.exit(1)
model_name_mlir = model_name + "_" + self.frontend + ".mlir"
artifact_path = os.path.join(dir, model_name_mlir)
imported_mlir = self.import_mlir(
is_dynamic,
tracing_required,
func_name,
save_dir=artifact_path,
is_dynamic, tracing_required, func_name
)
# TODO: Make sure that any generic function name is accepted. Currently takes in the default function names.
# TODO: Check for multiple outputs.
@@ -180,7 +171,7 @@ class SharkImporter:
golden_out = self.module(*self.inputs)
if torch.is_tensor(golden_out):
golden_out = tuple(
golden_out.detach().cpu().numpy(),
golden_out.detach().numpy(),
)
else:
golden_out = self.convert_to_numpy(golden_out)
@@ -208,11 +199,9 @@ class SharkImporter:
)
elif golden_out is tuple:
golden_out = self.convert_to_numpy(golden_out)
elif hasattr(golden_out, "logits"):
else:
# from transformers import TFSequenceClassifierOutput
golden_out = golden_out.logits
else:
golden_out = golden_out.last_hidden_state
# Save the artifacts in the directory dir.
self.save_data(
dir,
@@ -243,59 +232,3 @@ class SharkImporter:
self.inputs,
golden_out,
)
# Applies fx conversion to the model and imports the mlir.
def import_with_fx(model, inputs, debug=False):
    """Trace ``model`` with ``make_fx`` and import the result through ``SharkImporter``.

    Args:
        model: Callable handed to ``make_fx``.
        inputs: Sequence of example inputs; unpacked when tracing and passed
            on to ``SharkImporter``.
        debug: When true, use ``SharkImporter.import_debug`` instead of
            ``import_mlir``.

    Returns:
        Tuple ``(mlir_module, func_name)`` produced by the importer.
    """
    import torch
    from torch.fx.experimental.proxy_tensor import make_fx
    from torch._decomp import get_decompositions

    # TODO: Control the decompositions.
    ops_to_decompose = [
        torch.ops.aten.embedding_dense_backward,
        torch.ops.aten.native_layer_norm_backward,
        torch.ops.aten.slice_backward,
        torch.ops.aten.select_backward,
        torch.ops.aten.norm.ScalarOpt_dim,
        torch.ops.aten.native_group_norm,
        torch.ops.aten.upsample_bilinear2d.vec,
        torch.ops.aten.split.Tensor,
        torch.ops.aten.split_with_sizes,
        torch.ops.aten.native_layer_norm,
    ]
    tracer = make_fx(
        model, decomposition_table=get_decompositions(ops_to_decompose)
    )
    fx_g = tracer(*inputs)

    fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
    fx_g.recompile()

    # Strip overloads: point every OpOverload target at its overload packet,
    # then regenerate the module code.
    for graph_node in fx_g.graph.nodes:
        if isinstance(graph_node.target, torch._ops.OpOverload):
            graph_node.target = graph_node.target.overloadpacket
    fx_g.recompile()

    mlir_importer = SharkImporter(fx_g, inputs, frontend="torch")

    if debug:
        # import_debug yields three items; only the first (module, name) is used.
        imported, _, _ = mlir_importer.import_debug()
    else:
        imported = mlir_importer.import_mlir()
    mlir_module, func_name = imported
    return mlir_module, func_name

View File

@@ -9,15 +9,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from shark.iree_utils.compile_utils import (
export_iree_module_to_vmfb,
load_flatbuffer,
create_dispatch_dirs,
compile_benchmark_dirs,
)
import os
from shark.shark_runner import SharkRunner
from shark.parser import shark_args
import numpy as np
@@ -39,7 +31,7 @@ class SharkInference:
Attributes
----------
mlir_module : str
mlir_module represented in string; modules from torch-mlir are serialized in bytecode format.
mlir_module represented in string.
function_name : str
function to execute in the given mlir_module.
device : str
@@ -65,48 +57,21 @@ class SharkInference:
def __init__(
self,
mlir_module: bytes,
mlir_module: str,
function_name: str = "forward",
device: str = "none",
mlir_dialect: str = "linalg",
is_benchmark: bool = False,
dispatch_benchmark: str = None,
dispatch_benchmark_dir: str = "temp_dispatch_benchmarks",
):
self.mlir_module = mlir_module
self.function_name = function_name
self.device = shark_args.device if device == "none" else device
self.device = device
self.mlir_dialect = mlir_dialect
self.is_benchmark = is_benchmark
self.dispatch_benchmarks = (
shark_args.dispatch_benchmarks
if dispatch_benchmark is None
else dispatch_benchmark
)
self.dispatch_benchmarks_dir = (
shark_args.dispatch_benchmarks_dir
if dispatch_benchmark_dir == "temp_dispatch_benchmarks"
else dispatch_benchmark_dir
)
self.shark_runner = None
def compile(self, extra_args=[]):
if self.dispatch_benchmarks is not None:
extra_args.append(
f"--iree-hal-dump-executable-sources-to={self.dispatch_benchmarks_dir}"
)
extra_args.append(
f"--iree-hal-dump-executable-binaries-to={self.dispatch_benchmarks_dir}"
)
temp_dir = self.dispatch_benchmarks_dir.split("/")
temp_dir[-1] = "temp_" + temp_dir[-1]
temp_dir = "/".join(temp_dir)
self.temp_dispatch_benchmarks_dir = temp_dir
extra_args.append(
f"--iree-hal-dump-executable-benchmarks-to={self.temp_dispatch_benchmarks_dir}"
)
def compile(self):
if self.is_benchmark == True:
from shark.shark_benchmark_runner import SharkBenchmarkRunner
@@ -116,7 +81,6 @@ class SharkInference:
self.function_name,
self.device,
self.mlir_dialect,
extra_args=extra_args,
)
else:
@@ -125,18 +89,8 @@ class SharkInference:
self.function_name,
self.device,
self.mlir_dialect,
extra_args=extra_args,
)
if self.dispatch_benchmarks is not None:
create_dispatch_dirs(self.dispatch_benchmarks_dir, self.device)
compile_benchmark_dirs(
self.dispatch_benchmarks_dir,
self.device,
self.dispatch_benchmarks,
)
os.system(f"rm -rf {self.temp_dispatch_benchmarks_dir}")
# inputs are considered to be tuple of np.array.
def forward(self, inputs: tuple):
return self.shark_runner.run(inputs)
@@ -181,34 +135,3 @@ class SharkInference:
)
)
return tuple(inputs)
# TODO: Instead of passing directory and having names decided by the module
# , user may want to save the module with manual names.
def save_module(self, dir=os.getcwd(), module_name=None, extra_args=[]):
    """Export this object's MLIR module through ``export_iree_module_to_vmfb``.

    Args:
        dir: Directory forwarded to the exporter. The default is the working
            directory captured when this method was defined.
        module_name: Optional name forwarded to the exporter.
        extra_args: Extra arguments forwarded to the exporter.

    Returns:
        Whatever ``export_iree_module_to_vmfb`` returns.
    """
    export_options = {"module_name": module_name, "extra_args": extra_args}
    return export_iree_module_to_vmfb(
        self.mlir_module,
        self.device,
        dir,
        self.mlir_dialect,
        self.function_name,
        **export_options,
    )
# load and return the module.
def load_module(self, path, extra_args=[]):
    """Create a non-compiling ``SharkRunner`` and populate it from a saved module.

    Args:
        path: Location passed to ``load_flatbuffer``.
        extra_args: Extra arguments forwarded to ``SharkRunner``.

    Returns:
        None. The runner is stored on ``self.shark_runner``.
    """
    runner = SharkRunner(
        function_name=self.function_name,
        device=self.device,
        compile_vmfb=False,
        extra_args=extra_args,
    )
    self.shark_runner = runner

    # load_flatbuffer yields the (compiled module, config) pair the runner
    # would otherwise have produced by compiling.
    compiled_module, config = load_flatbuffer(
        path,
        self.device,
        self.function_name,
    )
    runner.iree_compilation_module = compiled_module
    runner.iree_config = config
    return

View File

@@ -16,7 +16,6 @@ from shark.iree_utils.compile_utils import (
get_iree_compiled_module,
get_results,
export_iree_module_to_vmfb,
load_flatbuffer,
)
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.parser import shark_args
@@ -25,7 +24,7 @@ import sys
# supported dialects by the shark-runtime.
supported_dialects = {"linalg", "mhlo", "tosa", "tf-lite", "tm_tensor"}
supported_dialects = {"linalg", "mhlo", "tosa", "tf-lite"}
class SharkRunner:
@@ -61,35 +60,30 @@ class SharkRunner:
def __init__(
self,
mlir_module: bytes = None,
mlir_module: str,
function_name: str = "forward",
device: str = "none",
mlir_dialect: str = "linalg",
extra_args: list = [],
compile_vmfb: bool = True,
):
self.mlir_module = mlir_module
self.function_name = function_name
self.device = shark_args.device if device == "none" else device
self.mlir_dialect = mlir_dialect
self.extra_args = extra_args
if check_device_drivers(self.device):
print(device_driver_info(self.device))
device_driver_info(self.device)
sys.exit(1)
if compile_vmfb == True:
# Compile the module to get the .vmfb.
(
self.iree_compilation_module,
self.iree_config,
) = get_iree_compiled_module(
self.mlir_module,
self.device,
self.mlir_dialect,
func_name=self.function_name,
extra_args=self.extra_args,
)
# Compile the module to get the .vmfb.
(
self.iree_compilation_module,
self.iree_config,
) = get_iree_compiled_module(
self.mlir_module,
self.device,
self.mlir_dialect,
func_name=self.function_name,
)
def run(self, inputs: tuple):
return get_results(
@@ -98,3 +92,10 @@ class SharkRunner:
self.iree_config,
self.mlir_dialect,
)
# TODO: Instead of passing directory and having names decided by the module
# , user may want to save the module with manual names.
def save_module(self, dir=os.getcwd()):
    """Export the runner's MLIR module through ``export_iree_module_to_vmfb``.

    Args:
        dir: Directory forwarded to the exporter. The default is the working
            directory captured when this method was defined.

    Returns:
        Whatever ``export_iree_module_to_vmfb`` returns.
    """
    # The constructor stores the module as ``self.mlir_module``; the previous
    # ``self.model`` is never assigned in the visible class, so this call
    # would have raised AttributeError.
    return export_iree_module_to_vmfb(
        self.mlir_module, self.device, dir, self.mlir_dialect
    )

View File

@@ -1,11 +0,0 @@
1. Install torchdynamo
- `git clone https://github.com/pytorch/torchdynamo.git`
- `cd torchdynamo`
- `python -m pip install -r requirements.txt`
- `python setup.py develop`
2. Install functorch
- `python -m pip install -v "git+https://github.com/pytorch/pytorch.git@$(python -c "import torch.version; print(torch.version.git_version)")#subdirectory=functorch"`
3. Run examples.
- `python shark/examples/shark_dynamo/basic_examples.py`

View File

@@ -1,157 +0,0 @@
import functools
import time
from typing import List, Optional
import torch
from torch.fx.experimental.proxy_tensor import make_fx
from functorch._src.compile_utils import strip_overloads
from shark.shark_inference import SharkInference
from torch._decomp import get_decompositions
import torch_mlir
# TODO: Control decompositions.
def default_decompositions():
    """Return the ``get_decompositions`` table for the fixed list of aten ops below."""
    decomposed_ops = [
        torch.ops.aten.embedding_dense_backward,
        torch.ops.aten.native_layer_norm_backward,
        torch.ops.aten.slice_backward,
        torch.ops.aten.select_backward,
        torch.ops.aten.norm.ScalarOpt_dim,
        torch.ops.aten.native_group_norm,
        torch.ops.aten.upsample_bilinear2d.vec,
        torch.ops.aten.split.Tensor,
        torch.ops.aten.split_with_sizes,
    ]
    table = get_decompositions(decomposed_ops)
    return table
def timeit(*, append_time_to: Optional[List] = None):
    """Build a decorator that records how long each call of a function takes.

    Args:
        append_time_to: Optional list; when given, the elapsed time of every
            completed call is appended to it in nanoseconds.

    Returns:
        A decorator. The wrapped function returns the original result
        unchanged; nothing is recorded if the call raises.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # perf_counter_ns is monotonic, so a wall-clock adjustment during
            # the call cannot produce a negative or inflated duration.
            start_time = time.perf_counter_ns()
            result = func(*args, **kwargs)
            end_time = time.perf_counter_ns()
            if append_time_to is not None:
                append_time_to.append(end_time - start_time)
            return result

        return wrapper

    return decorator
def _returns_nothing(fx_g: torch.fx.GraphModule) -> bool:
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, tuple):
return len(node_arg) == 0
return False
def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule) -> bool:
"""
Replace tuple with tuple element in functions that return one-element tuples.
Returns true if an unwrapping took place, and false otherwise.
"""
unwrapped_tuple = False
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, tuple):
if len(node_arg) == 1:
node.args = (node_arg[0],)
unwrapped_tuple = True
break
if unwrapped_tuple:
fx_g.graph.lint()
fx_g.recompile()
return unwrapped_tuple
def make_shark_compiler(use_tracing: bool, device: str, verbose=False):
    """Create a compiler callback that lowers an FX graph through torch-mlir to SHARK.

    Args:
        use_tracing: Use ``torch.jit.trace`` when true, ``torch.jit.script``
            otherwise.
        device: Device string handed to ``SharkInference``.
        verbose: Print progress, the FX graph and the torch-dialect module.

    Returns:
        A function ``compiler(fx_graph, example_inputs)`` that returns a
        callable running the compiled module.
    """

    def compiler(
        fx_graph: torch.fx.GraphModule,
        example_inputs: List[torch.Tensor],
    ):
        """Compile GraphModule using torch-mlir + SHARK."""
        if verbose:
            print("Compiling graph...")

        # A graph that returns nothing is handed back untouched.
        if _returns_nothing(fx_graph):
            return fx_graph

        was_unwrapped = _unwrap_single_tuple_return(fx_graph)
        retrace = make_fx(
            fx_graph, decomposition_table=default_decompositions()
        )
        fx_graph = retrace(*example_inputs)
        strip_overloads(fx_graph)

        if verbose:
            print("torch.fx graph:")
            print(fx_graph.graph)

        if use_tracing:
            ts_graph = torch.jit.trace(fx_graph, example_inputs)
        else:
            ts_graph = torch.jit.script(fx_graph, example_inputs)

        if verbose:
            torch_dialect_module = torch_mlir.compile(
                ts_graph,
                example_inputs,
                output_type=torch_mlir.OutputType.TORCH,
            )
            print("\n\ntorch-mlir backend contract graph:")
            print(torch_dialect_module)

        linalg_module = torch_mlir.compile(
            ts_graph,
            example_inputs,
            output_type=torch_mlir.OutputType.LINALG_ON_TENSORS,
        )

        shark_module = SharkInference(
            linalg_module, "forward", mlir_dialect="linalg", device=device
        )
        shark_module.compile()

        def forward(*inputs):
            result = shark_module.forward(inputs)
            if result is None:
                result = tuple()
            # Restore the one-element tuple that was unwrapped before compiling.
            if was_unwrapped:
                return (result,)
            return result

        return forward

    return compiler
def check_results(compiled_results, eager_results):
    """Print whether every compiled result is close to its eager counterpart.

    Pairs are compared on CPU with ``torch.allclose`` (``atol=1e-5``);
    comparison stops at the first mismatch.
    """
    result_pairs = zip(compiled_results, eager_results)
    everything_matches = all(
        torch.allclose(compiled.to("cpu"), eager.to("cpu"), atol=1e-5)
        for compiled, eager in result_pairs
    )
    if everything_matches:
        print("Compiled result matches eager result!")
    else:
        print("Compiled result does not match eager result")
def print_time_stats(times):
    """Print median, 10th/90th percentile and total of ``times`` in milliseconds.

    Args:
        times: Sequence of durations in nanoseconds (values are divided by
            1e6 for display).
    """
    if len(times) == 0:
        # torch.quantile rejects empty input; report the situation instead.
        print("No timing samples recorded.")
        print()
        return

    times_tensor = torch.tensor(times)

    def quantile_ms(q):
        return torch.quantile(times_tensor.to(float), q).item() / 1e6

    print(f"Median: {quantile_ms(0.5)} ms")
    print(f"10%ile: {quantile_ms(0.1)} ms")
    print(f"90%ile: {quantile_ms(0.9)} ms")
    # .item() makes the total print as a plain number like the lines above,
    # rather than as a tensor repr.
    print(f"Total: {torch.sum(times_tensor).item() / 1e6} ms")
    print()

View File

@@ -1,296 +0,0 @@
# Copyright 2022 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from iree.runtime import query_available_drivers, get_driver
from shark.shark_downloader import download_model
from shark.shark_inference import SharkInference
from typing import List, Optional, Tuple
import numpy as np
import argparse
from shark.iree_utils._common import _IREE_DEVICE_MAP
import multiprocessing
from shark.shark_runner import supported_dialects
import logging
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures.thread import ThreadPoolExecutor
import time
import numpy as np
IREE_TO_SHARK_DRIVER_MAP = {v: k for k, v in _IREE_DEVICE_MAP.items()}
def stress_test_compiled_model(
    shark_module_path: str,
    function_name: str,
    device: str,
    inputs: List[np.ndarray],
    golden_out: List[np.ndarray],
    batch_size: int,
    max_iterations: int,
    max_duration_seconds: float,
    inference_timeout_seconds: float,
    tolerance_nulp: int,
    stress_test_index: int,
):
    """Repeatedly run a saved module on one device and verify its outputs.

    The first iteration is compared with the golden output within
    ``tolerance_nulp`` units in the last place; every later iteration must
    equal the first iteration's output exactly.

    Args:
        shark_module_path: Path handed to ``SharkInference.load_module``.
        function_name: Function to execute in the module.
        device: Device to run on.
        inputs: Input arrays; each is repeated ``batch_size`` times on axis 0.
        golden_out: Reference output; repeated ``batch_size`` times on axis 0.
        batch_size: Repetition count for inputs and golden output.
        max_iterations: Upper bound on the number of inferences.
        max_duration_seconds: Stop once this much time has elapsed.
        inference_timeout_seconds: Timeout for a single inference.
        tolerance_nulp: Allowed difference from the golden output.
        stress_test_index: Index used in log messages only.

    Raises:
        AssertionError: when an output check fails.
        concurrent.futures.TimeoutError: when an inference exceeds the timeout.
    """
    logging.info(
        f"Running stress test {stress_test_index} on device {device}."
    )
    shark_module = SharkInference(
        mlir_module=bytes(), function_name=function_name, device=device
    )
    shark_module.load_module(shark_module_path)
    input_batches = [np.repeat(arr, batch_size, axis=0) for arr in inputs]
    golden_output_batches = np.repeat(golden_out, batch_size, axis=0)
    report_interval_seconds = 10
    start_time = time.time()
    previous_report_time = start_time
    first_iteration_output = None
    # Inference runs on a worker thread so that result() can enforce a timeout.
    executor = ThreadPoolExecutor(1)
    try:
        for i in range(max_iterations):
            inference_task = executor.submit(
                shark_module.forward, input_batches
            )
            output = inference_task.result(inference_timeout_seconds)
            if first_iteration_output is None:
                np.testing.assert_array_almost_equal_nulp(
                    golden_output_batches, output, nulp=tolerance_nulp
                )
                first_iteration_output = output
            else:
                np.testing.assert_array_equal(output, first_iteration_output)
            current_time = time.time()
            if report_interval_seconds < current_time - previous_report_time:
                logging.info(
                    f"Stress test {stress_test_index} on device "
                    f"{device} at iteration {i+1}"
                )
                previous_report_time = current_time
            if max_duration_seconds < current_time - start_time:
                return
    finally:
        # Release the executor on every exit path. wait=False keeps a timed-out
        # (possibly hung) inference from blocking the error from propagating.
        executor.shutdown(wait=False)
    logging.info(f"Stress test {stress_test_index} on device {device} done.")
def get_device_type(device_name: str):
    """Return the part of ``device_name`` before the first ``://`` (all of it if absent)."""
    device_type, _separator, _path = device_name.partition("://")
    return device_type
def get_device_types(device_names: List[str]) -> List[str]:
    """Return the device type of every name in ``device_names``, in order.

    The parameter was annotated ``str``, but it is iterated as a collection
    of device names (a plain string would be split into characters).
    """
    return [get_device_type(device_name) for device_name in device_names]
def query_devices(device_types: Optional[List[str]] = None) -> List[str]:
    """List device URIs of the form ``<device_type>://<path or device_id>``.

    Args:
        device_types: SHARK device types to query. When None, every available
            IREE driver that has an entry in ``IREE_TO_SHARK_DRIVER_MAP`` is
            used.

    Returns:
        One URI per device reported by each driver.
    """
    if device_types is None:
        device_types = []
        for driver_name in query_available_drivers():
            if driver_name in IREE_TO_SHARK_DRIVER_MAP:
                device_types.append(IREE_TO_SHARK_DRIVER_MAP[driver_name])

    found_uris = []
    for device_type in device_types:
        driver = get_driver(_IREE_DEVICE_MAP[device_type])
        for info in driver.query_available_devices():
            # Fall back to the device id when the driver reports no path.
            if info["path"] != "":
                uri_path = info["path"]
            else:
                uri_path = str(info["device_id"])
            found_uris.append(f"{device_type}://{uri_path}")
    return found_uris
def compile_stress_test_module(
    device_types: List[str], mlir_model: str, func_name: str, mlir_dialect: str
) -> List[str]:
    """Save one module per device type and return the ``save_module`` results.

    The returned list is in the same order as ``device_types``.
    """
    saved_modules = []
    for target_type in device_types:
        logging.info(
            f"Compiling stress test model for device type {target_type}."
        )
        inference = SharkInference(
            mlir_model,
            func_name,
            mlir_dialect=mlir_dialect,
            device=target_type,
        )
        saved = inference.save_module()
        saved_modules.append(saved)
    return saved_modules
def stress_test(
    model_name: str,
    dynamic_model: bool = False,
    device_types: Optional[List[str]] = None,
    device_names: Optional[List[str]] = None,
    batch_size: int = 1,
    max_iterations: int = 10**7,
    max_duration_seconds: float = 3600,
    inference_timeout_seconds: float = 60,
    mlir_dialect: str = "linalg",
    frontend: str = "torch",
    oversubscription_factor: int = 1,
    tolerance_nulp: int = 50000,
):
    """Download a model, save it per device type and stress every device in parallel.

    Args:
        model_name: Model name passed to ``download_model``.
        dynamic_model: Passed to ``download_model`` as ``dynamic``.
        device_types: Device types whose devices are queried and added.
        device_names: Explicit device names. When ``device_types`` is also
            given, both sets are used. When both are None, ``query_devices``
            decides which devices to use.
        batch_size: Forwarded to ``stress_test_compiled_model``.
        max_iterations: Forwarded to ``stress_test_compiled_model``.
        max_duration_seconds: Forwarded to ``stress_test_compiled_model``.
        inference_timeout_seconds: Forwarded to ``stress_test_compiled_model``.
        mlir_dialect: Dialect passed when saving the module.
        frontend: Passed to ``download_model``.
        oversubscription_factor: Number of simultaneous runs per device.
        tolerance_nulp: Forwarded to ``stress_test_compiled_model``.

    Raises:
        ValueError: if there is no device to run on.
    """
    logging.info(f"Downloading stress test model {model_name}.")
    mlir_model, func_name, inputs, golden_out = download_model(
        model_name=model_name, dynamic=dynamic_model, frontend=frontend
    )
    if device_names is None or device_types is not None:
        # Work on a copy so the caller's list is not extended in place.
        device_names = [] if device_names is None else list(device_names)
        with ProcessPoolExecutor() as executor:
            device_names.extend(
                executor.submit(query_devices, device_types).result()
            )
    if not device_names:
        # Fail before compiling; multiprocessing.Pool(0) below would raise a
        # less helpful ValueError anyway.
        raise ValueError("No devices available to run the stress test on.")

    device_types_set = list(set(get_device_types(device_names)))
    shark_module_paths_set = compile_stress_test_module(
        device_types_set, mlir_model, func_name, mlir_dialect
    )
    device_type_shark_module_path_map = {
        device_type: module_path
        for device_type, module_path in zip(
            device_types_set, shark_module_paths_set
        )
    }
    device_name_shark_module_path_map = {
        device_name: device_type_shark_module_path_map[
            get_device_type(device_name)
        ]
        for device_name in device_names
    }
    # This needs to run in a separate process, because it uses the driver cache
    # in IREE and a subsequent call to `iree.runtime.SystemContext.add_vm_module`
    # in a forked process will hang.
    with multiprocessing.Pool(
        len(device_name_shark_module_path_map) * oversubscription_factor
    ) as process_pool:
        process_pool.starmap(
            stress_test_compiled_model,
            [
                (
                    module_path,
                    func_name,
                    device_name,
                    inputs,
                    golden_out,
                    batch_size,
                    max_iterations,
                    max_duration_seconds,
                    inference_timeout_seconds,
                    tolerance_nulp,
                    stress_test_index,
                )
                for stress_test_index, (device_name, module_path) in enumerate(
                    list(device_name_shark_module_path_map.items())
                    * oversubscription_factor
                )
            ],
        )
# Command-line entry point: parse the options and hand them to stress_test().
if __name__ == "__main__":
    logging.basicConfig(encoding="utf-8", level=logging.INFO)
    parser = argparse.ArgumentParser(
        description="Downloads, compiles and runs a model from the tank to stress test the system."
    )
    parser.add_argument(
        "--model", type=str, help="Model name in the tank.", default="alexnet"
    )
    parser.add_argument(
        "--dynamic",
        help="Use dynamic version of the model.",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "--frontend", type=str, help="Frontend of the model.", default="torch"
    )
    parser.add_argument(
        "--mlir-dialect",
        type=str,
        help="MLIR dialect of the model.",
        default="linalg",
        # supported_dialects is a set; sorting keeps the usage/help text stable
        # from run to run.
        choices=sorted(supported_dialects),
    )
    parser.add_argument(
        "--device-types",
        type=str,
        nargs="*",
        choices=_IREE_DEVICE_MAP.keys(),
        help="Runs the stress test on all devices with that type. "
        "If absent and no devices are specified "
        "will run against all available devices.",
    )
    parser.add_argument(
        "--devices",
        type=str,
        nargs="*",
        help="List of devices to run the stress test on. "
        "If device-types is specified will run against the union of the two.",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        help="Number of inputs to feed into the model",
        default=1,
    )
    parser.add_argument(
        "--oversubscription",
        type=int,
        help="Oversubscription factor. Each device will execute the model simultaneously "
        "this many number of times.",
        default=1,
    )
    parser.add_argument(
        "--max-iterations",
        type=int,
        help="Maximum number of iterations to run the stress test per device.",
        default=10**7,
    )
    parser.add_argument(
        "--max-duration",
        type=float,
        help="Maximum number of seconds to run the stress test.",
        default=3600,
    )
    parser.add_argument(
        "--inference-timeout",
        type=float,
        help="Timeout in seconds for a single model inference operation.",
        default=60,
    )
    parser.add_argument(
        "--tolerance-nulp",
        type=int,
        help="The maximum number of unit in the last place for tolerance "
        "when verifying results with the golden reference output.",
        default=50000,
    )
    # parse_known_args: unrecognised options are ignored rather than rejected.
    args = parser.parse_known_args()[0]
    stress_test(
        model_name=args.model,
        dynamic_model=args.dynamic,
        frontend=args.frontend,
        mlir_dialect=args.mlir_dialect,
        device_types=args.device_types,
        device_names=args.devices,
        batch_size=args.batch_size,
        oversubscription_factor=args.oversubscription,
        max_iterations=args.max_iterations,
        max_duration_seconds=args.max_duration,
        inference_timeout_seconds=args.inference_timeout,
        tolerance_nulp=args.tolerance_nulp,
    )

View File

@@ -1,31 +0,0 @@
# Copyright 2022 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import subprocess
import sys
import importlib.util
def test_stress_test():
    """Run the ``shark.stress_test`` script once on the cpu device as a smoke test."""
    # Locate the script file of the module so it can be run as a program.
    script_path = importlib.util.find_spec("shark.stress_test").origin
    command = [sys.executable, script_path]
    command += ["--model=squeezenet1_0", "--devices", "cpu"]
    command.append("--max-iterations=1")
    # check_call raises CalledProcessError when the script exits non-zero.
    subprocess.check_call(command)

View File

@@ -1,220 +0,0 @@
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Also available under a BSD-style license. See LICENSE.
import contextlib
import re
import traceback
import warnings
from typing import Any
import numpy as np
import torch
from torch.utils._pytree import tree_map
from torch_mlir.eager_mode.ir_building import build_mlir_module
from torch_mlir.eager_mode.torch_mlir_dispatch import (
UnsupportedByTorchMlirEagerMode,
normalize_args_kwargs,
check_get_aliased_arg,
)
from torch_mlir.eager_mode import EAGER_MODE_DEBUG
from torch_mlir.eager_mode.torch_mlir_tensor import (
TorchMLIRTensor,
check_requires_grad,
make_wrapper_subclass_from_torch_tensor,
make_bare_wrapper_subclass,
UNSUPPORTED_OPS,
no_dispatch,
)
from torch_mlir.eager_mode import torch_mlir_tensor
from shark.iree_eager_backend import EagerModeIREELinalgOnTensorsBackend
backend = EagerModeIREELinalgOnTensorsBackend("cpu")
torch_mlir_tensor.backend = backend
rtol = 1e-04
atol = 1e-05
# NOTE(review): indentation of this class was lost in this view; the code lines
# below are left exactly as found and only comments have been added.
class TorchMLIRLockstepTensor(TorchMLIRTensor):
"""This class overrides the dispatching for TorchMLIRTensor to allow for an op-by-op numerical comparison between PyTorch and the Torch-MLIR -> IREE backend compilation pipeline. This only supports the IREE backend and focuses on op-by-op level verification.
TODO: Extend this to do a cumulative trace with summary statistics at the end. Possibly requires a wrapper environment to store full trace info.
"""
# Builds the wrapper from one of: a device tensor (flagged through the
# `constructing_from_device_tensor` kwarg), a torch.nn.Parameter, a
# torch.Tensor, or a python scalar (returned unchanged). Any other type
# raises ValueError.
def __new__(cls, elem, **kwargs):
if kwargs.get("constructing_from_device_tensor", False):
# Device tensor: wrapper metadata comes from the module-level `backend`.
tensor_meta_data = backend.get_torch_metadata(elem, kwargs)
r = make_bare_wrapper_subclass(
cls=cls,
size=tensor_meta_data.size,
strides=tensor_meta_data.strides,
storage_offset=tensor_meta_data.storage_offset,
dtype=tensor_meta_data.dtype,
layout=tensor_meta_data.layout,
device=tensor_meta_data.device,
requires_grad=tensor_meta_data.requires_grad,
)
r.elem = elem
elif isinstance(elem, torch.nn.Parameter):
r = make_wrapper_subclass_from_torch_tensor(
cls, elem.data, **kwargs
)
# This is a hack to handle non-contiguous data through IREE-backend
nt = elem.detach().data.numpy()
if not nt.flags["C_CONTIGUOUS"]:
nt = np.ascontiguousarray(nt, dtype=nt.dtype)
r.elem = backend.transfer_from_torch_to_device(
torch.from_numpy(nt)
)
elif isinstance(elem, torch.Tensor):
r = make_wrapper_subclass_from_torch_tensor(cls, elem, **kwargs)
# Ditto TODO: Find a better way to handle this
nt = elem.numpy()
if not nt.flags["C_CONTIGUOUS"]:
nt = np.ascontiguousarray(nt, dtype=nt.dtype)
r.elem = backend.transfer_from_torch_to_device(
torch.from_numpy(nt)
)
# This branch handles the case when a python scalar is passed to some op
# or is returned from some aten op, such as _local_scalar_dense.
elif isinstance(elem, (int, float, bool)):
return elem
else:
raise ValueError(f"Unknown element type: {type(elem)}")
return r
# Shows the wrapped element and the backend class name; grad_fn is included
# only when it is set.
def __repr__(self):
if self.grad_fn:
return f"TorchMLIRLockstepTensor({self.elem}, backend={backend.__class__.__name__}, grad_fn={self.grad_fn})"
else:
return f"TorchMLIRLockstepTensor({self.elem}, backend={backend.__class__.__name__})"
# NOTE(review): the string below is a stray expression statement rather than a
# docstring, and its last sentence is cut off ("by comparing against the");
# presumably it meant the PyTorch eager result - confirm and complete.
"""This does essentially the same dispatch as TorchMLIRTensor but operates as if debug mode is enabled. The numeric verification happens after the Torch-MLIR result is obtained by comparing against the
"""
# Dispatch hook: builds an MLIR module for the op, runs it through `backend`,
# then runs the same op in PyTorch and compares both results with
# np.testing.assert_allclose using the module-level `rtol`/`atol`. A failed
# comparison only emits a warning. Other exceptions are warned about and the
# op's result is then computed through PyTorch.
@classmethod
def __torch_dispatch__(cls, func, _types, args=(), kwargs=None):
# NOTE(review): `**kwargs` raises TypeError when kwargs is None, which is the
# declared default; this relies on callers always passing a mapping.
requires_grad = check_requires_grad(*args, **kwargs)
try:
with no_dispatch():
if hasattr(func, "op_name"):
op_name = func.op_name
elif hasattr(func, "__name__"):
# Handle builtin_function_or_method.
op_name = func.__name__
else:
raise RuntimeError(f"op {func} has no name")
if UNSUPPORTED_OPS.match(op_name):
raise UnsupportedByTorchMlirEagerMode(op_name)
if not hasattr(func, "_schema"):
raise RuntimeError(f"op {func} has no schema.")
normalized_kwargs = normalize_args_kwargs(func, args, kwargs)
# Only layout / memory_format values of 0 or None are accepted.
if "layout" in normalized_kwargs and normalized_kwargs[
"layout"
] not in {0, None}:
raise UnsupportedByTorchMlirEagerMode(
f"{normalized_kwargs['layout']} layout not supported."
)
if "memory_format" in normalized_kwargs and normalized_kwargs[
"memory_format"
] not in {0, None}:
raise UnsupportedByTorchMlirEagerMode(
f"{normalized_kwargs['memory_format']} memory format not supported."
)
eager_module = build_mlir_module(func, normalized_kwargs)
# Device-side tensors of every argument that is an instance of this class.
device_tensor_args = [
kwarg.elem
for _, kwarg in normalized_kwargs.items()
if isinstance(kwarg, cls)
]
assert len(eager_module.body.operations[0].arguments) == len(
device_tensor_args
), "Number of parameters and number of arguments differs."
op_mlir_backend_callable = backend.compile(eager_module)
out = op_mlir_backend_callable(*device_tensor_args)
# Re-wrap every backend result as an instance of this class.
out = tree_map(
lambda x: cls(
x,
requires_grad=requires_grad,
constructing_from_device_tensor=True,
),
out,
)
# Numeric verification; Value for comparison comes from PyTorch eager
with no_dispatch():
unwrapped_args = tree_map(cls.unwrap, args)
unwrapped_kwargs = tree_map(cls.unwrap, kwargs)
# `_reshape_alias` ops are evaluated through aten.view on the first two args.
if "_reshape_alias" in op_name:
native_out = torch.ops.aten.view(
unwrapped_args[0], unwrapped_args[1]
)
else:
native_out = func(*unwrapped_args, **unwrapped_kwargs)
# NOTE(review): `.elem` / `out.elem` below assume a single wrapped result
# rather than a tuple or list of results - confirm for multi-output ops.
native_out = tree_map(
lambda x: cls(x, requires_grad=requires_grad), native_out
).elem
tmp_out = out.elem
try:
np.testing.assert_allclose(
native_out.to_host(),
tmp_out.to_host(),
rtol=rtol,
atol=atol,
)
except Exception as e:
# A mismatch is reported as a warning; shapes stand in for tensor args.
shaped_args = [
arg.shape if torch.is_tensor(arg) else arg
for arg in unwrapped_args
]
# NOTE(review): if unwrapped_kwargs is a dict, iterating it yields its keys,
# so this list would hold argument names, never shapes - confirm intent.
shaped_kwargs = [
kwarg.shape if torch.is_tensor(kwarg) else kwarg
for kwarg in unwrapped_kwargs
]
warnings.warn(
f"Lockstep accuracy verification failed with error: *{str(e)}*; "
f"Dispatched function name: *{str(func)}*; "
f"Dispatched function args: *{str(shaped_args)}*; "
f"Dispatched function kwargs: *{str(shaped_kwargs)}*; "
)
except Exception as e:
warnings.warn(traceback.format_exc())
if isinstance(e, UnsupportedByTorchMlirEagerMode):
warnings.warn(
f"Couldn't use TorchMLIR eager because current incompatibility: *{str(e)}*; running through PyTorch eager."
)
else:
warnings.warn(
f"Couldn't use TorchMLIR eager because of error: *{str(e)}*; "
f"Running through PyTorch eager"
)
# NOTE(review): presumably still inside the except handler (indentation is not
# visible here). If so, `op_name` is unbound when the failure was the
# "has no name" RuntimeError above, and the check below raises NameError.
with no_dispatch():
unwrapped_args = tree_map(cls.unwrap, args)
unwrapped_kwargs = tree_map(cls.unwrap, kwargs)
if "_reshape_alias" in op_name:
out = torch.ops.aten.view(
unwrapped_args[0], unwrapped_args[1]
)
else:
out = func(*unwrapped_args, **unwrapped_kwargs)
out = tree_map(lambda x: cls(x, requires_grad=requires_grad), out)
# Aliased arguments are only warned about; their contents are not copied.
maybe_aliased_arg_name = check_get_aliased_arg(func)
if maybe_aliased_arg_name is not None:
warnings.warn(
f"Found aliased arg, but didn't copy tensor contents. This could lead to incorrect results for E2E model execution but doesn't affect the validity of the lockstep op verification."
)
# TODO: Find a way to handle argument aliasing for IREE backend
# backend.copy_into(normalized_kwargs[maybe_aliased_arg_name].elem, out.elem)
return out

View File

@@ -12,12 +12,26 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import io
import pickle
from torch_mlir.dialects.torch.importer.jit_ir import (
ClassAnnotator,
ModuleBuilder,
)
from torch_mlir_e2e_test.torchscript.serialization import (
extract_serializable_annotations,
apply_serializable_annotations,
SerializableTest,
)
from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
from torch_mlir.passmanager import PassManager
from torch_mlir_e2e_test.torchscript.annotations import annotate_args, export
from torch_mlir.ir import StringAttr
import torch_mlir
from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
import tempfile
from shark.parser import shark_args
import io
def get_module_name_for_asm_dump(module):
@@ -31,6 +45,22 @@ def get_module_name_for_asm_dump(module):
).value
def get_input_annotations(inputs: tuple, dynamic: bool) -> list:
    """Build the per-argument annotation list for a set of example inputs.

    The result starts with a single ``None`` entry (presumably the slot for
    the module's ``self`` argument -- confirm against the annotation API),
    followed by one ``(shape, dtype, True)`` tuple per element of ``inputs``.

    Args:
        inputs: objects exposing ``.shape`` and ``.dtype``.
        dynamic: when true, every dimension is reported as ``-1`` instead of
            its concrete size.

    Returns:
        The list ``[None, (shape, dtype, True), ...]``.
    """
    annotations = [None]
    for tensor in inputs:
        if dynamic:
            # Keep the rank but mark every dimension as unknown.
            shape = [-1] * len(tensor.shape)
        else:
            shape = list(tensor.shape)
        annotations.append((shape, tensor.dtype, True))
    return annotations
def run_on_refbackend(torch_module, inputs):
backend = refbackend.RefBackendLinalgOnTensorsBackend()
compiled = backend.compile(torch_module)
@@ -39,16 +69,42 @@ def run_on_refbackend(torch_module, inputs):
return jit_module.forward(np_inputs[0])
# Creates dynamic dims for all dims.
# TODO: Pass user specified dynamic dims.
def create_dynamic_placeholders(inputs):
    """Return a tuple of placeholders mirroring ``inputs`` with all axes dynamic.

    Each input is passed to ``torch_mlir.TensorPlaceholder.like`` with
    ``dynamic_axes`` listing every axis index of that input.
    """
    return tuple(
        torch_mlir.TensorPlaceholder.like(
            tensor, dynamic_axes=list(range(len(tensor.shape)))
        )
        for tensor in inputs
    )
def shark_jit_trace(
    module, input: tuple, dynamic: bool, tracing_required: bool
):
    """Return a TorchScript module for ``module``, annotated when traced.

    If ``tracing_required`` is false, the module is scripted with
    ``torch.jit.script`` and returned as is. Otherwise the module is traced
    on ``input``, its ``forward`` is exported and annotated with
    ``get_input_annotations(input, dynamic)``, and the scripted result is
    saved to an in-memory buffer and loaded back, after which the pickled
    annotations are re-applied to the loaded module.

    Args:
        module: the module to script or trace.
        input: example inputs used for tracing ``forward``.
        dynamic: forwarded to ``get_input_annotations``.
        tracing_required: selects tracing (true) or plain scripting (false).

    Returns:
        The scripted module, or the reloaded and re-annotated module.
    """
    if not tracing_required:
        return torch.jit.script(module)

    traced = torch.jit.trace_module(module, {"forward": input})
    script_module = traced._actual_script_module
    export(script_module.forward)
    annotate_args(get_input_annotations(input, dynamic))(script_module.forward)
    scripted = torch.jit.script(script_module)

    # TODO: remove saved annotations.pickle
    pickled_annotations = pickle.dumps(
        extract_serializable_annotations(scripted)
    )
    program_bytes = scripted.save_to_buffer(
        {"annotations.pkl": pickled_annotations}
    )
    serializable_test = SerializableTest(
        unique_name="", program=program_bytes, trace=None
    )

    # The dict is handed to torch.jit.load and read back afterwards, so the
    # load is expected to fill in the "annotations.pkl" entry.
    extra_files = {"annotations.pkl": ""}
    reloaded = torch.jit.load(
        io.BytesIO(serializable_test.program), _extra_files=extra_files
    )
    # Load the pickled annotations.
    apply_serializable_annotations(
        reloaded, pickle.loads(extra_files["annotations.pkl"])
    )
    return reloaded
def get_torch_mlir_module(
@@ -56,24 +112,41 @@ def get_torch_mlir_module(
input: tuple,
dynamic: bool,
jit_trace: bool,
from_torchscript: bool = False,
):
"""Get the MLIR's linalg-on-tensors module from the torchscipt module."""
ignore_traced_shapes = False
if dynamic:
input = create_dynamic_placeholders(input)
if jit_trace:
ignore_traced_shapes = True
"""TODO: Include necessary documentation."""
tempfile.tempdir = shark_args.repro_dir
# Static modules compiles well with the torch_mlir.compile API.
# We will always jit_trace = True with the API since we always
# want to propagate static shapes.
if not dynamic:
module = torch_mlir.compile(
module,
input,
output_type=torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=jit_trace,
)
return module
mlir_module = torch_mlir.compile(
module,
input,
output_type=torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=jit_trace,
ignore_traced_shapes=ignore_traced_shapes,
# Tracing is not required from the aot_module.
if not from_torchscript:
module = shark_jit_trace(module, input, dynamic, jit_trace)
mb = ModuleBuilder()
class_annotator = ClassAnnotator()
class_annotator.exportNone(module._c._type())
class_annotator.exportPath(module._c._type(), ["forward"])
class_annotator.annotateArgs(
module._c._type(),
["forward"],
get_input_annotations(input, dynamic),
)
bytecode_stream = io.BytesIO()
mlir_module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
return bytecode
mb.import_module(module._c, class_annotator)
with mb.module.context:
pm = PassManager.parse(
"torchscript-module-to-torch-backend-pipeline,torch-backend-to-linalg-on-tensors-backend-pipeline"
)
pm.run(mb.module)
return mb.module

View File

@@ -1,223 +0,0 @@
## Supported and Validated Models
### PyTorch HuggingFace Models
| PyTorch Language Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
| Albert | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
| BigBird | :green_heart: (AOT) | | | |
| dbmdz/ConvBERT | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| DistilBERT | :broken_heart: (JIT) | | | |
| GPT2 | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| MobileBert | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
| microsoft/beit | :green_heart: | :green_heart: | :broken_heart: | :broken_heart: |
| facebook/deit | :green_heart: | :green_heart: | :broken_heart: | :broken_heart: |
| facebook/convnext | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
### Torchvision Models
| TORCHVISION Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|--------------------|----------------------|----------|----------|-------------|
| AlexNet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| MobileNetV2 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| MobileNetV3 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Unet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnet18 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnet50 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnet101 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnext50_32x4d | :green_heart: (Script) | | | |
| SqueezeNet | :green_heart: (Script) | :green_heart: | :broken_heart: | :broken_heart: |
| EfficientNet | :green_heart: (Script) | | | |
| Regnet | :green_heart: (Script) | | | |
| Resnest | :broken_heart: (Script) | | | |
| Vision Transformer | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| VGG 16 | :green_heart: (Script) | :green_heart: | :green_heart: | |
| Wide Resnet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| RAFT | :broken_heart: (JIT) | | | |
For more information refer to [MODEL TRACKING SHEET](https://docs.google.com/spreadsheets/d/15PcjKeHZIrB5LfDyuw7DGEEE8XnQEX2aX8lm8qbxV8A/edit#gid=0)
### Tensorflow Models (Inference)
| Hugging Face Models | tf-mhlo lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| MiniLM | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| albert-base-v2 | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| DistilBERT | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| CamemBert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| ConvBert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| Deberta | | | | |
| electra | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| funnel | | | | |
| layoutlm | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| longformer | | | | |
| mobile-bert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| rembert | | | | |
| tapas | | | | |
| flaubert | :broken_heart: | :green_heart: | :green_heart: | :green_heart: |
| roberta | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| xlm-roberta | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| mpnet | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
### PyTorch Training Models
| Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :green_heart: | :green_heart: | | |
| FullyConnected | :green_heart: | :green_heart: | | |
### JAX Models
| Models | JAX-MHLO lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| DALL-E | :broken_heart: | :broken_heart: | | |
| FullyConnected | :green_heart: | :green_heart: | | |
<details>
<summary>TFLite Models</summary>
### TFLite Models
| Models | TOSA/LinAlg | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :broken_heart: | :broken_heart: | | |
| FullyConnected | :green_heart: | :green_heart: | | |
| albert | :green_heart: | :green_heart: | | |
| asr_conformer | :green_heart: | :green_heart: | | |
| bird_classifier | :green_heart: | :green_heart: | | |
| cartoon_gan | :green_heart: | :green_heart: | | |
| craft_text | :green_heart: | :green_heart: | | |
| deeplab_v3 | :green_heart: | :green_heart: | | |
| densenet | :green_heart: | :green_heart: | | |
| east_text_detector | :green_heart: | :green_heart: | | |
| efficientnet_lite0_int8 | :green_heart: | :green_heart: | | |
| efficientnet | :green_heart: | :green_heart: | | |
| gpt2 | :green_heart: | :green_heart: | | |
| image_stylization | :green_heart: | :green_heart: | | |
| inception_v4 | :green_heart: | :green_heart: | | |
| inception_v4_uint8 | :green_heart: | :green_heart: | | |
| lightning_fp16 | :green_heart: | :green_heart: | | |
| lightning_i8 | :green_heart: | :green_heart: | | |
| lightning | :green_heart: | :green_heart: | | |
| magenta | :green_heart: | :green_heart: | | |
| midas | :green_heart: | :green_heart: | | |
| mirnet | :green_heart: | :green_heart: | | |
| mnasnet | :green_heart: | :green_heart: | | |
| mobilebert_edgetpu_s_float | :green_heart: | :green_heart: | | |
| mobilebert_edgetpu_s_quant | :green_heart: | :green_heart: | | |
| mobilebert | :green_heart: | :green_heart: | | |
| mobilebert_tf2_float | :green_heart: | :green_heart: | | |
| mobilebert_tf2_quant | :green_heart: | :green_heart: | | |
| mobilenet_ssd_quant | :green_heart: | :green_heart: | | |
| mobilenet_v1 | :green_heart: | :green_heart: | | |
| mobilenet_v1_uint8 | :green_heart: | :green_heart: | | |
| mobilenet_v2_int8 | :green_heart: | :green_heart: | | |
| mobilenet_v2 | :green_heart: | :green_heart: | | |
| mobilenet_v2_uint8 | :green_heart: | :green_heart: | | |
| mobilenet_v3-large | :green_heart: | :green_heart: | | |
| mobilenet_v3-large_uint8 | :green_heart: | :green_heart: | | |
| mobilenet_v35-int8 | :green_heart: | :green_heart: | | |
| nasnet | :green_heart: | :green_heart: | | |
| person_detect | :green_heart: | :green_heart: | | |
| posenet | :green_heart: | :green_heart: | | |
| resnet_50_int8 | :green_heart: | :green_heart: | | |
| rosetta | :green_heart: | :green_heart: | | |
| spice | :green_heart: | :green_heart: | | |
| squeezenet | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v1 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v1_uint8 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2_fpnlite | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2_fpnlite_uint8 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2_int8 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2 | :green_heart: | :green_heart: | | |
| ssd_spaghettinet_large | :green_heart: | :green_heart: | | |
| ssd_spaghettinet_large_uint8 | :green_heart: | :green_heart: | | |
| visual_wake_words_i8 | :green_heart: | :green_heart: | | |
</details>
## Testing and Benchmarks
### Run all model tests on CPU/GPU/VULKAN/Metal
For a list of models included in our pytest model suite, see https://github.com/nod-ai/SHARK/blob/main/tank/all_models.csv
```shell
pytest tank/test_models.py
# Models included in the pytest suite can be found listed in all_models.csv.
# If on Linux for multithreading on CPU (faster results):
pytest tank/test_models.py -n auto
```
### Running specific tests
```shell
# Search for test cases by including a keyword that matches all or part of the test case's name:
pytest tank/test_models.py -k "keyword"
# Test cases are named uniformly by format test_module_<model_name_underscores_only>_<torch/tf>_<static/dynamic>_<device>.
# Example: Test all models on nvidia gpu:
pytest tank/test_models.py -k "cuda"
# Example: Test all tensorflow resnet models on Vulkan backend:
pytest tank/test_models.py -k "resnet and tf and vulkan"
# Exclude a test case:
pytest tank/test_models.py -k "not ..."
```
### Run benchmarks on SHARK tank pytests and generate bench_results.csv with results.
(the following requires source installation with `IMPORTER=1 ./setup_venv.sh`)
```shell
pytest --benchmark tank/test_models.py
# Just do static GPU benchmarks for PyTorch tests:
pytest --benchmark tank/test_models.py -k "pytorch and static and cuda"
```
### Benchmark Resnet50, MiniLM on CPU
(requires source installation with `IMPORTER=1 ./setup_venv.sh`)
```shell
# We suggest running the following commands as root before running benchmarks on CPU:
cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | awk -F, '{print $2}' | sort -n | uniq | ( while read X ; do echo $X ; echo 0 > /sys/devices/system/cpu/cpu$X/online ; done )
echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
# Benchmark canonical Resnet50 on CPU via pytest
pytest --benchmark tank/test_models.py -k "resnet50 and tf_static_cpu"
# Benchmark canonical MiniLM on CPU via pytest
pytest --benchmark tank/test_models.py -k "MiniLM and cpu"
# Benchmark MiniLM on CPU via transformer-benchmarks:
git clone --recursive https://github.com/nod-ai/transformer-benchmarks.git
cd transformer-benchmarks
./perf-ci.sh -n
# Check detail.csv for MLIR/IREE results.
```
To run the fine tuning example, from the root SHARK directory, run:
```shell
IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pip install jupyter tf-models-nightly tf-datasets
jupyter-notebook
```
If you are running from a Google Cloud VM, you can view Jupyter notebooks on your local system with:
```shell
gcloud compute ssh <YOUR_INSTANCE_DETAILS> --ssh-flag="-N -L localhost:8888:localhost:8888"
```

View File

@@ -0,0 +1,60 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class AlbertBaseModuleTester:
    """Runs the downloaded TF "albert-base-v2" model through SharkInference
    and compares the result with the downloaded golden output."""

    def __init__(
        self,
        benchmark=False,
    ):
        # Stored on the instance; create_and_check_module does not read it.
        self.benchmark = benchmark

    def create_and_check_module(self, dynamic, device):
        """Compile the model for ``device``, run it and check the output.

        Args:
            dynamic: accepted but not used by this method.
            device: device name passed to ``SharkInference``.

        Raises:
            AssertionError: if the result differs from the golden output
                beyond ``rtol=1e-02`` / ``atol=1e-03``.
        """
        tf_model, func_name, inputs, golden_out = download_tf_model(
            "albert-base-v2"
        )
        shark_module = SharkInference(
            tf_model, func_name, device=device, mlir_dialect="mhlo"
        )
        shark_module.compile()
        np.testing.assert_allclose(
            golden_out,
            shark_module.forward(inputs),
            rtol=1e-02,
            atol=1e-03,
        )
class AlbertBaseModuleTest(unittest.TestCase):
    """Static-shape albert-base-v2 tests for the cpu, gpu and vulkan devices.

    The gpu and vulkan tests are skipped when ``check_device_drivers``
    returns a truthy value for the device, with ``device_driver_info`` as
    the skip reason.
    """

    @pytest.fixture(autouse=True)
    def configure(self, pytestconfig):
        """Create the tester for each test from pytest's ``benchmark`` option."""
        # Pass the option as ``benchmark`` directly. Previously the TestCase
        # instance itself was passed as the ``benchmark`` argument and then
        # overwritten on the next line.
        self.module_tester = AlbertBaseModuleTester(
            benchmark=pytestconfig.getoption("benchmark")
        )

    def test_module_static_cpu(self):
        dynamic = False
        device = "cpu"
        self.module_tester.create_and_check_module(dynamic, device)

    @pytest.mark.skipif(
        check_device_drivers("gpu"), reason=device_driver_info("gpu")
    )
    def test_module_static_gpu(self):
        dynamic = False
        device = "gpu"
        self.module_tester.create_and_check_module(dynamic, device)

    @pytest.mark.skipif(
        check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
    )
    def test_module_static_vulkan(self):
        dynamic = False
        device = "vulkan"
        self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()

Some files were not shown because too many files have changed in this diff Show More